Changeset 106545 in vbox for trunk/include

- Timestamp: Oct 21, 2024, 7:52:03 AM (7 months ago)
- svn:sync-xref-src-repo-rev: 165379
- Files: 1 edited
Legend:
- lines prefixed with + were added
- lines prefixed with - were removed
- unprefixed lines are unmodified context
- … marks elided, unchanged code
trunk/include/iprt/asm.h
r106061 → r106545

…
 # include <iprt/sanitized/intrin.h>
 # pragma intrinsic(_ReadWriteBarrier)
-# pragma intrinsic(__cpuid)
-# pragma intrinsic(__stosd)
-# pragma intrinsic(__stosw)
-# pragma intrinsic(__stosb)
+# if defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
+#  pragma intrinsic(__cpuid)
+#  pragma intrinsic(__stosd)
+#  pragma intrinsic(__stosw)
+#  pragma intrinsic(__stosb)
+#  ifdef RT_ARCH_AMD64
+#   pragma intrinsic(__stosq)
+#   pragma intrinsic(_byteswap_uint64)
+#   pragma intrinsic(_InterlockedCompareExchange128)
+#   pragma intrinsic(_InterlockedExchange64)
+#   pragma intrinsic(_InterlockedExchangeAdd64)
+#   pragma intrinsic(_InterlockedAnd64)
+#   pragma intrinsic(_InterlockedOr64)
+#   pragma intrinsic(_InterlockedIncrement64)
+#   pragma intrinsic(_InterlockedDecrement64)
+#  endif
+# elif defined(RT_ARCH_ARM64)
+#  pragma intrinsic(__break)
+#  pragma intrinsic(__dmb)
+#  pragma intrinsic(__dsb)
+#  pragma intrinsic(__isb)
+#  pragma intrinsic(__nop)
+#  pragma intrinsic(__yield)
+#  pragma intrinsic(__swp8)
+#  pragma intrinsic(__swpa8)
+#  pragma intrinsic(__swpal8)
+#  pragma intrinsic(__swp16)
+#  pragma intrinsic(__swpa16)
+#  pragma intrinsic(__swpal16)
+#  pragma intrinsic(__swp32)
+#  pragma intrinsic(__swpa32)
+#  pragma intrinsic(__swpal32)
+#  pragma intrinsic(__swp64)
+#  pragma intrinsic(__swpa64)
+#  pragma intrinsic(__swpal64)
+#  pragma intrinsic(__cas8)
+#  pragma intrinsic(__casl8)
+#  pragma intrinsic(__cas16)
+#  pragma intrinsic(__casl16)
+#  pragma intrinsic(__cas32)
+#  pragma intrinsic(__casl32)
+#  pragma intrinsic(__cas64)
+#  pragma intrinsic(__casl64)
+#  pragma intrinsic(__casa8)
+#  pragma intrinsic(__casal8)
+#  pragma intrinsic(__casa16)
+#  pragma intrinsic(__casa64)
+#  pragma intrinsic(__iso_volatile_load8)
+#  pragma intrinsic(__iso_volatile_load16)
+#  pragma intrinsic(__iso_volatile_load32)
+#  pragma intrinsic(__iso_volatile_load64)
+#  pragma intrinsic(__iso_volatile_store8)
+#  pragma intrinsic(__iso_volatile_store16)
+#  pragma intrinsic(__iso_volatile_store32)
+#  pragma intrinsic(__iso_volatile_store64)
+#  pragma intrinsic(__load_acquire8)
+#  pragma intrinsic(__load_acquire16)
+#  pragma intrinsic(__load_acquire32)
+#  pragma intrinsic(__load_acquire64)
+#  pragma intrinsic(__stlr8)
+#  pragma intrinsic(__stlr16)
+#  pragma intrinsic(__stlr32)
+#  pragma intrinsic(__stlr64)
+# else
+#  error "Port me"
+# endif
 # pragma intrinsic(_BitScanForward)
 # pragma intrinsic(_BitScanReverse)
…
 # pragma intrinsic(_rotl64)
 # pragma intrinsic(_rotr64)
-# ifdef RT_ARCH_AMD64
-#  pragma intrinsic(__stosq)
-#  pragma intrinsic(_byteswap_uint64)
-#  pragma intrinsic(_InterlockedCompareExchange128)
-#  pragma intrinsic(_InterlockedExchange64)
-#  pragma intrinsic(_InterlockedExchangeAdd64)
-#  pragma intrinsic(_InterlockedAnd64)
-#  pragma intrinsic(_InterlockedOr64)
-#  pragma intrinsic(_InterlockedIncrement64)
-#  pragma intrinsic(_InterlockedDecrement64)
-# endif
-#endif
-
-#if (defined(RT_ARCH_ARM64) && defined(RT_OS_DARWIN)) || defined(DOXYGEN_RUNNING)
+#endif
+
+#if (defined(RT_ARCH_ARM64) && (defined(RT_OS_DARWIN) || defined(RT_OS_WINDOWS))) || defined(DOXYGEN_RUNNING)
 /** @def RTASM_ARM64_USE_FEAT_LSE
  * Use instructions from the FEAT_LSE set to implement atomic operations,
…
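With the pragmas and the widened RTASM_ARM64_USE_FEAT_LSE gate in place, the hunks below add an RT_INLINE_ASM_USES_INTRIN code path to the individual helpers. A quick way to see which paths a particular build ends up taking is a hypothetical probe translation unit such as the following; it is not part of the changeset, but it only uses the macros that appear in the hunks above.

/* probe.c - hypothetical, not part of the changeset. */
#include <iprt/asm.h>

#if RT_INLINE_ASM_USES_INTRIN
# pragma message("asm.h: atomic helpers use compiler intrinsics")
#else
# pragma message("asm.h: atomic helpers use inline assembly")
#endif

#ifdef RTASM_ARM64_USE_FEAT_LSE
# pragma message("asm.h: ARM64 FEAT_LSE (SWP/CAS) instructions enabled")
#endif
#ifdef RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB
# pragma message("asm.h: relying on SWPAL/CASAL ordering instead of explicit DMBs")
#endif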
…
 # elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
+
+# if RT_INLINE_ASM_USES_INTRIN
+    __yield();
+# else
     __asm__ __volatile__("yield\n\t"); /* ARMv6K+ */
+# endif

 # else
…
 # elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
+
+# if RT_INLINE_ASM_USES_INTRIN
+    uint8_t uOld;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
+    uOld = __swpal8(pu8, u8);
+#  else
+    uOld = __swp8(pu8, u8);
+    __dmb(_ARM64_BARRIER_SY);
+#  endif
+    return uOld;
+
+# else
     uint32_t uOld;
     …ASMAtomicXchgU8: existing SWPALB/SWPB and LDAXRB/STLXRB inline-assembly path, unchanged apart from preprocessor re-indentation…
     return (uint8_t)uOld;
+# endif

 # else
…
 # elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
+
+# if RT_INLINE_ASM_USES_INTRIN
+    uint16_t uOld;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
+    uOld = __swpal16(pu16, u16);
+#  else
+    uOld = __swp16(pu16, u16);
+    __dmb(_ARM64_BARRIER_SY);
+#  endif
+    return uOld;
+
+# else
     uint32_t uOld;
     …ASMAtomicXchgU16: existing SWPALH/SWPH and LDAXRH/STLXRH inline-assembly path, unchanged apart from preprocessor re-indentation…
     return (uint16_t)uOld;
+#endif

 # else
…
…
 # elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
+
+# if RT_INLINE_ASM_USES_INTRIN
     uint32_t uOld;
-# if defined(RTASM_ARM64_USE_FEAT_LSE)
+#  if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
+    uOld = __swpal32(pu32, u32);
+#  else
+    uOld = __swp32(pu32, u32);
+    __dmb(_ARM64_BARRIER_SY);
+#  endif
+    return uOld;
+
+# else
+    uint32_t uOld;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
     …ASMAtomicXchgU32: existing SWPAL/SWP and LDAXR/STLXR inline-assembly path, unchanged apart from preprocessor re-indentation…
     return uOld;
+# endif

 # else
…
…
 # elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
+
+# if RT_INLINE_ASM_USES_INTRIN
     uint64_t uOld;
-# if defined(RTASM_ARM64_USE_FEAT_LSE)
+#  if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
+    uOld = __swpal64(pu64, u64);
+#  else
+    uOld = __swp64(pu64, u64);
+#  endif
+    return uOld;
+
+# else
+    uint64_t uOld;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
     …ASMAtomicXchgU64: existing SWPAL/SWP and LDAXR/STLXR (LDREXD/STREXD on ARM32) inline-assembly path, unchanged apart from preprocessor re-indentation…
     return uOld;
+# endif

 # else
…
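For callers nothing changes: the ASMAtomicXchg* helpers keep their signatures and now simply compile down to __swpal*/__swp* under MSVC on ARM64. As a small caller-side illustration (a sketch, not code from this changeset), an atomic exchange can hand a one-shot token to exactly one winner:

/* Sketch: hand a "ready" token to exactly one claimant.  Only the
   ASMAtomicXchgU32() helper from the hunks above is assumed. */
static uint32_t volatile g_uToken = 1;

static bool TryClaimToken(void)
{
    /* The exchange returns the previous value; only the caller that sees 1 wins. */
    return ASMAtomicXchgU32(&g_uToken, 0) == 1;
}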
…
  * @todo Rename ASMAtomicCmpWriteU8
  */
-#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || !RT_INLINE_ASM_GNU_STYLE
+#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || (!RT_INLINE_ASM_GNU_STYLE && !defined(RT_ARCH_ARM64))
 RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old) RT_NOTHROW_PROTO;
 #else
…
                          : "cc");
     return (bool)u8Ret;
+
+# elif RT_INLINE_ASM_USES_INTRIN
+    return (uint8_t)_InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old) == u8Old;

 # elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
…
…
 # elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    uint32_t uOldActual;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
+    uOldActual = __casal32(pu32, u32Old, u32New);
+#  else
+    uOldActual = __casal32(pu32, u32Old, u32New);
+    __dmb(_ARM64_BARRIER_SY);
+#  endif
+    return uOldActual == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
+
+# else
     union { uint32_t u; bool f; } fXchg;
     uint32_t u32Spill;
     …ASMAtomicCmpXchgU32: existing CASAL/CAS and LDAXR/STLXR inline-assembly path, unchanged apart from preprocessor re-indentation…
     return fXchg.f;
+# endif

 # else
…
 # elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    uint64_t uOldActual;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
+    uOldActual = __casal64(pu64, u64Old, u64New);
+#  else
+    uOldActual = __casal64(pu64, u64Old, u64New);
+    __dmb(_ARM64_BARRIER_SY);
+#  endif
+    return uOldActual == u64Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
+
+# else
     union { uint32_t u; bool f; } fXchg;
     uint64_t u64Spill;
     …ASMAtomicCmpXchgU64: existing CASAL/CAS and LDAXR/STLXR (LDREXD/STREXD on ARM32) inline-assembly path, unchanged apart from preprocessor re-indentation…
     return fXchg.f;
+# endif

 # else
…
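The compare-and-exchange helpers keep their usual retry-loop usage; only the implementation underneath changed. A typical caller-side pattern is the following sketch (illustrative only, not part of the changeset; ASMAtomicReadU32 is one of the read helpers touched further down):

/* Sketch: atomically track a maximum with ASMAtomicCmpXchgU32(). */
static void UpdateMaxU32(uint32_t volatile *pu32Max, uint32_t uValue)
{
    uint32_t uOld = ASMAtomicReadU32(pu32Max);
    while (   uValue > uOld
           && !ASMAtomicCmpXchgU32(pu32Max, uValue, uOld))
        uOld = ASMAtomicReadU32(pu32Max);   /* lost the race, reload and retry */
}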
…
     /* M1 bench: match: casalb= 6594 vs dmb+casb= 1561 vs non-lse=5051 (ps/call)
        mismatch: casalb=15346 vs dmb+casb=16349 vs non-lse=2505 (ps/call) */
-# if defined(RTASM_ARM64_USE_FEAT_LSE)
+# ifdef RT_INLINE_ASM_USES_INTRIN
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
+    uint8_t uOldActual;
+#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
+    uOldActual = __casal8(pu8, u8Old, u8New);
+#   else
+    uOldActual = __casal8(pu8, u8Old, u8New);
+    __dmb(_ARM64_BARRIER_SY);
+#   endif
+    return (*pu8Old = uOldActual) == u8Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
+#  else
+    return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
+#  endif
+
+# else
+
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
     …ASMAtomicCmpXchgExU8: existing CASALB/CASB and LDAXRB/STLXRB inline-assembly paths, unchanged apart from preprocessor re-indentation…
     return fXchg.f;
+# endif
…
…
     /* M1 bench: match: casalh= 6577 vs dmb+cash= 1608 vs non-lse=5078 (ps/call)
        mismatch: casalh=18791 vs dmb+cash=19721 vs non-lse=2543 (ps/call) */
-# if defined(RTASM_ARM64_USE_FEAT_LSE)
+# ifdef RT_INLINE_ASM_USES_INTRIN
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
+    uint16_t uOldActual;
+#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
+    uOldActual = __casal16(pu16, u16Old, u16New);
+#   else
+    uOldActual = __casal16(pu16, u16Old, u16New);
+    __dmb(_ARM64_BARRIER_SY);
+#   endif
+    return (*pu16Old = uOldActual) == u16Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
+#  else
+    return (*pu16Old = _InterlockedCompareExchange16((char RT_FAR *)pu16, u16New, u16Old)) == u16Old;
+#  endif
+
+# else
+
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
     …ASMAtomicCmpXchgExU16: existing CASALH/CASH and LDAXRH/STLXRH inline-assembly paths, unchanged apart from preprocessor re-indentation…
     return fXchg.f;
+# endif
…
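The Ex variants additionally hand back the value actually found via the extra out parameter, which lets a retry loop skip the separate re-read. The maximum-tracking sketch from above, reworked for ASMAtomicCmpXchgExU32() (again only an illustration, not changeset code):

/* Sketch: ASMAtomicCmpXchgExU32() stores the value it found in *pu32Old,
   so a failed attempt already supplies the input for the next one. */
static void UpdateMaxU32Ex(uint32_t volatile *pu32Max, uint32_t uValue)
{
    uint32_t uOld = ASMAtomicReadU32(pu32Max);
    while (   uValue > uOld
           && !ASMAtomicCmpXchgExU32(pu32Max, uValue, uOld, &uOld))
    { /* uOld now holds the current value, just loop. */ }
}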
…
 # elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
+    uint32_t uOldActual;
+#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
+    uOldActual = __casal32(pu32, u32Old, u32New);
+#   else
+    uOldActual = __casal32(pu32, u32Old, u32New);
+    __dmb(_ARM64_BARRIER_SY);
+#   endif
+    return (*pu32Old = uOldActual) == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
+#  else
+    return (*pu32Old = _InterlockedCompareExchange((char RT_FAR *)pu32, u32New, u32Old)) == u32Old;
+#  endif
+
+# else
+
     union { uint32_t u; bool f; } fXchg;
     …ASMAtomicCmpXchgExU32: existing CASAL/CAS and LDAXR/STLXR inline-assembly paths, unchanged apart from preprocessor re-indentation…
     *pu32Old = u32ActualOld;
     return fXchg.f;
+# endif

 # else
…
 DECLINLINE(void) ASMSerializeInstruction(void) RT_NOTHROW_DEF
 {
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    __dsb(_ARM64_BARRIER_SY);
+# else
     __asm__ __volatile__ (RTASM_ARM_DSB_SY :: RTASM_ARM_DSB_SY_IN_REG :);
+# endif
 }
 #else
…
 # endif
 #elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    __dmb(_ARM64_BARRIER_SY);
+# else
     __asm__ __volatile__ (RTASM_ARM_DMB_SY :: RTASM_ARM_DMB_SY_IN_REG :);
+# endif
 #elif ARCH_BITS == 16
     uint16_t volatile u16;
…
 # endif
 #elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    __dmb(_ARM64_BARRIER_ST);
+# else
     __asm__ __volatile__ (RTASM_ARM_DMB_ST :: RTASM_ARM_DMB_ST_IN_REG :);
+# endif
 #else
     ASMMemoryFence();
…
 # endif
 #elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    __dmb(_ARM64_BARRIER_LD);
+# else
     __asm__ __volatile__ (RTASM_ARM_DMB_LD :: RTASM_ARM_DMB_LD_IN_REG :);
+# endif
 #else
     ASMMemoryFence();
…
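The serialize and fence hunks above map straight onto __dsb/__dmb. Assuming the usual IPRT names for the DMB ST and DMB LD helpers shown above, ASMWriteFence() and ASMReadFence() (an assumption, since their declarations sit outside the visible hunks), a classic publish/consume sketch looks like this; it is an illustration only, not code from the changeset:

/* Sketch: publish data behind a store fence, consume it behind a load fence.
   ASMWriteFence()/ASMReadFence() names are assumed, see the note above. */
static uint32_t volatile g_fReady = 0;
static uint32_t          g_uPayload;

static void Produce(uint32_t uValue)
{
    g_uPayload = uValue;
    ASMWriteFence();            /* payload must become visible before the flag */
    g_fReady = 1;
}

static uint32_t Consume(void)
{
    while (!g_fReady)
    { /* spin */ }
    ASMReadFence();             /* flag observed, now the payload read is safe */
    return g_uPayload;
}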
__volatile__("Lstart_ASMAtomicReadU8_%=:\n\t" 2942 3151 RTASM_ARM_DMB_SY 2943 # if defined(RT_ARCH_ARM64)2944 # if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */3152 # if defined(RT_ARCH_ARM64) 3153 # if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */ 2945 3154 "ldurb %w[uDst], %[pMem]\n\t" 2946 # else3155 # else 2947 3156 "ldxrb %w[uDst], %[pMem]\n\t" 2948 3157 "clrex\n\t" 2949 # endif2950 # else3158 # endif 3159 # else 2951 3160 "ldrexb %[uDst], %[pMem]\n\t" 2952 3161 /** @todo clrex */ 2953 # endif3162 # endif 2954 3163 : [uDst] "=&r" (u32) 2955 3164 : [pMem] "Q" (*pu8) 2956 3165 RTASM_ARM_DMB_SY_COMMA_IN_REG); 2957 # endif3166 # endif 2958 3167 return (uint8_t)u32; 3168 # endif 3169 2959 3170 #else 2960 3171 ASMMemoryFence(); … … 2973 3184 { 2974 3185 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32) 3186 3187 # ifdef RT_INLINE_ASM_USES_INTRIN 3188 return (uint8_t)__iso_volatile_load8((volatile char *)pu8); 3189 3190 # else 3191 2975 3192 uint32_t u32; 2976 3193 __asm__ __volatile__("Lstart_ASMAtomicUoReadU8_%=:\n\t" 2977 # if defined(RT_ARCH_ARM64)3194 # if defined(RT_ARCH_ARM64) 2978 3195 "ldurb %w[uDst], %[pMem]\n\t" 2979 # else3196 # else 2980 3197 "ldrexb %[uDst], %[pMem]\n\t" /** @todo fix this */ 2981 # endif3198 # endif 2982 3199 : [uDst] "=&r" (u32) 2983 3200 : [pMem] "Q" (*pu8)); 2984 3201 return (uint8_t)u32; 3202 # endif 2985 3203 #else 2986 3204 return *pu8; /* byte reads are atomic on x86 */ … … 3015 3233 { 3016 3234 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32) 3235 3236 # ifdef RT_INLINE_ASM_USES_INTRIN 3237 return __iso_volatile_load8((volatile char *)pi8); 3238 3239 # else 3240 3017 3241 int32_t i32; 3018 3242 __asm__ __volatile__("Lstart_ASMAtomicUoReadS8_%=:\n\t" 3019 # if defined(RT_ARCH_ARM64)3243 # if defined(RT_ARCH_ARM64) 3020 3244 "ldurb %w[iDst], %[pMem]\n\t" 3021 # else3245 # else 3022 3246 "ldrexb %[iDst], %[pMem]\n\t" /** @todo fix this */ 3023 # endif3247 # endif 3024 3248 : [iDst] "=&r" (i32) 3025 3249 : [pMem] "Q" (*pi8)); 3026 3250 return (int8_t)i32; 3251 # endif 3027 3252 #else 3028 3253 return *pi8; /* byte reads are atomic on x86 */ … … 3041 3266 Assert(!((uintptr_t)pu16 & 1)); 3042 3267 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32) 3268 3269 # ifdef RT_INLINE_ASM_USES_INTRIN 3270 return (uint16_t)__load_acquire16(pu16); 3271 3272 # else 3273 3043 3274 uint32_t u32; 3044 # if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */3275 # if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */ 3045 3276 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t" 3046 3277 RTASM_ARM_DMB_SY … … 3050 3281 "0" (0) 3051 3282 RTASM_ARM_DMB_SY_COMMA_IN_REG); 3052 # else3283 # else 3053 3284 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t" 3054 3285 RTASM_ARM_DMB_SY 3055 # if defined(RT_ARCH_ARM64)3056 # if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */3286 # if defined(RT_ARCH_ARM64) 3287 # if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. 
…
     Assert(!((uintptr_t)pu16 & 1));
 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    return (uint16_t)__load_acquire16(pu16);
+
+# else
+
     uint32_t u32;
     …ASMAtomicReadU16: existing DMB SY + LDURH/LDXRH (LDREXH on ARM32) inline-assembly path, unchanged apart from preprocessor re-indentation…
     return (uint16_t)u32;
+# endif
+
 #else
     ASMMemoryFence();
…
     Assert(!((uintptr_t)pu16 & 1));
 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    return (uint16_t)__iso_volatile_load16((volatile int16_t *)pu16);
+
+# else
+
     uint32_t u32;
     …ASMAtomicUoReadU16: existing LDURH/LDREXH inline-assembly path, unchanged…
     return (uint16_t)u32;
+# endif
+
 #else
     return *pu16;
…
     Assert(!((uintptr_t)pi16 & 1));
 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    return __iso_volatile_load16(pi16);
+
+# else
+
     int32_t i32;
     …ASMAtomicUoReadS16: existing LDURH/LDREXH inline-assembly path, unchanged…
     return (int16_t)i32;
+# endif
+
 #else
     return *pi16;
…
…
     Assert(!((uintptr_t)pu32 & 3));
 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    return (uint32_t)__load_acquire32(pu32);
+
+# else
+
     uint32_t u32;
     …ASMAtomicReadU32: existing DMB SY + LDUR/LDXR (LDREX on ARM32) inline-assembly path, unchanged apart from preprocessor re-indentation…
     return u32;
+# endif
+
 #else
     ASMMemoryFence();
…
     Assert(!((uintptr_t)pu32 & 3));
 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    return (uint32_t)__iso_volatile_load32((volatile int32_t *)pu32);
+
+# else
+
     uint32_t u32;
     …ASMAtomicUoReadU32: existing LDUR/LDREX inline-assembly path, unchanged…
     return u32;
+# endif
+
 #else
 # if ARCH_BITS == 16
…
     Assert(!((uintptr_t)pi32 & 3));
 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    return __iso_volatile_load32(pi32);
+
+# else
+
     int32_t i32;
     …ASMAtomicUoReadS32: existing LDUR/LDREX inline-assembly path, unchanged…
     return i32;
+# endif

 #else
…
…
     Assert(!((uintptr_t)pu64 & 7));

-# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    u64 = (uint64_t)__load_acquire64(pu64);
+
+# else
+
+#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
     …ASMAtomicReadU64: existing DMB SY + LDUR/LDXR (LDREXD on ARM32) inline-assembly path, unchanged apart from preprocessor re-indentation…
+# endif
 # endif
 # else
…
 # elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
     Assert(!((uintptr_t)pu64 & 7));
+
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    u64 = (uint64_t)__iso_volatile_load64((volatile int64_t *)pu64);
+
+# else
+
     …ASMAtomicUoReadU64: existing LDUR/LDREXD inline-assembly path, unchanged…
+# endif

 # else
…
 {
 #if defined(RT_ARCH_ARM64)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    __dmb(_ARM64_BARRIER_SY);
+    __stlr8(pu8, u8);
+    __dmb(_ARM64_BARRIER_SY);
+# else
+
     /* The DMB SY will ensure ordering a la x86, the stlrb is probably overkill
        as all byte accesses are single-copy atomic, which I think suffices here. */
     …ASMAtomicWriteU8: existing DMB SY + STLRB inline-assembly path, unchanged apart from preprocessor re-indentation…
+# endif
+
 #else
     ASMAtomicXchgU8(pu8, u8);
…
 {
 #if defined(RT_ARCH_ARM64)
+
+# ifdef RT_INLINE_ASM_USES_INTRIN
+    __dmb(_ARM64_BARRIER_SY);
+    __stlr16(pu16, u16);
+    __dmb(_ARM64_BARRIER_SY);
+# else
+
     …ASMAtomicWriteU16: existing DMB SY + SWPH/STLRH inline-assembly path, unchanged apart from preprocessor re-indentation…
+# endif
+
 #else
     ASMAtomicXchgU16(pu16, u16);
…
*/4376 # if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */ 4065 4377 RTASM_ARM_DMB_SY 4066 4378 "swp %w[uValue], wzr, %[pMem]\n\t" 4067 # else4379 # else 4068 4380 RTASM_ARM_DMB_SY 4069 4381 "stlr %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */ 4070 # endif4382 # endif 4071 4383 : [pMem] "+Q" (*pu32) 4072 4384 : [uValue] "r" (u32) 4073 4385 : "cc"); 4386 # endif 4387 4074 4388 #else 4075 4389 ASMAtomicXchgU32(pu32, u32); … … 4137 4451 { 4138 4452 #if defined(RT_ARCH_ARM64) 4453 4454 # ifdef RT_INLINE_ASM_USES_INTRIN 4455 __dmb(_ARM64_BARRIER_SY); 4456 __stlr64(pu64, u64); 4457 __dmb(_ARM64_BARRIER_SY); 4458 # else 4459 4139 4460 __asm__ __volatile__("Lstart_ASMAtomicWriteU64_%=:\n\t" 4140 # if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */4461 # if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */ 4141 4462 RTASM_ARM_DMB_SY 4142 4463 "swp %[uValue], xzr, %[pMem]\n\t" 4143 # else4464 # else 4144 4465 RTASM_ARM_DMB_SY /** @todo necessary? */ 4145 4466 "stlr %[uValue], %[pMem]\n\t" 4146 # endif4467 # endif 4147 4468 : [pMem] "+Q" (*pu64) 4148 4469 : [uValue] "r" (u64) 4149 4470 : ); 4471 # endif 4472 4150 4473 #else 4151 4474 ASMAtomicXchgU64(pu64, u64); … … 4795 5118 DECLINLINE(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF 4796 5119 { 4797 # if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)5120 # if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)) 4798 5121 u64 = _InterlockedExchangeAdd64((__int64 RT_FAR *)pu64, u64); 4799 5122 return u64; … … 5156 5479 DECLINLINE(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF 5157 5480 { 5158 # if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)5481 # if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)) 5159 5482 return (uint64_t)_InterlockedIncrement64((__int64 RT_FAR *)pu64); 5160 5483 … … 5353 5676 DECLINLINE(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF 5354 5677 { 5355 # if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)5678 # if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)) 5356 5679 return (uint64_t)_InterlockedDecrement64((__int64 volatile RT_FAR *)pu64); 5357 5680 … … 5515 5838 DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF 5516 5839 { 5517 #if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32) 5840 #if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo This should work on amd64 as well I think... 
…
 DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
 {
-#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo This should work on amd64 as well I think... */
+    return (uint32_t)_InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);
+
+#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
 # if defined(RTASM_ARM64_USE_FEAT_LSE)
     uint32_t u32OldRet;
…
 DECLINLINE(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
 {
-# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
+# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
     _InterlockedOr64((__int64 volatile RT_FAR *)pu64, (__int64)u64);
…
 DECLINLINE(uint32_t) ASMAtomicAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
 {
-#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo This should work on amd64 as well I think... */
+    return (uint32_t)_InterlockedAnd((long volatile RT_FAR *)pu32, (long)u32);
+
+#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
 # if defined(RTASM_ARM64_USE_FEAT_LSE)
     uint32_t u32OldRet;
…
 DECLINLINE(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
 {
-# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
+# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
     _InterlockedAnd64((__int64 volatile RT_FAR *)pu64, u64);
…
 DECLINLINE(uint32_t) ASMAtomicXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
 {
-#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+# if RT_INLINE_ASM_USES_INTRIN
+    return (uint32_t)_InterlockedXor((long volatile RT_FAR *)pu32, u32);
+
+#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
 # if defined(RTASM_ARM64_USE_FEAT_LSE)
     uint32_t u32OldRet;
…
 DECLINLINE(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
 {
-# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+# if RT_INLINE_ASM_USES_INTRIN /** @todo This is too much... */
+    _InterlockedOr((long volatile RT_FAR *)pu32, u32);
+
+# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
 #  if RT_INLINE_ASM_GNU_STYLE
     __asm__ __volatile__("orl %1, %0\n\t"
…
 DECLINLINE(uint32_t) ASMAtomicUoOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
 {
-#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+#if RT_INLINE_ASM_USES_INTRIN /** @todo Check what the compiler generates... */
+    return (uint32_t)_InterlockedOr_nf((long volatile RT_FAR *)pu32, u32);
+
+#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
 # if defined(RTASM_ARM64_USE_FEAT_LSE)
     uint32_t u32OldRet;
…
…
 DECLINLINE(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
 {
-# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
+# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
+    _InterlockedOr64_nf((volatile int64_t *)pu64, (int64_t)u64);
+
+# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
     __asm__ __volatile__("orq %1, %q0\n\t"
…
 DECLINLINE(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
 {
-# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
+    _InterlockedAnd_nf((volatile long *)pu32, (long)u32);
+
+# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
 #  if RT_INLINE_ASM_GNU_STYLE
     __asm__ __volatile__("andl %1, %0\n\t"
…
 DECLINLINE(uint32_t) ASMAtomicUoAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
 {
-#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
+    return (uint32_t)_InterlockedAnd_nf((volatile long *)pu32, (long)u32);
+
+#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
 # if defined(RTASM_ARM64_USE_FEAT_LSE)
     uint32_t u32OldRet;
…
 DECLINLINE(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
 {
-# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
+# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
+    _InterlockedAnd64_nf((volatile int64_t *)pu64, (int64_t)u64);
+
+# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
     __asm__ __volatile__("andq %1, %0\n\t"
…
 DECLINLINE(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
 {
-# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
+    _InterlockedXor_nf((volatile long *)pu32, (long)u32);
+
+# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
 #  if RT_INLINE_ASM_GNU_STYLE
     __asm__ __volatile__("xorl %1, %0\n\t"
…
 DECLINLINE(uint32_t) ASMAtomicUoXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
 {
-#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
+#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
+    return (uint32_t)_InterlockedXor_nf((volatile long *)pu32, (long)u32);
+
+#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
 # if defined(RTASM_ARM64_USE_FEAT_LSE)
     uint32_t u32OldRet;
…
 DECLINLINE(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
 {
-# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
+    return _InterlockedIncrement_nf((volatile long *)pu32);
+
+# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
     uint32_t u32;
 #  if RT_INLINE_ASM_GNU_STYLE
…
 DECLINLINE(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
 {
-# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
+    return _InterlockedDecrement_nf((volatile long *)pu32);
+
+# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
     uint32_t u32;
 #  if RT_INLINE_ASM_GNU_STYLE
…
…
 DECLINLINE(uint64_t) ASMByteSwapU64(uint64_t u64) RT_NOTHROW_DEF
 {
-#if defined(RT_ARCH_AMD64) && RT_INLINE_ASM_USES_INTRIN
+#if (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)) && RT_INLINE_ASM_USES_INTRIN
     return _byteswap_uint64(u64);
…