Changeset 102941 in vbox for trunk/include
- Timestamp: Jan 18, 2024 12:15:14 AM
- svn:sync-xref-src-repo-rev: 161169
- File: 1 edited
trunk/include/iprt/asm.h
    r102940 → r102941

…
 # elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
     uint32_t uOld;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
+    /* SWPALB is ~40% more expensive than the non-LSE variant (M1), but since we
+       have the barrier we shouldn't need that, right? Ordering should be taken
+       care of by the DMB. The SWPB is rather cheap (~70% faster). */
+    __asm__ __volatile__("Lstart_ASMAtomicXchgU8_%=:\n\t"
+                         RTASM_ARM_DMB_SY
+                         "swpb      %w[uNew], %w[uOld], %[pMem]\n\t"
+                         : [pMem] "+Q" (*pu8)
+                         , [uOld] "=&r" (uOld)
+                         : [uNew] "r" ((uint32_t)u8)
+                           RTASM_ARM_DMB_SY_COMMA_IN_REG
+                         : );
+#  else
     uint32_t rcSpill;
     __asm__ __volatile__("Ltry_again_ASMAtomicXchgU8_%=:\n\t"
                          RTASM_ARM_DMB_SY
-#  if defined(RT_ARCH_ARM64)
+#   if defined(RT_ARCH_ARM64)
                          "ldaxrb    %w[uOld], %[pMem]\n\t"
                          "stlxrb    %w[rc], %w[uNew], %[pMem]\n\t"
                          "cbnz      %w[rc], Ltry_again_ASMAtomicXchgU8_%=\n\t"
-#  else
+#   else
                          "ldrexb    %[uOld], %[pMem]\n\t"   /* ARMv6+ */
                          "strexb    %[rc], %[uNew], %[pMem]\n\t"
                          "cmp       %[rc], #0\n\t"
                          "bne       Ltry_again_ASMAtomicXchgU8_%=\n\t"
-#  endif
+#   endif
                          : [pMem] "+Q" (*pu8)
                          , [uOld] "=&r" (uOld)
…
                            RTASM_ARM_DMB_SY_COMMA_IN_REG
                          : "cc");
+#  endif
     return (uint8_t)uOld;

…
 # elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
     uint32_t uOld;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
+    /* SWPALH is ~40% more expensive than the non-LSE variant on an M1, 20%
+       slower if we remove the barrier. But since we have the barrier we
+       shouldn't need that, right? Ordering should be taken care of by the DMB.
+       The SWPH is rather cheap (~70% faster). */
+    __asm__ __volatile__("Lstart_ASMAtomicXchgU16_%=:\n\t"
+                         RTASM_ARM_DMB_SY
+                         "swph      %w[uNew], %w[uOld], %[pMem]\n\t"
+                         : [pMem] "+Q" (*pu16)
+                         , [uOld] "=&r" (uOld)
+                         : [uNew] "r" ((uint32_t)u16)
+                           RTASM_ARM_DMB_SY_COMMA_IN_REG
+                         : );
+#  else
     uint32_t rcSpill;
     __asm__ __volatile__("Ltry_again_ASMAtomicXchgU16_%=:\n\t"
                          RTASM_ARM_DMB_SY
-#  if defined(RT_ARCH_ARM64)
+#   if defined(RT_ARCH_ARM64)
                          "ldaxrh    %w[uOld], %[pMem]\n\t"
                          "stlxrh    %w[rc], %w[uNew], %[pMem]\n\t"
                          "cbnz      %w[rc], Ltry_again_ASMAtomicXchgU16_%=\n\t"
-#  else
+#   else
                          "ldrexh    %[uOld], %[pMem]\n\t"   /* ARMv6+ */
                          "strexh    %[rc], %[uNew], %[pMem]\n\t"
                          "cmp       %[rc], #0\n\t"
                          "bne       Ltry_again_ASMAtomicXchgU16_%=\n\t"
-#  endif
+#   endif
                          : [pMem] "+Q" (*pu16)
                          , [uOld] "=&r" (uOld)
…
                            RTASM_ARM_DMB_SY_COMMA_IN_REG
                          : "cc");
+#  endif
     return (uint16_t)uOld;

…
 # elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
     uint32_t uOld;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
+    /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
+       slower if we remove the barrier. But since we have the barrier we
+       shouldn't need that, right? Ordering should be taken care of by the DMB.
+       The SWP is rather cheap (~70% faster). */
+    __asm__ __volatile__("Lstart_ASMAtomicXchgU32_%=:\n\t"
+                         RTASM_ARM_DMB_SY
+                         "swp       %w[uNew], %w[uOld], %[pMem]\n\t"
+                         : [pMem] "+Q" (*pu32)
+                         , [uOld] "=&r" (uOld)
+                         : [uNew] "r" (u32)
+                           RTASM_ARM_DMB_SY_COMMA_IN_REG
+                         : );
+#  else
     uint32_t rcSpill;
     __asm__ __volatile__("Ltry_again_ASMAtomicXchgU32_%=:\n\t"
                          RTASM_ARM_DMB_SY
-#  if defined(RT_ARCH_ARM64)
+#   if defined(RT_ARCH_ARM64)
                          "ldaxr     %w[uOld], %[pMem]\n\t"
                          "stlxr     %w[rc], %w[uNew], %[pMem]\n\t"
                          "cbnz      %w[rc], Ltry_again_ASMAtomicXchgU32_%=\n\t"
-#  else
+#   else
                          "ldrex     %[uOld], %[pMem]\n\t"   /* ARMv6+ */
                          "strex     %[rc], %[uNew], %[pMem]\n\t"
                          "cmp       %[rc], #0\n\t"
                          "bne       Ltry_again_ASMAtomicXchgU32_%=\n\t"
-#  endif
+#   endif
                          : [pMem] "+Q" (*pu32)
                          , [uOld] "=&r" (uOld)
…
                            RTASM_ARM_DMB_SY_COMMA_IN_REG
                          : "cc");
+#  endif
     return uOld;

…
 # elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
+    uint64_t uOld;
+#  if defined(RTASM_ARM64_USE_FEAT_LSE)
+    /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
+       slower if we remove the barrier. But since we have the barrier we
+       shouldn't need that, right? Ordering should be taken care of by the DMB.
+       The SWP is rather cheap (~70% faster). */
+    __asm__ __volatile__("Lstart_ASMAtomicXchgU64_%=:\n\t"
+                         RTASM_ARM_DMB_SY
+                         "swp       %[uNew], %[uOld], %[pMem]\n\t"
+                         : [pMem] "+Q" (*pu64)
+                         , [uOld] "=&r" (uOld)
+                         : [uNew] "r" (u64)
+                           RTASM_ARM_DMB_SY_COMMA_IN_REG
+                         : );
+#  else
     uint32_t rcSpill;
-    uint64_t uOld;
     __asm__ __volatile__("Ltry_again_ASMAtomicXchgU64_%=:\n\t"
                          RTASM_ARM_DMB_SY
-#  if defined(RT_ARCH_ARM64)
+#   if defined(RT_ARCH_ARM64)
                          "ldaxr     %[uOld], %[pMem]\n\t"
                          "stlxr     %w[rc], %[uNew], %[pMem]\n\t"
                          "cbnz      %w[rc], Ltry_again_ASMAtomicXchgU64_%=\n\t"
-#  else
+#   else
                          "ldrexd    %[uOld], %H[uOld], %[pMem]\n\t"   /* ARMv6+ */
                          "strexd    %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
                          "cmp       %[rc], #0\n\t"
                          "bne       Ltry_again_ASMAtomicXchgU64_%=\n\t"
-#  endif
+#   endif
                          : [pMem] "+Q" (*pu64)
                          , [uOld] "=&r" (uOld)
…
                            RTASM_ARM_DMB_SY_COMMA_IN_REG
                          : "cc");
+#  endif
     return uOld;
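For readers unfamiliar with the pattern, here is a minimal standalone sketch of the same idea outside the IPRT macros. It is not IPRT code: the function name MyAtomicXchgU32 is made up for illustration, and the ACLE macro __ARM_FEATURE_ATOMICS stands in for RTASM_ARM64_USE_FEAT_LSE. Unlike the committed code, which keeps its explicit RTASM_ARM_DMB_SY barrier and uses the plain SWP/SWPB/SWPH forms, the sketch relies on SWPAL's built-in acquire/release ordering.

#include <stdint.h>

/* Hypothetical helper, not part of IPRT: exchanges *pu32 with u32 and returns
   the previous value, using the AArch64 LSE SWPAL instruction when available. */
static inline uint32_t MyAtomicXchgU32(volatile uint32_t *pu32, uint32_t u32)
{
    uint32_t uOld;
#if defined(__aarch64__) && defined(__ARM_FEATURE_ATOMICS)
    /* LSE path: SWPAL both swaps and orders (acquire + release), so no
       separate barrier instruction is emitted here. */
    __asm__ __volatile__("swpal   %w[uNew], %w[uOld], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uOld] "=&r" (uOld)
                         : [uNew] "r" (u32)
                         : "memory");
#elif defined(__aarch64__)
    /* Fallback: classic load-exclusive/store-exclusive retry loop. */
    uint32_t rcSpill;
    __asm__ __volatile__("1:\n\t"
                         "ldaxr   %w[uOld], %[pMem]\n\t"
                         "stlxr   %w[rc], %w[uNew], %[pMem]\n\t"
                         "cbnz    %w[rc], 1b\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uOld] "=&r" (uOld)
                         , [rc] "=&r" (rcSpill)
                         : [uNew] "r" (u32)
                         : "memory");
#else
    /* Other architectures: let the compiler's builtin do the work. */
    uOld = __atomic_exchange_n(pu32, u32, __ATOMIC_SEQ_CST);
#endif
    return uOld;
}

Compiling the sketch with -march=armv8.1-a (or any target that implies FEAT_LSE) selects the single-instruction SWPAL path, while plain -march=armv8-a falls back to the LDAXR/STLXR retry loop that mirrors the pre-existing code path in the diff above.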