VirtualBox

source: vbox/trunk/include/iprt/asm.h@ 106546

Last change on this file since 106546 was 106546, checked in by vboxsync, 4 months ago

include/iprt/asm.h: Make it build on win.arm64, bugref:10392 [attempted build fix]

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 297.9 KB
Line 
1/** @file
2 * IPRT - Assembly Functions.
3 */
4
5/*
6 * Copyright (C) 2006-2024 Oracle and/or its affiliates.
7 *
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
14 * License.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
23 *
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
29 *
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
32 *
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34 */
35
36#ifndef IPRT_INCLUDED_asm_h
37#define IPRT_INCLUDED_asm_h
38#ifndef RT_WITHOUT_PRAGMA_ONCE
39# pragma once
40#endif
41
42#include <iprt/cdefs.h>
43#include <iprt/types.h>
44#include <iprt/assert.h>
45/** @def RT_INLINE_ASM_USES_INTRIN
46 * Defined as 1 if we're using a _MSC_VER 1400.
47 * Otherwise defined as 0.
48 */
49
50/* Solaris 10 header ugliness */
51#ifdef u
52# undef u
53#endif
54
55#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
56/* Emit the intrinsics at all optimization levels. */
57# include <iprt/sanitized/intrin.h>
58# pragma intrinsic(_ReadWriteBarrier)
59# if defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
60# pragma intrinsic(__cpuid)
61# pragma intrinsic(__stosd)
62# pragma intrinsic(__stosw)
63# pragma intrinsic(__stosb)
64# ifdef RT_ARCH_AMD64
65# pragma intrinsic(__stosq)
66# pragma intrinsic(_byteswap_uint64)
67# pragma intrinsic(_InterlockedCompareExchange128)
68# pragma intrinsic(_InterlockedExchange64)
69# pragma intrinsic(_InterlockedExchangeAdd64)
70# pragma intrinsic(_InterlockedAnd64)
71# pragma intrinsic(_InterlockedOr64)
72# pragma intrinsic(_InterlockedIncrement64)
73# pragma intrinsic(_InterlockedDecrement64)
74# endif
75# elif defined(RT_ARCH_ARM64)
76# pragma intrinsic(__break)
77# pragma intrinsic(__dmb)
78# pragma intrinsic(__dsb)
79# pragma intrinsic(__isb)
80# pragma intrinsic(__nop)
81# pragma intrinsic(__yield)
82# pragma intrinsic(__swp8)
83# pragma intrinsic(__swpa8)
84# pragma intrinsic(__swpal8)
85# pragma intrinsic(__swp16)
86# pragma intrinsic(__swpa16)
87# pragma intrinsic(__swpal16)
88# pragma intrinsic(__swp32)
89# pragma intrinsic(__swpa32)
90# pragma intrinsic(__swpal32)
91# pragma intrinsic(__swp64)
92# pragma intrinsic(__swpa64)
93# pragma intrinsic(__swpal64)
94# pragma intrinsic(__cas8)
95# pragma intrinsic(__casl8)
96# pragma intrinsic(__cas16)
97# pragma intrinsic(__casl16)
98# pragma intrinsic(__cas32)
99# pragma intrinsic(__casl32)
100# pragma intrinsic(__cas64)
101# pragma intrinsic(__casl64)
102# pragma intrinsic(__casa8)
103# pragma intrinsic(__casal8)
104# pragma intrinsic(__casa16)
105# pragma intrinsic(__casa64)
106# pragma intrinsic(__iso_volatile_load8)
107# pragma intrinsic(__iso_volatile_load16)
108# pragma intrinsic(__iso_volatile_load32)
109# pragma intrinsic(__iso_volatile_load64)
110# pragma intrinsic(__iso_volatile_store8)
111# pragma intrinsic(__iso_volatile_store16)
112# pragma intrinsic(__iso_volatile_store32)
113# pragma intrinsic(__iso_volatile_store64)
114# pragma intrinsic(__load_acquire8)
115# pragma intrinsic(__load_acquire16)
116# pragma intrinsic(__load_acquire32)
117# pragma intrinsic(__load_acquire64)
118# pragma intrinsic(__stlr8)
119# pragma intrinsic(__stlr16)
120# pragma intrinsic(__stlr32)
121# pragma intrinsic(__stlr64)
122# else
123# error "Port me"
124# endif
125# pragma intrinsic(_BitScanForward)
126# pragma intrinsic(_BitScanReverse)
127# pragma intrinsic(_bittest)
128# pragma intrinsic(_bittestandset)
129# pragma intrinsic(_bittestandreset)
130# pragma intrinsic(_bittestandcomplement)
131# pragma intrinsic(_byteswap_ushort)
132# pragma intrinsic(_byteswap_ulong)
133# pragma intrinsic(_interlockedbittestandset)
134# pragma intrinsic(_interlockedbittestandreset)
135# pragma intrinsic(_InterlockedAnd)
136# pragma intrinsic(_InterlockedOr)
137# pragma intrinsic(_InterlockedXor)
138# pragma intrinsic(_InterlockedIncrement)
139# pragma intrinsic(_InterlockedDecrement)
140# pragma intrinsic(_InterlockedExchange)
141# pragma intrinsic(_InterlockedExchangeAdd)
142# pragma intrinsic(_InterlockedCompareExchange)
143# pragma intrinsic(_InterlockedCompareExchange8)
144# pragma intrinsic(_InterlockedCompareExchange16)
145# pragma intrinsic(_InterlockedCompareExchange64)
146# pragma intrinsic(_rotl)
147# pragma intrinsic(_rotr)
148# pragma intrinsic(_rotl64)
149# pragma intrinsic(_rotr64)
150#endif
151
152#if (defined(RT_ARCH_ARM64) && (defined(RT_OS_DARWIN) || defined(RT_OS_WINDOWS))) || defined(DOXYGEN_RUNNING)
153/** @def RTASM_ARM64_USE_FEAT_LSE
154 * Use instructions from the FEAT_LSE set to implement atomic operations,
155 * assuming that the host CPU always supports these. */
156# define RTASM_ARM64_USE_FEAT_LSE 1
157/** @def RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB
158 * Set to use DMB w/o barrier in most places and rely on the acquire-release
159 * aspects to do the serializing. The assumption is that the tstRTInline
160 * benchmark may be skewing the results testing an unusual scenario. */
161# define RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB 1
162#endif
163
164
165/*
166 * Undefine all symbols we have Watcom C/C++ #pragma aux'es for.
167 */
168#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
169# include "asm-watcom-x86-16.h"
170#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
171# include "asm-watcom-x86-32.h"
172#endif
173
174
175/** @defgroup grp_rt_asm ASM - Assembly Routines
176 * @ingroup grp_rt
177 *
178 * @remarks The difference between ordered and unordered atomic operations are
179 * that the former will complete outstanding reads and writes before
180 * continuing while the latter doesn't make any promises about the
181 * order. Ordered operations doesn't, it seems, make any 100% promise
182 * wrt to whether the operation will complete before any subsequent
183 * memory access. (please, correct if wrong.)
184 *
185 * ASMAtomicSomething operations are all ordered, while
186 * ASMAtomicUoSomething are unordered (note the Uo).
187 *
188 * Please note that ordered operations does not necessarily imply a
189 * compiler (memory) barrier. The user has to use the
190 * ASMCompilerBarrier() macro when that is deemed necessary.
191 *
192 * @remarks Some remarks about __volatile__: Without this keyword gcc is allowed
193 * to reorder or even optimize assembler instructions away. For
194 * instance, in the following code the second rdmsr instruction is
195 * optimized away because gcc treats that instruction as deterministic:
196 *
197 * @code
198 * static inline uint64_t rdmsr_low(int idx)
199 * {
200 * uint32_t low;
201 * __asm__ ("rdmsr" : "=a"(low) : "c"(idx) : "edx");
202 * }
203 * ...
204 * uint32_t msr1 = rdmsr_low(1);
205 * foo(msr1);
206 * msr1 = rdmsr_low(1);
207 * bar(msr1);
208 * @endcode
209 *
210 * The input parameter of rdmsr_low is the same for both calls and
211 * therefore gcc will use the result of the first call as input
212 * parameter for bar() as well. For rdmsr this is not acceptable as
213 * this instruction is _not_ deterministic. This applies to reading
214 * machine status information in general.
215 *
216 * @{
217 */
218
219
220/** @def RT_INLINE_ASM_GCC_4_3_X_X86
221 * Used to work around some 4.3.x register allocation issues in this version of
222 * the compiler. So far this workaround is still required for 4.4 and 4.5 but
223 * definitely not for 5.x */
224#if (RT_GNUC_PREREQ(4, 3) && !RT_GNUC_PREREQ(5, 0) && defined(__i386__))
225# define RT_INLINE_ASM_GCC_4_3_X_X86 1
226#else
227# define RT_INLINE_ASM_GCC_4_3_X_X86 0
228#endif
229
230/** @def RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
231 * i686-apple-darwin9-gcc-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5493) screws up
232 * RTSemRWRequestWrite semsemrw-lockless-generic.cpp in release builds. PIC
233 * mode, x86.
234 *
235 * Some gcc 4.3.x versions may have register allocation issues with cmpxchg8b
236 * when in PIC mode on x86.
237 */
238#ifndef RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
239# if defined(DOXYGEN_RUNNING) || defined(__WATCOMC__) /* Watcom has trouble with the expression below */
240# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
241# elif defined(_MSC_VER) /* Visual C++ has trouble too, but it'll only tell us when C4688 is enabled. */
242# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
243# elif ( (defined(PIC) || defined(__PIC__)) \
244 && defined(RT_ARCH_X86) \
245 && ( RT_INLINE_ASM_GCC_4_3_X_X86 \
246 || defined(RT_OS_DARWIN)) )
247# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
248# else
249# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
250# endif
251#endif
252
253
254/*
255 * ARM is great fun.
256 */
257#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
258
259# define RTASM_ARM_NO_BARRIER
260# ifdef RT_ARCH_ARM64
261# define RTASM_ARM_NO_BARRIER_IN_REG
262# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
263# define RTASM_ARM_DSB_SY "dsb sy\n\t"
264# define RTASM_ARM_DSB_SY_IN_REG
265# define RTASM_ARM_DSB_SY_COMMA_IN_REG
266# define RTASM_ARM_DMB_SY "dmb sy\n\t"
267# define RTASM_ARM_DMB_SY_IN_REG
268# define RTASM_ARM_DMB_SY_COMMA_IN_REG
269# define RTASM_ARM_DMB_ST "dmb st\n\t"
270# define RTASM_ARM_DMB_ST_IN_REG
271# define RTASM_ARM_DMB_ST_COMMA_IN_REG
272# define RTASM_ARM_DMB_LD "dmb ld\n\t"
273# define RTASM_ARM_DMB_LD_IN_REG
274# define RTASM_ARM_DMB_LD_COMMA_IN_REG
275# define RTASM_ARM_PICK_6432(expr64, expr32) expr64
276# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
277 uint32_t rcSpill; \
278 uint32_t u32NewRet; \
279 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
280 RTASM_ARM_##barrier_type /* before lable? */ \
281 "ldaxr %w[uNew], %[pMem]\n\t" \
282 modify64 \
283 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
284 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
285 : [pMem] "+Q" (*a_pu32Mem) \
286 , [uNew] "=&r" (u32NewRet) \
287 , [rc] "=&r" (rcSpill) \
288 : in_reg \
289 : "cc")
290# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
291 uint32_t rcSpill; \
292 uint32_t u32OldRet; \
293 uint32_t u32NewSpill; \
294 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
295 RTASM_ARM_##barrier_type /* before lable? */ \
296 "ldaxr %w[uOld], %[pMem]\n\t" \
297 modify64 \
298 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
299 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
300 : [pMem] "+Q" (*a_pu32Mem) \
301 , [uOld] "=&r" (u32OldRet) \
302 , [uNew] "=&r" (u32NewSpill) \
303 , [rc] "=&r" (rcSpill) \
304 : in_reg \
305 : "cc")
306# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
307 uint32_t rcSpill; \
308 uint64_t u64NewRet; \
309 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
310 RTASM_ARM_##barrier_type /* before lable? */ \
311 "ldaxr %[uNew], %[pMem]\n\t" \
312 modify64 \
313 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
314 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
315 : [pMem] "+Q" (*a_pu64Mem) \
316 , [uNew] "=&r" (u64NewRet) \
317 , [rc] "=&r" (rcSpill) \
318 : in_reg \
319 : "cc")
320# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
321 uint32_t rcSpill; \
322 uint64_t u64OldRet; \
323 uint64_t u64NewSpill; \
324 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
325 RTASM_ARM_##barrier_type /* before lable? */ \
326 "ldaxr %[uOld], %[pMem]\n\t" \
327 modify64 \
328 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
329 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
330 : [pMem] "+Q" (*a_pu64Mem) \
331 , [uOld] "=&r" (u64OldRet) \
332 , [uNew] "=&r" (u64NewSpill) \
333 , [rc] "=&r" (rcSpill) \
334 : in_reg \
335 : "cc")
336
337# else /* RT_ARCH_ARM32 */
338# define RTASM_ARM_PICK_6432(expr64, expr32) expr32
339# if RT_ARCH_ARM32 >= 7
340# warning armv7
341# define RTASM_ARM_NO_BARRIER_IN_REG
342# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
343# define RTASM_ARM_DSB_SY "dsb sy\n\t"
344# define RTASM_ARM_DSB_SY_IN_REG "X" (0xfade)
345# define RTASM_ARM_DMB_SY "dmb sy\n\t"
346# define RTASM_ARM_DMB_SY_IN_REG "X" (0xfade)
347# define RTASM_ARM_DMB_ST "dmb st\n\t"
348# define RTASM_ARM_DMB_ST_IN_REG "X" (0xfade)
349# define RTASM_ARM_DMB_LD "dmb ld\n\t"
350# define RTASM_ARM_DMB_LD_IN_REG "X" (0xfade)
351
352# elif RT_ARCH_ARM32 >= 6
353# warning armv6
354# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
355# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
356# define RTASM_ARM_DMB_SY "mcr p15, 0, %[uZero], c7, c10, 5\n\t"
357# define RTASM_ARM_DMB_SY_IN_REG [uZero] "r" (0)
358# define RTASM_ARM_DMB_ST RTASM_ARM_DMB_SY
359# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DMB_SY_IN_REG
360# define RTASM_ARM_DMB_LD RTASM_ARM_DMB_SY
361# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DMB_SY_IN_REG
362
363# elif RT_ARCH_ARM32 >= 4
364# warning armv5 or older
365# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
366# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
367# define RTASM_ARM_DMB_SY RTASM_ARM_DSB_SY
368# define RTASM_ARM_DMB_SY_IN_REG RTASM_ARM_DSB_SY_IN_REG
369# define RTASM_ARM_DMB_ST RTASM_ARM_DSB_SY
370# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DSB_SY_IN_REG
371# define RTASM_ARM_DMB_LD RTASM_ARM_DSB_SY
372# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DSB_SY_IN_REG
373# else
374# error "huh? Odd RT_ARCH_ARM32 value!"
375# endif
376# define RTASM_ARM_DSB_SY_COMMA_IN_REG , RTASM_ARM_DSB_SY_IN_REG
377# define RTASM_ARM_DMB_SY_COMMA_IN_REG , RTASM_ARM_DMB_SY_IN_REG
378# define RTASM_ARM_DMB_ST_COMMA_IN_REG , RTASM_ARM_DMB_ST_IN_REG
379# define RTASM_ARM_DMB_LD_COMMA_IN_REG , RTASM_ARM_DMB_LD_IN_REG
380# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
381 uint32_t rcSpill; \
382 uint32_t u32NewRet; \
383 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
384 RT_CONCAT(RTASM_ARM_,barrier_type) /* before lable? */ \
385 "ldrex %[uNew], %[pMem]\n\t" \
386 modify32 \
387 "strex %[rc], %[uNew], %[pMem]\n\t" \
388 "cmp %[rc], #0\n\t" \
389 "bne Ltry_again_" #name "_%=\n\t" \
390 : [pMem] "+m" (*a_pu32Mem) \
391 , [uNew] "=&r" (u32NewRet) \
392 , [rc] "=&r" (rcSpill) \
393 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
394 , in_reg \
395 : "cc")
396# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
397 uint32_t rcSpill; \
398 uint32_t u32OldRet; \
399 uint32_t u32NewSpill; \
400 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
401 RT_CONCAT(RTASM_ARM_,barrier_type) /* before lable? */ \
402 "ldrex %[uOld], %[pMem]\n\t" \
403 modify32 \
404 "strex %[rc], %[uNew], %[pMem]\n\t" \
405 "cmp %[rc], #0\n\t" \
406 "bne Ltry_again_" #name "_%=\n\t" \
407 : [pMem] "+m" (*a_pu32Mem) \
408 , [uOld] "=&r" (u32OldRet) \
409 , [uNew] "=&r" (u32NewSpill) \
410 , [rc] "=&r" (rcSpill) \
411 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
412 , in_reg \
413 : "cc")
414# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
415 uint32_t rcSpill; \
416 uint64_t u64NewRet; \
417 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
418 RT_CONCAT(RTASM_ARM_,barrier_type) /* before lable? */ \
419 "ldrexd %[uNew], %H[uNew], %[pMem]\n\t" \
420 modify32 \
421 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
422 "cmp %[rc], #0\n\t" \
423 "bne Ltry_again_" #name "_%=\n\t" \
424 : [pMem] "+m" (*a_pu64Mem), \
425 [uNew] "=&r" (u64NewRet), \
426 [rc] "=&r" (rcSpill) \
427 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
428 , in_reg \
429 : "cc")
430# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
431 uint32_t rcSpill; \
432 uint64_t u64OldRet; \
433 uint64_t u64NewSpill; \
434 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
435 RT_CONCAT(RTASM_ARM_,barrier_type) /* before lable? */ \
436 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" \
437 modify32 \
438 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
439 "cmp %[rc], #0\n\t" \
440 "bne Ltry_again_" #name "_%=\n\t" \
441 : [pMem] "+m" (*a_pu64Mem), \
442 [uOld] "=&r" (u64OldRet), \
443 [uNew] "=&r" (u64NewSpill), \
444 [rc] "=&r" (rcSpill) \
445 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
446 , in_reg \
447 : "cc")
448# endif /* RT_ARCH_ARM32 */
449#endif
450
451
452/** @def ASMReturnAddress
453 * Gets the return address of the current (or calling if you like) function or method.
454 */
455#ifdef _MSC_VER
456# ifdef __cplusplus
457extern "C"
458# endif
459void * _ReturnAddress(void);
460# pragma intrinsic(_ReturnAddress)
461# define ASMReturnAddress() _ReturnAddress()
462#elif defined(__GNUC__) || defined(DOXYGEN_RUNNING)
463# define ASMReturnAddress() __builtin_return_address(0)
464#elif defined(__WATCOMC__)
465# define ASMReturnAddress() Watcom_does_not_appear_to_have_intrinsic_return_address_function()
466#else
467# error "Unsupported compiler."
468#endif
469
470
471/**
472 * Compiler memory barrier.
473 *
474 * Ensure that the compiler does not use any cached (register/tmp stack) memory
475 * values or any outstanding writes when returning from this function.
476 *
477 * This function must be used if non-volatile data is modified by a
478 * device or the VMM. Typical cases are port access, MMIO access,
479 * trapping instruction, etc.
480 */
481#if RT_INLINE_ASM_GNU_STYLE
482# define ASMCompilerBarrier() do { __asm__ __volatile__("" : : : "memory"); } while (0)
483#elif RT_INLINE_ASM_USES_INTRIN
484# define ASMCompilerBarrier() do { _ReadWriteBarrier(); } while (0)
485#elif defined(__WATCOMC__)
486void ASMCompilerBarrier(void);
487#else /* 2003 should have _ReadWriteBarrier() but I guess we're at 2002 level then... */
488DECLINLINE(void) ASMCompilerBarrier(void) RT_NOTHROW_DEF
489{
490 __asm
491 {
492 }
493}
494#endif
495
496
497/** @def ASMBreakpoint
498 * Debugger Breakpoint.
499 * @deprecated Use RT_BREAKPOINT instead.
500 * @internal
501 */
502#define ASMBreakpoint() RT_BREAKPOINT()
503
504
505/**
506 * Spinloop hint for platforms that have these, empty function on the other
507 * platforms.
508 *
509 * x86 & AMD64: The PAUSE variant of NOP for helping hyperthreaded CPUs detecting
510 * spin locks.
511 */
512#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
513RT_ASM_DECL_PRAGMA_WATCOM(void) ASMNopPause(void) RT_NOTHROW_PROTO;
514#else
515DECLINLINE(void) ASMNopPause(void) RT_NOTHROW_DEF
516{
517# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
518# if RT_INLINE_ASM_GNU_STYLE
519 __asm__ __volatile__(".byte 0xf3,0x90\n\t");
520# else
521 __asm {
522 _emit 0f3h
523 _emit 090h
524 }
525# endif
526
527# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
528
529# if RT_INLINE_ASM_USES_INTRIN
530 __yield();
531# else
532 __asm__ __volatile__("yield\n\t"); /* ARMv6K+ */
533# endif
534
535# else
536 /* dummy */
537# endif
538}
539#endif
540
541
542/**
543 * Atomically Exchange an unsigned 8-bit value, ordered.
544 *
545 * @returns Current *pu8 value
546 * @param pu8 Pointer to the 8-bit variable to update.
547 * @param u8 The 8-bit value to assign to *pu8.
548 */
549#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
550RT_ASM_DECL_PRAGMA_WATCOM(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_PROTO;
551#else
552DECLINLINE(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
553{
554# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
555# if RT_INLINE_ASM_GNU_STYLE
556 __asm__ __volatile__("xchgb %0, %1\n\t"
557 : "=m" (*pu8)
558 , "=q" (u8) /* =r - busted on g++ (GCC) 3.4.4 20050721 (Red Hat 3.4.4-2) */
559 : "1" (u8)
560 , "m" (*pu8));
561# else
562 __asm
563 {
564# ifdef RT_ARCH_AMD64
565 mov rdx, [pu8]
566 mov al, [u8]
567 xchg [rdx], al
568 mov [u8], al
569# else
570 mov edx, [pu8]
571 mov al, [u8]
572 xchg [edx], al
573 mov [u8], al
574# endif
575 }
576# endif
577 return u8;
578
579# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
580
581# if RT_INLINE_ASM_USES_INTRIN
582 uint8_t uOld;
583# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
584 uOld = __swpal8(pu8, u8);
585# else
586 uOld = __swp8(pu8, u8);
587 __dmb(_ARM64_BARRIER_SY);
588# endif
589 return uOld;
590
591# else
592 uint32_t uOld;
593# if defined(RTASM_ARM64_USE_FEAT_LSE)
594 /* SWPALB is ~40% more expensive than the non-LSE variant (M1), but since we
595 have the barrier we shouldn't need that, right? Ordering should be taken
596 care of by the DMB. The SWPB is rather cheap (~70% faster). */
597 __asm__ __volatile__("Lstart_ASMAtomicXchgU8_%=:\n\t"
598# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
599 "swpalb %w[uNew], %w[uOld], %[pMem]\n\t"
600# else
601 RTASM_ARM_DMB_SY
602 "swpb %w[uNew], %w[uOld], %[pMem]\n\t"
603# endif
604 : [pMem] "+Q" (*pu8)
605 , [uOld] "=&r" (uOld)
606 : [uNew] "r" ((uint32_t)u8)
607 : );
608# else
609 uint32_t rcSpill;
610 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU8_%=:\n\t"
611 RTASM_ARM_DMB_SY
612# if defined(RT_ARCH_ARM64)
613 "ldaxrb %w[uOld], %[pMem]\n\t"
614 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
615 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU8_%=\n\t"
616# else
617 "ldrexb %[uOld], %[pMem]\n\t" /* ARMv6+ */
618 "strexb %[rc], %[uNew], %[pMem]\n\t"
619 "cmp %[rc], #0\n\t"
620 "bne Ltry_again_ASMAtomicXchgU8_%=\n\t"
621# endif
622 : [pMem] "+Q" (*pu8)
623 , [uOld] "=&r" (uOld)
624 , [rc] "=&r" (rcSpill)
625 : [uNew] "r" ((uint32_t)u8)
626 RTASM_ARM_DMB_SY_COMMA_IN_REG
627 : "cc");
628# endif
629 return (uint8_t)uOld;
630# endif
631
632# else
633# error "Port me"
634# endif
635}
636#endif
637
638
639/**
640 * Atomically Exchange a signed 8-bit value, ordered.
641 *
642 * @returns Current *pu8 value
643 * @param pi8 Pointer to the 8-bit variable to update.
644 * @param i8 The 8-bit value to assign to *pi8.
645 */
646DECLINLINE(int8_t) ASMAtomicXchgS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
647{
648 return (int8_t)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
649}
650
651
652/**
653 * Atomically Exchange a bool value, ordered.
654 *
655 * @returns Current *pf value
656 * @param pf Pointer to the 8-bit variable to update.
657 * @param f The 8-bit value to assign to *pi8.
658 */
659DECLINLINE(bool) ASMAtomicXchgBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
660{
661#ifdef _MSC_VER
662 return !!ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
663#else
664 return (bool)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
665#endif
666}
667
668
669/**
670 * Atomically Exchange an unsigned 16-bit value, ordered.
671 *
672 * @returns Current *pu16 value
673 * @param pu16 Pointer to the 16-bit variable to update.
674 * @param u16 The 16-bit value to assign to *pu16.
675 */
676#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
677RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_PROTO;
678#else
679DECLINLINE(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
680{
681# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
682# if RT_INLINE_ASM_GNU_STYLE
683 __asm__ __volatile__("xchgw %0, %1\n\t"
684 : "=m" (*pu16)
685 , "=r" (u16)
686 : "1" (u16)
687 , "m" (*pu16));
688# else
689 __asm
690 {
691# ifdef RT_ARCH_AMD64
692 mov rdx, [pu16]
693 mov ax, [u16]
694 xchg [rdx], ax
695 mov [u16], ax
696# else
697 mov edx, [pu16]
698 mov ax, [u16]
699 xchg [edx], ax
700 mov [u16], ax
701# endif
702 }
703# endif
704 return u16;
705
706# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
707
708# if RT_INLINE_ASM_USES_INTRIN
709 uint16_t uOld;
710# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
711 uOld = __swpal16(pu16, u16);
712# else
713 uOld = __swp16(pu16, u16);
714 __dmb(_ARM64_BARRIER_SY);
715# endif
716 return uOld;
717
718# else
719 uint32_t uOld;
720# if defined(RTASM_ARM64_USE_FEAT_LSE)
721 /* SWPALH is ~40% more expensive than the non-LSE variant on an M1, 20%
722 slower if we remove the barrier. But since we have the barrier we
723 shouldn't need that, right? Ordering should be taken care of by the DMB.
724 The SWPH is rather cheap (~70% faster). */
725 __asm__ __volatile__("Lstart_ASMAtomicXchgU16_%=:\n\t"
726# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
727 "swpalh %w[uNew], %w[uOld], %[pMem]\n\t"
728# else
729 RTASM_ARM_DMB_SY
730 "swph %w[uNew], %w[uOld], %[pMem]\n\t"
731# endif
732 : [pMem] "+Q" (*pu16)
733 , [uOld] "=&r" (uOld)
734 : [uNew] "r" ((uint32_t)u16)
735 : );
736# else
737 uint32_t rcSpill;
738 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU16_%=:\n\t"
739 RTASM_ARM_DMB_SY
740# if defined(RT_ARCH_ARM64)
741 "ldaxrh %w[uOld], %[pMem]\n\t"
742 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
743 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU16_%=\n\t"
744# else
745 "ldrexh %[uOld], %[pMem]\n\t" /* ARMv6+ */
746 "strexh %[rc], %[uNew], %[pMem]\n\t"
747 "cmp %[rc], #0\n\t"
748 "bne Ltry_again_ASMAtomicXchgU16_%=\n\t"
749# endif
750 : [pMem] "+Q" (*pu16)
751 , [uOld] "=&r" (uOld)
752 , [rc] "=&r" (rcSpill)
753 : [uNew] "r" ((uint32_t)u16)
754 RTASM_ARM_DMB_SY_COMMA_IN_REG
755 : "cc");
756# endif
757 return (uint16_t)uOld;
758#endif
759
760# else
761# error "Port me"
762# endif
763}
764#endif
765
766
767/**
768 * Atomically Exchange a signed 16-bit value, ordered.
769 *
770 * @returns Current *pu16 value
771 * @param pi16 Pointer to the 16-bit variable to update.
772 * @param i16 The 16-bit value to assign to *pi16.
773 */
774DECLINLINE(int16_t) ASMAtomicXchgS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
775{
776 return (int16_t)ASMAtomicXchgU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
777}
778
779
780/**
781 * Atomically Exchange an unsigned 32-bit value, ordered.
782 *
783 * @returns Current *pu32 value
784 * @param pu32 Pointer to the 32-bit variable to update.
785 * @param u32 The 32-bit value to assign to *pu32.
786 *
787 * @remarks Does not work on 286 and earlier.
788 */
789#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
790RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
791#else
792DECLINLINE(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
793{
794# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
795# if RT_INLINE_ASM_GNU_STYLE
796 __asm__ __volatile__("xchgl %0, %1\n\t"
797 : "=m" (*pu32) /** @todo r=bird: +m rather than =m here? */
798 , "=r" (u32)
799 : "1" (u32)
800 , "m" (*pu32));
801
802# elif RT_INLINE_ASM_USES_INTRIN
803 u32 = _InterlockedExchange((long RT_FAR *)pu32, u32);
804
805# else
806 __asm
807 {
808# ifdef RT_ARCH_AMD64
809 mov rdx, [pu32]
810 mov eax, u32
811 xchg [rdx], eax
812 mov [u32], eax
813# else
814 mov edx, [pu32]
815 mov eax, u32
816 xchg [edx], eax
817 mov [u32], eax
818# endif
819 }
820# endif
821 return u32;
822
823# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
824
825# if RT_INLINE_ASM_USES_INTRIN
826 uint32_t uOld;
827# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
828 uOld = __swpal32(pu32, u32);
829# else
830 uOld = __swp32(pu32, u32);
831 __dmb(_ARM64_BARRIER_SY);
832# endif
833 return uOld;
834
835# else
836 uint32_t uOld;
837# if defined(RTASM_ARM64_USE_FEAT_LSE)
838 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
839 slower if we remove the barrier. But since we have the barrier we
840 shouldn't need that, right? Ordering should be taken care of by the DMB.
841 The SWP is rather cheap (~70% faster). */
842 __asm__ __volatile__("Lstart_ASMAtomicXchgU32_%=:\n\t"
843# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
844 "swpal %w[uNew], %w[uOld], %[pMem]\n\t"
845# else
846 RTASM_ARM_DMB_SY
847 "swp %w[uNew], %w[uOld], %[pMem]\n\t"
848# endif
849 : [pMem] "+Q" (*pu32)
850 , [uOld] "=&r" (uOld)
851 : [uNew] "r" (u32)
852 : );
853# else
854 uint32_t rcSpill;
855 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU32_%=:\n\t"
856 RTASM_ARM_DMB_SY
857# if defined(RT_ARCH_ARM64)
858 "ldaxr %w[uOld], %[pMem]\n\t"
859 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
860 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU32_%=\n\t"
861# else
862 "ldrex %[uOld], %[pMem]\n\t" /* ARMv6+ */
863 "strex %[rc], %[uNew], %[pMem]\n\t"
864 "cmp %[rc], #0\n\t"
865 "bne Ltry_again_ASMAtomicXchgU32_%=\n\t"
866# endif
867 : [pMem] "+Q" (*pu32)
868 , [uOld] "=&r" (uOld)
869 , [rc] "=&r" (rcSpill)
870 : [uNew] "r" (u32)
871 RTASM_ARM_DMB_SY_COMMA_IN_REG
872 : "cc");
873# endif
874 return uOld;
875# endif
876
877# else
878# error "Port me"
879# endif
880}
881#endif
882
883
884/**
885 * Atomically Exchange a signed 32-bit value, ordered.
886 *
887 * @returns Current *pu32 value
888 * @param pi32 Pointer to the 32-bit variable to update.
889 * @param i32 The 32-bit value to assign to *pi32.
890 */
891DECLINLINE(int32_t) ASMAtomicXchgS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
892{
893 return (int32_t)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
894}
895
896
897/**
898 * Atomically Exchange an unsigned 64-bit value, ordered.
899 *
900 * @returns Current *pu64 value
901 * @param pu64 Pointer to the 64-bit variable to update.
902 * @param u64 The 64-bit value to assign to *pu64.
903 *
904 * @remarks Works on 32-bit x86 CPUs starting with Pentium.
905 */
906#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
907 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
908RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
909#else
910DECLINLINE(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
911{
912# if defined(RT_ARCH_AMD64)
913# if RT_INLINE_ASM_USES_INTRIN
914 return _InterlockedExchange64((__int64 *)pu64, u64);
915
916# elif RT_INLINE_ASM_GNU_STYLE
917 __asm__ __volatile__("xchgq %0, %1\n\t"
918 : "=m" (*pu64)
919 , "=r" (u64)
920 : "1" (u64)
921 , "m" (*pu64));
922 return u64;
923# else
924 __asm
925 {
926 mov rdx, [pu64]
927 mov rax, [u64]
928 xchg [rdx], rax
929 mov [u64], rax
930 }
931 return u64;
932# endif
933
934# elif defined(RT_ARCH_X86)
935# if RT_INLINE_ASM_GNU_STYLE
936# if defined(PIC) || defined(__PIC__)
937 uint32_t u32EBX = (uint32_t)u64;
938 __asm__ __volatile__(/*"xchgl %%esi, %5\n\t"*/
939 "xchgl %%ebx, %3\n\t"
940 "1:\n\t"
941 "lock; cmpxchg8b (%5)\n\t"
942 "jnz 1b\n\t"
943 "movl %3, %%ebx\n\t"
944 /*"xchgl %%esi, %5\n\t"*/
945 : "=A" (u64)
946 , "=m" (*pu64)
947 : "0" (*pu64)
948 , "m" ( u32EBX )
949 , "c" ( (uint32_t)(u64 >> 32) )
950 , "S" (pu64)
951 : "cc");
952# else /* !PIC */
953 __asm__ __volatile__("1:\n\t"
954 "lock; cmpxchg8b %1\n\t"
955 "jnz 1b\n\t"
956 : "=A" (u64)
957 , "=m" (*pu64)
958 : "0" (*pu64)
959 , "b" ( (uint32_t)u64 )
960 , "c" ( (uint32_t)(u64 >> 32) )
961 : "cc");
962# endif
963# else
964 __asm
965 {
966 mov ebx, dword ptr [u64]
967 mov ecx, dword ptr [u64 + 4]
968 mov edi, pu64
969 mov eax, dword ptr [edi]
970 mov edx, dword ptr [edi + 4]
971 retry:
972 lock cmpxchg8b [edi]
973 jnz retry
974 mov dword ptr [u64], eax
975 mov dword ptr [u64 + 4], edx
976 }
977# endif
978 return u64;
979
980# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
981
982# if RT_INLINE_ASM_USES_INTRIN
983 uint64_t uOld;
984# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
985 uOld = __swpal64(pu64, u64);
986# else
987 uOld = __swp64(pu64, u64);
988# endif
989 return uOld;
990
991# else
992 uint64_t uOld;
993# if defined(RTASM_ARM64_USE_FEAT_LSE)
994 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
995 slower if we remove the barrier. But since we have the barrier we
996 shouldn't need that, right? Ordering should be taken care of by the DMB.
997 The SWP is rather cheap (~70% faster). */
998 __asm__ __volatile__("Lstart_ASMAtomicXchgU64_%=:\n\t"
999# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1000 "swpal %[uNew], %[uOld], %[pMem]\n\t"
1001# else
1002 RTASM_ARM_DMB_SY
1003 "swp %[uNew], %[uOld], %[pMem]\n\t"
1004# endif
1005 : [pMem] "+Q" (*pu64)
1006 , [uOld] "=&r" (uOld)
1007 : [uNew] "r" (u64)
1008 : );
1009# else
1010 uint32_t rcSpill;
1011 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU64_%=:\n\t"
1012 RTASM_ARM_DMB_SY
1013# if defined(RT_ARCH_ARM64)
1014 "ldaxr %[uOld], %[pMem]\n\t"
1015 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
1016 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU64_%=\n\t"
1017# else
1018 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" /* ARMv6+ */
1019 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
1020 "cmp %[rc], #0\n\t"
1021 "bne Ltry_again_ASMAtomicXchgU64_%=\n\t"
1022# endif
1023 : [pMem] "+Q" (*pu64)
1024 , [uOld] "=&r" (uOld)
1025 , [rc] "=&r" (rcSpill)
1026 : [uNew] "r" (u64)
1027 RTASM_ARM_DMB_SY_COMMA_IN_REG
1028 : "cc");
1029# endif
1030 return uOld;
1031# endif
1032
1033# else
1034# error "Port me"
1035# endif
1036}
1037#endif
1038
1039
1040/**
1041 * Atomically Exchange an signed 64-bit value, ordered.
1042 *
1043 * @returns Current *pi64 value
1044 * @param pi64 Pointer to the 64-bit variable to update.
1045 * @param i64 The 64-bit value to assign to *pi64.
1046 */
1047DECLINLINE(int64_t) ASMAtomicXchgS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
1048{
1049 return (int64_t)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
1050}
1051
1052
1053/**
1054 * Atomically Exchange a size_t value, ordered.
1055 *
1056 * @returns Current *ppv value
1057 * @param puDst Pointer to the size_t variable to update.
1058 * @param uNew The new value to assign to *puDst.
1059 */
1060DECLINLINE(size_t) ASMAtomicXchgZ(size_t volatile RT_FAR *puDst, const size_t uNew) RT_NOTHROW_DEF
1061{
1062#if ARCH_BITS == 16
1063 AssertCompile(sizeof(size_t) == 2);
1064 return ASMAtomicXchgU16((volatile uint16_t RT_FAR *)puDst, uNew);
1065#elif ARCH_BITS == 32
1066 return ASMAtomicXchgU32((volatile uint32_t RT_FAR *)puDst, uNew);
1067#elif ARCH_BITS == 64
1068 return ASMAtomicXchgU64((volatile uint64_t RT_FAR *)puDst, uNew);
1069#else
1070# error "ARCH_BITS is bogus"
1071#endif
1072}
1073
1074
1075/**
1076 * Atomically Exchange a pointer value, ordered.
1077 *
1078 * @returns Current *ppv value
1079 * @param ppv Pointer to the pointer variable to update.
1080 * @param pv The pointer value to assign to *ppv.
1081 */
1082DECLINLINE(void RT_FAR *) ASMAtomicXchgPtr(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pv) RT_NOTHROW_DEF
1083{
1084#if ARCH_BITS == 32 || ARCH_BITS == 16
1085 return (void RT_FAR *)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
1086#elif ARCH_BITS == 64
1087 return (void RT_FAR *)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
1088#else
1089# error "ARCH_BITS is bogus"
1090#endif
1091}
1092
1093
1094/**
1095 * Convenience macro for avoiding the annoying casting with ASMAtomicXchgPtr.
1096 *
1097 * @returns Current *pv value
1098 * @param ppv Pointer to the pointer variable to update.
1099 * @param pv The pointer value to assign to *ppv.
1100 * @param Type The type of *ppv, sans volatile.
1101 */
1102#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
1103# define ASMAtomicXchgPtrT(ppv, pv, Type) \
1104 __extension__ \
1105 ({\
1106 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
1107 Type const pvTypeChecked = (pv); \
1108 Type pvTypeCheckedRet = (__typeof__(*(ppv))) ASMAtomicXchgPtr((void * volatile *)ppvTypeChecked, (void *)pvTypeChecked); \
1109 pvTypeCheckedRet; \
1110 })
1111#else
1112# define ASMAtomicXchgPtrT(ppv, pv, Type) \
1113 (Type)ASMAtomicXchgPtr((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv))
1114#endif
1115
1116
1117/**
1118 * Atomically Exchange a raw-mode context pointer value, ordered.
1119 *
1120 * @returns Current *ppv value
1121 * @param ppvRC Pointer to the pointer variable to update.
1122 * @param pvRC The pointer value to assign to *ppv.
1123 */
1124DECLINLINE(RTRCPTR) ASMAtomicXchgRCPtr(RTRCPTR volatile RT_FAR *ppvRC, RTRCPTR pvRC) RT_NOTHROW_DEF
1125{
1126 return (RTRCPTR)ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(void RT_FAR *)ppvRC, (uint32_t)pvRC);
1127}
1128
1129
1130/**
1131 * Atomically Exchange a ring-0 pointer value, ordered.
1132 *
1133 * @returns Current *ppv value
1134 * @param ppvR0 Pointer to the pointer variable to update.
1135 * @param pvR0 The pointer value to assign to *ppv.
1136 */
1137DECLINLINE(RTR0PTR) ASMAtomicXchgR0Ptr(RTR0PTR volatile RT_FAR *ppvR0, RTR0PTR pvR0) RT_NOTHROW_DEF
1138{
1139#if R0_ARCH_BITS == 32 || ARCH_BITS == 16
1140 return (RTR0PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR0, (uint32_t)pvR0);
1141#elif R0_ARCH_BITS == 64
1142 return (RTR0PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR0, (uint64_t)pvR0);
1143#else
1144# error "R0_ARCH_BITS is bogus"
1145#endif
1146}
1147
1148
1149/**
1150 * Atomically Exchange a ring-3 pointer value, ordered.
1151 *
1152 * @returns Current *ppv value
1153 * @param ppvR3 Pointer to the pointer variable to update.
1154 * @param pvR3 The pointer value to assign to *ppv.
1155 */
1156DECLINLINE(RTR3PTR) ASMAtomicXchgR3Ptr(RTR3PTR volatile RT_FAR *ppvR3, RTR3PTR pvR3) RT_NOTHROW_DEF
1157{
1158#if R3_ARCH_BITS == 32 || ARCH_BITS == 16
1159 return (RTR3PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR3, (uint32_t)pvR3);
1160#elif R3_ARCH_BITS == 64
1161 return (RTR3PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR3, (uint64_t)pvR3);
1162#else
1163# error "R3_ARCH_BITS is bogus"
1164#endif
1165}
1166
1167
1168/** @def ASMAtomicXchgHandle
1169 * Atomically Exchange a typical IPRT handle value, ordered.
1170 *
1171 * @param ph Pointer to the value to update.
1172 * @param hNew The new value to assigned to *pu.
1173 * @param phRes Where to store the current *ph value.
1174 *
1175 * @remarks This doesn't currently work for all handles (like RTFILE).
1176 */
1177#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1178# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1179 do { \
1180 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1181 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
1182 *(uint32_t RT_FAR *)(phRes) = ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
1183 } while (0)
1184#elif HC_ARCH_BITS == 64
1185# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1186 do { \
1187 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1188 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
1189 *(uint64_t RT_FAR *)(phRes) = ASMAtomicXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
1190 } while (0)
1191#else
1192# error HC_ARCH_BITS
1193#endif
1194
1195
1196/**
1197 * Atomically Exchange a value which size might differ
1198 * between platforms or compilers, ordered.
1199 *
1200 * @param pu Pointer to the variable to update.
1201 * @param uNew The value to assign to *pu.
1202 * @todo This is busted as its missing the result argument.
1203 */
1204#define ASMAtomicXchgSize(pu, uNew) \
1205 do { \
1206 switch (sizeof(*(pu))) { \
1207 case 1: ASMAtomicXchgU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
1208 case 2: ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
1209 case 4: ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
1210 case 8: ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
1211 default: AssertMsgFailed(("ASMAtomicXchgSize: size %d is not supported\n", sizeof(*(pu)))); \
1212 } \
1213 } while (0)
1214
1215/**
1216 * Atomically Exchange a value which size might differ
1217 * between platforms or compilers, ordered.
1218 *
1219 * @param pu Pointer to the variable to update.
1220 * @param uNew The value to assign to *pu.
1221 * @param puRes Where to store the current *pu value.
1222 */
1223#define ASMAtomicXchgSizeCorrect(pu, uNew, puRes) \
1224 do { \
1225 switch (sizeof(*(pu))) { \
1226 case 1: *(uint8_t RT_FAR *)(puRes) = ASMAtomicXchgU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
1227 case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
1228 case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
1229 case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
1230 default: AssertMsgFailed(("ASMAtomicXchgSize: size %d is not supported\n", sizeof(*(pu)))); \
1231 } \
1232 } while (0)
1233
1234
1235
1236/**
1237 * Atomically Compare and Exchange an unsigned 8-bit value, ordered.
1238 *
1239 * @returns true if xchg was done.
1240 * @returns false if xchg wasn't done.
1241 *
1242 * @param pu8 Pointer to the value to update.
1243 * @param u8New The new value to assigned to *pu8.
1244 * @param u8Old The old value to *pu8 compare with.
1245 *
1246 * @remarks x86: Requires a 486 or later.
1247 * @todo Rename ASMAtomicCmpWriteU8
1248 */
1249#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || (!RT_INLINE_ASM_GNU_STYLE && !defined(RT_ARCH_ARM64))
1250RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old) RT_NOTHROW_PROTO;
1251#else
1252DECLINLINE(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, uint8_t u8Old) RT_NOTHROW_DEF
1253{
1254# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1255 uint8_t u8Ret;
1256 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1257 "setz %1\n\t"
1258 : "=m" (*pu8)
1259 , "=qm" (u8Ret)
1260 , "=a" (u8Old)
1261 : "q" (u8New)
1262 , "2" (u8Old)
1263 , "m" (*pu8)
1264 : "cc");
1265 return (bool)u8Ret;
1266
1267# elif RT_INLINE_ASM_USES_INTRIN
1268 return (uint8_t)_InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old) == u8Old;
1269
1270# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1271 union { uint32_t u; bool f; } fXchg;
1272 uint32_t u32Spill;
1273# if defined(RTASM_ARM64_USE_FEAT_LSE)
1274 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU8_%=:\n\t"
1275# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB) /* M1 bench: casalb=5625 vs dmb+casb=1597 vs non-lse=5623 (ps/call) */
1276 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1277# else
1278 RTASM_ARM_DMB_SY
1279 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1280# endif
1281 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1282 "cset %w[fXchg], eq\n\t"
1283 : [pMem] "+Q" (*pu8)
1284 , [uOldActual] "=&r" (u32Spill)
1285 , [fXchg] "=&r" (fXchg.u)
1286 : [uNew] "r" ((uint32_t)u8New)
1287 , [uOldOrg] "r" ((uint32_t)u8Old)
1288 , "[uOldActual]" ((uint32_t)u8Old)
1289 : "cc");
1290# else
1291 uint32_t rcSpill;
1292 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU8_%=:\n\t"
1293 RTASM_ARM_DMB_SY
1294# if defined(RT_ARCH_ARM64)
1295 "ldaxrb %w[uOld], %[pMem]\n\t"
1296 "cmp %w[uOld], %w[uCmp]\n\t"
1297 "bne 1f\n\t" /* stop here if not equal */
1298 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1299 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1300 "mov %w[fXchg], #1\n\t"
1301 "1:\n\t"
1302 "clrex\n\t"
1303# else
1304 "ldrexb %[uOld], %[pMem]\n\t"
1305 "teq %[uOld], %[uCmp]\n\t"
1306 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1307 "bne 1f\n\t" /* stop here if not equal */
1308 "cmp %[rc], #0\n\t"
1309 "bne Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1310 "mov %[fXchg], #1\n\t"
1311 "1:\n\t"
1312 /** @todo clrexne on armv7? */
1313# endif
1314 : [pMem] "+Q" (*pu8)
1315 , [uOld] "=&r" (u32Spill)
1316 , [rc] "=&r" (rcSpill)
1317 , [fXchg] "=&r" (fXchg.u)
1318 : [uCmp] "r" ((uint32_t)u8Old)
1319 , [uNew] "r" ((uint32_t)u8New)
1320 , "[fXchg]" (0)
1321 RTASM_ARM_DMB_SY_COMMA_IN_REG
1322 : "cc");
1323# endif
1324 return fXchg.f;
1325
1326# else
1327# error "Port me"
1328# endif
1329}
1330#endif
1331
1332
1333/**
1334 * Atomically Compare and Exchange a signed 8-bit value, ordered.
1335 *
1336 * @returns true if xchg was done.
1337 * @returns false if xchg wasn't done.
1338 *
1339 * @param pi8 Pointer to the value to update.
1340 * @param i8New The new value to assigned to *pi8.
1341 * @param i8Old The old value to *pi8 compare with.
1342 *
1343 * @remarks x86: Requires a 486 or later.
1344 * @todo Rename ASMAtomicCmpWriteS8
1345 */
1346DECLINLINE(bool) ASMAtomicCmpXchgS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old) RT_NOTHROW_DEF
1347{
1348 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old);
1349}
1350
1351
1352/**
1353 * Atomically Compare and Exchange a bool value, ordered.
1354 *
1355 * @returns true if xchg was done.
1356 * @returns false if xchg wasn't done.
1357 *
1358 * @param pf Pointer to the value to update.
1359 * @param fNew The new value to assigned to *pf.
1360 * @param fOld The old value to *pf compare with.
1361 *
1362 * @remarks x86: Requires a 486 or later.
1363 * @todo Rename ASMAtomicCmpWriteBool
1364 */
1365DECLINLINE(bool) ASMAtomicCmpXchgBool(volatile bool RT_FAR *pf, const bool fNew, const bool fOld) RT_NOTHROW_DEF
1366{
1367 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)fNew, (uint8_t)fOld);
1368}
1369
1370
1371/**
1372 * Atomically Compare and Exchange an unsigned 32-bit value, ordered.
1373 *
1374 * @returns true if xchg was done.
1375 * @returns false if xchg wasn't done.
1376 *
1377 * @param pu32 Pointer to the value to update.
1378 * @param u32New The new value to assigned to *pu32.
1379 * @param u32Old The old value to *pu32 compare with.
1380 *
1381 * @remarks x86: Requires a 486 or later.
1382 * @todo Rename ASMAtomicCmpWriteU32
1383 */
1384#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1385RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old) RT_NOTHROW_PROTO;
1386#else
1387DECLINLINE(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, uint32_t u32Old) RT_NOTHROW_DEF
1388{
1389# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1390# if RT_INLINE_ASM_GNU_STYLE
1391 uint8_t u8Ret;
1392 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
1393 "setz %1\n\t"
1394 : "=m" (*pu32)
1395 , "=qm" (u8Ret)
1396 , "=a" (u32Old)
1397 : "r" (u32New)
1398 , "2" (u32Old)
1399 , "m" (*pu32)
1400 : "cc");
1401 return (bool)u8Ret;
1402
1403# elif RT_INLINE_ASM_USES_INTRIN
1404 return (uint32_t)_InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old) == u32Old;
1405
1406# else
1407 uint32_t u32Ret;
1408 __asm
1409 {
1410# ifdef RT_ARCH_AMD64
1411 mov rdx, [pu32]
1412# else
1413 mov edx, [pu32]
1414# endif
1415 mov eax, [u32Old]
1416 mov ecx, [u32New]
1417# ifdef RT_ARCH_AMD64
1418 lock cmpxchg [rdx], ecx
1419# else
1420 lock cmpxchg [edx], ecx
1421# endif
1422 setz al
1423 movzx eax, al
1424 mov [u32Ret], eax
1425 }
1426 return !!u32Ret;
1427# endif
1428
1429# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1430
1431# ifdef RT_INLINE_ASM_USES_INTRIN
1432 uint32_t uOldActual;
1433# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1434 uOldActual = __casal32(pu32, u32Old, u32New);
1435# else
1436 uOldActual = __casal32(pu32, u32Old, u32New);
1437 __dmb(_ARM64_BARRIER_SY);
1438# endif
1439 return uOldActual == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
1440
1441# else
1442 union { uint32_t u; bool f; } fXchg;
1443 uint32_t u32Spill;
1444 /* M1 bench: match: casal= 6592 vs dmb+cas= 1562 vs non-lse=5634 (ps/call)
1445 mismatch: casal=18794 vs dmb+cas=19697 vs non-lse=2499 (ps/call) */
1446# if defined(RTASM_ARM64_USE_FEAT_LSE)
1447 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU32_%=:\n\t"
1448# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1449 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
1450# else
1451 RTASM_ARM_DMB_SY
1452 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
1453# endif
1454 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1455 "cset %w[fXchg], eq\n\t"
1456 : [pMem] "+Q" (*pu32)
1457 , [uOldActual] "=&r" (u32Spill)
1458 , [fXchg] "=&r" (fXchg.u)
1459 : [uNew] "r" (u32New)
1460 , [uOldOrg] "r" (u32Old)
1461 , "[uOldActual]" (u32Old)
1462 : "cc");
1463# else
1464 uint32_t rcSpill;
1465 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU32_%=:\n\t"
1466 RTASM_ARM_DMB_SY
1467# if defined(RT_ARCH_ARM64)
1468 "ldaxr %w[uOld], %[pMem]\n\t"
1469 "cmp %w[uOld], %w[uCmp]\n\t"
1470 "bne 1f\n\t" /* stop here if not equal */
1471 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
1472 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1473 "mov %w[fXchg], #1\n\t"
1474 "1:\n\t"
1475 "clrex\n\t"
1476# else
1477 "ldrex %[uOld], %[pMem]\n\t"
1478 "teq %[uOld], %[uCmp]\n\t"
1479 "strexeq %[rc], %[uNew], %[pMem]\n\t"
1480 "bne 1f\n\t" /* stop here if not equal */
1481 "cmp %[rc], #0\n\t"
1482 "bne Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1483 "mov %[fXchg], #1\n\t"
1484 "1:\n\t"
1485 /** @todo clrexne on armv7? */
1486# endif
1487 : [pMem] "+Q" (*pu32)
1488 , [uOld] "=&r" (u32Spill)
1489 , [rc] "=&r" (rcSpill)
1490 , [fXchg] "=&r" (fXchg.u)
1491 : [uCmp] "r" (u32Old)
1492 , [uNew] "r" (u32New)
1493 , "[fXchg]" (0)
1494 RTASM_ARM_DMB_SY_COMMA_IN_REG
1495 : "cc");
1496# endif
1497 return fXchg.f;
1498# endif
1499
1500# else
1501# error "Port me"
1502# endif
1503}
1504#endif
1505
1506
1507/**
1508 * Atomically Compare and Exchange a signed 32-bit value, ordered.
1509 *
1510 * @returns true if xchg was done.
1511 * @returns false if xchg wasn't done.
1512 *
1513 * @param pi32 Pointer to the value to update.
1514 * @param i32New The new value to assigned to *pi32.
1515 * @param i32Old The old value to *pi32 compare with.
1516 *
1517 * @remarks x86: Requires a 486 or later.
1518 * @todo Rename ASMAtomicCmpWriteS32
1519 */
1520DECLINLINE(bool) ASMAtomicCmpXchgS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old) RT_NOTHROW_DEF
1521{
1522 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old);
1523}
1524
1525
1526/**
1527 * Atomically Compare and exchange an unsigned 64-bit value, ordered.
1528 *
1529 * @returns true if xchg was done.
1530 * @returns false if xchg wasn't done.
1531 *
1532 * @param pu64 Pointer to the 64-bit variable to update.
1533 * @param u64New The 64-bit value to assign to *pu64.
1534 * @param u64Old The value to compare with.
1535 *
1536 * @remarks x86: Requires a Pentium or later.
1537 * @todo Rename ASMAtomicCmpWriteU64
1538 */
1539#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
1540 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
1541RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old) RT_NOTHROW_PROTO;
1542#else
1543DECLINLINE(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64New, uint64_t u64Old) RT_NOTHROW_DEF
1544{
1545# if RT_INLINE_ASM_USES_INTRIN
1546 return (uint64_t)_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old) == u64Old;
1547
1548# elif defined(RT_ARCH_AMD64)
1549# if RT_INLINE_ASM_GNU_STYLE
1550 uint8_t u8Ret;
1551 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
1552 "setz %1\n\t"
1553 : "=m" (*pu64)
1554 , "=qm" (u8Ret)
1555 , "=a" (u64Old)
1556 : "r" (u64New)
1557 , "2" (u64Old)
1558 , "m" (*pu64)
1559 : "cc");
1560 return (bool)u8Ret;
1561# else
1562 bool fRet;
1563 __asm
1564 {
1565 mov rdx, [pu32]
1566 mov rax, [u64Old]
1567 mov rcx, [u64New]
1568 lock cmpxchg [rdx], rcx
1569 setz al
1570 mov [fRet], al
1571 }
1572 return fRet;
1573# endif
1574
1575# elif defined(RT_ARCH_X86)
1576 uint32_t u32Ret;
1577# if RT_INLINE_ASM_GNU_STYLE
1578# if defined(PIC) || defined(__PIC__)
1579 uint32_t u32EBX = (uint32_t)u64New;
1580 uint32_t u32Spill;
1581 __asm__ __volatile__("xchgl %%ebx, %4\n\t"
1582 "lock; cmpxchg8b (%6)\n\t"
1583 "setz %%al\n\t"
1584 "movl %4, %%ebx\n\t"
1585 "movzbl %%al, %%eax\n\t"
1586 : "=a" (u32Ret)
1587 , "=d" (u32Spill)
1588# if RT_GNUC_PREREQ(4, 3)
1589 , "+m" (*pu64)
1590# else
1591 , "=m" (*pu64)
1592# endif
1593 : "A" (u64Old)
1594 , "m" ( u32EBX )
1595 , "c" ( (uint32_t)(u64New >> 32) )
1596 , "S" (pu64)
1597 : "cc");
1598# else /* !PIC */
1599 uint32_t u32Spill;
1600 __asm__ __volatile__("lock; cmpxchg8b %2\n\t"
1601 "setz %%al\n\t"
1602 "movzbl %%al, %%eax\n\t"
1603 : "=a" (u32Ret)
1604 , "=d" (u32Spill)
1605 , "+m" (*pu64)
1606 : "A" (u64Old)
1607 , "b" ( (uint32_t)u64New )
1608 , "c" ( (uint32_t)(u64New >> 32) )
1609 : "cc");
1610# endif
1611 return (bool)u32Ret;
1612# else
1613 __asm
1614 {
1615 mov ebx, dword ptr [u64New]
1616 mov ecx, dword ptr [u64New + 4]
1617 mov edi, [pu64]
1618 mov eax, dword ptr [u64Old]
1619 mov edx, dword ptr [u64Old + 4]
1620 lock cmpxchg8b [edi]
1621 setz al
1622 movzx eax, al
1623 mov dword ptr [u32Ret], eax
1624 }
1625 return !!u32Ret;
1626# endif
1627
1628# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1629
1630# ifdef RT_INLINE_ASM_USES_INTRIN
1631 uint64_t uOldActual;
1632# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1633 uOldActual = __casal64(pu64, u64Old, u64New);
1634# else
1635 uOldActual = __casal64(pu64, u64Old, u64New);
1636 __dmb(_ARM64_BARRIER_SY);
1637# endif
1638 return uOldActual == u64Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
1639
1640# else
1641 union { uint32_t u; bool f; } fXchg;
1642 uint64_t u64Spill;
1643 /* M1 bench: match: casal= 6599 vs dmb+cas= 1565 vs non-lse=5000 (ps/call)
1644 mismatch: casal=18797 vs dmb+cas=19731 vs non-lse=2512 (ps/call) */
1645# if defined(RTASM_ARM64_USE_FEAT_LSE)
1646 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU75_%=:\n\t"
1647# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1648 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
1649# else
1650 RTASM_ARM_DMB_SY
1651 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
1652# endif
1653 "cmp %[uOldActual], %[uOldOrg]\n\t"
1654 "cset %w[fXchg], eq\n\t"
1655 : [pMem] "+Q" (*pu64)
1656 , [uOldActual] "=&r" (u64Spill)
1657 , [fXchg] "=&r" (fXchg.u)
1658 : [uNew] "r" (u64New)
1659 , [uOldOrg] "r" (u64Old)
1660 , "[uOldActual]" (u64Old)
1661 : "cc");
1662# else
1663 uint32_t rcSpill;
1664 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
1665 RTASM_ARM_DMB_SY
1666# if defined(RT_ARCH_ARM64)
1667 "ldaxr %[uOld], %[pMem]\n\t"
1668 "cmp %[uOld], %[uCmp]\n\t"
1669 "bne 1f\n\t" /* stop here if not equal */
1670 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
1671 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1672 "mov %w[fXchg], #1\n\t"
1673 "1:\n\t"
1674 "clrex\n\t"
1675# else
1676 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
1677 "teq %[uOld], %[uCmp]\n\t"
1678 "teqeq %H[uOld], %H[uCmp]\n\t"
1679 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
1680 "bne 1f\n\t" /* stop here if not equal */
1681 "cmp %[rc], #0\n\t"
1682 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1683 "mov %[fXchg], #1\n\t"
1684 "1:\n\t"
1685 /** @todo clrexne on armv7? */
1686# endif
1687 : [pMem] "+Q" (*pu64)
1688 , [uOld] "=&r" (u64Spill)
1689 , [rc] "=&r" (rcSpill)
1690 , [fXchg] "=&r" (fXchg.u)
1691 : [uCmp] "r" (u64Old)
1692 , [uNew] "r" (u64New)
1693 , "[fXchg]" (0)
1694 RTASM_ARM_DMB_SY_COMMA_IN_REG
1695 : "cc");
1696# endif
1697 return fXchg.f;
1698# endif
1699
1700# else
1701# error "Port me"
1702# endif
1703}
1704#endif
1705
1706
1707/**
1708 * Atomically Compare and exchange a signed 64-bit value, ordered.
1709 *
1710 * @returns true if xchg was done.
1711 * @returns false if xchg wasn't done.
1712 *
1713 * @param pi64 Pointer to the 64-bit variable to update.
1714 * @param i64 The 64-bit value to assign to *pu64.
1715 * @param i64Old The value to compare with.
1716 *
1717 * @remarks x86: Requires a Pentium or later.
1718 * @todo Rename ASMAtomicCmpWriteS64
1719 */
1720DECLINLINE(bool) ASMAtomicCmpXchgS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old) RT_NOTHROW_DEF
1721{
1722 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old);
1723}
1724
1725#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
1726
1727/** @def RTASM_HAVE_CMP_WRITE_U128
1728 * Indicates that we've got ASMAtomicCmpWriteU128(), ASMAtomicCmpWriteU128v2()
1729 * and ASMAtomicCmpWriteExU128() available. */
1730# define RTASM_HAVE_CMP_WRITE_U128 1
1731
1732
1733/**
1734 * Atomically compare and write an unsigned 128-bit value, ordered.
1735 *
1736 * @returns true if write was done.
1737 * @returns false if write wasn't done.
1738 *
1739 * @param pu128 Pointer to the 128-bit variable to update.
1740 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
1741 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
1742 * @param u64OldHi The high 64-bit of the value to compare with.
1743 * @param u64OldLo The low 64-bit of the value to compare with.
1744 *
1745 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1746 */
1747# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
1748DECLASM(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1749 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_PROTO;
1750# else
1751DECLINLINE(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1752 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_DEF
1753{
1754# if RT_INLINE_ASM_USES_INTRIN
1755 __int64 ai64Cmp[2];
1756 ai64Cmp[0] = u64OldLo;
1757 ai64Cmp[1] = u64OldHi;
1758 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, ai64Cmp) != 0;
1759
1760# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1761 return __sync_bool_compare_and_swap(pu128, ((uint128_t)u64OldHi << 64) | u64OldLo, ((uint128_t)u64NewHi << 64) | u64NewLo);
1762
1763# elif defined(RT_ARCH_AMD64)
1764# if RT_INLINE_ASM_GNU_STYLE
1765 uint64_t u64Ret;
1766 uint64_t u64Spill;
1767 __asm__ __volatile__("lock; cmpxchg16b %2\n\t"
1768 "setz %%al\n\t"
1769 "movzbl %%al, %%eax\n\t"
1770 : "=a" (u64Ret)
1771 , "=d" (u64Spill)
1772 , "+m" (*pu128)
1773 : "a" (u64OldLo)
1774 , "d" (u64OldHi)
1775 , "b" (u64NewLo)
1776 , "c" (u64NewHi)
1777 : "cc");
1778
1779 return (bool)u64Ret;
1780# else
1781# error "Port me"
1782# endif
1783# else
1784# error "Port me"
1785# endif
1786}
1787# endif
1788
1789
1790/**
1791 * Atomically compare and write an unsigned 128-bit value, ordered.
1792 *
1793 * @returns true if write was done.
1794 * @returns false if write wasn't done.
1795 *
1796 * @param pu128 Pointer to the 128-bit variable to update.
1797 * @param u128New The 128-bit value to assign to *pu128.
1798 * @param u128Old The value to compare with.
1799 *
1800 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1801 */
1802DECLINLINE(bool) ASMAtomicCmpWriteU128(volatile uint128_t *pu128, const uint128_t u128New, const uint128_t u128Old) RT_NOTHROW_DEF
1803{
1804# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
1805# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1806 return __sync_bool_compare_and_swap(pu128, u128Old, u128New);
1807# else
1808 return ASMAtomicCmpWriteU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
1809 (uint64_t)(u128Old >> 64), (uint64_t)u128Old);
1810# endif
1811# else
1812 return ASMAtomicCmpWriteU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo);
1813# endif
1814}
1815
1816
1817/**
1818 * RTUINT128U wrapper for ASMAtomicCmpWriteU128.
1819 */
1820DECLINLINE(bool) ASMAtomicCmpWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
1821 const RTUINT128U u128Old) RT_NOTHROW_DEF
1822{
1823# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1824 return ASMAtomicCmpWriteU128(&pu128->u, u128New.u, u128Old.u);
1825# else
1826 return ASMAtomicCmpWriteU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo);
1827# endif
1828}
1829
1830#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
1831
1832/**
1833 * Atomically Compare and Exchange a pointer value, ordered.
1834 *
1835 * @returns true if xchg was done.
1836 * @returns false if xchg wasn't done.
1837 *
1838 * @param ppv Pointer to the value to update.
1839 * @param pvNew The new value to assigned to *ppv.
1840 * @param pvOld The old value to *ppv compare with.
1841 *
1842 * @remarks x86: Requires a 486 or later.
1843 * @todo Rename ASMAtomicCmpWritePtrVoid
1844 */
1845DECLINLINE(bool) ASMAtomicCmpXchgPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld) RT_NOTHROW_DEF
1846{
1847#if ARCH_BITS == 32 || ARCH_BITS == 16
1848 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld);
1849#elif ARCH_BITS == 64
1850 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld);
1851#else
1852# error "ARCH_BITS is bogus"
1853#endif
1854}
1855
1856
1857/**
1858 * Atomically Compare and Exchange a pointer value, ordered.
1859 *
1860 * @returns true if xchg was done.
1861 * @returns false if xchg wasn't done.
1862 *
1863 * @param ppv Pointer to the value to update.
1864 * @param pvNew The new value to assigned to *ppv.
1865 * @param pvOld The old value to *ppv compare with.
1866 *
1867 * @remarks This is relatively type safe on GCC platforms.
1868 * @remarks x86: Requires a 486 or later.
1869 * @todo Rename ASMAtomicCmpWritePtr
1870 */
1871#ifdef __GNUC__
1872# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
1873 __extension__ \
1874 ({\
1875 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
1876 __typeof__(*(ppv)) const pvNewTypeChecked = (pvNew); \
1877 __typeof__(*(ppv)) const pvOldTypeChecked = (pvOld); \
1878 bool fMacroRet = ASMAtomicCmpXchgPtrVoid((void * volatile *)ppvTypeChecked, \
1879 (void *)pvNewTypeChecked, (void *)pvOldTypeChecked); \
1880 fMacroRet; \
1881 })
1882#else
1883# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
1884 ASMAtomicCmpXchgPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld))
1885#endif
1886
1887
1888/** @def ASMAtomicCmpXchgHandle
1889 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
1890 *
1891 * @param ph Pointer to the value to update.
1892 * @param hNew The new value to assigned to *pu.
1893 * @param hOld The old value to *pu compare with.
1894 * @param fRc Where to store the result.
1895 *
1896 * @remarks This doesn't currently work for all handles (like RTFILE).
1897 * @remarks x86: Requires a 486 or later.
1898 * @todo Rename ASMAtomicCmpWriteHandle
1899 */
1900#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1901# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1902 do { \
1903 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1904 (fRc) = ASMAtomicCmpXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew), (const uint32_t)(hOld)); \
1905 } while (0)
1906#elif HC_ARCH_BITS == 64
1907# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1908 do { \
1909 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1910 (fRc) = ASMAtomicCmpXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew), (const uint64_t)(hOld)); \
1911 } while (0)
1912#else
1913# error HC_ARCH_BITS
1914#endif
1915
1916
1917/** @def ASMAtomicCmpXchgSize
1918 * Atomically Compare and Exchange a value which size might differ
1919 * between platforms or compilers, ordered.
1920 *
1921 * @param pu Pointer to the value to update.
1922 * @param uNew The new value to assigned to *pu.
1923 * @param uOld The old value to *pu compare with.
1924 * @param fRc Where to store the result.
1925 *
1926 * @remarks x86: Requires a 486 or later.
1927 * @todo Rename ASMAtomicCmpWriteSize
1928 */
1929#define ASMAtomicCmpXchgSize(pu, uNew, uOld, fRc) \
1930 do { \
1931 switch (sizeof(*(pu))) { \
1932 case 4: (fRc) = ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld)); \
1933 break; \
1934 case 8: (fRc) = ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld)); \
1935 break; \
1936 default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %d is not supported\n", sizeof(*(pu)))); \
1937 (fRc) = false; \
1938 break; \
1939 } \
1940 } while (0)
1941
1942
1943/**
1944 * Atomically Compare and Exchange an unsigned 8-bit value, additionally passes
1945 * back old value, ordered.
1946 *
1947 * @returns true if xchg was done.
1948 * @returns false if xchg wasn't done.
1949 *
1950 * @param pu8 Pointer to the value to update.
1951 * @param u8New The new value to assigned to *pu32.
1952 * @param u8Old The old value to *pu8 compare with.
1953 * @param pu8Old Pointer store the old value at.
1954 *
1955 * @remarks x86: Requires a 486 or later.
1956 */
1957#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1958RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_PROTO;
1959#else
1960DECLINLINE(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_DEF
1961{
1962# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1963# if RT_INLINE_ASM_GNU_STYLE
1964 uint8_t u8Ret;
1965 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1966 "setz %1\n\t"
1967 : "=m" (*pu8)
1968 , "=qm" (u8Ret)
1969 , "=a" (*pu8Old)
1970# if defined(RT_ARCH_X86)
1971 : "q" (u8New)
1972# else
1973 : "r" (u8New)
1974# endif
1975 , "a" (u8Old)
1976 , "m" (*pu8)
1977 : "cc");
1978 return (bool)u8Ret;
1979
1980# elif RT_INLINE_ASM_USES_INTRIN
1981 return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
1982
1983# else
1984 uint8_t u8Ret;
1985 __asm
1986 {
1987# ifdef RT_ARCH_AMD64
1988 mov rdx, [pu8]
1989# else
1990 mov edx, [pu8]
1991# endif
1992 mov eax, [u8Old]
1993 mov ecx, [u8New]
1994# ifdef RT_ARCH_AMD64
1995 lock cmpxchg [rdx], ecx
1996 mov rdx, [pu8Old]
1997 mov [rdx], eax
1998# else
1999 lock cmpxchg [edx], ecx
2000 mov edx, [pu8Old]
2001 mov [edx], eax
2002# endif
2003 setz al
2004 movzx eax, al
2005 mov [u8Ret], eax
2006 }
2007 return !!u8Ret;
2008# endif
2009
2010# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2011 /* M1 bench: match: casalb= 6594 vs dmb+casb= 1561 vs non-lse=5051 (ps/call)
2012 mismatch: casalb=15346 vs dmb+casb=16349 vs non-lse=2505 (ps/call) */
2013# ifdef RT_INLINE_ASM_USES_INTRIN
2014# if defined(RTASM_ARM64_USE_FEAT_LSE)
2015 uint8_t uOldActual;
2016# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2017 uOldActual = __casal8(pu8, u8Old, u8New);
2018# else
2019 uOldActual = __casal8(pu8, u8Old, u8New);
2020 __dmb(_ARM64_BARRIER_SY);
2021# endif
2022 return (*pu8Old = uOldActual) == u8Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
2023# else
2024 return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
2025# endif
2026
2027# else
2028
2029# if defined(RTASM_ARM64_USE_FEAT_LSE)
2030 union { uint32_t u; bool f; } fXchg;
2031 uint32_t u32Actual;
2032 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU8_%=:\n\t"
2033# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2034 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
2035# else
2036 RTASM_ARM_DMB_SY
2037 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
2038# endif
2039 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2040 "cset %w[fXchg], eq\n\t"
2041 : [pMem] "+Q" (*pu8)
2042 , [uOldActual] "=&r" (u32Actual)
2043 , [fXchg] "=&r" (fXchg.u)
2044 : [uNew] "r" ((uint32_t)u8New)
2045 , [uOldOrg] "r" ((uint32_t)u8Old)
2046 , "[uOldActual]" ((uint32_t)u8Old)
2047 : "cc");
2048 *pu8Old = (uint8_t)u32Actual;
2049# else
2050 union { uint8_t u; bool f; } fXchg;
2051 uint8_t u8ActualOld;
2052 uint8_t rcSpill;
2053 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU8_%=:\n\t"
2054 RTASM_ARM_DMB_SY
2055# if defined(RT_ARCH_ARM64)
2056 "ldaxrb %w[uOld], %[pMem]\n\t"
2057 "cmp %w[uOld], %w[uCmp]\n\t"
2058 "bne 1f\n\t" /* stop here if not equal */
2059 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
2060 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
2061 "mov %w[fXchg], #1\n\t"
2062 "1:\n\t"
2063 "clrex\n\t"
2064# else
2065 "ldrexb %[uOld], %[pMem]\n\t"
2066 "teq %[uOld], %[uCmp]\n\t"
2067 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
2068 "bne 1f\n\t" /* stop here if not equal */
2069 "cmp %[rc], #0\n\t"
2070 "bne Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
2071 "mov %[fXchg], #1\n\t"
2072 "1:\n\t"
2073 /** @todo clrexne on armv7? */
2074# endif
2075 : [pMem] "+Q" (*pu8)
2076 , [uOld] "=&r" (u8ActualOld)
2077 , [rc] "=&r" (rcSpill)
2078 , [fXchg] "=&r" (fXchg.u)
2079 : [uCmp] "r" (u8Old)
2080 , [uNew] "r" (u8New)
2081 , "[fXchg]" (0)
2082 RTASM_ARM_DMB_SY_COMMA_IN_REG
2083 : "cc");
2084 *pu8Old = u8ActualOld;
2085# endif
2086 return fXchg.f;
2087# endif
2088
2089# else
2090# error "Port me"
2091# endif
2092}
2093#endif
2094
2095
2096/**
2097 * Atomically Compare and Exchange a signed 8-bit value, additionally
2098 * passes back old value, ordered.
2099 *
2100 * @returns true if xchg was done.
2101 * @returns false if xchg wasn't done.
2102 *
2103 * @param pi8 Pointer to the value to update.
2104 * @param i8New The new value to assigned to *pi8.
2105 * @param i8Old The old value to *pi8 compare with.
2106 * @param pi8Old Pointer store the old value at.
2107 *
2108 * @remarks x86: Requires a 486 or later.
2109 */
2110DECLINLINE(bool) ASMAtomicCmpXchgExS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old, int8_t RT_FAR *pi8Old) RT_NOTHROW_DEF
2111{
2112 return ASMAtomicCmpXchgExU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old, (uint8_t RT_FAR *)pi8Old);
2113}
2114
2115
2116/**
2117 * Atomically Compare and Exchange an unsigned 16-bit value, additionally passes
2118 * back old value, ordered.
2119 *
2120 * @returns true if xchg was done.
2121 * @returns false if xchg wasn't done.
2122 *
2123 * @param pu16 Pointer to the value to update.
2124 * @param u16New The new value to assigned to *pu16.
2125 * @param u16Old The old value to *pu32 compare with.
2126 * @param pu16Old Pointer store the old value at.
2127 *
2128 * @remarks x86: Requires a 486 or later.
2129 */
2130#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2131RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_PROTO;
2132#else
2133DECLINLINE(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_DEF
2134{
2135# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2136# if RT_INLINE_ASM_GNU_STYLE
2137 uint8_t u8Ret;
2138 __asm__ __volatile__("lock; cmpxchgw %3, %0\n\t"
2139 "setz %1\n\t"
2140 : "=m" (*pu16)
2141 , "=qm" (u8Ret)
2142 , "=a" (*pu16Old)
2143 : "r" (u16New)
2144 , "a" (u16Old)
2145 , "m" (*pu16)
2146 : "cc");
2147 return (bool)u8Ret;
2148
2149# elif RT_INLINE_ASM_USES_INTRIN
2150 return (*pu16Old = _InterlockedCompareExchange16((short RT_FAR *)pu16, u16New, u16Old)) == u16Old;
2151
2152# else
2153 uint16_t u16Ret;
2154 __asm
2155 {
2156# ifdef RT_ARCH_AMD64
2157 mov rdx, [pu16]
2158# else
2159 mov edx, [pu16]
2160# endif
2161 mov eax, [u16Old]
2162 mov ecx, [u16New]
2163# ifdef RT_ARCH_AMD64
2164 lock cmpxchg [rdx], ecx
2165 mov rdx, [pu16Old]
2166 mov [rdx], eax
2167# else
2168 lock cmpxchg [edx], ecx
2169 mov edx, [pu16Old]
2170 mov [edx], eax
2171# endif
2172 setz al
2173 movzx eax, al
2174 mov [u16Ret], eax
2175 }
2176 return !!u16Ret;
2177# endif
2178
2179# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2180 /* M1 bench: match: casalh= 6577 vs dmb+cash= 1608 vs non-lse=5078 (ps/call)
2181 mismatch: casalh=18791 vs dmb+cash=19721 vs non-lse=2543 (ps/call) */
2182# ifdef RT_INLINE_ASM_USES_INTRIN
2183# if defined(RTASM_ARM64_USE_FEAT_LSE)
2184 uint16_t uOldActual;
2185# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2186 uOldActual = __casal16(pu16, u16Old, u16New);
2187# else
2188 uOldActual = __casal16(pu16, u16Old, u16New);
2189 __dmb(_ARM64_BARRIER_SY);
2190# endif
2191 return (*pu16Old = uOldActual) == u16Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
2192# else
2193 return (*pu16Old = _InterlockedCompareExchange16((char RT_FAR *)pu16, u16New, u16Old)) == u16Old;
2194# endif
2195
2196# else
2197
2198# if defined(RTASM_ARM64_USE_FEAT_LSE)
2199 union { uint32_t u; bool f; } fXchg;
2200 uint32_t u32Actual;
2201 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU16_%=:\n\t"
2202# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2203 "casalh %w[uOldActual], %w[uNew], %[pMem]\n\t"
2204# else
2205 RTASM_ARM_DMB_SY
2206 "cash %w[uOldActual], %w[uNew], %[pMem]\n\t"
2207# endif
2208 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2209 "cset %w[fXchg], eq\n\t"
2210 : [pMem] "+Q" (*pu16)
2211 , [uOldActual] "=&r" (u32Actual)
2212 , [fXchg] "=&r" (fXchg.u)
2213 : [uNew] "r" ((uint32_t)u16New)
2214 , [uOldOrg] "r" ((uint32_t)u16Old)
2215 , "[uOldActual]" ((uint32_t)u16Old)
2216 : "cc");
2217 *pu16Old = (uint16_t)u32Actual;
2218# else
2219 union { uint16_t u; bool f; } fXchg;
2220 uint16_t u16ActualOld;
2221 uint16_t rcSpill;
2222 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU16_%=:\n\t"
2223 RTASM_ARM_DMB_SY
2224# if defined(RT_ARCH_ARM64)
2225 "ldaxrh %w[uOld], %[pMem]\n\t"
2226 "cmp %w[uOld], %w[uCmp]\n\t"
2227 "bne 1f\n\t" /* stop here if not equal */
2228 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
2229 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2230 "mov %w[fXchg], #1\n\t"
2231 "1:\n\t"
2232 "clrex\n\t"
2233# else
2234 "ldrexh %[uOld], %[pMem]\n\t"
2235 "teq %[uOld], %[uCmp]\n\t"
2236 "strexheq %[rc], %[uNew], %[pMem]\n\t"
2237 "bne 1f\n\t" /* stop here if not equal */
2238 "cmp %[rc], #0\n\t"
2239 "bne Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2240 "mov %[fXchg], #1\n\t"
2241 "1:\n\t"
2242 /** @todo clrexne on armv7? */
2243# endif
2244 : [pMem] "+Q" (*pu16)
2245 , [uOld] "=&r" (u16ActualOld)
2246 , [rc] "=&r" (rcSpill)
2247 , [fXchg] "=&r" (fXchg.u)
2248 : [uCmp] "r" (u16Old)
2249 , [uNew] "r" (u16New)
2250 , "[fXchg]" (0)
2251 RTASM_ARM_DMB_SY_COMMA_IN_REG
2252 : "cc");
2253 *pu16Old = u16ActualOld;
2254# endif
2255 return fXchg.f;
2256# endif
2257
2258# else
2259# error "Port me"
2260# endif
2261}
2262#endif
2263
2264
2265/**
2266 * Atomically Compare and Exchange a signed 16-bit value, additionally
2267 * passes back old value, ordered.
2268 *
2269 * @returns true if xchg was done.
2270 * @returns false if xchg wasn't done.
2271 *
2272 * @param pi16 Pointer to the value to update.
2273 * @param i16New The new value to assigned to *pi16.
2274 * @param i16Old The old value to *pi16 compare with.
2275 * @param pi16Old Pointer store the old value at.
2276 *
2277 * @remarks x86: Requires a 486 or later.
2278 */
2279DECLINLINE(bool) ASMAtomicCmpXchgExS16(volatile int16_t RT_FAR *pi16, const int16_t i16New, const int16_t i16Old, int16_t RT_FAR *pi16Old) RT_NOTHROW_DEF
2280{
2281 return ASMAtomicCmpXchgExU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16New, (uint16_t)i16Old, (uint16_t RT_FAR *)pi16Old);
2282}
2283
2284
2285/**
2286 * Atomically Compare and Exchange an unsigned 32-bit value, additionally
2287 * passes back old value, ordered.
2288 *
2289 * @returns true if xchg was done.
2290 * @returns false if xchg wasn't done.
2291 *
2292 * @param pu32 Pointer to the value to update.
2293 * @param u32New The new value to assigned to *pu32.
2294 * @param u32Old The old value to *pu32 compare with.
2295 * @param pu32Old Pointer store the old value at.
2296 *
2297 * @remarks x86: Requires a 486 or later.
2298 */
2299#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2300RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_PROTO;
2301#else
2302DECLINLINE(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_DEF
2303{
2304# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2305# if RT_INLINE_ASM_GNU_STYLE
2306 uint8_t u8Ret;
2307 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
2308 "setz %1\n\t"
2309 : "=m" (*pu32)
2310 , "=qm" (u8Ret)
2311 , "=a" (*pu32Old)
2312 : "r" (u32New)
2313 , "a" (u32Old)
2314 , "m" (*pu32)
2315 : "cc");
2316 return (bool)u8Ret;
2317
2318# elif RT_INLINE_ASM_USES_INTRIN
2319 return (*pu32Old = _InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old)) == u32Old;
2320
2321# else
2322 uint32_t u32Ret;
2323 __asm
2324 {
2325# ifdef RT_ARCH_AMD64
2326 mov rdx, [pu32]
2327# else
2328 mov edx, [pu32]
2329# endif
2330 mov eax, [u32Old]
2331 mov ecx, [u32New]
2332# ifdef RT_ARCH_AMD64
2333 lock cmpxchg [rdx], ecx
2334 mov rdx, [pu32Old]
2335 mov [rdx], eax
2336# else
2337 lock cmpxchg [edx], ecx
2338 mov edx, [pu32Old]
2339 mov [edx], eax
2340# endif
2341 setz al
2342 movzx eax, al
2343 mov [u32Ret], eax
2344 }
2345 return !!u32Ret;
2346# endif
2347
2348# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2349
2350# ifdef RT_INLINE_ASM_USES_INTRIN
2351# if defined(RTASM_ARM64_USE_FEAT_LSE)
2352 uint32_t uOldActual;
2353# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2354 uOldActual = __casal32(pu32, u32Old, u32New);
2355# else
2356 uOldActual = __casal32(pu32, u32Old, u32New);
2357 __dmb(_ARM64_BARRIER_SY);
2358# endif
2359 return (*pu32Old = uOldActual) == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
2360# else
2361 return (*pu32Old = _InterlockedCompareExchange((char RT_FAR *)pu32, u32New, u32Old)) == u32Old;
2362# endif
2363
2364# else
2365
2366 union { uint32_t u; bool f; } fXchg;
2367 /* M1 bench: match: casal= 6590 vs dmb+cas= 1564 vs non-lse=5033 (ps/call)
2368 mismatch: casal=18790 vs dmb+cas=19711 vs non-lse=2503 (ps/call) */
2369# if defined(RTASM_ARM64_USE_FEAT_LSE)
2370 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2371# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2372 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
2373# else
2374 RTASM_ARM_DMB_SY
2375 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
2376# endif
2377 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2378 "cset %w[fXchg], eq\n\t"
2379 : [pMem] "+Q" (*pu32)
2380 , [uOldActual] "=&r" (*pu32Old)
2381 , [fXchg] "=&r" (fXchg.u)
2382 : [uNew] "r" (u32New)
2383 , [uOldOrg] "r" (u32Old)
2384 , "[uOldActual]" (u32Old)
2385 : "cc");
2386# else
2387 uint32_t u32ActualOld;
2388 uint32_t rcSpill;
2389 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU32_%=:\n\t"
2390 RTASM_ARM_DMB_SY
2391# if defined(RT_ARCH_ARM64)
2392 "ldaxr %w[uOld], %[pMem]\n\t"
2393 "cmp %w[uOld], %w[uCmp]\n\t"
2394 "bne 1f\n\t" /* stop here if not equal */
2395 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
2396 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2397 "mov %w[fXchg], #1\n\t"
2398 "1:\n\t"
2399 "clrex\n\t"
2400# else
2401 "ldrex %[uOld], %[pMem]\n\t"
2402 "teq %[uOld], %[uCmp]\n\t"
2403 "strexeq %[rc], %[uNew], %[pMem]\n\t"
2404 "bne 1f\n\t" /* stop here if not equal */
2405 "cmp %[rc], #0\n\t"
2406 "bne Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2407 "mov %[fXchg], #1\n\t"
2408 "1:\n\t"
2409 /** @todo clrexne on armv7? */
2410# endif
2411 : [pMem] "+Q" (*pu32)
2412 , [uOld] "=&r" (u32ActualOld)
2413 , [rc] "=&r" (rcSpill)
2414 , [fXchg] "=&r" (fXchg.u)
2415 : [uCmp] "r" (u32Old)
2416 , [uNew] "r" (u32New)
2417 , "[fXchg]" (0)
2418 RTASM_ARM_DMB_SY_COMMA_IN_REG
2419 : "cc");
2420 *pu32Old = u32ActualOld;
2421# endif
2422 return fXchg.f;
2423# endif
2424
2425# else
2426# error "Port me"
2427# endif
2428}
2429#endif
2430
2431
2432/**
2433 * Atomically Compare and Exchange a signed 32-bit value, additionally
2434 * passes back old value, ordered.
2435 *
2436 * @returns true if xchg was done.
2437 * @returns false if xchg wasn't done.
2438 *
2439 * @param pi32 Pointer to the value to update.
2440 * @param i32New The new value to assigned to *pi32.
2441 * @param i32Old The old value to *pi32 compare with.
2442 * @param pi32Old Pointer store the old value at.
2443 *
2444 * @remarks x86: Requires a 486 or later.
2445 */
2446DECLINLINE(bool) ASMAtomicCmpXchgExS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old, int32_t RT_FAR *pi32Old) RT_NOTHROW_DEF
2447{
2448 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old, (uint32_t RT_FAR *)pi32Old);
2449}
2450
2451
2452/**
2453 * Atomically Compare and exchange an unsigned 64-bit value, additionally
2454 * passing back old value, ordered.
2455 *
2456 * @returns true if xchg was done.
2457 * @returns false if xchg wasn't done.
2458 *
2459 * @param pu64 Pointer to the 64-bit variable to update.
2460 * @param u64New The 64-bit value to assign to *pu64.
2461 * @param u64Old The value to compare with.
2462 * @param pu64Old Pointer store the old value at.
2463 *
2464 * @remarks x86: Requires a Pentium or later.
2465 */
2466#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
2467 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
2468RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_PROTO;
2469#else
2470DECLINLINE(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_DEF
2471{
2472# if RT_INLINE_ASM_USES_INTRIN
2473 return (*pu64Old =_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old)) == u64Old;
2474
2475# elif defined(RT_ARCH_AMD64)
2476# if RT_INLINE_ASM_GNU_STYLE
2477 uint8_t u8Ret;
2478 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
2479 "setz %1\n\t"
2480 : "=m" (*pu64)
2481 , "=qm" (u8Ret)
2482 , "=a" (*pu64Old)
2483 : "r" (u64New)
2484 , "a" (u64Old)
2485 , "m" (*pu64)
2486 : "cc");
2487 return (bool)u8Ret;
2488# else
2489 bool fRet;
2490 __asm
2491 {
2492 mov rdx, [pu32]
2493 mov rax, [u64Old]
2494 mov rcx, [u64New]
2495 lock cmpxchg [rdx], rcx
2496 mov rdx, [pu64Old]
2497 mov [rdx], rax
2498 setz al
2499 mov [fRet], al
2500 }
2501 return fRet;
2502# endif
2503
2504# elif defined(RT_ARCH_X86)
2505# if RT_INLINE_ASM_GNU_STYLE
2506 uint64_t u64Ret;
2507# if defined(PIC) || defined(__PIC__)
2508 /* Note #1: This code uses a memory clobber description, because the clean
2509 solution with an output value for *pu64 makes gcc run out of
2510 registers. This will cause suboptimal code, and anyone with a
2511 better solution is welcome to improve this.
2512
2513 Note #2: We must prevent gcc from encoding the memory access, as it
2514 may go via the GOT if we're working on a global variable (like
2515 in the testcase). Thus we request a register (%3) and
2516 dereference it ourselves. */
2517 __asm__ __volatile__("xchgl %%ebx, %1\n\t"
2518 "lock; cmpxchg8b (%3)\n\t"
2519 "xchgl %%ebx, %1\n\t"
2520 : "=A" (u64Ret)
2521 : "DS" ((uint32_t)u64New)
2522 , "c" ((uint32_t)(u64New >> 32))
2523 , "r" (pu64) /* Do not use "m" here*/
2524 , "0" (u64Old)
2525 : "memory"
2526 , "cc" );
2527# else /* !PIC */
2528 __asm__ __volatile__("lock; cmpxchg8b %4\n\t"
2529 : "=A" (u64Ret)
2530 , "=m" (*pu64)
2531 : "b" ((uint32_t)u64New)
2532 , "c" ((uint32_t)(u64New >> 32))
2533 , "m" (*pu64)
2534 , "0" (u64Old)
2535 : "cc");
2536# endif
2537 *pu64Old = u64Ret;
2538 return u64Ret == u64Old;
2539# else
2540 uint32_t u32Ret;
2541 __asm
2542 {
2543 mov ebx, dword ptr [u64New]
2544 mov ecx, dword ptr [u64New + 4]
2545 mov edi, [pu64]
2546 mov eax, dword ptr [u64Old]
2547 mov edx, dword ptr [u64Old + 4]
2548 lock cmpxchg8b [edi]
2549 mov ebx, [pu64Old]
2550 mov [ebx], eax
2551 setz al
2552 movzx eax, al
2553 add ebx, 4
2554 mov [ebx], edx
2555 mov dword ptr [u32Ret], eax
2556 }
2557 return !!u32Ret;
2558# endif
2559
2560# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2561 union { uint32_t u; bool f; } fXchg;
2562 /* M1 bench: match: casal= 6606 vs dmb+cas= 1565 vs non-lse=5006 (ps/call)
2563 mismatch: casal=18786 vs dmb+cas=19718 vs non-lse=2503 (ps/call) */
2564# if defined(RTASM_ARM64_USE_FEAT_LSE)
2565 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2566# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2567 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
2568# else
2569 RTASM_ARM_DMB_SY
2570 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
2571# endif
2572 "cmp %[uOldActual], %[uOldOrg]\n\t"
2573 "cset %w[fXchg], eq\n\t"
2574 : [pMem] "+Q" (*pu64)
2575 , [uOldActual] "=&r" (*pu64Old)
2576 , [fXchg] "=&r" (fXchg.u)
2577 : [uNew] "r" (u64New)
2578 , [uOldOrg] "r" (u64Old)
2579 , "[uOldActual]" (u64Old)
2580 : "cc");
2581# else
2582 uint64_t u64ActualOld;
2583 uint32_t rcSpill;
2584 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
2585 RTASM_ARM_DMB_SY
2586# if defined(RT_ARCH_ARM64)
2587 "ldaxr %[uOld], %[pMem]\n\t"
2588 "cmp %[uOld], %[uCmp]\n\t"
2589 "bne 1f\n\t" /* stop here if not equal */
2590 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
2591 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2592 "mov %w[fXchg], #1\n\t"
2593 "1:\n\t"
2594 "clrex\n\t"
2595# else
2596 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
2597 "teq %[uOld], %[uCmp]\n\t"
2598 "teqeq %H[uOld], %H[uCmp]\n\t"
2599 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
2600 "bne 1f\n\t" /* stop here if not equal */
2601 "cmp %[rc], #0\n\t"
2602 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2603 "mov %[fXchg], #1\n\t"
2604 "1:\n\t"
2605 /** @todo clrexne on armv7? */
2606# endif
2607 : [pMem] "+Q" (*pu64)
2608 , [uOld] "=&r" (u64ActualOld)
2609 , [rc] "=&r" (rcSpill)
2610 , [fXchg] "=&r" (fXchg.u)
2611 : [uCmp] "r" (u64Old)
2612 , [uNew] "r" (u64New)
2613 , "[fXchg]" (0)
2614 RTASM_ARM_DMB_SY_COMMA_IN_REG
2615 : "cc");
2616 *pu64Old = u64ActualOld;
2617# endif
2618 return fXchg.f;
2619
2620# else
2621# error "Port me"
2622# endif
2623}
2624#endif
2625
2626
2627/**
2628 * Atomically Compare and exchange a signed 64-bit value, additionally
2629 * passing back old value, ordered.
2630 *
2631 * @returns true if xchg was done.
2632 * @returns false if xchg wasn't done.
2633 *
2634 * @param pi64 Pointer to the 64-bit variable to update.
2635 * @param i64 The 64-bit value to assign to *pu64.
2636 * @param i64Old The value to compare with.
2637 * @param pi64Old Pointer store the old value at.
2638 *
2639 * @remarks x86: Requires a Pentium or later.
2640 */
2641DECLINLINE(bool) ASMAtomicCmpXchgExS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old, int64_t RT_FAR *pi64Old) RT_NOTHROW_DEF
2642{
2643 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old, (uint64_t RT_FAR *)pi64Old);
2644}
2645
2646#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
2647
2648/** @def RTASM_HAVE_CMP_XCHG_U128
2649 * Indicates that we've got ASMAtomicCmpSwapU128(), ASMAtomicCmpSwapU128v2()
2650 * and ASMAtomicCmpSwapExU128() available. */
2651# define RTASM_HAVE_CMP_XCHG_U128 1
2652
2653
2654/**
2655 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2656 *
2657 * @returns true if exchange was done.
2658 * @returns false if exchange wasn't done.
2659 *
2660 * @param pu128 Pointer to the 128-bit variable to update.
2661 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
2662 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
2663 * @param u64OldHi The high 64-bit of the value to compare with.
2664 * @param u64OldLo The low 64-bit of the value to compare with.
2665 * @param pu128Old Where to return the old value.
2666 *
2667 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2668 */
2669# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
2670DECLASM(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2671 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_PROTO;
2672# else
2673DECLINLINE(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2674 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_DEF
2675{
2676# if RT_INLINE_ASM_USES_INTRIN
2677 pu128Old->Hi = u64OldHi;
2678 pu128Old->Lo = u64OldLo;
2679 AssertCompileMemberOffset(uint128_t, Lo, 0);
2680 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, (__int64 *)&pu128Old->Lo) != 0;
2681
2682# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2683 uint128_t const uCmp = ((uint128_t)u64OldHi << 64) | u64OldLo;
2684 uint128_t const uOld = __sync_val_compare_and_swap(pu128, uCmp, ((uint128_t)u64NewHi << 64) | u64NewLo);
2685 *pu128Old = uOld;
2686 return uCmp == uOld;
2687
2688# elif defined(RT_ARCH_AMD64)
2689# if RT_INLINE_ASM_GNU_STYLE
2690 uint8_t bRet;
2691 uint64_t u64RetHi, u64RetLo;
2692 __asm__ __volatile__("lock; cmpxchg16b %3\n\t"
2693 "setz %b0\n\t"
2694 : "=r" (bRet)
2695 , "=a" (u64RetLo)
2696 , "=d" (u64RetHi)
2697 , "+m" (*pu128)
2698 : "a" (u64OldLo)
2699 , "d" (u64OldHi)
2700 , "b" (u64NewLo)
2701 , "c" (u64NewHi)
2702 : "cc");
2703 *pu128Old = ((uint128_t)u64RetHi << 64) | u64RetLo;
2704 return (bool)bRet;
2705# else
2706# error "Port me"
2707# endif
2708# else
2709# error "Port me"
2710# endif
2711}
2712# endif
2713
2714
2715/**
2716 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2717 *
2718 * @returns true if exchange was done.
2719 * @returns false if exchange wasn't done.
2720 *
2721 * @param pu128 Pointer to the 128-bit variable to update.
2722 * @param u128New The 128-bit value to assign to *pu128.
2723 * @param u128Old The value to compare with.
2724 * @param pu128Old Where to return the old value.
2725 *
2726 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2727 */
2728DECLINLINE(bool) ASMAtomicCmpXchgU128(volatile uint128_t *pu128, const uint128_t u128New,
2729 const uint128_t u128Old, uint128_t *pu128Old) RT_NOTHROW_DEF
2730{
2731# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2732# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2733 uint128_t const uSwapped = __sync_val_compare_and_swap(pu128, u128Old, u128New);
2734 *pu128Old = uSwapped;
2735 return uSwapped == u128Old;
2736# else
2737 return ASMAtomicCmpXchgU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
2738 (uint64_t)(u128Old >> 64), (uint64_t)u128Old, pu128Old);
2739# endif
2740# else
2741 return ASMAtomicCmpXchgU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo, pu128Old);
2742# endif
2743}
2744
2745
2746/**
2747 * RTUINT128U wrapper for ASMAtomicCmpXchgU128.
2748 */
2749DECLINLINE(bool) ASMAtomicCmpXchgU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
2750 const RTUINT128U u128Old, PRTUINT128U pu128Old) RT_NOTHROW_DEF
2751{
2752# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2753 return ASMAtomicCmpXchgU128(&pu128->u, u128New.u, u128Old.u, &pu128Old->u);
2754# else
2755 return ASMAtomicCmpXchgU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo, &pu128Old->u);
2756# endif
2757}
2758
2759#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
2760
2761
2762
2763/** @def ASMAtomicCmpXchgExHandle
2764 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
2765 *
2766 * @param ph Pointer to the value to update.
2767 * @param hNew The new value to assigned to *pu.
2768 * @param hOld The old value to *pu compare with.
2769 * @param fRc Where to store the result.
2770 * @param phOldVal Pointer to where to store the old value.
2771 *
2772 * @remarks This doesn't currently work for all handles (like RTFILE).
2773 */
2774#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
2775# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2776 do { \
2777 AssertCompile(sizeof(*ph) == sizeof(uint32_t)); \
2778 AssertCompile(sizeof(*phOldVal) == sizeof(uint32_t)); \
2779 (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(ph), (uint32_t)(hNew), (uint32_t)(hOld), (uint32_t RT_FAR *)(phOldVal)); \
2780 } while (0)
2781#elif HC_ARCH_BITS == 64
2782# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2783 do { \
2784 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
2785 AssertCompile(sizeof(*(phOldVal)) == sizeof(uint64_t)); \
2786 (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(ph), (uint64_t)(hNew), (uint64_t)(hOld), (uint64_t RT_FAR *)(phOldVal)); \
2787 } while (0)
2788#else
2789# error HC_ARCH_BITS
2790#endif
2791
2792
2793/** @def ASMAtomicCmpXchgExSize
2794 * Atomically Compare and Exchange a value which size might differ
2795 * between platforms or compilers. Additionally passes back old value.
2796 *
2797 * @param pu Pointer to the value to update.
2798 * @param uNew The new value to assigned to *pu.
2799 * @param uOld The old value to *pu compare with.
2800 * @param fRc Where to store the result.
2801 * @param puOldVal Pointer to where to store the old value.
2802 *
2803 * @remarks x86: Requires a 486 or later.
2804 */
2805#define ASMAtomicCmpXchgExSize(pu, uNew, uOld, fRc, puOldVal) \
2806 do { \
2807 switch (sizeof(*(pu))) { \
2808 case 4: (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld), (uint32_t RT_FAR *)(uOldVal)); \
2809 break; \
2810 case 8: (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld), (uint64_t RT_FAR *)(uOldVal)); \
2811 break; \
2812 default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %d is not supported\n", sizeof(*(pu)))); \
2813 (fRc) = false; \
2814 (uOldVal) = 0; \
2815 break; \
2816 } \
2817 } while (0)
2818
2819
2820/**
2821 * Atomically Compare and Exchange a pointer value, additionally
2822 * passing back old value, ordered.
2823 *
2824 * @returns true if xchg was done.
2825 * @returns false if xchg wasn't done.
2826 *
2827 * @param ppv Pointer to the value to update.
2828 * @param pvNew The new value to assigned to *ppv.
2829 * @param pvOld The old value to *ppv compare with.
2830 * @param ppvOld Pointer store the old value at.
2831 *
2832 * @remarks x86: Requires a 486 or later.
2833 */
2834DECLINLINE(bool) ASMAtomicCmpXchgExPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld,
2835 void RT_FAR * RT_FAR *ppvOld) RT_NOTHROW_DEF
2836{
2837#if ARCH_BITS == 32 || ARCH_BITS == 16
2838 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld, (uint32_t RT_FAR *)ppvOld);
2839#elif ARCH_BITS == 64
2840 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld, (uint64_t RT_FAR *)ppvOld);
2841#else
2842# error "ARCH_BITS is bogus"
2843#endif
2844}
2845
2846
2847/**
2848 * Atomically Compare and Exchange a pointer value, additionally
2849 * passing back old value, ordered.
2850 *
2851 * @returns true if xchg was done.
2852 * @returns false if xchg wasn't done.
2853 *
2854 * @param ppv Pointer to the value to update.
2855 * @param pvNew The new value to assigned to *ppv.
2856 * @param pvOld The old value to *ppv compare with.
2857 * @param ppvOld Pointer store the old value at.
2858 *
2859 * @remarks This is relatively type safe on GCC platforms.
2860 * @remarks x86: Requires a 486 or later.
2861 */
2862#ifdef __GNUC__
2863# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
2864 __extension__ \
2865 ({\
2866 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
2867 __typeof__(*(ppv)) const pvNewTypeChecked = (pvNew); \
2868 __typeof__(*(ppv)) const pvOldTypeChecked = (pvOld); \
2869 __typeof__(*(ppv)) * const ppvOldTypeChecked = (ppvOld); \
2870 bool fMacroRet = ASMAtomicCmpXchgExPtrVoid((void * volatile *)ppvTypeChecked, \
2871 (void *)pvNewTypeChecked, (void *)pvOldTypeChecked, \
2872 (void **)ppvOldTypeChecked); \
2873 fMacroRet; \
2874 })
2875#else
2876# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
2877 ASMAtomicCmpXchgExPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld), (void RT_FAR * RT_FAR *)(ppvOld))
2878#endif
2879
2880
2881/**
2882 * Virtualization unfriendly serializing instruction, always exits.
2883 */
2884#if (RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2885RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_PROTO;
2886#else
2887DECLINLINE(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_DEF
2888{
2889# if RT_INLINE_ASM_GNU_STYLE
2890 RTCCUINTREG xAX = 0;
2891# ifdef RT_ARCH_AMD64
2892 __asm__ __volatile__ ("cpuid"
2893 : "=a" (xAX)
2894 : "0" (xAX)
2895 : "rbx", "rcx", "rdx", "memory");
2896# elif (defined(PIC) || defined(__PIC__)) && defined(__i386__)
2897 __asm__ __volatile__ ("push %%ebx\n\t"
2898 "cpuid\n\t"
2899 "pop %%ebx\n\t"
2900 : "=a" (xAX)
2901 : "0" (xAX)
2902 : "ecx", "edx", "memory");
2903# else
2904 __asm__ __volatile__ ("cpuid"
2905 : "=a" (xAX)
2906 : "0" (xAX)
2907 : "ebx", "ecx", "edx", "memory");
2908# endif
2909
2910# elif RT_INLINE_ASM_USES_INTRIN
2911 int aInfo[4];
2912 _ReadWriteBarrier();
2913 __cpuid(aInfo, 0);
2914
2915# else
2916 __asm
2917 {
2918 push ebx
2919 xor eax, eax
2920 cpuid
2921 pop ebx
2922 }
2923# endif
2924}
2925#endif
2926
2927/**
2928 * Virtualization friendly serializing instruction, though more expensive.
2929 */
2930#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2931RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_PROTO;
2932#else
2933DECLINLINE(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_DEF
2934{
2935# if RT_INLINE_ASM_GNU_STYLE
2936# ifdef RT_ARCH_AMD64
2937 __asm__ __volatile__ ("movq %%rsp,%%r10\n\t"
2938 "subq $128, %%rsp\n\t" /*redzone*/
2939 "mov %%ss, %%eax\n\t"
2940 "pushq %%rax\n\t"
2941 "pushq %%r10\n\t"
2942 "pushfq\n\t"
2943 "movl %%cs, %%eax\n\t"
2944 "pushq %%rax\n\t"
2945 "leaq 1f(%%rip), %%rax\n\t"
2946 "pushq %%rax\n\t"
2947 "iretq\n\t"
2948 "1:\n\t"
2949 ::: "rax", "r10", "memory", "cc");
2950# else
2951 __asm__ __volatile__ ("pushfl\n\t"
2952 "pushl %%cs\n\t"
2953 "pushl $1f\n\t"
2954 "iretl\n\t"
2955 "1:\n\t"
2956 ::: "memory");
2957# endif
2958
2959# else
2960 __asm
2961 {
2962 pushfd
2963 push cs
2964 push la_ret
2965 iretd
2966 la_ret:
2967 }
2968# endif
2969}
2970#endif
2971
2972/**
2973 * Virtualization friendlier serializing instruction, may still cause exits.
2974 */
2975#if (RT_INLINE_ASM_EXTERNAL && RT_INLINE_ASM_USES_INTRIN < RT_MSC_VER_VS2008) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2976RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_PROTO;
2977#else
2978DECLINLINE(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_DEF
2979{
2980# if RT_INLINE_ASM_GNU_STYLE
2981 /* rdtscp is not supported by ancient linux build VM of course :-( */
2982# ifdef RT_ARCH_AMD64
2983 /*__asm__ __volatile__("rdtscp\n\t" ::: "rax", "rdx, "rcx"); */
2984 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "rax", "rdx", "rcx", "memory");
2985# else
2986 /*__asm__ __volatile__("rdtscp\n\t" ::: "eax", "edx, "ecx"); */
2987 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "eax", "edx", "ecx", "memory");
2988# endif
2989# else
2990# if RT_INLINE_ASM_USES_INTRIN >= RT_MSC_VER_VS2008
2991 uint32_t uIgnore;
2992 _ReadWriteBarrier();
2993 (void)__rdtscp(&uIgnore);
2994 (void)uIgnore;
2995# else
2996 __asm
2997 {
2998 rdtscp
2999 }
3000# endif
3001# endif
3002}
3003#endif
3004
3005
3006/**
3007 * Serialize Instruction (both data store and instruction flush).
3008 */
3009#if (defined(RT_ARCH_X86) && ARCH_BITS == 16) || defined(IN_GUEST)
3010# define ASMSerializeInstruction() ASMSerializeInstructionIRet()
3011#elif defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
3012# define ASMSerializeInstruction() ASMSerializeInstructionCpuId()
3013#elif defined(RT_ARCH_SPARC64)
3014RTDECL(void) ASMSerializeInstruction(void) RT_NOTHROW_PROTO;
3015#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3016DECLINLINE(void) ASMSerializeInstruction(void) RT_NOTHROW_DEF
3017{
3018# ifdef RT_INLINE_ASM_USES_INTRIN
3019 __dsb(_ARM64_BARRIER_SY);
3020# else
3021 __asm__ __volatile__ (RTASM_ARM_DSB_SY :: RTASM_ARM_DSB_SY_IN_REG :);
3022# endif
3023}
3024#else
3025# error "Port me"
3026#endif
3027
3028
3029/**
3030 * Memory fence, waits for any pending writes and reads to complete.
3031 * @note No implicit compiler barrier (which is probably stupid).
3032 */
3033DECLINLINE(void) ASMMemoryFence(void) RT_NOTHROW_DEF
3034{
3035#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
3036# if RT_INLINE_ASM_GNU_STYLE
3037 __asm__ __volatile__ (".byte 0x0f,0xae,0xf0\n\t");
3038# elif RT_INLINE_ASM_USES_INTRIN
3039 _mm_mfence();
3040# else
3041 __asm
3042 {
3043 _emit 0x0f
3044 _emit 0xae
3045 _emit 0xf0
3046 }
3047# endif
3048#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3049# ifdef RT_INLINE_ASM_USES_INTRIN
3050 __dmb(_ARM64_BARRIER_SY);
3051# else
3052 __asm__ __volatile__ (RTASM_ARM_DMB_SY :: RTASM_ARM_DMB_SY_IN_REG :);
3053# endif
3054#elif ARCH_BITS == 16
3055 uint16_t volatile u16;
3056 ASMAtomicXchgU16(&u16, 0);
3057#else
3058 uint32_t volatile u32;
3059 ASMAtomicXchgU32(&u32, 0);
3060#endif
3061}
3062
3063
3064/**
3065 * Write fence, waits for any pending writes to complete.
3066 * @note No implicit compiler barrier (which is probably stupid).
3067 */
3068DECLINLINE(void) ASMWriteFence(void) RT_NOTHROW_DEF
3069{
3070#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
3071# if RT_INLINE_ASM_GNU_STYLE
3072 __asm__ __volatile__ (".byte 0x0f,0xae,0xf8\n\t");
3073# elif RT_INLINE_ASM_USES_INTRIN
3074 _mm_sfence();
3075# else
3076 __asm
3077 {
3078 _emit 0x0f
3079 _emit 0xae
3080 _emit 0xf8
3081 }
3082# endif
3083#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3084# ifdef RT_INLINE_ASM_USES_INTRIN
3085 __dmb(_ARM64_BARRIER_ST);
3086# else
3087 __asm__ __volatile__ (RTASM_ARM_DMB_ST :: RTASM_ARM_DMB_ST_IN_REG :);
3088# endif
3089#else
3090 ASMMemoryFence();
3091#endif
3092}
3093
3094
3095/**
3096 * Read fence, waits for any pending reads to complete.
3097 * @note No implicit compiler barrier (which is probably stupid).
3098 */
3099DECLINLINE(void) ASMReadFence(void) RT_NOTHROW_DEF
3100{
3101#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
3102# if RT_INLINE_ASM_GNU_STYLE
3103 __asm__ __volatile__ (".byte 0x0f,0xae,0xe8\n\t");
3104# elif RT_INLINE_ASM_USES_INTRIN
3105 _mm_lfence();
3106# else
3107 __asm
3108 {
3109 _emit 0x0f
3110 _emit 0xae
3111 _emit 0xe8
3112 }
3113# endif
3114#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3115# ifdef RT_INLINE_ASM_USES_INTRIN
3116 __dmb(_ARM64_BARRIER_LD);
3117# else
3118 __asm__ __volatile__ (RTASM_ARM_DMB_LD :: RTASM_ARM_DMB_LD_IN_REG :);
3119# endif
3120#else
3121 ASMMemoryFence();
3122#endif
3123}
3124
3125
3126/**
3127 * Atomically reads an unsigned 8-bit value, ordered.
3128 *
3129 * @returns Current *pu8 value
3130 * @param pu8 Pointer to the 8-bit variable to read.
3131 */
3132DECLINLINE(uint8_t) ASMAtomicReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
3133{
3134#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3135
3136# ifdef RT_INLINE_ASM_USES_INTRIN
3137 return (uint8_t)__load_acquire8(pu8);
3138
3139# else
3140 uint32_t u32;
3141# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1 */
3142 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
3143 RTASM_ARM_DMB_SY
3144 "casab %w[uDst], wzr, %[pMem]\n\t"
3145 : [uDst] "=&r" (u32)
3146 : [pMem] "Q" (*pu8),
3147 "0" (0)
3148 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3149# else
3150 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
3151 RTASM_ARM_DMB_SY
3152# if defined(RT_ARCH_ARM64)
3153# if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */
3154 "ldurb %w[uDst], %[pMem]\n\t"
3155# else
3156 "ldxrb %w[uDst], %[pMem]\n\t"
3157 "clrex\n\t"
3158# endif
3159# else
3160 "ldrexb %[uDst], %[pMem]\n\t"
3161 /** @todo clrex */
3162# endif
3163 : [uDst] "=&r" (u32)
3164 : [pMem] "Q" (*pu8)
3165 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3166# endif
3167 return (uint8_t)u32;
3168# endif
3169
3170#else
3171 ASMMemoryFence();
3172 return *pu8; /* byte reads are atomic on x86 */
3173#endif
3174}
3175
3176
3177/**
3178 * Atomically reads an unsigned 8-bit value, unordered.
3179 *
3180 * @returns Current *pu8 value
3181 * @param pu8 Pointer to the 8-bit variable to read.
3182 */
3183DECLINLINE(uint8_t) ASMAtomicUoReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
3184{
3185#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3186
3187# ifdef RT_INLINE_ASM_USES_INTRIN
3188 return (uint8_t)__iso_volatile_load8((volatile char *)pu8);
3189
3190# else
3191
3192 uint32_t u32;
3193 __asm__ __volatile__("Lstart_ASMAtomicUoReadU8_%=:\n\t"
3194# if defined(RT_ARCH_ARM64)
3195 "ldurb %w[uDst], %[pMem]\n\t"
3196# else
3197 "ldrexb %[uDst], %[pMem]\n\t" /** @todo fix this */
3198# endif
3199 : [uDst] "=&r" (u32)
3200 : [pMem] "Q" (*pu8));
3201 return (uint8_t)u32;
3202# endif
3203#else
3204 return *pu8; /* byte reads are atomic on x86 */
3205#endif
3206}
3207
3208
3209/**
3210 * Atomically reads a signed 8-bit value, ordered.
3211 *
3212 * @returns Current *pi8 value
3213 * @param pi8 Pointer to the 8-bit variable to read.
3214 */
3215DECLINLINE(int8_t) ASMAtomicReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
3216{
3217#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3218 return (int8_t)ASMAtomicReadU8((volatile uint8_t RT_FAR *)pi8);
3219#else
3220 ASMMemoryFence();
3221 return *pi8; /* byte reads are atomic on x86 */
3222#endif
3223}
3224
3225
3226/**
3227 * Atomically reads a signed 8-bit value, unordered.
3228 *
3229 * @returns Current *pi8 value
3230 * @param pi8 Pointer to the 8-bit variable to read.
3231 */
3232DECLINLINE(int8_t) ASMAtomicUoReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
3233{
3234#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3235
3236# ifdef RT_INLINE_ASM_USES_INTRIN
3237 return __iso_volatile_load8((volatile char *)pi8);
3238
3239# else
3240
3241 int32_t i32;
3242 __asm__ __volatile__("Lstart_ASMAtomicUoReadS8_%=:\n\t"
3243# if defined(RT_ARCH_ARM64)
3244 "ldurb %w[iDst], %[pMem]\n\t"
3245# else
3246 "ldrexb %[iDst], %[pMem]\n\t" /** @todo fix this */
3247# endif
3248 : [iDst] "=&r" (i32)
3249 : [pMem] "Q" (*pi8));
3250 return (int8_t)i32;
3251# endif
3252#else
3253 return *pi8; /* byte reads are atomic on x86 */
3254#endif
3255}
3256
3257
3258/**
3259 * Atomically reads an unsigned 16-bit value, ordered.
3260 *
3261 * @returns Current *pu16 value
3262 * @param pu16 Pointer to the 16-bit variable to read.
3263 */
3264DECLINLINE(uint16_t) ASMAtomicReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3265{
3266 Assert(!((uintptr_t)pu16 & 1));
3267#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3268
3269# ifdef RT_INLINE_ASM_USES_INTRIN
3270 return (uint16_t)__load_acquire16(pu16);
3271
3272# else
3273
3274 uint32_t u32;
3275# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3276 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3277 RTASM_ARM_DMB_SY
3278 "casah %w[uDst], wzr, %[pMem]\n\t"
3279 : [uDst] "=&r" (u32)
3280 : [pMem] "Q" (*pu16),
3281 "0" (0)
3282 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3283# else
3284 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3285 RTASM_ARM_DMB_SY
3286# if defined(RT_ARCH_ARM64)
3287# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3288 "ldurh %w[uDst], %[pMem]\n\t"
3289# else
3290 "ldxrh %w[uDst], %[pMem]\n\t"
3291 "clrex\n\t"
3292# endif
3293# else
3294 "ldrexh %[uDst], %[pMem]\n\t"
3295 /** @todo clrex */
3296# endif
3297 : [uDst] "=&r" (u32)
3298 : [pMem] "Q" (*pu16)
3299 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3300# endif
3301 return (uint16_t)u32;
3302# endif
3303
3304#else
3305 ASMMemoryFence();
3306 return *pu16;
3307#endif
3308}
3309
3310
3311/**
3312 * Atomically reads an unsigned 16-bit value, unordered.
3313 *
3314 * @returns Current *pu16 value
3315 * @param pu16 Pointer to the 16-bit variable to read.
3316 */
3317DECLINLINE(uint16_t) ASMAtomicUoReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3318{
3319 Assert(!((uintptr_t)pu16 & 1));
3320#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3321
3322# ifdef RT_INLINE_ASM_USES_INTRIN
3323 return (uint16_t)__iso_volatile_load16((volatile int16_t *)pu16);
3324
3325# else
3326
3327 uint32_t u32;
3328 __asm__ __volatile__("Lstart_ASMAtomicUoReadU16_%=:\n\t"
3329# if defined(RT_ARCH_ARM64)
3330 "ldurh %w[uDst], %[pMem]\n\t"
3331# else
3332 "ldrexh %[uDst], %[pMem]\n\t" /** @todo fix this */
3333# endif
3334 : [uDst] "=&r" (u32)
3335 : [pMem] "Q" (*pu16));
3336 return (uint16_t)u32;
3337# endif
3338
3339#else
3340 return *pu16;
3341#endif
3342}
3343
3344
3345/**
3346 * Atomically reads a signed 16-bit value, ordered.
3347 *
3348 * @returns Current *pi16 value
3349 * @param pi16 Pointer to the 16-bit variable to read.
3350 */
3351DECLINLINE(int16_t) ASMAtomicReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3352{
3353 Assert(!((uintptr_t)pi16 & 1));
3354#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3355 return (int16_t)ASMAtomicReadU16((volatile uint16_t RT_FAR *)pi16);
3356#else
3357 ASMMemoryFence();
3358 return *pi16;
3359#endif
3360}
3361
3362
3363/**
3364 * Atomically reads a signed 16-bit value, unordered.
3365 *
3366 * @returns Current *pi16 value
3367 * @param pi16 Pointer to the 16-bit variable to read.
3368 */
3369DECLINLINE(int16_t) ASMAtomicUoReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3370{
3371 Assert(!((uintptr_t)pi16 & 1));
3372#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3373
3374# ifdef RT_INLINE_ASM_USES_INTRIN
3375 return __iso_volatile_load16(pi16);
3376
3377# else
3378
3379 int32_t i32;
3380 __asm__ __volatile__("Lstart_ASMAtomicUoReadS16_%=:\n\t"
3381# if defined(RT_ARCH_ARM64)
3382 "ldurh %w[iDst], %[pMem]\n\t"
3383# else
3384 "ldrexh %[iDst], %[pMem]\n\t" /** @todo fix this */
3385# endif
3386 : [iDst] "=&r" (i32)
3387 : [pMem] "Q" (*pi16));
3388 return (int16_t)i32;
3389# endif
3390
3391#else
3392 return *pi16;
3393#endif
3394}
3395
3396
3397/**
3398 * Atomically reads an unsigned 32-bit value, ordered.
3399 *
3400 * @returns Current *pu32 value
3401 * @param pu32 Pointer to the 32-bit variable to read.
3402 */
3403DECLINLINE(uint32_t) ASMAtomicReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3404{
3405 Assert(!((uintptr_t)pu32 & 3));
3406#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3407
3408# ifdef RT_INLINE_ASM_USES_INTRIN
3409 return (uint32_t)__load_acquire32(pu32);
3410
3411# else
3412
3413 uint32_t u32;
3414# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3415 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3416 RTASM_ARM_DMB_SY
3417 "casa %w[uDst], wzr, %[pMem]\n\t"
3418 : [uDst] "=&r" (u32)
3419 : [pMem] "Q" (*pu32),
3420 "0" (0)
3421 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3422# else
3423 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3424 RTASM_ARM_DMB_SY
3425# if defined(RT_ARCH_ARM64)
3426# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3427 "ldur %w[uDst], %[pMem]\n\t"
3428# else
3429 "ldxr %w[uDst], %[pMem]\n\t"
3430 "clrex\n\t"
3431# endif
3432# else
3433 "ldrex %[uDst], %[pMem]\n\t"
3434 /** @todo clrex */
3435# endif
3436 : [uDst] "=&r" (u32)
3437 : [pMem] "Q" (*pu32)
3438 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3439# endif
3440 return u32;
3441# endif
3442
3443#else
3444 ASMMemoryFence();
3445# if ARCH_BITS == 16
3446 AssertFailed(); /** @todo 16-bit */
3447# endif
3448 return *pu32;
3449#endif
3450}
3451
3452
3453/**
3454 * Atomically reads an unsigned 32-bit value, unordered.
3455 *
3456 * @returns Current *pu32 value
3457 * @param pu32 Pointer to the 32-bit variable to read.
3458 */
3459DECLINLINE(uint32_t) ASMAtomicUoReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3460{
3461 Assert(!((uintptr_t)pu32 & 3));
3462#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3463
3464# ifdef RT_INLINE_ASM_USES_INTRIN
3465 return (uint32_t)__iso_volatile_load32((volatile int32_t *)pu32);
3466
3467# else
3468
3469 uint32_t u32;
3470 __asm__ __volatile__("Lstart_ASMAtomicUoReadU32_%=:\n\t"
3471# if defined(RT_ARCH_ARM64)
3472 "ldur %w[uDst], %[pMem]\n\t"
3473# else
3474 "ldrex %[uDst], %[pMem]\n\t" /** @todo fix this */
3475# endif
3476 : [uDst] "=&r" (u32)
3477 : [pMem] "Q" (*pu32));
3478 return u32;
3479# endif
3480
3481#else
3482# if ARCH_BITS == 16
3483 AssertFailed(); /** @todo 16-bit */
3484# endif
3485 return *pu32;
3486#endif
3487}
3488
3489
3490/**
3491 * Atomically reads a signed 32-bit value, ordered.
3492 *
3493 * @returns Current *pi32 value
3494 * @param pi32 Pointer to the 32-bit variable to read.
3495 */
3496DECLINLINE(int32_t) ASMAtomicReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3497{
3498 Assert(!((uintptr_t)pi32 & 3));
3499#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3500 return (int32_t)ASMAtomicReadU32((volatile uint32_t RT_FAR *)pi32);
3501#else
3502 ASMMemoryFence();
3503# if ARCH_BITS == 16
3504 AssertFailed(); /** @todo 16-bit */
3505# endif
3506 return *pi32;
3507#endif
3508}
3509
3510
3511/**
3512 * Atomically reads a signed 32-bit value, unordered.
3513 *
3514 * @returns Current *pi32 value
3515 * @param pi32 Pointer to the 32-bit variable to read.
3516 */
3517DECLINLINE(int32_t) ASMAtomicUoReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3518{
3519 Assert(!((uintptr_t)pi32 & 3));
3520#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3521
3522# ifdef RT_INLINE_ASM_USES_INTRIN
3523 return __iso_volatile_load32(pi32);
3524
3525# else
3526
3527 int32_t i32;
3528 __asm__ __volatile__("Lstart_ASMAtomicUoReadS32_%=:\n\t"
3529# if defined(RT_ARCH_ARM64)
3530 "ldur %w[iDst], %[pMem]\n\t"
3531# else
3532 "ldrex %[iDst], %[pMem]\n\t" /** @todo thix this */
3533# endif
3534 : [iDst] "=&r" (i32)
3535 : [pMem] "Q" (*pi32));
3536 return i32;
3537# endif
3538
3539#else
3540# if ARCH_BITS == 16
3541 AssertFailed(); /** @todo 16-bit */
3542# endif
3543 return *pi32;
3544#endif
3545}
3546
3547
3548/**
3549 * Atomically reads an unsigned 64-bit value, ordered.
3550 *
3551 * @returns Current *pu64 value
3552 * @param pu64 Pointer to the 64-bit variable to read.
3553 * The memory pointed to must be writable.
3554 *
3555 * @remarks This may fault if the memory is read-only!
3556 * @remarks x86: Requires a Pentium or later.
3557 */
3558#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !defined(RT_ARCH_AMD64)) \
3559 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
3560RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3561#else
3562DECLINLINE(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3563{
3564 uint64_t u64;
3565# ifdef RT_ARCH_AMD64
3566 Assert(!((uintptr_t)pu64 & 7));
3567/*# if RT_INLINE_ASM_GNU_STYLE
3568 __asm__ __volatile__( "mfence\n\t"
3569 "movq %1, %0\n\t"
3570 : "=r" (u64)
3571 : "m" (*pu64));
3572# else
3573 __asm
3574 {
3575 mfence
3576 mov rdx, [pu64]
3577 mov rax, [rdx]
3578 mov [u64], rax
3579 }
3580# endif*/
3581 ASMMemoryFence();
3582 u64 = *pu64;
3583
3584# elif defined(RT_ARCH_X86)
3585# if RT_INLINE_ASM_GNU_STYLE
3586# if defined(PIC) || defined(__PIC__)
3587 uint32_t u32EBX = 0;
3588 Assert(!((uintptr_t)pu64 & 7));
3589 __asm__ __volatile__("xchgl %%ebx, %3\n\t"
3590 "lock; cmpxchg8b (%5)\n\t"
3591 "movl %3, %%ebx\n\t"
3592 : "=A" (u64)
3593# if RT_GNUC_PREREQ(4, 3)
3594 , "+m" (*pu64)
3595# else
3596 , "=m" (*pu64)
3597# endif
3598 : "0" (0ULL)
3599 , "m" (u32EBX)
3600 , "c" (0)
3601 , "S" (pu64)
3602 : "cc");
3603# else /* !PIC */
3604 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3605 : "=A" (u64)
3606 , "+m" (*pu64)
3607 : "0" (0ULL)
3608 , "b" (0)
3609 , "c" (0)
3610 : "cc");
3611# endif
3612# else
3613 Assert(!((uintptr_t)pu64 & 7));
3614 __asm
3615 {
3616 xor eax, eax
3617 xor edx, edx
3618 mov edi, pu64
3619 xor ecx, ecx
3620 xor ebx, ebx
3621 lock cmpxchg8b [edi]
3622 mov dword ptr [u64], eax
3623 mov dword ptr [u64 + 4], edx
3624 }
3625# endif
3626
3627# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3628 Assert(!((uintptr_t)pu64 & 7));
3629
3630# ifdef RT_INLINE_ASM_USES_INTRIN
3631 u64 = (uint64_t)__load_acquire64(pu64);
3632
3633# else
3634
3635# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3636 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3637 RTASM_ARM_DMB_SY
3638 "casa %[uDst], xzr, %[pMem]\n\t"
3639 : [uDst] "=&r" (u64)
3640 : [pMem] "Q" (*pu64),
3641 "0" (0)
3642 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3643# else
3644 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3645 RTASM_ARM_DMB_SY
3646# if defined(RT_ARCH_ARM64)
3647# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3648 "ldur %[uDst], %[pMem]\n\t"
3649# else
3650 "ldxr %[uDst], %[pMem]\n\t"
3651 "clrex\n\t"
3652# endif
3653# else
3654 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t"
3655 /** @todo clrex */
3656# endif
3657 : [uDst] "=&r" (u64)
3658 : [pMem] "Q" (*pu64)
3659 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3660# endif
3661# endif
3662# else
3663# error "Port me"
3664# endif
3665 return u64;
3666}
3667#endif
3668
3669
3670/**
3671 * Atomically reads an unsigned 64-bit value, unordered.
3672 *
3673 * @returns Current *pu64 value
3674 * @param pu64 Pointer to the 64-bit variable to read.
3675 * The memory pointed to must be writable.
3676 *
3677 * @remarks This may fault if the memory is read-only!
3678 * @remarks x86: Requires a Pentium or later.
3679 */
3680#if !defined(RT_ARCH_AMD64) \
3681 && ( (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
3682 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC)
3683RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3684#else
3685DECLINLINE(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3686{
3687 uint64_t u64;
3688# ifdef RT_ARCH_AMD64
3689 Assert(!((uintptr_t)pu64 & 7));
3690/*# if RT_INLINE_ASM_GNU_STYLE
3691 Assert(!((uintptr_t)pu64 & 7));
3692 __asm__ __volatile__("movq %1, %0\n\t"
3693 : "=r" (u64)
3694 : "m" (*pu64));
3695# else
3696 __asm
3697 {
3698 mov rdx, [pu64]
3699 mov rax, [rdx]
3700 mov [u64], rax
3701 }
3702# endif */
3703 u64 = *pu64;
3704
3705# elif defined(RT_ARCH_X86)
3706# if RT_INLINE_ASM_GNU_STYLE
3707# if defined(PIC) || defined(__PIC__)
3708 uint32_t u32EBX = 0;
3709 uint32_t u32Spill;
3710 Assert(!((uintptr_t)pu64 & 7));
3711 __asm__ __volatile__("xor %%eax,%%eax\n\t"
3712 "xor %%ecx,%%ecx\n\t"
3713 "xor %%edx,%%edx\n\t"
3714 "xchgl %%ebx, %3\n\t"
3715 "lock; cmpxchg8b (%4)\n\t"
3716 "movl %3, %%ebx\n\t"
3717 : "=A" (u64)
3718# if RT_GNUC_PREREQ(4, 3)
3719 , "+m" (*pu64)
3720# else
3721 , "=m" (*pu64)
3722# endif
3723 , "=c" (u32Spill)
3724 : "m" (u32EBX)
3725 , "S" (pu64)
3726 : "cc");
3727# else /* !PIC */
3728 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3729 : "=A" (u64)
3730 , "+m" (*pu64)
3731 : "0" (0ULL)
3732 , "b" (0)
3733 , "c" (0)
3734 : "cc");
3735# endif
3736# else
3737 Assert(!((uintptr_t)pu64 & 7));
3738 __asm
3739 {
3740 xor eax, eax
3741 xor edx, edx
3742 mov edi, pu64
3743 xor ecx, ecx
3744 xor ebx, ebx
3745 lock cmpxchg8b [edi]
3746 mov dword ptr [u64], eax
3747 mov dword ptr [u64 + 4], edx
3748 }
3749# endif
3750
3751# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3752 Assert(!((uintptr_t)pu64 & 7));
3753
3754
3755# ifdef RT_INLINE_ASM_USES_INTRIN
3756 u64 = (uint64_t)__iso_volatile_load64((volatile int64_t *)pu64);
3757
3758# else
3759
3760 __asm__ __volatile__("Lstart_ASMAtomicUoReadU64_%=:\n\t"
3761# if defined(RT_ARCH_ARM64)
3762 "ldur %[uDst], %[pMem]\n\t"
3763# else
3764 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t" /* this is required for atomic access since it's a pair */
3765 /** @todo clrex? */
3766# endif
3767 : [uDst] "=&r" (u64)
3768 : [pMem] "Q" (*pu64));
3769# endif
3770
3771# else
3772# error "Port me"
3773# endif
3774 return u64;
3775}
3776#endif
3777
3778
3779/**
3780 * Atomically reads a signed 64-bit value, ordered.
3781 *
3782 * @returns Current *pi64 value
3783 * @param pi64 Pointer to the 64-bit variable to read.
3784 * The memory pointed to must be writable.
3785 *
3786 * @remarks This may fault if the memory is read-only!
3787 * @remarks x86: Requires a Pentium or later.
3788 */
3789DECLINLINE(int64_t) ASMAtomicReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3790{
3791 return (int64_t)ASMAtomicReadU64((volatile uint64_t RT_FAR *)pi64);
3792}
3793
3794
3795/**
3796 * Atomically reads a signed 64-bit value, unordered.
3797 *
3798 * @returns Current *pi64 value
3799 * @param pi64 Pointer to the 64-bit variable to read.
3800 * The memory pointed to must be writable.
3801 *
3802 * @remarks This will fault if the memory is read-only!
3803 * @remarks x86: Requires a Pentium or later.
3804 */
3805DECLINLINE(int64_t) ASMAtomicUoReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3806{
3807 return (int64_t)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)pi64);
3808}
3809
3810
3811/** @def RTASM_HAVE_READ_U128
3812 * Defined in the target architecture supports atomic reading of 128-bit
3813 * integers.
3814 *
3815 * The define value is zero if both ordered and unordered reads are implemented
3816 * using ASMAtomicCmpXchgU128v2(). It is 1 if unordered reads are done natively
3817 * w/o cmpxchg and 3 if both variants are done natively w/o cmpxchg.
3818 *
3819 * @note AMD64: Caller must check for cmpxchg16b support before use and make
3820 * sure variables are writable (won't be changed).
3821 * @sa RTASM_HAVE_CMP_XCHG_U128, RTASM_HAVE_WRITE_U128
3822 */
3823#if defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
3824# define RTASM_HAVE_READ_U128 3
3825#elif defined(RTASM_HAVE_CMP_XCHG_U128)
3826# define RTASM_HAVE_READ_U128 0
3827#endif
3828
3829#ifdef RTASM_HAVE_READ_U128
3830
3831/**
3832 * Atomically reads an unsigned 128-bit value, ordered.
3833 *
3834 * @returns Current *pu128 value
3835 * @param pu128 Pointer to the 128-bit variable to read.
3836 * The memory pointed to must be writable.
3837 *
3838 * @remarks AMD64: Requires the memory to be both readable and writable.
3839 * @remarks AMD64: Requires support for cmpxchg16b.
3840 */
3841DECLINLINE(uint128_t) ASMAtomicReadU128(volatile uint128_t RT_FAR *pu128) RT_NOTHROW_DEF
3842{
3843 RTUINT128U u128Ret;
3844 Assert(!((uintptr_t)pu128 & 15));
3845# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3846 __asm__ __volatile__("Lstart_ASMAtomicReadU128_%=:\n\t"
3847 RTASM_ARM_DMB_SY
3848 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3849 RTASM_ARM_DMB_SY
3850 : [uRetHi] "=r" (u128Ret.s.Hi)
3851 , [uRetLo] "=r" (u128Ret.s.Lo)
3852 : [pMem] "Q" (*pu128)
3853 : );
3854 return u128Ret.u;
3855# else
3856 ASMAtomicCmpXchgU128v2(pu128, 0, 0, 0, 0, &u128Ret.u);
3857 return u128Ret.u;
3858# endif
3859}
3860
3861/**
3862 * Atomically reads an unsigned 128-bit value, ordered.
3863 *
3864 * @returns Current *pu128 value
3865 * @param pu128 Pointer to the 128-bit variable to read.
3866 * The memory pointed to must be writable.
3867 *
3868 * @remarks AMD64: Requires the memory to be both readable and writable.
3869 * @remarks AMD64: Requires support for cmpxchg16b.
3870 */
3871DECLINLINE(RTUINT128U) ASMAtomicReadU128U(volatile RTUINT128U RT_FAR *pu128) RT_NOTHROW_DEF
3872{
3873 RTUINT128U u128Ret;
3874 Assert(!((uintptr_t)pu128 & 15));
3875# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3876 __asm__ __volatile__("Lstart_ASMAtomicReadU128U_%=:\n\t"
3877 RTASM_ARM_DMB_SY
3878 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3879 RTASM_ARM_DMB_SY
3880 : [uRetHi] "=r" (u128Ret.s.Hi)
3881 , [uRetLo] "=r" (u128Ret.s.Lo)
3882 : [pMem] "Q" (*pu128)
3883 : );
3884 return u128Ret;
3885# else
3886 ASMAtomicCmpXchgU128v2(&pu128->u, 0, 0, 0, 0, &u128Ret.u);
3887 return u128Ret;
3888# endif
3889}
3890
3891
3892/**
3893 * Atomically reads an unsigned 128-bit value, unordered.
3894 *
3895 * @returns Current *pu128 value
3896 * @param pu128 Pointer to the 128-bit variable to read.
3897 * The memory pointed to must be writable.
3898 *
3899 * @remarks AMD64: Requires the memory to be both readable and writable.
3900 * @remarks AMD64: Requires support for cmpxchg16b.
3901 * @remarks AMD64: Is ordered.
3902 */
3903DECLINLINE(uint128_t) ASMAtomicUoReadU128(volatile uint128_t RT_FAR *pu128) RT_NOTHROW_DEF
3904{
3905 Assert(!((uintptr_t)pu128 & 15));
3906# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3907 RTUINT128U u128Ret;
3908 __asm__ __volatile__("Lstart_ASMAtomicUoReadU128_%=:\n\t"
3909 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3910 : [uRetHi] "=r" (u128Ret.s.Hi)
3911 , [uRetLo] "=r" (u128Ret.s.Lo)
3912 : [pMem] "Q" (*pu128)
3913 : );
3914 return u128Ret.u;
3915
3916# elif defined(RT_ARCH_AMD64) && 0
3917 /* This doesn't work because __m128i can't be made volatile and we're not
3918 able to force MSC (2019) to emit _mm_load_si128 (besides it emits movdqu
3919 instead of movdqa). */
3920 __m128i uTmpSse = _mm_load_si128((__m128i volatile *)pu128);
3921 __m128i uTmpSseHi = _mm_srli_si128(uTmpSse, 64 / 8);
3922 RTUINT128U u128Ret;
3923 u128Ret.s.Lo = (uint64_t)_mm_cvtsi128_si64(uTmpSse);
3924 u128Ret.s.Hi = (uint64_t)_mm_cvtsi128_si64(uTmpSseHi);
3925 return u128Ret.u;
3926
3927# else
3928 return ASMAtomicReadU128(pu128);
3929# endif
3930}
3931
3932/**
3933 * Atomically reads an unsigned 128-bit value, unordered.
3934 *
3935 * @returns Current *pu128 value
3936 * @param pu128 Pointer to the 128-bit variable to read.
3937 * The memory pointed to must be writable.
3938 *
3939 * @remarks AMD64: Requires the memory to be both readable and writable.
3940 * @remarks AMD64: Requires support for cmpxchg16b.
3941 * @remarks AMD64: Is ordered.
3942 */
3943DECLINLINE(RTUINT128U) ASMAtomicUoReadU128U(volatile RTUINT128U RT_FAR *pu128) RT_NOTHROW_DEF
3944{
3945 Assert(!((uintptr_t)pu128 & 15));
3946# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3947 RTUINT128U u128Ret;
3948 __asm__ __volatile__("Lstart_ASMAtomicUoReadU128U_%=:\n\t"
3949 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3950 : [uRetHi] "=r" (u128Ret.s.Hi)
3951 , [uRetLo] "=r" (u128Ret.s.Lo)
3952 : [pMem] "Q" (*pu128)
3953 : );
3954 return u128Ret;
3955# else
3956 return ASMAtomicReadU128U(pu128);
3957# endif
3958}
3959
3960#endif /* RTASM_HAVE_READ_U128 */
3961
3962/**
3963 * Atomically reads a size_t value, ordered.
3964 *
3965 * @returns Current *pcb value
3966 * @param pcb Pointer to the size_t variable to read.
3967 */
3968DECLINLINE(size_t) ASMAtomicReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3969{
3970#if ARCH_BITS == 64
3971 return ASMAtomicReadU64((uint64_t volatile RT_FAR *)pcb);
3972#elif ARCH_BITS == 32
3973 return ASMAtomicReadU32((uint32_t volatile RT_FAR *)pcb);
3974#elif ARCH_BITS == 16
3975 AssertCompileSize(size_t, 2);
3976 return ASMAtomicReadU16((uint16_t volatile RT_FAR *)pcb);
3977#else
3978# error "Unsupported ARCH_BITS value"
3979#endif
3980}
3981
3982
3983/**
3984 * Atomically reads a size_t value, unordered.
3985 *
3986 * @returns Current *pcb value
3987 * @param pcb Pointer to the size_t variable to read.
3988 */
3989DECLINLINE(size_t) ASMAtomicUoReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3990{
3991#if ARCH_BITS == 64 || ARCH_BITS == 16
3992 return ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)pcb);
3993#elif ARCH_BITS == 32
3994 return ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)pcb);
3995#elif ARCH_BITS == 16
3996 AssertCompileSize(size_t, 2);
3997 return ASMAtomicUoReadU16((uint16_t volatile RT_FAR *)pcb);
3998#else
3999# error "Unsupported ARCH_BITS value"
4000#endif
4001}
4002
4003
4004/**
4005 * Atomically reads a pointer value, ordered.
4006 *
4007 * @returns Current *pv value
4008 * @param ppv Pointer to the pointer variable to read.
4009 *
4010 * @remarks Please use ASMAtomicReadPtrT, it provides better type safety and
4011 * requires less typing (no casts).
4012 */
4013DECLINLINE(void RT_FAR *) ASMAtomicReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
4014{
4015#if ARCH_BITS == 32 || ARCH_BITS == 16
4016 return (void RT_FAR *)ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
4017#elif ARCH_BITS == 64
4018 return (void RT_FAR *)ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
4019#else
4020# error "ARCH_BITS is bogus"
4021#endif
4022}
4023
4024/**
4025 * Convenience macro for avoiding the annoying casting with ASMAtomicReadPtr.
4026 *
4027 * @returns Current *pv value
4028 * @param ppv Pointer to the pointer variable to read.
4029 * @param Type The type of *ppv, sans volatile.
4030 */
4031#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
4032# define ASMAtomicReadPtrT(ppv, Type) \
4033 __extension__ \
4034 ({\
4035 __typeof__(*(ppv)) volatile *ppvTypeChecked = (ppv); \
4036 Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicReadPtr((void * volatile *)ppvTypeChecked); \
4037 pvTypeChecked; \
4038 })
4039#else
4040# define ASMAtomicReadPtrT(ppv, Type) \
4041 (Type)ASMAtomicReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
4042#endif
4043
4044
4045/**
4046 * Atomically reads a pointer value, unordered.
4047 *
4048 * @returns Current *pv value
4049 * @param ppv Pointer to the pointer variable to read.
4050 *
4051 * @remarks Please use ASMAtomicUoReadPtrT, it provides better type safety and
4052 * requires less typing (no casts).
4053 */
4054DECLINLINE(void RT_FAR *) ASMAtomicUoReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
4055{
4056#if ARCH_BITS == 32 || ARCH_BITS == 16
4057 return (void RT_FAR *)ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
4058#elif ARCH_BITS == 64
4059 return (void RT_FAR *)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
4060#else
4061# error "ARCH_BITS is bogus"
4062#endif
4063}
4064
4065
4066/**
4067 * Convenience macro for avoiding the annoying casting with ASMAtomicUoReadPtr.
4068 *
4069 * @returns Current *pv value
4070 * @param ppv Pointer to the pointer variable to read.
4071 * @param Type The type of *ppv, sans volatile.
4072 */
4073#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
4074# define ASMAtomicUoReadPtrT(ppv, Type) \
4075 __extension__ \
4076 ({\
4077 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4078 Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicUoReadPtr((void * volatile *)ppvTypeChecked); \
4079 pvTypeChecked; \
4080 })
4081#else
4082# define ASMAtomicUoReadPtrT(ppv, Type) \
4083 (Type)ASMAtomicUoReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
4084#endif
4085
4086
4087/**
4088 * Atomically reads a boolean value, ordered.
4089 *
4090 * @returns Current *pf value
4091 * @param pf Pointer to the boolean variable to read.
4092 */
4093DECLINLINE(bool) ASMAtomicReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
4094{
4095 ASMMemoryFence();
4096 return *pf; /* byte reads are atomic on x86 */
4097}
4098
4099
4100/**
4101 * Atomically reads a boolean value, unordered.
4102 *
4103 * @returns Current *pf value
4104 * @param pf Pointer to the boolean variable to read.
4105 */
4106DECLINLINE(bool) ASMAtomicUoReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
4107{
4108 return *pf; /* byte reads are atomic on x86 */
4109}
4110
4111
4112/**
4113 * Atomically read a typical IPRT handle value, ordered.
4114 *
4115 * @param ph Pointer to the handle variable to read.
4116 * @param phRes Where to store the result.
4117 *
4118 * @remarks This doesn't currently work for all handles (like RTFILE).
4119 */
4120#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4121# define ASMAtomicReadHandle(ph, phRes) \
4122 do { \
4123 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4124 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
4125 *(uint32_t RT_FAR *)(phRes) = ASMAtomicReadU32((uint32_t volatile RT_FAR *)(ph)); \
4126 } while (0)
4127#elif HC_ARCH_BITS == 64
4128# define ASMAtomicReadHandle(ph, phRes) \
4129 do { \
4130 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4131 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
4132 *(uint64_t RT_FAR *)(phRes) = ASMAtomicReadU64((uint64_t volatile RT_FAR *)(ph)); \
4133 } while (0)
4134#else
4135# error HC_ARCH_BITS
4136#endif
4137
4138
4139/**
4140 * Atomically read a typical IPRT handle value, unordered.
4141 *
4142 * @param ph Pointer to the handle variable to read.
4143 * @param phRes Where to store the result.
4144 *
4145 * @remarks This doesn't currently work for all handles (like RTFILE).
4146 */
4147#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4148# define ASMAtomicUoReadHandle(ph, phRes) \
4149 do { \
4150 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4151 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
4152 *(uint32_t RT_FAR *)(phRes) = ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)(ph)); \
4153 } while (0)
4154#elif HC_ARCH_BITS == 64
4155# define ASMAtomicUoReadHandle(ph, phRes) \
4156 do { \
4157 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4158 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
4159 *(uint64_t RT_FAR *)(phRes) = ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)(ph)); \
4160 } while (0)
4161#else
4162# error HC_ARCH_BITS
4163#endif
4164
4165
4166/**
4167 * Atomically read a value which size might differ
4168 * between platforms or compilers, ordered.
4169 *
4170 * @param pu Pointer to the variable to read.
4171 * @param puRes Where to store the result.
4172 */
4173#define ASMAtomicReadSize(pu, puRes) \
4174 do { \
4175 switch (sizeof(*(pu))) { \
4176 case 1: *(uint8_t RT_FAR *)(puRes) = ASMAtomicReadU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4177 case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4178 case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4179 case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4180 default: AssertMsgFailed(("ASMAtomicReadSize: size %d is not supported\n", sizeof(*(pu)))); \
4181 } \
4182 } while (0)
4183
4184
4185/**
4186 * Atomically read a value which size might differ
4187 * between platforms or compilers, unordered.
4188 *
4189 * @param pu Pointer to the variable to read.
4190 * @param puRes Where to store the result.
4191 */
4192#define ASMAtomicUoReadSize(pu, puRes) \
4193 do { \
4194 switch (sizeof(*(pu))) { \
4195 case 1: *(uint8_t RT_FAR *)(puRes) = ASMAtomicUoReadU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4196 case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicUoReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4197 case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4198 case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4199 default: AssertMsgFailed(("ASMAtomicReadSize: size %d is not supported\n", sizeof(*(pu)))); \
4200 } \
4201 } while (0)
4202
4203
4204/**
4205 * Atomically writes an unsigned 8-bit value, ordered.
4206 *
4207 * @param pu8 Pointer to the 8-bit variable.
4208 * @param u8 The 8-bit value to assign to *pu8.
4209 */
4210DECLINLINE(void) ASMAtomicWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
4211{
4212#if defined(RT_ARCH_ARM64)
4213
4214# ifdef RT_INLINE_ASM_USES_INTRIN
4215 __dmb(_ARM64_BARRIER_SY);
4216 __stlr8(pu8, u8);
4217 __dmb(_ARM64_BARRIER_SY);
4218# else
4219
4220 /* The DMB SY will ensure ordering a la x86, the stlrb is probably overkill
4221 as all byte accesses are single-copy atomic, which I think suffices here. */
4222 __asm__ __volatile__("Lstart_ASMAtomicWriteU8_%=:\n\t"
4223# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* this is a lot slower and has no alignment benefits with LSE2 */
4224 RTASM_ARM_DMB_SY
4225 "swpb %w[uValue], wzr, %[pMem]\n\t"
4226# else
4227 RTASM_ARM_DMB_SY
4228 "stlrb %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4229# endif
4230 : [pMem] "+Q" (*pu8)
4231 : [uValue] "r" ((uint32_t)u8)
4232 : );
4233# endif
4234
4235#else
4236 ASMAtomicXchgU8(pu8, u8);
4237#endif
4238}
4239
4240
4241/**
4242 * Atomically writes an unsigned 8-bit value, unordered.
4243 *
4244 * @param pu8 Pointer to the 8-bit variable.
4245 * @param u8 The 8-bit value to assign to *pu8.
4246 */
4247DECLINLINE(void) ASMAtomicUoWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
4248{
4249 *pu8 = u8; /* byte writes are atomic on x86 */
4250}
4251
4252
4253/**
4254 * Atomically writes a signed 8-bit value, ordered.
4255 *
4256 * @param pi8 Pointer to the 8-bit variable to read.
4257 * @param i8 The 8-bit value to assign to *pi8.
4258 */
4259DECLINLINE(void) ASMAtomicWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
4260{
4261#if defined(RT_ARCH_ARM64)
4262 ASMAtomicWriteU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
4263#else
4264 ASMAtomicXchgS8(pi8, i8);
4265#endif
4266}
4267
4268
4269/**
4270 * Atomically writes a signed 8-bit value, unordered.
4271 *
4272 * @param pi8 Pointer to the 8-bit variable to write.
4273 * @param i8 The 8-bit value to assign to *pi8.
4274 */
4275DECLINLINE(void) ASMAtomicUoWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
4276{
4277 *pi8 = i8; /* byte writes are atomic on x86 */
4278}
4279
4280
4281/**
4282 * Atomically writes an unsigned 16-bit value, ordered.
4283 *
4284 * @param pu16 Pointer to the 16-bit variable to write.
4285 * @param u16 The 16-bit value to assign to *pu16.
4286 */
4287DECLINLINE(void) ASMAtomicWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
4288{
4289#if defined(RT_ARCH_ARM64)
4290
4291# ifdef RT_INLINE_ASM_USES_INTRIN
4292 __dmb(_ARM64_BARRIER_SY);
4293 __stlr16(pu16, u16);
4294 __dmb(_ARM64_BARRIER_SY);
4295# else
4296
4297 __asm__ __volatile__("Lstart_ASMAtomicWriteU16_%=:\n\t"
4298# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4299 RTASM_ARM_DMB_SY
4300 "swph %w[uValue], wzr, %[pMem]\n\t"
4301# else
4302 RTASM_ARM_DMB_SY
4303 "stlrh %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4304# endif
4305 : [pMem] "+Q" (*pu16)
4306 : [uValue] "r" ((uint32_t)u16)
4307 : );
4308# endif
4309
4310#else
4311 ASMAtomicXchgU16(pu16, u16);
4312#endif
4313}
4314
4315
4316/**
4317 * Atomically writes an unsigned 16-bit value, unordered.
4318 *
4319 * @param pu16 Pointer to the 16-bit variable to write.
4320 * @param u16 The 16-bit value to assign to *pu16.
4321 */
4322DECLINLINE(void) ASMAtomicUoWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
4323{
4324 Assert(!((uintptr_t)pu16 & 1));
4325 *pu16 = u16;
4326}
4327
4328
4329/**
4330 * Atomically writes a signed 16-bit value, ordered.
4331 *
4332 * @param pi16 Pointer to the 16-bit variable to write.
4333 * @param i16 The 16-bit value to assign to *pi16.
4334 */
4335DECLINLINE(void) ASMAtomicWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4336{
4337#if defined(RT_ARCH_ARM64)
4338 ASMAtomicWriteU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
4339#else
4340 ASMAtomicXchgS16(pi16, i16);
4341#endif
4342}
4343
4344
4345/**
4346 * Atomically writes a signed 16-bit value, unordered.
4347 *
4348 * @param pi16 Pointer to the 16-bit variable to write.
4349 * @param i16 The 16-bit value to assign to *pi16.
4350 */
4351DECLINLINE(void) ASMAtomicUoWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4352{
4353 Assert(!((uintptr_t)pi16 & 1));
4354 *pi16 = i16;
4355}
4356
4357
4358/**
4359 * Atomically writes an unsigned 32-bit value, ordered.
4360 *
4361 * @param pu32 Pointer to the 32-bit variable to write.
4362 * @param u32 The 32-bit value to assign to *pu32.
4363 */
4364DECLINLINE(void) ASMAtomicWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4365{
4366#if defined(RT_ARCH_ARM64)
4367
4368
4369# ifdef RT_INLINE_ASM_USES_INTRIN
4370 __dmb(_ARM64_BARRIER_SY);
4371 __stlr32(pu32, u32);
4372 __dmb(_ARM64_BARRIER_SY);
4373# else
4374
4375 __asm__ __volatile__("Lstart_ASMAtomicWriteU32_%=:\n\t"
4376# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4377 RTASM_ARM_DMB_SY
4378 "swp %w[uValue], wzr, %[pMem]\n\t"
4379# else
4380 RTASM_ARM_DMB_SY
4381 "stlr %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4382# endif
4383 : [pMem] "+Q" (*pu32)
4384 : [uValue] "r" (u32)
4385 : "cc");
4386# endif
4387
4388#else
4389 ASMAtomicXchgU32(pu32, u32);
4390#endif
4391}
4392
4393
4394/**
4395 * Atomically writes an unsigned 32-bit value, unordered.
4396 *
4397 * @param pu32 Pointer to the 32-bit variable to write.
4398 * @param u32 The 32-bit value to assign to *pu32.
4399 */
4400DECLINLINE(void) ASMAtomicUoWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4401{
4402 Assert(!((uintptr_t)pu32 & 3));
4403#if ARCH_BITS >= 32
4404 *pu32 = u32;
4405#else
4406 ASMAtomicXchgU32(pu32, u32);
4407#endif
4408}
4409
4410
4411/**
4412 * Atomically writes a signed 32-bit value, ordered.
4413 *
4414 * @param pi32 Pointer to the 32-bit variable to write.
4415 * @param i32 The 32-bit value to assign to *pi32.
4416 */
4417DECLINLINE(void) ASMAtomicWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4418{
4419#if defined(RT_ARCH_ARM64)
4420 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
4421#else
4422 ASMAtomicXchgS32(pi32, i32);
4423#endif
4424}
4425
4426
4427/**
4428 * Atomically writes a signed 32-bit value, unordered.
4429 *
4430 * @param pi32 Pointer to the 32-bit variable to write.
4431 * @param i32 The 32-bit value to assign to *pi32.
4432 */
4433DECLINLINE(void) ASMAtomicUoWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4434{
4435 Assert(!((uintptr_t)pi32 & 3));
4436#if ARCH_BITS >= 32
4437 *pi32 = i32;
4438#else
4439 ASMAtomicXchgS32(pi32, i32);
4440#endif
4441}
4442
4443
4444/**
4445 * Atomically writes an unsigned 64-bit value, ordered.
4446 *
4447 * @param pu64 Pointer to the 64-bit variable to write.
4448 * @param u64 The 64-bit value to assign to *pu64.
4449 */
4450DECLINLINE(void) ASMAtomicWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4451{
4452#if defined(RT_ARCH_ARM64)
4453
4454# ifdef RT_INLINE_ASM_USES_INTRIN
4455 __dmb(_ARM64_BARRIER_SY);
4456 __stlr64(pu64, u64);
4457 __dmb(_ARM64_BARRIER_SY);
4458# else
4459
4460 __asm__ __volatile__("Lstart_ASMAtomicWriteU64_%=:\n\t"
4461# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4462 RTASM_ARM_DMB_SY
4463 "swp %[uValue], xzr, %[pMem]\n\t"
4464# else
4465 RTASM_ARM_DMB_SY /** @todo necessary? */
4466 "stlr %[uValue], %[pMem]\n\t"
4467# endif
4468 : [pMem] "+Q" (*pu64)
4469 : [uValue] "r" (u64)
4470 : );
4471# endif
4472
4473#else
4474 ASMAtomicXchgU64(pu64, u64);
4475#endif
4476}
4477
4478
4479/**
4480 * Atomically writes an unsigned 64-bit value, unordered.
4481 *
4482 * @param pu64 Pointer to the 64-bit variable to write.
4483 * @param u64 The 64-bit value to assign to *pu64.
4484 */
4485DECLINLINE(void) ASMAtomicUoWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4486{
4487 Assert(!((uintptr_t)pu64 & 7));
4488#if ARCH_BITS == 64
4489 *pu64 = u64;
4490#else
4491 ASMAtomicXchgU64(pu64, u64);
4492#endif
4493}
4494
4495
4496/**
4497 * Atomically writes a signed 64-bit value, ordered.
4498 *
4499 * @param pi64 Pointer to the 64-bit variable to write.
4500 * @param i64 The 64-bit value to assign to *pi64.
4501 */
4502DECLINLINE(void) ASMAtomicWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4503{
4504#if defined(RT_ARCH_ARM64)
4505 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
4506#else
4507 ASMAtomicXchgS64(pi64, i64);
4508#endif
4509}
4510
4511
4512/**
4513 * Atomically writes a signed 64-bit value, unordered.
4514 *
4515 * @param pi64 Pointer to the 64-bit variable to write.
4516 * @param i64 The 64-bit value to assign to *pi64.
4517 */
4518DECLINLINE(void) ASMAtomicUoWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4519{
4520 Assert(!((uintptr_t)pi64 & 7));
4521#if ARCH_BITS == 64
4522 *pi64 = i64;
4523#else
4524 ASMAtomicXchgS64(pi64, i64);
4525#endif
4526}
4527
4528
4529/** @def RTASM_HAVE_WRITE_U128
4530 * Defined in the target architecture supports atomic of 128-bit integers.
4531 *
4532 * The define value is zero if both ordered and unordered writes are implemented
4533 * using ASMAtomicCmpXchgU128v2(). It is 1 if unordered writes are done
4534 * natively w/o cmpxchg and 3 if both variants are done natively w/o cmpxchg.
4535 *
4536 * @note AMD64: Caller must check for cmpxchg16b support before use.
4537 * @sa RTASM_HAVE_CMP_XCHG_U128
4538 */
4539#if defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
4540# define RTASM_HAVE_WRITE_U128 3
4541#elif defined(RTASM_HAVE_CMP_XCHG_U128)
4542# define RTASM_HAVE_WRITE_U128 0
4543#endif
4544
4545#ifdef RTASM_HAVE_WRITE_U128
4546
4547/**
4548 * Atomically writes an unsigned 128-bit value, ordered.
4549 *
4550 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4551 * on 16 byte boundrary.
4552 * @param u64Hi The high 64 bits of the new value.
4553 * @param u64Lo The low 64 bits of the new value.
4554 */
4555DECLINLINE(void) ASMAtomicWriteU128v2(volatile uint128_t *pu128, const uint64_t u64Hi, const uint64_t u64Lo) RT_NOTHROW_DEF
4556{
4557# if !defined(__GNUC__) || !defined(RT_ARCH_ARM64)
4558 RTUINT128U u128Old;
4559# endif
4560 Assert(!((uintptr_t)pu128 & 15));
4561# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
4562 __asm__ __volatile__("Lstart_ASMAtomicWriteU128v2_%=:\n\t"
4563# if 0 && defined(RTASM_ARM64_USE_FEAT_LSE128) /** @todo hw support? test + debug */
4564 RTASM_ARM_DMB_SY
4565 "swpp %[uValueLo], %[uValueHi], %[pMem]\n\t"
4566# else
4567 RTASM_ARM_DMB_SY
4568 "stp %[uValueLo], %[uValueHi], %[pMem]\n\t"
4569 "dmb sy\n\t"
4570# endif
4571 : [pMem] "+Q" (*pu128)
4572 : [uValueHi] "r" (u64Hi)
4573 , [uValueLo] "r" (u64Lo)
4574 : );
4575
4576# else
4577# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4578 u128Old.u = *pu128;
4579# else
4580 u128Old.u.Lo = pu128->Lo;
4581 u128Old.u.Hi = pu128->Hi;
4582# endif
4583 while (!ASMAtomicCmpXchgU128v2(pu128, u64Hi, u64Lo, u128Old.s.Hi, u128Old.s.Lo, &u128Old.u))
4584 { }
4585# endif
4586}
4587
4588
4589/**
4590 * Atomically writes an unsigned 128-bit value, ordered.
4591 *
4592 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4593 * on 16 byte boundrary.
4594 * @param u64Hi The high 64 bits of the new value.
4595 * @param u64Lo The low 64 bits of the new value.
4596 * @note This is ordered on AMD64.
4597 */
4598DECLINLINE(void) ASMAtomicUoWriteU128v2(volatile uint128_t *pu128, const uint64_t u64Hi, const uint64_t u64Lo) RT_NOTHROW_DEF
4599{
4600# if !defined(__GNUC__) || !defined(RT_ARCH_ARM64)
4601 RTUINT128U u128Old;
4602# endif
4603 Assert(!((uintptr_t)pu128 & 15));
4604# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
4605 __asm__ __volatile__("Lstart_ASMAtomicUoWriteU128v2_%=:\n\t"
4606 "stp %[uValueLo], %[uValueHi], %[pMem]\n\t"
4607 : [pMem] "+Q" (*pu128)
4608 : [uValueHi] "r" (u64Hi)
4609 , [uValueLo] "r" (u64Lo)
4610 : );
4611
4612# else
4613# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4614 u128Old.u = *pu128;
4615# else
4616 u128Old.u.Lo = pu128->Lo;
4617 u128Old.u.Hi = pu128->Hi;
4618# endif
4619 while (!ASMAtomicCmpXchgU128v2(pu128, u64Hi, u64Lo, u128Old.s.Hi, u128Old.s.Lo, &u128Old.u))
4620 { }
4621# endif
4622}
4623
4624
4625/**
4626 * Atomically writes an unsigned 128-bit value, ordered.
4627 *
4628 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4629 * on 16 byte boundrary.
4630 * @param u128 The the new value.
4631 */
4632DECLINLINE(void) ASMAtomicWriteU128(volatile uint128_t *pu128, const uint128_t u128) RT_NOTHROW_DEF
4633{
4634# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4635 ASMAtomicWriteU128v2(pu128, (uint64_t)(u128 >> 64), (uint64_t)u128);
4636# else
4637 ASMAtomicWriteU128v2(pu128, u128.Hi, u128.Lo);
4638# endif
4639}
4640
4641
4642/**
4643 * Atomically writes an unsigned 128-bit value, unordered.
4644 *
4645 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4646 * on 16 byte boundrary.
4647 * @param u128 The the new value.
4648 * @note This is ordered on AMD64.
4649 */
4650DECLINLINE(void) ASMAtomicUoWriteU128(volatile uint128_t *pu128, const uint128_t u128) RT_NOTHROW_DEF
4651{
4652# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4653 ASMAtomicUoWriteU128v2(pu128, (uint64_t)(u128 >> 64), (uint64_t)u128);
4654# else
4655 ASMAtomicUoWriteU128v2(pu128, u128.Hi, u128.Lo);
4656# endif
4657}
4658
4659
4660/**
4661 * Atomically writes an unsigned 128-bit value, ordered.
4662 *
4663 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4664 * on 16 byte boundrary.
4665 * @param u128 The the new value.
4666 */
4667DECLINLINE(void) ASMAtomicWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128) RT_NOTHROW_DEF
4668{
4669 ASMAtomicWriteU128v2(&pu128->u, u128.s.Hi, u128.s.Lo);
4670}
4671
4672
4673/**
4674 * Atomically writes an unsigned 128-bit value, unordered.
4675 *
4676 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4677 * on 16 byte boundrary.
4678 * @param u128 The the new value.
4679 * @note This is ordered on AMD64.
4680 */
4681DECLINLINE(void) ASMAtomicUoWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128) RT_NOTHROW_DEF
4682{
4683 ASMAtomicUoWriteU128v2(&pu128->u, u128.s.Hi, u128.s.Lo);
4684}
4685
4686#endif /* RTASM_HAVE_WRITE_U128 */
4687
4688/**
4689 * Atomically writes a size_t value, ordered.
4690 *
4691 * @param pcb Pointer to the size_t variable to write.
4692 * @param cb The value to assign to *pcb.
4693 */
4694DECLINLINE(void) ASMAtomicWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4695{
4696#if ARCH_BITS == 64
4697 ASMAtomicWriteU64((uint64_t volatile *)pcb, cb);
4698#elif ARCH_BITS == 32
4699 ASMAtomicWriteU32((uint32_t volatile *)pcb, cb);
4700#elif ARCH_BITS == 16
4701 AssertCompileSize(size_t, 2);
4702 ASMAtomicWriteU16((uint16_t volatile *)pcb, cb);
4703#else
4704# error "Unsupported ARCH_BITS value"
4705#endif
4706}
4707
4708
4709/**
4710 * Atomically writes a size_t value, unordered.
4711 *
4712 * @param pcb Pointer to the size_t variable to write.
4713 * @param cb The value to assign to *pcb.
4714 */
4715DECLINLINE(void) ASMAtomicUoWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4716{
4717#if ARCH_BITS == 64
4718 ASMAtomicUoWriteU64((uint64_t volatile *)pcb, cb);
4719#elif ARCH_BITS == 32
4720 ASMAtomicUoWriteU32((uint32_t volatile *)pcb, cb);
4721#elif ARCH_BITS == 16
4722 AssertCompileSize(size_t, 2);
4723 ASMAtomicUoWriteU16((uint16_t volatile *)pcb, cb);
4724#else
4725# error "Unsupported ARCH_BITS value"
4726#endif
4727}
4728
4729
4730/**
4731 * Atomically writes a boolean value, unordered.
4732 *
4733 * @param pf Pointer to the boolean variable to write.
4734 * @param f The boolean value to assign to *pf.
4735 */
4736DECLINLINE(void) ASMAtomicWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4737{
4738 ASMAtomicWriteU8((uint8_t volatile RT_FAR *)pf, f);
4739}
4740
4741
4742/**
4743 * Atomically writes a boolean value, unordered.
4744 *
4745 * @param pf Pointer to the boolean variable to write.
4746 * @param f The boolean value to assign to *pf.
4747 */
4748DECLINLINE(void) ASMAtomicUoWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4749{
4750 *pf = f; /* byte writes are atomic on x86 */
4751}
4752
4753
4754/**
4755 * Atomically writes a pointer value, ordered.
4756 *
4757 * @param ppv Pointer to the pointer variable to write.
4758 * @param pv The pointer value to assign to *ppv.
4759 */
4760DECLINLINE(void) ASMAtomicWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4761{
4762#if ARCH_BITS == 32 || ARCH_BITS == 16
4763 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4764#elif ARCH_BITS == 64
4765 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4766#else
4767# error "ARCH_BITS is bogus"
4768#endif
4769}
4770
4771
4772/**
4773 * Atomically writes a pointer value, unordered.
4774 *
4775 * @param ppv Pointer to the pointer variable to write.
4776 * @param pv The pointer value to assign to *ppv.
4777 */
4778DECLINLINE(void) ASMAtomicUoWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4779{
4780#if ARCH_BITS == 32 || ARCH_BITS == 16
4781 ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4782#elif ARCH_BITS == 64
4783 ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4784#else
4785# error "ARCH_BITS is bogus"
4786#endif
4787}
4788
4789
4790/**
4791 * Atomically writes a pointer value, ordered.
4792 *
4793 * @param ppv Pointer to the pointer variable to write.
4794 * @param pv The pointer value to assign to *ppv. If NULL use
4795 * ASMAtomicWriteNullPtr or you'll land in trouble.
4796 *
4797 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
4798 * NULL.
4799 */
4800#ifdef __GNUC__
4801# define ASMAtomicWritePtr(ppv, pv) \
4802 do \
4803 { \
4804 __typeof__(*(ppv)) volatile RT_FAR * const ppvTypeChecked = (ppv); \
4805 __typeof__(*(ppv)) const pvTypeChecked = (pv); \
4806 \
4807 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4808 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4809 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4810 \
4811 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), (void RT_FAR *)(pvTypeChecked)); \
4812 } while (0)
4813#else
4814# define ASMAtomicWritePtr(ppv, pv) \
4815 do \
4816 { \
4817 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4818 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4819 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4820 \
4821 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv)); \
4822 } while (0)
4823#endif
4824
4825
4826/**
4827 * Atomically sets a pointer to NULL, ordered.
4828 *
4829 * @param ppv Pointer to the pointer variable that should be set to NULL.
4830 *
4831 * @remarks This is relatively type safe on GCC platforms.
4832 */
4833#if RT_GNUC_PREREQ(4, 2)
4834# define ASMAtomicWriteNullPtr(ppv) \
4835 do \
4836 { \
4837 __typeof__(*(ppv)) * const ppvTypeChecked = (ppv); \
4838 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4839 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4840 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), NULL); \
4841 } while (0)
4842#else
4843# define ASMAtomicWriteNullPtr(ppv) \
4844 do \
4845 { \
4846 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4847 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4848 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), NULL); \
4849 } while (0)
4850#endif
4851
4852
4853/**
4854 * Atomically writes a pointer value, unordered.
4855 *
4856 * @returns Current *pv value
4857 * @param ppv Pointer to the pointer variable.
4858 * @param pv The pointer value to assign to *ppv. If NULL use
4859 * ASMAtomicUoWriteNullPtr or you'll land in trouble.
4860 *
4861 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
4862 * NULL.
4863 */
4864#if RT_GNUC_PREREQ(4, 2)
4865# define ASMAtomicUoWritePtr(ppv, pv) \
4866 do \
4867 { \
4868 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4869 __typeof__(*(ppv)) const pvTypeChecked = (pv); \
4870 \
4871 AssertCompile(sizeof(*ppv) == sizeof(void *)); \
4872 AssertCompile(sizeof(pv) == sizeof(void *)); \
4873 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4874 \
4875 *(ppvTypeChecked) = pvTypeChecked; \
4876 } while (0)
4877#else
4878# define ASMAtomicUoWritePtr(ppv, pv) \
4879 do \
4880 { \
4881 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4882 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4883 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4884 *(ppv) = pv; \
4885 } while (0)
4886#endif
4887
4888
4889/**
4890 * Atomically sets a pointer to NULL, unordered.
4891 *
4892 * @param ppv Pointer to the pointer variable that should be set to NULL.
4893 *
4894 * @remarks This is relatively type safe on GCC platforms.
4895 */
4896#ifdef __GNUC__
4897# define ASMAtomicUoWriteNullPtr(ppv) \
4898 do \
4899 { \
4900 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4901 AssertCompile(sizeof(*ppv) == sizeof(void *)); \
4902 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4903 *(ppvTypeChecked) = NULL; \
4904 } while (0)
4905#else
4906# define ASMAtomicUoWriteNullPtr(ppv) \
4907 do \
4908 { \
4909 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4910 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4911 *(ppv) = NULL; \
4912 } while (0)
4913#endif
4914
4915
4916/**
4917 * Atomically write a typical IPRT handle value, ordered.
4918 *
4919 * @param ph Pointer to the variable to update.
4920 * @param hNew The value to assign to *ph.
4921 *
4922 * @remarks This doesn't currently work for all handles (like RTFILE).
4923 */
4924#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4925# define ASMAtomicWriteHandle(ph, hNew) \
4926 do { \
4927 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4928 ASMAtomicWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
4929 } while (0)
4930#elif HC_ARCH_BITS == 64
4931# define ASMAtomicWriteHandle(ph, hNew) \
4932 do { \
4933 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4934 ASMAtomicWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
4935 } while (0)
4936#else
4937# error HC_ARCH_BITS
4938#endif
4939
4940
4941/**
4942 * Atomically write a typical IPRT handle value, unordered.
4943 *
4944 * @param ph Pointer to the variable to update.
4945 * @param hNew The value to assign to *ph.
4946 *
4947 * @remarks This doesn't currently work for all handles (like RTFILE).
4948 */
4949#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4950# define ASMAtomicUoWriteHandle(ph, hNew) \
4951 do { \
4952 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4953 ASMAtomicUoWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)hNew); \
4954 } while (0)
4955#elif HC_ARCH_BITS == 64
4956# define ASMAtomicUoWriteHandle(ph, hNew) \
4957 do { \
4958 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4959 ASMAtomicUoWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)hNew); \
4960 } while (0)
4961#else
4962# error HC_ARCH_BITS
4963#endif
4964
4965
4966/**
4967 * Atomically write a value which size might differ
4968 * between platforms or compilers, ordered.
4969 *
4970 * @param pu Pointer to the variable to update.
4971 * @param uNew The value to assign to *pu.
4972 */
4973#define ASMAtomicWriteSize(pu, uNew) \
4974 do { \
4975 switch (sizeof(*(pu))) { \
4976 case 1: ASMAtomicWriteU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
4977 case 2: ASMAtomicWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
4978 case 4: ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
4979 case 8: ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
4980 default: AssertMsgFailed(("ASMAtomicWriteSize: size %d is not supported\n", sizeof(*(pu)))); \
4981 } \
4982 } while (0)
4983
4984/**
4985 * Atomically write a value which size might differ
4986 * between platforms or compilers, unordered.
4987 *
4988 * @param pu Pointer to the variable to update.
4989 * @param uNew The value to assign to *pu.
4990 */
4991#define ASMAtomicUoWriteSize(pu, uNew) \
4992 do { \
4993 switch (sizeof(*(pu))) { \
4994 case 1: ASMAtomicUoWriteU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
4995 case 2: ASMAtomicUoWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
4996 case 4: ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
4997 case 8: ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
4998 default: AssertMsgFailed(("ASMAtomicWriteSize: size %d is not supported\n", sizeof(*(pu)))); \
4999 } \
5000 } while (0)
5001
5002
5003
5004/**
5005 * Atomically exchanges and adds to a 16-bit value, ordered.
5006 *
5007 * @returns The old value.
5008 * @param pu16 Pointer to the value.
5009 * @param u16 Number to add.
5010 *
5011 * @remarks Currently not implemented, just to make 16-bit code happy.
5012 * @remarks x86: Requires a 486 or later.
5013 */
5014RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicAddU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_PROTO;
5015
5016
5017/**
5018 * Atomically exchanges and adds to a 32-bit value, ordered.
5019 *
5020 * @returns The old value.
5021 * @param pu32 Pointer to the value.
5022 * @param u32 Number to add.
5023 *
5024 * @remarks x86: Requires a 486 or later.
5025 */
5026#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5027RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5028#else
5029DECLINLINE(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5030{
5031# if RT_INLINE_ASM_USES_INTRIN
5032 u32 = _InterlockedExchangeAdd((long RT_FAR *)pu32, u32);
5033 return u32;
5034
5035# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5036# if RT_INLINE_ASM_GNU_STYLE
5037 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5038 : "=r" (u32)
5039 , "=m" (*pu32)
5040 : "0" (u32)
5041 , "m" (*pu32)
5042 : "memory"
5043 , "cc");
5044 return u32;
5045# else
5046 __asm
5047 {
5048 mov eax, [u32]
5049# ifdef RT_ARCH_AMD64
5050 mov rdx, [pu32]
5051 lock xadd [rdx], eax
5052# else
5053 mov edx, [pu32]
5054 lock xadd [edx], eax
5055# endif
5056 mov [u32], eax
5057 }
5058 return u32;
5059# endif
5060
5061# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5062 /* M1 benchmark: ldaddal=6907 vs dmb+ldadd=2114 vs non-lse=6249 (ps/call) */
5063# if defined(RTASM_ARM64_USE_FEAT_LSE)
5064 uint32_t u32OldRet;
5065 __asm__ __volatile__("Lstart_ASMAtomicAddU32_%=:\n\t"
5066# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5067 "ldaddal %w[uAddend], %w[uOldActual], %[pMem]\n\t"
5068# else
5069 RTASM_ARM_DMB_SY
5070 "ldadd %w[uAddend], %w[uOldActual], %[pMem]\n\t"
5071# endif
5072 : [pMem] "+Q" (*pu32)
5073 , [uOldActual] "=&r" (u32OldRet)
5074 : [uAddend] "r" (u32)
5075 : );
5076# else
5077 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAddU32, pu32, DMB_SY,
5078 "add %w[uNew], %w[uOld], %w[uVal]\n\t",
5079 "add %[uNew], %[uOld], %[uVal]\n\t",
5080 [uVal] "r" (u32));
5081# endif
5082 return u32OldRet;
5083
5084# else
5085# error "Port me"
5086# endif
5087}
5088#endif
5089
5090
5091/**
5092 * Atomically exchanges and adds to a signed 32-bit value, ordered.
5093 *
5094 * @returns The old value.
5095 * @param pi32 Pointer to the value.
5096 * @param i32 Number to add.
5097 *
5098 * @remarks x86: Requires a 486 or later.
5099 */
5100DECLINLINE(int32_t) ASMAtomicAddS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5101{
5102 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5103}
5104
5105
5106/**
5107 * Atomically exchanges and adds to a 64-bit value, ordered.
5108 *
5109 * @returns The old value.
5110 * @param pu64 Pointer to the value.
5111 * @param u64 Number to add.
5112 *
5113 * @remarks x86: Requires a Pentium or later.
5114 */
5115#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5116DECLASM(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5117#else
5118DECLINLINE(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5119{
5120# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5121 u64 = _InterlockedExchangeAdd64((__int64 RT_FAR *)pu64, u64);
5122 return u64;
5123
5124# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5125 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
5126 : "=r" (u64)
5127 , "=m" (*pu64)
5128 : "0" (u64)
5129 , "m" (*pu64)
5130 : "memory"
5131 , "cc");
5132 return u64;
5133
5134# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5135# if defined(RTASM_ARM64_USE_FEAT_LSE)
5136 uint64_t u64OldRet;
5137 __asm__ __volatile__("Lstart_ASMAtomicAddU64_%=:\n\t"
5138# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5139 "ldaddal %[uAddend], %[uOldActual], %[pMem]\n\t"
5140# else
5141 RTASM_ARM_DMB_SY
5142 "ldadd %[uAddend], %[uOldActual], %[pMem]\n\t"
5143# endif
5144 : [pMem] "+Q" (*pu64)
5145 , [uOldActual] "=&r" (u64OldRet)
5146 : [uAddend] "r" (u64)
5147 : );
5148# else
5149 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(ASMAtomicAddU64, pu64, DMB_SY,
5150 "add %[uNew], %[uOld], %[uVal]\n\t"
5151 ,
5152 "add %[uNew], %[uOld], %[uVal]\n\t"
5153 "adc %H[uNew], %H[uOld], %H[uVal]\n\t",
5154 [uVal] "r" (u64));
5155# endif
5156 return u64OldRet;
5157
5158# else
5159 uint64_t u64Old;
5160 for (;;)
5161 {
5162 uint64_t u64New;
5163 u64Old = ASMAtomicUoReadU64(pu64);
5164 u64New = u64Old + u64;
5165 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5166 break;
5167 ASMNopPause();
5168 }
5169 return u64Old;
5170# endif
5171}
5172#endif
5173
5174
5175/**
5176 * Atomically exchanges and adds to a signed 64-bit value, ordered.
5177 *
5178 * @returns The old value.
5179 * @param pi64 Pointer to the value.
5180 * @param i64 Number to add.
5181 *
5182 * @remarks x86: Requires a Pentium or later.
5183 */
5184DECLINLINE(int64_t) ASMAtomicAddS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5185{
5186 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5187}
5188
5189
5190/**
5191 * Atomically exchanges and adds to a size_t value, ordered.
5192 *
5193 * @returns The old value.
5194 * @param pcb Pointer to the size_t value.
5195 * @param cb Number to add.
5196 */
5197DECLINLINE(size_t) ASMAtomicAddZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
5198{
5199#if ARCH_BITS == 64
5200 AssertCompileSize(size_t, 8);
5201 return ASMAtomicAddU64((uint64_t volatile RT_FAR *)pcb, cb);
5202#elif ARCH_BITS == 32
5203 AssertCompileSize(size_t, 4);
5204 return ASMAtomicAddU32((uint32_t volatile RT_FAR *)pcb, cb);
5205#elif ARCH_BITS == 16
5206 AssertCompileSize(size_t, 2);
5207 return ASMAtomicAddU16((uint16_t volatile RT_FAR *)pcb, cb);
5208#else
5209# error "Unsupported ARCH_BITS value"
5210#endif
5211}
5212
5213
5214/**
5215 * Atomically exchanges and adds a value which size might differ between
5216 * platforms or compilers, ordered.
5217 *
5218 * @param pu Pointer to the variable to update.
5219 * @param uNew The value to add to *pu.
5220 * @param puOld Where to store the old value.
5221 */
5222#define ASMAtomicAddSize(pu, uNew, puOld) \
5223 do { \
5224 switch (sizeof(*(pu))) { \
5225 case 4: *(uint32_t *)(puOld) = ASMAtomicAddU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
5226 case 8: *(uint64_t *)(puOld) = ASMAtomicAddU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
5227 default: AssertMsgFailed(("ASMAtomicAddSize: size %d is not supported\n", sizeof(*(pu)))); \
5228 } \
5229 } while (0)
5230
5231
5232
5233/**
5234 * Atomically exchanges and subtracts to an unsigned 16-bit value, ordered.
5235 *
5236 * @returns The old value.
5237 * @param pu16 Pointer to the value.
5238 * @param u16 Number to subtract.
5239 *
5240 * @remarks x86: Requires a 486 or later.
5241 */
5242DECLINLINE(uint16_t) ASMAtomicSubU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_DEF
5243{
5244 return ASMAtomicAddU16(pu16, (uint16_t)-(int16_t)u16);
5245}
5246
5247
5248/**
5249 * Atomically exchanges and subtracts to a signed 16-bit value, ordered.
5250 *
5251 * @returns The old value.
5252 * @param pi16 Pointer to the value.
5253 * @param i16 Number to subtract.
5254 *
5255 * @remarks x86: Requires a 486 or later.
5256 */
5257DECLINLINE(int16_t) ASMAtomicSubS16(int16_t volatile RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
5258{
5259 return (int16_t)ASMAtomicAddU16((uint16_t volatile RT_FAR *)pi16, (uint16_t)-i16);
5260}
5261
5262
5263/**
5264 * Atomically exchanges and subtracts to an unsigned 32-bit value, ordered.
5265 *
5266 * @returns The old value.
5267 * @param pu32 Pointer to the value.
5268 * @param u32 Number to subtract.
5269 *
5270 * @remarks x86: Requires a 486 or later.
5271 */
5272DECLINLINE(uint32_t) ASMAtomicSubU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5273{
5274 return ASMAtomicAddU32(pu32, (uint32_t)-(int32_t)u32);
5275}
5276
5277
5278/**
5279 * Atomically exchanges and subtracts to a signed 32-bit value, ordered.
5280 *
5281 * @returns The old value.
5282 * @param pi32 Pointer to the value.
5283 * @param i32 Number to subtract.
5284 *
5285 * @remarks x86: Requires a 486 or later.
5286 */
5287DECLINLINE(int32_t) ASMAtomicSubS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5288{
5289 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)-i32);
5290}
5291
5292
5293/**
5294 * Atomically exchanges and subtracts to an unsigned 64-bit value, ordered.
5295 *
5296 * @returns The old value.
5297 * @param pu64 Pointer to the value.
5298 * @param u64 Number to subtract.
5299 *
5300 * @remarks x86: Requires a Pentium or later.
5301 */
5302DECLINLINE(uint64_t) ASMAtomicSubU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5303{
5304 return ASMAtomicAddU64(pu64, (uint64_t)-(int64_t)u64);
5305}
5306
5307
5308/**
5309 * Atomically exchanges and subtracts to a signed 64-bit value, ordered.
5310 *
5311 * @returns The old value.
5312 * @param pi64 Pointer to the value.
5313 * @param i64 Number to subtract.
5314 *
5315 * @remarks x86: Requires a Pentium or later.
5316 */
5317DECLINLINE(int64_t) ASMAtomicSubS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5318{
5319 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)-i64);
5320}
5321
5322
5323/**
5324 * Atomically exchanges and subtracts to a size_t value, ordered.
5325 *
5326 * @returns The old value.
5327 * @param pcb Pointer to the size_t value.
5328 * @param cb Number to subtract.
5329 *
5330 * @remarks x86: Requires a 486 or later.
5331 */
5332DECLINLINE(size_t) ASMAtomicSubZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
5333{
5334#if ARCH_BITS == 64
5335 return ASMAtomicSubU64((uint64_t volatile RT_FAR *)pcb, cb);
5336#elif ARCH_BITS == 32
5337 return ASMAtomicSubU32((uint32_t volatile RT_FAR *)pcb, cb);
5338#elif ARCH_BITS == 16
5339 AssertCompileSize(size_t, 2);
5340 return ASMAtomicSubU16((uint16_t volatile RT_FAR *)pcb, cb);
5341#else
5342# error "Unsupported ARCH_BITS value"
5343#endif
5344}
5345
5346
5347/**
5348 * Atomically exchanges and subtracts a value which size might differ between
5349 * platforms or compilers, ordered.
5350 *
5351 * @param pu Pointer to the variable to update.
5352 * @param uNew The value to subtract to *pu.
5353 * @param puOld Where to store the old value.
5354 *
5355 * @remarks x86: Requires a 486 or later.
5356 */
5357#define ASMAtomicSubSize(pu, uNew, puOld) \
5358 do { \
5359 switch (sizeof(*(pu))) { \
5360 case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicSubU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
5361 case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicSubU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
5362 default: AssertMsgFailed(("ASMAtomicSubSize: size %d is not supported\n", sizeof(*(pu)))); \
5363 } \
5364 } while (0)
5365
5366
5367
5368/**
5369 * Atomically increment a 16-bit value, ordered.
5370 *
5371 * @returns The new value.
5372 * @param pu16 Pointer to the value to increment.
5373 * @remarks Not implemented. Just to make 16-bit code happy.
5374 *
5375 * @remarks x86: Requires a 486 or later.
5376 */
5377RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicIncU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
5378
5379
5380/**
5381 * Atomically increment a 32-bit value, ordered.
5382 *
5383 * @returns The new value.
5384 * @param pu32 Pointer to the value to increment.
5385 *
5386 * @remarks x86: Requires a 486 or later.
5387 */
5388#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5389RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
5390#else
5391DECLINLINE(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
5392{
5393# if RT_INLINE_ASM_USES_INTRIN
5394 return (uint32_t)_InterlockedIncrement((long RT_FAR *)pu32);
5395
5396# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5397# if RT_INLINE_ASM_GNU_STYLE
5398 uint32_t u32;
5399 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5400 : "=r" (u32)
5401 , "=m" (*pu32)
5402 : "0" (1)
5403 , "m" (*pu32)
5404 : "memory"
5405 , "cc");
5406 return u32+1;
5407# else
5408 __asm
5409 {
5410 mov eax, 1
5411# ifdef RT_ARCH_AMD64
5412 mov rdx, [pu32]
5413 lock xadd [rdx], eax
5414# else
5415 mov edx, [pu32]
5416 lock xadd [edx], eax
5417# endif
5418 mov u32, eax
5419 }
5420 return u32+1;
5421# endif
5422
5423# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5424 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2117 vs non-lse=6247 (ps/call) */
5425# if defined(RTASM_ARM64_USE_FEAT_LSE)
5426 uint32_t u32NewRet;
5427 __asm__ __volatile__("Lstart_ASMAtomicIncU32_%=:\n\t"
5428# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5429 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5430# else
5431 RTASM_ARM_DMB_SY
5432 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5433# endif
5434 "add %w[uNewRet], %w[uNewRet], #1\n\t"
5435 : [pMem] "+Q" (*pu32)
5436 , [uNewRet] "=&r" (u32NewRet)
5437 : [uAddend] "r" ((uint32_t)1)
5438 : );
5439# else
5440 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicIncU32, pu32, DMB_SY,
5441 "add %w[uNew], %w[uNew], #1\n\t",
5442 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5443 "X" (0) /* dummy */);
5444# endif
5445 return u32NewRet;
5446
5447# else
5448 return ASMAtomicAddU32(pu32, 1) + 1;
5449# endif
5450}
5451#endif
5452
5453
5454/**
5455 * Atomically increment a signed 32-bit value, ordered.
5456 *
5457 * @returns The new value.
5458 * @param pi32 Pointer to the value to increment.
5459 *
5460 * @remarks x86: Requires a 486 or later.
5461 */
5462DECLINLINE(int32_t) ASMAtomicIncS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5463{
5464 return (int32_t)ASMAtomicIncU32((uint32_t volatile RT_FAR *)pi32);
5465}
5466
5467
5468/**
5469 * Atomically increment a 64-bit value, ordered.
5470 *
5471 * @returns The new value.
5472 * @param pu64 Pointer to the value to increment.
5473 *
5474 * @remarks x86: Requires a Pentium or later.
5475 */
5476#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5477DECLASM(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
5478#else
5479DECLINLINE(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
5480{
5481# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5482 return (uint64_t)_InterlockedIncrement64((__int64 RT_FAR *)pu64);
5483
5484# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5485 uint64_t u64;
5486 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
5487 : "=r" (u64)
5488 , "=m" (*pu64)
5489 : "0" (1)
5490 , "m" (*pu64)
5491 : "memory"
5492 , "cc");
5493 return u64 + 1;
5494
5495# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5496# if defined(RTASM_ARM64_USE_FEAT_LSE)
5497 uint64_t u64NewRet;
5498 __asm__ __volatile__("Lstart_ASMAtomicIncU64_%=:\n\t"
5499# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5500 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
5501# else
5502 RTASM_ARM_DMB_SY
5503 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
5504# endif
5505 "add %[uNewRet], %[uNewRet], #1\n\t"
5506 : [pMem] "+Q" (*pu64)
5507 , [uNewRet] "=&r" (u64NewRet)
5508 : [uAddend] "r" ((uint64_t)1)
5509 : );
5510# else
5511 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicIncU64, pu64, DMB_SY,
5512 "add %[uNew], %[uNew], #1\n\t"
5513 ,
5514 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
5515 "adc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
5516 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
5517# endif
5518 return u64NewRet;
5519
5520# else
5521 return ASMAtomicAddU64(pu64, 1) + 1;
5522# endif
5523}
5524#endif
5525
5526
5527/**
5528 * Atomically increment a signed 64-bit value, ordered.
5529 *
5530 * @returns The new value.
5531 * @param pi64 Pointer to the value to increment.
5532 *
5533 * @remarks x86: Requires a Pentium or later.
5534 */
5535DECLINLINE(int64_t) ASMAtomicIncS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5536{
5537 return (int64_t)ASMAtomicIncU64((uint64_t volatile RT_FAR *)pi64);
5538}
5539
5540
5541/**
5542 * Atomically increment a size_t value, ordered.
5543 *
5544 * @returns The new value.
5545 * @param pcb Pointer to the value to increment.
5546 *
5547 * @remarks x86: Requires a 486 or later.
5548 */
5549DECLINLINE(size_t) ASMAtomicIncZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5550{
5551#if ARCH_BITS == 64
5552 return ASMAtomicIncU64((uint64_t volatile RT_FAR *)pcb);
5553#elif ARCH_BITS == 32
5554 return ASMAtomicIncU32((uint32_t volatile RT_FAR *)pcb);
5555#elif ARCH_BITS == 16
5556 return ASMAtomicIncU16((uint16_t volatile RT_FAR *)pcb);
5557#else
5558# error "Unsupported ARCH_BITS value"
5559#endif
5560}
5561
5562
5563
5564/**
5565 * Atomically decrement an unsigned 32-bit value, ordered.
5566 *
5567 * @returns The new value.
5568 * @param pu16 Pointer to the value to decrement.
5569 * @remarks Not implemented. Just to make 16-bit code happy.
5570 *
5571 * @remarks x86: Requires a 486 or later.
5572 */
5573RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
5574
5575
5576/**
5577 * Atomically decrement an unsigned 32-bit value, ordered.
5578 *
5579 * @returns The new value.
5580 * @param pu32 Pointer to the value to decrement.
5581 *
5582 * @remarks x86: Requires a 486 or later.
5583 */
5584#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5585RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
5586#else
5587DECLINLINE(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
5588{
5589# if RT_INLINE_ASM_USES_INTRIN
5590 return (uint32_t)_InterlockedDecrement((long RT_FAR *)pu32);
5591
5592# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5593# if RT_INLINE_ASM_GNU_STYLE
5594 uint32_t u32;
5595 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5596 : "=r" (u32)
5597 , "=m" (*pu32)
5598 : "0" (-1)
5599 , "m" (*pu32)
5600 : "memory"
5601 , "cc");
5602 return u32-1;
5603# else
5604 uint32_t u32;
5605 __asm
5606 {
5607 mov eax, -1
5608# ifdef RT_ARCH_AMD64
5609 mov rdx, [pu32]
5610 lock xadd [rdx], eax
5611# else
5612 mov edx, [pu32]
5613 lock xadd [edx], eax
5614# endif
5615 mov u32, eax
5616 }
5617 return u32-1;
5618# endif
5619
5620# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5621 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2120 vs non-lse=6260 (ps/call) */
5622# if defined(RTASM_ARM64_USE_FEAT_LSE)
5623 uint32_t u32NewRet;
5624 __asm__ __volatile__("Lstart_ASMAtomicDecU32_%=:\n\t"
5625# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5626 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5627# else
5628 RTASM_ARM_DMB_SY
5629 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5630# endif
5631 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
5632 : [pMem] "+Q" (*pu32)
5633 , [uNewRet] "=&r" (u32NewRet)
5634 : [uAddend] "r" (~(uint32_t)0)
5635 : );
5636# else
5637 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicDecU32, pu32, DMB_SY,
5638 "sub %w[uNew], %w[uNew], #1\n\t",
5639 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5640 "X" (0) /* dummy */);
5641# endif
5642 return u32NewRet;
5643
5644# else
5645 return ASMAtomicSubU32(pu32, 1) - (uint32_t)1;
5646# endif
5647}
5648#endif
5649
5650
5651/**
5652 * Atomically decrement a signed 32-bit value, ordered.
5653 *
5654 * @returns The new value.
5655 * @param pi32 Pointer to the value to decrement.
5656 *
5657 * @remarks x86: Requires a 486 or later.
5658 */
5659DECLINLINE(int32_t) ASMAtomicDecS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5660{
5661 return (int32_t)ASMAtomicDecU32((uint32_t volatile RT_FAR *)pi32);
5662}
5663
5664
5665/**
5666 * Atomically decrement an unsigned 64-bit value, ordered.
5667 *
5668 * @returns The new value.
5669 * @param pu64 Pointer to the value to decrement.
5670 *
5671 * @remarks x86: Requires a Pentium or later.
5672 */
5673#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5674RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
5675#else
5676DECLINLINE(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
5677{
5678# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5679 return (uint64_t)_InterlockedDecrement64((__int64 volatile RT_FAR *)pu64);
5680
5681# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5682 uint64_t u64;
5683 __asm__ __volatile__("lock; xaddq %q0, %1\n\t"
5684 : "=r" (u64)
5685 , "=m" (*pu64)
5686 : "0" (~(uint64_t)0)
5687 , "m" (*pu64)
5688 : "memory"
5689 , "cc");
5690 return u64-1;
5691
5692# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5693# if defined(RTASM_ARM64_USE_FEAT_LSE)
5694 uint64_t u64NewRet;
5695 __asm__ __volatile__("Lstart_ASMAtomicDecU64_%=:\n\t"
5696# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5697 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
5698# else
5699 RTASM_ARM_DMB_SY
5700 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
5701# endif
5702 "sub %[uNewRet], %[uNewRet], #1\n\t"
5703 : [pMem] "+Q" (*pu64)
5704 , [uNewRet] "=&r" (u64NewRet)
5705 : [uAddend] "r" (~(uint64_t)0)
5706 : );
5707# else
5708 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicDecU64, pu64, DMB_SY,
5709 "sub %[uNew], %[uNew], #1\n\t"
5710 ,
5711 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
5712 "sbc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
5713 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
5714# endif
5715 return u64NewRet;
5716
5717# else
5718 return ASMAtomicAddU64(pu64, UINT64_MAX) - 1;
5719# endif
5720}
5721#endif
5722
5723
5724/**
5725 * Atomically decrement a signed 64-bit value, ordered.
5726 *
5727 * @returns The new value.
5728 * @param pi64 Pointer to the value to decrement.
5729 *
5730 * @remarks x86: Requires a Pentium or later.
5731 */
5732DECLINLINE(int64_t) ASMAtomicDecS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5733{
5734 return (int64_t)ASMAtomicDecU64((uint64_t volatile RT_FAR *)pi64);
5735}
5736
5737
5738/**
5739 * Atomically decrement a size_t value, ordered.
5740 *
5741 * @returns The new value.
5742 * @param pcb Pointer to the value to decrement.
5743 *
5744 * @remarks x86: Requires a 486 or later.
5745 */
5746DECLINLINE(size_t) ASMAtomicDecZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5747{
5748#if ARCH_BITS == 64
5749 return ASMAtomicDecU64((uint64_t volatile RT_FAR *)pcb);
5750#elif ARCH_BITS == 32
5751 return ASMAtomicDecU32((uint32_t volatile RT_FAR *)pcb);
5752#elif ARCH_BITS == 16
5753 return ASMAtomicDecU16((uint16_t volatile RT_FAR *)pcb);
5754#else
5755# error "Unsupported ARCH_BITS value"
5756#endif
5757}
5758
5759
5760/**
5761 * Atomically Or an unsigned 32-bit value, ordered.
5762 *
5763 * @param pu32 Pointer to the pointer variable to OR u32 with.
5764 * @param u32 The value to OR *pu32 with.
5765 *
5766 * @remarks x86: Requires a 386 or later.
5767 */
5768#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5769RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5770#else
5771DECLINLINE(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5772{
5773# if RT_INLINE_ASM_USES_INTRIN
5774 _InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);
5775
5776# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5777# if RT_INLINE_ASM_GNU_STYLE
5778 __asm__ __volatile__("lock; orl %1, %0\n\t"
5779 : "=m" (*pu32)
5780 : "ir" (u32)
5781 , "m" (*pu32)
5782 : "cc");
5783# else
5784 __asm
5785 {
5786 mov eax, [u32]
5787# ifdef RT_ARCH_AMD64
5788 mov rdx, [pu32]
5789 lock or [rdx], eax
5790# else
5791 mov edx, [pu32]
5792 lock or [edx], eax
5793# endif
5794 }
5795# endif
5796
5797# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5798# if defined(RTASM_ARM64_USE_FEAT_LSE)
5799# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5800 uint32_t u32Spill;
5801 __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
5802 "ldsetal %w[fBitsToSet], %w[uSpill], %[pMem]\n\t"
5803 : [pMem] "+Q" (*pu32)
5804 , [uSpill] "=&r" (u32Spill)
5805 : [fBitsToSet] "r" (u32)
5806 : );
5807# else
5808 __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
5809 RTASM_ARM_DMB_SY
5810 "stset %w[fBitsToSet], %[pMem]\n\t"
5811 : [pMem] "+Q" (*pu32)
5812 : [fBitsToSet] "r" (u32)
5813 : );
5814# endif
5815# else
5816 /* For more on Orr see https://en.wikipedia.org/wiki/Orr_(Catch-22) ;-) */
5817 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicOr32, pu32, DMB_SY,
5818 "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
5819 "orr %[uNew], %[uNew], %[uVal]\n\t",
5820 [uVal] "r" (u32));
5821
5822# endif
5823# else
5824# error "Port me"
5825# endif
5826}
5827#endif
5828
5829
5830/**
5831 * Atomically OR an unsigned 32-bit value, ordered, extended version (for bitmap
5832 * fallback).
5833 *
5834 * @returns Old value.
5835 * @param pu32 Pointer to the variable to OR @a u32 with.
5836 * @param u32 The value to OR @a *pu32 with.
5837 */
5838DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5839{
5840#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo This should work on amd64 as well I think... */
5841 return (uint32_t)_InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);
5842
5843#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5844# if defined(RTASM_ARM64_USE_FEAT_LSE)
5845 uint32_t u32OldRet;
5846 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
5847# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5848 "ldsetal %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5849# else
5850 RTASM_ARM_DMB_SY
5851 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5852# endif
5853 : [pMem] "+Q" (*pu32)
5854 , [uOldRet] "=&r" (u32OldRet)
5855 : [fBitsToSet] "r" (u32)
5856 : );
5857# else
5858 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicOrEx32, pu32, DMB_SY,
5859 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
5860 "orr %[uNew], %[uOld], %[uVal]\n\t",
5861 [uVal] "r" (u32));
5862# endif
5863 return u32OldRet;
5864
5865#else
5866 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5867 uint32_t u32New;
5868 do
5869 u32New = u32RetOld | u32;
5870 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5871 return u32RetOld;
5872#endif
5873}
5874
5875
5876/**
5877 * Atomically Or a signed 32-bit value, ordered.
5878 *
5879 * @param pi32 Pointer to the pointer variable to OR u32 with.
5880 * @param i32 The value to OR *pu32 with.
5881 *
5882 * @remarks x86: Requires a 386 or later.
5883 */
5884DECLINLINE(void) ASMAtomicOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5885{
5886 ASMAtomicOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5887}
5888
5889
5890/**
5891 * Atomically Or an unsigned 64-bit value, ordered.
5892 *
5893 * @param pu64 Pointer to the pointer variable to OR u64 with.
5894 * @param u64 The value to OR *pu64 with.
5895 *
5896 * @remarks x86: Requires a Pentium or later.
5897 */
5898#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5899DECLASM(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5900#else
5901DECLINLINE(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5902{
5903# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5904 _InterlockedOr64((__int64 volatile RT_FAR *)pu64, (__int64)u64);
5905
5906# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5907 __asm__ __volatile__("lock; orq %1, %q0\n\t"
5908 : "=m" (*pu64)
5909 : "r" (u64)
5910 , "m" (*pu64)
5911 : "cc");
5912
5913# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5914# if defined(RTASM_ARM64_USE_FEAT_LSE)
5915# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5916 uint64_t u64Spill;
5917 __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
5918 "ldsetal %[fBitsToSet], %[uSpill], %[pMem]\n\t"
5919 : [pMem] "+Q" (*pu64)
5920 , [uSpill] "=&r" (u64Spill)
5921 : [fBitsToSet] "r" (u64)
5922 : );
5923# else
5924 __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
5925 RTASM_ARM_DMB_SY
5926 "stset %[fBitsToSet], %[pMem]\n\t"
5927 : [pMem] "+Q" (*pu64)
5928 : [fBitsToSet] "r" (u64)
5929 : );
5930# endif
5931# else
5932 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicOrU64, pu64, DMB_SY,
5933 "orr %[uNew], %[uNew], %[uVal]\n\t"
5934 ,
5935 "orr %[uNew], %[uNew], %[uVal]\n\t"
5936 "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
5937 [uVal] "r" (u64));
5938# endif
5939
5940# else
5941 for (;;)
5942 {
5943 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
5944 uint64_t u64New = u64Old | u64;
5945 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5946 break;
5947 ASMNopPause();
5948 }
5949# endif
5950}
5951#endif
5952
5953
5954/**
5955 * Atomically Or a signed 64-bit value, ordered.
5956 *
5957 * @param pi64 Pointer to the pointer variable to OR u64 with.
5958 * @param i64 The value to OR *pu64 with.
5959 *
5960 * @remarks x86: Requires a Pentium or later.
5961 */
5962DECLINLINE(void) ASMAtomicOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5963{
5964 ASMAtomicOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5965}
5966
5967
5968/**
5969 * Atomically And an unsigned 32-bit value, ordered.
5970 *
5971 * @param pu32 Pointer to the pointer variable to AND u32 with.
5972 * @param u32 The value to AND *pu32 with.
5973 *
5974 * @remarks x86: Requires a 386 or later.
5975 */
5976#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5977RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5978#else
5979DECLINLINE(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5980{
5981# if RT_INLINE_ASM_USES_INTRIN
5982 _InterlockedAnd((long volatile RT_FAR *)pu32, u32);
5983
5984# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5985# if RT_INLINE_ASM_GNU_STYLE
5986 __asm__ __volatile__("lock; andl %1, %0\n\t"
5987 : "=m" (*pu32)
5988 : "ir" (u32)
5989 , "m" (*pu32)
5990 : "cc");
5991# else
5992 __asm
5993 {
5994 mov eax, [u32]
5995# ifdef RT_ARCH_AMD64
5996 mov rdx, [pu32]
5997 lock and [rdx], eax
5998# else
5999 mov edx, [pu32]
6000 lock and [edx], eax
6001# endif
6002 }
6003# endif
6004
6005# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6006# if defined(RTASM_ARM64_USE_FEAT_LSE)
6007# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6008 uint32_t u32Spill;
6009 __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
6010 "ldclral %w[fBitsToClear], %w[uSpill], %[pMem]\n\t"
6011 : [pMem] "+Q" (*pu32)
6012 , [uSpill] "=&r" (u32Spill)
6013 : [fBitsToClear] "r" (~u32)
6014 : );
6015# else
6016 __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
6017 RTASM_ARM_DMB_SY
6018 "stclr %w[fBitsToClear], %[pMem]\n\t"
6019 : [pMem] "+Q" (*pu32)
6020 : [fBitsToClear] "r" (~u32)
6021 : );
6022# endif
6023# else
6024 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicAnd32, pu32, DMB_SY,
6025 "and %w[uNew], %w[uNew], %w[uVal]\n\t",
6026 "and %[uNew], %[uNew], %[uVal]\n\t",
6027 [uVal] "r" (u32));
6028
6029# endif
6030# else
6031# error "Port me"
6032# endif
6033}
6034#endif
6035
6036
6037/**
6038 * Atomically AND an unsigned 32-bit value, ordered, extended version.
6039 *
6040 * @returns Old value.
6041 * @param pu32 Pointer to the variable to AND @a u32 with.
6042 * @param u32 The value to AND @a *pu32 with.
6043 */
6044DECLINLINE(uint32_t) ASMAtomicAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6045{
6046#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo This should work on amd64 as well I think... */
6047 return (uint32_t)_InterlockedAnd((long volatile RT_FAR *)pu32, (long)u32);
6048
6049#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6050# if defined(RTASM_ARM64_USE_FEAT_LSE)
6051 uint32_t u32OldRet;
6052 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
6053# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6054 "ldclral %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6055# else
6056 RTASM_ARM_DMB_SY
6057 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6058# endif
6059 : [pMem] "+Q" (*pu32)
6060 , [uOldRet] "=&r" (u32OldRet)
6061 : [fBitsToClear] "r" (~u32)
6062 : );
6063# else
6064 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAndEx32, pu32, DMB_SY,
6065 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
6066 "and %[uNew], %[uOld], %[uVal]\n\t",
6067 [uVal] "r" (u32));
6068# endif
6069 return u32OldRet;
6070
6071#else
6072 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
6073 uint32_t u32New;
6074 do
6075 u32New = u32RetOld & u32;
6076 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
6077 return u32RetOld;
6078#endif
6079}
6080
6081
6082/**
6083 * Atomically And a signed 32-bit value, ordered.
6084 *
6085 * @param pi32 Pointer to the pointer variable to AND i32 with.
6086 * @param i32 The value to AND *pi32 with.
6087 *
6088 * @remarks x86: Requires a 386 or later.
6089 */
6090DECLINLINE(void) ASMAtomicAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6091{
6092 ASMAtomicAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6093}
6094
6095
6096/**
6097 * Atomically And an unsigned 64-bit value, ordered.
6098 *
6099 * @param pu64 Pointer to the pointer variable to AND u64 with.
6100 * @param u64 The value to AND *pu64 with.
6101 *
6102 * @remarks x86: Requires a Pentium or later.
6103 */
6104#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6105DECLASM(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6106#else
6107DECLINLINE(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6108{
6109# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
6110 _InterlockedAnd64((__int64 volatile RT_FAR *)pu64, u64);
6111
6112# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6113 __asm__ __volatile__("lock; andq %1, %0\n\t"
6114 : "=m" (*pu64)
6115 : "r" (u64)
6116 , "m" (*pu64)
6117 : "cc");
6118
6119# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6120# if defined(RTASM_ARM64_USE_FEAT_LSE)
6121# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6122 uint64_t u64Spill;
6123 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
6124 "ldclral %[fBitsToClear], %[uSpill], %[pMem]\n\t"
6125 : [pMem] "+Q" (*pu64)
6126 , [uSpill] "=&r" (u64Spill)
6127 : [fBitsToClear] "r" (~u64)
6128 : );
6129# else
6130 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
6131 RTASM_ARM_DMB_SY
6132 "stclr %[fBitsToClear], %[pMem]\n\t"
6133 : [pMem] "+Q" (*pu64)
6134 : [fBitsToClear] "r" (~u64)
6135 : );
6136# endif
6137# else
6138 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicAndU64, pu64, DMB_SY,
6139 "and %[uNew], %[uNew], %[uVal]\n\t"
6140 ,
6141 "and %[uNew], %[uNew], %[uVal]\n\t"
6142 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
6143 [uVal] "r" (u64));
6144# endif
6145
6146# else
6147 for (;;)
6148 {
6149 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6150 uint64_t u64New = u64Old & u64;
6151 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6152 break;
6153 ASMNopPause();
6154 }
6155# endif
6156}
6157#endif
6158
6159
6160/**
6161 * Atomically And a signed 64-bit value, ordered.
6162 *
6163 * @param pi64 Pointer to the pointer variable to AND i64 with.
6164 * @param i64 The value to AND *pi64 with.
6165 *
6166 * @remarks x86: Requires a Pentium or later.
6167 */
6168DECLINLINE(void) ASMAtomicAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6169{
6170 ASMAtomicAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6171}
6172
6173
6174/**
6175 * Atomically XOR an unsigned 32-bit value and a memory location, ordered.
6176 *
6177 * @param pu32 Pointer to the variable to XOR @a u32 with.
6178 * @param u32 The value to XOR @a *pu32 with.
6179 *
6180 * @remarks x86: Requires a 386 or later.
6181 */
6182#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6183RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6184#else
6185DECLINLINE(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6186{
6187# if RT_INLINE_ASM_USES_INTRIN
6188 _InterlockedXor((long volatile RT_FAR *)pu32, u32);
6189
6190# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6191# if RT_INLINE_ASM_GNU_STYLE
6192 __asm__ __volatile__("lock; xorl %1, %0\n\t"
6193 : "=m" (*pu32)
6194 : "ir" (u32)
6195 , "m" (*pu32)
6196 : "cc");
6197# else
6198 __asm
6199 {
6200 mov eax, [u32]
6201# ifdef RT_ARCH_AMD64
6202 mov rdx, [pu32]
6203 lock xor [rdx], eax
6204# else
6205 mov edx, [pu32]
6206 lock xor [edx], eax
6207# endif
6208 }
6209# endif
6210
6211# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6212# if defined(RTASM_ARM64_USE_FEAT_LSE)
6213# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6214 uint32_t u32Spill;
6215 __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
6216 "ldeoral %w[fBitMask], %w[uSpill], %[pMem]\n\t"
6217 : [pMem] "+Q" (*pu32)
6218 , [uSpill] "=&r" (u32Spill)
6219 : [fBitMask] "r" (u32)
6220 : );
6221# else
6222 __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
6223 RTASM_ARM_DMB_SY
6224 "steor %w[fBitMask], %[pMem]\n\t"
6225 : [pMem] "+Q" (*pu32)
6226 : [fBitMask] "r" (u32)
6227 : );
6228# endif
6229# else
6230 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicXor32, pu32, DMB_SY,
6231 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
6232 "eor %[uNew], %[uNew], %[uVal]\n\t",
6233 [uVal] "r" (u32));
6234# endif
6235
6236# else
6237# error "Port me"
6238# endif
6239}
6240#endif
6241
6242
6243/**
6244 * Atomically XOR an unsigned 32-bit value and a memory location, ordered,
6245 * extended version (for bitmaps).
6246 *
6247 * @returns Old value.
6248 * @param pu32 Pointer to the variable to XOR @a u32 with.
6249 * @param u32 The value to XOR @a *pu32 with.
6250 */
6251DECLINLINE(uint32_t) ASMAtomicXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6252{
6253# if RT_INLINE_ASM_USES_INTRIN
6254 return (uint32_t)_InterlockedXor((long volatile RT_FAR *)pu32, u32);
6255
6256#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6257# if defined(RTASM_ARM64_USE_FEAT_LSE)
6258 uint32_t u32OldRet;
6259 __asm__ __volatile__("Lstart_ASMAtomicXorExU32_%=:\n\t"
6260# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6261 "ldeoral %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6262# else
6263 RTASM_ARM_DMB_SY
6264 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6265# endif
6266 : [pMem] "+Q" (*pu32)
6267 , [uOldRet] "=&r" (u32OldRet)
6268 : [fBitMask] "r" (u32)
6269 : );
6270# else
6271 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicXorEx32, pu32, DMB_SY,
6272 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6273 "eor %[uNew], %[uOld], %[uVal]\n\t",
6274 [uVal] "r" (u32));
6275# endif
6276 return u32OldRet;
6277
6278#else
6279 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
6280 uint32_t u32New;
6281 do
6282 u32New = u32RetOld ^ u32;
6283 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
6284 return u32RetOld;
6285#endif
6286}
6287
6288
6289/**
6290 * Atomically XOR a signed 32-bit value, ordered.
6291 *
6292 * @param pi32 Pointer to the variable to XOR i32 with.
6293 * @param i32 The value to XOR *pi32 with.
6294 *
6295 * @remarks x86: Requires a 386 or later.
6296 */
6297DECLINLINE(void) ASMAtomicXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6298{
6299 ASMAtomicXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6300}
6301
6302
6303/**
6304 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe.
6305 *
6306 * @param pu32 Pointer to the pointer variable to OR u32 with.
6307 * @param u32 The value to OR *pu32 with.
6308 *
6309 * @remarks x86: Requires a 386 or later.
6310 */
6311#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6312RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6313#else
6314DECLINLINE(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6315{
6316# if RT_INLINE_ASM_USES_INTRIN /** @todo This is too much... */
6317 _InterlockedOr((long volatile RT_FAR *)pu32, u32);
6318
6319# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6320# if RT_INLINE_ASM_GNU_STYLE
6321 __asm__ __volatile__("orl %1, %0\n\t"
6322 : "=m" (*pu32)
6323 : "ir" (u32)
6324 , "m" (*pu32)
6325 : "cc");
6326# else
6327 __asm
6328 {
6329 mov eax, [u32]
6330# ifdef RT_ARCH_AMD64
6331 mov rdx, [pu32]
6332 or [rdx], eax
6333# else
6334 mov edx, [pu32]
6335 or [edx], eax
6336# endif
6337 }
6338# endif
6339
6340# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6341 /* M1 benchmark: stset=1974 vs non-lse=6271 */
6342# if defined(RTASM_ARM64_USE_FEAT_LSE)
6343 __asm__ __volatile__("Lstart_ASMAtomicUoOrU32_%=:\n\t"
6344 "stset %w[fBitsToSet], %[pMem]\n\t"
6345 : [pMem] "+Q" (*pu32)
6346 : [fBitsToSet] "r" (u32)
6347 : );
6348# else
6349 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoOrU32, pu32, NO_BARRIER,
6350 "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
6351 "orr %[uNew], %[uNew], %[uVal]\n\t",
6352 [uVal] "r" (u32));
6353# endif
6354
6355# else
6356# error "Port me"
6357# endif
6358}
6359#endif
6360
6361
6362/**
6363 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe,
6364 * extended version (for bitmap fallback).
6365 *
6366 * @returns Old value.
6367 * @param pu32 Pointer to the variable to OR @a u32 with.
6368 * @param u32 The value to OR @a *pu32 with.
6369 */
6370DECLINLINE(uint32_t) ASMAtomicUoOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6371{
6372#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6373# if RT_INLINE_ASM_USES_INTRIN /** @todo Check what the compiler generates... */
6374 return (uint32_t)_InterlockedOr_nf((long volatile RT_FAR *)pu32, u32);
6375# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6376 uint32_t u32OldRet;
6377 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
6378 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
6379 : [pMem] "+Q" (*pu32)
6380 , [uOldRet] "=&r" (u32OldRet)
6381 : [fBitsToSet] "r" (u32)
6382 : );
6383 return u32OldRet;
6384# else
6385 uint32_t u32OldRet;
6386 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoOrExU32, pu32, NO_BARRIER,
6387 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
6388 "orr %[uNew], %[uOld], %[uVal]\n\t",
6389 [uVal] "r" (u32));
6390 return u32OldRet;
6391# endif
6392#else
6393 return ASMAtomicOrExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6394#endif
6395}
6396
6397
6398/**
6399 * Atomically OR a signed 32-bit value, unordered.
6400 *
6401 * @param pi32 Pointer to the pointer variable to OR u32 with.
6402 * @param i32 The value to OR *pu32 with.
6403 *
6404 * @remarks x86: Requires a 386 or later.
6405 */
6406DECLINLINE(void) ASMAtomicUoOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6407{
6408 ASMAtomicUoOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6409}
6410
6411
6412/**
6413 * Atomically OR an unsigned 64-bit value, unordered.
6414 *
6415 * @param pu64 Pointer to the pointer variable to OR u64 with.
6416 * @param u64 The value to OR *pu64 with.
6417 *
6418 * @remarks x86: Requires a Pentium or later.
6419 */
6420#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6421DECLASM(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6422#else
6423DECLINLINE(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6424{
6425# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
6426 _InterlockedOr64_nf((volatile int64_t *)pu64, (int64_t)u64);
6427
6428# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6429 __asm__ __volatile__("orq %1, %q0\n\t"
6430 : "=m" (*pu64)
6431 : "r" (u64)
6432 , "m" (*pu64)
6433 : "cc");
6434
6435# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6436# if defined(RTASM_ARM64_USE_FEAT_LSE)
6437 __asm__ __volatile__("Lstart_ASMAtomicUoOrU64_%=:\n\t"
6438 "stset %[fBitsToSet], %[pMem]\n\t"
6439 : [pMem] "+Q" (*pu64)
6440 : [fBitsToSet] "r" (u64)
6441 : );
6442# else
6443 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoOrU64, pu64, NO_BARRIER,
6444 "orr %[uNew], %[uNew], %[uVal]\n\t"
6445 ,
6446 "orr %[uNew], %[uNew], %[uVal]\n\t"
6447 "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
6448 [uVal] "r" (u64));
6449# endif
6450
6451# else
6452 for (;;)
6453 {
6454 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6455 uint64_t u64New = u64Old | u64;
6456 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6457 break;
6458 ASMNopPause();
6459 }
6460# endif
6461}
6462#endif
6463
6464
6465/**
6466 * Atomically Or a signed 64-bit value, unordered.
6467 *
6468 * @param pi64 Pointer to the pointer variable to OR u64 with.
6469 * @param i64 The value to OR *pu64 with.
6470 *
6471 * @remarks x86: Requires a Pentium or later.
6472 */
6473DECLINLINE(void) ASMAtomicUoOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6474{
6475 ASMAtomicUoOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6476}
6477
6478
6479/**
6480 * Atomically And an unsigned 32-bit value, unordered.
6481 *
6482 * @param pu32 Pointer to the pointer variable to AND u32 with.
6483 * @param u32 The value to AND *pu32 with.
6484 *
6485 * @remarks x86: Requires a 386 or later.
6486 */
6487#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6488RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6489#else
6490DECLINLINE(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6491{
6492# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
6493 _InterlockedAnd_nf((volatile long *)pu32, (long)u32);
6494
6495# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6496# if RT_INLINE_ASM_GNU_STYLE
6497 __asm__ __volatile__("andl %1, %0\n\t"
6498 : "=m" (*pu32)
6499 : "ir" (u32)
6500 , "m" (*pu32)
6501 : "cc");
6502# else
6503 __asm
6504 {
6505 mov eax, [u32]
6506# ifdef RT_ARCH_AMD64
6507 mov rdx, [pu32]
6508 and [rdx], eax
6509# else
6510 mov edx, [pu32]
6511 and [edx], eax
6512# endif
6513 }
6514# endif
6515
6516# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6517 /* M1 benchmark: stclr=1884 vs non-lse=6299 (ps/call) */
6518# if defined(RTASM_ARM64_USE_FEAT_LSE)
6519 __asm__ __volatile__("Lstart_ASMAtomicUoAndU32_%=:\n\t"
6520 "stclr %w[fBitsToClear], %[pMem]\n\t"
6521 : [pMem] "+Q" (*pu32)
6522 : [fBitsToClear] "r" (~u32)
6523 : );
6524# else
6525 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoAnd32, pu32, NO_BARRIER,
6526 "and %w[uNew], %w[uNew], %w[uVal]\n\t",
6527 "and %[uNew], %[uNew], %[uVal]\n\t",
6528 [uVal] "r" (u32));
6529# endif
6530
6531# else
6532# error "Port me"
6533# endif
6534}
6535#endif
6536
6537
6538/**
6539 * Atomically AND an unsigned 32-bit value, unordered, extended version (for
6540 * bitmap fallback).
6541 *
6542 * @returns Old value.
6543 * @param pu32 Pointer to the pointer to AND @a u32 with.
6544 * @param u32 The value to AND @a *pu32 with.
6545 */
6546DECLINLINE(uint32_t) ASMAtomicUoAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6547{
6548#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
6549 return (uint32_t)_InterlockedAnd_nf((volatile long *)pu32, (long)u32);
6550
6551#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6552# if defined(RTASM_ARM64_USE_FEAT_LSE)
6553 uint32_t u32OldRet;
6554 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
6555 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6556 : [pMem] "+Q" (*pu32)
6557 , [uOldRet] "=&r" (u32OldRet)
6558 : [fBitsToClear] "r" (~u32)
6559 : );
6560# else
6561 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoAndEx32, pu32, NO_BARRIER,
6562 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
6563 "and %[uNew], %[uOld], %[uVal]\n\t",
6564 [uVal] "r" (u32));
6565# endif
6566 return u32OldRet;
6567
6568#else
6569 return ASMAtomicAndExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6570#endif
6571}
6572
6573
6574/**
6575 * Atomically And a signed 32-bit value, unordered.
6576 *
6577 * @param pi32 Pointer to the pointer variable to AND i32 with.
6578 * @param i32 The value to AND *pi32 with.
6579 *
6580 * @remarks x86: Requires a 386 or later.
6581 */
6582DECLINLINE(void) ASMAtomicUoAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6583{
6584 ASMAtomicUoAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6585}
6586
6587
6588/**
6589 * Atomically And an unsigned 64-bit value, unordered.
6590 *
6591 * @param pu64 Pointer to the pointer variable to AND u64 with.
6592 * @param u64 The value to AND *pu64 with.
6593 *
6594 * @remarks x86: Requires a Pentium or later.
6595 */
6596#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6597DECLASM(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6598#else
6599DECLINLINE(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6600{
6601# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
6602 _InterlockedAnd64_nf((volatile int64_t *)pu64, (int64_t)u64);
6603
6604# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6605 __asm__ __volatile__("andq %1, %0\n\t"
6606 : "=m" (*pu64)
6607 : "r" (u64)
6608 , "m" (*pu64)
6609 : "cc");
6610
6611# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6612# if defined(RTASM_ARM64_USE_FEAT_LSE)
6613 __asm__ __volatile__("Lstart_ASMAtomicUoAndU64_%=:\n\t"
6614 "stclr %[fBitsToClear], %[pMem]\n\t"
6615 : [pMem] "+Q" (*pu64)
6616 : [fBitsToClear] "r" (~u64)
6617 : );
6618# else
6619 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoAndU64, pu64, NO_BARRIER,
6620 "and %[uNew], %[uNew], %[uVal]\n\t"
6621 ,
6622 "and %[uNew], %[uNew], %[uVal]\n\t"
6623 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
6624 [uVal] "r" (u64));
6625# endif
6626
6627# else
6628 for (;;)
6629 {
6630 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6631 uint64_t u64New = u64Old & u64;
6632 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6633 break;
6634 ASMNopPause();
6635 }
6636# endif
6637}
6638#endif
6639
6640
6641/**
6642 * Atomically And a signed 64-bit value, unordered.
6643 *
6644 * @param pi64 Pointer to the pointer variable to AND i64 with.
6645 * @param i64 The value to AND *pi64 with.
6646 *
6647 * @remarks x86: Requires a Pentium or later.
6648 */
6649DECLINLINE(void) ASMAtomicUoAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6650{
6651 ASMAtomicUoAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6652}
6653
6654
6655/**
6656 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe.
6657 *
6658 * @param pu32 Pointer to the variable to XOR @a u32 with.
6659 * @param u32 The value to OR @a *pu32 with.
6660 *
6661 * @remarks x86: Requires a 386 or later.
6662 */
6663#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6664RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6665#else
6666DECLINLINE(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6667{
6668# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
6669 _InterlockedXor_nf((volatile long *)pu32, (long)u32);
6670
6671# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6672# if RT_INLINE_ASM_GNU_STYLE
6673 __asm__ __volatile__("xorl %1, %0\n\t"
6674 : "=m" (*pu32)
6675 : "ir" (u32)
6676 , "m" (*pu32)
6677 : "cc");
6678# else
6679 __asm
6680 {
6681 mov eax, [u32]
6682# ifdef RT_ARCH_AMD64
6683 mov rdx, [pu32]
6684 xor [rdx], eax
6685# else
6686 mov edx, [pu32]
6687 xor [edx], eax
6688# endif
6689 }
6690# endif
6691
6692# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6693# if defined(RTASM_ARM64_USE_FEAT_LSE)
6694 __asm__ __volatile__("Lstart_ASMAtomicUoXorU32_%=:\n\t"
6695 "steor %w[fBitMask], %[pMem]\n\t"
6696 : [pMem] "+Q" (*pu32)
6697 : [fBitMask] "r" (u32)
6698 : );
6699# else
6700 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoXorU32, pu32, NO_BARRIER,
6701 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
6702 "eor %[uNew], %[uNew], %[uVal]\n\t",
6703 [uVal] "r" (u32));
6704# endif
6705
6706# else
6707# error "Port me"
6708# endif
6709}
6710#endif
6711
6712
6713/**
6714 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe,
6715 * extended version (for bitmap fallback).
6716 *
6717 * @returns Old value.
6718 * @param pu32 Pointer to the variable to XOR @a u32 with.
6719 * @param u32 The value to OR @a *pu32 with.
6720 */
6721DECLINLINE(uint32_t) ASMAtomicUoXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6722{
6723#if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
6724 return (uint32_t)_InterlockedXor_nf((volatile long *)pu32, (long)u32);
6725
6726#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6727# if defined(RTASM_ARM64_USE_FEAT_LSE)
6728 uint32_t u32OldRet;
6729 __asm__ __volatile__("Lstart_ASMAtomicUoXorExU32_%=:\n\t"
6730 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6731 : [pMem] "+Q" (*pu32)
6732 , [uOldRet] "=&r" (u32OldRet)
6733 : [fBitMask] "r" (u32)
6734 : );
6735# else
6736 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoXorExU32, pu32, NO_BARRIER,
6737 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6738 "eor %[uNew], %[uOld], %[uVal]\n\t",
6739 [uVal] "r" (u32));
6740# endif
6741 return u32OldRet;
6742
6743#else
6744 return ASMAtomicXorExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6745#endif
6746}
6747
6748
6749/**
6750 * Atomically XOR a signed 32-bit value, unordered.
6751 *
6752 * @param pi32 Pointer to the variable to XOR @a u32 with.
6753 * @param i32 The value to XOR @a *pu32 with.
6754 *
6755 * @remarks x86: Requires a 386 or later.
6756 */
6757DECLINLINE(void) ASMAtomicUoXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6758{
6759 ASMAtomicUoXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6760}
6761
6762
6763/**
6764 * Atomically increment an unsigned 32-bit value, unordered.
6765 *
6766 * @returns the new value.
6767 * @param pu32 Pointer to the variable to increment.
6768 *
6769 * @remarks x86: Requires a 486 or later.
6770 */
6771#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6772RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6773#else
6774DECLINLINE(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6775{
6776# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
6777 return _InterlockedIncrement_nf((volatile long *)pu32);
6778
6779# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6780 uint32_t u32;
6781# if RT_INLINE_ASM_GNU_STYLE
6782 __asm__ __volatile__("xaddl %0, %1\n\t"
6783 : "=r" (u32)
6784 , "=m" (*pu32)
6785 : "0" (1)
6786 , "m" (*pu32)
6787 : "memory" /** @todo why 'memory'? */
6788 , "cc");
6789 return u32 + 1;
6790# else
6791 __asm
6792 {
6793 mov eax, 1
6794# ifdef RT_ARCH_AMD64
6795 mov rdx, [pu32]
6796 xadd [rdx], eax
6797# else
6798 mov edx, [pu32]
6799 xadd [edx], eax
6800# endif
6801 mov u32, eax
6802 }
6803 return u32 + 1;
6804# endif
6805
6806# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6807 /* M1 benchmark: ldadd=2031 vs non-lse=6301 (ps/call) */
6808# if defined(RTASM_ARM64_USE_FEAT_LSE)
6809 uint32_t u32NewRet;
6810 __asm__ __volatile__("Lstart_ASMAtomicUoIncU32_%=:\n\t"
6811 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6812 "add %w[uNewRet], %w[uNewRet], #1\n\t"
6813 : [pMem] "+Q" (*pu32)
6814 , [uNewRet] "=&r" (u32NewRet)
6815 : [uAddend] "r" ((uint32_t)1)
6816 : );
6817# else
6818 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoIncU32, pu32, NO_BARRIER,
6819 "add %w[uNew], %w[uNew], #1\n\t",
6820 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6821 "X" (0) /* dummy */);
6822# endif
6823 return u32NewRet;
6824
6825# else
6826# error "Port me"
6827# endif
6828}
6829#endif
6830
6831
6832/**
6833 * Atomically decrement an unsigned 32-bit value, unordered.
6834 *
6835 * @returns the new value.
6836 * @param pu32 Pointer to the variable to decrement.
6837 *
6838 * @remarks x86: Requires a 486 or later.
6839 */
6840#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6841RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6842#else
6843DECLINLINE(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6844{
6845# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_ARM64) /** @todo Check what the compiler generates... */
6846 return _InterlockedDecrement_nf((volatile long *)pu32);
6847
6848# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6849 uint32_t u32;
6850# if RT_INLINE_ASM_GNU_STYLE
6851 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
6852 : "=r" (u32)
6853 , "=m" (*pu32)
6854 : "0" (-1)
6855 , "m" (*pu32)
6856 : "memory"
6857 , "cc");
6858 return u32 - 1;
6859# else
6860 __asm
6861 {
6862 mov eax, -1
6863# ifdef RT_ARCH_AMD64
6864 mov rdx, [pu32]
6865 xadd [rdx], eax
6866# else
6867 mov edx, [pu32]
6868 xadd [edx], eax
6869# endif
6870 mov u32, eax
6871 }
6872 return u32 - 1;
6873# endif
6874
6875# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6876 /* M1 benchmark: ldadd=2101 vs non-lse=6268 (ps/call) */
6877# if defined(RTASM_ARM64_USE_FEAT_LSE)
6878 uint32_t u32NewRet;
6879 __asm__ __volatile__("Lstart_ASMAtomicUoDecU32_%=:\n\t"
6880 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6881 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
6882 : [pMem] "+Q" (*pu32)
6883 , [uNewRet] "=&r" (u32NewRet)
6884 : [uAddend] "r" (~(uint32_t)0)
6885 : );
6886# else
6887 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoDecU32, pu32, NO_BARRIER,
6888 "sub %w[uNew], %w[uNew], #1\n\t",
6889 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6890 "X" (0) /* dummy */);
6891# endif
6892 return u32NewRet;
6893
6894# else
6895# error "Port me"
6896# endif
6897}
6898#endif
6899
6900/** @todo Move ASMByteSwapU16, ASMByteSwapU32 and ASMByteSwapU64 in their own
6901 * header as it's a common reason for including asm.h. */
6902
6903
6904/**
6905 * Reverse the byte order of the given 16-bit integer.
6906 *
6907 * @returns Revert
6908 * @param u16 16-bit integer value.
6909 */
6910#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6911RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_PROTO;
6912#else
6913DECLINLINE(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_DEF
6914{
6915# if RT_INLINE_ASM_USES_INTRIN
6916 return _byteswap_ushort(u16);
6917
6918# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6919# if RT_INLINE_ASM_GNU_STYLE
6920 __asm__ ("rorw $8, %0" : "=r" (u16) : "0" (u16) : "cc");
6921# else
6922 _asm
6923 {
6924 mov ax, [u16]
6925 ror ax, 8
6926 mov [u16], ax
6927 }
6928# endif
6929 return u16;
6930
6931# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6932 uint32_t u32Ret;
6933 __asm__ __volatile__(
6934# if defined(RT_ARCH_ARM64)
6935 "rev16 %w[uRet], %w[uVal]\n\t"
6936# else
6937 "rev16 %[uRet], %[uVal]\n\t"
6938# endif
6939 : [uRet] "=r" (u32Ret)
6940 : [uVal] "r" (u16));
6941 return (uint16_t)u32Ret;
6942
6943# else
6944# error "Port me"
6945# endif
6946}
6947#endif
6948
6949
6950/**
6951 * Reverse the byte order of the given 32-bit integer.
6952 *
6953 * @returns Revert
6954 * @param u32 32-bit integer value.
6955 */
6956#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6957RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_PROTO;
6958#else
6959DECLINLINE(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_DEF
6960{
6961# if RT_INLINE_ASM_USES_INTRIN
6962 return _byteswap_ulong(u32);
6963
6964# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6965# if RT_INLINE_ASM_GNU_STYLE
6966 __asm__ ("bswapl %0" : "=r" (u32) : "0" (u32));
6967# else
6968 _asm
6969 {
6970 mov eax, [u32]
6971 bswap eax
6972 mov [u32], eax
6973 }
6974# endif
6975 return u32;
6976
6977# elif defined(RT_ARCH_ARM64)
6978 uint64_t u64Ret;
6979 __asm__ __volatile__("rev32 %[uRet], %[uVal]\n\t"
6980 : [uRet] "=r" (u64Ret)
6981 : [uVal] "r" ((uint64_t)u32));
6982 return (uint32_t)u64Ret;
6983
6984# elif defined(RT_ARCH_ARM32)
6985 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
6986 : [uRet] "=r" (u32)
6987 : [uVal] "[uRet]" (u32));
6988 return u32;
6989
6990# else
6991# error "Port me"
6992# endif
6993}
6994#endif
6995
6996
6997/**
6998 * Reverse the byte order of the given 64-bit integer.
6999 *
7000 * @returns Revert
7001 * @param u64 64-bit integer value.
7002 */
7003DECLINLINE(uint64_t) ASMByteSwapU64(uint64_t u64) RT_NOTHROW_DEF
7004{
7005#if (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)) && RT_INLINE_ASM_USES_INTRIN
7006 return _byteswap_uint64(u64);
7007
7008# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
7009 __asm__ ("bswapq %0" : "=r" (u64) : "0" (u64));
7010 return u64;
7011
7012# elif defined(RT_ARCH_ARM64)
7013 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
7014 : [uRet] "=r" (u64)
7015 : [uVal] "[uRet]" (u64));
7016 return u64;
7017
7018#else
7019 return (uint64_t)ASMByteSwapU32((uint32_t)u64) << 32
7020 | (uint64_t)ASMByteSwapU32((uint32_t)(u64 >> 32));
7021#endif
7022}
7023
7024
7025
7026/** @defgroup grp_inline_bits Bitmap Operations
7027 *
7028 * @todo Move these into a separate header, with standard IPRT prefix
7029 * (RTBitmapXxx). Move the more complex (searched) stuff into C source
7030 * files.
7031 *
7032 * @{
7033 */
7034
7035
7036/**
7037 * Sets a bit in a bitmap.
7038 *
7039 * @param pvBitmap Pointer to the bitmap (little endian). This should be
7040 * 32-bit aligned.
7041 * @param iBit The bit to set.
7042 *
7043 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7044 * However, doing so will yield better performance as well as avoiding
7045 * traps accessing the last bits in the bitmap.
7046 */
7047#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7048RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7049#else
7050DECLINLINE(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7051{
7052# if RT_INLINE_ASM_USES_INTRIN
7053 _bittestandset((long RT_FAR *)pvBitmap, iBit);
7054
7055# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7056# if RT_INLINE_ASM_GNU_STYLE
7057 __asm__ __volatile__("btsl %1, %0"
7058 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7059 : "Ir" (iBit)
7060 , "m" (*(volatile long RT_FAR *)pvBitmap)
7061 : "memory"
7062 , "cc");
7063# else
7064 __asm
7065 {
7066# ifdef RT_ARCH_AMD64
7067 mov rax, [pvBitmap]
7068 mov edx, [iBit]
7069 bts [rax], edx
7070# else
7071 mov eax, [pvBitmap]
7072 mov edx, [iBit]
7073 bts [eax], edx
7074# endif
7075 }
7076# endif
7077
7078# else
7079 int32_t offBitmap = iBit / 32;
7080 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7081 ASMAtomicUoOrU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7082# endif
7083}
7084#endif
7085
7086
7087/**
7088 * Atomically sets a bit in a bitmap, ordered.
7089 *
7090 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7091 * aligned, otherwise the memory access isn't atomic!
7092 * @param iBit The bit to set.
7093 *
7094 * @remarks x86: Requires a 386 or later.
7095 */
7096#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7097RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7098#else
7099DECLINLINE(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7100{
7101 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7102# if RT_INLINE_ASM_USES_INTRIN
7103 _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
7104# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7105# if RT_INLINE_ASM_GNU_STYLE
7106 __asm__ __volatile__("lock; btsl %1, %0"
7107 : "=m" (*(volatile long *)pvBitmap)
7108 : "Ir" (iBit)
7109 , "m" (*(volatile long *)pvBitmap)
7110 : "memory"
7111 , "cc");
7112# else
7113 __asm
7114 {
7115# ifdef RT_ARCH_AMD64
7116 mov rax, [pvBitmap]
7117 mov edx, [iBit]
7118 lock bts [rax], edx
7119# else
7120 mov eax, [pvBitmap]
7121 mov edx, [iBit]
7122 lock bts [eax], edx
7123# endif
7124 }
7125# endif
7126
7127# else
7128 ASMAtomicOrU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7129# endif
7130}
7131#endif
7132
7133
7134/**
7135 * Clears a bit in a bitmap.
7136 *
7137 * @param pvBitmap Pointer to the bitmap (little endian).
7138 * @param iBit The bit to clear.
7139 *
7140 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7141 * However, doing so will yield better performance as well as avoiding
7142 * traps accessing the last bits in the bitmap.
7143 */
7144#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7145RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7146#else
7147DECLINLINE(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7148{
7149# if RT_INLINE_ASM_USES_INTRIN
7150 _bittestandreset((long RT_FAR *)pvBitmap, iBit);
7151
7152# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7153# if RT_INLINE_ASM_GNU_STYLE
7154 __asm__ __volatile__("btrl %1, %0"
7155 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7156 : "Ir" (iBit)
7157 , "m" (*(volatile long RT_FAR *)pvBitmap)
7158 : "memory"
7159 , "cc");
7160# else
7161 __asm
7162 {
7163# ifdef RT_ARCH_AMD64
7164 mov rax, [pvBitmap]
7165 mov edx, [iBit]
7166 btr [rax], edx
7167# else
7168 mov eax, [pvBitmap]
7169 mov edx, [iBit]
7170 btr [eax], edx
7171# endif
7172 }
7173# endif
7174
7175# else
7176 int32_t offBitmap = iBit / 32;
7177 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7178 ASMAtomicUoAndU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
7179# endif
7180}
7181#endif
7182
7183
7184/**
7185 * Atomically clears a bit in a bitmap, ordered.
7186 *
7187 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7188 * aligned, otherwise the memory access isn't atomic!
7189 * @param iBit The bit to toggle set.
7190 *
7191 * @remarks No memory barrier, take care on smp.
7192 * @remarks x86: Requires a 386 or later.
7193 */
7194#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7195RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7196#else
7197DECLINLINE(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7198{
7199 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7200# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7201# if RT_INLINE_ASM_GNU_STYLE
7202 __asm__ __volatile__("lock; btrl %1, %0"
7203 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7204 : "Ir" (iBit)
7205 , "m" (*(volatile long RT_FAR *)pvBitmap)
7206 : "memory"
7207 , "cc");
7208# else
7209 __asm
7210 {
7211# ifdef RT_ARCH_AMD64
7212 mov rax, [pvBitmap]
7213 mov edx, [iBit]
7214 lock btr [rax], edx
7215# else
7216 mov eax, [pvBitmap]
7217 mov edx, [iBit]
7218 lock btr [eax], edx
7219# endif
7220 }
7221# endif
7222# else
7223 ASMAtomicAndU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
7224# endif
7225}
7226#endif
7227
7228
7229/**
7230 * Toggles a bit in a bitmap.
7231 *
7232 * @param pvBitmap Pointer to the bitmap (little endian).
7233 * @param iBit The bit to toggle.
7234 *
7235 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7236 * However, doing so will yield better performance as well as avoiding
7237 * traps accessing the last bits in the bitmap.
7238 */
7239#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7240RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7241#else
7242DECLINLINE(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7243{
7244# if RT_INLINE_ASM_USES_INTRIN
7245 _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
7246# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7247# if RT_INLINE_ASM_GNU_STYLE
7248 __asm__ __volatile__("btcl %1, %0"
7249 : "=m" (*(volatile long *)pvBitmap)
7250 : "Ir" (iBit)
7251 , "m" (*(volatile long *)pvBitmap)
7252 : "memory"
7253 , "cc");
7254# else
7255 __asm
7256 {
7257# ifdef RT_ARCH_AMD64
7258 mov rax, [pvBitmap]
7259 mov edx, [iBit]
7260 btc [rax], edx
7261# else
7262 mov eax, [pvBitmap]
7263 mov edx, [iBit]
7264 btc [eax], edx
7265# endif
7266 }
7267# endif
7268# else
7269 int32_t offBitmap = iBit / 32;
7270 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7271 ASMAtomicUoXorU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7272# endif
7273}
7274#endif
7275
7276
7277/**
7278 * Atomically toggles a bit in a bitmap, ordered.
7279 *
7280 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7281 * aligned, otherwise the memory access isn't atomic!
7282 * @param iBit The bit to test and set.
7283 *
7284 * @remarks x86: Requires a 386 or later.
7285 */
7286#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7287RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7288#else
7289DECLINLINE(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7290{
7291 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7292# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7293# if RT_INLINE_ASM_GNU_STYLE
7294 __asm__ __volatile__("lock; btcl %1, %0"
7295 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7296 : "Ir" (iBit)
7297 , "m" (*(volatile long RT_FAR *)pvBitmap)
7298 : "memory"
7299 , "cc");
7300# else
7301 __asm
7302 {
7303# ifdef RT_ARCH_AMD64
7304 mov rax, [pvBitmap]
7305 mov edx, [iBit]
7306 lock btc [rax], edx
7307# else
7308 mov eax, [pvBitmap]
7309 mov edx, [iBit]
7310 lock btc [eax], edx
7311# endif
7312 }
7313# endif
7314# else
7315 ASMAtomicXorU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7316# endif
7317}
7318#endif
7319
7320
7321/**
7322 * Tests and sets a bit in a bitmap.
7323 *
7324 * @returns true if the bit was set.
7325 * @returns false if the bit was clear.
7326 *
7327 * @param pvBitmap Pointer to the bitmap (little endian).
7328 * @param iBit The bit to test and set.
7329 *
7330 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7331 * However, doing so will yield better performance as well as avoiding
7332 * traps accessing the last bits in the bitmap.
7333 */
7334#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7335RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7336#else
7337DECLINLINE(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7338{
7339 union { bool f; uint32_t u32; uint8_t u8; } rc;
7340# if RT_INLINE_ASM_USES_INTRIN
7341 rc.u8 = _bittestandset((long RT_FAR *)pvBitmap, iBit);
7342
7343# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7344# if RT_INLINE_ASM_GNU_STYLE
7345 __asm__ __volatile__("btsl %2, %1\n\t"
7346 "setc %b0\n\t"
7347 "andl $1, %0\n\t"
7348 : "=q" (rc.u32)
7349 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7350 : "Ir" (iBit)
7351 , "m" (*(volatile long RT_FAR *)pvBitmap)
7352 : "memory"
7353 , "cc");
7354# else
7355 __asm
7356 {
7357 mov edx, [iBit]
7358# ifdef RT_ARCH_AMD64
7359 mov rax, [pvBitmap]
7360 bts [rax], edx
7361# else
7362 mov eax, [pvBitmap]
7363 bts [eax], edx
7364# endif
7365 setc al
7366 and eax, 1
7367 mov [rc.u32], eax
7368 }
7369# endif
7370
7371# else
7372 int32_t offBitmap = iBit / 32;
7373 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7374 rc.u32 = RT_LE2H_U32(ASMAtomicUoOrExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7375 >> (iBit & 31);
7376 rc.u32 &= 1;
7377# endif
7378 return rc.f;
7379}
7380#endif
7381
7382
7383/**
7384 * Atomically tests and sets a bit in a bitmap, ordered.
7385 *
7386 * @returns true if the bit was set.
7387 * @returns false if the bit was clear.
7388 *
7389 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7390 * aligned, otherwise the memory access isn't atomic!
7391 * @param iBit The bit to set.
7392 *
7393 * @remarks x86: Requires a 386 or later.
7394 */
7395#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7396RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7397#else
7398DECLINLINE(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7399{
7400 union { bool f; uint32_t u32; uint8_t u8; } rc;
7401 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7402# if RT_INLINE_ASM_USES_INTRIN
7403 rc.u8 = _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
7404# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7405# if RT_INLINE_ASM_GNU_STYLE
7406 __asm__ __volatile__("lock; btsl %2, %1\n\t"
7407 "setc %b0\n\t"
7408 "andl $1, %0\n\t"
7409 : "=q" (rc.u32)
7410 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7411 : "Ir" (iBit)
7412 , "m" (*(volatile long RT_FAR *)pvBitmap)
7413 : "memory"
7414 , "cc");
7415# else
7416 __asm
7417 {
7418 mov edx, [iBit]
7419# ifdef RT_ARCH_AMD64
7420 mov rax, [pvBitmap]
7421 lock bts [rax], edx
7422# else
7423 mov eax, [pvBitmap]
7424 lock bts [eax], edx
7425# endif
7426 setc al
7427 and eax, 1
7428 mov [rc.u32], eax
7429 }
7430# endif
7431
7432# else
7433 rc.u32 = RT_LE2H_U32(ASMAtomicOrExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7434 >> (iBit & 31);
7435 rc.u32 &= 1;
7436# endif
7437 return rc.f;
7438}
7439#endif
7440
7441
7442/**
7443 * Tests and clears a bit in a bitmap.
7444 *
7445 * @returns true if the bit was set.
7446 * @returns false if the bit was clear.
7447 *
7448 * @param pvBitmap Pointer to the bitmap (little endian).
7449 * @param iBit The bit to test and clear.
7450 *
7451 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7452 * However, doing so will yield better performance as well as avoiding
7453 * traps accessing the last bits in the bitmap.
7454 */
7455#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7456RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7457#else
7458DECLINLINE(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7459{
7460 union { bool f; uint32_t u32; uint8_t u8; } rc;
7461# if RT_INLINE_ASM_USES_INTRIN
7462 rc.u8 = _bittestandreset((long RT_FAR *)pvBitmap, iBit);
7463
7464# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7465# if RT_INLINE_ASM_GNU_STYLE
7466 __asm__ __volatile__("btrl %2, %1\n\t"
7467 "setc %b0\n\t"
7468 "andl $1, %0\n\t"
7469 : "=q" (rc.u32)
7470 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7471 : "Ir" (iBit)
7472 , "m" (*(volatile long RT_FAR *)pvBitmap)
7473 : "memory"
7474 , "cc");
7475# else
7476 __asm
7477 {
7478 mov edx, [iBit]
7479# ifdef RT_ARCH_AMD64
7480 mov rax, [pvBitmap]
7481 btr [rax], edx
7482# else
7483 mov eax, [pvBitmap]
7484 btr [eax], edx
7485# endif
7486 setc al
7487 and eax, 1
7488 mov [rc.u32], eax
7489 }
7490# endif
7491
7492# else
7493 int32_t offBitmap = iBit / 32;
7494 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7495 rc.u32 = RT_LE2H_U32(ASMAtomicUoAndExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
7496 >> (iBit & 31);
7497 rc.u32 &= 1;
7498# endif
7499 return rc.f;
7500}
7501#endif
7502
7503
7504/**
7505 * Atomically tests and clears a bit in a bitmap, ordered.
7506 *
7507 * @returns true if the bit was set.
7508 * @returns false if the bit was clear.
7509 *
7510 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7511 * aligned, otherwise the memory access isn't atomic!
7512 * @param iBit The bit to test and clear.
7513 *
7514 * @remarks No memory barrier, take care on smp.
7515 * @remarks x86: Requires a 386 or later.
7516 */
7517#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7518RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7519#else
7520DECLINLINE(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7521{
7522 union { bool f; uint32_t u32; uint8_t u8; } rc;
7523 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7524# if RT_INLINE_ASM_USES_INTRIN
7525 rc.u8 = _interlockedbittestandreset((long RT_FAR *)pvBitmap, iBit);
7526
7527# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7528# if RT_INLINE_ASM_GNU_STYLE
7529 __asm__ __volatile__("lock; btrl %2, %1\n\t"
7530 "setc %b0\n\t"
7531 "andl $1, %0\n\t"
7532 : "=q" (rc.u32)
7533 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7534 : "Ir" (iBit)
7535 , "m" (*(volatile long RT_FAR *)pvBitmap)
7536 : "memory"
7537 , "cc");
7538# else
7539 __asm
7540 {
7541 mov edx, [iBit]
7542# ifdef RT_ARCH_AMD64
7543 mov rax, [pvBitmap]
7544 lock btr [rax], edx
7545# else
7546 mov eax, [pvBitmap]
7547 lock btr [eax], edx
7548# endif
7549 setc al
7550 and eax, 1
7551 mov [rc.u32], eax
7552 }
7553# endif
7554
7555# else
7556 rc.u32 = RT_LE2H_U32(ASMAtomicAndExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
7557 >> (iBit & 31);
7558 rc.u32 &= 1;
7559# endif
7560 return rc.f;
7561}
7562#endif
7563
7564
7565/**
7566 * Tests and toggles a bit in a bitmap.
7567 *
7568 * @returns true if the bit was set.
7569 * @returns false if the bit was clear.
7570 *
7571 * @param pvBitmap Pointer to the bitmap (little endian).
7572 * @param iBit The bit to test and toggle.
7573 *
7574 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7575 * However, doing so will yield better performance as well as avoiding
7576 * traps accessing the last bits in the bitmap.
7577 */
7578#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7579RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7580#else
7581DECLINLINE(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7582{
7583 union { bool f; uint32_t u32; uint8_t u8; } rc;
7584# if RT_INLINE_ASM_USES_INTRIN
7585 rc.u8 = _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
7586
7587# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7588# if RT_INLINE_ASM_GNU_STYLE
7589 __asm__ __volatile__("btcl %2, %1\n\t"
7590 "setc %b0\n\t"
7591 "andl $1, %0\n\t"
7592 : "=q" (rc.u32)
7593 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7594 : "Ir" (iBit)
7595 , "m" (*(volatile long RT_FAR *)pvBitmap)
7596 : "memory"
7597 , "cc");
7598# else
7599 __asm
7600 {
7601 mov edx, [iBit]
7602# ifdef RT_ARCH_AMD64
7603 mov rax, [pvBitmap]
7604 btc [rax], edx
7605# else
7606 mov eax, [pvBitmap]
7607 btc [eax], edx
7608# endif
7609 setc al
7610 and eax, 1
7611 mov [rc.u32], eax
7612 }
7613# endif
7614
7615# else
7616 int32_t offBitmap = iBit / 32;
7617 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7618 rc.u32 = RT_LE2H_U32(ASMAtomicUoXorExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7619 >> (iBit & 31);
7620 rc.u32 &= 1;
7621# endif
7622 return rc.f;
7623}
7624#endif
7625
7626
7627/**
7628 * Atomically tests and toggles a bit in a bitmap, ordered.
7629 *
7630 * @returns true if the bit was set.
7631 * @returns false if the bit was clear.
7632 *
7633 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7634 * aligned, otherwise the memory access isn't atomic!
7635 * @param iBit The bit to test and toggle.
7636 *
7637 * @remarks x86: Requires a 386 or later.
7638 */
7639#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7640RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7641#else
7642DECLINLINE(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7643{
7644 union { bool f; uint32_t u32; uint8_t u8; } rc;
7645 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7646# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7647# if RT_INLINE_ASM_GNU_STYLE
7648 __asm__ __volatile__("lock; btcl %2, %1\n\t"
7649 "setc %b0\n\t"
7650 "andl $1, %0\n\t"
7651 : "=q" (rc.u32)
7652 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7653 : "Ir" (iBit)
7654 , "m" (*(volatile long RT_FAR *)pvBitmap)
7655 : "memory"
7656 , "cc");
7657# else
7658 __asm
7659 {
7660 mov edx, [iBit]
7661# ifdef RT_ARCH_AMD64
7662 mov rax, [pvBitmap]
7663 lock btc [rax], edx
7664# else
7665 mov eax, [pvBitmap]
7666 lock btc [eax], edx
7667# endif
7668 setc al
7669 and eax, 1
7670 mov [rc.u32], eax
7671 }
7672# endif
7673
7674# else
7675 rc.u32 = RT_H2LE_U32(ASMAtomicXorExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_LE2H_U32(RT_BIT_32(iBit & 31))))
7676 >> (iBit & 31);
7677 rc.u32 &= 1;
7678# endif
7679 return rc.f;
7680}
7681#endif
7682
7683
7684/**
7685 * Tests if a bit in a bitmap is set.
7686 *
7687 * @returns true if the bit is set.
7688 * @returns false if the bit is clear.
7689 *
7690 * @param pvBitmap Pointer to the bitmap (little endian).
7691 * @param iBit The bit to test.
7692 *
7693 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7694 * However, doing so will yield better performance as well as avoiding
7695 * traps accessing the last bits in the bitmap.
7696 */
7697#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7698RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7699#else
7700DECLINLINE(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7701{
7702 union { bool f; uint32_t u32; uint8_t u8; } rc;
7703# if RT_INLINE_ASM_USES_INTRIN
7704 rc.u32 = _bittest((long *)pvBitmap, iBit);
7705
7706# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7707# if RT_INLINE_ASM_GNU_STYLE
7708
7709 __asm__ __volatile__("btl %2, %1\n\t"
7710 "setc %b0\n\t"
7711 "andl $1, %0\n\t"
7712 : "=q" (rc.u32)
7713 : "m" (*(const volatile long RT_FAR *)pvBitmap)
7714 , "Ir" (iBit)
7715 : "memory"
7716 , "cc");
7717# else
7718 __asm
7719 {
7720 mov edx, [iBit]
7721# ifdef RT_ARCH_AMD64
7722 mov rax, [pvBitmap]
7723 bt [rax], edx
7724# else
7725 mov eax, [pvBitmap]
7726 bt [eax], edx
7727# endif
7728 setc al
7729 and eax, 1
7730 mov [rc.u32], eax
7731 }
7732# endif
7733
7734# else
7735 int32_t offBitmap = iBit / 32;
7736 AssertRelease(!((uintptr_t)pvBitmap & (sizeof(uint32_t) - 1)));
7737 rc.u32 = RT_LE2H_U32(ASMAtomicUoReadU32(&((uint32_t volatile *)pvBitmap)[offBitmap])) >> (iBit & 31);
7738 rc.u32 &= 1;
7739# endif
7740 return rc.f;
7741}
7742#endif
7743
7744
7745#ifdef IPRT_INCLUDED_asm_mem_h
7746
7747/**
7748 * Clears a bit range within a bitmap.
7749 *
7750 * @param pvBitmap Pointer to the bitmap (little endian).
7751 * @param iBitStart The First bit to clear.
7752 * @param iBitEnd The first bit not to clear.
7753 */
7754DECLINLINE(void) ASMBitClearRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7755{
7756 if (iBitStart < iBitEnd)
7757 {
7758 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7759 size_t iStart = iBitStart & ~(size_t)31;
7760 size_t iEnd = iBitEnd & ~(size_t)31;
7761 if (iStart == iEnd)
7762 *pu32 &= RT_H2LE_U32(((UINT32_C(1) << (iBitStart & 31)) - 1) | ~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7763 else
7764 {
7765 /* bits in first dword. */
7766 if (iBitStart & 31)
7767 {
7768 *pu32 &= RT_H2LE_U32((UINT32_C(1) << (iBitStart & 31)) - 1);
7769 pu32++;
7770 iBitStart = iStart + 32;
7771 }
7772
7773 /* whole dwords. */
7774 if (iBitStart != iEnd)
7775 ASMMemZero32(pu32, (iEnd - iBitStart) >> 3);
7776
7777 /* bits in last dword. */
7778 if (iBitEnd & 31)
7779 {
7780 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7781 *pu32 &= RT_H2LE_U32(~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7782 }
7783 }
7784 }
7785}
7786
7787
7788/**
7789 * Sets a bit range within a bitmap.
7790 *
7791 * @param pvBitmap Pointer to the bitmap (little endian).
7792 * @param iBitStart The First bit to set.
7793 * @param iBitEnd The first bit not to set.
7794 */
7795DECLINLINE(void) ASMBitSetRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7796{
7797 if (iBitStart < iBitEnd)
7798 {
7799 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7800 size_t iStart = iBitStart & ~(size_t)31;
7801 size_t iEnd = iBitEnd & ~(size_t)31;
7802 if (iStart == iEnd)
7803 *pu32 |= RT_H2LE_U32(((UINT32_C(1) << (iBitEnd - iBitStart)) - 1) << (iBitStart & 31));
7804 else
7805 {
7806 /* bits in first dword. */
7807 if (iBitStart & 31)
7808 {
7809 *pu32 |= RT_H2LE_U32(~((UINT32_C(1) << (iBitStart & 31)) - 1));
7810 pu32++;
7811 iBitStart = iStart + 32;
7812 }
7813
7814 /* whole dword. */
7815 if (iBitStart != iEnd)
7816 ASMMemFill32(pu32, (iEnd - iBitStart) >> 3, ~UINT32_C(0));
7817
7818 /* bits in last dword. */
7819 if (iBitEnd & 31)
7820 {
7821 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7822 *pu32 |= RT_H2LE_U32((UINT32_C(1) << (iBitEnd & 31)) - 1);
7823 }
7824 }
7825 }
7826}
7827
7828#endif /* IPRT_INCLUDED_asm_mem_h */
7829
7830/**
7831 * Finds the first clear bit in a bitmap.
7832 *
7833 * @returns Index of the first zero bit.
7834 * @returns -1 if no clear bit was found.
7835 * @param pvBitmap Pointer to the bitmap (little endian).
7836 * @param cBits The number of bits in the bitmap. Multiple of 32.
7837 */
7838#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7839DECLASM(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7840#else
7841DECLINLINE(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7842{
7843 if (cBits)
7844 {
7845 int32_t iBit;
7846# if RT_INLINE_ASM_GNU_STYLE
7847 RTCCUINTREG uEAX, uECX, uEDI;
7848 cBits = RT_ALIGN_32(cBits, 32);
7849 __asm__ __volatile__("repe; scasl\n\t"
7850 "je 1f\n\t"
7851# ifdef RT_ARCH_AMD64
7852 "lea -4(%%rdi), %%rdi\n\t"
7853 "xorl (%%rdi), %%eax\n\t"
7854 "subq %5, %%rdi\n\t"
7855# else
7856 "lea -4(%%edi), %%edi\n\t"
7857 "xorl (%%edi), %%eax\n\t"
7858 "subl %5, %%edi\n\t"
7859# endif
7860 "shll $3, %%edi\n\t"
7861 "bsfl %%eax, %%edx\n\t"
7862 "addl %%edi, %%edx\n\t"
7863 "1:\t\n"
7864 : "=d" (iBit)
7865 , "=&c" (uECX)
7866 , "=&D" (uEDI)
7867 , "=&a" (uEAX)
7868 : "0" (0xffffffff)
7869 , "mr" (pvBitmap)
7870 , "1" (cBits >> 5)
7871 , "2" (pvBitmap)
7872 , "3" (0xffffffff)
7873 : "cc");
7874# else
7875 cBits = RT_ALIGN_32(cBits, 32);
7876 __asm
7877 {
7878# ifdef RT_ARCH_AMD64
7879 mov rdi, [pvBitmap]
7880 mov rbx, rdi
7881# else
7882 mov edi, [pvBitmap]
7883 mov ebx, edi
7884# endif
7885 mov edx, 0ffffffffh
7886 mov eax, edx
7887 mov ecx, [cBits]
7888 shr ecx, 5
7889 repe scasd
7890 je done
7891
7892# ifdef RT_ARCH_AMD64
7893 lea rdi, [rdi - 4]
7894 xor eax, [rdi]
7895 sub rdi, rbx
7896# else
7897 lea edi, [edi - 4]
7898 xor eax, [edi]
7899 sub edi, ebx
7900# endif
7901 shl edi, 3
7902 bsf edx, eax
7903 add edx, edi
7904 done:
7905 mov [iBit], edx
7906 }
7907# endif
7908 return iBit;
7909 }
7910 return -1;
7911}
7912#endif
7913
7914
7915/**
7916 * Finds the next clear bit in a bitmap.
7917 *
7918 * @returns Index of the first zero bit.
7919 * @returns -1 if no clear bit was found.
7920 * @param pvBitmap Pointer to the bitmap (little endian).
7921 * @param cBits The number of bits in the bitmap. Multiple of 32.
7922 * @param iBitPrev The bit returned from the last search.
7923 * The search will start at iBitPrev + 1.
7924 */
7925#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7926DECLASM(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
7927#else
7928DECLINLINE(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
7929{
7930 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
7931 int iBit = ++iBitPrev & 31;
7932 if (iBit)
7933 {
7934 /*
7935 * Inspect the 32-bit word containing the unaligned bit.
7936 */
7937 uint32_t u32 = ~pau32Bitmap[iBitPrev / 32] >> iBit;
7938
7939# if RT_INLINE_ASM_USES_INTRIN
7940 unsigned long ulBit = 0;
7941 if (_BitScanForward(&ulBit, u32))
7942 return ulBit + iBitPrev;
7943# else
7944# if RT_INLINE_ASM_GNU_STYLE
7945 __asm__ __volatile__("bsf %1, %0\n\t"
7946 "jnz 1f\n\t"
7947 "movl $-1, %0\n\t" /** @todo use conditional move for 64-bit? */
7948 "1:\n\t"
7949 : "=r" (iBit)
7950 : "r" (u32)
7951 : "cc");
7952# else
7953 __asm
7954 {
7955 mov edx, [u32]
7956 bsf eax, edx
7957 jnz done
7958 mov eax, 0ffffffffh
7959 done:
7960 mov [iBit], eax
7961 }
7962# endif
7963 if (iBit >= 0)
7964 return iBit + (int)iBitPrev;
7965# endif
7966
7967 /*
7968 * Skip ahead and see if there is anything left to search.
7969 */
7970 iBitPrev |= 31;
7971 iBitPrev++;
7972 if (cBits <= (uint32_t)iBitPrev)
7973 return -1;
7974 }
7975
7976 /*
7977 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
7978 */
7979 iBit = ASMBitFirstClear(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
7980 if (iBit >= 0)
7981 iBit += iBitPrev;
7982 return iBit;
7983}
7984#endif
7985
7986
7987/**
7988 * Finds the first set bit in a bitmap.
7989 *
7990 * @returns Index of the first set bit.
7991 * @returns -1 if no clear bit was found.
7992 * @param pvBitmap Pointer to the bitmap (little endian).
7993 * @param cBits The number of bits in the bitmap. Multiple of 32.
7994 */
7995#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7996DECLASM(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7997#else
7998DECLINLINE(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7999{
8000 if (cBits)
8001 {
8002 int32_t iBit;
8003# if RT_INLINE_ASM_GNU_STYLE
8004 RTCCUINTREG uEAX, uECX, uEDI;
8005 cBits = RT_ALIGN_32(cBits, 32);
8006 __asm__ __volatile__("repe; scasl\n\t"
8007 "je 1f\n\t"
8008# ifdef RT_ARCH_AMD64
8009 "lea -4(%%rdi), %%rdi\n\t"
8010 "movl (%%rdi), %%eax\n\t"
8011 "subq %5, %%rdi\n\t"
8012# else
8013 "lea -4(%%edi), %%edi\n\t"
8014 "movl (%%edi), %%eax\n\t"
8015 "subl %5, %%edi\n\t"
8016# endif
8017 "shll $3, %%edi\n\t"
8018 "bsfl %%eax, %%edx\n\t"
8019 "addl %%edi, %%edx\n\t"
8020 "1:\t\n"
8021 : "=d" (iBit)
8022 , "=&c" (uECX)
8023 , "=&D" (uEDI)
8024 , "=&a" (uEAX)
8025 : "0" (0xffffffff)
8026 , "mr" (pvBitmap)
8027 , "1" (cBits >> 5)
8028 , "2" (pvBitmap)
8029 , "3" (0)
8030 : "cc");
8031# else
8032 cBits = RT_ALIGN_32(cBits, 32);
8033 __asm
8034 {
8035# ifdef RT_ARCH_AMD64
8036 mov rdi, [pvBitmap]
8037 mov rbx, rdi
8038# else
8039 mov edi, [pvBitmap]
8040 mov ebx, edi
8041# endif
8042 mov edx, 0ffffffffh
8043 xor eax, eax
8044 mov ecx, [cBits]
8045 shr ecx, 5
8046 repe scasd
8047 je done
8048# ifdef RT_ARCH_AMD64
8049 lea rdi, [rdi - 4]
8050 mov eax, [rdi]
8051 sub rdi, rbx
8052# else
8053 lea edi, [edi - 4]
8054 mov eax, [edi]
8055 sub edi, ebx
8056# endif
8057 shl edi, 3
8058 bsf edx, eax
8059 add edx, edi
8060 done:
8061 mov [iBit], edx
8062 }
8063# endif
8064 return iBit;
8065 }
8066 return -1;
8067}
8068#endif
8069
8070
8071/**
8072 * Finds the next set bit in a bitmap.
8073 *
8074 * @returns Index of the next set bit.
8075 * @returns -1 if no set bit was found.
8076 * @param pvBitmap Pointer to the bitmap (little endian).
8077 * @param cBits The number of bits in the bitmap. Multiple of 32.
8078 * @param iBitPrev The bit returned from the last search.
8079 * The search will start at iBitPrev + 1.
8080 */
8081#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
8082DECLASM(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
8083#else
8084DECLINLINE(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
8085{
8086 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
8087 int iBit = ++iBitPrev & 31;
8088 if (iBit)
8089 {
8090 /*
8091 * Inspect the 32-bit word containing the unaligned bit.
8092 */
8093 uint32_t u32 = pau32Bitmap[iBitPrev / 32] >> iBit;
8094
8095# if RT_INLINE_ASM_USES_INTRIN
8096 unsigned long ulBit = 0;
8097 if (_BitScanForward(&ulBit, u32))
8098 return ulBit + iBitPrev;
8099# else
8100# if RT_INLINE_ASM_GNU_STYLE
8101 __asm__ __volatile__("bsf %1, %0\n\t"
8102 "jnz 1f\n\t" /** @todo use conditional move for 64-bit? */
8103 "movl $-1, %0\n\t"
8104 "1:\n\t"
8105 : "=r" (iBit)
8106 : "r" (u32)
8107 : "cc");
8108# else
8109 __asm
8110 {
8111 mov edx, [u32]
8112 bsf eax, edx
8113 jnz done
8114 mov eax, 0ffffffffh
8115 done:
8116 mov [iBit], eax
8117 }
8118# endif
8119 if (iBit >= 0)
8120 return iBit + (int)iBitPrev;
8121# endif
8122
8123 /*
8124 * Skip ahead and see if there is anything left to search.
8125 */
8126 iBitPrev |= 31;
8127 iBitPrev++;
8128 if (cBits <= (uint32_t)iBitPrev)
8129 return -1;
8130 }
8131
8132 /*
8133 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
8134 */
8135 iBit = ASMBitFirstSet(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
8136 if (iBit >= 0)
8137 iBit += iBitPrev;
8138 return iBit;
8139}
8140#endif
8141
8142/** @} */
8143
8144
8145/** @defgroup grp_inline_bits Bitmap Operations
8146 * @{
8147 */
8148
8149/**
8150 * Finds the first bit which is set in the given 32-bit integer.
8151 * Bits are numbered from 1 (least significant) to 32.
8152 *
8153 * @returns index [1..32] of the first set bit.
8154 * @returns 0 if all bits are cleared.
8155 * @param u32 Integer to search for set bits.
8156 * @remarks Similar to ffs() in BSD.
8157 */
8158#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8159RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_PROTO;
8160#else
8161DECLINLINE(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_DEF
8162{
8163# if RT_INLINE_ASM_USES_INTRIN
8164 unsigned long iBit;
8165 if (_BitScanForward(&iBit, u32))
8166 iBit++;
8167 else
8168 iBit = 0;
8169
8170# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8171# if RT_INLINE_ASM_GNU_STYLE
8172 uint32_t iBit;
8173 __asm__ __volatile__("bsf %1, %0\n\t"
8174 "jnz 1f\n\t"
8175 "xorl %0, %0\n\t"
8176 "jmp 2f\n"
8177 "1:\n\t"
8178 "incl %0\n"
8179 "2:\n\t"
8180 : "=r" (iBit)
8181 : "rm" (u32)
8182 : "cc");
8183# else
8184 uint32_t iBit;
8185 _asm
8186 {
8187 bsf eax, [u32]
8188 jnz found
8189 xor eax, eax
8190 jmp done
8191 found:
8192 inc eax
8193 done:
8194 mov [iBit], eax
8195 }
8196# endif
8197
8198# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8199 /*
8200 * Using the "count leading zeros (clz)" instruction here because there
8201 * is no dedicated instruction to get the first set bit.
8202 * Need to reverse the bits in the value with "rbit" first because
8203 * "clz" starts counting from the most significant bit.
8204 */
8205 uint32_t iBit;
8206 __asm__ __volatile__(
8207# if defined(RT_ARCH_ARM64)
8208 "rbit %w[uVal], %w[uVal]\n\t"
8209 "clz %w[iBit], %w[uVal]\n\t"
8210# else
8211 "rbit %[uVal], %[uVal]\n\t"
8212 "clz %[iBit], %[uVal]\n\t"
8213# endif
8214 : [uVal] "=r" (u32)
8215 , [iBit] "=r" (iBit)
8216 : "[uVal]" (u32));
8217 if (iBit != 32)
8218 iBit++;
8219 else
8220 iBit = 0; /* No bit set. */
8221
8222# else
8223# error "Port me"
8224# endif
8225 return iBit;
8226}
8227#endif
8228
8229
8230/**
8231 * Finds the first bit which is set in the given 32-bit integer.
8232 * Bits are numbered from 1 (least significant) to 32.
8233 *
8234 * @returns index [1..32] of the first set bit.
8235 * @returns 0 if all bits are cleared.
8236 * @param i32 Integer to search for set bits.
8237 * @remark Similar to ffs() in BSD.
8238 */
8239DECLINLINE(unsigned) ASMBitFirstSetS32(int32_t i32) RT_NOTHROW_DEF
8240{
8241 return ASMBitFirstSetU32((uint32_t)i32);
8242}
8243
8244
8245/**
8246 * Finds the first bit which is set in the given 64-bit integer.
8247 *
8248 * Bits are numbered from 1 (least significant) to 64.
8249 *
8250 * @returns index [1..64] of the first set bit.
8251 * @returns 0 if all bits are cleared.
8252 * @param u64 Integer to search for set bits.
8253 * @remarks Similar to ffs() in BSD.
8254 */
8255#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8256RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_PROTO;
8257#else
8258DECLINLINE(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_DEF
8259{
8260# if RT_INLINE_ASM_USES_INTRIN
8261 unsigned long iBit;
8262# if ARCH_BITS == 64
8263 if (_BitScanForward64(&iBit, u64))
8264 iBit++;
8265 else
8266 iBit = 0;
8267# else
8268 if (_BitScanForward(&iBit, (uint32_t)u64))
8269 iBit++;
8270 else if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
8271 iBit += 33;
8272 else
8273 iBit = 0;
8274# endif
8275
8276# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8277 uint64_t iBit;
8278 __asm__ __volatile__("bsfq %1, %0\n\t"
8279 "jnz 1f\n\t"
8280 "xorl %k0, %k0\n\t"
8281 "jmp 2f\n"
8282 "1:\n\t"
8283 "incl %k0\n"
8284 "2:\n\t"
8285 : "=r" (iBit)
8286 : "rm" (u64)
8287 : "cc");
8288
8289# elif defined(RT_ARCH_ARM64)
8290 uint64_t iBit;
8291 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
8292 "clz %[iBit], %[uVal]\n\t"
8293 : [uVal] "=r" (u64)
8294 , [iBit] "=r" (iBit)
8295 : "[uVal]" (u64));
8296 if (iBit != 64)
8297 iBit++;
8298 else
8299 iBit = 0; /* No bit set. */
8300
8301# else
8302 unsigned iBit = ASMBitFirstSetU32((uint32_t)u64);
8303 if (!iBit)
8304 {
8305 iBit = ASMBitFirstSetU32((uint32_t)(u64 >> 32));
8306 if (iBit)
8307 iBit += 32;
8308 }
8309# endif
8310 return (unsigned)iBit;
8311}
8312#endif
8313
8314
8315/**
8316 * Finds the first bit which is set in the given 16-bit integer.
8317 *
8318 * Bits are numbered from 1 (least significant) to 16.
8319 *
8320 * @returns index [1..16] of the first set bit.
8321 * @returns 0 if all bits are cleared.
8322 * @param u16 Integer to search for set bits.
8323 * @remarks For 16-bit bs3kit code.
8324 */
8325#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8326RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_PROTO;
8327#else
8328DECLINLINE(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_DEF
8329{
8330 return ASMBitFirstSetU32((uint32_t)u16);
8331}
8332#endif
8333
8334
8335/**
8336 * Finds the last bit which is set in the given 32-bit integer.
8337 * Bits are numbered from 1 (least significant) to 32.
8338 *
8339 * @returns index [1..32] of the last set bit.
8340 * @returns 0 if all bits are cleared.
8341 * @param u32 Integer to search for set bits.
8342 * @remark Similar to fls() in BSD.
8343 */
8344#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8345RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_PROTO;
8346#else
8347DECLINLINE(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_DEF
8348{
8349# if RT_INLINE_ASM_USES_INTRIN
8350 unsigned long iBit;
8351 if (_BitScanReverse(&iBit, u32))
8352 iBit++;
8353 else
8354 iBit = 0;
8355
8356# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8357# if RT_INLINE_ASM_GNU_STYLE
8358 uint32_t iBit;
8359 __asm__ __volatile__("bsrl %1, %0\n\t"
8360 "jnz 1f\n\t"
8361 "xorl %0, %0\n\t"
8362 "jmp 2f\n"
8363 "1:\n\t"
8364 "incl %0\n"
8365 "2:\n\t"
8366 : "=r" (iBit)
8367 : "rm" (u32)
8368 : "cc");
8369# else
8370 uint32_t iBit;
8371 _asm
8372 {
8373 bsr eax, [u32]
8374 jnz found
8375 xor eax, eax
8376 jmp done
8377 found:
8378 inc eax
8379 done:
8380 mov [iBit], eax
8381 }
8382# endif
8383
8384# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8385 uint32_t iBit;
8386 __asm__ __volatile__(
8387# if defined(RT_ARCH_ARM64)
8388 "clz %w[iBit], %w[uVal]\n\t"
8389# else
8390 "clz %[iBit], %[uVal]\n\t"
8391# endif
8392 : [iBit] "=r" (iBit)
8393 : [uVal] "r" (u32));
8394 iBit = 32 - iBit;
8395
8396# else
8397# error "Port me"
8398# endif
8399 return iBit;
8400}
8401#endif
8402
8403
8404/**
8405 * Finds the last bit which is set in the given 32-bit integer.
8406 * Bits are numbered from 1 (least significant) to 32.
8407 *
8408 * @returns index [1..32] of the last set bit.
8409 * @returns 0 if all bits are cleared.
8410 * @param i32 Integer to search for set bits.
8411 * @remark Similar to fls() in BSD.
8412 */
8413DECLINLINE(unsigned) ASMBitLastSetS32(int32_t i32) RT_NOTHROW_DEF
8414{
8415 return ASMBitLastSetU32((uint32_t)i32);
8416}
8417
8418
8419/**
8420 * Finds the last bit which is set in the given 64-bit integer.
8421 *
8422 * Bits are numbered from 1 (least significant) to 64.
8423 *
8424 * @returns index [1..64] of the last set bit.
8425 * @returns 0 if all bits are cleared.
8426 * @param u64 Integer to search for set bits.
8427 * @remark Similar to fls() in BSD.
8428 */
8429#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8430RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_PROTO;
8431#else
8432DECLINLINE(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_DEF
8433{
8434# if RT_INLINE_ASM_USES_INTRIN
8435 unsigned long iBit;
8436# if ARCH_BITS == 64
8437 if (_BitScanReverse64(&iBit, u64))
8438 iBit++;
8439 else
8440 iBit = 0;
8441# else
8442 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
8443 iBit += 33;
8444 else if (_BitScanReverse(&iBit, (uint32_t)u64))
8445 iBit++;
8446 else
8447 iBit = 0;
8448# endif
8449
8450# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8451 uint64_t iBit;
8452 __asm__ __volatile__("bsrq %1, %0\n\t"
8453 "jnz 1f\n\t"
8454 "xorl %k0, %k0\n\t"
8455 "jmp 2f\n"
8456 "1:\n\t"
8457 "incl %k0\n"
8458 "2:\n\t"
8459 : "=r" (iBit)
8460 : "rm" (u64)
8461 : "cc");
8462
8463# elif defined(RT_ARCH_ARM64)
8464 uint64_t iBit;
8465 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
8466 : [iBit] "=r" (iBit)
8467 : [uVal] "r" (u64));
8468 iBit = 64 - iBit;
8469
8470# else
8471 unsigned iBit = ASMBitLastSetU32((uint32_t)(u64 >> 32));
8472 if (iBit)
8473 iBit += 32;
8474 else
8475 iBit = ASMBitLastSetU32((uint32_t)u64);
8476# endif
8477 return (unsigned)iBit;
8478}
8479#endif
8480
8481
8482/**
8483 * Finds the last bit which is set in the given 16-bit integer.
8484 *
8485 * Bits are numbered from 1 (least significant) to 16.
8486 *
8487 * @returns index [1..16] of the last set bit.
8488 * @returns 0 if all bits are cleared.
8489 * @param u16 Integer to search for set bits.
8490 * @remarks For 16-bit bs3kit code.
8491 */
8492#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8493RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_PROTO;
8494#else
8495DECLINLINE(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_DEF
8496{
8497 return ASMBitLastSetU32((uint32_t)u16);
8498}
8499#endif
8500
8501
8502/**
8503 * Count the number of leading zero bits in the given 32-bit integer.
8504 *
8505 * The counting starts with the most significate bit.
8506 *
8507 * @returns Number of most significant zero bits.
8508 * @returns 32 if all bits are cleared.
8509 * @param u32 Integer to consider.
8510 * @remarks Similar to __builtin_clz() in gcc, except defined zero input result.
8511 */
8512#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8513RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8514#else
8515DECLINLINE(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8516{
8517# if RT_INLINE_ASM_USES_INTRIN
8518 unsigned long iBit;
8519 if (!_BitScanReverse(&iBit, u32))
8520 return 32;
8521 return 31 - (unsigned)iBit;
8522
8523# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8524 uint32_t iBit;
8525# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 929 vs 237 ps/call */
8526 __asm__ __volatile__("bsrl %1, %0\n\t"
8527 "cmovzl %2, %0\n\t"
8528 : "=&r" (iBit)
8529 : "rm" (u32)
8530 , "rm" ((int32_t)-1)
8531 : "cc");
8532# elif RT_INLINE_ASM_GNU_STYLE
8533 __asm__ __volatile__("bsr %1, %0\n\t"
8534 "jnz 1f\n\t"
8535 "mov $-1, %0\n\t"
8536 "1:\n\t"
8537 : "=r" (iBit)
8538 : "rm" (u32)
8539 : "cc");
8540# else
8541 _asm
8542 {
8543 bsr eax, [u32]
8544 jnz found
8545 mov eax, -1
8546 found:
8547 mov [iBit], eax
8548 }
8549# endif
8550 return 31 - iBit;
8551
8552# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8553 uint32_t iBit;
8554 __asm__ __volatile__(
8555# if defined(RT_ARCH_ARM64)
8556 "clz %w[iBit], %w[uVal]\n\t"
8557# else
8558 "clz %[iBit], %[uVal]\n\t"
8559# endif
8560 : [uVal] "=r" (u32)
8561 , [iBit] "=r" (iBit)
8562 : "[uVal]" (u32));
8563 return iBit;
8564
8565# elif defined(__GNUC__)
8566 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8567 return u32 ? __builtin_clz(u32) : 32;
8568
8569# else
8570# error "Port me"
8571# endif
8572}
8573#endif
8574
8575
8576/**
8577 * Count the number of leading zero bits in the given 64-bit integer.
8578 *
8579 * The counting starts with the most significate bit.
8580 *
8581 * @returns Number of most significant zero bits.
8582 * @returns 64 if all bits are cleared.
8583 * @param u64 Integer to consider.
8584 * @remarks Similar to __builtin_clzl() in gcc, except defined zero input
8585 * result.
8586 */
8587#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8588RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8589#else
8590DECLINLINE(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8591{
8592# if RT_INLINE_ASM_USES_INTRIN
8593 unsigned long iBit;
8594# if ARCH_BITS == 64
8595 if (_BitScanReverse64(&iBit, u64))
8596 return 63 - (unsigned)iBit;
8597# else
8598 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
8599 return 31 - (unsigned)iBit;
8600 if (_BitScanReverse(&iBit, (uint32_t)u64))
8601 return 63 - (unsigned)iBit;
8602# endif
8603 return 64;
8604
8605# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8606 uint64_t iBit;
8607# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8608 __asm__ __volatile__("bsrq %1, %0\n\t"
8609 "cmovzq %2, %0\n\t"
8610 : "=&r" (iBit)
8611 : "rm" (u64)
8612 , "rm" ((int64_t)-1)
8613 : "cc");
8614# else /* 10980xe benchmark: 262 ps/call */
8615 __asm__ __volatile__("bsrq %1, %0\n\t"
8616 "jnz 1f\n\t"
8617 "mov $-1, %0\n\t"
8618 "1:\n\t"
8619 : "=&r" (iBit)
8620 : "rm" (u64)
8621 : "cc");
8622# endif
8623 return 63 - (unsigned)iBit;
8624
8625# elif defined(RT_ARCH_ARM64)
8626 uint64_t iBit;
8627 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
8628 : [uVal] "=r" (u64)
8629 , [iBit] "=r" (iBit)
8630 : "[uVal]" (u64));
8631 return (unsigned)iBit;
8632
8633# elif defined(__GNUC__) && ARCH_BITS == 64
8634 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8635 return u64 ? __builtin_clzl(u64) : 64;
8636
8637# else
8638 unsigned iBit = ASMCountLeadingZerosU32((uint32_t)(u64 >> 32));
8639 if (iBit == 32)
8640 iBit = ASMCountLeadingZerosU32((uint32_t)u64) + 32;
8641 return iBit;
8642# endif
8643}
8644#endif
8645
8646
8647/**
8648 * Count the number of leading zero bits in the given 16-bit integer.
8649 *
8650 * The counting starts with the most significate bit.
8651 *
8652 * @returns Number of most significant zero bits.
8653 * @returns 16 if all bits are cleared.
8654 * @param u16 Integer to consider.
8655 */
8656#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8657RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8658#else
8659DECLINLINE(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8660{
8661# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 987 vs 292 ps/call) */
8662 uint16_t iBit;
8663 __asm__ __volatile__("bsrw %1, %0\n\t"
8664 "jnz 1f\n\t"
8665 "mov $-1, %0\n\t"
8666 "1:\n\t"
8667 : "=r" (iBit)
8668 : "rm" (u16)
8669 : "cc");
8670 return 15 - (int16_t)iBit;
8671# else
8672 return ASMCountLeadingZerosU32((uint32_t)u16) - 16;
8673# endif
8674}
8675#endif
8676
8677
8678/**
8679 * Count the number of trailing zero bits in the given 32-bit integer.
8680 *
8681 * The counting starts with the least significate bit, i.e. the zero bit.
8682 *
8683 * @returns Number of least significant zero bits.
8684 * @returns 32 if all bits are cleared.
8685 * @param u32 Integer to consider.
8686 * @remarks Similar to __builtin_ctz() in gcc, except defined zero input result.
8687 */
8688#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8689RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8690#else
8691DECLINLINE(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8692{
8693# if RT_INLINE_ASM_USES_INTRIN
8694 unsigned long iBit;
8695 if (!_BitScanForward(&iBit, u32))
8696 return 32;
8697 return (unsigned)iBit;
8698
8699# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8700 uint32_t iBit;
8701# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 932 vs 240 ps/call */
8702 __asm__ __volatile__("bsfl %1, %0\n\t"
8703 "cmovzl %2, %0\n\t"
8704 : "=&r" (iBit)
8705 : "rm" (u32)
8706 , "rm" ((int32_t)32)
8707 : "cc");
8708# elif RT_INLINE_ASM_GNU_STYLE
8709 __asm__ __volatile__("bsfl %1, %0\n\t"
8710 "jnz 1f\n\t"
8711 "mov $32, %0\n\t"
8712 "1:\n\t"
8713 : "=r" (iBit)
8714 : "rm" (u32)
8715 : "cc");
8716# else
8717 _asm
8718 {
8719 bsf eax, [u32]
8720 jnz found
8721 mov eax, 32
8722 found:
8723 mov [iBit], eax
8724 }
8725# endif
8726 return iBit;
8727
8728# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8729 /* Invert the bits and use clz. */
8730 uint32_t iBit;
8731 __asm__ __volatile__(
8732# if defined(RT_ARCH_ARM64)
8733 "rbit %w[uVal], %w[uVal]\n\t"
8734 "clz %w[iBit], %w[uVal]\n\t"
8735# else
8736 "rbit %[uVal], %[uVal]\n\t"
8737 "clz %[iBit], %[uVal]\n\t"
8738# endif
8739 : [uVal] "=r" (u32)
8740 , [iBit] "=r" (iBit)
8741 : "[uVal]" (u32));
8742 return iBit;
8743
8744# elif defined(__GNUC__)
8745 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8746 return u32 ? __builtin_ctz(u32) : 32;
8747
8748# else
8749# error "Port me"
8750# endif
8751}
8752#endif
8753
8754
8755/**
8756 * Count the number of trailing zero bits in the given 64-bit integer.
8757 *
8758 * The counting starts with the least significate bit.
8759 *
8760 * @returns Number of least significant zero bits.
8761 * @returns 64 if all bits are cleared.
8762 * @param u64 Integer to consider.
8763 * @remarks Similar to __builtin_ctzl() in gcc, except defined zero input
8764 * result.
8765 */
8766#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8767RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8768#else
8769DECLINLINE(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8770{
8771# if RT_INLINE_ASM_USES_INTRIN
8772 unsigned long iBit;
8773# if ARCH_BITS == 64
8774 if (_BitScanForward64(&iBit, u64))
8775 return (unsigned)iBit;
8776# else
8777 if (_BitScanForward(&iBit, (uint32_t)u64))
8778 return (unsigned)iBit;
8779 if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
8780 return (unsigned)iBit + 32;
8781# endif
8782 return 64;
8783
8784# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8785 uint64_t iBit;
8786# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8787 __asm__ __volatile__("bsfq %1, %0\n\t"
8788 "cmovzq %2, %0\n\t"
8789 : "=&r" (iBit)
8790 : "rm" (u64)
8791 , "rm" ((int64_t)64)
8792 : "cc");
8793# else /* 10980xe benchmark: 262 ps/call */
8794 __asm__ __volatile__("bsfq %1, %0\n\t"
8795 "jnz 1f\n\t"
8796 "mov $64, %0\n\t"
8797 "1:\n\t"
8798 : "=&r" (iBit)
8799 : "rm" (u64)
8800 : "cc");
8801# endif
8802 return (unsigned)iBit;
8803
8804# elif defined(RT_ARCH_ARM64)
8805 /* Invert the bits and use clz. */
8806 uint64_t iBit;
8807 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
8808 "clz %[iBit], %[uVal]\n\t"
8809 : [uVal] "=r" (u64)
8810 , [iBit] "=r" (iBit)
8811 : "[uVal]" (u64));
8812 return (unsigned)iBit;
8813
8814# elif defined(__GNUC__) && ARCH_BITS == 64
8815 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8816 return u64 ? __builtin_ctzl(u64) : 64;
8817
8818# else
8819 unsigned iBit = ASMCountTrailingZerosU32((uint32_t)u64);
8820 if (iBit == 32)
8821 iBit = ASMCountTrailingZerosU32((uint32_t)(u64 >> 32)) + 32;
8822 return iBit;
8823# endif
8824}
8825#endif
8826
8827
8828/**
8829 * Count the number of trailing zero bits in the given 16-bit integer.
8830 *
8831 * The counting starts with the most significate bit.
8832 *
8833 * @returns Number of most significant zero bits.
8834 * @returns 16 if all bits are cleared.
8835 * @param u16 Integer to consider.
8836 */
8837#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8838RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8839#else
8840DECLINLINE(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8841{
8842# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 992 vs 349 ps/call) */
8843 uint16_t iBit;
8844 __asm__ __volatile__("bsfw %1, %0\n\t"
8845 "jnz 1f\n\t"
8846 "mov $16, %0\n\t"
8847 "1:\n\t"
8848 : "=r" (iBit)
8849 : "rm" (u16)
8850 : "cc");
8851 return iBit;
8852# else
8853 return ASMCountTrailingZerosU32((uint32_t)u16 | UINT32_C(0x10000));
8854#endif
8855}
8856#endif
8857
8858
8859/**
8860 * Rotate 32-bit unsigned value to the left by @a cShift.
8861 *
8862 * @returns Rotated value.
8863 * @param u32 The value to rotate.
8864 * @param cShift How many bits to rotate by.
8865 */
8866#ifdef __WATCOMC__
8867RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateLeftU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8868#else
8869DECLINLINE(uint32_t) ASMRotateLeftU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8870{
8871# if RT_INLINE_ASM_USES_INTRIN
8872 return _rotl(u32, cShift);
8873
8874# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8875 __asm__ __volatile__("roll %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8876 return u32;
8877
8878# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8879 __asm__ __volatile__(
8880# if defined(RT_ARCH_ARM64)
8881 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8882# else
8883 "ror %[uRet], %[uVal], %[cShift]\n\t"
8884# endif
8885 : [uRet] "=r" (u32)
8886 : [uVal] "[uRet]" (u32)
8887 , [cShift] "r" (32 - (cShift & 31))); /** @todo there is an immediate form here */
8888 return u32;
8889
8890# else
8891 cShift &= 31;
8892 return (u32 << cShift) | (u32 >> (32 - cShift));
8893# endif
8894}
8895#endif
8896
8897
8898/**
8899 * Rotate 32-bit unsigned value to the right by @a cShift.
8900 *
8901 * @returns Rotated value.
8902 * @param u32 The value to rotate.
8903 * @param cShift How many bits to rotate by.
8904 */
8905#ifdef __WATCOMC__
8906RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateRightU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8907#else
8908DECLINLINE(uint32_t) ASMRotateRightU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8909{
8910# if RT_INLINE_ASM_USES_INTRIN
8911 return _rotr(u32, cShift);
8912
8913# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8914 __asm__ __volatile__("rorl %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8915 return u32;
8916
8917# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8918 __asm__ __volatile__(
8919# if defined(RT_ARCH_ARM64)
8920 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8921# else
8922 "ror %[uRet], %[uVal], %[cShift]\n\t"
8923# endif
8924 : [uRet] "=r" (u32)
8925 : [uVal] "[uRet]" (u32)
8926 , [cShift] "r" (cShift & 31)); /** @todo there is an immediate form here */
8927 return u32;
8928
8929# else
8930 cShift &= 31;
8931 return (u32 >> cShift) | (u32 << (32 - cShift));
8932# endif
8933}
8934#endif
8935
8936
8937/**
8938 * Rotate 64-bit unsigned value to the left by @a cShift.
8939 *
8940 * @returns Rotated value.
8941 * @param u64 The value to rotate.
8942 * @param cShift How many bits to rotate by.
8943 */
8944DECLINLINE(uint64_t) ASMRotateLeftU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8945{
8946#if RT_INLINE_ASM_USES_INTRIN
8947 return _rotl64(u64, cShift);
8948
8949#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8950 __asm__ __volatile__("rolq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8951 return u64;
8952
8953#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8954 uint32_t uSpill;
8955 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8956 "jz 1f\n\t"
8957 "xchgl %%eax, %%edx\n\t"
8958 "1:\n\t"
8959 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8960 "jz 2f\n\t"
8961 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8962 "shldl %%cl,%%eax,%%edx\n\t" /* shift the hi value left, feeding MSBits from the low value. */
8963 "shldl %%cl,%2,%%eax\n\t" /* shift the lo value left, feeding MSBits from the saved hi value. */
8964 "2:\n\t" /* } */
8965 : "=A" (u64)
8966 , "=c" (cShift)
8967 , "=r" (uSpill)
8968 : "0" (u64)
8969 , "1" (cShift)
8970 : "cc");
8971 return u64;
8972
8973# elif defined(RT_ARCH_ARM64)
8974 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8975 : [uRet] "=r" (u64)
8976 : [uVal] "[uRet]" (u64)
8977 , [cShift] "r" ((uint64_t)(64 - (cShift & 63)))); /** @todo there is an immediate form here */
8978 return u64;
8979
8980#else
8981 cShift &= 63;
8982 return (u64 << cShift) | (u64 >> (64 - cShift));
8983#endif
8984}
8985
8986
8987/**
8988 * Rotate 64-bit unsigned value to the right by @a cShift.
8989 *
8990 * @returns Rotated value.
8991 * @param u64 The value to rotate.
8992 * @param cShift How many bits to rotate by.
8993 */
8994DECLINLINE(uint64_t) ASMRotateRightU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8995{
8996#if RT_INLINE_ASM_USES_INTRIN
8997 return _rotr64(u64, cShift);
8998
8999#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
9000 __asm__ __volatile__("rorq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
9001 return u64;
9002
9003#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
9004 uint32_t uSpill;
9005 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
9006 "jz 1f\n\t"
9007 "xchgl %%eax, %%edx\n\t"
9008 "1:\n\t"
9009 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
9010 "jz 2f\n\t"
9011 "movl %%edx, %2\n\t" /* save the hi value in %3. */
9012 "shrdl %%cl,%%eax,%%edx\n\t" /* shift the hi value right, feeding LSBits from the low value. */
9013 "shrdl %%cl,%2,%%eax\n\t" /* shift the lo value right, feeding LSBits from the saved hi value. */
9014 "2:\n\t" /* } */
9015 : "=A" (u64)
9016 , "=c" (cShift)
9017 , "=r" (uSpill)
9018 : "0" (u64)
9019 , "1" (cShift)
9020 : "cc");
9021 return u64;
9022
9023# elif defined(RT_ARCH_ARM64)
9024 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
9025 : [uRet] "=r" (u64)
9026 : [uVal] "[uRet]" (u64)
9027 , [cShift] "r" ((uint64_t)(cShift & 63))); /** @todo there is an immediate form here */
9028 return u64;
9029
9030#else
9031 cShift &= 63;
9032 return (u64 >> cShift) | (u64 << (64 - cShift));
9033#endif
9034}
9035
9036/** @} */
9037
9038
9039/** @} */
9040
9041/*
9042 * Include #pragma aux definitions for Watcom C/C++.
9043 */
9044#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
9045# define IPRT_ASM_WATCOM_X86_16_WITH_PRAGMAS
9046# undef IPRT_INCLUDED_asm_watcom_x86_16_h
9047# include "asm-watcom-x86-16.h"
9048#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
9049# define IPRT_ASM_WATCOM_X86_32_WITH_PRAGMAS
9050# undef IPRT_INCLUDED_asm_watcom_x86_32_h
9051# include "asm-watcom-x86-32.h"
9052#endif
9053
9054#endif /* !IPRT_INCLUDED_asm_h */
9055
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette