asm-math.h@ 81025

Last change on this file since 81025 was 76585, checked in by vboxsync, 6 years ago
*: scm --fix-header-guard-endif
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 13.3 KB

Line
1	/** @file
2	* IPRT - Assembly Routines for Optimizing some Integers Math Operations.
3	*/
4
5	/*
6	* Copyright (C) 2006-2019 Oracle Corporation
7	*
8	* This file is part of VirtualBox Open Source Edition (OSE), as
9	* available from http://www.virtualbox.org. This file is free software;
10	* you can redistribute it and/or modify it under the terms of the GNU
11	* General Public License (GPL) as published by the Free Software
12	* Foundation, in version 2 as it comes in the "COPYING" file of the
13	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15	*
16	* The contents of this file may alternatively be used under the terms
17	* of the Common Development and Distribution License Version 1.0
18	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
19	* VirtualBox OSE distribution, in which case the provisions of the
20	* CDDL are applicable instead of those of the GPL.
21	*
22	* You may elect to license modified versions of this file under the
23	* terms and conditions of either the GPL or the CDDL or both.
24	*/
25
26	#ifndef IPRT_INCLUDED_asm_math_h
27	#define IPRT_INCLUDED_asm_math_h
28	#ifndef RT_WITHOUT_PRAGMA_ONCE
29	# pragma once
30	#endif
31
32	#include <iprt/types.h>
33
34	#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
35	# pragma warning(push)
36	# pragma warning(disable:4668) /* Several incorrect __cplusplus uses. */
37	# pragma warning(disable:4255) /* Incorrect __slwpcb prototype. */
38	# include <intrin.h>
39	# pragma warning(pop)
40	/* Emit the intrinsics at all optimization levels. */
41	# pragma intrinsic(__emul)
42	# pragma intrinsic(__emulu)
43	# ifdef RT_ARCH_AMD64
44	# pragma intrinsic(_mul128)
45	# pragma intrinsic(_umul128)
46	# endif
47	#endif
48
49
50	/** @defgroup grp_rt_asm_math Integer Math Optimizations
51	* @ingroup grp_rt_asm
52	* @{ */
53
54	/**
55	* Multiplies two unsigned 32-bit values returning an unsigned 64-bit result.
56	*
	* The complete 64-bit product is returned, so no bits of the multiplication
	* are lost. On 32-bit x86 builds with RT_INLINE_ASM_EXTERNAL set and no
	* compiler intrinsics, only a DECLASM prototype is declared here
	* (presumably implemented in a separate assembly source - confirm).
	*
57	* @returns u32F1 * u32F2.
	* @param u32F1 The first 32-bit factor.
	* @param u32F2 The second 32-bit factor.
58	*/
59
60	#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
61	DECLASM(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2);
62	#else
63	DECLINLINE(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2)
64	{
65	# ifdef RT_ARCH_X86
66	uint64_t u64;
67	# if RT_INLINE_ASM_GNU_STYLE
	/* eax = u32F2, edx = u32F1; mull leaves the product in edx:eax, which "=A" maps onto u64. */
68	__asm__ __volatile__("mull %%edx"
69	: "=A" (u64)
70	: "a" (u32F2), "d" (u32F1));
71	# elif RT_INLINE_ASM_USES_INTRIN
	/* Compiler intrinsic for the unsigned 32x32 -> 64 multiplication. */
72	u64 = __emulu(u32F1, u32F2);
73	# else
	/* MSC-style inline assembly: same mul, result stored as low dword (eax) and high dword (edx). */
74	__asm
75	{
76	mov edx, [u32F1]
77	mov eax, [u32F2]
78	mul edx
79	mov dword ptr [u64], eax
80	mov dword ptr [u64 + 4], edx
81	}
82	# endif
83	return u64;
84	# else /* generic: */
	/* Widen one factor first so the multiplication is carried out in 64 bits. */
85	return (uint64_t)u32F1 * u32F2;
86	# endif
87	}
88	#endif
89
90
91	/**
92	* Multiplies two signed 32-bit values returning a signed 64-bit result.
93	*
	* The complete 64-bit product is returned, so no bits of the multiplication
	* are lost. On 32-bit x86 builds with RT_INLINE_ASM_EXTERNAL set and no
	* compiler intrinsics, only a DECLASM prototype is declared here
	* (presumably implemented in a separate assembly source - confirm).
	*
94	* @returns i32F1 * i32F2.
	* @param i32F1 The first signed 32-bit factor.
	* @param i32F2 The second signed 32-bit factor.
95	*/
96	#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
97	DECLASM(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2);
98	#else
99	DECLINLINE(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2)
100	{
101	# ifdef RT_ARCH_X86
102	int64_t i64;
103	# if RT_INLINE_ASM_GNU_STYLE
	/* eax = i32F2, edx = i32F1; imull leaves the signed product in edx:eax, which "=A" maps onto i64. */
104	__asm__ __volatile__("imull %%edx"
105	: "=A" (i64)
106	: "a" (i32F2), "d" (i32F1));
107	# elif RT_INLINE_ASM_USES_INTRIN
	/* Compiler intrinsic for the signed 32x32 -> 64 multiplication. */
108	i64 = __emul(i32F1, i32F2);
109	# else
	/* MSC-style inline assembly: same imul, result stored as low dword (eax) and high dword (edx). */
110	__asm
111	{
112	mov edx, [i32F1]
113	mov eax, [i32F2]
114	imul edx
115	mov dword ptr [i64], eax
116	mov dword ptr [i64 + 4], edx
117	}
118	# endif
119	return i64;
120	# else /* generic: */
	/* Widen one factor first so the multiplication is carried out in 64 bits. */
121	return (int64_t)i32F1 * i32F2;
122	# endif
123	}
124	#endif
125
126
127	#if ARCH_BITS == 64
	/**
	* Multiplies two unsigned 64-bit values returning the full 128-bit product
	* as two 64-bit halves.
	*
	* Only available when ARCH_BITS is 64.
	*
	* @returns The low 64 bits of u64F1 * u64F2.
	* @param u64F1 The first 64-bit factor.
	* @param u64F2 The second 64-bit factor.
	* @param pu64ProdHi Where to store the high 64 bits of the product.
	*/
128	DECLINLINE(uint64_t) ASMMult2xU64Ret2xU64(uint64_t u64F1, uint64_t u64F2, uint64_t *pu64ProdHi)
129	{
130	# if defined(RT_ARCH_AMD64) && (RT_INLINE_ASM_GNU_STYLE || RT_INLINE_ASM_USES_INTRIN)
131	# if RT_INLINE_ASM_GNU_STYLE
132	uint64_t u64Low, u64High;
	/* rax = u64F1, rdx = u64F2; mulq leaves the 128-bit product in rdx:rax. */
133	__asm__ __volatile__("mulq %%rdx"
134	: "=a" (u64Low), "=d" (u64High)
135	: "0" (u64F1), "1" (u64F2));
136	*pu64ProdHi = u64High;
137	return u64Low;
138	# elif RT_INLINE_ASM_USES_INTRIN
	/* The intrinsic returns the low half and writes the high half via the pointer. */
139	return _umul128(u64F1, u64F2, pu64ProdHi);
140	# else
141	# error "hmm"
142	# endif
143	# else /* generic: */
144	/*
145	* F1 * F2 = Prod
146	* -- --
147	* ab * cd = b*d + a*d*10 + b*c*10 + a*c*100
148	*
149	* Where a, b, c and d are 'digits', and 10 is max digit + 1.
150	*
151	* Our digits are 32-bit wide, so instead of 10 we multiply by 4G.
152	* Prod = F1.s.Lo*F2.s.Lo + F1.s.Hi*F2.s.Lo*4G
153	* + F1.s.Lo*F2.s.Hi*4G + F1.s.Hi*F2.s.Hi*4G*4G
154	*/
155	RTUINT128U Prod;
156	RTUINT64U Tmp1;
157	uint64_t u64Tmp;
158	RTUINT64U F1, F2;
159	F1.u = u64F1;
160	F2.u = u64F2;
161
	/* Lo*Lo: fills the low 64 bits (dw0 and dw1) of the product. */
162	Prod.s.Lo = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Lo);
163
	/* Hi*Lo: add at the 4G position (dw1); this also initializes the high 64 bits. */
164	Tmp1.u = ASMMult2xU32RetU64(F1.s.Hi, F2.s.Lo);
165	u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
166	Prod.DWords.dw1 = (uint32_t)u64Tmp;
167	Prod.s.Hi = Tmp1.s.Hi;
168	Prod.s.Hi += u64Tmp >> 32; /* carry */
169
	/* Lo*Hi: add at the 4G position too, rippling the carries through dw2 into dw3. */
170	Tmp1.u = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Hi);
171	u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
172	Prod.DWords.dw1 = (uint32_t)u64Tmp;
173	u64Tmp >>= 32; /* carry */
174	u64Tmp += Prod.DWords.dw2;
175	u64Tmp += Tmp1.s.Hi;
176	Prod.DWords.dw2 = (uint32_t)u64Tmp;
177	Prod.DWords.dw3 += u64Tmp >> 32; /* carry */
178
	/* Hi*Hi: add at the 4G*4G position, i.e. straight into the high 64 bits. */
179	Prod.s.Hi += ASMMult2xU32RetU64(F1.s.Hi, F2.s.Hi);
180	*pu64ProdHi = Prod.s.Hi;
181	return Prod.s.Lo;
182	# endif
183	}
184	#endif
185
186
187
188	/**
189	* Divides a 64-bit unsigned by a 32-bit unsigned returning an unsigned 32-bit result.
190	*
191	* @returns u64 / u32.
	* @param u64 The 64-bit dividend.
	* @param u32 The 32-bit divisor; must not be zero.
	*
	* @remarks The x86 inline assembly paths use the div instruction, so the
	* quotient must fit in 32 bits there; the generic path truncates the
	* quotient to 32 bits instead.
192	*/
193	#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
194	DECLASM(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32);
195	#else
196	DECLINLINE(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32)
197	{
198	# ifdef RT_ARCH_X86
199	# if RT_INLINE_ASM_GNU_STYLE
	/* edx:eax = u64 ("A"), %3 = divisor; the quotient comes back in eax, the remainder in edx is discarded. */
200	RTCCUINTREG uDummy;
201	__asm__ __volatile__("divl %3"
202	: "=a" (u32), "=d"(uDummy)
203	: "A" (u64), "r" (u32));
204	# else
	/* MSC-style inline assembly: same division, the quotient (eax) is written back into u32. */
205	__asm
206	{
207	mov eax, dword ptr [u64]
208	mov edx, dword ptr [u64 + 4]
209	mov ecx, [u32]
210	div ecx
211	mov [u32], eax
212	}
213	# endif
214	return u32;
215	# else /* generic: */
216	return (uint32_t)(u64 / u32);
217	# endif
218	}
219	#endif
220
221
222	/**
223	* Divides a 64-bit signed by a 32-bit signed returning a signed 32-bit result.
224	*
225	* @returns i64 / i32.
	* @param i64 The signed 64-bit dividend.
	* @param i32 The signed 32-bit divisor; must not be zero.
	*
	* @remarks The x86 inline assembly paths use the idiv instruction, so the
	* quotient must fit in a signed 32-bit value there; the generic path
	* truncates the quotient to 32 bits instead.
226	*/
227	#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
228	DECLASM(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32);
229	#else
230	DECLINLINE(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32)
231	{
232	# ifdef RT_ARCH_X86
233	# if RT_INLINE_ASM_GNU_STYLE
	/* edx:eax = i64 ("A"), %3 = divisor; the quotient comes back in eax, the remainder in edx is discarded. */
234	RTCCUINTREG iDummy;
235	__asm__ __volatile__("idivl %3"
236	: "=a" (i32), "=d"(iDummy)
237	: "A" (i64), "r" (i32));
238	# else
	/* MSC-style inline assembly: same division, the quotient (eax) is written back into i32. */
239	__asm
240	{
241	mov eax, dword ptr [i64]
242	mov edx, dword ptr [i64 + 4]
243	mov ecx, [i32]
244	idiv ecx
245	mov [i32], eax
246	}
247	# endif
248	return i32;
249	# else /* generic: */
250	return (int32_t)(i64 / i32);
251	# endif
252	}
253	#endif
254
255
256	/**
257	* Performs 64-bit unsigned by a 32-bit unsigned division with a 32-bit unsigned result,
258	* returning the remainder.
259	*
260	* @returns u64 % u32.
	* @param u64 The 64-bit dividend.
	* @param u32 The 32-bit divisor; must not be zero.
261	*
262	* @remarks It is important that the quotient (u64 / u32) is <= UINT32_MAX, or the
	* division on the x86 inline assembly paths will overflow and crash.
263	*/
264	#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
265	DECLASM(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32);
266	#else
267	DECLINLINE(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32)
268	{
269	# ifdef RT_ARCH_X86
270	# if RT_INLINE_ASM_GNU_STYLE
	/* edx:eax = u64 ("A"), %3 = divisor; the remainder comes back in edx, the quotient in eax is discarded. */
271	RTCCUINTREG uDummy;
272	__asm__ __volatile__("divl %3"
273	: "=a" (uDummy), "=d"(u32)
274	: "A" (u64), "r" (u32));
275	# else
	/* MSC-style inline assembly: same division, the remainder (edx) is written back into u32. */
276	__asm
277	{
278	mov eax, dword ptr [u64]
279	mov edx, dword ptr [u64 + 4]
280	mov ecx, [u32]
281	div ecx
282	mov [u32], edx
283	}
284	# endif
285	return u32;
286	# else /* generic: */
287	return (uint32_t)(u64 % u32);
288	# endif
289	}
290	#endif
291
292
293	/**
294	* Performs 64-bit signed by a 32-bit signed division with a 32-bit signed result,
295	* returning the remainder.
296	*
297	* @returns i64 % i32.
	* @param i64 The signed 64-bit dividend.
	* @param i32 The signed 32-bit divisor; must not be zero.
298	*
299	* @remarks It is important that the quotient (i64 / i32) fits in a signed 32-bit
	* value, or the division on the x86 inline assembly paths will overflow
	* and crash.
300	*/
301	#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
302	DECLASM(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32);
303	#else
304	DECLINLINE(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32)
305	{
306	# ifdef RT_ARCH_X86
307	# if RT_INLINE_ASM_GNU_STYLE
	/* edx:eax = i64 ("A"), %3 = divisor; the remainder comes back in edx, the quotient in eax is discarded. */
308	RTCCUINTREG iDummy;
309	__asm__ __volatile__("idivl %3"
310	: "=a" (iDummy), "=d"(i32)
311	: "A" (i64), "r" (i32));
312	# else
	/* MSC-style inline assembly: same division, the remainder (edx) is written back into i32. */
313	__asm
314	{
315	mov eax, dword ptr [i64]
316	mov edx, dword ptr [i64 + 4]
317	mov ecx, [i32]
318	idiv ecx
319	mov [i32], edx
320	}
321	# endif
322	return i32;
323	# else /* generic: */
324	return (int32_t)(i64 % i32);
325	# endif
326	}
327	#endif
328
329
330	/**
331	* Multiply a 32-bit by a 32-bit integer and divide the result by a 32-bit integer
332	* using a 64 bit intermediate result.
333	*
334	* @returns (u32A * u32B) / u32C.
335	* @param u32A The 32-bit value (A).
336	* @param u32B The 32-bit value to multiply by A.
337	* @param u32C The 32-bit value to divide A*B by. Must not be zero.
338	*
339	* @remarks Architecture specific.
340	* @remarks Make sure the result won't ever exceed 32-bit, because hardware
341	* exception may be raised if it does.
342	* @remarks On x86 this may be used to avoid dragging in 64-bit builtin
343	* arithmetics functions.
344	*/
345	#if RT_INLINE_ASM_EXTERNAL && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
346	DECLASM(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C);
347	#else
348	DECLINLINE(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C)
349	{
350	# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
351	uint32_t u32Result, u32Spill;
	/* eax starts out as u32A ("0"); mull puts A*B into edx:eax and divl then
	   divides that pair by u32C, leaving the quotient in eax. Both outputs are
	   early-clobber ("&") so u32B and u32C are kept out of eax and edx. */
352	__asm__ __volatile__("mull %2\n\t"
353	"divl %3\n\t"
354	: "=&a" (u32Result),
355	"=&d" (u32Spill)
356	: "r" (u32B),
357	"r" (u32C),
358	"0" (u32A));
359	return u32Result;
360	# else
	/* Generic: do the multiplication in 64 bits and truncate the quotient to 32 bits. */
361	return (uint32_t)(((uint64_t)u32A * u32B) / u32C);
362	# endif
363	}
364	#endif
365
366
367	/**
368	* Multiply a 64-bit by a 32-bit integer and divide the result by a 32-bit integer
369	* using a 96 bit intermediate result.
370	*
371	* @returns (u64A * u32B) / u32C.
372	* @param u64A The 64-bit value.
373	* @param u32B The 32-bit value to multiply by A.
374	* @param u32C The 32-bit value to divide A*B by. Must not be zero.
375	*
376	* @remarks Architecture specific.
377	* @remarks Make sure the result won't ever exceed 64-bit, because hardware
378	* exception may be raised if it does.
379	* @remarks On x86 this may be used to avoid dragging in 64-bit builtin
380	* arithmetics function.
	* @remarks The inline version is only compiled when __GNUC__ is defined, the
	* target is AMD64 or x86 and RT_INLINE_ASM_EXTERNAL is not set; in
	* every other configuration just the DECLASM prototype is declared.
381	*/
382	#if RT_INLINE_ASM_EXTERNAL || !defined(__GNUC__) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
383	DECLASM(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C);
384	#else
385	DECLINLINE(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C)
386	{
387	# if RT_INLINE_ASM_GNU_STYLE
388	# ifdef RT_ARCH_AMD64
389	uint64_t u64Result, u64Spill;
	/* rax starts out as u64A ("0"); mulq puts A*B into rdx:rax and divq then
	   divides that pair by u32C, leaving the quotient in rax. u32B and u32C
	   are widened to 64 bits so they can be used as 64-bit register operands. */
390	__asm__ __volatile__("mulq %2\n\t"
391	"divq %3\n\t"
392	: "=&a" (u64Result),
393	"=&d" (u64Spill)
394	: "r" ((uint64_t)u32B),
395	"r" ((uint64_t)u32C),
396	"0" (u64A));
397	return u64Result;
398	# else
	/* 32-bit x86: two 32x32 multiplications build the 96-bit product, then two
	   chained 64-by-32 divisions produce the high and the low result dwords.
	   Entry state: eax = u64A.lo, esi = u64A.hi, ecx = u32B, edi = u32C. */
399	uint32_t u32Dummy;
400	uint64_t u64Result;
401	__asm__ __volatile__("mull %%ecx \n\t" /* eax = u64Lo.lo = (u64A.lo * u32B).lo
402	edx = u64Lo.hi = (u64A.lo * u32B).hi */
403	"xchg %%eax,%%esi \n\t" /* esi = u64Lo.lo
404	eax = u64A.hi */
405	"xchg %%edx,%%edi \n\t" /* edi = u64Lo.hi
406	edx = u32C */
407	"xchg %%edx,%%ecx \n\t" /* ecx = u32C
408	edx = u32B */
409	"mull %%edx \n\t" /* eax = u64Hi.lo = (u64A.hi * u32B).lo
410	edx = u64Hi.hi = (u64A.hi * u32B).hi */
411	"addl %%edi,%%eax \n\t" /* u64Hi.lo += u64Lo.hi */
412	"adcl $0,%%edx \n\t" /* u64Hi.hi += carry */
413	"divl %%ecx \n\t" /* eax = u64Hi / u32C
414	edx = u64Hi % u32C */
415	"movl %%eax,%%edi \n\t" /* edi = u64Result.hi = u64Hi / u32C */
416	"movl %%esi,%%eax \n\t" /* eax = u64Lo.lo */
417	"divl %%ecx \n\t" /* u64Result.lo */
418	"movl %%edi,%%edx \n\t" /* u64Result.hi */
419	: "=A"(u64Result), "=c"(u32Dummy),
420	"=S"(u32Dummy), "=D"(u32Dummy)
421	: "a"((uint32_t)u64A),
422	"S"((uint32_t)(u64A >> 32)),
423	"c"(u32B),
424	"D"(u32C));
425	return u64Result;
426	# endif
427	# else
	/* Plain C version of the same scheme as the 32-bit assembly above. */
428	RTUINT64U u;
	/* Partial products of the low and the high dword of u64A with u32B. */
429	uint64_t u64Lo = (uint64_t)(u64A & 0xffffffff) * u32B;
430	uint64_t u64Hi = (uint64_t)(u64A >> 32) * u32B;
	/* Fold the upper half of the low partial product into the high one. */
431	u64Hi += (u64Lo >> 32);
	/* High result dword first; its remainder is carried into the low division. */
432	u.s.Hi = (uint32_t)(u64Hi / u32C);
433	u.s.Lo = (uint32_t)((((u64Hi % u32C) << 32) + (u64Lo & 0xffffffff)) / u32C);
434	return u.u;
435	# endif
436	}
437	#endif
438
439	/** @} */
440	#endif /* !IPRT_INCLUDED_asm_math_h */
441

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/include/iprt/asm-math.h@ 81025

Download in other formats: