source: vbox/trunk/include/iprt/asm-math.h @ 95897
/** @file
 * IPRT - Assembly Routines for Optimizing some Integer Math Operations.
 */

/*
 * Copyright (C) 2006-2022 Oracle Corporation
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License (GPL) as published by the Free Software
 * Foundation, in version 2 as it comes in the "COPYING" file of the
 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
 * VirtualBox OSE distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 */

#ifndef IPRT_INCLUDED_asm_math_h
#define IPRT_INCLUDED_asm_math_h
#ifndef RT_WITHOUT_PRAGMA_ONCE
# pragma once
#endif

#include <iprt/types.h>

#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
/* Emit the intrinsics at all optimization levels. */
# include <iprt/sanitized/intrin.h>
# pragma intrinsic(__emul)
# pragma intrinsic(__emulu)
# ifdef RT_ARCH_AMD64
#  pragma intrinsic(_mul128)
#  pragma intrinsic(_umul128)
# endif
#endif

/** @defgroup grp_rt_asm_math Integer Math Optimizations
 * @ingroup grp_rt_asm
 * @{ */

/**
 * Multiplies two unsigned 32-bit values returning an unsigned 64-bit result.
 *
 * @returns u32F1 * u32F2.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2);
#else
DECLINLINE(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2)
{
# ifdef RT_ARCH_X86
    uint64_t u64;
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("mull %%edx"
                         : "=A" (u64)
                         : "a" (u32F2), "d" (u32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    u64 = __emulu(u32F1, u32F2);
#  else
    __asm
    {
        mov     edx, [u32F1]
        mov     eax, [u32F2]
        mul     edx
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif
    return u64;
# else /* generic: */
    return (uint64_t)u32F1 * u32F2;
# endif
}
#endif

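/* Usage sketch (illustrative addition, not part of the original header; the
 * variable names are hypothetical).  A plain 32-bit multiplication would
 * silently truncate here, while the 64-bit product does not:
 *
 *     uint32_t cSectors = UINT32_C(0x10000000);
 *     uint32_t cbSector = 512;
 *     uint64_t cbDisk   = ASMMult2xU32RetU64(cSectors, cbSector);
 *     // cbDisk == UINT64_C(0x2000000000) (128 GiB);
 *     // (uint32_t)(cSectors * cbSector) would wrap around to 0.
 */
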
/**
 * Multiplies two signed 32-bit values returning a signed 64-bit result.
 *
 * @returns i32F1 * i32F2.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2);
#else
DECLINLINE(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2)
{
# ifdef RT_ARCH_X86
    int64_t i64;
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("imull %%edx"
                         : "=A" (i64)
                         : "a" (i32F2), "d" (i32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    i64 = __emul(i32F1, i32F2);
#  else
    __asm
    {
        mov     edx, [i32F1]
        mov     eax, [i32F2]
        imul    edx
        mov     dword ptr [i64], eax
        mov     dword ptr [i64 + 4], edx
    }
#  endif
    return i64;
# else /* generic: */
    return (int64_t)i32F1 * i32F2;
# endif
}
#endif

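/* Usage sketch (illustrative addition); the factors are chosen so the product
 * exceeds the int32_t range:
 *
 *     int64_t i64Prod = ASMMult2xS32RetS64(-46341, 46341);
 *     // i64Prod == INT64_C(-2147488281), which no int32_t can hold.
 */
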
/**
 * Multiplies two unsigned 64-bit values, returning the low 64 bits of the
 * 128-bit product directly and the high 64 bits via @a pu64ProdHi.
 *
 * @returns The low 64 bits of the product.
 * @param   u64F1       The first factor.
 * @param   u64F2       The second factor.
 * @param   pu64ProdHi  Where to return the high 64 bits of the product.
 */
DECLINLINE(uint64_t) ASMMult2xU64Ret2xU64(uint64_t u64F1, uint64_t u64F2, uint64_t *pu64ProdHi)
{
#if defined(RT_ARCH_AMD64) && (RT_INLINE_ASM_GNU_STYLE || RT_INLINE_ASM_USES_INTRIN)
# if RT_INLINE_ASM_GNU_STYLE
    uint64_t u64Low, u64High;
    __asm__ __volatile__("mulq %%rdx"
                         : "=a" (u64Low), "=d" (u64High)
                         : "0" (u64F1), "1" (u64F2));
    *pu64ProdHi = u64High;
    return u64Low;
# elif RT_INLINE_ASM_USES_INTRIN
    return _umul128(u64F1, u64F2, pu64ProdHi);
# else
#  error "hmm"
# endif
#else /* generic: */
    /*
     * F1 * F2 = Prod
     * -- --
     * ab * cd = b*d + a*d*10 + b*c*10 + a*c*100
     *
     * Where a, b, c and d are 'digits', and 10 is max digit + 1.
     *
     * Our digits are 32-bit wide, so instead of 10 we multiply by 4G.
     * Prod = F1.s.Lo*F2.s.Lo + F1.s.Hi*F2.s.Lo*4G
     *      + F1.s.Lo*F2.s.Hi*4G + F1.s.Hi*F2.s.Hi*4G*4G
     */
    RTUINT128U Prod;
    RTUINT64U  Tmp1;
    uint64_t   u64Tmp;
    RTUINT64U  F1, F2;
    F1.u = u64F1;
    F2.u = u64F2;

    Prod.s.Lo = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Lo);

    Tmp1.u = ASMMult2xU32RetU64(F1.s.Hi, F2.s.Lo);
    u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    Prod.s.Hi = Tmp1.s.Hi;
    Prod.s.Hi += u64Tmp >> 32; /* carry */

    Tmp1.u = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Hi);
    u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    u64Tmp >>= 32; /* carry */
    u64Tmp += Prod.DWords.dw2;
    u64Tmp += Tmp1.s.Hi;
    Prod.DWords.dw2 = (uint32_t)u64Tmp;
    Prod.DWords.dw3 += u64Tmp >> 32; /* carry */

    Prod.s.Hi += ASMMult2xU32RetU64(F1.s.Hi, F2.s.Hi);
    *pu64ProdHi = Prod.s.Hi;
    return Prod.s.Lo;
#endif
}

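/* Usage sketch (illustrative addition).  The generic path above is schoolbook
 * long multiplication; in decimal the same scheme gives
 * 23 * 45 = 3*5 + 2*5*10 + 3*4*10 + 2*4*100 = 15 + 100 + 120 + 800 = 1035.
 * With 64-bit factors the full product needs up to 128 bits:
 *
 *     uint64_t u64Hi;
 *     uint64_t u64Lo = ASMMult2xU64Ret2xU64(UINT64_MAX, UINT64_MAX, &u64Hi);
 *     // u64Hi == UINT64_C(0xfffffffffffffffe), u64Lo == 1,
 *     // since (2^64 - 1)^2 = 2^128 - 2^65 + 1.
 */
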
/**
 * Divides a 64-bit unsigned value by a 32-bit unsigned value, returning an
 * unsigned 32-bit result.
 *
 * @returns u64 / u32.
 *
 * @remarks It is important that the quotient fits in 32 bits (u64 / u32 <=
 *          UINT32_MAX), or the x86 division instruction will raise a
 *          hardware exception.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (u32), "=d" (uDummy)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], eax
    }
#  endif
    return u32;
# else /* generic: */
    return (uint32_t)(u64 / u32);
# endif
}
#endif

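/* Usage sketch (illustrative addition): scaling a picosecond count down to
 * microseconds.  The quotient fits in 32 bits here, as required:
 *
 *     uint32_t cMicroSecs = ASMDivU64ByU32RetU32(UINT64_C(1000000000000), UINT32_C(1000000));
 *     // cMicroSecs == 1000000 (one second's worth of microseconds).
 */
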
/**
 * Divides a 64-bit signed value by a 32-bit signed value, returning a signed
 * 32-bit result.
 *
 * @returns i64 / i32.
 *
 * @remarks It is important that the quotient fits in 32 bits (INT32_MIN <=
 *          i64 / i32 <= INT32_MAX), or the x86 division instruction will
 *          raise a hardware exception.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (i32), "=d" (iDummy)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], eax
    }
#  endif
    return i32;
# else /* generic: */
    return (int32_t)(i64 / i32);
# endif
}
#endif

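/* Usage sketch (illustrative addition); the same quotient restriction applies,
 * now against the int32_t range:
 *
 *     int32_t cMicroSecs = ASMDivS64ByS32RetS32(INT64_C(-1000000000000), 1000000);
 *     // cMicroSecs == -1000000.
 */
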
/**
 * Performs a 64-bit unsigned by 32-bit unsigned division with a 32-bit
 * unsigned result, returning the remainder.
 *
 * @returns u64 % u32.
 *
 * @remarks It is important that the quotient (u64 / u32) fits in 32 bits, or
 *          the division will overflow and crash (hardware exception).
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (uDummy), "=d" (u32)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], edx
    }
#  endif
    return u32;
# else /* generic: */
    return (uint32_t)(u64 % u32);
# endif
}
#endif

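/* Usage sketch (illustrative addition): splitting a nanosecond count into
 * whole seconds and a sub-second remainder.  The quotient (3 here) must fit
 * in 32 bits, which holds for roughly 136 years worth of nanoseconds:
 *
 *     uint32_t cSubSecNanos = ASMModU64ByU32RetU32(UINT64_C(3141592653), UINT32_C(1000000000));
 *     // cSubSecNanos == 141592653; the corresponding quotient would be 3.
 */
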
/**
 * Performs a 64-bit signed by 32-bit signed division with a 32-bit signed
 * result, returning the remainder.
 *
 * @returns i64 % i32.
 *
 * @remarks It is important that the quotient (i64 / i32) fits in 32 bits, or
 *          the division will overflow and crash (hardware exception).
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (iDummy), "=d" (i32)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], edx
    }
#  endif
    return i32;
# else /* generic: */
    return (int32_t)(i64 % i32);
# endif
}
#endif

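/* Usage sketch (illustrative addition).  Like C99, the x86 IDIV instruction
 * truncates towards zero, so the remainder takes the sign of the dividend:
 *
 *     int32_t i32Rem = ASMModS64ByS32RetS32(INT64_C(-7), 3);
 *     // i32Rem == -1, since -7 / 3 truncates to -2.
 */
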
/**
 * Multiplies a 32-bit integer by a 32-bit integer and divides the result by a
 * 32-bit integer, using a 64-bit intermediate result.
 *
 * @returns (u32A * u32B) / u32C.
 * @param   u32A    The 32-bit value (A).
 * @param   u32B    The 32-bit value to multiply A by.
 * @param   u32C    The 32-bit value to divide A*B by.
 *
 * @remarks Architecture specific.
 * @remarks Make sure the result won't ever exceed 32 bits, because a hardware
 *          exception may be raised if it does.
 * @remarks On x86 this may be used to avoid dragging in 64-bit builtin
 *          arithmetic functions.
 */
#if RT_INLINE_ASM_EXTERNAL && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
DECLASM(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C);
#else
DECLINLINE(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C)
{
# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
    uint32_t u32Result, u32Spill;
    __asm__ __volatile__("mull %2\n\t"
                         "divl %3\n\t"
                         : "=&a" (u32Result),
                           "=&d" (u32Spill)
                         : "r" (u32B),
                           "r" (u32C),
                           "0" (u32A));
    return u32Result;
# else
    return (uint32_t)(((uint64_t)u32A * u32B) / u32C);
# endif
}
#endif

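/* Usage sketch (illustrative addition; the names and frequencies are made up):
 * rescaling a tick count from one clock frequency to another.  The 64-bit
 * intermediate keeps cTicks * uDstHz from truncating, but the final quotient
 * must still fit in 32 bits:
 *
 *     uint32_t cTicks  = 45678;
 *     uint32_t uDstHz  = UINT32_C(1000000);
 *     uint32_t uSrcHz  = UINT32_C(3579545);
 *     uint32_t cScaled = ASMMultU32ByU32DivByU32(cTicks, uDstHz, uSrcHz);
 *     // cScaled == 12760; the intermediate 45678000000 needs 64 bits.
 */
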
/**
 * Multiplies a 64-bit integer by a 32-bit integer and divides the result by a
 * 32-bit integer, using a 96-bit intermediate result.
 *
 * @returns (u64A * u32B) / u32C.
 * @param   u64A    The 64-bit value.
 * @param   u32B    The 32-bit value to multiply A by.
 * @param   u32C    The 32-bit value to divide A*B by.
 *
 * @remarks Architecture specific.
 * @remarks Make sure the result won't ever exceed 64 bits, because a hardware
 *          exception may be raised if it does.
 * @remarks On x86 this may be used to avoid dragging in 64-bit builtin
 *          arithmetic functions.
 */
#if RT_INLINE_ASM_EXTERNAL || !defined(__GNUC__) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
DECLASM(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C);
#else
DECLINLINE(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C)
{
# if RT_INLINE_ASM_GNU_STYLE
#  ifdef RT_ARCH_AMD64
    uint64_t u64Result, u64Spill;
    __asm__ __volatile__("mulq %2\n\t"
                         "divq %3\n\t"
                         : "=&a" (u64Result),
                           "=&d" (u64Spill)
                         : "r" ((uint64_t)u32B),
                           "r" ((uint64_t)u32C),
                           "0" (u64A));
    return u64Result;
#  else
    uint32_t u32Dummy;
    uint64_t u64Result;
    __asm__ __volatile__("mull %%ecx       \n\t" /* eax = u64Lo.lo = (u64A.lo * u32B).lo
                                                    edx = u64Lo.hi = (u64A.lo * u32B).hi */
                         "xchg %%eax,%%esi \n\t" /* esi = u64Lo.lo
                                                    eax = u64A.hi */
                         "xchg %%edx,%%edi \n\t" /* edi = u64Lo.hi
                                                    edx = u32C */
                         "xchg %%edx,%%ecx \n\t" /* ecx = u32C
                                                    edx = u32B */
                         "mull %%edx       \n\t" /* eax = u64Hi.lo = (u64A.hi * u32B).lo
                                                    edx = u64Hi.hi = (u64A.hi * u32B).hi */
                         "addl %%edi,%%eax \n\t" /* u64Hi.lo += u64Lo.hi */
                         "adcl $0,%%edx    \n\t" /* u64Hi.hi += carry */
                         "divl %%ecx       \n\t" /* eax = u64Hi / u32C
                                                    edx = u64Hi % u32C */
                         "movl %%eax,%%edi \n\t" /* edi = u64Result.hi = u64Hi / u32C */
                         "movl %%esi,%%eax \n\t" /* eax = u64Lo.lo */
                         "divl %%ecx       \n\t" /* u64Result.lo */
                         "movl %%edi,%%edx \n\t" /* u64Result.hi */
                         : "=A" (u64Result), "=c" (u32Dummy),
                           "=S" (u32Dummy), "=D" (u32Dummy)
                         : "a" ((uint32_t)u64A),
                           "S" ((uint32_t)(u64A >> 32)),
                           "c" (u32B),
                           "D" (u32C));
    return u64Result;
#  endif
# else
    RTUINT64U u;
    uint64_t  u64Lo = (uint64_t)(u64A & 0xffffffff) * u32B;
    uint64_t  u64Hi = (uint64_t)(u64A >> 32) * u32B;
    u64Hi += (u64Lo >> 32);
    u.s.Hi = (uint32_t)(u64Hi / u32C);
    u.s.Lo = (uint32_t)((((u64Hi % u32C) << 32) + (u64Lo & 0xffffffff)) / u32C);
    return u.u;
# endif
}
#endif

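/* Usage sketch (illustrative addition; uTscDelta and uCpuHz are hypothetical
 * caller variables): converting a CPU timestamp delta to nanoseconds, the kind
 * of scaling this helper is meant for.  The caller must guarantee the result
 * fits in 64 bits:
 *
 *     uint64_t cNanoSecs = ASMMultU64ByU32DivByU32(uTscDelta, UINT32_C(1000000000), uCpuHz);
 */
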
/** @} */
#endif /* !IPRT_INCLUDED_asm_math_h */