VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/math/sin.asm@ 96240

Last change on this file since 96240 was 96240, checked in by vboxsync, 2 years ago

IPRT/nocrt: Reworking the sin and cos code to take into account which ranges FCOS and FSIN instructions are expected to deliver reasonable results and handle extreme values better. bugref:10261

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1; $Id: sin.asm 96240 2022-08-17 00:59:31Z vboxsync $
2;; @file
3; IPRT - No-CRT sin - AMD64 & X86.
4;
5
6;
7; Copyright (C) 2006-2022 Oracle Corporation
8;
9; This file is part of VirtualBox Open Source Edition (OSE), as
10; available from http://www.virtualbox.org. This file is free software;
11; you can redistribute it and/or modify it under the terms of the GNU
12; General Public License (GPL) as published by the Free Software
13; Foundation, in version 2 as it comes in the "COPYING" file of the
14; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16;
17; The contents of this file may alternatively be used under the terms
18; of the Common Development and Distribution License Version 1.0
19; (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20; VirtualBox OSE distribution, in which case the provisions of the
21; CDDL are applicable instead of those of the GPL.
22;
23; You may elect to license modified versions of this file under the
24; terms and conditions of either the GPL or the CDDL or both.
25;
26
27
28%define RT_ASM_WITH_SEH64
29%include "iprt/asmdefs.mac"
30%include "iprt/x86.mac"
31
32
33BEGINCODE
34
35;;
36; Internal sine and cosine worker that calculates the sine of st0 returning
37; it in st0.
38;
39; When called by a sine function, fabs(st0) >= pi/2.
40; When called by a cosine function, fabs(original input value) >= 3pi/8.
41;
42; That the input isn't a tiny number close to zero, means that we can do a bit
43; cruder rounding when operating close to a pi/2 boundrary. The value in the
44; ecx register indicates the input precision and controls the crudeness of the
45; rounding.
46;
47; @returns st0 = sine
48; @param st0 A finite number to calucate sine of.
49; @param ecx Set to 0 if original input was a 32-bit float.
50; Set to 1 if original input was a 64-bit double.
51; set to 2 if original input was a 80-bit long double.
52;
53BEGINPROC rtNoCrtMathSinCore
54 push xBP
55 SEH64_PUSH_xBP
56 mov xBP, xSP
57 SEH64_SET_FRAME_xBP 0
58 SEH64_END_PROLOGUE
59
60 ;
61 ; Load the pointer to the rounding crudeness factor into xDX.
62 ;
63 lea xDX, [.s_ar64NearZero xWrtRIP]
64 lea xDX, [xDX + xCX * xCB]
65
66 ;
67 ; Finite number. We want it in the range [0,2pi] and will preform
68 ; a remainder division if it isn't.
69 ;
70 fcom qword [.s_r64Max xWrtRIP] ; compares st0 and 2*pi
71 fnstsw ax
72 test ax, X86_FSW_C3 | X86_FSW_C0 | X86_FSW_C2 ; C3 := st0 == mem; C0 := st0 < mem; C2 := unordered (should be the case);
73 jz .reduce_st0 ; Jump if st0 > mem
74
75 fcom qword [.s_r64Min xWrtRIP] ; compares st0 and 0.0
76 fnstsw ax
77 test ax, X86_FSW_C3 | X86_FSW_C0
78 jnz .reduce_st0 ; Jump if st0 <= mem
79
80 ;
81 ; We get here if st0 is in the [0,2pi] range.
82 ;
83 ; Now, FSIN is documented to be reasonably accurate for the range
84 ; -3pi/4 to +3pi/4, so we have to make some more effort to calculate
85 ; in that range only.
86 ;
87.in_range:
88 ; if (st0 < pi)
89 fldpi
90 fcom st1 ; compares st0 (pi) with st1 (the normalized value)
91 fnstsw ax
92 test ax, X86_FSW_C0 ; st1 > pi
93 jnz .larger_than_pi
94 test ax, X86_FSW_C3
95 jnz .equals_pi
96
97 ;
98 ; input in the range [0,pi[
99 ;
100.smaller_than_pi:
101 fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
102
103 ; if (st0 < pi/2)
104 fcom st1 ; compares st0 (pi/2) with st1
105 fnstsw ax
106 test ax, X86_FSW_C0 ; st1 > pi
107 jnz .between_half_pi_and_pi
108 test ax, X86_FSW_C3
109 jnz .equals_half_pi
110
111 ;
112 ; The value is between zero and half pi, including the zero value.
113 ;
114 ; This is in range where FSIN works reasonably reliably. So drop the
115 ; half pi in st0 and do the calculation.
116 ;
117.between_zero_and_half_pi:
118 ; Check if we're so close to pi/2 that it makes no difference.
119 fsub st0, st1 ; st0 = pi/2 - st1
120 fcom qword [xDX]
121 fnstsw ax
122 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
123 jnz .equals_half_pi
124 ffreep st0
125
126 ; Check if we're so close to zero that it makes no difference given the
127 ; internal accuracy of the FPU.
128 fcom qword [xDX]
129 fnstsw ax
130 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
131 jnz .equals_zero_popped_one
132
133 ; Ok, calculate sine.
134 fsin
135 jmp .return
136
137 ;
138 ; The value is in the range ]pi/2,pi[
139 ;
140 ; This is outside the comfortable FSIN range, but if we subtract PI and
141 ; move to the ]-pi/2,0[ range we just have to change the sign to get
142 ; the value we want.
143 ;
144.between_half_pi_and_pi:
145 ; Check if we're so close to pi/2 that it makes no difference.
146 fsubr st0, st1 ; st0 = st1 - st0
147 fcom qword [xDX]
148 fnstsw ax
149 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
150 jnz .equals_half_pi
151 ffreep st0
152
153 ; Check if we're so close to pi that it makes no difference.
154 fldpi
155 fsub st0, st1 ; st0 = st0 - st1
156 fcom qword [xDX]
157 fnstsw ax
158 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
159 jnz .equals_pi
160 ffreep st0
161
162 ; Ok, transform the value and calculate sine.
163 fldpi
164 fsubp st1, st0
165
166 fsin
167 fchs
168 jmp .return
169
170 ;
171 ; input in the range ]pi,2pi[
172 ;
173.larger_than_pi:
174 fsub st1, st0 ; st1 -= pi
175 fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
176
177 ; if (st0 < pi/2)
178 fcom st1 ; compares st0 (pi/2) with reduced st1
179 fnstsw ax
180 test ax, X86_FSW_C0 ; st1 > pi
181 jnz .between_3_half_pi_and_2pi
182 test ax, X86_FSW_C3
183 jnz .equals_3_half_pi
184
185 ;
186 ; The value is in the the range: ]pi,3pi/2[
187 ;
188 ; The actual st0 is in the range ]pi,pi/2[ where FSIN is performing okay
189 ; and we can get the desired result by changing the sign (-FSIN).
190 ;
191.between_pi_and_3_half_pi:
192 ; Check if we're so close to pi/2 that it makes no difference.
193 fsub st0, st1 ; st0 = pi/2 - st1
194 fcom qword [xDX]
195 fnstsw ax
196 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
197 jnz .equals_3_half_pi
198 ffreep st0
199
200 ; Check if we're so close to zero that it makes no difference given the
201 ; internal accuracy of the FPU.
202 fcom qword [xDX]
203 fnstsw ax
204 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
205 jnz .equals_pi_popped
206
207 ; Ok, calculate sine and flip the sign.
208 fsin
209 fchs
210 jmp .return
211
212 ;
213 ; The value is in the last pi/2 of the range: ]3pi/2,2pi[
214 ;
215 ; Since FSIN should work reasonably well for ]-pi/2,pi], we can just
216 ; subtract pi again (we subtracted pi at .larger_than_pi above) and
217 ; run FSIN on it. (st1 is currently in the range ]pi/2,pi[.)
218 ;
219.between_3_half_pi_and_2pi:
220 ; Check if we're so close to pi/2 that it makes no difference.
221 fsubr st0, st1 ; st0 = st1 - st0
222 fcom qword [xDX]
223 fnstsw ax
224 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
225 jnz .equals_3_half_pi
226 ffreep st0
227
228 ; Check if we're so close to pi that it makes no difference.
229 fldpi
230 fsub st0, st1 ; st0 = st0 - st1
231 fcom qword [xDX]
232 fnstsw ax
233 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
234 jnz .equals_2pi
235 ffreep st0
236
237 ; Ok, adjust input and calculate sine.
238 fldpi
239 fsubp st1, st0
240 fsin
241 jmp .return
242
243 ;
244 ; sin(0) = 0
245 ; sin(pi) = 0
246 ;
247.equals_zero:
248.equals_pi:
249.equals_2pi:
250 ffreep st0
251.equals_zero_popped_one:
252.equals_pi_popped:
253 ffreep st0
254 fldz
255 jmp .return
256
257 ;
258 ; sin(pi/2) = 1
259 ;
260.equals_half_pi:
261 ffreep st0
262 ffreep st0
263 fld1
264 jmp .return
265
266 ;
267 ; sin(3*pi/2) = -1
268 ;
269.equals_3_half_pi:
270 ffreep st0
271 ffreep st0
272 fld1
273 fchs
274 jmp .return
275
276 ;
277 ; Return.
278 ;
279.return:
280 leave
281 ret
282
283 ;
284 ; Reduce st0 by reminder division by PI*2. The result should be positive here.
285 ;
286 ;; @todo this is one of our weak spots (really any calculation involving PI is).
287.reduce_st0:
288 fldpi
289 fadd st0, st0
290 fxch st1 ; st0=input (dividend) st1=2pi (divisor)
291.again:
292 fprem1
293 fnstsw ax
294 test ah, (X86_FSW_C2 >> 8) ; C2 is set if partial result.
295 jnz .again ; Loop till C2 == 0 and we have a final result.
296
297 ;
298 ; Make sure the result is positive.
299 ;
300 fxam
301 fnstsw ax
302 test ax, X86_FSW_C1 ; The sign bit
303 jz .reduced_to_positive
304
305 fadd st0, st1 ; st0 += 2pi, which should make it positive
306
307%ifdef RT_STRICT
308 fxam
309 fnstsw ax
310 test ax, X86_FSW_C1
311 jz .reduced_to_positive
312 int3
313%endif
314
315.reduced_to_positive:
316 fstp st1 ; Get rid of the 2pi value.
317 jmp .in_range
318
319ALIGNCODE(8)
320.s_r64Max:
321 dq +6.28318530717958647692 ; 2*pi
322.s_r64Min:
323 dq 0.0
324.s_r64Two:
325 dq 2.0
326 ;;
327 ; Close to 2/pi rounding limits for 32-bit, 64-bit and 80-bit floating point operations.
328 ; Given that the original input is at least +/-3pi/8 (1.178) and that precision of the
329 ; PI constant used during reduction/whatever, I think we can round to a whole pi/2
330 ; step when we get close enough.
331 ;
332 ; Look to RTFLOAT64U for the format details, but 52 is the shift for the exponent field
333 ; and 1023 is the exponent bias. Since the format uses an implied 1 in the mantissa,
334 ; we only have to set the exponent to get a valid number.
335 ;
336.s_ar64NearZero:
337 dq (-18 + 1023) << 52 ; float / 32-bit / single precision input
338 dq (-40 + 1023) << 52 ; double / 64-bit / double precision input
339 dq (-52 + 1023) << 52 ; long double / 80-bit / extended precision input
340ENDPROC rtNoCrtMathSinCore
341
342
343;;
344; Compute the sine of rd, measured in radians.
345;
346; @returns st(0) / xmm0
347; @param rd [rbp + xCB*2] / xmm0
348;
349RT_NOCRT_BEGINPROC sin
350 push xBP
351 SEH64_PUSH_xBP
352 mov xBP, xSP
353 SEH64_SET_FRAME_xBP 0
354 sub xSP, 20h
355 SEH64_ALLOCATE_STACK 20h
356 SEH64_END_PROLOGUE
357
358%ifdef RT_OS_WINDOWS
359 ;
360 ; Make sure we use full precision and not the windows default of 53 bits.
361 ;
362 fnstcw [xBP - 20h]
363 mov ax, [xBP - 20h]
364 or ax, X86_FCW_PC_64 ; includes both bits, so no need to clear the mask.
365 mov [xBP - 1ch], ax
366 fldcw [xBP - 1ch]
367%endif
368
369 ;
370 ; Load the input into st0.
371 ;
372%ifdef RT_ARCH_AMD64
373 movsd [xBP - 10h], xmm0
374 fld qword [xBP - 10h]
375%else
376 fld qword [xBP + xCB*2]
377%endif
378
379 ;
380 ; We examin the input and weed out non-finit numbers first.
381 ;
382 fxam
383 fnstsw ax
384 and ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0
385 cmp ax, X86_FSW_C2 ; Normal finite number (excluding zero)
386 je .finite
387 cmp ax, X86_FSW_C3 ; Zero
388 je .zero
389 cmp ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero.
390 je .zero
391 cmp ax, X86_FSW_C0 ; NaN - must handle it special,
392 je .nan
393
394 ; Pass infinities and unsupported inputs to fsin, assuming it does the right thing.
395.do_sin:
396 fsin
397 jmp .return_val
398
399 ;
400 ; Finite number.
401 ;
402.finite:
403 ; For very tiny numbers, 0 < abs(input) < 2**-25, we can return the
404 ; input value directly.
405 fld st0 ; duplicate st0
406 fabs ; make it an absolute (positive) value.
407 fld qword [.s_r64Tiny xWrtRIP]
408 fcomip st1 ; compare s_r64Tiny and fabs(input)
409 ja .return_tiny_number_as_is ; jump if fabs(input) is smaller
410
411 ; FSIN is documented to be reasonable for the range ]-3pi/4,3pi/4[, so
412 ; while we have fabs(input) loaded already, check for that here and
413 ; allow rtNoCrtMathSinCore to assume it won't see values very close to
414 ; zero, except by cos -> sin conversion where they won't be relevant to
415 ; any assumpttions about precision approximation.
416 fld qword [.s_r64FSinOkay xWrtRIP]
417 fcomip st1
418 ffreep st0 ; drop the fabs(input) value
419 ja .do_sin
420
421 ;
422 ; Call common sine/cos worker.
423 ;
424 mov ecx, 1 ; double
425 extern NAME(rtNoCrtMathSinCore)
426 call NAME(rtNoCrtMathSinCore)
427
428 ;
429 ; Run st0.
430 ;
431.return_val:
432%ifdef RT_ARCH_AMD64
433 fstp qword [xBP - 10h]
434 movsd xmm0, [xBP - 10h]
435%endif
436%ifdef RT_OS_WINDOWS
437 fldcw [xBP - 20h] ; restore original
438%endif
439.return:
440 leave
441 ret
442
443 ;
444 ; As explained already, we can return tiny numbers directly too as the
445 ; output from sin(input) = input given our precision.
446 ; We can skip the st0 -> xmm0 translation here, so follow the same path
447 ; as .zero & .nan, after we've removed the fabs(input) value.
448 ;
449.return_tiny_number_as_is:
450 ffreep st0
451
452 ;
453 ; sin(+/-0.0) = +/-0.0 (preserve the sign)
454 ; We can skip the st0 -> xmm0 translation here, so follow the .nan code path.
455 ;
456.zero:
457
458 ;
459 ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN
460 ; to QNaN when masked).
461 ;
462.nan:
463%ifdef RT_ARCH_AMD64
464 ffreep st0
465%endif
466 jmp .return
467
468ALIGNCODE(8)
469 ; Ca. 2**-26, absolute value. Inputs closer to zero than this can be
470 ; returns directly as the sin(input) value should be basically the same
471 ; given the precision we're working with and FSIN probably won't even
472 ; manage that.
473 ;; @todo experiment when FSIN gets better than this.
474.s_r64Tiny:
475 dq 1.49011612e-8
476 ; The absolute limit of FSIN "good" range.
477.s_r64FSinOkay:
478 dq 2.356194490192344928845 ; 3pi/4
479 ;dq 1.57079632679489661923 ; pi/2 - alternative.
480
481ENDPROC RT_NOCRT(sin)
482
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette