sin.asm@ 96240

Last change on this file since 96240 was 96240, checked in by vboxsync, 2 years ago
IPRT/nocrt: Reworking the sin and cos code to take into account which ranges FCOS and FSIN instructions are expected to deliver reasonable results and handle extreme values better. bugref:10261
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 15.1 KB

Line
1	; $Id: sin.asm 96240 2022-08-17 00:59:31Z vboxsync $
2	;; @file
3	; IPRT - No-CRT sin - AMD64 & X86.
4	;
5
6	;
7	; Copyright (C) 2006-2022 Oracle Corporation
8	;
9	; This file is part of VirtualBox Open Source Edition (OSE), as
10	; available from http://www.virtualbox.org. This file is free software;
11	; you can redistribute it and/or modify it under the terms of the GNU
12	; General Public License (GPL) as published by the Free Software
13	; Foundation, in version 2 as it comes in the "COPYING" file of the
14	; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	;
17	; The contents of this file may alternatively be used under the terms
18	; of the Common Development and Distribution License Version 1.0
19	; (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	; VirtualBox OSE distribution, in which case the provisions of the
21	; CDDL are applicable instead of those of the GPL.
22	;
23	; You may elect to license modified versions of this file under the
24	; terms and conditions of either the GPL or the CDDL or both.
25	;
26
27
28	%define RT_ASM_WITH_SEH64
29	%include "iprt/asmdefs.mac"
30	%include "iprt/x86.mac"
31
32
33	BEGINCODE
34
35	;;
36	; Internal sine and cosine worker that calculates the sine of st0 returning
37	; it in st0.
38	;
39	; When called by a sine function, fabs(st0) >= pi/2.
40	; When called by a cosine function, fabs(original input value) >= 3pi/8.
41	;
42	; That the input isn't a tiny number close to zero, means that we can do a bit
43	; cruder rounding when operating close to a pi/2 boundrary. The value in the
44	; ecx register indicates the input precision and controls the crudeness of the
45	; rounding.
46	;
47	; @returns st0 = sine
48	; @param st0 A finite number to calucate sine of.
49	; @param ecx Set to 0 if original input was a 32-bit float.
50	; Set to 1 if original input was a 64-bit double.
51	; set to 2 if original input was a 80-bit long double.
52	;
53	BEGINPROC rtNoCrtMathSinCore
54	push xBP
55	SEH64_PUSH_xBP
56	mov xBP, xSP
57	SEH64_SET_FRAME_xBP 0
58	SEH64_END_PROLOGUE
59
60	;
61	; Load the pointer to the rounding crudeness factor into xDX.
62	;
63	lea xDX, [.s_ar64NearZero xWrtRIP]
64	lea xDX, [xDX + xCX * xCB]
65
66	;
67	; Finite number. We want it in the range [0,2pi] and will preform
68	; a remainder division if it isn't.
69	;
70	fcom qword [.s_r64Max xWrtRIP] ; compares st0 and 2*pi
71	fnstsw ax
72	test ax, X86_FSW_C3 \| X86_FSW_C0 \| X86_FSW_C2 ; C3 := st0 == mem; C0 := st0 < mem; C2 := unordered (should be the case);
73	jz .reduce_st0 ; Jump if st0 > mem
74
75	fcom qword [.s_r64Min xWrtRIP] ; compares st0 and 0.0
76	fnstsw ax
77	test ax, X86_FSW_C3 \| X86_FSW_C0
78	jnz .reduce_st0 ; Jump if st0 <= mem
79
80	;
81	; We get here if st0 is in the [0,2pi] range.
82	;
83	; Now, FSIN is documented to be reasonably accurate for the range
84	; -3pi/4 to +3pi/4, so we have to make some more effort to calculate
85	; in that range only.
86	;
87	.in_range:
88	; if (st0 < pi)
89	fldpi
90	fcom st1 ; compares st0 (pi) with st1 (the normalized value)
91	fnstsw ax
92	test ax, X86_FSW_C0 ; st1 > pi
93	jnz .larger_than_pi
94	test ax, X86_FSW_C3
95	jnz .equals_pi
96
97	;
98	; input in the range [0,pi[
99	;
100	.smaller_than_pi:
101	fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
102
103	; if (st0 < pi/2)
104	fcom st1 ; compares st0 (pi/2) with st1
105	fnstsw ax
106	test ax, X86_FSW_C0 ; st1 > pi
107	jnz .between_half_pi_and_pi
108	test ax, X86_FSW_C3
109	jnz .equals_half_pi
110
111	;
112	; The value is between zero and half pi, including the zero value.
113	;
114	; This is in range where FSIN works reasonably reliably. So drop the
115	; half pi in st0 and do the calculation.
116	;
117	.between_zero_and_half_pi:
118	; Check if we're so close to pi/2 that it makes no difference.
119	fsub st0, st1 ; st0 = pi/2 - st1
120	fcom qword [xDX]
121	fnstsw ax
122	test ax, X86_FSW_C0 \| X86_FSW_C3 ; st0 <= very small positive number.
123	jnz .equals_half_pi
124	ffreep st0
125
126	; Check if we're so close to zero that it makes no difference given the
127	; internal accuracy of the FPU.
128	fcom qword [xDX]
129	fnstsw ax
130	test ax, X86_FSW_C0 \| X86_FSW_C3 ; st0 <= very small positive number.
131	jnz .equals_zero_popped_one
132
133	; Ok, calculate sine.
134	fsin
135	jmp .return
136
137	;
138	; The value is in the range ]pi/2,pi[
139	;
140	; This is outside the comfortable FSIN range, but if we subtract PI and
141	; move to the ]-pi/2,0[ range we just have to change the sign to get
142	; the value we want.
143	;
144	.between_half_pi_and_pi:
145	; Check if we're so close to pi/2 that it makes no difference.
146	fsubr st0, st1 ; st0 = st1 - st0
147	fcom qword [xDX]
148	fnstsw ax
149	test ax, X86_FSW_C0 \| X86_FSW_C3 ; st0 <= very small positive number.
150	jnz .equals_half_pi
151	ffreep st0
152
153	; Check if we're so close to pi that it makes no difference.
154	fldpi
155	fsub st0, st1 ; st0 = st0 - st1
156	fcom qword [xDX]
157	fnstsw ax
158	test ax, X86_FSW_C0 \| X86_FSW_C3 ; st0 <= very small positive number.
159	jnz .equals_pi
160	ffreep st0
161
162	; Ok, transform the value and calculate sine.
163	fldpi
164	fsubp st1, st0
165
166	fsin
167	fchs
168	jmp .return
169
170	;
171	; input in the range ]pi,2pi[
172	;
173	.larger_than_pi:
174	fsub st1, st0 ; st1 -= pi
175	fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
176
177	; if (st0 < pi/2)
178	fcom st1 ; compares st0 (pi/2) with reduced st1
179	fnstsw ax
180	test ax, X86_FSW_C0 ; st1 > pi
181	jnz .between_3_half_pi_and_2pi
182	test ax, X86_FSW_C3
183	jnz .equals_3_half_pi
184
185	;
186	; The value is in the the range: ]pi,3pi/2[
187	;
188	; The actual st0 is in the range ]pi,pi/2[ where FSIN is performing okay
189	; and we can get the desired result by changing the sign (-FSIN).
190	;
191	.between_pi_and_3_half_pi:
192	; Check if we're so close to pi/2 that it makes no difference.
193	fsub st0, st1 ; st0 = pi/2 - st1
194	fcom qword [xDX]
195	fnstsw ax
196	test ax, X86_FSW_C0 \| X86_FSW_C3 ; st0 <= very small positive number.
197	jnz .equals_3_half_pi
198	ffreep st0
199
200	; Check if we're so close to zero that it makes no difference given the
201	; internal accuracy of the FPU.
202	fcom qword [xDX]
203	fnstsw ax
204	test ax, X86_FSW_C0 \| X86_FSW_C3 ; st0 <= very small positive number.
205	jnz .equals_pi_popped
206
207	; Ok, calculate sine and flip the sign.
208	fsin
209	fchs
210	jmp .return
211
212	;
213	; The value is in the last pi/2 of the range: ]3pi/2,2pi[
214	;
215	; Since FSIN should work reasonably well for ]-pi/2,pi], we can just
216	; subtract pi again (we subtracted pi at .larger_than_pi above) and
217	; run FSIN on it. (st1 is currently in the range ]pi/2,pi[.)
218	;
219	.between_3_half_pi_and_2pi:
220	; Check if we're so close to pi/2 that it makes no difference.
221	fsubr st0, st1 ; st0 = st1 - st0
222	fcom qword [xDX]
223	fnstsw ax
224	test ax, X86_FSW_C0 \| X86_FSW_C3 ; st0 <= very small positive number.
225	jnz .equals_3_half_pi
226	ffreep st0
227
228	; Check if we're so close to pi that it makes no difference.
229	fldpi
230	fsub st0, st1 ; st0 = st0 - st1
231	fcom qword [xDX]
232	fnstsw ax
233	test ax, X86_FSW_C0 \| X86_FSW_C3 ; st0 <= very small positive number.
234	jnz .equals_2pi
235	ffreep st0
236
237	; Ok, adjust input and calculate sine.
238	fldpi
239	fsubp st1, st0
240	fsin
241	jmp .return
242
243	;
244	; sin(0) = 0
245	; sin(pi) = 0
246	;
247	.equals_zero:
248	.equals_pi:
249	.equals_2pi:
250	ffreep st0
251	.equals_zero_popped_one:
252	.equals_pi_popped:
253	ffreep st0
254	fldz
255	jmp .return
256
257	;
258	; sin(pi/2) = 1
259	;
260	.equals_half_pi:
261	ffreep st0
262	ffreep st0
263	fld1
264	jmp .return
265
266	;
267	; sin(3*pi/2) = -1
268	;
269	.equals_3_half_pi:
270	ffreep st0
271	ffreep st0
272	fld1
273	fchs
274	jmp .return
275
276	;
277	; Return.
278	;
279	.return:
280	leave
281	ret
282
283	;
284	; Reduce st0 by reminder division by PI*2. The result should be positive here.
285	;
286	;; @todo this is one of our weak spots (really any calculation involving PI is).
287	.reduce_st0:
288	fldpi
289	fadd st0, st0
290	fxch st1 ; st0=input (dividend) st1=2pi (divisor)
291	.again:
292	fprem1
293	fnstsw ax
294	test ah, (X86_FSW_C2 >> 8) ; C2 is set if partial result.
295	jnz .again ; Loop till C2 == 0 and we have a final result.
296
297	;
298	; Make sure the result is positive.
299	;
300	fxam
301	fnstsw ax
302	test ax, X86_FSW_C1 ; The sign bit
303	jz .reduced_to_positive
304
305	fadd st0, st1 ; st0 += 2pi, which should make it positive
306
307	%ifdef RT_STRICT
308	fxam
309	fnstsw ax
310	test ax, X86_FSW_C1
311	jz .reduced_to_positive
312	int3
313	%endif
314
315	.reduced_to_positive:
316	fstp st1 ; Get rid of the 2pi value.
317	jmp .in_range
318
319	ALIGNCODE(8)
320	.s_r64Max:
321	dq +6.28318530717958647692 ; 2*pi
322	.s_r64Min:
323	dq 0.0
324	.s_r64Two:
325	dq 2.0
326	;;
327	; Close to 2/pi rounding limits for 32-bit, 64-bit and 80-bit floating point operations.
328	; Given that the original input is at least +/-3pi/8 (1.178) and that precision of the
329	; PI constant used during reduction/whatever, I think we can round to a whole pi/2
330	; step when we get close enough.
331	;
332	; Look to RTFLOAT64U for the format details, but 52 is the shift for the exponent field
333	; and 1023 is the exponent bias. Since the format uses an implied 1 in the mantissa,
334	; we only have to set the exponent to get a valid number.
335	;
336	.s_ar64NearZero:
337	dq (-18 + 1023) << 52 ; float / 32-bit / single precision input
338	dq (-40 + 1023) << 52 ; double / 64-bit / double precision input
339	dq (-52 + 1023) << 52 ; long double / 80-bit / extended precision input
340	ENDPROC rtNoCrtMathSinCore
341
342
343	;;
344	; Compute the sine of rd, measured in radians.
345	;
346	; @returns st(0) / xmm0
347	; @param rd [rbp + xCB*2] / xmm0
348	;
349	RT_NOCRT_BEGINPROC sin
350	push xBP
351	SEH64_PUSH_xBP
352	mov xBP, xSP
353	SEH64_SET_FRAME_xBP 0
354	sub xSP, 20h
355	SEH64_ALLOCATE_STACK 20h
356	SEH64_END_PROLOGUE
357
358	%ifdef RT_OS_WINDOWS
359	;
360	; Make sure we use full precision and not the windows default of 53 bits.
361	;
362	fnstcw [xBP - 20h]
363	mov ax, [xBP - 20h]
364	or ax, X86_FCW_PC_64 ; includes both bits, so no need to clear the mask.
365	mov [xBP - 1ch], ax
366	fldcw [xBP - 1ch]
367	%endif
368
369	;
370	; Load the input into st0.
371	;
372	%ifdef RT_ARCH_AMD64
373	movsd [xBP - 10h], xmm0
374	fld qword [xBP - 10h]
375	%else
376	fld qword [xBP + xCB*2]
377	%endif
378
379	;
380	; We examin the input and weed out non-finit numbers first.
381	;
382	fxam
383	fnstsw ax
384	and ax, X86_FSW_C3 \| X86_FSW_C2 \| X86_FSW_C0
385	cmp ax, X86_FSW_C2 ; Normal finite number (excluding zero)
386	je .finite
387	cmp ax, X86_FSW_C3 ; Zero
388	je .zero
389	cmp ax, X86_FSW_C3 \| X86_FSW_C2 ; Denormals - treat them as zero.
390	je .zero
391	cmp ax, X86_FSW_C0 ; NaN - must handle it special,
392	je .nan
393
394	; Pass infinities and unsupported inputs to fsin, assuming it does the right thing.
395	.do_sin:
396	fsin
397	jmp .return_val
398
399	;
400	; Finite number.
401	;
402	.finite:
403	; For very tiny numbers, 0 < abs(input) < 2**-25, we can return the
404	; input value directly.
405	fld st0 ; duplicate st0
406	fabs ; make it an absolute (positive) value.
407	fld qword [.s_r64Tiny xWrtRIP]
408	fcomip st1 ; compare s_r64Tiny and fabs(input)
409	ja .return_tiny_number_as_is ; jump if fabs(input) is smaller
410
411	; FSIN is documented to be reasonable for the range ]-3pi/4,3pi/4[, so
412	; while we have fabs(input) loaded already, check for that here and
413	; allow rtNoCrtMathSinCore to assume it won't see values very close to
414	; zero, except by cos -> sin conversion where they won't be relevant to
415	; any assumpttions about precision approximation.
416	fld qword [.s_r64FSinOkay xWrtRIP]
417	fcomip st1
418	ffreep st0 ; drop the fabs(input) value
419	ja .do_sin
420
421	;
422	; Call common sine/cos worker.
423	;
424	mov ecx, 1 ; double
425	extern NAME(rtNoCrtMathSinCore)
426	call NAME(rtNoCrtMathSinCore)
427
428	;
429	; Run st0.
430	;
431	.return_val:
432	%ifdef RT_ARCH_AMD64
433	fstp qword [xBP - 10h]
434	movsd xmm0, [xBP - 10h]
435	%endif
436	%ifdef RT_OS_WINDOWS
437	fldcw [xBP - 20h] ; restore original
438	%endif
439	.return:
440	leave
441	ret
442
443	;
444	; As explained already, we can return tiny numbers directly too as the
445	; output from sin(input) = input given our precision.
446	; We can skip the st0 -> xmm0 translation here, so follow the same path
447	; as .zero & .nan, after we've removed the fabs(input) value.
448	;
449	.return_tiny_number_as_is:
450	ffreep st0
451
452	;
453	; sin(+/-0.0) = +/-0.0 (preserve the sign)
454	; We can skip the st0 -> xmm0 translation here, so follow the .nan code path.
455	;
456	.zero:
457
458	;
459	; Input is NaN, output it unmodified as far as we can (FLD changes SNaN
460	; to QNaN when masked).
461	;
462	.nan:
463	%ifdef RT_ARCH_AMD64
464	ffreep st0
465	%endif
466	jmp .return
467
468	ALIGNCODE(8)
469	; Ca. 2**-26, absolute value. Inputs closer to zero than this can be
470	; returns directly as the sin(input) value should be basically the same
471	; given the precision we're working with and FSIN probably won't even
472	; manage that.
473	;; @todo experiment when FSIN gets better than this.
474	.s_r64Tiny:
475	dq 1.49011612e-8
476	; The absolute limit of FSIN "good" range.
477	.s_r64FSinOkay:
478	dq 2.356194490192344928845 ; 3pi/4
479	;dq 1.57079632679489661923 ; pi/2 - alternative.
480
481	ENDPROC RT_NOCRT(sin)
482

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/math/sin.asm@ 96240

Download in other formats: