cos.asm@ 96240

Last change on this file since 96240 was 96240, checked in by vboxsync, 2 years ago
IPRT/nocrt: Reworking the sin and cos code to take into account which ranges FCOS and FSIN instructions are expected to deliver reasonable results and handle extreme values better. bugref:10261
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 5.8 KB

Line
1	; $Id: cos.asm 96240 2022-08-17 00:59:31Z vboxsync $
2	;; @file
3	; IPRT - No-CRT cos - AMD64 & X86.
4	;
5
6	;
7	; Copyright (C) 2006-2022 Oracle Corporation
8	;
9	; This file is part of VirtualBox Open Source Edition (OSE), as
10	; available from http://www.virtualbox.org. This file is free software;
11	; you can redistribute it and/or modify it under the terms of the GNU
12	; General Public License (GPL) as published by the Free Software
13	; Foundation, in version 2 as it comes in the "COPYING" file of the
14	; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	;
17	; The contents of this file may alternatively be used under the terms
18	; of the Common Development and Distribution License Version 1.0
19	; (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	; VirtualBox OSE distribution, in which case the provisions of the
21	; CDDL are applicable instead of those of the GPL.
22	;
23	; You may elect to license modified versions of this file under the
24	; terms and conditions of either the GPL or the CDDL or both.
25	;
26
27
28	%define RT_ASM_WITH_SEH64
29	%include "iprt/asmdefs.mac"
30	%include "iprt/x86.mac"
31
32
33	BEGINCODE
34
35	;;
36	; Compute the cosine of rd, measured in radians.
37	;
38	; @returns st(0) / xmm0
39	; @param rd [rbp + xCB*2] / xmm0
40	;
41	RT_NOCRT_BEGINPROC cos
42	push xBP
43	SEH64_PUSH_xBP
44	mov xBP, xSP
45	SEH64_SET_FRAME_xBP 0
46	sub xSP, 20h
47	SEH64_ALLOCATE_STACK 20h
48	SEH64_END_PROLOGUE
49
50	%ifdef RT_OS_WINDOWS
51	;
52	; Make sure we use full precision and not the windows default of 53 bits.
53	;
54	;; @todo not sure if this makes any difference...
55	fnstcw [xBP - 20h]
56	mov ax, [xBP - 20h]
57	or ax, X86_FCW_PC_64 ; includes both bits, so no need to clear the mask.
58	mov [xBP - 1ch], ax
59	fldcw [xBP - 1ch]
60	%endif
61
62	;
63	; Load the input into st0.
64	;
65	%ifdef RT_ARCH_AMD64
66	movsd [xBP - 10h], xmm0
67	fld qword [xBP - 10h]
68	%else
69	fld qword [xBP + xCB*2]
70	%endif
71
72	;
73	; The FCOS instruction has a very narrow range (-3pi/8 to 3pi/8) where it
74	; works reliably, so outside that we'll use the FSIN instruction instead
75	; as it has a larger good range (-5pi/4 to 1pi/4 for cosine).
76	; Input conversion follows: cos(x) = sin(x + pi/2)
77	;
78	; We examin the input and weed out non-finit numbers first.
79	;
80
81	; We only do the range check on normal finite numbers.
82	fxam
83	fnstsw ax
84	and ax, X86_FSW_C3 \| X86_FSW_C2 \| X86_FSW_C0
85	cmp ax, X86_FSW_C2 ; Normal finite number (excluding zero)
86	je .finite
87	cmp ax, X86_FSW_C3 ; Zero
88	je .zero
89	cmp ax, X86_FSW_C3 \| X86_FSW_C2 ; Denormals - treat them as zero.
90	je .zero
91	cmp ax, X86_FSW_C0 ; NaN - must handle it special,
92	je .nan
93
94	; Pass infinities and unsupported inputs to fcos, assuming it does the right thing.
95	; We also jump here if we get a finite number in the "good" range, see below.
96	.do_fcos:
97	fcos
98	jmp .return_val
99
100	;
101	; Finite number.
102	;
103	; First check if it's a very tiny number where we can simply return 1.
104	; Next check if it's in the range where FCOS is reasonable, otherwise
105	; go to FSIN to do the work.
106	;
107	.finite:
108	fld st0
109	fabs
110	fld qword [.s_r64TinyCosTo1 xWrtRIP]
111	fcomip st1
112	jbe .zero_extra_pop
113
114	.not_that_tiny_input:
115	fld qword [.s_r64FCosOkay xWrtRIP]
116	fcomip st1
117	ffreep st0 ; pop fabs(input)
118	ja .do_fcos ; jmp if fabs(input) < .s_r64FCosOkay
119
120	;
121	; If we have a positive number we subtract 3pi/2, for negative we add pi/2.
122	; We still have the FXAM result in AX.
123	;
124	.outside_fcos_range:
125	test ax, X86_FSW_C1 ; The sign bit.
126	jnz .adjust_negative_to_sine
127
128	; Calc -3pi/2 using FPU-internal pi constant.
129	fldpi
130	fadd st0, st0 ; st0=2pi
131	fldpi
132	fdiv qword [.s_r64Two xWrtRIP] ; st1=2pi; st0=pi/2
133	fsubp st1, st0 ; st0=3pi/2
134	fchs ; st0=-3pi/2
135	jmp .make_sine_adjustment
136
137	.adjust_negative_to_sine:
138	; Calc +pi/2.
139	fldpi
140	fdiv qword [.s_r64Two xWrtRIP] ; st1=2pi; st0=pi/2
141
142	.make_sine_adjustment:
143	faddp st1, st0
144
145	;
146	; Call internal sine worker to calculate st0=sin(st0)
147	;
148	.do_sine:
149	mov ecx, 1 ; double
150	extern NAME(rtNoCrtMathSinCore)
151	call NAME(rtNoCrtMathSinCore)
152
153	;
154	; Return st0.
155	;
156	.return_val:
157	%ifdef RT_ARCH_AMD64
158	fstp qword [xBP - 10h]
159	movsd xmm0, [xBP - 10h]
160	%endif
161	%ifdef RT_OS_WINDOWS
162	fldcw [xBP - 20h] ; restore original
163	%endif
164	.return:
165	leave
166	ret
167
168	;
169	; cos(+/-0) = +1.0
170	;
171	.zero_extra_pop:
172	ffreep st0
173	.zero:
174	ffreep st0
175	fld1
176	jmp .return_val
177
178	;
179	; Input is NaN, output it unmodified as far as we can (FLD changes SNaN
180	; to QNaN when masked).
181	;
182	.nan:
183	%ifdef RT_ARCH_AMD64
184	ffreep st0
185	%endif
186	jmp .return
187
188	;
189	; Local constants.
190	;
191	ALIGNCODE(8)
192	; About 2**-27. When fabs(input) is below this limit we can consider cos(input) ~= 1.0.
193	.s_r64TinyCosTo1:
194	dq 7.4505806e-9
195
196	; The absolute limit for the range which FCOS is expected to produce reasonable results.
197	.s_r64FCosOkay:
198	dq 1.1780972450961724644225 ; 3*pi/8
199
200	.s_r64Two:
201	dq 2.0
202	ENDPROC RT_NOCRT(cos)
203

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/math/cos.asm@ 96240

Download in other formats: