math

Timestamp:

Aug 17, 2022 1:59:06 AM (3 years ago)

Author:

vboxsync

svn:sync-xref-src-repo-rev:

153054

Message:

IPRT/nocrt: Adapted the reworked sin and cos code for sinf and cosf; fixed a few cos bugs and added tests for cos. bugref:10261

Location:

trunk/src/VBox/Runtime/common/math

Files:

: 2 edited
: 2 copied

cos.asm (modified) (1 diff)
cosf.asm (copied) (copied from trunk/src/VBox/Runtime/common/math/cos.asm ) (10 diffs)
sin.asm (modified) (3 diffs)
sinf.asm (copied) (copied from trunk/src/VBox/Runtime/common/math/sin.asm ) (9 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/src/VBox/Runtime/common/math/cos.asm

r96240	r96242
110	110	fld qword [.s_r64TinyCosTo1 xWrtRIP]
111	111	fcomip st1
112		jbe .zero_extra_pop
	112	ja .zero_extra_pop
113	113
114	114	.not_that_tiny_input:

trunk/src/VBox/Runtime/common/math/cosf.asm

-              r96240
+              r96242
 ; $Id$
 ;; @file
 ; IPRT - No-CRT cos - AMD64 & X86.
+; IPRT - No-CRT cosf - AMD64 & X86.
+;
 …
 ;;
 ; Compute the cosine of rd, measured in radians.
+; Compute the cosine of rf, measured in radians.
+;
 ; @returns  st(0) / xmm0
 ; @param    rd      [rbp + xCB*2] / xmm0
+;
 RT_NOCRT_BEGINPROC cos
+; @param    rf      [rbp + xCB*2] / xmm0
+;
+RT_NOCRT_BEGINPROC cosf
         push    xBP
         SEH64_PUSH_xBP
 …
+        ;
 %ifdef RT_ARCH_AMD64
         movsd   [xBP - 10h], xmm0
         fld     qword [xBP - 10h]
+        movss   [xBP - 10h], xmm0
+        fld     dword [xBP - 10h]
 %else
         fld     qword [xBP + xCB*2]
+        fld     dword [xBP + xCB*2]
 %endif
 …
         ; works reliably, so outside that we'll use the FSIN instruction instead
         ; as it has a larger good range (-5pi/4 to 1pi/4 for cosine).
         ; Input conversion follows: cos(x) = sin(x + pi/2)
+        ; Input conversion follows: cosf(x) = sinf(x + pi/2)
+        ;
         ; We examine the input and weed out non-finite numbers first.
 …
         fld     qword [.s_r64TinyCosTo1 xWrtRIP]
         fcomip  st1
         jbe      .zero_extra_pop
+        ja      .zero_extra_pop
 .not_that_tiny_input:
 …
+        ;
 .do_sine:
         mov     ecx, 1                      ; double
+        mov     ecx, 0                      ; float
         extern  NAME(rtNoCrtMathSinCore)
         call    NAME(rtNoCrtMathSinCore)
 …
 .return_val:
 %ifdef RT_ARCH_AMD64
         fstp    qword [xBP - 10h]
         movsd   xmm0, [xBP - 10h]
+        fstp    dword [xBP - 10h]
+        movss   xmm0, [xBP - 10h]
 %endif
 %ifdef RT_OS_WINDOWS
 …
+        ;
         ; cos(+/-0) = +1.0
+        ; cosf(+/-0) = +1.0
+        ;
 .zero_extra_pop:
 …
+        ;
 ALIGNCODE(8)
         ; About 2**-27. When fabs(input) is below this limit we can consider cos(input) ~= 1.0.
+        ; About 2**-12. When fabs(input) is below this limit we can consider cosf(input) ~= 1.0.
 .s_r64TinyCosTo1:
         dq  7.4505806e-9
+        dq  0.000244140625
         ; The absolute limit for the range which FCOS is expected to produce reasonable results.
 …
 .s_r64Two:
         dq  2.0
 ENDPROC   RT_NOCRT(cos)
+ENDPROC   RT_NOCRT(cosf)

trunk/src/VBox/Runtime/common/math/sin.asm

-              r96241
+              r96242
 ;;
 ; Compute the sine of rd, measured in radians.
+; Compute the sine of rf, measured in radians.
+;
 ; @returns  st(0) / xmm0
 ; @param    rd      [rbp + xCB*2] / xmm0
+; @param    rf      [rbp + xCB*2] / xmm0
+;
 RT_NOCRT_BEGINPROC sin
 …
 ALIGNCODE(8)
         ; Ca. 2**-26, absolute value. Inputs closer to zero than this can be
+        ; Ca. 2**-17, absolute value. Inputs closer to zero than this can be
         ; returned directly as the sin(input) value should be basically the same
         ; given the precision we're working with and FSIN probably won't even
 …
         ;; @todo experiment when FSIN gets better than this.
 .s_r64Tiny:
         dq      1.49011612e-8
+        dq      0.00000762939453125
         ; The absolute limit of FSIN "good" range.
 .s_r64FSinOkay:

trunk/src/VBox/Runtime/common/math/sinf.asm

-              r96241
+              r96242
 ; $Id$
 ;; @file
 ; IPRT - No-CRT sin - AMD64 & X86.
+; IPRT - No-CRT sinf - AMD64 & X86.
+;
 …
 ; @param    rd      [rbp + xCB*2] / xmm0
+;
 RT_NOCRT_BEGINPROC sin
+RT_NOCRT_BEGINPROC sinf
         push    xBP
         SEH64_PUSH_xBP
 …
+        ;
 %ifdef RT_ARCH_AMD64
         movsd   [xBP - 10h], xmm0
         fld     qword [xBP - 10h]
+        movss   [xBP - 10h], xmm0
+        fld     dword [xBP - 10h]
 %else
         fld     qword [xBP + xCB*2]
+        fld     dword [xBP + xCB*2]
 %endif
 …
         ; Call common sine/cos worker.
+        ;
         mov     ecx, 1                      ; double
+        mov     ecx, 0                      ; float
         extern  NAME(rtNoCrtMathSinCore)
         call    NAME(rtNoCrtMathSinCore)
 …
 .return_val:
 %ifdef RT_ARCH_AMD64
         fstp    qword [xBP - 10h]
         movsd   xmm0, [xBP - 10h]
+        fstp    dword [xBP - 10h]
+        movss   xmm0, [xBP - 10h]
 %endif
 %ifdef RT_OS_WINDOWS
 …
+        ;
         ; As explained already, we can return tiny numbers directly too as the
         ; output from sin(input) = input given our precision.
+        ; output from sinf(input) = input given our precision.
         ; We can skip the st0 -> xmm0 translation here, so follow the same path
         ; as .zero & .nan, after we've removed the fabs(input) value.
 …
+        ;
         ; sin(+/-0.0) = +/-0.0 (preserve the sign)
+        ; sinf(+/-0.0) = +/-0.0 (preserve the sign)
         ; We can skip the st0 -> xmm0 translation here, so follow the .nan code path.
+        ;
 …
 ALIGNCODE(8)
         ; Ca. 2**-26, absolute value. Inputs closer to zero than this can be
         ; returns directly as the sin(input) value should be basically the same
+        ; returned directly as the sinf(input) value should be basically the same
         ; given the precision we're working with and FSIN probably won't even
         ; manage that.
 …
         ;dq      1.57079632679489661923  ; pi/2 - alternative.
 ENDPROC   RT_NOCRT(sin)
+ENDPROC   RT_NOCRT(sinf)

Note: See TracChangeset for help on using the changeset viewer.