Changeset 96240 in vbox

Timestamp:

Aug 17, 2022 12:59:31 AM (3 years ago)

Author:

vboxsync

svn:sync-xref-src-repo-rev:

153052

Message:

IPRT/nocrt: Reworking the sin and cos code to take into account which ranges FCOS and FSIN instructions are expected to deliver reasonable results and handle extreme values better. bugref:10261

Location:

trunk/src/VBox/Runtime

Files:

: 1 added
: 5 edited

Makefile.kmk (modified) (2 diffs)
common/math/cos.asm (modified) (1 diff)
common/math/sin.asm (modified) (1 diff)
common/math/sincore.asm (added)
testcase/Makefile.kmk (modified) (1 diff)
testcase/tstRTNoCrt-2.cpp (modified) (7 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/src/VBox/Runtime/Makefile.kmk

-              r96213
+              r96240
         common/math/sin.asm \
         common/math/sinf.asm \
+        common/math/sincore.asm \
         common/math/sqrt.asm \
         common/math/sqrtf.asm \
 …
         common/math/sin.asm \
         common/math/sinf.asm \
+        common/math/sincore.asm \
         common/math/sqrt.asm \
         common/math/sqrtf.asm \

trunk/src/VBox/Runtime/common/math/cos.asm

-              r96060
+              r96240
+;
+%define RT_ASM_WITH_SEH64
 %include "iprt/asmdefs.mac"
+%include "iprt/x86.mac"
 BEGINCODE
 ;;
+; compute the cosine of dr, measured in radians.
+; @returns st(0) / xmm0
+; Compute the cosine of rd, measured in radians.
+;
+; @returns  st(0) / xmm0
 ; @param    rd      [rbp + xCB*2] / xmm0
+;
 RT_NOCRT_BEGINPROC cos
+    push    xBP
+    mov     xBP, xSP
+        push    xBP
+        SEH64_PUSH_xBP
+        mov     xBP, xSP
+        SEH64_SET_FRAME_xBP 0
+        sub     xSP, 20h
+        SEH64_ALLOCATE_STACK 20h
+        SEH64_END_PROLOGUE
+        ;
+        ; Stack frame layout (relative to xBP):
+        ;   [xBP - 10h]  scratch qword for xmm0 <-> st0 transfers (AMD64).
+        ;   [xBP - 1ch]  modified FPU control word (Windows).
+        ;   [xBP - 20h]  saved original FPU control word (Windows).
+        ;
+%ifdef RT_OS_WINDOWS
+        ;
+        ; Make sure we use full precision and not the windows default of 53 bits.
+        ;
+;; @todo not sure if this makes any difference...
+        fnstcw  [xBP - 20h]
+        mov     ax, [xBP - 20h]
+        or      ax, X86_FCW_PC_64       ; includes both bits, so no need to clear the mask.
+        mov     [xBP - 1ch], ax
+        fldcw   [xBP - 1ch]
+%endif
+        ;
+        ; Load the input into st0.
+        ;
 %ifdef RT_ARCH_AMD64
+    sub     xSP, 10h
+    movsd   [xSP], xmm0
+    fld     qword [xSP]
+        movsd   [xBP - 10h], xmm0
+        fld     qword [xBP - 10h]
 %else
+    fld     qword [xBP + xCB*2]
+%endif
+    fcos
+    fnstsw  ax
+    test    ah, 4
+    jz      .done
+    fldpi
+    fadd    st0, st0
+    fxch    st1
+.again:
+    fprem1
+    fnstsw  ax
+    test    ah, 4
+    jnz     .again
+    fstp    st0
+    fcos
+.done:
+        fld     qword [xBP + xCB*2]
+%endif
+        ;
+        ; The FCOS instruction has a very narrow range (-3pi/8 to 3pi/8) where it
+        ; works reliably, so outside that we'll use the FSIN instruction instead
+        ; as it has a larger good range (-5pi/4 to 1pi/4 for cosine).
+        ; Input conversion follows: cos(x) = sin(x + pi/2)
+        ;
+        ; We examine the input and weed out non-finite numbers first.
+        ;
+        ; We only do the range check on normal finite numbers.
+        fxam
+        fnstsw  ax
+        and     ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0
+        cmp     ax, X86_FSW_C2              ; Normal finite number (excluding zero)
+        je      .finite
+        cmp     ax, X86_FSW_C3              ; Zero
+        je      .zero
+        cmp     ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero.
+        je      .zero
+        cmp     ax, X86_FSW_C0              ; NaN - must handle it special,
+        je      .nan
+        ; Pass infinities and unsupported inputs to fcos, assuming it does the right thing.
+        ; We also jump here if we get a finite number in the "good" range, see below.
+.do_fcos:
+        fcos
+        jmp     .return_val
+        ;
+        ; Finite number.
+        ;
+        ; First check if it's a very tiny number where we can simply return 1.
+        ; Next check if it's in the range where FCOS is reasonable, otherwise
+        ; go to FSIN to do the work.
+        ;
+.finite:
+        fld     st0
+        fabs
+        fld     qword [.s_r64TinyCosTo1 xWrtRIP]
+        fcomip  st1                         ; compares the tiny limit (st0) with fabs(input) (st1)
+        ja      .zero_extra_pop             ; jmp if fabs(input) < .s_r64TinyCosTo1
+.not_that_tiny_input:
+        fld     qword [.s_r64FCosOkay xWrtRIP]
+        fcomip  st1
+        ffreep  st0                         ; pop fabs(input)
+        ja      .do_fcos                    ; jmp if fabs(input) < .s_r64FCosOkay
+        ;
+        ; If we have a positive number we subtract 3pi/2, for negative we add pi/2.
+        ;
+        ; NOTE(review): AX was masked with C3|C2|C0 right after the FXAM above, so
+        ; the C1 (sign) bit tested here is always clear and the negative branch is
+        ; never taken.  The two adjustments differ by exactly 2pi, so the result
+        ; handed to the sine worker is equivalent either way.
+        ;
+.outside_fcos_range:
+        test    ax, X86_FSW_C1              ; The sign bit.
+        jnz     .adjust_negative_to_sine
+        ; Calc -3pi/2 using FPU-internal pi constant.
+        fldpi
+        fadd    st0, st0                    ; st0=2pi
+        fldpi
+        fdiv    qword [.s_r64Two xWrtRIP]   ; st1=2pi; st0=pi/2
+        fsubp   st1, st0                    ; st0=3pi/2
+        fchs                                ; st0=-3pi/2
+        jmp     .make_sine_adjustment
+.adjust_negative_to_sine:
+        ; Calc +pi/2.
+        fldpi
+        fdiv    qword [.s_r64Two xWrtRIP]   ; st1=input; st0=pi/2
+.make_sine_adjustment:
+        faddp   st1, st0                    ; st0=input + adjustment
+        ;
+        ; Call internal sine worker to calculate st0=sin(st0)
+        ;
+.do_sine:
+        mov     ecx, 1                      ; double
+        extern  NAME(rtNoCrtMathSinCore)
+        call    NAME(rtNoCrtMathSinCore)
+        ;
+        ; Return st0.
+        ;
+.return_val:
 %ifdef RT_ARCH_AMD64
+    fstp    qword [xSP]
+    movsd   xmm0, [xSP]
+%endif
+    leave
+    ret
+        fstp    qword [xBP - 10h]
+        movsd   xmm0, [xBP - 10h]
+%endif
+%ifdef RT_OS_WINDOWS
+        fldcw   [xBP - 20h]                 ; restore original
+%endif
+.return:
+        leave
+        ret
+        ;
+        ; cos(+/-0) = +1.0
+        ;
+.zero_extra_pop:
+        ffreep  st0
+.zero:
+        ffreep  st0
+        fld1
+        jmp     .return_val
+        ;
+        ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN
+        ; to QNaN when masked).
+        ;
+.nan:
+%ifdef RT_ARCH_AMD64
+        ffreep  st0
+%endif
+%ifdef RT_OS_WINDOWS
+        fldcw   [xBP - 20h]                 ; restore original, this path bypasses .return_val
+%endif
+        jmp     .return
+        ;
+        ; Local constants.
+        ;
+ALIGNCODE(8)
+        ; About 2**-27. When fabs(input) is below this limit we can consider cos(input) ~= 1.0.
+.s_r64TinyCosTo1:
+        dq  7.4505806e-9
+        ; The absolute limit for the range which FCOS is expected to produce reasonable results.
+.s_r64FCosOkay:
+        dq  1.1780972450961724644225   ; 3*pi/8
+.s_r64Two:
+        dq  2.0
 ENDPROC   RT_NOCRT(cos)

trunk/src/VBox/Runtime/common/math/sin.asm

-              r96060
+              r96240
+;
+%define RT_ASM_WITH_SEH64
 %include "iprt/asmdefs.mac"
+%include "iprt/x86.mac"
 BEGINCODE
 ;;
+; Compute the sine of rd
+; @returns st(0)/xmm0
+; @param    rd      [xSP + xCB*2] / xmm0
+; Internal sine and cosine worker that calculates the sine of st0 returning
+; it in st0.
+;
+; When called by a sine function, fabs(st0) >= pi/2.
+; When called by a cosine function, fabs(original input value) >= 3pi/8.
+;
+; That the input isn't a tiny number close to zero means that we can do a bit
+; cruder rounding when operating close to a pi/2 boundary.  The value in the
+; ecx register indicates the input precision and controls the crudeness of the
+; rounding.
+;
+; @returns st0 = sine
+; @param   st0      A finite number to calculate sine of.
+; @param   ecx      Set to 0 if original input was a 32-bit float.
+;                   Set to 1 if original input was a 64-bit double.
+;                   set to 2 if original input was a 80-bit long double.
+;
+BEGINPROC   rtNoCrtMathSinCore
+        push    xBP
+        SEH64_PUSH_xBP
+        mov     xBP, xSP
+        SEH64_SET_FRAME_xBP 0
+        SEH64_END_PROLOGUE
+        ;
+        ; Load the pointer to the rounding crudeness factor into xDX.
+        ;
+        ; The table entries are qwords (dq), so the index must be scaled by 8
+        ; and not by the native register size (xCB).  xDX stays valid for the
+        ; rest of the function; AX is used as scratch for the FPU status word.
+        ;
+        lea     xDX, [.s_ar64NearZero xWrtRIP]
+        lea     xDX, [xDX + xCX * 8]
+        ;
+        ; Finite number.  We want it in the range [0,2pi] and will perform
+        ; a remainder division if it isn't.
+        ;
+        fcom    qword [.s_r64Max xWrtRIP]   ; compares st0 and 2*pi
+        fnstsw  ax
+        test    ax, X86_FSW_C3 | X86_FSW_C0 | X86_FSW_C2 ; C3 := st0 == mem;  C0 := st0 < mem;  C2 := unordered (should not be the case);
+        jz      .reduce_st0                 ; Jump if st0 > mem
+        fcom    qword [.s_r64Min xWrtRIP]   ; compares st0 and 0.0
+        fnstsw  ax
+        test    ax, X86_FSW_C3 | X86_FSW_C0
+        jnz     .reduce_st0                 ; Jump if st0 <= mem
+        ;
+        ; We get here if st0 is in the ]0,2pi] range.
+        ;
+        ; Now, FSIN is documented to be reasonably accurate for the range
+        ; -3pi/4 to +3pi/4, so we have to make some more effort to calculate
+        ; in that range only.
+        ;
+.in_range:
+        ; if (st0 < pi)
+        fldpi
+        fcom    st1                         ; compares st0 (pi) with st1 (the normalized value)
+        fnstsw  ax
+        test    ax, X86_FSW_C0              ; st1 > pi
+        jnz     .larger_than_pi
+        test    ax, X86_FSW_C3
+        jnz     .equals_pi
+        ;
+        ; input in the range [0,pi[
+        ;
+.smaller_than_pi:
+        fdiv    qword [.s_r64Two xWrtRIP]   ; st0 = pi/2
+        ; if (st0 < pi/2)
+        fcom    st1                         ; compares st0 (pi/2) with st1
+        fnstsw  ax
+        test    ax, X86_FSW_C0              ; st1 > pi/2
+        jnz     .between_half_pi_and_pi
+        test    ax, X86_FSW_C3
+        jnz     .equals_half_pi
+        ;
+        ; The value is between zero and half pi, including the zero value.
+        ;
+        ; This is in range where FSIN works reasonably reliably. So drop the
+        ; half pi in st0 and do the calculation.
+        ;
+.between_zero_and_half_pi:
+        ; Check if we're so close to pi/2 that it makes no difference.
+        fsub    st0, st1                    ; st0 = pi/2 - st1
+        fcom    qword [xDX]
+        fnstsw  ax
+        test    ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+        jnz     .equals_half_pi
+        ffreep  st0
+        ; Check if we're so close to zero that it makes no difference given the
+        ; internal accuracy of the FPU.
+        fcom    qword [xDX]
+        fnstsw  ax
+        test    ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+        jnz     .equals_zero_popped_one
+        ; Ok, calculate sine.
+        fsin
+        jmp     .return
+        ;
+        ; The value is in the range ]pi/2,pi[
+        ;
+        ; This is outside the comfortable FSIN range, but if we subtract PI and
+        ; move to the ]-pi/2,0[ range we just have to change the sign to get
+        ; the value we want.
+        ;
+.between_half_pi_and_pi:
+        ; Check if we're so close to pi/2 that it makes no difference.
+        fsubr   st0, st1                    ; st0 = st1 - st0
+        fcom    qword [xDX]
+        fnstsw  ax
+        test    ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+        jnz     .equals_half_pi
+        ffreep  st0
+        ; Check if we're so close to pi that it makes no difference.
+        fldpi
+        fsub    st0, st1                    ; st0 = st0 - st1
+        fcom    qword [xDX]
+        fnstsw  ax
+        test    ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+        jnz     .equals_pi
+        ffreep  st0
+        ; Ok, transform the value and calculate sine.
+        fldpi
+        fsubp   st1, st0                    ; st0 = input - pi
+        fsin
+        fchs
+        jmp     .return
+        ;
+        ; input in the range ]pi,2pi[
+        ;
+.larger_than_pi:
+        fsub    st1, st0                    ; st1 -= pi
+        fdiv    qword [.s_r64Two xWrtRIP]   ; st0 = pi/2
+        ; if (st0 < pi/2)
+        fcom    st1                         ; compares st0 (pi/2) with reduced st1
+        fnstsw  ax
+        test    ax, X86_FSW_C0              ; reduced st1 > pi/2
+        jnz     .between_3_half_pi_and_2pi
+        test    ax, X86_FSW_C3
+        jnz     .equals_3_half_pi
+        ;
+        ; The value is in the range: ]pi,3pi/2[
+        ;
+        ; The reduced value in st1 (input - pi) is in the range ]0,pi/2[ where
+        ; FSIN is performing okay and we can get the desired result by changing
+        ; the sign (-FSIN).
+        ;
+.between_pi_and_3_half_pi:
+        ; Check if we're so close to pi/2 that it makes no difference.
+        fsub    st0, st1                    ; st0 = pi/2 - st1
+        fcom    qword [xDX]
+        fnstsw  ax
+        test    ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+        jnz     .equals_3_half_pi
+        ffreep  st0
+        ; Check if we're so close to zero that it makes no difference given the
+        ; internal accuracy of the FPU.
+        fcom    qword [xDX]
+        fnstsw  ax
+        test    ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+        jnz     .equals_pi_popped
+        ; Ok, calculate sine and flip the sign.
+        fsin
+        fchs
+        jmp     .return
+        ;
+        ; The value is in the last pi/2 of the range: ]3pi/2,2pi[
+        ;
+        ; Since FSIN should work reasonably well for ]-pi/2,pi], we can just
+        ; subtract pi again (we subtracted pi at .larger_than_pi above) and
+        ; run FSIN on it.  (st1 is currently in the range ]pi/2,pi[.)
+        ;
+.between_3_half_pi_and_2pi:
+        ; Check if we're so close to pi/2 that it makes no difference.
+        fsubr   st0, st1                    ; st0 = st1 - st0
+        fcom    qword [xDX]
+        fnstsw  ax
+        test    ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+        jnz     .equals_3_half_pi
+        ffreep  st0
+        ; Check if we're so close to pi that it makes no difference.
+        fldpi
+        fsub    st0, st1                    ; st0 = st0 - st1
+        fcom    qword [xDX]
+        fnstsw  ax
+        test    ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+        jnz     .equals_2pi
+        ffreep  st0
+        ; Ok, adjust input and calculate sine.
+        fldpi
+        fsubp   st1, st0                    ; st0 = input - 2pi
+        fsin
+        jmp     .return
+        ;
+        ; sin(0) = 0
+        ; sin(pi) = 0
+        ;
+.equals_zero:
+.equals_pi:
+.equals_2pi:
+        ffreep  st0
+.equals_zero_popped_one:
+.equals_pi_popped:
+        ffreep  st0
+        fldz
+        jmp     .return
+        ;
+        ; sin(pi/2) = 1
+        ;
+.equals_half_pi:
+        ffreep  st0
+        ffreep  st0
+        fld1
+        jmp     .return
+        ;
+        ; sin(3*pi/2) = -1
+        ;
+.equals_3_half_pi:
+        ffreep  st0
+        ffreep  st0
+        fld1
+        fchs
+        jmp     .return
+        ;
+        ; Return.
+        ;
+.return:
+        leave
+        ret
+        ;
+        ; Reduce st0 by remainder division by PI*2.  The result should be positive here.
+        ;
+        ;; @todo this is one of our weak spots (really any calculation involving PI is).
+.reduce_st0:
+        fldpi
+        fadd    st0, st0
+        fxch    st1                     ; st0=input (dividend) st1=2pi (divisor)
+.again:
+        fprem1
+        fnstsw  ax
+        test    ah, (X86_FSW_C2 >> 8)   ; C2 is set if partial result.
+        jnz     .again                  ; Loop till C2 == 0 and we have a final result.
+        ;
+        ; Make sure the result is positive.
+        ;
+        fxam
+        fnstsw  ax
+        test    ax, X86_FSW_C1          ; The sign bit
+        jz      .reduced_to_positive
+        fadd    st0, st1                ; st0 += 2pi, which should make it positive
+%ifdef RT_STRICT
+        fxam
+        fnstsw  ax
+        test    ax, X86_FSW_C1
+        jz      .reduced_to_positive
+        int3
+%endif
+.reduced_to_positive:
+        fstp    st1                     ; Get rid of the 2pi value.
+        jmp     .in_range
+ALIGNCODE(8)
+.s_r64Max:
+        dq +6.28318530717958647692      ; 2*pi
+.s_r64Min:
+        dq 0.0
+.s_r64Two:
+        dq 2.0
+        ;;
+        ; Close to 2/pi rounding limits for 32-bit, 64-bit and 80-bit floating point operations.
+        ; Given that the original input is at least +/-3pi/8 (1.178) and that precision of the
+        ; PI constant used during reduction/whatever, I think we can round to a whole pi/2
+        ; step when we get close enough.
+        ;
+        ; Look to RTFLOAT64U for the format details, but 52 is the shift for the exponent field
+        ; and 1023 is the exponent bias.  Since the format uses an implied 1 in the mantissa,
+        ; we only have to set the exponent to get a valid number.
+        ;
+        ; Indexed by the ecx input (0, 1 or 2); each entry is 8 bytes.
+        ;
+.s_ar64NearZero:
+        dq  (-18 + 1023) << 52          ; float / 32-bit / single precision input
+        dq  (-40 + 1023) << 52          ; double / 64-bit / double precision input
+        dq  (-52 + 1023) << 52          ; long double / 80-bit / extended precision input
+ENDPROC     rtNoCrtMathSinCore
+;;
+; Compute the sine of rd, measured in radians.
+;
+; @returns  st(0) / xmm0
+; @param    rd      [rbp + xCB*2] / xmm0
+;
 RT_NOCRT_BEGINPROC sin
+    push    xBP
+    mov     xBP, xSP
+        push    xBP
+        SEH64_PUSH_xBP
+        mov     xBP, xSP
+        SEH64_SET_FRAME_xBP 0
+        sub     xSP, 20h
+        SEH64_ALLOCATE_STACK 20h
+        SEH64_END_PROLOGUE
+        ;
+        ; Stack frame layout (relative to xBP):
+        ;   [xBP - 10h]  scratch qword for xmm0 <-> st0 transfers (AMD64).
+        ;   [xBP - 1ch]  modified FPU control word (Windows).
+        ;   [xBP - 20h]  saved original FPU control word (Windows).
+        ;
+%ifdef RT_OS_WINDOWS
+        ;
+        ; Make sure we use full precision and not the windows default of 53 bits.
+        ;
+        fnstcw  [xBP - 20h]
+        mov     ax, [xBP - 20h]
+        or      ax, X86_FCW_PC_64       ; includes both bits, so no need to clear the mask.
+        mov     [xBP - 1ch], ax
+        fldcw   [xBP - 1ch]
+%endif
+        ;
+        ; Load the input into st0.
+        ;
 %ifdef RT_ARCH_AMD64
+    sub     xSP, 10h
+    movsd   [xSP], xmm0
+    fld     qword [xSP]
+        movsd   [xBP - 10h], xmm0
+        fld     qword [xBP - 10h]
 %else
+    fld     qword [xBP + xCB*2]
+%endif
+    fsin
+    fnstsw  ax
+    test    ah, 04h
+    jz      .done
+    fldpi
+    fadd    st0
+    fxch    st1
+.again:
+    fprem1
+    fnstsw  ax
+    test    ah, 04h
+    jnz     .again
+    fstp    st1
+    fsin
+.done:
+        fld     qword [xBP + xCB*2]
+%endif
+        ;
+        ; We examine the input and weed out non-finite numbers first.
+        ;
+        fxam
+        fnstsw  ax
+        and     ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0
+        cmp     ax, X86_FSW_C2              ; Normal finite number (excluding zero)
+        je      .finite
+        cmp     ax, X86_FSW_C3              ; Zero
+        je      .zero
+        cmp     ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero.
+        je      .zero
+        cmp     ax, X86_FSW_C0              ; NaN - must handle it special,
+        je      .nan
+        ; Pass infinities and unsupported inputs to fsin, assuming it does the right thing.
+.do_sin:
+        fsin
+        jmp     .return_val
+        ;
+        ; Finite number.
+        ;
+.finite:
+        ; For very tiny numbers, 0 < abs(input) < 2**-26, we can return the
+        ; input value directly.
+        fld     st0                         ; duplicate st0
+        fabs                                ; make it an absolute (positive) value.
+        fld     qword [.s_r64Tiny xWrtRIP]
+        fcomip  st1                         ; compare s_r64Tiny and fabs(input)
+        ja      .return_tiny_number_as_is   ; jump if fabs(input) is smaller
+        ; FSIN is documented to be reasonable for the range ]-3pi/4,3pi/4[, so
+        ; while we have fabs(input) loaded already, check for that here and
+        ; allow rtNoCrtMathSinCore to assume it won't see values very close to
+        ; zero, except by cos -> sin conversion where they won't be relevant to
+        ; any assumptions about precision approximation.
+        fld     qword [.s_r64FSinOkay xWrtRIP]
+        fcomip  st1
+        ffreep  st0                         ; drop the fabs(input) value
+        ja      .do_sin                     ; jump if fabs(input) < .s_r64FSinOkay
+        ;
+        ; Call common sine/cos worker.
+        ;
+        mov     ecx, 1                      ; double
+        extern  NAME(rtNoCrtMathSinCore)
+        call    NAME(rtNoCrtMathSinCore)
+        ;
+        ; Return st0.
+        ;
+.return_val:
 %ifdef RT_ARCH_AMD64
+    fstp    qword [xSP]
+    movsd   xmm0, [xSP]
+%endif
+    leave
+    ret
+        fstp    qword [xBP - 10h]
+        movsd   xmm0, [xBP - 10h]
+%endif
+%ifdef RT_OS_WINDOWS
+        fldcw   [xBP - 20h]                 ; restore original
+%endif
+.return:
+        leave
+        ret
+        ;
+        ; As explained already, we can return tiny numbers directly too as the
+        ; output from sin(input) = input given our precision.
+        ; We can skip the st0 -> xmm0 translation here, so follow the same path
+        ; as .zero & .nan, after we've removed the fabs(input) value.
+        ;
+.return_tiny_number_as_is:
+        ffreep  st0
+        ;
+        ; sin(+/-0.0) = +/-0.0 (preserve the sign)
+        ; We can skip the st0 -> xmm0 translation here, so follow the .nan code path.
+        ;
+.zero:
+        ;
+        ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN
+        ; to QNaN when masked).
+        ;
+.nan:
+%ifdef RT_ARCH_AMD64
+        ffreep  st0
+%endif
+%ifdef RT_OS_WINDOWS
+        fldcw   [xBP - 20h]                 ; restore original, this path bypasses .return_val
+%endif
+        jmp     .return
+ALIGNCODE(8)
+        ; Ca. 2**-26, absolute value. Inputs closer to zero than this can be
+        ; returned directly as the sin(input) value should be basically the same
+        ; given the precision we're working with and FSIN probably won't even
+        ; manage that.
+        ;; @todo experiment when FSIN gets better than this.
+.s_r64Tiny:
+        dq      1.49011612e-8
+        ; The absolute limit of FSIN "good" range.
+.s_r64FSinOkay:
+        dq      2.356194490192344928845 ; 3pi/4
+        ;dq      1.57079632679489661923  ; pi/2 - alternative.
 ENDPROC   RT_NOCRT(sin)

trunk/src/VBox/Runtime/testcase/Makefile.kmk

r96213	r96240
677	677	../common/math/sin.asm \
678	678	../common/math/sinf.asm \
	679	../common/math/sincore.asm \
679	680	../common/math/sqrt.asm \
680	681	../common/math/sqrtf.asm \

trunk/src/VBox/Runtime/testcase/tstRTNoCrt-2.cpp

-              r96224
+              r96240
 #endif
+/* Math constants that our own math header provides but UCRT apparently doesn't.
+   Each one is only defined here when the included headers did not already define it. */
+#ifndef  M_E
+# define M_E     2.7182818284590452354   /* e (Euler's number) */
+#endif
+#ifndef  M_LOG2E
+# define M_LOG2E     1.4426950408889634074   /* log2(e) */
+#endif
+#ifndef  M_LOG10E
+# define M_LOG10E    0.43429448190325182765  /* log10(e) */
+#endif
+#ifndef  M_LN2
+# define M_LN2       0.69314718055994530942  /* ln(2) */
+#endif
+#ifndef  M_LN10
+# define M_LN10      2.30258509299404568402  /* ln(10) */
+#endif
+#ifndef  M_PI
+# define M_PI        3.14159265358979323846  /* pi */
+#endif
+#ifndef  M_PI_2
+# define M_PI_2      1.57079632679489661923  /* pi/2 */
+#endif
+#ifndef  M_PI_4
+# define M_PI_4      0.78539816339744830962  /* pi/4 */
+#endif
+#ifndef  M_1_PI
+# define M_1_PI      0.31830988618379067154  /* 1/pi */
+#endif
+#ifndef  M_2_PI
+# define M_2_PI      0.63661977236758134308  /* 2/pi */
+#endif
+#ifndef  M_2_SQRTPI
+# define M_2_SQRTPI  1.12837916709551257390  /* 2/sqrt(pi) */
+#endif
+#ifndef  M_SQRT2
+# define M_SQRT2     1.41421356237309504880  /* sqrt(2) */
+#endif
+#ifndef  M_SQRT1_2
+# define M_SQRT1_2   0.70710678118654752440  /* 1/sqrt(2) */
+#endif
 /*********************************************************************************************************************************
 …
     } while (0)
+#define CHECK_DBL_RANGE(a_Expr, a_rdExpect, a_rdPlusMin) do { \
+        RTFLOAT64U uRet; \
+        uRet.r = a_Expr; \
+        RTFLOAT64U uExpectMin; \
+        uExpectMin.r = (a_rdExpect) - (a_rdPlusMin); \
+        RTFLOAT64U uExpectMax; \
+        uExpectMax.r = (a_rdExpect) + (a_rdPlusMin); \
+        if (   !(RTFLOAT64U_IS_NORMAL(&uRet) || RTFLOAT64U_IS_ZERO(&uRet))\
+            || uRet.r < uExpectMin.r \
+            || uRet.r > uExpectMax.r ) \
+        { \
+            RTStrFormatR64(g_szFloat[0], sizeof(g_szFloat[0]), &uRet,       0, 0, RTSTR_F_SPECIAL); \
+            RTStrFormatR64(g_szFloat[1], sizeof(g_szFloat[1]), &uExpectMin, 0, 0, RTSTR_F_SPECIAL); \
+            RTStrFormatR64(g_szFloat[2], sizeof(g_szFloat[2]), &uExpectMax, 0, 0, RTSTR_F_SPECIAL); \
+            RTTestFailed(g_hTest, "line %u: %s -> %s, expected [%s,%s] (%s +/- %s)", \
+                         __LINE__, #a_Expr, g_szFloat[0], g_szFloat[1], #a_rdExpect, #a_rdPlusMin); \
+        } \
+    } while (0)
 #define CHECK_DBL_SAME_RELAXED_NAN(a_Fn, a_Args) do { \
         RTFLOAT64U uNoCrtRet, uCrtRet; \
 …
 *********************************************************************************************************************************/
 RTTEST  g_hTest;
 char    g_szFloat[2][128];
+char    g_szFloat[4][128];
 …
     RTTestSub(g_hTest, "atan[f]");
     CHECK_DBL(RT_NOCRT(atan)(             +1.0), +0.78539816339744830962 /*+M_PI_4*/);
     CHECK_DBL(RT_NOCRT(atan)(             -1.0), -0.78539816339744830962 /*-M_PI_4*/);
     CHECK_DBL(RT_NOCRT(atan)(        +INFINITY), +1.57079632679489661923 /*+M_PI_2*/);
     CHECK_DBL(RT_NOCRT(atan)(        -INFINITY), -1.57079632679489661923 /*-M_PI_2*/);
+    CHECK_DBL(RT_NOCRT(atan)(             +1.0), +M_PI_4);
+    CHECK_DBL(RT_NOCRT(atan)(             -1.0), -M_PI_4);
+    CHECK_DBL(RT_NOCRT(atan)(        +INFINITY), +M_PI_2);
+    CHECK_DBL(RT_NOCRT(atan)(        -INFINITY), -M_PI_2);
     CHECK_DBL_SAME(    atan,(              1.0));
     CHECK_DBL_SAME(    atan,(              1.5));
 …
     CHECK_DBL_SAME(    atan,(RTStrNanDouble("s", false)));
     CHECK_DBL(RT_NOCRT(atanf)(             +1.0f), +0.78539816339744830962f /*+M_PI_4*/);
     CHECK_DBL(RT_NOCRT(atanf)(             -1.0f), -0.78539816339744830962f /*-M_PI_4*/);
     CHECK_DBL(RT_NOCRT(atanf)(         +INFINITY), +1.57079632679489661923f /*+M_PI_2*/);
     CHECK_DBL(RT_NOCRT(atanf)(         -INFINITY), -1.57079632679489661923f /*-M_PI_2*/);
+    CHECK_DBL(RT_NOCRT(atanf)(             +1.0f), (float)+M_PI_4);
+    CHECK_DBL(RT_NOCRT(atanf)(             -1.0f), (float)-M_PI_4);
+    CHECK_DBL(RT_NOCRT(atanf)(         +INFINITY), (float)+M_PI_2);
+    CHECK_DBL(RT_NOCRT(atanf)(         -INFINITY), (float)-M_PI_2);
     CHECK_DBL_SAME(    atanf,(              1.0f));
     CHECK_DBL_SAME(    atanf,(              1.5f));
 …
     CHECK_DBL_SAME(    atanf,(2.34960584706e+30f));
     CHECK_DBL_SAME(    atanf,(2.34960584706e+30f));
+    CHECK_DBL_SAME(    atanf,(RTStrNanDouble(NULL, true)));
+    CHECK_DBL_SAME(    atanf,(RTStrNanDouble("s",  true)));
+    CHECK_DBL_SAME(    atanf,(RTStrNanDouble("s", false)));
+    CHECK_DBL_SAME(    atanf,(RTStrNanFloat(NULL, true)));
+    CHECK_DBL_SAME(    atanf,(RTStrNanFloat("s",  true)));
+    CHECK_DBL_SAME(    atanf,(RTStrNanFloat("s", false)));
+}
+/**
+ * Tests the no-CRT atan2() and atan2f() implementations.
+ *
+ * The CHECK_DBL / CHECK_FLT lines compare against fixed expected values built
+ * from the M_PI_2 / M_PI_4 constants; the CHECK_DBL_SAME / CHECK_FLT_SAME lines
+ * are given the function name and an argument list covering signed zeros,
+ * large/small magnitudes, infinities and NaNs.
+ */
+void testATan2()
+{
+    RTTestSub(g_hTest, "atan2[f]");
+    /* Double precision: fixed expected values first. */
+    CHECK_DBL(RT_NOCRT(atan2)(             +1.0,            0.0), +M_PI_2);
+    CHECK_DBL(RT_NOCRT(atan2)(             -1.0,            0.0), -M_PI_2);
+    CHECK_DBL(RT_NOCRT(atan2)(             +1.0,           +1.0), +M_PI_4);
+    CHECK_DBL(RT_NOCRT(atan2)(             -1.0,           -1.0), -M_PI_2 - M_PI_4);
+    CHECK_DBL_SAME(    atan2,(             +1.0,            0.0));
+    CHECK_DBL_SAME(    atan2,(             +1.0,           -0.0));
+    CHECK_DBL_SAME(    atan2,(             -1.0,            0.0));
+    CHECK_DBL_SAME(    atan2,(             -1.0,           -0.0));
+    CHECK_DBL_SAME(    atan2,(             +1.0,           +1.0));
+    CHECK_DBL_SAME(    atan2,(             -1.0,           +1.0));
+    CHECK_DBL_SAME(    atan2,(             +1.0,           -1.0));
+    CHECK_DBL_SAME(    atan2,(             -1.0,           -1.0));
+    CHECK_DBL_SAME(    atan2,(      238.6634566,      -999999.0));
+    CHECK_DBL_SAME(    atan2,(     -905698045.1,       490876.0));
+    CHECK_DBL_SAME(    atan2,(     1.333334e-10,   -1.9993e+200));
+    CHECK_DBL_SAME(    atan2,(    1.333334e+168,   -1.9993e+299));
+    CHECK_DBL_SAME(    atan2,(         +DBL_MAX,       +DBL_MAX));
+    CHECK_DBL_SAME(    atan2,(         -DBL_MAX,       +DBL_MAX));
+    CHECK_DBL_SAME(    atan2,(        +INFINITY,      +INFINITY));
+    CHECK_DBL_SAME(    atan2,(        -INFINITY,      +INFINITY));
+    CHECK_DBL_SAME(    atan2,(        -INFINITY,      42.242424));
+    CHECK_DBL_SAME(    atan2,(RTStrNanDouble(NULL, true), RTStrNanDouble(NULL, true)));
+    CHECK_DBL_SAME(    atan2,(RTStrNanDouble(NULL, false), RTStrNanDouble(NULL, false)));
+    CHECK_DBL_SAME(    atan2,(RTStrNanDouble(NULL, false), RTStrNanDouble(NULL, true)));
+    //CHECK_DBL_SAME(    atan2,(RTStrNanDouble(NULL, true), RTStrNanDouble(NULL, false))); - UCRT returns -QNaN, we +QNaN
+    CHECK_DBL_SAME(    atan2,(RTStrNanDouble(NULL, true), RTStrNanDouble("s", false)));
+    /* Single precision: same set of cases with float inputs. */
+    CHECK_FLT(RT_NOCRT(atan2f)(             +1.0f,            0.0f), (float)+M_PI_2);
+    CHECK_FLT(RT_NOCRT(atan2f)(             -1.0f,            0.0f), (float)-M_PI_2);
+    CHECK_FLT(RT_NOCRT(atan2f)(             +1.0f,           +1.0f), (float)+M_PI_4);
+    CHECK_FLT(RT_NOCRT(atan2f)(             -1.0f,           -1.0f), (float)(-M_PI_2 - M_PI_4));
+    CHECK_FLT_SAME(    atan2f,(             +1.0f,            0.0f));
+    CHECK_FLT_SAME(    atan2f,(             +1.0f,           -0.0f));
+    CHECK_FLT_SAME(    atan2f,(             -1.0f,            0.0f));
+    CHECK_FLT_SAME(    atan2f,(             -1.0f,           -0.0f));
+    CHECK_FLT_SAME(    atan2f,(             +1.0f,           +1.0f));
+    CHECK_FLT_SAME(    atan2f,(             -1.0f,           +1.0f));
+    CHECK_FLT_SAME(    atan2f,(             +1.0f,           -1.0f));
+    CHECK_FLT_SAME(    atan2f,(             -1.0f,           -1.0f));
+    CHECK_FLT_SAME(    atan2f,(      238.6634566f,      -999999.0f));
+    CHECK_FLT_SAME(    atan2f,(     -905698045.1f,       490876.0f));
+    CHECK_FLT_SAME(    atan2f,(     1.333334e-10f,    -1.9993e+20f));
+    CHECK_FLT_SAME(    atan2f,(     1.333334e+35f,    -1.9993e+29f));
+    CHECK_FLT_SAME(    atan2f,(          +FLT_MAX,        +FLT_MAX));
+    CHECK_FLT_SAME(    atan2f,(          -FLT_MAX,        +FLT_MAX));
+    CHECK_FLT_SAME(    atan2f,(         +INFINITY,       +INFINITY));
+    CHECK_FLT_SAME(    atan2f,(         -INFINITY,       +INFINITY));
+    CHECK_FLT_SAME(    atan2f,(         -INFINITY,      42.242424f));
+    CHECK_FLT_SAME(    atan2f,(RTStrNanFloat(NULL, true), RTStrNanFloat(NULL, true)));
+    CHECK_FLT_SAME(    atan2f,(RTStrNanFloat(NULL, false), RTStrNanFloat(NULL, false)));
+    CHECK_FLT_SAME(    atan2f,(RTStrNanFloat(NULL, false), RTStrNanFloat(NULL, true)));
+    //CHECK_FLT_SAME(    atan2f,(RTStrNanFloat(NULL, true), RTStrNanFloat(NULL, false))); - UCRT returns -QNaN, we +QNaN
+    CHECK_FLT_SAME(    atan2f,(RTStrNanFloat(NULL, true), RTStrNanFloat("s", false)));
+}
+void testSin()
+{
+    RTTestSub(g_hTest, "sin[f]");
+    /*
+     * Note! sin, cos and friends are complicated, so the results may differ
+     *       between implementations.  The numbers below were computed using
+     *       amd64 glibc (2.27-3ubuntu1.4) sinl() and a %.33Lf printf.
+     *
+     *       Our code is based on the x87 CPU and does not have the best
+     *       reduction code, so accuracy drops. Also, with the
+     *       input accuracy difference we must expect differences too.
+     */
+    CHECK_DBL(      RT_NOCRT(sin)(                          +0.0),                           +0.0);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -0.0),                           -0.0);
+    CHECK_DBL(      RT_NOCRT(sin)(                         +M_PI),                           +0.0);
+    CHECK_DBL(      RT_NOCRT(sin)(                         -M_PI),                           +0.0);
+    CHECK_DBL(      RT_NOCRT(sin)(                       +M_PI_2),                           +1.0);
+    CHECK_DBL(      RT_NOCRT(sin)(                       -M_PI_2),                           -1.0);
+    CHECK_DBL(      RT_NOCRT(sin)(              +M_PI_2 + M_PI*4),                           +1.0);
+    CHECK_DBL(      RT_NOCRT(sin)(              -M_PI_2 - M_PI*4),                           -1.0);
+    CHECK_DBL(      RT_NOCRT(sin)(              +M_PI_2 + M_PI*2),                           +1.0);
+    CHECK_DBL(      RT_NOCRT(sin)(              -M_PI_2 - M_PI*2),                           -1.0);
+    CHECK_DBL(      RT_NOCRT(sin)(                          +1.0),        +0.84147098480789650488);
+    CHECK_DBL(      RT_NOCRT(sin)(                          +2.0),        +0.90929742682568170942);
+    CHECK_DBL(      RT_NOCRT(sin)(                          +3.0),        +0.14112000805986721352);
+    CHECK_DBL(      RT_NOCRT(sin)(                          +4.0),        -0.75680249530792820245);
+    CHECK_DBL(      RT_NOCRT(sin)(                          +5.0),        -0.95892427466313845397);
+    CHECK_DBL(      RT_NOCRT(sin)(                          +6.0),        -0.27941549819892586015);
+    CHECK_DBL(      RT_NOCRT(sin)(                          +7.0),        +0.65698659871878906102);
+    CHECK_DBL(      RT_NOCRT(sin)(                          +8.0),        +0.98935824662338178737);
+    CHECK_DBL(      RT_NOCRT(sin)(                          +9.0),        +0.41211848524175659358);
+    CHECK_DBL(      RT_NOCRT(sin)(                         +10.0),        -0.54402111088936977445);
+    CHECK_DBL(      RT_NOCRT(sin)(                        +100.0),        -0.50636564110975879061);
+    CHECK_DBL(      RT_NOCRT(sin)(                +654.216812456),        +0.69292681127157818022);
+    CHECK_DBL(      RT_NOCRT(sin)(     10.1010101010101010101010),        -0.62585878258501614901);
+    CHECK_DBL(      RT_NOCRT(sin)(    +25.2525252525252525252525),        +0.11949778146891366915);
+    CHECK_DBL(      RT_NOCRT(sin)(   +252.2525252525252525252525),        +0.79868874455343841223);
+    CHECK_DBL(      RT_NOCRT(sin)(  +2525.2525252525252525252525),        -0.55467159842968405403);
+    CHECK_DBL_RANGE(RT_NOCRT(sin)( +25252.2525252525252525252525),        +0.13040325588994761130, 0.0000000000000010000);
+    CHECK_DBL_RANGE(RT_NOCRT(sin)(+252525.2525252525252525252525),        -0.77923047482990159818, 0.0000000000000100000);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -1.0),        -0.84147098480789650488);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -2.0),        -0.90929742682568170942);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -3.0),        -0.14112000805986721352);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -4.0),        +0.75680249530792820245);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -5.0),        +0.95892427466313845397);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -6.0),        +0.27941549819892586015);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -7.0),        -0.65698659871878906102);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -8.0),        -0.98935824662338178737);
+    CHECK_DBL(      RT_NOCRT(sin)(                          -9.0),        -0.41211848524175659358);
+    CHECK_DBL(      RT_NOCRT(sin)(                         -10.0),        +0.54402111088936977445);
+    CHECK_DBL(      RT_NOCRT(sin)(                        -100.0),        +0.50636564110975879061);
+    CHECK_DBL(      RT_NOCRT(sin)(                -654.216812456),        -0.69292681127157818022);
+    CHECK_DBL(      RT_NOCRT(sin)(    -10.1010101010101010101010),        +0.62585878258501614901);
+    CHECK_DBL(      RT_NOCRT(sin)(    -25.2525252525252525252525),        -0.11949778146891366915);
+    CHECK_DBL(      RT_NOCRT(sin)(   -252.2525252525252525252525),        -0.79868874455343841223);
+    CHECK_DBL(      RT_NOCRT(sin)(  -2525.2525252525252525252525),        +0.55467159842968405403);
+    CHECK_DBL_RANGE(RT_NOCRT(sin)( -25252.2525252525252525252525),        -0.13040325588994761130, 0.0000000000000010000);
+    CHECK_DBL_RANGE(RT_NOCRT(sin)(-252525.2525252525252525252525),        +0.77923047482990159818, 0.0000000000000100000);
+    CHECK_DBL(      RT_NOCRT(sin)(     RTStrNanDouble("s", true)),       RTStrNanDouble("s", true));
+    CHECK_DBL(      RT_NOCRT(sin)(RTStrNanDouble("9999s", false)),  RTStrNanDouble("9999s", false));
+    CHECK_DBL_SAME(    sin,(              1.0));
+    CHECK_DBL_SAME(    sin,(              1.5));
+    CHECK_DBL_SAME(    sin,(             +0.0));
+    CHECK_DBL_SAME(    sin,(             +0.0));
+    CHECK_DBL_SAME(    sin,(             -0.0));
+    CHECK_DBL_SAME(    sin,(             -0.0));
+    CHECK_DBL_SAME(    sin,(            -10.0));
+#if 0 /* UCRT returns tiny fractions for these in the 2**-53 range, we return 0.0 */
+    CHECK_DBL_SAME(    sin,(            +M_PI));
+    CHECK_DBL_SAME(    sin,(            -M_PI));
+#endif
+    CHECK_DBL_SAME(    sin,(          +M_PI_2));
+    CHECK_DBL_SAME(    sin,(          -M_PI_2));
+    CHECK_DBL_SAME(    sin,(        +INFINITY));
+    CHECK_DBL_SAME(    sin,(        -INFINITY));
+    CHECK_DBL_SAME(    sin,(RTStrNanDouble(NULL, true)));
+#if 0 /*UCRT converts these to quiet ones, we check above */
+    //CHECK_DBL_SAME(    sin,(RTStrNanDouble("s",  true)));
+    //CHECK_DBL_SAME(    sin,(RTStrNanDouble("s", false)));
+#endif
+}
+void testCos()
+{
+    RTTestSub(g_hTest, "cos[f]");
+    CHECK_DBL(RT_NOCRT(cos)(             +0.0),     1.0);
+    CHECK_DBL(          cos(             +0.0),     1.0);
+    CHECK_DBL(RT_NOCRT(cos)(            +M_PI),    -1.0);
+    CHECK_DBL(          cos(            +M_PI),    -1.0);
+    CHECK_DBL(RT_NOCRT(cos)(            -M_PI),    -1.0);
+    CHECK_DBL(          cos(            -M_PI),    -1.0);
+    CHECK_DBL(RT_NOCRT(cos)(          +M_PI_2),     0.0);
+    CHECK_DBL(          cos(          +M_PI_2),     0.0);
+    CHECK_DBL(RT_NOCRT(cos)(          -M_PI_2),     0.0);
+    CHECK_DBL(          cos(          -M_PI_2),     0.0);
+    CHECK_DBL(RT_NOCRT(cos)(             +1.0), +M_PI_4);
+    CHECK_DBL(          cos(             +1.0), +M_PI_4);
+    CHECK_DBL(RT_NOCRT(cos)(             -1.0), -M_PI_4);
+    CHECK_DBL(          cos(             -1.0), -M_PI_4);
+    CHECK_DBL_SAME(    cos,(              1.0));
+    CHECK_DBL_SAME(    cos,(              1.5));
+    CHECK_DBL_SAME(    cos,(             +0.0));
+    CHECK_DBL_SAME(    cos,(             +0.0));
+    CHECK_DBL_SAME(    cos,(             -0.0));
+    CHECK_DBL_SAME(    cos,(             -0.0));
+    CHECK_DBL_SAME(    cos,(            +M_PI));
+    CHECK_DBL_SAME(    cos,(            -M_PI));
+    CHECK_DBL_SAME(    cos,(          +M_PI_2));
+    CHECK_DBL_SAME(    cos,(          -M_PI_2));
+#if 0
+    CHECK_DBL_SAME(    cos,(      238.6634566));
+    CHECK_DBL_SAME(    cos,(      -49.4578999));
+    CHECK_DBL_SAME(    cos,(         999999.0));
+    CHECK_DBL_SAME(    cos,(        -999999.0));
+    CHECK_DBL_SAME(    cos,(        -999999.0));
+    CHECK_DBL_SAME(    cos,(         999999.0));
+    CHECK_DBL_SAME(    cos,(      39560.32334));
+    CHECK_DBL_SAME(    cos,(      39560.32334));
+    CHECK_DBL_SAME(    cos,(        +INFINITY));
+    CHECK_DBL_SAME(    cos,(        -INFINITY));
+    CHECK_DBL_SAME(    cos,(         +DBL_MAX));
+    CHECK_DBL_SAME(    cos,(         -DBL_MAX));
+    CHECK_DBL_SAME(    cos,(2.34960584706e100));
+    CHECK_DBL_SAME(    cos,(2.34960584706e300));
+    CHECK_DBL_SAME(    cos,(2.34960584706e300));
+    CHECK_DBL_SAME(    cos,(RTStrNanDouble(NULL, true)));
+    CHECK_DBL_SAME(    cos,(RTStrNanDouble("s",  true)));
+    CHECK_DBL_SAME(    cos,(RTStrNanDouble("s", false)));
+#endif
+}
 …
     testATan();
+    testATan2();
+    testSin();
+    //testCos();
 #if 0
-    ../common/math/atan2.asm \
-    ../common/math/atan2f.asm \
     ../common/math/cos.asm \
     ../common/math/cosf.asm \

Note: See TracChangeset for help on using the changeset viewer.