Changeset 96240 in vbox for trunk/src/VBox/Runtime/common/math
- Timestamp:
- Aug 17, 2022 12:59:31 AM (3 years ago)
- svn:sync-xref-src-repo-rev:
- 153052
- Location:
- trunk/src/VBox/Runtime/common/math
- Files:
-
- 1 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/math/cos.asm
r96060 r96240 25 25 ; 26 26 27 28 %define RT_ASM_WITH_SEH64 27 29 %include "iprt/asmdefs.mac" 30 %include "iprt/x86.mac" 31 28 32 29 33 BEGINCODE 30 34 31 35 ;; 32 ; compute the cosine of dr, measured in radians. 33 ; @returns st(0) / xmm0 36 ; Compute the cosine of rd, measured in radians. 37 ; 38 ; @returns st(0) / xmm0 34 39 ; @param rd [rbp + xCB*2] / xmm0 40 ; 35 41 RT_NOCRT_BEGINPROC cos 36 push xBP 37 mov xBP, xSP 42 push xBP 43 SEH64_PUSH_xBP 44 mov xBP, xSP 45 SEH64_SET_FRAME_xBP 0 46 sub xSP, 20h 47 SEH64_ALLOCATE_STACK 20h 48 SEH64_END_PROLOGUE 49 50 %ifdef RT_OS_WINDOWS 51 ; 52 ; Make sure we use full precision and not the windows default of 53 bits. 53 ; 54 ;; @todo not sure if this makes any difference... 55 fnstcw [xBP - 20h] 56 mov ax, [xBP - 20h] 57 or ax, X86_FCW_PC_64 ; includes both bits, so no need to clear the mask. 58 mov [xBP - 1ch], ax 59 fldcw [xBP - 1ch] 60 %endif 61 62 ; 63 ; Load the input into st0. 64 ; 38 65 %ifdef RT_ARCH_AMD64 39 sub xSP, 10h 40 41 movsd [xSP], xmm0 42 fld qword [xSP] 66 movsd [xBP - 10h], xmm0 67 fld qword [xBP - 10h] 43 68 %else 44 fld qword [xBP + xCB*2] 45 %endif 46 fcos 47 fnstsw ax 48 test ah, 4 49 jz .done 50 51 fldpi 52 fadd st0, st0 53 fxch st1 54 .again: 55 fprem1 56 fnstsw ax 57 test ah, 4 58 jnz .again 59 60 fstp st0 61 fcos 62 63 .done: 69 fld qword [xBP + xCB*2] 70 %endif 71 72 ; 73 ; The FCOS instruction has a very narrow range (-3pi/8 to 3pi/8) where it 74 ; works reliably, so outside that we'll use the FSIN instruction instead 75 ; as it has a larger good range (-5pi/4 to 1pi/4 for cosine). 76 ; Input conversion follows: cos(x) = sin(x + pi/2) 77 ; 78 ; We examin the input and weed out non-finit numbers first. 79 ; 80 81 ; We only do the range check on normal finite numbers. 82 fxam 83 fnstsw ax 84 and ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0 85 cmp ax, X86_FSW_C2 ; Normal finite number (excluding zero) 86 je .finite 87 cmp ax, X86_FSW_C3 ; Zero 88 je .zero 89 cmp ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero. 90 je .zero 91 cmp ax, X86_FSW_C0 ; NaN - must handle it special, 92 je .nan 93 94 ; Pass infinities and unsupported inputs to fcos, assuming it does the right thing. 95 ; We also jump here if we get a finite number in the "good" range, see below. 96 .do_fcos: 97 fcos 98 jmp .return_val 99 100 ; 101 ; Finite number. 102 ; 103 ; First check if it's a very tiny number where we can simply return 1. 104 ; Next check if it's in the range where FCOS is reasonable, otherwise 105 ; go to FSIN to do the work. 106 ; 107 .finite: 108 fld st0 109 fabs 110 fld qword [.s_r64TinyCosTo1 xWrtRIP] 111 fcomip st1 112 jbe .zero_extra_pop 113 114 .not_that_tiny_input: 115 fld qword [.s_r64FCosOkay xWrtRIP] 116 fcomip st1 117 ffreep st0 ; pop fabs(input) 118 ja .do_fcos ; jmp if fabs(input) < .s_r64FCosOkay 119 120 ; 121 ; If we have a positive number we subtract 3pi/2, for negative we add pi/2. 122 ; We still have the FXAM result in AX. 123 ; 124 .outside_fcos_range: 125 test ax, X86_FSW_C1 ; The sign bit. 126 jnz .adjust_negative_to_sine 127 128 ; Calc -3pi/2 using FPU-internal pi constant. 129 fldpi 130 fadd st0, st0 ; st0=2pi 131 fldpi 132 fdiv qword [.s_r64Two xWrtRIP] ; st1=2pi; st0=pi/2 133 fsubp st1, st0 ; st0=3pi/2 134 fchs ; st0=-3pi/2 135 jmp .make_sine_adjustment 136 137 .adjust_negative_to_sine: 138 ; Calc +pi/2. 139 fldpi 140 fdiv qword [.s_r64Two xWrtRIP] ; st1=2pi; st0=pi/2 141 142 .make_sine_adjustment: 143 faddp st1, st0 144 145 ; 146 ; Call internal sine worker to calculate st0=sin(st0) 147 ; 148 .do_sine: 149 mov ecx, 1 ; double 150 extern NAME(rtNoCrtMathSinCore) 151 call NAME(rtNoCrtMathSinCore) 152 153 ; 154 ; Return st0. 155 ; 156 .return_val: 64 157 %ifdef RT_ARCH_AMD64 65 fstp qword [xSP] 66 movsd xmm0, [xSP] 67 %endif 68 leave 69 ret 158 fstp qword [xBP - 10h] 159 movsd xmm0, [xBP - 10h] 160 %endif 161 %ifdef RT_OS_WINDOWS 162 fldcw [xBP - 20h] ; restore original 163 %endif 164 .return: 165 leave 166 ret 167 168 ; 169 ; cos(+/-0) = +1.0 170 ; 171 .zero_extra_pop: 172 ffreep st0 173 .zero: 174 ffreep st0 175 fld1 176 jmp .return_val 177 178 ; 179 ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN 180 ; to QNaN when masked). 181 ; 182 .nan: 183 %ifdef RT_ARCH_AMD64 184 ffreep st0 185 %endif 186 jmp .return 187 188 ; 189 ; Local constants. 190 ; 191 ALIGNCODE(8) 192 ; About 2**-27. When fabs(input) is below this limit we can consider cos(input) ~= 1.0. 193 .s_r64TinyCosTo1: 194 dq 7.4505806e-9 195 196 ; The absolute limit for the range which FCOS is expected to produce reasonable results. 197 .s_r64FCosOkay: 198 dq 1.1780972450961724644225 ; 3*pi/8 199 200 .s_r64Two: 201 dq 2.0 70 202 ENDPROC RT_NOCRT(cos) 71 203 -
trunk/src/VBox/Runtime/common/math/sin.asm
r96060 r96240 25 25 ; 26 26 27 28 %define RT_ASM_WITH_SEH64 27 29 %include "iprt/asmdefs.mac" 30 %include "iprt/x86.mac" 31 28 32 29 33 BEGINCODE 30 34 31 35 ;; 32 ; Compute the sine of rd 33 ; @returns st(0)/xmm0 34 ; @param rd [xSP + xCB*2] / xmm0 36 ; Internal sine and cosine worker that calculates the sine of st0 returning 37 ; it in st0. 38 ; 39 ; When called by a sine function, fabs(st0) >= pi/2. 40 ; When called by a cosine function, fabs(original input value) >= 3pi/8. 41 ; 42 ; That the input isn't a tiny number close to zero, means that we can do a bit 43 ; cruder rounding when operating close to a pi/2 boundrary. The value in the 44 ; ecx register indicates the input precision and controls the crudeness of the 45 ; rounding. 46 ; 47 ; @returns st0 = sine 48 ; @param st0 A finite number to calucate sine of. 49 ; @param ecx Set to 0 if original input was a 32-bit float. 50 ; Set to 1 if original input was a 64-bit double. 51 ; set to 2 if original input was a 80-bit long double. 52 ; 53 BEGINPROC rtNoCrtMathSinCore 54 push xBP 55 SEH64_PUSH_xBP 56 mov xBP, xSP 57 SEH64_SET_FRAME_xBP 0 58 SEH64_END_PROLOGUE 59 60 ; 61 ; Load the pointer to the rounding crudeness factor into xDX. 62 ; 63 lea xDX, [.s_ar64NearZero xWrtRIP] 64 lea xDX, [xDX + xCX * xCB] 65 66 ; 67 ; Finite number. We want it in the range [0,2pi] and will preform 68 ; a remainder division if it isn't. 69 ; 70 fcom qword [.s_r64Max xWrtRIP] ; compares st0 and 2*pi 71 fnstsw ax 72 test ax, X86_FSW_C3 | X86_FSW_C0 | X86_FSW_C2 ; C3 := st0 == mem; C0 := st0 < mem; C2 := unordered (should be the case); 73 jz .reduce_st0 ; Jump if st0 > mem 74 75 fcom qword [.s_r64Min xWrtRIP] ; compares st0 and 0.0 76 fnstsw ax 77 test ax, X86_FSW_C3 | X86_FSW_C0 78 jnz .reduce_st0 ; Jump if st0 <= mem 79 80 ; 81 ; We get here if st0 is in the [0,2pi] range. 82 ; 83 ; Now, FSIN is documented to be reasonably accurate for the range 84 ; -3pi/4 to +3pi/4, so we have to make some more effort to calculate 85 ; in that range only. 86 ; 87 .in_range: 88 ; if (st0 < pi) 89 fldpi 90 fcom st1 ; compares st0 (pi) with st1 (the normalized value) 91 fnstsw ax 92 test ax, X86_FSW_C0 ; st1 > pi 93 jnz .larger_than_pi 94 test ax, X86_FSW_C3 95 jnz .equals_pi 96 97 ; 98 ; input in the range [0,pi[ 99 ; 100 .smaller_than_pi: 101 fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2 102 103 ; if (st0 < pi/2) 104 fcom st1 ; compares st0 (pi/2) with st1 105 fnstsw ax 106 test ax, X86_FSW_C0 ; st1 > pi 107 jnz .between_half_pi_and_pi 108 test ax, X86_FSW_C3 109 jnz .equals_half_pi 110 111 ; 112 ; The value is between zero and half pi, including the zero value. 113 ; 114 ; This is in range where FSIN works reasonably reliably. So drop the 115 ; half pi in st0 and do the calculation. 116 ; 117 .between_zero_and_half_pi: 118 ; Check if we're so close to pi/2 that it makes no difference. 119 fsub st0, st1 ; st0 = pi/2 - st1 120 fcom qword [xDX] 121 fnstsw ax 122 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. 123 jnz .equals_half_pi 124 ffreep st0 125 126 ; Check if we're so close to zero that it makes no difference given the 127 ; internal accuracy of the FPU. 128 fcom qword [xDX] 129 fnstsw ax 130 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. 131 jnz .equals_zero_popped_one 132 133 ; Ok, calculate sine. 134 fsin 135 jmp .return 136 137 ; 138 ; The value is in the range ]pi/2,pi[ 139 ; 140 ; This is outside the comfortable FSIN range, but if we subtract PI and 141 ; move to the ]-pi/2,0[ range we just have to change the sign to get 142 ; the value we want. 143 ; 144 .between_half_pi_and_pi: 145 ; Check if we're so close to pi/2 that it makes no difference. 146 fsubr st0, st1 ; st0 = st1 - st0 147 fcom qword [xDX] 148 fnstsw ax 149 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. 150 jnz .equals_half_pi 151 ffreep st0 152 153 ; Check if we're so close to pi that it makes no difference. 154 fldpi 155 fsub st0, st1 ; st0 = st0 - st1 156 fcom qword [xDX] 157 fnstsw ax 158 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. 159 jnz .equals_pi 160 ffreep st0 161 162 ; Ok, transform the value and calculate sine. 163 fldpi 164 fsubp st1, st0 165 166 fsin 167 fchs 168 jmp .return 169 170 ; 171 ; input in the range ]pi,2pi[ 172 ; 173 .larger_than_pi: 174 fsub st1, st0 ; st1 -= pi 175 fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2 176 177 ; if (st0 < pi/2) 178 fcom st1 ; compares st0 (pi/2) with reduced st1 179 fnstsw ax 180 test ax, X86_FSW_C0 ; st1 > pi 181 jnz .between_3_half_pi_and_2pi 182 test ax, X86_FSW_C3 183 jnz .equals_3_half_pi 184 185 ; 186 ; The value is in the the range: ]pi,3pi/2[ 187 ; 188 ; The actual st0 is in the range ]pi,pi/2[ where FSIN is performing okay 189 ; and we can get the desired result by changing the sign (-FSIN). 190 ; 191 .between_pi_and_3_half_pi: 192 ; Check if we're so close to pi/2 that it makes no difference. 193 fsub st0, st1 ; st0 = pi/2 - st1 194 fcom qword [xDX] 195 fnstsw ax 196 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. 197 jnz .equals_3_half_pi 198 ffreep st0 199 200 ; Check if we're so close to zero that it makes no difference given the 201 ; internal accuracy of the FPU. 202 fcom qword [xDX] 203 fnstsw ax 204 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. 205 jnz .equals_pi_popped 206 207 ; Ok, calculate sine and flip the sign. 208 fsin 209 fchs 210 jmp .return 211 212 ; 213 ; The value is in the last pi/2 of the range: ]3pi/2,2pi[ 214 ; 215 ; Since FSIN should work reasonably well for ]-pi/2,pi], we can just 216 ; subtract pi again (we subtracted pi at .larger_than_pi above) and 217 ; run FSIN on it. (st1 is currently in the range ]pi/2,pi[.) 218 ; 219 .between_3_half_pi_and_2pi: 220 ; Check if we're so close to pi/2 that it makes no difference. 221 fsubr st0, st1 ; st0 = st1 - st0 222 fcom qword [xDX] 223 fnstsw ax 224 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. 225 jnz .equals_3_half_pi 226 ffreep st0 227 228 ; Check if we're so close to pi that it makes no difference. 229 fldpi 230 fsub st0, st1 ; st0 = st0 - st1 231 fcom qword [xDX] 232 fnstsw ax 233 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. 234 jnz .equals_2pi 235 ffreep st0 236 237 ; Ok, adjust input and calculate sine. 238 fldpi 239 fsubp st1, st0 240 fsin 241 jmp .return 242 243 ; 244 ; sin(0) = 0 245 ; sin(pi) = 0 246 ; 247 .equals_zero: 248 .equals_pi: 249 .equals_2pi: 250 ffreep st0 251 .equals_zero_popped_one: 252 .equals_pi_popped: 253 ffreep st0 254 fldz 255 jmp .return 256 257 ; 258 ; sin(pi/2) = 1 259 ; 260 .equals_half_pi: 261 ffreep st0 262 ffreep st0 263 fld1 264 jmp .return 265 266 ; 267 ; sin(3*pi/2) = -1 268 ; 269 .equals_3_half_pi: 270 ffreep st0 271 ffreep st0 272 fld1 273 fchs 274 jmp .return 275 276 ; 277 ; Return. 278 ; 279 .return: 280 leave 281 ret 282 283 ; 284 ; Reduce st0 by reminder division by PI*2. The result should be positive here. 285 ; 286 ;; @todo this is one of our weak spots (really any calculation involving PI is). 287 .reduce_st0: 288 fldpi 289 fadd st0, st0 290 fxch st1 ; st0=input (dividend) st1=2pi (divisor) 291 .again: 292 fprem1 293 fnstsw ax 294 test ah, (X86_FSW_C2 >> 8) ; C2 is set if partial result. 295 jnz .again ; Loop till C2 == 0 and we have a final result. 296 297 ; 298 ; Make sure the result is positive. 299 ; 300 fxam 301 fnstsw ax 302 test ax, X86_FSW_C1 ; The sign bit 303 jz .reduced_to_positive 304 305 fadd st0, st1 ; st0 += 2pi, which should make it positive 306 307 %ifdef RT_STRICT 308 fxam 309 fnstsw ax 310 test ax, X86_FSW_C1 311 jz .reduced_to_positive 312 int3 313 %endif 314 315 .reduced_to_positive: 316 fstp st1 ; Get rid of the 2pi value. 317 jmp .in_range 318 319 ALIGNCODE(8) 320 .s_r64Max: 321 dq +6.28318530717958647692 ; 2*pi 322 .s_r64Min: 323 dq 0.0 324 .s_r64Two: 325 dq 2.0 326 ;; 327 ; Close to 2/pi rounding limits for 32-bit, 64-bit and 80-bit floating point operations. 328 ; Given that the original input is at least +/-3pi/8 (1.178) and that precision of the 329 ; PI constant used during reduction/whatever, I think we can round to a whole pi/2 330 ; step when we get close enough. 331 ; 332 ; Look to RTFLOAT64U for the format details, but 52 is the shift for the exponent field 333 ; and 1023 is the exponent bias. Since the format uses an implied 1 in the mantissa, 334 ; we only have to set the exponent to get a valid number. 335 ; 336 .s_ar64NearZero: 337 dq (-18 + 1023) << 52 ; float / 32-bit / single precision input 338 dq (-40 + 1023) << 52 ; double / 64-bit / double precision input 339 dq (-52 + 1023) << 52 ; long double / 80-bit / extended precision input 340 ENDPROC rtNoCrtMathSinCore 341 342 343 ;; 344 ; Compute the sine of rd, measured in radians. 345 ; 346 ; @returns st(0) / xmm0 347 ; @param rd [rbp + xCB*2] / xmm0 348 ; 35 349 RT_NOCRT_BEGINPROC sin 36 push xBP 37 mov xBP, xSP 38 350 push xBP 351 SEH64_PUSH_xBP 352 mov xBP, xSP 353 SEH64_SET_FRAME_xBP 0 354 sub xSP, 20h 355 SEH64_ALLOCATE_STACK 20h 356 SEH64_END_PROLOGUE 357 358 %ifdef RT_OS_WINDOWS 359 ; 360 ; Make sure we use full precision and not the windows default of 53 bits. 361 ; 362 fnstcw [xBP - 20h] 363 mov ax, [xBP - 20h] 364 or ax, X86_FCW_PC_64 ; includes both bits, so no need to clear the mask. 365 mov [xBP - 1ch], ax 366 fldcw [xBP - 1ch] 367 %endif 368 369 ; 370 ; Load the input into st0. 371 ; 39 372 %ifdef RT_ARCH_AMD64 40 sub xSP, 10h 41 42 movsd [xSP], xmm0 43 fld qword [xSP] 373 movsd [xBP - 10h], xmm0 374 fld qword [xBP - 10h] 44 375 %else 45 fld qword [xBP + xCB*2] 46 %endif 47 fsin 48 fnstsw ax 49 test ah, 04h 50 jz .done 51 52 fldpi 53 fadd st0 54 fxch st1 55 .again: 56 fprem1 57 fnstsw ax 58 test ah, 04h 59 jnz .again 60 fstp st1 61 fsin 62 63 .done: 376 fld qword [xBP + xCB*2] 377 %endif 378 379 ; 380 ; We examin the input and weed out non-finit numbers first. 381 ; 382 fxam 383 fnstsw ax 384 and ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0 385 cmp ax, X86_FSW_C2 ; Normal finite number (excluding zero) 386 je .finite 387 cmp ax, X86_FSW_C3 ; Zero 388 je .zero 389 cmp ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero. 390 je .zero 391 cmp ax, X86_FSW_C0 ; NaN - must handle it special, 392 je .nan 393 394 ; Pass infinities and unsupported inputs to fsin, assuming it does the right thing. 395 .do_sin: 396 fsin 397 jmp .return_val 398 399 ; 400 ; Finite number. 401 ; 402 .finite: 403 ; For very tiny numbers, 0 < abs(input) < 2**-25, we can return the 404 ; input value directly. 405 fld st0 ; duplicate st0 406 fabs ; make it an absolute (positive) value. 407 fld qword [.s_r64Tiny xWrtRIP] 408 fcomip st1 ; compare s_r64Tiny and fabs(input) 409 ja .return_tiny_number_as_is ; jump if fabs(input) is smaller 410 411 ; FSIN is documented to be reasonable for the range ]-3pi/4,3pi/4[, so 412 ; while we have fabs(input) loaded already, check for that here and 413 ; allow rtNoCrtMathSinCore to assume it won't see values very close to 414 ; zero, except by cos -> sin conversion where they won't be relevant to 415 ; any assumpttions about precision approximation. 416 fld qword [.s_r64FSinOkay xWrtRIP] 417 fcomip st1 418 ffreep st0 ; drop the fabs(input) value 419 ja .do_sin 420 421 ; 422 ; Call common sine/cos worker. 423 ; 424 mov ecx, 1 ; double 425 extern NAME(rtNoCrtMathSinCore) 426 call NAME(rtNoCrtMathSinCore) 427 428 ; 429 ; Run st0. 430 ; 431 .return_val: 64 432 %ifdef RT_ARCH_AMD64 65 fstp qword [xSP] 66 movsd xmm0, [xSP] 67 %endif 68 leave 69 ret 433 fstp qword [xBP - 10h] 434 movsd xmm0, [xBP - 10h] 435 %endif 436 %ifdef RT_OS_WINDOWS 437 fldcw [xBP - 20h] ; restore original 438 %endif 439 .return: 440 leave 441 ret 442 443 ; 444 ; As explained already, we can return tiny numbers directly too as the 445 ; output from sin(input) = input given our precision. 446 ; We can skip the st0 -> xmm0 translation here, so follow the same path 447 ; as .zero & .nan, after we've removed the fabs(input) value. 448 ; 449 .return_tiny_number_as_is: 450 ffreep st0 451 452 ; 453 ; sin(+/-0.0) = +/-0.0 (preserve the sign) 454 ; We can skip the st0 -> xmm0 translation here, so follow the .nan code path. 455 ; 456 .zero: 457 458 ; 459 ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN 460 ; to QNaN when masked). 461 ; 462 .nan: 463 %ifdef RT_ARCH_AMD64 464 ffreep st0 465 %endif 466 jmp .return 467 468 ALIGNCODE(8) 469 ; Ca. 2**-26, absolute value. Inputs closer to zero than this can be 470 ; returns directly as the sin(input) value should be basically the same 471 ; given the precision we're working with and FSIN probably won't even 472 ; manage that. 473 ;; @todo experiment when FSIN gets better than this. 474 .s_r64Tiny: 475 dq 1.49011612e-8 476 ; The absolute limit of FSIN "good" range. 477 .s_r64FSinOkay: 478 dq 2.356194490192344928845 ; 3pi/4 479 ;dq 1.57079632679489661923 ; pi/2 - alternative. 480 70 481 ENDPROC RT_NOCRT(sin) 71 482
Note:
See TracChangeset
for help on using the changeset viewer.