Changeset 104173 in vbox for trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S
- Timestamp:
- Apr 5, 2024 9:38:49 AM (8 months ago)
- Files:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S
r103003 r104173 44 44 #endif 45 45 46 .macro BEGINPROC, a_Name 47 .private_extern NAME(\a_Name) 48 .globl NAME(\a_Name) 49 NAME(\a_Name): 50 .endm 51 52 53 .macro CALC_EFLAGS_PARITY, regEfl, regResult, regTmp 54 /* 55 * Parity calculation for low byte of the result (sucks that there is no popcount for gprs). 56 */ 57 eor \regTmp, \regResult, \regResult, LSR #4 58 eor \regTmp, \regTmp, \regTmp, LSR #2 59 eor \regTmp, \regTmp, \regTmp, LSR #1 60 eor \regTmp, \regTmp, #1 61 bfi \regEfl, \regTmp, #X86_EFL_PF_BIT, #1 /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */ 62 .endm 63 64 65 .macro CALC_EFLAGS_AUX_CARRY, regEfl, regResult, regLeft, regRight, regTmp 66 /* 67 * Auxilary carry / borrow flag. This is related to 8-bit BCD. 68 */ 69 eor \regTmp, \regLeft, \regRight 70 eor \regTmp, \regTmp, \regResult 71 lsr \regTmp, \regTmp, #X86_EFL_AF_BIT 72 bfi \regEfl, \regTmp, #X86_EFL_AF_BIT, #1 /* AF(4) = (w8 ^ w1 ^ w9 & X86_EFL_AF) >> X86_EFL_AF_BIT */ 73 .endm 46 74 47 75 .macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0 … … 311 339 ret 312 340 .cfi_endproc 341 342 343 344 /* 345 * Shift Left. 346 */ 347 348 /* void iemAImpl_shl_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */ 349 /* void iemAImpl_shl_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */ 350 /* void iemAImpl_shl_u32(uint16_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */ 351 .macro SHL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff 352 .p2align 2 353 BEGINPROC \a_Name 354 .cfi_startproc 355 356 /* Do we need to shift anything at all? */ 357 and w1, w1, #0x1f 358 cbz w1, 99f 359 360 /* 361 * Do the shifting 362 */ 363 ldr\a_LdStSuff w8, [x0] 364 .ifne \a_cBits < 32 365 lslv w9, w8, w1 366 .else 367 lslv x9, x8, x1 /* use 64-bit registers here so we get CF for free. We know x1 != 0. */ 368 .endif 369 str\a_LdStSuff w9, [x0] 370 371 /* 372 * Calculate EFLAGS. 
373 */ 374 ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */ 375 376 CALC_EFLAGS_PARITY w10, w9, w12 377 378 .ifne \a_cBits < 32 379 setf\a_cBits w9 /* Sets NZ */ 380 .else 381 ands wzr, w9, w9 /* Sets NZ */ 382 .endif 383 #if 1 384 mrs x11, NZCV 385 lsr w11, w11, #30 /* N=1; Z=0 */ 386 bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */ 387 #else 388 cset x11, eq 389 bfi w10, w11, X86_EFL_ZF_BIT, 1 390 cset x12, pl 391 bfi w10, w12, X86_EFL_SF_BIT, 1 392 #endif 393 394 .ifne \a_cBits < 32 395 bfxil w10, w9, #\a_cBits, #1 /* w9 bit 8/16 contains carry. (X86_EFL_CF_BIT == 0) */ 396 .else 397 bfxil x10, x9, #\a_cBits, #1 /* x9 bit 32 contains carry. (X86_EFL_CF_BIT == 0) */ 398 .endif 399 400 .ifne \a_fIntelFlags 401 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */ 402 eor w11, w8, w8, LSL #1 403 lsr w11, w11, #(\a_cBits - 1) 404 bfi w10, w11, #X86_EFL_OF_BIT, #1 405 406 and w10, w10, ~X86_EFL_AF /* AF is cleared */ 407 .else 408 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */ 409 .ifne \a_cBits < 32 410 eor w11, w9, w9, LSR #1 411 lsr w11, w11, #(\a_cBits - 1) 412 .else 413 eor x11, x9, x9, LSR #1 414 lsr x11, x11, #(\a_cBits - 1) 415 .endif 416 bfi w10, w11, #X86_EFL_OF_BIT, #1 417 418 orr w10, w10, X86_EFL_AF /* AF is set */ 419 .endif 420 421 str w10, [x2] 422 99: 423 ret 424 .cfi_endproc 425 .endm 426 427 SHL_8_16_32 iemAImpl_shl_u8, 8, 1, b 428 SHL_8_16_32 iemAImpl_shl_u8_intel, 8, 1, b 429 SHL_8_16_32 iemAImpl_shl_u8_amd, 8, 0, b 430 431 SHL_8_16_32 iemAImpl_shl_u16, 16, 1, h 432 SHL_8_16_32 iemAImpl_shl_u16_intel, 16, 1, h 433 SHL_8_16_32 iemAImpl_shl_u16_amd, 16, 0, h 434 435 SHL_8_16_32 iemAImpl_shl_u32, 32, 1, 436 SHL_8_16_32 iemAImpl_shl_u32_intel, 32, 1, 437 SHL_8_16_32 iemAImpl_shl_u32_amd, 32, 0, 438 439 ;; @todo this is slightly slower than the C version (release) on an M2. Investigate why. 
440 /* void iemAImpl_shl_u64(uint16_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */ 441 .macro SHL_64, a_Name, a_fIntelFlags 442 .p2align 2 443 BEGINPROC \a_Name 444 .cfi_startproc 445 446 /* Do we need to shift anything at all? */ 447 and w1, w1, #0x3f 448 cbz w1, 99f 449 450 /* 451 * Do the shifting 452 */ 453 ldr x8, [x0] 454 lslv x9, x8, x1 455 str x9, [x0] 456 457 /* 458 * Calculate EFLAGS. 459 */ 460 ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */ 461 462 CALC_EFLAGS_PARITY w10, w9, w11 463 464 ands xzr, x9, x9 /* Sets NZ */ 465 mrs x11, NZCV 466 lsr w11, w11, #30 /* N=1; Z=0 */ 467 bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */ 468 469 neg w11, w1 /* the shift count is MODed by the data size, so this is safe. */ 470 lsrv x11, x8, x11 471 bfi w10, w11, X86_EFL_CF_BIT, 1 472 473 .ifne \a_fIntelFlags 474 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */ 475 eor x11, x8, x8, LSL #1 476 lsr x11, x11, #63 477 bfi w10, w11, #X86_EFL_OF_BIT, #1 478 479 and w10, w10, ~X86_EFL_AF /* AF is cleared */ 480 .else 481 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */ 482 eor x11, x11, x9, LSR #63 /* w11[0]=CF from above */ 483 bfi w10, w11, #X86_EFL_OF_BIT, #1 484 485 orr w10, w10, X86_EFL_AF /* AF is set */ 486 .endif 487 str w10, [x2] 488 99: 489 ret 490 .cfi_endproc 491 .endm 492 493 SHL_64 iemAImpl_shl_u64, 1 494 SHL_64 iemAImpl_shl_u64_intel, 1 495 SHL_64 iemAImpl_shl_u64_amd, 0 496 497 498 /* 499 * Shift Right, Unsigned. 
500 */ 501 502 /* void iemAImpl_shr_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */ 503 /* void iemAImpl_shr_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */ 504 /* void iemAImpl_shr_u32(uint16_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */ 505 .macro shr_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff 506 .p2align 2 507 BEGINPROC \a_Name 508 .cfi_startproc 509 510 /* Do we need to shift anything at all? */ 511 and w1, w1, #0x1f 512 cbz w1, 99f 513 514 /* Load EFLAGS before we start the calculation. */ 515 ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */ 516 517 /* 518 * Do the shifting. 519 */ 520 ldr\a_LdStSuff w8, [x0] 521 lsrv w9, w8, w1 522 str\a_LdStSuff w9, [x0] 523 524 /* 525 * Calculate EFLAGS. 526 */ 527 sub w11, w1, #1 528 lsrv w11, w8, w11 529 bfxil w10, w11, #X86_EFL_CF_BIT, #1 530 531 .ifne \a_fIntelFlags 532 and w10, w10, ~X86_EFL_AF /* AF is cleared */ 533 /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */ 534 lsr w11, w8, #(\a_cBits - 1) 535 bfi w10, w11, #X86_EFL_OF_BIT, #1 536 .else 537 orr w10, w10, X86_EFL_AF /* AF is set */ 538 /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */ 539 lsr w11, w9, #(\a_cBits - 2) 540 bfi w10, w11, #X86_EFL_OF_BIT, #1 541 .endif 542 543 CALC_EFLAGS_PARITY w10, w9, w11 544 545 .ifne \a_cBits < 32 546 setf\a_cBits w9 /* Sets NZ */ 547 .else 548 ands wzr, w9, w9 /* Sets NZ */ 549 .endif 550 mrs x11, NZCV 551 lsr w11, w11, #30 /* N=1; Z=0 */ 552 bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */ 553 554 str w10, [x2] 555 99: 556 ret 557 .cfi_endproc 558 .endm 559 560 shr_8_16_32 iemAImpl_shr_u8, 8, 1, b 561 shr_8_16_32 iemAImpl_shr_u8_intel, 8, 1, b 562 shr_8_16_32 iemAImpl_shr_u8_amd, 8, 0, b 563 564 shr_8_16_32 iemAImpl_shr_u16, 16, 1, h 565 shr_8_16_32 iemAImpl_shr_u16_intel, 16, 1, h 566 shr_8_16_32 iemAImpl_shr_u16_amd, 16, 0, h 567 568 shr_8_16_32 iemAImpl_shr_u32, 32, 1, 569 shr_8_16_32 
iemAImpl_shr_u32_intel, 32, 1, 570 shr_8_16_32 iemAImpl_shr_u32_amd, 32, 0, 571 572 ;; @todo this is slightly slower than the C version (release) on an M2. Investigate why. 573 /* void iemAImpl_shr_u64(uint16_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */ 574 .macro shr_64, a_Name, a_fIntelFlags 575 .p2align 2 576 BEGINPROC \a_Name 577 .cfi_startproc 578 579 /* Do we need to shift anything at all? */ 580 ands w1, w1, #0x3f 581 b.eq 99f 582 583 /* Load EFLAGS before we start the calculation. */ 584 ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */ 585 586 /* 587 * Do the shifting 588 */ 589 ldr x8, [x0] 590 lsrv x9, x8, x1 591 str x9, [x0] 592 593 /* 594 * Calculate EFLAGS. 595 */ 596 sub w11, w1, #1 597 lsrv x11, x8, x11 598 bfxil w10, w11, #X86_EFL_CF_BIT, #1 599 600 .ifne \a_fIntelFlags 601 and w10, w10, ~X86_EFL_AF /* AF is cleared */ 602 /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */ 603 lsr x11, x8, #63 604 bfi w10, w11, #X86_EFL_OF_BIT, #1 605 .else 606 orr w10, w10, X86_EFL_AF /* AF is set */ 607 /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */ 608 lsr x11, x9, #62 609 bfi w10, w11, #X86_EFL_OF_BIT, #1 610 .endif 611 612 CALC_EFLAGS_PARITY w10, w9, w11 613 614 ands xzr, x9, x9 /* Sets NZ */ 615 mrs x11, NZCV 616 lsr w11, w11, #30 /* N=1; Z=0 */ 617 bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */ 618 619 str w10, [x2] 620 99: 621 ret 622 .cfi_endproc 623 .endm 624 625 shr_64 iemAImpl_shr_u64, 1 626 shr_64 iemAImpl_shr_u64_intel, 1 627 shr_64 iemAImpl_shr_u64_amd, 0 628 629 630 /* 631 * Shift Right, Signed 632 */ 633 634 /* void iemAImpl_sar_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */ 635 /* void iemAImpl_sar_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */ 636 /* void iemAImpl_sar_u32(uint16_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */ 637 .macro sar_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdSuff, a_StSuff 638 .p2align 2 
639 BEGINPROC \a_Name 640 .cfi_startproc 641 642 /* Do we need to shift anything at all? */ 643 and w1, w1, #0x1f 644 cbz w1, 99f 645 646 /* Load EFLAGS before we start the calculation. */ 647 ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */ 648 649 /* 650 * Do the shifting. 651 */ 652 ldr\a_LdSuff w8, [x0] /* Sign-extending for 8 and 16 bits! */ 653 asrv w9, w8, w1 654 str\a_StSuff w9, [x0] 655 656 /* 657 * Calculate EFLAGS. 658 */ 659 sub w11, w1, #1 660 lsrv w11, w8, w11 661 bfxil w10, w11, #X86_EFL_CF_BIT, #1 662 663 .ifne \a_fIntelFlags 664 mov w11, ~(X86_EFL_AF | X86_EFL_OF) 665 and w10, w10, w11 /* AF and OF are cleared */ 666 .else 667 orr w10, w10, X86_EFL_AF /* AF is set */ 668 and w10, w10, ~X86_EFL_OF /* OF is cleared */ 669 .endif 670 671 CALC_EFLAGS_PARITY w10, w9, w11 672 673 .ifne \a_cBits < 32 674 setf\a_cBits w9 /* Sets NZ */ 675 .else 676 ands wzr, w9, w9 /* Sets NZ */ 677 .endif 678 mrs x11, NZCV 679 lsr w11, w11, #30 /* N=1; Z=0 */ 680 bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */ 681 682 str w10, [x2] 683 99: 684 ret 685 .cfi_endproc 686 .endm 687 688 sar_8_16_32 iemAImpl_sar_u8, 8, 1, sb, b 689 sar_8_16_32 iemAImpl_sar_u8_intel, 8, 1, sb, b 690 sar_8_16_32 iemAImpl_sar_u8_amd, 8, 0, sb, b 691 692 sar_8_16_32 iemAImpl_sar_u16, 16, 1, sh, h 693 sar_8_16_32 iemAImpl_sar_u16_intel, 16, 1, sh, h 694 sar_8_16_32 iemAImpl_sar_u16_amd, 16, 0, sh, h 695 696 sar_8_16_32 iemAImpl_sar_u32, 32, 1, , 697 sar_8_16_32 iemAImpl_sar_u32_intel, 32, 1, , 698 sar_8_16_32 iemAImpl_sar_u32_amd, 32, 0, , 699 700 ;; @todo this is slightly slower than the C version (release) on an M2. Investigate why. 701 /* void iemAImpl_sar_u64(uint16_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */ 702 .macro sar_64, a_Name, a_fIntelFlags 703 .p2align 2 704 BEGINPROC \a_Name 705 .cfi_startproc 706 707 /* Do we need to shift anything at all? */ 708 ands w1, w1, #0x3f 709 b.eq 99f 710 711 /* Load EFLAGS before we start the calculation. 
*/ 712 ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */ 713 714 /* 715 * Do the shifting 716 */ 717 ldr x8, [x0] 718 asrv x9, x8, x1 719 str x9, [x0] 720 721 /* 722 * Calculate EFLAGS. 723 */ 724 sub w11, w1, #1 725 lsrv x11, x8, x11 726 bfxil w10, w11, #X86_EFL_CF_BIT, #1 727 728 .ifne \a_fIntelFlags 729 mov w11, ~(X86_EFL_AF | X86_EFL_OF) 730 and w10, w10, w11 /* AF and OF are cleared */ 731 .else 732 orr w10, w10, X86_EFL_AF /* AF is set */ 733 and w10, w10, ~X86_EFL_OF /* OF is cleared */ 734 .endif 735 736 CALC_EFLAGS_PARITY w10, w9, w11 737 738 ands xzr, x9, x9 /* Sets NZ */ 739 mrs x11, NZCV 740 lsr w11, w11, #30 /* N=1; Z=0 */ 741 bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */ 742 743 str w10, [x2] 744 99: 745 ret 746 .cfi_endproc 747 .endm 748 749 sar_64 iemAImpl_sar_u64, 1 750 sar_64 iemAImpl_sar_u64_intel, 1 751 sar_64 iemAImpl_sar_u64_amd, 0 752
Note: See TracChangeset for help on using the changeset viewer.