VirtualBox

Ignore:
Timestamp:
Jul 6, 2014 3:44:03 AM (11 years ago)
Author:
vboxsync
svn:sync-xref-src-repo-rev:
94743
Message:

alt-sha1.cpp: Unrolling the init code, increasing performance by some 10+%.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/VBox/Runtime/common/checksum/alt-sha1.cpp

    r51879 r51880  
    3131/** The SHA-1 block size (in bytes). */
    3232#define RTSHA1_BLOCK_SIZE   64U
     33
     34/** Enables the unrolled init code. */
     35#define RTSHA1_UNROLLED_INIT 1
     36/** Enables the fully unrolled block processing code. */
     37#define RTSHA1_FULLY_UNROLLED_BLOCK_PROCESSING 1
    3338
    3439
     
    8893DECLINLINE(void) rtSha1BlockInit(PRTSHA1CONTEXT pCtx, uint8_t const *pbBlock)
    8994{
     95#ifdef RTSHA1_UNROLLED_INIT
     96    uint32_t const *puSrc = (uint32_t const *)pbBlock;
     97    uint32_t       *puW   = &pCtx->AltPrivate.auW[0];
     98    Assert(!((uintptr_t)puSrc & 3));
     99    Assert(!((uintptr_t)puW & 3));
     100
     101    /* Copy and byte-swap the block. */
     102# ifdef RT_LITTLE_ENDIAN
     103    uint32_t  uS1;
     104    *puW++ = uS1 = ASMByteSwapU32(*puSrc++);
     105    uint32_t uS2;
     106    *puW++ = uS2 = ASMByteSwapU32(*puSrc++);
     107    *puW++ = ASMByteSwapU32(*puSrc++);
     108    *puW++ = ASMByteSwapU32(*puSrc++);
     109
     110    *puW++ = ASMByteSwapU32(*puSrc++);
     111    *puW++ = ASMByteSwapU32(*puSrc++);
     112    *puW++ = ASMByteSwapU32(*puSrc++);
     113    *puW++ = ASMByteSwapU32(*puSrc++);
     114
     115    *puW++ = ASMByteSwapU32(*puSrc++);
     116    *puW++ = ASMByteSwapU32(*puSrc++);
     117    *puW++ = ASMByteSwapU32(*puSrc++);
     118    *puW++ = ASMByteSwapU32(*puSrc++);
     119
     120    *puW++ = ASMByteSwapU32(*puSrc++);
     121    *puW++ = ASMByteSwapU32(*puSrc++);
     122    *puW++ = ASMByteSwapU32(*puSrc++);
     123    *puW++ = ASMByteSwapU32(*puSrc++);
     124# else
     125    memcpy(puW, puSrc, RTSHA1_BLOCK_SIZE);
     126    uint32_t uS1 = puW[-16];
     127    uint32_t uS2 = puW[-15];
     128# endif
     129
     130    /* Initialize W16...W79.*/
     131/** The uS1/uS2 trick here doesn't save much, but it might shave a little bit
     132 * off and we've got enough registers for it on AMD64. */
     133# define RTSHA1_HIGH_INIT_TWO() \
     134        do { \
     135            u32          = uS1; /*puW[-16];*/ \
     136            u32         ^= uS1 = puW[-14]; \
     137            u32         ^= puW[ -8]; \
     138            u32         ^= puW[ -3]; \
     139            *puW++ = ASMRotateLeftU32(u32, 1); \
     140            \
     141            u32          = uS2; /*puW[-16];*/ \
     142            u32         ^= uS2 = puW[-14]; \
     143            u32         ^= puW[ -8]; \
     144            u32         ^= puW[ -3]; \
     145            *puW++ = ASMRotateLeftU32(u32, 1); \
     146        } while (0)
     147# define RTSHA1_HIGH_INIT_EIGHT() \
     148        RTSHA1_HIGH_INIT_TWO(); RTSHA1_HIGH_INIT_TWO(); RTSHA1_HIGH_INIT_TWO(); RTSHA1_HIGH_INIT_TWO()
     149
     150/** This is a variation on the standard one which has some better alignment
     151 *  properties (no -3 access), but probably more importantly, accesses memory
     152 *  we've accessed before by going further back. */
     153# define RTSHA1_HIGH_INIT_ONE_HIGH() \
     154        do { \
     155            u32          = puW[-32]; \
     156            u32         ^= puW[-28]; \
     157            u32         ^= puW[-16]; \
     158            u32         ^= puW[ -6]; \
     159            *puW++ = ASMRotateLeftU32(u32, 2); \
     160        } while (0)
     161# define RTSHA1_HIGH_INIT_EIGHT_HIGH() \
     162        RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); \
     163        RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH()
     164
     165    uint32_t u32;
     166    RTSHA1_HIGH_INIT_EIGHT();
     167    RTSHA1_HIGH_INIT_EIGHT();
     168    RTSHA1_HIGH_INIT_EIGHT();
     169    RTSHA1_HIGH_INIT_EIGHT();
     170
     171    RTSHA1_HIGH_INIT_EIGHT_HIGH();
     172    RTSHA1_HIGH_INIT_EIGHT_HIGH();
     173    RTSHA1_HIGH_INIT_EIGHT_HIGH();
     174    RTSHA1_HIGH_INIT_EIGHT_HIGH();
     175
     176#else  /* !RTSHA1_UNROLLED_INIT */
    90177    uint32_t const *pu32Block = (uint32_t const *)pbBlock;
    91178    Assert(!((uintptr_t)pu32Block & 3));
     
    103190        pCtx->AltPrivate.auW[iWord] = ASMRotateLeftU32(u32, 1);
    104191    }
     192#endif /* !RTSHA1_UNROLLED_INIT */
    105193}
    106194
     
    113201DECLINLINE(void) rtSha1BlockInitBuffered(PRTSHA1CONTEXT pCtx)
    114202{
     203#ifdef RTSHA1_UNROLLED_INIT
     204    uint32_t       *puW   = &pCtx->AltPrivate.auW[0];
     205    Assert(!((uintptr_t)puW & 3));
     206
     207# ifdef RT_LITTLE_ENDIAN
     208    /* Do the byte swap. */
     209    uint32_t  uS1;
     210    *puW = uS1 = ASMByteSwapU32(*puW); puW++;
     211    uint32_t uS2;
     212    *puW = uS2 = ASMByteSwapU32(*puW); puW++;
     213    *puW = ASMByteSwapU32(*puW); puW++;
     214    *puW = ASMByteSwapU32(*puW); puW++;
     215
     216    *puW = ASMByteSwapU32(*puW); puW++;
     217    *puW = ASMByteSwapU32(*puW); puW++;
     218    *puW = ASMByteSwapU32(*puW); puW++;
     219    *puW = ASMByteSwapU32(*puW); puW++;
     220
     221    *puW = ASMByteSwapU32(*puW); puW++;
     222    *puW = ASMByteSwapU32(*puW); puW++;
     223    *puW = ASMByteSwapU32(*puW); puW++;
     224    *puW = ASMByteSwapU32(*puW); puW++;
     225
     226    *puW = ASMByteSwapU32(*puW); puW++;
     227    *puW = ASMByteSwapU32(*puW); puW++;
     228    *puW = ASMByteSwapU32(*puW); puW++;
     229    *puW = ASMByteSwapU32(*puW); puW++;
     230# else
     231    uint32_t uS1 = puW[-16];
     232    uint32_t uS2 = puW[-15];
     233# endif
     234
     235    /* Initialize W16...W79. */
     236    uint32_t u32;
     237    RTSHA1_HIGH_INIT_EIGHT();
     238    RTSHA1_HIGH_INIT_EIGHT();
     239    RTSHA1_HIGH_INIT_EIGHT();
     240    RTSHA1_HIGH_INIT_EIGHT();
     241
     242    RTSHA1_HIGH_INIT_EIGHT_HIGH();
     243    RTSHA1_HIGH_INIT_EIGHT_HIGH();
     244    RTSHA1_HIGH_INIT_EIGHT_HIGH();
     245    RTSHA1_HIGH_INIT_EIGHT_HIGH();
     246
     247#else  /* !RTSHA1_UNROLLED_INIT */
    115248    unsigned iWord;
    116249    for (iWord = 0; iWord < 16; iWord++)
     
    125258        pCtx->AltPrivate.auW[iWord] = ASMRotateLeftU32(u32, 1);
    126259    }
     260#endif /* !RTSHA1_UNROLLED_INIT */
    127261}
    128262
     
    192326    uint32_t uE = pCtx->AltPrivate.auH[4];
    193327
    194 #if 1 /* Fully unrolled version. */
    195     register uint32_t const *puW = &pCtx->AltPrivate.auW[0];
     328#ifdef RTSHA1_FULLY_UNROLLED_BLOCK_PROCESSING
     329    /* This fully unrolled version will avoid the variable rotation by
     330       embedding it into the loop unrolling. */
     331    uint32_t const *puW = &pCtx->AltPrivate.auW[0];
    196332# define SHA1_BODY(a_uW, a_uK, a_fnFt, a_uA, a_uB, a_uC, a_uD, a_uE) \
    197333        do { \
Note: See TracChangeset for help on using the changeset viewer.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette