VirtualBox

Timestamp:
Jul 6, 2014 1:07:55 PM
Author:
vboxsync
Message:

alt-sha1: applied the same W16-W63-construction-while-we-process-the-block optimization as I did for alt-sha256, gaining another 15-20 on win.amd64. Removed alternative optimizations.
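
The optimization referenced here folds the expansion of the SHA-1 message schedule (W16 through W79; for SHA-256 the corresponding range is W16 through W63) into the round processing itself, instead of pre-computing all of the Ws in the block-init functions. The rolled-up sketch below only illustrates that idea and is not the VirtualBox code: the names (Sha1CompressSketch, Rotl32) are hypothetical, and the actual change expresses the same thing through the fully unrolled SHA1_BODY macro so the "word index < 16" test is resolved at compile time.

    #include <stdint.h>

    /* Plain-C stand-in for ASMRotateLeftU32. */
    static inline uint32_t Rotl32(uint32_t x, unsigned c)
    {
        return (x << c) | (x >> (32 - c));
    }

    /* One SHA-1 block with the schedule words W16..W79 derived inside the
       round loop ("construction while we process the block") rather than in
       a separate expansion pass.  auW[0..15] must already hold the block as
       big-endian-decoded 32-bit words; auW[16..79] are filled on the fly. */
    void Sha1CompressSketch(uint32_t auH[5], uint32_t auW[80])
    {
        uint32_t uA = auH[0], uB = auH[1], uC = auH[2], uD = auH[3], uE = auH[4];

        for (unsigned t = 0; t < 80; t++)
        {
            /* Lazy schedule expansion for rounds 16..79. */
            if (t >= 16)
                auW[t] = Rotl32(auW[t - 3] ^ auW[t - 8] ^ auW[t - 14] ^ auW[t - 16], 1);

            /* Standard FIPS 180-4 round function and constant selection. */
            uint32_t uF, uK;
            if (t < 20)      { uF = (uB & uC) | (~uB & uD);            uK = UINT32_C(0x5a827999); }
            else if (t < 40) { uF = uB ^ uC ^ uD;                      uK = UINT32_C(0x6ed9eba1); }
            else if (t < 60) { uF = (uB & uC) | (uB & uD) | (uC & uD); uK = UINT32_C(0x8f1bbcdc); }
            else             { uF = uB ^ uC ^ uD;                      uK = UINT32_C(0xca62c1d6); }

            uint32_t uTmp = Rotl32(uA, 5) + uF + uE + uK + auW[t];
            uE = uD;
            uD = uC;
            uC = Rotl32(uB, 30);
            uB = uA;
            uA = uTmp;
        }

        auH[0] += uA; auH[1] += uB; auH[2] += uC; auH[3] += uD; auH[4] += uE;
    }

The win presumably comes from dropping the separate expansion pass over auW and consuming each schedule word right after it is produced, while it is still hot.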

File:
1 edited

Legend:

  Unmodified: no prefix
  Added: prefixed with "+"
  Removed: prefixed with "-"
  • trunk/src/VBox/Runtime/common/checksum/alt-sha1.cpp

--- trunk/src/VBox/Runtime/common/checksum/alt-sha1.cpp (r51880)
+++ trunk/src/VBox/Runtime/common/checksum/alt-sha1.cpp (r51882)

@@ -32 +32 @@
 #define RTSHA1_BLOCK_SIZE   64U
 
-/** Enables the unrolled init code. */
-#define RTSHA1_UNROLLED_INIT 1
-/** Enables the fully unrolled block processing code. */
-#define RTSHA1_FULLY_UNROLLED_BLOCK_PROCESSING 1
+/** Enables the unrolled code. */
+#define RTSHA1_UNROLLED 1
 
 

@@ -93 +91 @@
 DECLINLINE(void) rtSha1BlockInit(PRTSHA1CONTEXT pCtx, uint8_t const *pbBlock)
 {
-#ifdef RTSHA1_UNROLLED_INIT
+#ifdef RTSHA1_UNROLLED
     uint32_t const *puSrc = (uint32_t const *)pbBlock;
     uint32_t       *puW   = &pCtx->AltPrivate.auW[0];

@@ -99 +97 @@
     Assert(!((uintptr_t)puW & 3));
 
-    /* Copy and byte-swap the block. */
+    /* Copy and byte-swap the block. Initializing the rest of the Ws are done
+       in the processing loop. */
 # ifdef RT_LITTLE_ENDIAN
-    uint32_t  uS1;
-    *puW++ = uS1 = ASMByteSwapU32(*puSrc++);
-    uint32_t uS2;
-    *puW++ = uS2 = ASMByteSwapU32(*puSrc++);
+    *puW++ = ASMByteSwapU32(*puSrc++);
+    *puW++ = ASMByteSwapU32(*puSrc++);
     *puW++ = ASMByteSwapU32(*puSrc++);
     *puW++ = ASMByteSwapU32(*puSrc++);

@@ -124 +121 @@
 # else
     memcpy(puW, puSrc, RTSHA1_BLOCK_SIZE);
-    uint32_t uS1 = puW[-16];
-    uint32_t uS2 = puW[-15];
 # endif
 
-    /* Initialize W16...W79.*/
-/** The uS1/uS2 trick here doesn't save much, but it might shave a little bit
- * off and we've got enough registers for it on AMD64. */
-# define RTSHA1_HIGH_INIT_TWO() \
-        do { \
-            u32          = uS1; /*puW[-16];*/ \
-            u32         ^= uS1 = puW[-14]; \
-            u32         ^= puW[ -8]; \
-            u32         ^= puW[ -3]; \
-            *puW++ = ASMRotateLeftU32(u32, 1); \
-            \
-            u32          = uS2; /*puW[-16];*/ \
-            u32         ^= uS2 = puW[-14]; \
-            u32         ^= puW[ -8]; \
-            u32         ^= puW[ -3]; \
-            *puW++ = ASMRotateLeftU32(u32, 1); \
-        } while (0)
-# define RTSHA1_HIGH_INIT_EIGHT() \
-        RTSHA1_HIGH_INIT_TWO(); RTSHA1_HIGH_INIT_TWO(); RTSHA1_HIGH_INIT_TWO(); RTSHA1_HIGH_INIT_TWO()
-
-/** This is a variation on the standard one which have some better alignment
- *  properties (no -3 access), but probably more importantly, access memory
- *  we've accessed before by going futher back. */
-# define RTSHA1_HIGH_INIT_ONE_HIGH() \
-        do { \
-            u32          = puW[-32]; \
-            u32         ^= puW[-28]; \
-            u32         ^= puW[-16]; \
-            u32         ^= puW[ -6]; \
-            *puW++ = ASMRotateLeftU32(u32, 2); \
-        } while (0)
-# define RTSHA1_HIGH_INIT_EIGHT_HIGH() \
-        RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); \
-        RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH(); RTSHA1_HIGH_INIT_ONE_HIGH()
-
-    uint32_t u32;
-    RTSHA1_HIGH_INIT_EIGHT();
-    RTSHA1_HIGH_INIT_EIGHT();
-    RTSHA1_HIGH_INIT_EIGHT();
-    RTSHA1_HIGH_INIT_EIGHT();
-
-    RTSHA1_HIGH_INIT_EIGHT_HIGH();
-    RTSHA1_HIGH_INIT_EIGHT_HIGH();
-    RTSHA1_HIGH_INIT_EIGHT_HIGH();
-    RTSHA1_HIGH_INIT_EIGHT_HIGH();
-
-#else  /* !RTSHA1_UNROLLED_INIT */
+#else  /* !RTSHA1_UNROLLED */
     uint32_t const *pu32Block = (uint32_t const *)pbBlock;
     Assert(!((uintptr_t)pu32Block & 3));

@@ -190 +139 @@
         pCtx->AltPrivate.auW[iWord] = ASMRotateLeftU32(u32, 1);
     }
-#endif /* !RTSHA1_UNROLLED_INIT */
+#endif /* !RTSHA1_UNROLLED */
 }
 

@@ -201 +150 @@
 DECLINLINE(void) rtSha1BlockInitBuffered(PRTSHA1CONTEXT pCtx)
 {
-#ifdef RTSHA1_UNROLLED_INIT
+#ifdef RTSHA1_UNROLLED
     uint32_t       *puW   = &pCtx->AltPrivate.auW[0];
     Assert(!((uintptr_t)puW & 3));
 
+    /* Do the byte swap if necessary. Initializing the rest of the Ws are done
+       in the processing loop. */
 # ifdef RT_LITTLE_ENDIAN
-    /* Do the byte swap. */
-    uint32_t  uS1;
-    *puW = uS1 = ASMByteSwapU32(*puW); puW++;
-    uint32_t uS2;
-    *puW = uS2 = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-    *puW = ASMByteSwapU32(*puW); puW++;
-# else
-    uint32_t uS1 = puW[-16];
-    uint32_t uS2 = puW[-15];
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
+    *puW = ASMByteSwapU32(*puW); puW++;
 # endif
-
-    /* Initialize W16...W79. */
-    uint32_t u32;
-    RTSHA1_HIGH_INIT_EIGHT();
-    RTSHA1_HIGH_INIT_EIGHT();
-    RTSHA1_HIGH_INIT_EIGHT();
-    RTSHA1_HIGH_INIT_EIGHT();
-
-    RTSHA1_HIGH_INIT_EIGHT_HIGH();
-    RTSHA1_HIGH_INIT_EIGHT_HIGH();
-    RTSHA1_HIGH_INIT_EIGHT_HIGH();
-    RTSHA1_HIGH_INIT_EIGHT_HIGH();
 
 #else  /* !RTSHA1_UNROLLED_INIT */

@@ -326 +259 @@
     uint32_t uE = pCtx->AltPrivate.auH[4];
 
-#ifdef RTSHA1_FULLY_UNROLLED_BLOCK_PROCESSING
+#ifdef RTSHA1_UNROLLED
     /* This fully unrolled version will avoid the variable rotation by
       embedding it into the loop unrolling. */
-    uint32_t const *puW = &pCtx->AltPrivate.auW[0];
-# define SHA1_BODY(a_uW, a_uK, a_fnFt, a_uA, a_uB, a_uC, a_uD, a_uE) \
+    uint32_t *puW = &pCtx->AltPrivate.auW[0];
+# define SHA1_BODY(a_iWord, a_uK, a_fnFt, a_uA, a_uB, a_uC, a_uD, a_uE) \
         do { \
-            a_uE += a_uW; \
+            if (a_iWord < 16) \
+                a_uE += *puW++; \
+            else \
+            { \
+                uint32_t u32 = puW[-16]; \
+                u32         ^= puW[-14]; \
+                u32         ^= puW[-8]; \
+                u32         ^= puW[-3]; \
+                u32 = ASMRotateLeftU32(u32, 1); \
+                *puW++ = u32; \
+                a_uE += u32; \
+            } \
             a_uE += (a_uK); \
             a_uE += ASMRotateLeftU32(a_uA, 5); \

@@ -338 +282 @@
             a_uB = ASMRotateLeftU32(a_uB, 30); \
         } while (0)
-# define FIVE_ITERATIONS(a_iStart, a_uK, a_fnFt) \
+# define FIVE_ITERATIONS(a_iFirst, a_uK, a_fnFt) \
     do { \
-        SHA1_BODY(/*puW[a_iStart + 0]*/ *puW++, a_uK, a_fnFt, uA, uB, uC, uD, uE); \
-        SHA1_BODY(/*puW[a_iStart + 1]*/ *puW++, a_uK, a_fnFt, uE, uA, uB, uC, uD); \
-        SHA1_BODY(/*puW[a_iStart + 2]*/ *puW++, a_uK, a_fnFt, uD, uE, uA, uB, uC); \
-        SHA1_BODY(/*puW[a_iStart + 3]*/ *puW++, a_uK, a_fnFt, uC, uD, uE, uA, uB); \
-        SHA1_BODY(/*puW[a_iStart + 4]*/ *puW++, a_uK, a_fnFt, uB, uC, uD, uE, uA); \
+        SHA1_BODY(a_iFirst + 0, a_uK, a_fnFt, uA, uB, uC, uD, uE); \
+        SHA1_BODY(a_iFirst + 1, a_uK, a_fnFt, uE, uA, uB, uC, uD); \
+        SHA1_BODY(a_iFirst + 2, a_uK, a_fnFt, uD, uE, uA, uB, uC); \
+        SHA1_BODY(a_iFirst + 3, a_uK, a_fnFt, uC, uD, uE, uA, uB); \
+        SHA1_BODY(a_iFirst + 4, a_uK, a_fnFt, uB, uC, uD, uE, uA); \
     } while (0)
-# if 0 /* Variation that reduces the code size by a factor of 4 without much loss in preformance. */
-#  define TWENTY_ITERATIONS(a_iFirst, a_uK, a_fnFt) \
-    do { unsigned i = 4; while (i-- > 0) FIVE_ITERATIONS(a_iFirst + (3 - i) * 5, a_uK, a_fnFt); } while (0)
-    /*for (unsigned i = a_iFirst; i < (a_iFirst + 20); i += 5) FIVE_ITERATIONS(i, a_uK, a_fnFt);*/
-# else
-#  define TWENTY_ITERATIONS(a_iFirst, a_uK, a_fnFt) \
+# define TWENTY_ITERATIONS(a_iStart, a_uK, a_fnFt) \
     do { \
-        FIVE_ITERATIONS(a_iFirst +  0, a_uK, a_fnFt); \
-        FIVE_ITERATIONS(a_iFirst +  5, a_uK, a_fnFt); \
-        FIVE_ITERATIONS(a_iFirst + 10, a_uK, a_fnFt); \
-        FIVE_ITERATIONS(a_iFirst + 15, a_uK, a_fnFt); \
+        FIVE_ITERATIONS(a_iStart +  0, a_uK, a_fnFt); \
+        FIVE_ITERATIONS(a_iStart +  5, a_uK, a_fnFt); \
+        FIVE_ITERATIONS(a_iStart + 10, a_uK, a_fnFt); \
+        FIVE_ITERATIONS(a_iStart + 15, a_uK, a_fnFt); \
     } while (0)
-# endif
+
     TWENTY_ITERATIONS( 0, UINT32_C(0x5a827999), rtSha1Ch);
     TWENTY_ITERATIONS(20, UINT32_C(0x6ed9eba1), rtSha1Parity);

@@ -364 +303 @@
     TWENTY_ITERATIONS(60, UINT32_C(0xca62c1d6), rtSha1Parity);
 
-#elif 0 /* Version avoiding the constant selection. */
+#elif 1 /* Version avoiding the constant selection. */
     unsigned iWord = 0;
 # define TWENTY_ITERATIONS(a_iWordStop, a_uK, a_uExprBCD) \
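
A note on the unrolled round macros above: the "variable rotation" that the in-code comment says is avoided is the e=d, d=c, c=rotl(b,30), b=a, a=T shuffle of the five working variables at the end of every SHA-1 round. FIVE_ITERATIONS instead passes uA..uE to SHA1_BODY in a rotated order on each of its five expansions, so every value can stay in the same register; per round only the 'e' slot accumulates and the 'b' slot gets rotated, and nothing is copied around. The following is a minimal, self-contained sketch of that pattern (hypothetical names, parity rounds only; not the changeset's code):

    #include <stdint.h>

    static inline uint32_t Rotl32(uint32_t x, unsigned c)
    {
        return (x << c) | (x >> (32 - c));
    }

    static inline uint32_t Parity(uint32_t b, uint32_t c, uint32_t d)
    {
        return b ^ c ^ d;
    }

    /* Round body: only 'e' accumulates and 'b' is rotated; the a->b->c->d->e
       renaming happens by permuting the arguments at the call sites below. */
    #define ROUND(uW, uK, fnF, a, b, c, d, e) \
        do { \
            (e) += Rotl32((a), 5) + fnF((b), (c), (d)) + (uW) + (uK); \
            (b)  = Rotl32((b), 30); \
        } while (0)

    void FiveParityRoundsSketch(uint32_t s[5], uint32_t const w[5])
    {
        uint32_t uA = s[0], uB = s[1], uC = s[2], uD = s[3], uE = s[4];
        uint32_t const uK = UINT32_C(0x6ed9eba1);

        ROUND(w[0], uK, Parity, uA, uB, uC, uD, uE);  /* uE picks up the new 'a' */
        ROUND(w[1], uK, Parity, uE, uA, uB, uC, uD);  /* roles rotated by one */
        ROUND(w[2], uK, Parity, uD, uE, uA, uB, uC);
        ROUND(w[3], uK, Parity, uC, uD, uE, uA, uB);
        ROUND(w[4], uK, Parity, uB, uC, uD, uE, uA);  /* naming lines up again */

        s[0] = uA; s[1] = uB; s[2] = uC; s[3] = uD; s[4] = uE;
    }

After five rounds the variables are back in their original roles, and the round constant and mixing function change every twenty rounds, which matches the FIVE_ITERATIONS / TWENTY_ITERATIONS grouping in the diff.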