VirtualBox

Ignore:
Timestamp:
Jul 5, 2014 8:23:47 PM (11 years ago)
Author:
vboxsync
Message:

alt-sha1.cpp: Unrolled the block processing code, getting a ~25% speed increase on windows/amd64.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/VBox/Runtime/common/checksum/alt-sha1.cpp

    r51861 r51878  
    128128
    129129
     130/** Function 4.1, Ch(x,y,z). */
     131DECL_FORCE_INLINE(uint32_t) rtSha1Ch(uint32_t uX, uint32_t uY, uint32_t uZ)
     132{
     133    uint32_t uResult = uX & uY;
     134    uResult ^= ~uX & uZ;
     135    return uResult;
     136}
     137
     138
     139/** Function 4.1, Parity(x,y,z). */
     140DECL_FORCE_INLINE(uint32_t) rtSha1Parity(uint32_t uX, uint32_t uY, uint32_t uZ)
     141{
     142    uint32_t uResult = uX;
     143    uResult ^= uY;
     144    uResult ^= uZ;
     145    return uResult;
     146}
     147
     148
     149/** Function 4.1, Maj(x,y,z). */
     150DECL_FORCE_INLINE(uint32_t) rtSha1Maj(uint32_t uX, uint32_t uY, uint32_t uZ)
     151{
     152    uint32_t uResult = (uX & uY);
     153    uResult |= (uX & uZ);
     154    uResult |= (uY & uZ);
     155    return uResult;
     156}
     157
     158
    130159/**
    131160 * Process the current block.
     
    143172    uint32_t uE = pCtx->AltPrivate.auH[4];
    144173
    145 #if 1
     174#if 1 /* Fully unrolled version. */
     175    register uint32_t const *puW = &pCtx->AltPrivate.auW[0];
     176# define SHA1_BODY(a_uW, a_uK, a_fnFt, a_uA, a_uB, a_uC, a_uD, a_uE) \
     177        do { \
     178            a_uE += a_uW; \
     179            a_uE += (a_uK); \
     180            a_uE += ASMRotateLeftU32(a_uA, 5); \
     181            a_uE += a_fnFt(a_uB, a_uC, a_uD); \
     182            a_uB = ASMRotateLeftU32(a_uB, 30); \
     183        } while (0)
     184# define FIVE_ITERATIONS(a_iStart, a_uK, a_fnFt) \
     185    do { \
     186        SHA1_BODY(/*puW[a_iStart + 0]*/ *puW++, a_uK, a_fnFt, uA, uB, uC, uD, uE); \
     187        SHA1_BODY(/*puW[a_iStart + 1]*/ *puW++, a_uK, a_fnFt, uE, uA, uB, uC, uD); \
     188        SHA1_BODY(/*puW[a_iStart + 2]*/ *puW++, a_uK, a_fnFt, uD, uE, uA, uB, uC); \
     189        SHA1_BODY(/*puW[a_iStart + 3]*/ *puW++, a_uK, a_fnFt, uC, uD, uE, uA, uB); \
     190        SHA1_BODY(/*puW[a_iStart + 4]*/ *puW++, a_uK, a_fnFt, uB, uC, uD, uE, uA); \
     191    } while (0)
     192# if 0 /* Variation that reduces the code size by a factor of 4 without much loss in performance. */
     193#  define TWENTY_ITERATIONS(a_iFirst, a_uK, a_fnFt) \
     194    do { unsigned i = 4; while (i-- > 0) FIVE_ITERATIONS(a_iFirst + (3 - i) * 5, a_uK, a_fnFt); } while (0)
     195    /*for (unsigned i = a_iFirst; i < (a_iFirst + 20); i += 5) FIVE_ITERATIONS(i, a_uK, a_fnFt);*/
     196# else
     197#  define TWENTY_ITERATIONS(a_iFirst, a_uK, a_fnFt) \
     198    do { \
     199        FIVE_ITERATIONS(a_iFirst +  0, a_uK, a_fnFt); \
     200        FIVE_ITERATIONS(a_iFirst +  5, a_uK, a_fnFt); \
     201        FIVE_ITERATIONS(a_iFirst + 10, a_uK, a_fnFt); \
     202        FIVE_ITERATIONS(a_iFirst + 15, a_uK, a_fnFt); \
     203    } while (0)
     204# endif
     205    TWENTY_ITERATIONS( 0, UINT32_C(0x5a827999), rtSha1Ch);
     206    TWENTY_ITERATIONS(20, UINT32_C(0x6ed9eba1), rtSha1Parity);
     207    TWENTY_ITERATIONS(40, UINT32_C(0x8f1bbcdc), rtSha1Maj);
     208    TWENTY_ITERATIONS(60, UINT32_C(0xca62c1d6), rtSha1Parity);
     209
     210#elif 0 /* Version avoiding the constant selection. */
    146211    unsigned iWord = 0;
    147212# define TWENTY_ITERATIONS(a_iWordStop, a_uK, a_uExprBCD) \
     
    160225            uA = uTemp; \
    161226        } do { } while (0)
    162     TWENTY_ITERATIONS(20, UINT32_C(0x5a827999), (uB & uC) | (~uB & uD));
    163     TWENTY_ITERATIONS(40, UINT32_C(0x6ed9eba1), uB ^ uC ^ uD);
    164     TWENTY_ITERATIONS(60, UINT32_C(0x8f1bbcdc), (uB & uC) | (uB & uD) | (uC & uD));
    165     TWENTY_ITERATIONS(80, UINT32_C(0xca62c1d6), uB ^ uC ^ uD);
    166 #else
     227    TWENTY_ITERATIONS(20, UINT32_C(0x5a827999), rtSha1Ch(uB, uC, uD));
     228    TWENTY_ITERATIONS(40, UINT32_C(0x6ed9eba1), rtSha1Parity(uB, uC, uD));
     229    TWENTY_ITERATIONS(60, UINT32_C(0x8f1bbcdc), rtSha1Maj(uB, uC, uD));
     230    TWENTY_ITERATIONS(80, UINT32_C(0xca62c1d6), rtSha1Parity(uB, uC, uD));
     231
     232#else /* Dead simple implementation. */
    167233    for (unsigned iWord = 0; iWord < RT_ELEMENTS(pCtx->AltPrivate.auW); iWord++)
    168234    {
Note: See TracChangeset for help on using the changeset viewer.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette