VirtualBox

Changeset 85624 in vbox for trunk/src/VBox/Runtime


Ignore:
Timestamp:
Aug 5, 2020 8:50:16 PM (4 years ago)
Author:
vboxsync
Message:

IPRT: Tweaking some performance out of the alternative SHA-3 implementation on gcc 10.2.1. bugref:9734

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/VBox/Runtime/common/checksum/alt-sha3.cpp

    r85614 r85624  
    3232#define RTSHA3_ROUNDS   24
    3333
     34/** @def RTSHA3_FULL_UNROLL
     35 * Do full loop unrolling unless we're using VS2019 as it seems to degrade
     36 * performance there for some reason.  With gcc 10.2.1 on a recent Intel system
     37 * (10890XE), this results in SHA3-512 throughput (tstRTDigest-2) increasing from
     38 * 83532 KiB/s to 194942 KiB/s against a text size jump from 5913 to 6929 bytes.
     39 *
     40 * For comparison, openssl 1.1.1g assembly code (AMD64) achieves 264915 KiB/s,
     41 * which is only 36% more.  Performance is more or less exactly the same as
     42 * KECCAK_2X without ROL optimizations (they improve it to 203493 KiB/s).
     43 */
     44#if !defined(_MSC_VER) || defined(DOXYGEN_RUNNING)
     45# define RTSHA3_FULL_UNROLL
     46#endif
     47
    3448
    3549/*********************************************************************************************************************************
     
    91105        {
    92106            /* Step 1: */
    93             uint64_t au64C[5];
    94             au64C[0] = pState->au64[0] ^ pState->au64[5] ^ pState->au64[10] ^ pState->au64[15] ^ pState->au64[20];
    95             au64C[1] = pState->au64[1] ^ pState->au64[6] ^ pState->au64[11] ^ pState->au64[16] ^ pState->au64[21];
    96             au64C[2] = pState->au64[2] ^ pState->au64[7] ^ pState->au64[12] ^ pState->au64[17] ^ pState->au64[22];
    97             au64C[3] = pState->au64[3] ^ pState->au64[8] ^ pState->au64[13] ^ pState->au64[18] ^ pState->au64[23];
    98             au64C[4] = pState->au64[4] ^ pState->au64[9] ^ pState->au64[14] ^ pState->au64[19] ^ pState->au64[24];
     107            const uint64_t au64C[5] =
     108            {
     109                pState->au64[0] ^ pState->au64[5] ^ pState->au64[10] ^ pState->au64[15] ^ pState->au64[20],
     110                pState->au64[1] ^ pState->au64[6] ^ pState->au64[11] ^ pState->au64[16] ^ pState->au64[21],
     111                pState->au64[2] ^ pState->au64[7] ^ pState->au64[12] ^ pState->au64[17] ^ pState->au64[22],
     112                pState->au64[3] ^ pState->au64[8] ^ pState->au64[13] ^ pState->au64[18] ^ pState->au64[23],
     113                pState->au64[4] ^ pState->au64[9] ^ pState->au64[14] ^ pState->au64[19] ^ pState->au64[24],
     114            };
    99115
    100116            /* Step 2 & 3: */
     117#ifndef RTSHA3_FULL_UNROLL
    101118            for (size_t i = 0; i < RT_ELEMENTS(au64C); i++)
    102119            {
     
    109126                pState->au64[20 + i] ^= u64D;
    110127            }
     128#else  /* RTSHA3_FULL_UNROLL */
     129# define THETA_STEP_2_3(a_i, a_idxCLeft, a_idxCRight) do { \
     130                uint64_t const u64D = au64C[a_idxCLeft] ^ ASMRotateLeftU64(au64C[a_idxCRight], 1); \
     131                pState->au64[ 0 + a_i] ^= u64D; \
     132                pState->au64[ 5 + a_i] ^= u64D; \
     133                pState->au64[10 + a_i] ^= u64D; \
     134                pState->au64[15 + a_i] ^= u64D; \
     135                pState->au64[20 + a_i] ^= u64D; \
     136            } while (0)
     137            THETA_STEP_2_3(0, 4, 1);
     138            THETA_STEP_2_3(1, 0, 2);
     139            THETA_STEP_2_3(2, 1, 3);
     140            THETA_STEP_2_3(3, 2, 4);
     141            THETA_STEP_2_3(4, 3, 0);
     142#endif /* RTSHA3_FULL_UNROLL */
    111143        }
    112144
     
    115147         */
    116148        {
     149#ifndef RTSHA3_FULL_UNROLL
    117150            static uint8_t const s_aidxState[] = {10,7,11,17,18,  3, 5,16, 8,21, 24, 4,15,23,19, 13,12, 2,20,14, 22, 9, 6, 1};
    118151            static uint8_t const s_acRotate[]  = { 1,3, 6,10,15, 21,28,36,45,55,  2,14,27,41,56,  8,25,43,62,18, 39,61,20,44};
    119152            AssertCompile(RT_ELEMENTS(s_aidxState) == 24); AssertCompile(RT_ELEMENTS(s_acRotate) == 24);
    120153            uint64_t u64 = pState->au64[1 /*s_aidxState[RT_ELEMENTS(s_aidxState) - 1]*/];
    121 #if 0 /* This is slower with VS2019. */
     154# if !defined(_MSC_VER) /* This is slower with VS2019 but slightly faster with g++ (10.2.1). */
    122155            for (size_t i = 0; i <= 23 - 1; i++) /*i=t*/
    123156            {
     
    128161            }
    129162            pState->au64[1 /*s_aidxState[23]*/] = ASMRotateLeftU64(u64, 44 /*s_acRotate[23]*/);
    130 #else
     163# else
    131164            for (size_t i = 0; i <= 23; i++) /*i=t*/
    132165            {
     
    136169                pState->au64[idxState] = u64Result;
    137170            }
    138 #endif
     171# endif
     172#else  /* RTSHA3_FULL_UNROLL */
     173# define RHO_AND_PI(a_idxState, a_cRotate) do { \
     174                uint64_t const u64Result = ASMRotateLeftU64(u64, a_cRotate); \
     175                u64 = pState->au64[a_idxState]; \
     176                pState->au64[a_idxState] = u64Result; \
     177            } while (0)
     178
     179            uint64_t u64 = pState->au64[1 /*s_aidxState[RT_ELEMENTS(s_aidxState) - 1]*/];
     180            RHO_AND_PI(10,  1);
     181            RHO_AND_PI( 7,  3);
     182            RHO_AND_PI(11,  6);
     183            RHO_AND_PI(17, 10);
     184            RHO_AND_PI(18, 15);
     185            RHO_AND_PI( 3, 21);
     186            RHO_AND_PI( 5, 28);
     187            RHO_AND_PI(16, 36);
     188            RHO_AND_PI( 8, 45);
     189            RHO_AND_PI(21, 55);
     190            RHO_AND_PI(24,  2);
     191            RHO_AND_PI( 4, 14);
     192            RHO_AND_PI(15, 27);
     193            RHO_AND_PI(23, 41);
     194            RHO_AND_PI(19, 56);
     195            RHO_AND_PI(13,  8);
     196            RHO_AND_PI(12, 25);
     197            RHO_AND_PI( 2, 43);
     198            RHO_AND_PI(20, 62);
     199            RHO_AND_PI(14, 18);
     200            RHO_AND_PI(22, 39);
     201            RHO_AND_PI( 9, 61);
     202            RHO_AND_PI( 6, 20);
     203            pState->au64[1 /*s_aidxState[23]*/] = ASMRotateLeftU64(u64, 44 /*s_acRotate[23]*/);
     204
     205#endif /* RTSHA3_FULL_UNROLL */
    139206        }
    140207
    141208        /*
    142          * 3.2.4 Chi
     209         * 3.2.4 Chi & 3.2.5 Iota.
    143210         */
     211        /* Iota values xor constants (indexed by round). */
     212        static uint64_t const s_au64RC[] =
     213        {
     214            UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082), UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000),
     215            UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
     216            UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088), UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a),
     217            UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b), UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
     218            UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080), UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a),
     219            UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008),
     220        };
     221        AssertCompile(RT_ELEMENTS(s_au64RC) == RTSHA3_ROUNDS);
     222#ifndef RTSHA3_FULL_UNROLL
     223        /* Chi */
    144224        for (size_t i = 0; i < 25; i += 5)
    145225        {
    146 #if 0 /* This is typically slower with VS2019. Go figure. */
     226# ifndef _MSC_VER /* This is typically slower with VS2019 - go figure.  Makes no difference with g++. */
    147227            uint64_t const u0 = pState->au64[i + 0];
    148228            uint64_t const u1 = pState->au64[i + 1];
     
    155235            pState->au64[i + 3] = u3 ^ (~u4 & u0);
    156236            pState->au64[i + 4] = u4 ^ (~u0 & u1);
    157 #else
     237# else
    158238            uint64_t const au64Tmp[] = { pState->au64[i + 0], pState->au64[i + 1], pState->au64[i + 2],
    159239                                         pState->au64[i + 3], pState->au64[i + 4] };
     
    163243            pState->au64[i + 3] ^= ~au64Tmp[4] & au64Tmp[0];
    164244            pState->au64[i + 4] ^= ~au64Tmp[0] & au64Tmp[1];
    165 #endif
     245# endif
    166246        }
    167247
    168         /*
    169          * 3.2.5 Iota.
    170          */
    171         static uint64_t const s_au64RC[] =
    172         {
    173             UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082), UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000),
    174             UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
    175             UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088), UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a),
    176             UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b), UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
    177             UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080), UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a),
    178             UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008),
    179         };
    180         AssertCompile(RT_ELEMENTS(s_au64RC) == RTSHA3_ROUNDS);
     248        /* Iota. */
    181249        pState->au64[0] ^= s_au64RC[idxRound];
     250
     251#else  /* RTSHA3_FULL_UNROLL */
     252# define CHI_AND_IOTA(a_i, a_IotaExpr) do { \
     253            uint64_t const u0 = pState->au64[a_i + 0]; \
     254            uint64_t const u1 = pState->au64[a_i + 1]; \
     255            uint64_t const u2 = pState->au64[a_i + 2]; \
     256            pState->au64[a_i + 0] = u0 ^ (~u1 & u2) a_IotaExpr; \
     257            uint64_t const u3 = pState->au64[a_i + 3]; \
     258            pState->au64[a_i + 1] = u1 ^ (~u2 & u3); \
     259            uint64_t const u4 = pState->au64[a_i + 4]; \
     260            pState->au64[a_i + 2] = u2 ^ (~u3 & u4); \
     261            pState->au64[a_i + 3] = u3 ^ (~u4 & u0); \
     262            pState->au64[a_i + 4] = u4 ^ (~u0 & u1); \
     263        } while (0)
     264        CHI_AND_IOTA( 0, ^ s_au64RC[idxRound]);
     265        CHI_AND_IOTA( 5, RT_NOTHING);
     266        CHI_AND_IOTA(10, RT_NOTHING);
     267        CHI_AND_IOTA(15, RT_NOTHING);
     268        CHI_AND_IOTA(20, RT_NOTHING);
     269#endif /* RTSHA3_FULL_UNROLL */
    182270    }
    183271
Note: See TracChangeset for help on using the changeset viewer.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette