- Timestamp:
- Aug 5, 2020 8:50:16 PM (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/checksum/alt-sha3.cpp
r85614 r85624 32 32 #define RTSHA3_ROUNDS 24 33 33 34 /** @def RTSHA3_FULL_UNROLL 35 * Do full loop unrolling unless we're using VS2019 as it seems to degrade 36 * performance there for some reason. With gcc 10.2.1 on a recent Intel system 37 * (10890XE), this results in SHA3-512 throughput (tstRTDigest-2) increasing from 38 * 83532 KiB/s to 194942 KiB/s against a text size jump from 5913 to 6929 bytes. 39 * 40 * For comparison, openssl 1.1.1g assembly code (AMD64) achieves 264915 KiB/s, 41 * which is only 36% more. Performance is more or less exactly the same as 42 * KECCAK_2X without ROL optimizations (they improve it to 203493 KiB/s). 43 */ 44 #if !defined(_MSC_VER) || defined(DOXYGEN_RUNNING) 45 # define RTSHA3_FULL_UNROLL 46 #endif 47 34 48 35 49 /********************************************************************************************************************************* … … 91 105 { 92 106 /* Step 1: */ 93 uint64_t au64C[5]; 94 au64C[0] = pState->au64[0] ^ pState->au64[5] ^ pState->au64[10] ^ pState->au64[15] ^ pState->au64[20]; 95 au64C[1] = pState->au64[1] ^ pState->au64[6] ^ pState->au64[11] ^ pState->au64[16] ^ pState->au64[21]; 96 au64C[2] = pState->au64[2] ^ pState->au64[7] ^ pState->au64[12] ^ pState->au64[17] ^ pState->au64[22]; 97 au64C[3] = pState->au64[3] ^ pState->au64[8] ^ pState->au64[13] ^ pState->au64[18] ^ pState->au64[23]; 98 au64C[4] = pState->au64[4] ^ pState->au64[9] ^ pState->au64[14] ^ pState->au64[19] ^ pState->au64[24]; 107 const uint64_t au64C[5] = 108 { 109 pState->au64[0] ^ pState->au64[5] ^ pState->au64[10] ^ pState->au64[15] ^ pState->au64[20], 110 pState->au64[1] ^ pState->au64[6] ^ pState->au64[11] ^ pState->au64[16] ^ pState->au64[21], 111 pState->au64[2] ^ pState->au64[7] ^ pState->au64[12] ^ pState->au64[17] ^ pState->au64[22], 112 pState->au64[3] ^ pState->au64[8] ^ pState->au64[13] ^ pState->au64[18] ^ pState->au64[23], 113 pState->au64[4] ^ pState->au64[9] ^ pState->au64[14] ^ pState->au64[19] ^ pState->au64[24], 114 }; 
99 115 100 116 /* Step 2 & 3: */ 117 #ifndef RTSHA3_FULL_UNROLL 101 118 for (size_t i = 0; i < RT_ELEMENTS(au64C); i++) 102 119 { … … 109 126 pState->au64[20 + i] ^= u64D; 110 127 } 128 #else /* RTSHA3_FULL_UNROLL */ 129 # define THETA_STEP_2_3(a_i, a_idxCLeft, a_idxCRight) do { \ 130 uint64_t const u64D = au64C[a_idxCLeft] ^ ASMRotateLeftU64(au64C[a_idxCRight], 1); \ 131 pState->au64[ 0 + a_i] ^= u64D; \ 132 pState->au64[ 5 + a_i] ^= u64D; \ 133 pState->au64[10 + a_i] ^= u64D; \ 134 pState->au64[15 + a_i] ^= u64D; \ 135 pState->au64[20 + a_i] ^= u64D; \ 136 } while (0) 137 THETA_STEP_2_3(0, 4, 1); 138 THETA_STEP_2_3(1, 0, 2); 139 THETA_STEP_2_3(2, 1, 3); 140 THETA_STEP_2_3(3, 2, 4); 141 THETA_STEP_2_3(4, 3, 0); 142 #endif /* RTSHA3_FULL_UNROLL */ 111 143 } 112 144 … … 115 147 */ 116 148 { 149 #ifndef RTSHA3_FULL_UNROLL 117 150 static uint8_t const s_aidxState[] = {10,7,11,17,18, 3, 5,16, 8,21, 24, 4,15,23,19, 13,12, 2,20,14, 22, 9, 6, 1}; 118 151 static uint8_t const s_acRotate[] = { 1,3, 6,10,15, 21,28,36,45,55, 2,14,27,41,56, 8,25,43,62,18, 39,61,20,44}; 119 152 AssertCompile(RT_ELEMENTS(s_aidxState) == 24); AssertCompile(RT_ELEMENTS(s_acRotate) == 24); 120 153 uint64_t u64 = pState->au64[1 /*s_aidxState[RT_ELEMENTS(s_aidxState) - 1]*/]; 121 # if 0 /* This is slower with VS2019. */154 # if !defined(_MSC_VER) /* This is slower with VS2019 but slightly faster with g++ (10.2.1). 
*/ 122 155 for (size_t i = 0; i <= 23 - 1; i++) /*i=t*/ 123 156 { … … 128 161 } 129 162 pState->au64[1 /*s_aidxState[23]*/] = ASMRotateLeftU64(u64, 44 /*s_acRotate[23]*/); 130 # else163 # else 131 164 for (size_t i = 0; i <= 23; i++) /*i=t*/ 132 165 { … … 136 169 pState->au64[idxState] = u64Result; 137 170 } 138 #endif 171 # endif 172 #else /* RTSHA3_FULL_UNROLL */ 173 # define RHO_AND_PI(a_idxState, a_cRotate) do { \ 174 uint64_t const u64Result = ASMRotateLeftU64(u64, a_cRotate); \ 175 u64 = pState->au64[a_idxState]; \ 176 pState->au64[a_idxState] = u64Result; \ 177 } while (0) 178 179 uint64_t u64 = pState->au64[1 /*s_aidxState[RT_ELEMENTS(s_aidxState) - 1]*/]; 180 RHO_AND_PI(10, 1); 181 RHO_AND_PI( 7, 3); 182 RHO_AND_PI(11, 6); 183 RHO_AND_PI(17, 10); 184 RHO_AND_PI(18, 15); 185 RHO_AND_PI( 3, 21); 186 RHO_AND_PI( 5, 28); 187 RHO_AND_PI(16, 36); 188 RHO_AND_PI( 8, 45); 189 RHO_AND_PI(21, 55); 190 RHO_AND_PI(24, 2); 191 RHO_AND_PI( 4, 14); 192 RHO_AND_PI(15, 27); 193 RHO_AND_PI(23, 41); 194 RHO_AND_PI(19, 56); 195 RHO_AND_PI(13, 8); 196 RHO_AND_PI(12, 25); 197 RHO_AND_PI( 2, 43); 198 RHO_AND_PI(20, 62); 199 RHO_AND_PI(14, 18); 200 RHO_AND_PI(22, 39); 201 RHO_AND_PI( 9, 61); 202 RHO_AND_PI( 6, 20); 203 pState->au64[1 /*s_aidxState[23]*/] = ASMRotateLeftU64(u64, 44 /*s_acRotate[23]*/); 204 205 #endif /* RTSHA3_FULL_UNROLL */ 139 206 } 140 207 141 208 /* 142 * 3.2.4 Chi 209 * 3.2.4 Chi & 3.2.5 Iota. 143 210 */ 211 /* Iota values xor constants (indexed by round). 
*/ 212 static uint64_t const s_au64RC[] = 213 { 214 UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082), UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000), 215 UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009), 216 UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088), UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a), 217 UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b), UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003), 218 UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080), UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a), 219 UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008), 220 }; 221 AssertCompile(RT_ELEMENTS(s_au64RC) == RTSHA3_ROUNDS); 222 #ifndef RTSHA3_FULL_UNROLL 223 /* Chi */ 144 224 for (size_t i = 0; i < 25; i += 5) 145 225 { 146 # if 0 /* This is typically slower with VS2019. Go figure. */226 # ifndef _MSC_VER /* This is typically slower with VS2019 - go figure. Makes no difference with g++. */ 147 227 uint64_t const u0 = pState->au64[i + 0]; 148 228 uint64_t const u1 = pState->au64[i + 1]; … … 155 235 pState->au64[i + 3] = u3 ^ (~u4 & u0); 156 236 pState->au64[i + 4] = u4 ^ (~u0 & u1); 157 # else237 # else 158 238 uint64_t const au64Tmp[] = { pState->au64[i + 0], pState->au64[i + 1], pState->au64[i + 2], 159 239 pState->au64[i + 3], pState->au64[i + 4] }; … … 163 243 pState->au64[i + 3] ^= ~au64Tmp[4] & au64Tmp[0]; 164 244 pState->au64[i + 4] ^= ~au64Tmp[0] & au64Tmp[1]; 165 # endif245 # endif 166 246 } 167 247 168 /* 169 * 3.2.5 Iota. 
170 */ 171 static uint64_t const s_au64RC[] = 172 { 173 UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082), UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000), 174 UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009), 175 UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088), UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a), 176 UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b), UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003), 177 UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080), UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a), 178 UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080), UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008), 179 }; 180 AssertCompile(RT_ELEMENTS(s_au64RC) == RTSHA3_ROUNDS); 248 /* Iota. */ 181 249 pState->au64[0] ^= s_au64RC[idxRound]; 250 251 #else /* RTSHA3_FULL_UNROLL */ 252 # define CHI_AND_IOTA(a_i, a_IotaExpr) do { \ 253 uint64_t const u0 = pState->au64[a_i + 0]; \ 254 uint64_t const u1 = pState->au64[a_i + 1]; \ 255 uint64_t const u2 = pState->au64[a_i + 2]; \ 256 pState->au64[a_i + 0] = u0 ^ (~u1 & u2) a_IotaExpr; \ 257 uint64_t const u3 = pState->au64[a_i + 3]; \ 258 pState->au64[a_i + 1] = u1 ^ (~u2 & u3); \ 259 uint64_t const u4 = pState->au64[a_i + 4]; \ 260 pState->au64[a_i + 2] = u2 ^ (~u3 & u4); \ 261 pState->au64[a_i + 3] = u3 ^ (~u4 & u0); \ 262 pState->au64[a_i + 4] = u4 ^ (~u0 & u1); \ 263 } while (0) 264 CHI_AND_IOTA( 0, ^ s_au64RC[idxRound]); 265 CHI_AND_IOTA( 5, RT_NOTHING); 266 CHI_AND_IOTA(10, RT_NOTHING); 267 CHI_AND_IOTA(15, RT_NOTHING); 268 CHI_AND_IOTA(20, RT_NOTHING); 269 #endif /* RTSHA3_FULL_UNROLL */ 182 270 } 183 271
Note:
See TracChangeset
for help on using the changeset viewer.