#!/usr/bin/env perl
# Copyright 2017-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX-512F.
#
# July 2017.
#
# Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# Pretty straightforward, the only "magic" is data layout in registers.
# It's impossible to have one that is optimal for every step, hence
# it's changing as algorithm progresses. Data is saved in linear order,
# but in-register order morphs between rounds. Even rounds take in
# linear layout, and odd rounds - transposed, or "vertically-shaped"...
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# Knights Landing	7.6
# Skylake-X		5.7
#
# (*)	Corresponds to SHA3-256.

########################################################################
# Below code is combination of two ideas. One is taken from Keccak Code
# Package, hereafter KCP, and another one from initial version of this
# module. What is common is observation that Pi's input and output are
# "mostly transposed", i.e. if input is aligned by x coordinate, then
# output is [mostly] aligned by y. Both versions, KCP and predecessor,
# were trying to use one of them from round to round, which resulted in
# some kind of transposition in each round. This version still does
# transpose data, but only every second round. Another essential factor
# is that KCP transposition has to be performed with instructions that
# turned out to be rather expensive on Knights Landing, both latency- and
# throughput-wise. Not to mention that some of them have to depend on
# each other. On the other hand initial version of this module was
# relying heavily on blend instructions. There were lots of them,
# resulting in higher instruction count, yet it performed better on
# Knights Landing, because processor can execute pair of them each
# cycle and they have minimal latency. This module is an attempt to
# bring best parts together:-)
#
# Coordinates below correspond to those in sha/keccak1600.c. Input
# layout is straight linear:
#
# [0][4] [0][3] [0][2] [0][1] [0][0]
# [1][4] [1][3] [1][2] [1][1] [1][0]
# [2][4] [2][3] [2][2] [2][1] [2][0]
# [3][4] [3][3] [3][2] [3][1] [3][0]
# [4][4] [4][3] [4][2] [4][1] [4][0]
#
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield layout perfect for Chi:
#
# [4][0] [3][0] [2][0] [1][0] [0][0]
# [4][1] [3][1] [2][1] [1][1] [0][1]
# [4][2] [3][2] [2][2] [1][2] [0][2]
# [4][3] [3][3] [2][3] [1][3] [0][3]
# [4][4] [3][4] [2][4] [1][4] [0][4]
#
# Now instead of performing full transposition and feeding it to next
# identical round, we perform a kind of diagonal transposition to layout
# from initial version of this module, and make it suitable for Theta:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
#
# Now intra-register permutations yield initial [almost] straight
# linear layout:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]
##[0][4] [0][3] [0][2] [0][1] [0][0]
# [3][4] [2][3] [1][2] [0][1] [4][0]
##[2][3] [2][2] [2][1] [2][0] [2][4]
# [2][4] [1][3] [0][2] [4][1] [3][0]
##[4][2] [4][1] [4][0] [4][4] [4][3]
# [1][4] [0][3] [4][2] [3][1] [2][0]
##[1][1] [1][0] [1][4] [1][3] [1][2]
# [0][4] [4][3] [3][2] [2][1] [1][0]
##[3][0] [3][4] [3][3] [3][2] [3][1]
#
# This means that odd round Chi is performed in less suitable layout,
# with a number of additional permutations. But overall it turned out to
# be a win. Permutations are fastest possible on Knights Landing and they
# are laid down to be independent of each other. In essence I traded
# 20 blend instructions for 3 permutations. The result is 13% faster
# than KCP on Skylake-X, and >40% on Knights Landing.
#
# As implied, data is loaded in straight linear order. Digits in
# variables' names represent coordinates of right-most element of
# loaded data chunk:

# State rows. Each register holds one row of five 64-bit lanes; the
# trailing comment on each name shows the lanes it holds right after
# the linear load (right-most lane first in memory).
my ($A00,	# [0][4] [0][3] [0][2] [0][1] [0][0]
    $A10,	# [1][4] [1][3] [1][2] [1][1] [1][0]
    $A20,	# [2][4] [2][3] [2][2] [2][1] [2][0]
    $A30,	# [3][4] [3][3] [3][2] [3][1] [3][0]
    $A40) =	# [4][4] [4][3] [4][2] [4][1] [4][0]
    map { "%zmm$_" } 0..4;

# Byte offset of every lane inside the on-stack transfer area used by
# SHA3_absorb. Lanes are listed row by row, [0][0] .. [4][4]; each row
# takes eight 8-byte slots (64 bytes, i.e. one full zmm load) although
# only the first five are populated, hence 8*(row*8 + column).
my @A_jagged = map {
    my $row = $_;
    map { 8*($row*8 + $_) } 0..4;
} 0..4;

# Scratch registers.
my @T = map { "%zmm$_" } 5..12;

# Permutation index vectors loaded from theta_perm. %zmm33 does not
# exist: @Theta[0] would be the identity permutation, which is never
# loaded or executed (it only appears in commented-out instructions),
# so the invalid name is deliberate and not a typo.
my @Theta = map { "%zmm$_" } 33, 13..16;

# Pi permutation indices for the even round (loaded from pi0_perm).
my @Pi0 = map { "%zmm$_" } 17..21;

# Per-lane rotation counts for the even (0) and odd (1) round layouts.
my @Rhotate0 = map { "%zmm$_" } 22..26;
my @Rhotate1 = map { "%zmm$_" } 27..31;

# Aliases for the first two scratch registers, used by the odd-round Theta.
my ($C00, $D00) = @T[0..1];

# Opmask registers; the name spells the 5-bit lane mask each one holds
# once the prologue of SHA3_absorb/SHA3_squeeze has initialised it.
my ($k00001, $k00010, $k00100, $k01000, $k10000, $k11111) =
    map { "%k$_" } 1..6;

# __KeccakF1600: internal (non-global) routine that permutes the state
# held in $A00..$A40 in place.
#
# The loop body contains one "even" and one "odd" round and is executed
# 12 times (%eax counts down from 12). %r10 walks the iotas table: it is
# advanced by 16 bytes per iteration, the even round reads (%r10) before
# the advance and the odd round reads -8(%r10) after it.
#
# On entry the callers must already have loaded @Theta[1..4], @Rhotate0,
# @Rhotate1, @Pi0 and the single-lane opmasks, as SHA3_absorb and
# SHA3_squeeze do in their prologues. Clobbers %eax, %r10 and @T[0..7].
$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$12,%eax
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
	######################################### Theta, even round
	vmovdqa64	$A00,@T[0]		# put aside original A00
	vpternlogq	\$0x96,$A20,$A10,$A00	# and use it as "C00"
	vpternlogq	\$0x96,$A40,$A30,$A00

	vprolq		\$1,$A00,$D00
	vpermq		$A00,@Theta[1],$A00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$A00,$D00,@T[0]	# T[0] is original A00
	vpternlogq	\$0x96,$A00,$D00,$A10
	vpternlogq	\$0x96,$A00,$D00,$A20
	vpternlogq	\$0x96,$A00,$D00,$A30
	vpternlogq	\$0x96,$A00,$D00,$A40

	######################################### Rho
	vprolvq		@Rhotate0[0],@T[0],$A00	# T[0] is original A00
	vprolvq		@Rhotate0[1],$A10,$A10
	vprolvq		@Rhotate0[2],$A20,$A20
	vprolvq		@Rhotate0[3],$A30,$A30
	vprolvq		@Rhotate0[4],$A40,$A40

	######################################### Pi
	vpermq		$A00,@Pi0[0],$A00
	vpermq		$A10,@Pi0[1],$A10
	vpermq		$A20,@Pi0[2],$A20
	vpermq		$A30,@Pi0[3],$A30
	vpermq		$A40,@Pi0[4],$A40

	######################################### Chi
	vmovdqa64	$A00,@T[0]
	vmovdqa64	$A10,@T[1]
	vpternlogq	\$0xD2,$A20,$A10,$A00
	vpternlogq	\$0xD2,$A30,$A20,$A10
	vpternlogq	\$0xD2,$A40,$A30,$A20
	vpternlogq	\$0xD2,@T[0],$A40,$A30
	vpternlogq	\$0xD2,@T[1],@T[0],$A40

	######################################### Iota
	vpxorq		(%r10),$A00,${A00}{$k00001}
	lea		16(%r10),%r10

	######################################### Harmonize rounds
	vpblendmq	$A20,$A10,@{T[1]}{$k00010}
	vpblendmq	$A30,$A20,@{T[2]}{$k00010}
	vpblendmq	$A40,$A30,@{T[3]}{$k00010}
	vpblendmq	$A10,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A40,@{T[4]}{$k00010}

	vpblendmq	$A30,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A40,@T[2],@{T[2]}{$k00100}
	vpblendmq	$A20,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A00,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A10,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A40,@T[1],@{T[1]}{$k01000}
	vpblendmq	$A30,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A00,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A10,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A20,@T[4],@{T[4]}{$k01000}

	vpblendmq	$A40,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A00,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A10,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A20,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A30,@T[4],@{T[4]}{$k10000}

	#vpermq		@T[0],@Theta[0],$A00	# doesn't actually change order
	vpermq		@T[1],@Theta[1],$A10
	vpermq		@T[2],@Theta[2],$A20
	vpermq		@T[3],@Theta[3],$A30
	vpermq		@T[4],@Theta[4],$A40

	######################################### Theta, odd round
	vmovdqa64	$T[0],$A00		# real A00
	vpternlogq	\$0x96,$A20,$A10,$C00	# C00 is @T[0]'s alias
	vpternlogq	\$0x96,$A40,$A30,$C00

	vprolq		\$1,$C00,$D00
	vpermq		$C00,@Theta[1],$C00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$C00,$D00,$A00
	vpternlogq	\$0x96,$C00,$D00,$A30
	vpternlogq	\$0x96,$C00,$D00,$A10
	vpternlogq	\$0x96,$C00,$D00,$A40
	vpternlogq	\$0x96,$C00,$D00,$A20

	######################################### Rho
	vprolvq		@Rhotate1[0],$A00,$A00
	vprolvq		@Rhotate1[3],$A30,@T[1]
	vprolvq		@Rhotate1[1],$A10,@T[2]
	vprolvq		@Rhotate1[4],$A40,@T[3]
	vprolvq		@Rhotate1[2],$A20,@T[4]

	vpermq		$A00,@Theta[4],@T[5]
	vpermq		$A00,@Theta[3],@T[6]

	######################################### Iota
	vpxorq		-8(%r10),$A00,${A00}{$k00001}

	######################################### Pi
	vpermq		@T[1],@Theta[2],$A10
	vpermq		@T[2],@Theta[4],$A20
	vpermq		@T[3],@Theta[1],$A30
	vpermq		@T[4],@Theta[3],$A40

	######################################### Chi
	vpternlogq	\$0xD2,@T[6],@T[5],$A00

	vpermq		@T[1],@Theta[1],@T[7]
	#vpermq		@T[1],@Theta[0],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[7],$A10

	vpermq		@T[2],@Theta[3],@T[0]
	vpermq		@T[2],@Theta[2],@T[2]
	vpternlogq	\$0xD2,@T[2],@T[0],$A20

	#vpermq		@T[3],@Theta[0],@T[3]
	vpermq		@T[3],@Theta[4],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[3],$A30

	vpermq		@T[4],@Theta[2],@T[0]
	vpermq		@T[4],@Theta[1],@T[4]
	vpternlogq	\$0xD2,@T[4],@T[0],$A40

	dec		%eax
	jnz		.Loop_avx512

	ret
.size	__KeccakF1600,.-__KeccakF1600
___

# Argument registers shared by SHA3_absorb and SHA3_squeeze:
#   $A_flat - pointer to the 25-lane state in memory (rows 40 bytes apart)
#   $inp    - input pointer (absorb) / output pointer (squeeze, as $out)
#   $len    - number of bytes to absorb / squeeze
#   $bsz    - block size in bytes
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze

# SHA3_absorb, part 1: prologue and head of the per-block loop.
#
# - Saves %rsp in %r11, reserves 320 bytes and aligns the stack to 64 so
#   the five-row transfer area can be accessed with aligned 64-byte moves.
#   %r9 is set to %rsp+128, so rows are addressed as N*64-128(%r9).
# - Biases $A_flat and $inp by +96; all later accesses compensate with a
#   -96 displacement (presumably to keep displacements small - confirm).
# - Builds the opmasks: $k11111 selects the low five lanes, the others
#   select one lane each.
# - Loads the permutation/rotation tables that follow theta_perm, loads
#   the state with zero-masking and zeroes the transfer area.
# - At the top of the loop, subtracts $bsz from $len and leaves when it
#   underflows; otherwise %eax = $bsz/8 is the lane count for the copy
#   code emitted by the Perl loop that follows this heredoc.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov		%rsp,%r11

	lea		-320(%rsp),%rsp
	and		\$-64,%rsp

	lea		96($A_flat),$A_flat
	lea		96($inp),$inp
	lea		128(%rsp),%r9

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vpxorq		@T[0],@T[0],@T[0]
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

	vmovdqa64	@T[0],0*64-128(%r9)	# zero transfer area on stack
	vmovdqa64	@T[0],1*64-128(%r9)
	vmovdqa64	@T[0],2*64-128(%r9)
	vmovdqa64	@T[0],3*64-128(%r9)
	vmovdqa64	@T[0],4*64-128(%r9)
	jmp		.Loop_absorb_avx512

.align	32
.Loop_absorb_avx512:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx512

	shr		\$3,%eax
___
# SHA3_absorb, part 2: fully unrolled copy of one input block into the
# on-stack transfer area. One group of instructions is emitted per
# state lane (25 in total): lane $i is read from the input at 8*$i and
# stored at its padded-row offset $A_jagged[$i]. %eax holds the number
# of lanes in a block, so the emitted code leaves the sequence through
# .Labsorved_avx512 as soon as the whole block has been copied.
for my $i (0..24) {
    $code.=<<___;
	mov		8*$i-96($inp),%r8
	mov		%r8,$A_jagged[$i]-128(%r9)
	dec		%eax
	jz		.Labsorved_avx512
___
}
# SHA3_absorb, part 3, followed by SHA3_squeeze and the constant tables.
#
# SHA3_absorb tail: advances $inp by one block, XORs the five 64-byte
# transfer-area rows into the state registers, calls __KeccakF1600 and
# loops. On exit the state is written back with the five-lane mask,
# %rsp is restored from %r11 and $len+$bsz is returned in %rax, which
# undoes the final subtraction that underflowed.
#
# SHA3_squeeze: copies the state out to $out eight bytes at a time.
# The tables and the state are loaded into registers only when
# $len > $bsz, i.e. when at least one more permutation will be needed.
# After $bsz/8 lanes have been output, __KeccakF1600 is called, the
# state is stored back and copying restarts from the beginning of the
# state. A final partial lane (< 8 bytes) is copied with rep movsb.
#
# Tables: theta_perm, rhotates1, rhotates0 and pi0_perm are 5 rows of
# 64 bytes each and are addressed as one array from theta_perm, so
# their order and sizes must stay in sync with the 64*N(%r8) loads.
$code.=<<___;
.Labsorved_avx512:
	lea		($inp,$bsz),$inp

	vpxorq		64*0-128(%r9),$A00,$A00
	vpxorq		64*1-128(%r9),$A10,$A10
	vpxorq		64*2-128(%r9),$A20,$A20
	vpxorq		64*3-128(%r9),$A30,$A30
	vpxorq		64*4-128(%r9),$A40,$A40

	call		__KeccakF1600

	jmp		.Loop_absorb_avx512

.align	32
.Ldone_absorb_avx512:
	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	vzeroupper

	lea		(%r11),%rsp
	lea		($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov		%rsp,%r11

	lea		96($A_flat),$A_flat
	cmp		$bsz,$len
	jbe		.Lno_output_extension_avx512

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

.Lno_output_extension_avx512:
	shr		\$3,$bsz
	lea		-96($A_flat),%r9
	mov		$bsz,%rax
	jmp		.Loop_squeeze_avx512

.align	32
.Loop_squeeze_avx512:
	cmp		\$8,$len
	jb		.Ltail_squeeze_avx512

	mov		(%r9),%r8
	lea		8(%r9),%r9
	mov		%r8,($out)
	lea		8($out),$out
	sub		\$8,$len		# len -= 8
	jz		.Ldone_squeeze_avx512

	sub		\$1,%rax		# bsz--
	jnz		.Loop_squeeze_avx512

	#vpermq		@Theta[4],@Theta[4],@Theta[3]
	#vpermq		@Theta[3],@Theta[4],@Theta[2]
	#vpermq		@Theta[3],@Theta[3],@Theta[1]

	call		__KeccakF1600

	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	lea		-96($A_flat),%r9
	mov		$bsz,%rax
	jmp		.Loop_squeeze_avx512

.Ltail_squeeze_avx512:
	mov		$out,%rdi
	mov		%r9,%rsi
	mov		$len,%rcx
	.byte		0xf3,0xa4		# rep movsb

.Ldone_squeeze_avx512:
	vzeroupper

	lea		(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.section .rodata
.align	64
theta_perm:
	.quad	0, 1, 2, 3, 4, 5, 6, 7		# [not used]
	.quad	4, 0, 1, 2, 3, 5, 6, 7
	.quad	3, 4, 0, 1, 2, 5, 6, 7
	.quad	2, 3, 4, 0, 1, 5, 6, 7
	.quad	1, 2, 3, 4, 0, 5, 6, 7

rhotates1:
	.quad	0,  44, 43, 21, 14, 0, 0, 0	# [0][0] [1][1] [2][2] [3][3] [4][4]
	.quad	18, 1,  6,  25, 8,  0, 0, 0	# [4][0] [0][1] [1][2] [2][3] [3][4]
	.quad	41, 2,  62, 55, 39, 0, 0, 0	# [3][0] [4][1] [0][2] [1][3] [2][4]
	.quad	3,  45, 61, 28, 20, 0, 0, 0	# [2][0] [3][1] [4][2] [0][3] [1][4]
	.quad	36, 10, 15, 56, 27, 0, 0, 0	# [1][0] [2][1] [3][2] [4][3] [0][4]

rhotates0:
	.quad	0,  1,  62, 28, 27, 0, 0, 0
	.quad	36, 44, 6,  55, 20, 0, 0, 0
	.quad	3,  10, 43, 25, 39, 0, 0, 0
	.quad	41, 45, 15, 21, 8,  0, 0, 0
	.quad	18, 2,  61, 56, 14, 0, 0, 0

pi0_perm:
	.quad	0, 3, 1, 4, 2, 5, 6, 7
	.quad	1, 4, 2, 0, 3, 5, 6, 7
	.quad	2, 0, 3, 1, 4, 5, 6, 7
	.quad	3, 1, 4, 2, 0, 5, 6, 7
	.quad	4, 2, 0, 3, 1, 5, 6, 7


iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
___

# Emit the generated assembly. The last command-line argument, when
# present, names the output file; without it the code goes to the
# inherited STDOUT.
#
# The three-argument open keeps characters in the file name from being
# interpreted as part of the open mode, and a failed open is reported
# immediately instead of surfacing later as a misleading close error.
my $output = pop;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
print $code;
# Buffered write errors only become visible at close time.
close STDOUT or die "error closing STDOUT: $!";