#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, index permutations organized in quadruples are:
#
#	[4][4] [3][3] [2][2] [1][1]<-+
#	[0][4] [0][3] [0][2] [0][1]  |
#	[3][0] [1][0] [4][0] [2][0]  |
#	[4][3] [3][1] [2][4] [1][2]  |
#	[3][4] [1][3] [4][2] [2][1]  |
#	[2][3] [4][1] [1][4] [3][2]  |
#	[2][2] [4][4] [1][1] [3][3] -+
#
# This, however, is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#
#        [0][4] [0][3] [0][2] [0][1]
#        [3][0] [1][0] [4][0] [2][0]
# vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#        [2][4] [4][3] [1][2] [3][1]
# vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#        [3][4] [1][3] [4][2] [2][1]
# vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#        [1][4] [2][3] [3][2] [4][1]
# vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#        [4][4] [3][3] [2][2] [1][1]
#
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen layout.
# Note that the first step is permutation-free.] A[0][0] is loaded into a
# register of its own, to all lanes. [A[0][0] is not part of the Pi
# permutation or Rho.] Digits in variables' names denote the right-most
# coordinates:

my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
    map("%ymm$_",(0..6));
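
# For orientation: vpermq's 8-bit immediate picks, for each destination lane
# k (0..3), source lane (imm >> 2k) & 3. The helper below is a minimal Perl
# model of that selection, included purely as an illustration for checking
# the shuffle comments in this file; nothing in the generator calls it.
sub vpermq_model {
    my ($imm, $src) = @_;	# e.g. vpermq_model(0b01110010,[qw(l0 l1 l2 l3)])
    return [ map { $src->[($imm >> (2*$_)) & 3] } (0..3) ];	# [l2,l0,l3,l1]
}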

# We also need to map the magic order into offsets within structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear
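
# For example, A[1][3] sits in lane 2 of %ymm4 ($A21), i.e. pair [4,2] above,
# which linearizes to byte offset 8*(4*4+2) = 144 in the 32-bytes-per-register
# transfer area used by SHA3_absorb below (worked example, derived from the
# table above).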

# But on the other hand Chi is much better off if y indices were aligned
# column-wise, not x. For this reason we have to shuffle data prior to
# Chi and revert it afterwards. The prior shuffle is naturally merged with
# Pi itself:
#
#        [0][4] [0][3] [0][2] [0][1]
#        [3][0] [1][0] [4][0] [2][0]
# vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
# vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#        [3][1] [1][2] [4][3] [2][4]
# vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
# vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#        [3][4] [1][3] [4][2] [2][1]
# vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
# vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#        [3][2] [1][4] [4][1] [2][3]
# vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
# vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#        [3][3] [1][1] [4][4] [2][2]
#
# And reverse post-Chi permutation:
#
#        [0][4] [0][3] [0][2] [0][1]
#        [3][0] [1][0] [4][0] [2][0]
# vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#        [2][4] [4][3] [1][2] [3][1]
# vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#        [3][4] [1][3] [4][2] [2][1]
# vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#        [1][4] [2][3] [3][2] [4][1]
# vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#        [4][4] [3][3] [2][2] [1][1]
#
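# As a reminder, Chi replaces every lane with A ^= ~B & C, where B and C are
# the lanes one and two positions further along the same row. With rows
# aligned column-wise as arranged above, the B and C operands for four rows
# at a time are gathered with vpblendd and combined by a single vpandn
# (which computes (~src1) & src2) followed by vpxor.
#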
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#			r=1088(*)
#
# Haswell		8.7/+10%
# Skylake		7.8/+20%
# Ryzen			17(**)
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	relative to the scalar keccak1600-x86_64.pl.
# (**)	It's expected that Ryzen performs poorly, because its instruction
#	issue rate is limited to two AVX2 instructions per cycle and, in
#	addition, vpblendd is reportedly bound to a specific port.
#	Obviously this code path should not be executed on Ryzen.

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	jmp		.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
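	# For reference: C[x] is the xor of the five lanes in column x,
	# D[x] = C[x-1] ^ ROL64(C[x+1],1), and every lane in column x is
	# then xored with D[x]. Below, $C00/$C14 hold C[0]/C[1..4] and
	# $D00/$D14 hold D[0]/D[1..4], one column per 64-bit lane.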
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpxor		$A01,$C14,$C14
	vpxor		@T[2],$C14,$C14		# C[1..4]

	vpermq		\$0b10010011,$C14,@T[4]
	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[2]
	vpor		@T[2],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor		$A00,$C00,$C00
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[1]
	vpor		@T[0],@T[1],@T[1]	# ROL64(C[0..0],1)

	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[4]
	vpxor		@T[4],$D14,$D14		# D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
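	# Rho rotates every lane by its own constant. AVX2 has no rotate
	# instruction, so each register is rotated as
	# ROL64(x,n) = (x<<n)|(x>>(64-n)) with vpsllvq/vpsrlvq taking
	# per-lane counts from the rhotates_left/rhotates_right tables.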
	vpsllvq		0*32-96(%r8),$A20,@T[3]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		@T[3],$A20,$A20

	vpxor		$D14,$A31,$A31		# ^= D[1..4] from Theta
	vpsllvq		2*32-96(%r8),$A31,@T[4]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		@T[4],$A31,$A31

	vpxor		$D14,$A21,$A21		# ^= D[1..4] from Theta
	vpsllvq		3*32-96(%r8),$A21,@T[5]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		@T[5],$A21,$A21

	vpxor		$D14,$A41,$A41		# ^= D[1..4] from Theta
	vpsllvq		4*32-96(%r8),$A41,@T[6]
	vpsrlvq		4*32-96(%r9),$A41,$A41
	vpor		@T[6],$A41,$A41

	vpxor		$D14,$A11,$A11		# ^= D[1..4] from Theta
	vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpsllvq		5*32-96(%r8),$A11,@T[7]
	vpsrlvq		5*32-96(%r9),$A11,@T[1]
	vpor		@T[7],@T[1],@T[1]	# $A11 -> future $A01

	vpxor		$D14,$A01,$A01		# ^= D[1..4] from Theta
	vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpsllvq		1*32-96(%r8),$A01,@T[8]
	vpsrlvq		1*32-96(%r9),$A01,@T[2]
	vpor		@T[8],@T[2],@T[2]	# $A01 -> future $A20

	######################################### Chi
	vpsrldq		\$8,@T[1],@T[7]
	vpandn		@T[7],@T[1],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[8],$A31,$A31		# tgting  [3][1] [1][2] [4][3] [2][4]
	vpandn		@T[7],$A41,$A41		# tgting  [3][2] [1][4] [4][1] [2][3]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	vpxor		@T[3],$A31,$A31
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	vpxor		@T[5],$A41,$A41
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpandn		@T[8],$A11,$A11		# tgting  [3][3] [1][1] [4][4] [2][2]
	vpxor		@T[6],$A11,$A11

	vpermq		\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	vpermq		\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
	vpandn		@T[8],$A01,$A01		# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[7],$A20,$A20		# tgting  [3][0] [1][0] [4][0] [2][0]
	vpxor		@T[2],$A20,$A20

	vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	vpermq		\$0b00011011,$A31,$A31		# post-Chi shuffle
	vpermq		\$0b10001101,$A41,$A41
	vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[7],$A21,$A21		# tgting  [3][4] [1][3] [4][2] [2][1]

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		@T[4],$A21,$A21

	######################################### Iota
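	# Iota affects only A[0][0]; since $A00 keeps A[0][0] in all four
	# lanes, the iotas table below stores each of the 24 round constants
	# pre-broadcast to a full 256-bit word, and %r10 simply advances by
	# 32 bytes per round.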
	vpxor		(%r10),$A00,$A00
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp;	# in squeeze
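
# For reference, the two entry points below are expected to be called roughly
# as
#   size_t SHA3_absorb (uint64_t A[5][5], const unsigned char *inp,
#                       size_t len, size_t bsz);
#   void   SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                       size_t len, size_t bsz);
# with arguments arriving in %rdi, %rsi, %rdx and %rcx per the SysV x86_64
# ABI; SHA3_absorb returns the number of trailing input bytes (less than bsz)
# that were left unprocessed.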

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx2:
	mov	$bsz,%rax
	sub	$bsz,$len
	jc	.Ldone_absorb_avx2

	shr	\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub	\$4,%eax
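	# %eax now holds the number of 64-bit words in the block minus 4:
	# word 0 was broadcast and words 1..4 loaded above, and the unrolled
	# loop below copies words 5 and up into the jagged transfer area
	# (the dec/jz test precedes each copy, hence -4 rather than -5).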
___
for(my $i=5; $i<25; $i++) {
$code.=<<___
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorved_avx2:
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax	# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	mov	$bsz,%rax

.Loop_squeeze_avx2:
	mov	@A_jagged[$i]-96($A_flat),%r8
___
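# Note: $i is still unset at this point, so "@A_jagged[$i]" above interpolates
# element 0, i.e. offset 0, which together with the earlier "lea 96($A_flat)"
# addresses A[0][0]. Inside the loop the bias is -120 rather than -96 because
# the in-memory state is packed (A[0][0], then six 32-byte register images at
# 8+32*k), which sits 24 bytes below the 32-bytes-per-register numbering that
# @A_jagged encodes.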
for (my $i=0; $i<25; $i++) {
$code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx2
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	@A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx2


.Ltail_squeeze_avx2:
	add	\$8,$len
.Loop_tail_avx2:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop;
open STDOUT,">$output" or die "can't open $output: $!";
print $code;
close STDOUT or die "error closing STDOUT: $!";