keccak1600-avx512.pl@ 101021

Last change on this file since 101021 was 101021, checked in by vboxsync, 18 months ago
openssl-3.1.2: Applied and adjusted our OpenSSL changes to 3.1.0. bugref:10519
File size: 16.0 KB

Line
1	#!/usr/bin/env perl
2	# Copyright 2017-2023 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the Apache License 2.0 (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8	#
9	# ====================================================================
10	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11	# project. The module is, however, dual licensed under OpenSSL and
12	# CRYPTOGAMS licenses depending on where you obtain it. For further
13	# details see http://www.openssl.org/~appro/cryptogams/.
14	# ====================================================================
15	#
16	# Keccak-1600 for AVX-512F.
17	#
18	# July 2017.
19	#
20	# Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
21	# Pretty straightforward, the only "magic" is data layout in registers.
22	# It's impossible to have one that is optimal for every step, hence
23	# it's changing as algorithm progresses. Data is saved in linear order,
24	# but in-register order morphs between rounds. Even rounds take in
25	# linear layout, and odd rounds - transposed, or "vertically-shaped"...
26	#
27	########################################################################
28	# Numbers are cycles per processed byte out of large message.
29	#
30	# r=1088(*)
31	#
32	# Knights Landing 7.6
33	# Skylake-X 5.7
34	#
35	# (*) Corresponds to SHA3-256.
36
37	########################################################################
38	# Below code is combination of two ideas. One is taken from Keccak Code
39	# Package, hereafter KCP, and another one from initial version of this
40	# module. What is common is observation that Pi's input and output are
41	# "mostly transposed", i.e. if input is aligned by x coordinate, then
42	# output is [mostly] aligned by y. Both versions, KCP and predecessor,
43	# were trying to use one of them from round to round, which resulted in
44	# some kind of transposition in each round. This version still does
45	# transpose data, but only every second round. Another essential factor
46	# is that KCP transposition has to be performed with instructions that
47	# turned out to be rather expensive on Knights Landing, both latency- and
48	# throughput-wise. Not to mention that some of them have to depend on
49	# each other. On the other hand initial version of this module was
50	# relying heavily on blend instructions. There were lots of them,
51	# resulting in higher instruction count, yet it performed better on
52	# Knights Landing, because processor can execute pair of them each
53	# cycle and they have minimal latency. This module is an attempt to
54	# bring best parts together:-)
55	#
56	# Coordinates below correspond to those in sha/keccak1600.c. Input
57	# layout is straight linear:
58	#
59	# [0][4] [0][3] [0][2] [0][1] [0][0]
60	# [1][4] [1][3] [1][2] [1][1] [1][0]
61	# [2][4] [2][3] [2][2] [2][1] [2][0]
62	# [3][4] [3][3] [3][2] [3][1] [3][0]
63	# [4][4] [4][3] [4][2] [4][1] [4][0]
64	#
65	# It's perfect for Theta, while Pi is reduced to intra-register
66	# permutations which yield layout perfect for Chi:
67	#
68	# [4][0] [3][0] [2][0] [1][0] [0][0]
69	# [4][1] [3][1] [2][1] [1][1] [0][1]
70	# [4][2] [3][2] [2][2] [1][2] [0][2]
71	# [4][3] [3][3] [2][3] [1][3] [0][3]
72	# [4][4] [3][4] [2][4] [1][4] [0][4]
73	#
74	# Now instead of performing full transposition and feeding it to next
75	# identical round, we perform kind of diagonal transposition to layout
76	# from initial version of this module, and make it suitable for Theta:
77	#
78	# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
79	# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
80	# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
81	# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
82	# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
83	#
84	# Now intra-register permutations yield initial [almost] straight
85	# linear layout:
86	#
87	# [4][4] [3][3] [2][2] [1][1] [0][0]
88	##[0][4] [0][3] [0][2] [0][1] [0][0]
89	# [3][4] [2][3] [1][2] [0][1] [4][0]
90	##[2][3] [2][2] [2][1] [2][0] [2][4]
91	# [2][4] [1][3] [0][2] [4][1] [3][0]
92	##[4][2] [4][1] [4][0] [4][4] [4][3]
93	# [1][4] [0][3] [4][2] [3][1] [2][0]
94	##[1][1] [1][0] [1][4] [1][3] [1][2]
95	# [0][4] [4][3] [3][2] [2][1] [1][0]
96	##[3][0] [3][4] [3][3] [3][2] [3][1]
97	#
98	# This means that odd round Chi is performed in less suitable layout,
99	# with a number of additional permutations. But overall it turned out to be
100	# a win. Permutations are fastest possible on Knights Landing and they
101	# are laid out to be independent of each other. In essence I traded
102	# 20 blend instructions for 3 permutations. The result is 13% faster
103	# than KCP on Skylake-X, and >40% on Knights Landing.
104	#
105	# As implied, data is loaded in straight linear order. Digits in
106	# variables' names represent coordinates of right-most element of
107	# loaded data chunk:
108
# The five state rows live in %zmm0..%zmm4 for the whole computation.
# The generated code loads and stores them through the five-lane mask
# $k11111, so only the five low 64-bit lanes of each register carry state.
109	my ($A00, # [0][4] [0][3] [0][2] [0][1] [0][0]
110	$A10, # [1][4] [1][3] [1][2] [1][1] [1][0]
111	$A20, # [2][4] [2][3] [2][2] [2][1] [2][0]
112	$A30, # [3][4] [3][3] [3][2] [3][1] [3][0]
113	$A40) = # [4][4] [4][3] [4][2] [4][1] [4][0]
114	map("%zmm$_",(0..4));
115
116	# We also need to map the magic order into offsets within structure:
117
# Byte offset of every lane inside the 5x64-byte transfer area on the stack.
# Lanes are listed in the linear order in which they arrive from the input
# ([0][0], [0][1], ... [4][4]).  Each [row, column] pair is converted to
# 8*(row*8 + column): rows sit one 64-byte (zmm-sized) slot apart and
# columns are 8-byte quadwords inside that slot, so the whole area can be
# XORed into $A00..$A40 with one full-width load per row.
my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
                [1,0], [1,1], [1,2], [1,3], [1,4],
                [2,0], [2,1], [2,2], [2,3], [2,4],
                [3,0], [3,1], [3,2], [3,3], [3,4],
                [4,0], [4,1], [4,2], [4,3], [4,4]);
@A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged);	# ... and now linear
124
# Register allocation for scratch values and for the constant tables that
# the generated entry points load once and keep resident:
#   @T        - eight scratch registers (%zmm5..%zmm12)
#   @Theta    - lane permutations loaded from theta_perm; only elements
#               1..4 are ever loaded or used
#   @Pi0      - even-round Pi permutations loaded from pi0_perm
#   @Rhotate0 - even-round Rho rotation counts loaded from rhotates0
#   @Rhotate1 - odd-round Rho rotation counts loaded from rhotates1
125	my @T = map("%zmm$_",(5..12));
126	my @Theta = map("%zmm$_",(33,13..16)); # %zmm33 does not exist: invalid @Theta[0] is not typo, every use of it is commented out
127	my @Pi0 = map("%zmm$_",(17..21));
128	my @Rhotate0 = map("%zmm$_",(22..26));
129	my @Rhotate1 = map("%zmm$_",(27..31));
130
# $C00 and $D00 are just other names for @T[0] and @T[1], used by the
# Theta steps.
131	my ($C00,$D00) = @T[0..1];
# Opmask registers %k1..%k6.  The generated prologues fill them with
# kxnorw/kshift: the first five select exactly one lane each (the position
# of the 1 in the name is the lane), $k11111 selects the five low lanes.
132	my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
133
# Emit the internal permutation routine __KeccakF1600.
#
# Inputs expected in registers (set up by SHA3_absorb / SHA3_squeeze):
#   $A00..$A40            state rows in linear layout
#   @Theta[1..4], @Pi0,   permutation / rotation constants
#   @Rhotate0, @Rhotate1
#   $k00001..$k10000      single-lane opmasks
# Clobbers %rax, %r10 and the scratch registers @T[0..7].
#
# %eax counts 12 loop iterations; each iteration performs an "even" round
# on the linear layout, the "Harmonize rounds" blend/permute sequence, and
# an "odd" round that leaves the state back in linear layout.
# %r10 walks the iotas table: the even round reads (%r10), the pointer is
# then advanced by 16, and the odd round reads -8(%r10).
# vpternlogq immediates: 0x96 is a three-input XOR (Theta), 0xD2 computes
# dst ^ (~src2 & src3), which is the Chi step.
#
# Everything between the heredoc markers, including the '#' lines, is
# written to the assembly output verbatim.
134	$code.=<<___;
135	.text
136
137	.type __KeccakF1600,\@function
138	.align 32
139	__KeccakF1600:
140	lea iotas(%rip),%r10
141	mov \$12,%eax
142	jmp .Loop_avx512
143
144	.align 32
145	.Loop_avx512:
146	######################################### Theta, even round
147	vmovdqa64 $A00,@T[0] # put aside original A00
148	vpternlogq \$0x96,$A20,$A10,$A00 # and use it as "C00"
149	vpternlogq \$0x96,$A40,$A30,$A00
150
151	vprolq \$1,$A00,$D00
152	vpermq $A00,@Theta[1],$A00
153	vpermq $D00,@Theta[4],$D00
154
155	vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00
156	vpternlogq \$0x96,$A00,$D00,$A10
157	vpternlogq \$0x96,$A00,$D00,$A20
158	vpternlogq \$0x96,$A00,$D00,$A30
159	vpternlogq \$0x96,$A00,$D00,$A40
160
161	######################################### Rho
162	vprolvq @Rhotate0[0],@T[0],$A00 # T[0] is original A00
163	vprolvq @Rhotate0[1],$A10,$A10
164	vprolvq @Rhotate0[2],$A20,$A20
165	vprolvq @Rhotate0[3],$A30,$A30
166	vprolvq @Rhotate0[4],$A40,$A40
167
168	######################################### Pi
169	vpermq $A00,@Pi0[0],$A00
170	vpermq $A10,@Pi0[1],$A10
171	vpermq $A20,@Pi0[2],$A20
172	vpermq $A30,@Pi0[3],$A30
173	vpermq $A40,@Pi0[4],$A40
174
175	######################################### Chi
176	vmovdqa64 $A00,@T[0]
177	vmovdqa64 $A10,@T[1]
178	vpternlogq \$0xD2,$A20,$A10,$A00
179	vpternlogq \$0xD2,$A30,$A20,$A10
180	vpternlogq \$0xD2,$A40,$A30,$A20
181	vpternlogq \$0xD2,@T[0],$A40,$A30
182	vpternlogq \$0xD2,@T[1],@T[0],$A40
183
184	######################################### Iota
185	vpxorq (%r10),$A00,${A00}{$k00001}
186	lea 16(%r10),%r10
187
188	######################################### Harmonize rounds
189	vpblendmq $A20,$A10,@{T[1]}{$k00010}
190	vpblendmq $A30,$A20,@{T[2]}{$k00010}
191	vpblendmq $A40,$A30,@{T[3]}{$k00010}
192	vpblendmq $A10,$A00,@{T[0]}{$k00010}
193	vpblendmq $A00,$A40,@{T[4]}{$k00010}
194
195	vpblendmq $A30,@T[1],@{T[1]}{$k00100}
196	vpblendmq $A40,@T[2],@{T[2]}{$k00100}
197	vpblendmq $A20,@T[0],@{T[0]}{$k00100}
198	vpblendmq $A00,@T[3],@{T[3]}{$k00100}
199	vpblendmq $A10,@T[4],@{T[4]}{$k00100}
200
201	vpblendmq $A40,@T[1],@{T[1]}{$k01000}
202	vpblendmq $A30,@T[0],@{T[0]}{$k01000}
203	vpblendmq $A00,@T[2],@{T[2]}{$k01000}
204	vpblendmq $A10,@T[3],@{T[3]}{$k01000}
205	vpblendmq $A20,@T[4],@{T[4]}{$k01000}
206
207	vpblendmq $A40,@T[0],@{T[0]}{$k10000}
208	vpblendmq $A00,@T[1],@{T[1]}{$k10000}
209	vpblendmq $A10,@T[2],@{T[2]}{$k10000}
210	vpblendmq $A20,@T[3],@{T[3]}{$k10000}
211	vpblendmq $A30,@T[4],@{T[4]}{$k10000}
212
213	#vpermq @T[0],@Theta[0],$A00 # doesn't actually change order
214	vpermq @T[1],@Theta[1],$A10
215	vpermq @T[2],@Theta[2],$A20
216	vpermq @T[3],@Theta[3],$A30
217	vpermq @T[4],@Theta[4],$A40
218
219	######################################### Theta, odd round
220	vmovdqa64 $T[0],$A00 # real A00
221	vpternlogq \$0x96,$A20,$A10,$C00 # C00 is @T[0]'s alias
222	vpternlogq \$0x96,$A40,$A30,$C00
223
224	vprolq \$1,$C00,$D00
225	vpermq $C00,@Theta[1],$C00
226	vpermq $D00,@Theta[4],$D00
227
228	vpternlogq \$0x96,$C00,$D00,$A00
229	vpternlogq \$0x96,$C00,$D00,$A30
230	vpternlogq \$0x96,$C00,$D00,$A10
231	vpternlogq \$0x96,$C00,$D00,$A40
232	vpternlogq \$0x96,$C00,$D00,$A20
233
234	######################################### Rho
235	vprolvq @Rhotate1[0],$A00,$A00
236	vprolvq @Rhotate1[3],$A30,@T[1]
237	vprolvq @Rhotate1[1],$A10,@T[2]
238	vprolvq @Rhotate1[4],$A40,@T[3]
239	vprolvq @Rhotate1[2],$A20,@T[4]
240
241	vpermq $A00,@Theta[4],@T[5]
242	vpermq $A00,@Theta[3],@T[6]
243
244	######################################### Iota
245	vpxorq -8(%r10),$A00,${A00}{$k00001}
246
247	######################################### Pi
248	vpermq @T[1],@Theta[2],$A10
249	vpermq @T[2],@Theta[4],$A20
250	vpermq @T[3],@Theta[1],$A30
251	vpermq @T[4],@Theta[3],$A40
252
253	######################################### Chi
254	vpternlogq \$0xD2,@T[6],@T[5],$A00
255
256	vpermq @T[1],@Theta[1],@T[7]
257	#vpermq @T[1],@Theta[0],@T[1]
258	vpternlogq \$0xD2,@T[1],@T[7],$A10
259
260	vpermq @T[2],@Theta[3],@T[0]
261	vpermq @T[2],@Theta[2],@T[2]
262	vpternlogq \$0xD2,@T[2],@T[0],$A20
263
264	#vpermq @T[3],@Theta[0],@T[3]
265	vpermq @T[3],@Theta[4],@T[1]
266	vpternlogq \$0xD2,@T[1],@T[3],$A30
267
268	vpermq @T[4],@Theta[2],@T[0]
269	vpermq @T[4],@Theta[1],@T[4]
270	vpternlogq \$0xD2,@T[4],@T[0],$A40
271
272	dec %eax
273	jnz .Loop_avx512
274
275	ret
276	.size __KeccakF1600,.-__KeccakF1600
277	___
278
# Argument registers shared by the two public entry points: pointer to the
# flat 25-lane state, input pointer, byte count and block size in bytes.
279	my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
# SHA3_squeeze receives its output pointer in the same register as
# SHA3_absorb receives its input pointer.
280	my $out = $inp; # in squeeze
281
# Emit the first part of SHA3_absorb, up to the unrolled copy loop that
# the following Perl for-loop generates:
#   - save %rsp in %r11 and carve a 64-byte-aligned scratch area out of a
#     320-byte stack reservation; %r9 points 128 bytes into it
#   - bias $A_flat and $inp by 96 (all later displacements subtract 96)
#   - build the opmasks: kxnorw gives all ones, shifting right by 15 / 11
#     leaves 0x0001 / 0x001f, the remaining masks are $k00001 shifted left
#   - load the constants addressed as 64*N off theta_perm, so the tables
#     theta_perm, rhotates1, rhotates0 and pi0_perm must stay contiguous
#     and in that order
#   - load the state rows with zero-masking and clear the transfer area
#   - at the loop head subtract $bsz from $len and leave on borrow;
#     %eax is set to $bsz/8, the number of 8-byte words to copy
282	$code.=<<___;
283	.globl SHA3_absorb
284	.type SHA3_absorb,\@function
285	.align 32
286	SHA3_absorb:
287	mov %rsp,%r11
288
289	lea -320(%rsp),%rsp
290	and \$-64,%rsp
291
292	lea 96($A_flat),$A_flat
293	lea 96($inp),$inp
294	lea 128(%rsp),%r9
295
296	lea theta_perm(%rip),%r8
297
298	kxnorw $k11111,$k11111,$k11111
299	kshiftrw \$15,$k11111,$k00001
300	kshiftrw \$11,$k11111,$k11111
301	kshiftlw \$1,$k00001,$k00010
302	kshiftlw \$2,$k00001,$k00100
303	kshiftlw \$3,$k00001,$k01000
304	kshiftlw \$4,$k00001,$k10000
305
306	#vmovdqa64 64*0(%r8),@Theta[0]
307	vmovdqa64 64*1(%r8),@Theta[1]
308	vmovdqa64 64*2(%r8),@Theta[2]
309	vmovdqa64 64*3(%r8),@Theta[3]
310	vmovdqa64 64*4(%r8),@Theta[4]
311
312	vmovdqa64 64*5(%r8),@Rhotate1[0]
313	vmovdqa64 64*6(%r8),@Rhotate1[1]
314	vmovdqa64 64*7(%r8),@Rhotate1[2]
315	vmovdqa64 64*8(%r8),@Rhotate1[3]
316	vmovdqa64 64*9(%r8),@Rhotate1[4]
317
318	vmovdqa64 64*10(%r8),@Rhotate0[0]
319	vmovdqa64 64*11(%r8),@Rhotate0[1]
320	vmovdqa64 64*12(%r8),@Rhotate0[2]
321	vmovdqa64 64*13(%r8),@Rhotate0[3]
322	vmovdqa64 64*14(%r8),@Rhotate0[4]
323
324	vmovdqa64 64*15(%r8),@Pi0[0]
325	vmovdqa64 64*16(%r8),@Pi0[1]
326	vmovdqa64 64*17(%r8),@Pi0[2]
327	vmovdqa64 64*18(%r8),@Pi0[3]
328	vmovdqa64 64*19(%r8),@Pi0[4]
329
330	vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
331	vpxorq @T[0],@T[0],@T[0]
332	vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
333	vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
334	vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
335	vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
336
337	vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack
338	vmovdqa64 @T[0],1*64-128(%r9)
339	vmovdqa64 @T[0],2*64-128(%r9)
340	vmovdqa64 @T[0],3*64-128(%r9)
341	vmovdqa64 @T[0],4*64-128(%r9)
342	jmp .Loop_absorb_avx512
343
344	.align 32
345	.Loop_absorb_avx512:
346	mov $bsz,%rax
347	sub $bsz,$len
348	jc .Ldone_absorb_avx512
349
350	shr \$3,%eax
351	___
# Generate the fully unrolled copy of one input block into the stack
# transfer area: for each of the 25 lanes emit a load of input word $lane,
# a store to that lane's slot ($A_jagged[$lane]), and an early exit to
# .Labsorved_avx512 once the %eax word counter reaches zero.
foreach my $lane (0 .. 24) {
    my $slot = $A_jagged[$lane];
    $code .= "mov 8*$lane-96($inp),%r8\n"
           . "mov %r8,$slot-128(%r9)\n"
           . "dec %eax\n"
           . "jz .Labsorved_avx512\n";
}
# Emit the remaining generated text in one chunk.
#
# Rest of SHA3_absorb:
#   .Labsorved_avx512 advances $inp by $bsz, XORs the five 64-byte rows of
#   the transfer area into $A00..$A40, calls __KeccakF1600 and loops.
#   .Ldone_absorb_avx512 stores the state back through $k11111, restores
#   %rsp from %r11 and returns $len+$bsz in %rax, which undoes the
#   subtraction that caused the borrow at the loop head.
#
# SHA3_squeeze:
#   Only when $len is above $bsz does it load the opmasks, constants and
#   state rows; otherwise it jumps straight to the copy loop.
#   The loop copies 8 bytes at a time from the flat state (via %r9) to
#   $out.  After $bsz/8 words it calls __KeccakF1600, writes the state
#   back to memory and starts over.  A tail shorter than 8 bytes is copied
#   with rep movsb.
#
# Read-only data:
#   theta_perm, rhotates1, rhotates0 and pi0_perm are five 64-byte rows
#   each and are addressed by the prologues as 64*N off theta_perm, so
#   their order and size must not change.  iotas holds the 24 round
#   constants read by __KeccakF1600.
#
# Everything between the heredoc markers, including the '#' lines, is
# written to the assembly output verbatim.
360	$code.=<<___;
361	.Labsorved_avx512:
362	lea ($inp,$bsz),$inp
363
364	vpxorq 64*0-128(%r9),$A00,$A00
365	vpxorq 64*1-128(%r9),$A10,$A10
366	vpxorq 64*2-128(%r9),$A20,$A20
367	vpxorq 64*3-128(%r9),$A30,$A30
368	vpxorq 64*4-128(%r9),$A40,$A40
369
370	call __KeccakF1600
371
372	jmp .Loop_absorb_avx512
373
374	.align 32
375	.Ldone_absorb_avx512:
376	vmovdqu64 $A00,40*0-96($A_flat){$k11111}
377	vmovdqu64 $A10,40*1-96($A_flat){$k11111}
378	vmovdqu64 $A20,40*2-96($A_flat){$k11111}
379	vmovdqu64 $A30,40*3-96($A_flat){$k11111}
380	vmovdqu64 $A40,40*4-96($A_flat){$k11111}
381
382	vzeroupper
383
384	lea (%r11),%rsp
385	lea ($len,$bsz),%rax # return value
386	ret
387	.size SHA3_absorb,.-SHA3_absorb
388
389	.globl SHA3_squeeze
390	.type SHA3_squeeze,\@function
391	.align 32
392	SHA3_squeeze:
393	mov %rsp,%r11
394
395	lea 96($A_flat),$A_flat
396	cmp $bsz,$len
397	jbe .Lno_output_extension_avx512
398
399	lea theta_perm(%rip),%r8
400
401	kxnorw $k11111,$k11111,$k11111
402	kshiftrw \$15,$k11111,$k00001
403	kshiftrw \$11,$k11111,$k11111
404	kshiftlw \$1,$k00001,$k00010
405	kshiftlw \$2,$k00001,$k00100
406	kshiftlw \$3,$k00001,$k01000
407	kshiftlw \$4,$k00001,$k10000
408
409	#vmovdqa64 64*0(%r8),@Theta[0]
410	vmovdqa64 64*1(%r8),@Theta[1]
411	vmovdqa64 64*2(%r8),@Theta[2]
412	vmovdqa64 64*3(%r8),@Theta[3]
413	vmovdqa64 64*4(%r8),@Theta[4]
414
415	vmovdqa64 64*5(%r8),@Rhotate1[0]
416	vmovdqa64 64*6(%r8),@Rhotate1[1]
417	vmovdqa64 64*7(%r8),@Rhotate1[2]
418	vmovdqa64 64*8(%r8),@Rhotate1[3]
419	vmovdqa64 64*9(%r8),@Rhotate1[4]
420
421	vmovdqa64 64*10(%r8),@Rhotate0[0]
422	vmovdqa64 64*11(%r8),@Rhotate0[1]
423	vmovdqa64 64*12(%r8),@Rhotate0[2]
424	vmovdqa64 64*13(%r8),@Rhotate0[3]
425	vmovdqa64 64*14(%r8),@Rhotate0[4]
426
427	vmovdqa64 64*15(%r8),@Pi0[0]
428	vmovdqa64 64*16(%r8),@Pi0[1]
429	vmovdqa64 64*17(%r8),@Pi0[2]
430	vmovdqa64 64*18(%r8),@Pi0[3]
431	vmovdqa64 64*19(%r8),@Pi0[4]
432
433	vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
434	vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
435	vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
436	vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
437	vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
438
439	.Lno_output_extension_avx512:
440	shr \$3,$bsz
441	lea -96($A_flat),%r9
442	mov $bsz,%rax
443	jmp .Loop_squeeze_avx512
444
445	.align 32
446	.Loop_squeeze_avx512:
447	cmp \$8,$len
448	jb .Ltail_squeeze_avx512
449
450	mov (%r9),%r8
451	lea 8(%r9),%r9
452	mov %r8,($out)
453	lea 8($out),$out
454	sub \$8,$len # len -= 8
455	jz .Ldone_squeeze_avx512
456
457	sub \$1,%rax # bsz--
458	jnz .Loop_squeeze_avx512
459
460	#vpermq @Theta[4],@Theta[4],@Theta[3]
461	#vpermq @Theta[3],@Theta[4],@Theta[2]
462	#vpermq @Theta[3],@Theta[3],@Theta[1]
463
464	call __KeccakF1600
465
466	vmovdqu64 $A00,40*0-96($A_flat){$k11111}
467	vmovdqu64 $A10,40*1-96($A_flat){$k11111}
468	vmovdqu64 $A20,40*2-96($A_flat){$k11111}
469	vmovdqu64 $A30,40*3-96($A_flat){$k11111}
470	vmovdqu64 $A40,40*4-96($A_flat){$k11111}
471
472	lea -96($A_flat),%r9
473	mov $bsz,%rax
474	jmp .Loop_squeeze_avx512
475
476	.Ltail_squeeze_avx512:
477	mov $out,%rdi
478	mov %r9,%rsi
479	mov $len,%rcx
480	.byte 0xf3,0xa4 # rep movsb
481
482	.Ldone_squeeze_avx512:
483	vzeroupper
484
485	lea (%r11),%rsp
486	ret
487	.size SHA3_squeeze,.-SHA3_squeeze
488
489	.section .rodata
490	.align 64
491	theta_perm:
492	.quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used]
493	.quad 4, 0, 1, 2, 3, 5, 6, 7
494	.quad 3, 4, 0, 1, 2, 5, 6, 7
495	.quad 2, 3, 4, 0, 1, 5, 6, 7
496	.quad 1, 2, 3, 4, 0, 5, 6, 7
497
498	rhotates1:
499	.quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4]
500	.quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4]
501	.quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4]
502	.quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4]
503	.quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4]
504
505	rhotates0:
506	.quad 0, 1, 62, 28, 27, 0, 0, 0
507	.quad 36, 44, 6, 55, 20, 0, 0, 0
508	.quad 3, 10, 43, 25, 39, 0, 0, 0
509	.quad 41, 45, 15, 21, 8, 0, 0, 0
510	.quad 18, 2, 61, 56, 14, 0, 0, 0
511
512	pi0_perm:
513	.quad 0, 3, 1, 4, 2, 5, 6, 7
514	.quad 1, 4, 2, 0, 3, 5, 6, 7
515	.quad 2, 0, 3, 1, 4, 5, 6, 7
516	.quad 3, 1, 4, 2, 0, 5, 6, 7
517	.quad 4, 2, 0, 3, 1, 5, 6, 7
518
519
520	iotas:
521	.quad 0x0000000000000001
522	.quad 0x0000000000008082
523	.quad 0x800000000000808a
524	.quad 0x8000000080008000
525	.quad 0x000000000000808b
526	.quad 0x0000000080000001
527	.quad 0x8000000080008081
528	.quad 0x8000000000008009
529	.quad 0x000000000000008a
530	.quad 0x0000000000000088
531	.quad 0x0000000080008009
532	.quad 0x000000008000000a
533	.quad 0x000000008000808b
534	.quad 0x800000000000008b
535	.quad 0x8000000000008089
536	.quad 0x8000000000008003
537	.quad 0x8000000000008002
538	.quad 0x8000000000000080
539	.quad 0x000000000000800a
540	.quad 0x800000008000000a
541	.quad 0x8000000080008081
542	.quad 0x8000000000008080
543	.quad 0x0000000080000001
544	.quad 0x8000000080008008
545
546	.asciz "Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
547	___
548
# Write the generated assembly.  The last command-line argument, when
# present, names the output file; without one the text goes to the
# inherited STDOUT.
$output = pop;
if ($output) {
    # Three-argument open so that no character of the file name can be
    # read as part of the mode.  A failed open is fatal: otherwise the
    # assembly would silently go to the inherited STDOUT and the
    # requested file would never be written.
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-3.1.2/crypto/sha/asm/keccak1600-avx512.pl@ 101021

Download in other formats: