#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX512VL.
#
# December 2017.
#
# This is an adaptation of the AVX2 module that reuses the register data
# layout, but utilizes new 256-bit AVX512VL instructions. See the AVX2
# module for further information on the layout.
#
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#		r=1088(*)
#
# Skylake-X	6.4/+47%
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	coefficient in comparison to scalar keccak1600-x86_64.pl.

# Digits in variables' names denote right-most coordinates:

my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear
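
# For example, A[2][3] maps to [5,2]: slot 2 of register pair 5 ($A41,
# whose layout comment above reads [1][4] [2][3] [3][2] [4][1]), giving
# linear offset 8*(5*4+2) = 176 within the jagged transfer area.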

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21));

$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea	iotas(%rip),%r10
	mov	\$24,%eax
	jmp	.Loop_avx512vl

.align	32
.Loop_avx512vl:
	######################################### Theta
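	# Theta: column parities C[x] = A[x][0]^A[x][1]^...^A[x][4], then
	# D[x] = C[x-1] ^ ROL64(C[x+1],1) is folded into every lane of
	# column x.  vpternlogq with imm8 0x96 is a three-way XOR, doing in
	# one instruction what took two vpxor's on the AVX2 path.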
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpternlogq	\$0x96,$A01,$T[2],$C14	# C[1..4]

	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpermq		\$0b10010011,$C14,@T[4]
	vprolq		\$1,$C14,@T[1]		# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpternlogq	\$0x96,@T[0],$A00,$C00	# C[0..0]
	vprolq		\$1,$C00,@T[1]		# ROL64(C[0..0],1)

	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[0]

	######################################### Rho + Pi + pre-Chi shuffle
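	# Rho rotates each lane left by its fixed offset using the per-lane
	# variable rotate vprolvq (an AVX512 instruction AVX2 lacks); the
	# rotate counts come from the rhotates_left table preloaded into
	# R20..R11 by SHA3_absorb/SHA3_squeeze.  The vpermq's below begin
	# the Pi permutation towards the order Chi wants.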
	vpxor		$D00,$A20,$A20		# ^= D[0..0] from Theta
	vprolvq		$R20,$A20,$A20

	vpternlogq	\$0x96,@T[0],$D14,$A31	# ^= D[1..4] from Theta
	vprolvq		$R31,$A31,$A31

	vpternlogq	\$0x96,@T[0],$D14,$A21	# ^= D[1..4] from Theta
	vprolvq		$R21,$A21,$A21

	vpternlogq	\$0x96,@T[0],$D14,$A41	# ^= D[1..4] from Theta
	vprolvq		$R41,$A41,$A41

	vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpternlogq	\$0x96,@T[0],$D14,$A11	# ^= D[1..4] from Theta
	vprolvq		$R11,$A11,@T[1]		# $A11 -> future $A01

	vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpternlogq	\$0x96,@T[0],$D14,$A01	# ^= D[1..4] from Theta
	vprolvq		$R01,$A01,@T[2]		# $A01 -> future $A20

	######################################### Chi
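	# Chi: A[x][y] ^= ~A[x+1][y] & A[x+2][y] along each row.  In the
	# AT&T operand order "vpternlogq imm, C, B, A" used below, imm8
	# 0xC6 computes B ^ (~A & C), so the blend-assembled destination
	# doubles as the complemented operand and the separate vpandn/vpxor
	# pair of the AVX2 path disappears.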
	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpternlogq	\$0xC6,@T[8],@T[3],$A31		# [3][1] [1][2] [4][3] [2][4]
	vpternlogq	\$0xC6,@T[7],@T[5],$A41		# [3][2] [1][4] [4][1] [2][3]

	vpsrldq		\$8,@T[1],@T[0]
	vpandn		@T[0],@T[1],@T[0]	# targeting [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpternlogq	\$0xC6,@T[8],@T[6],$A11		# [3][3] [1][1] [4][4] [2][2]

	vpermq		\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	vpermq		\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpternlogq	\$0xC6,@T[7],@T[2],$A20		# [3][0] [1][0] [4][0] [2][0]

	vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	vpermq		\$0b00011011,$A31,$A31		# post-Chi shuffle
	vpermq		\$0b10001101,$A41,$A41
	vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]

	vpternlogq	\$0xC6,@T[8],@T[1],$A01		# [0][4] [0][3] [0][2] [0][1]
	vpternlogq	\$0xC6,@T[7],@T[4],$A21		# [3][4] [1][3] [4][2] [2][1]

	######################################### Iota
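	# Iota: fold the per-round constant (broadcast four-fold in the
	# iotas table that %r10 walks) into lane [0][0]; imm8 0x96 again
	# gives a one-instruction three-way XOR.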
	vpternlogq	\$0x96,(%r10),@T[0],$A00
	lea		32(%r10),%r10

	dec	%eax
	jnz	.Loop_avx512vl

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp
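	# 240 bytes cover the 7*32-byte jagged transfer area plus 32-byte
	# alignment slack; %r11 preserves the caller's stack pointer for
	# the epilogue.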

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10
	lea	rhotates_left(%rip),%r8

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
	vmovdqa64	1*32(%r8),$R01
	vmovdqa64	2*32(%r8),$R31
	vmovdqa64	3*32(%r8),$R21
	vmovdqa64	4*32(%r8),$R41
	vmovdqa64	5*32(%r8),$R11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx512vl:
	mov	$bsz,%rax
	sub	$bsz,$len
	jc	.Ldone_absorb_avx512vl
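	# len has gone negative on the final, partial block; the epilogue
	# returns len+bsz, i.e. the number of trailing input bytes that
	# did not fill a whole block.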

	shr	\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu	8-96($inp),@T[1]
	sub	\$4,%eax
___
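# The first five words of the block went straight into @T[0]/@T[1] above;
# the rest are copied one 8-byte word at a time into the jagged transfer
# area, fully unrolled so %eax only counts down to the early exit.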
for(my $i=5; $i<25; $i++) {
$code.=<<___
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorbed_avx512vl:
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax	# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	lea	rhotates_left(%rip),%r8
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
	vmovdqa64	1*32(%r8),$R01
	vmovdqa64	2*32(%r8),$R31
	vmovdqa64	3*32(%r8),$R21
	vmovdqa64	4*32(%r8),$R41
	vmovdqa64	5*32(%r8),$R11

	mov	$bsz,%rax

.Loop_squeeze_avx512vl:
	mov	@A_jagged[0]-96($A_flat),%r8
___
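# In the flat state A[0][0] occupies the first 8 bytes on its own and the
# 32-byte register rows follow at byte 8, so the jagged offsets used for
# the stack transfer area are biased by -24 here: -120 = -96 - 24.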
for (my $i=0; $i<25; $i++) {
$code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	@A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx512vl:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512vl

.Ltail_squeeze_avx512vl:
	add	\$8,$len
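	# 1..7 leftover bytes: dribble the last word out of %r8 one byte
	# at a time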
.Loop_tail_avx512vl:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
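	# The 24 Keccak-f[1600] round constants, each broadcast to all four
	# lanes so Iota can consume them with a full-width vpternlogq.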
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
___

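# Redirect stdout to the output file, if one was passed as the last
# command-line argument.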
$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";