aesni-sha256-x86_64.pl@ 102427

Last change on this file since 102427 was 101211, checked in by vboxsync, 17 months ago
openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527
File size: 43.5 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the Apache License 2.0 (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9	#
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	# ====================================================================
16	#
17	# January 2013
18	#
19	# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20	# in http://download.intel.com/design/intarch/papers/323686.pdf, is
21	# that since AESNI-CBC encrypt exhibit very low instruction-level
22	# parallelism, interleaving it with another algorithm would allow to
23	# utilize processor resources better and achieve better performance.
24	# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25	# AESNI code is weaved into it. As SHA256 dominates execution time,
26	# stitch performance does not depend on AES key length. Below are
27	# performance numbers in cycles per processed byte, less is better,
28	# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
29	# subroutine:
30	#
31	# AES-128/-192/-256+SHA256 this(**) gain
32	# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
33	# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
34	# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
35	# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
36	# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
37	# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54%
38	# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60%
39	#
40	# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
41	# Westmere is omitted from loop, this is because gain was not
42	# estimated high enough to justify the effort;
43	# (**) these are EVP-free results, results obtained with 'speed
44	# -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
45	# (***) these are SHAEXT results;
46
47	# $output is the last argument if it looks like a file (it has an extension)
48	# $flavour is the first argument if it doesn't look like a file
49	$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
50	$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
51
	# Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
	# output file name ending in .asm.
52	$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
	# Locate the perlasm translator: next to this script first, then under
	# ../../perlasm relative to it.
54	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57	die "can't locate x86_64-xlate.pl";
58
	# Probe the toolchain.  Each probe sets $avx to 0, 1 or 2 from the reported
	# version; later probes only run while $avx is still false.  $avx>1 is what
	# enables the AVX2 code path further down.
59	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
60	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
61	$avx = ($1>=2.19) + ($1>=2.22);
62	}
63
64	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
65	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
66	$avx = ($1>=2.09) + ($1>=2.10);
67	}
68
69	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
71	$avx = ($1>=10) + ($1>=12);
72	}
73
74	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
75	$avx = ($2>=3.0) + ($2>3.0);
76	}
77
78	$shaext=$avx; ### set to zero if compiling for 1.0.1
79	$avx=1 if (!$shaext && $avx);
80
	# Pipe everything printed on STDOUT through the translator: OUT is opened
	# as a pipe to "perl x86_64-xlate.pl <flavour> <output>" and the STDOUT
	# glob is then aliased to it.
81	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
82	or die "can't call $xlate: $!";
83	*STDOUT=*OUT;
84
85	$func="aesni_cbc_sha256_enc"; # name of the generated entry point; suffixed _xop/_avx/_avx2 per code path
86	$TABLE="K256"; # label of the constant table emitted after the dispatcher
87	$SZ=4; # word size in bytes used for all state/table offsets below
	# Working registers: @ROT holds the eight 32-bit state registers and is
	# rotated by the round bodies; $T1,$a0..$a3 are scratch registers.
88	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
89	"%r8d","%r9d","%r10d","%r11d");
90	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
	# Rotate/shift amounts consumed by the round bodies (Sigma*) and by the
	# message-schedule update code (sigma*).
91	@Sigma0=( 2,13,22);
92	@Sigma1=( 6,11,25);
93	@sigma0=( 7,18, 3);
94	@sigma1=(17,19,10);
95	$rounds=64;
96
97	########################################################################
98	# void aesni_cbc_sha256_enc(const void *inp,
99	# void *out,
100	# size_t length,
101	# const AES_KEY *key,
102	# unsigned char *iv,
103	# SHA256_CTX *ctx,
104	# const void *in0);
	# Registers holding the seven arguments above; the seventh ($in0) is
	# loaded from the stack by each code path's prologue.
105	($inp, $out, $len, $key, $ivp, $ctx, $in0) =
106	("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
107
108	$Tbl="%rbp"; # pointer into the $TABLE constants inside the round loops
109
110	$_inp="16*$SZ+0*8(%rsp)"; # stack frame: 16 words of X[i]+K[i] scratch, then eight 8-byte save slots
111	$_out="16*$SZ+1*8(%rsp)";
112	$_end="16*$SZ+2*8(%rsp)";
113	$_key="16*$SZ+3*8(%rsp)";
114	$_ivp="16*$SZ+4*8(%rsp)";
115	$_ctx="16*$SZ+5*8(%rsp)";
116	$_in0="16*$SZ+6*8(%rsp)";
	# $_rsp is wrapped in backticks because it is also used inside
	# .cfi_cfa_expression, where the offset is evaluated before use.
117	$_rsp="`16*$SZ+7*8`(%rsp)";
	# Total frame size in bytes; the Win64 xmm save area is placed at this offset.
118	$framesz=16*$SZ+8*8;
119
120	$code=<<___;
121	.text
122
123	.extern OPENSSL_ia32cap_P
124	.globl $func
125	.type $func,\@abi-omnipotent
126	.align 16
127	$func:
128	.cfi_startproc
129	___
130	if ($avx) {
131	$code.=<<___;
132	lea OPENSSL_ia32cap_P(%rip),%r11
133	mov \$1,%eax
134	cmp \$0,`$win64?"%rcx":"%rdi"`
135	je .Lprobe
136	mov 0(%r11),%eax
137	mov 4(%r11),%r10
138	___
139	$code.=<<___ if ($shaext);
140	bt \$61,%r10 # check for SHA
141	jc ${func}_shaext
142	___
143	$code.=<<___;
144	mov %r10,%r11
145	shr \$32,%r11
146
147	test \$`1<<11`,%r10d # check for XOP
148	jnz ${func}_xop
149	___
150	$code.=<<___ if ($avx>1); # only emitted when the assembler is recent enough for the AVX2 path
151	and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
152	cmp \$`1<<8|1<<5|1<<3`,%r11d
153	je ${func}_avx2
154	___
155	$code.=<<___;
156	and \$`1<<28`,%r10d # check for AVX
157	jnz ${func}_avx
158	ud2
159	___
160	}
161	$code.=<<___;
162	xor %eax,%eax
163	cmp \$0,`$win64?"%rcx":"%rdi"`
164	je .Lprobe
165	ud2
166	.Lprobe:
167	ret
168	.cfi_endproc
169	.size $func,.-$func
170
171	.align 64
172	.type $TABLE,\@object
173	$TABLE:
174	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
175	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
176	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
177	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
178	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
179	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
180	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
181	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
182	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
183	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
184	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
185	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
186	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
187	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
188	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
189	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
190	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
191	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
192	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
193	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
194	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
195	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
196	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
197	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
198	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
199	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
200	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
201	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
202	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
203	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
204	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
205	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
206
207	.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
208	.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
209	.long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
210	.long 0,0,0,0, 0,0,0,0
211	.asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
212	.align 64
213	___
214
215	######################################################################
216	# SIMD code paths
217	#
218	{{{
219	($iv,$inout,$roundkey,$temp,
220	$mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
221
222	$aesni_cbc_idx=0;
223	@aesni_cbc_block = (
224	## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
225	## &vmovdqu ($inout,($inp));
226	## &mov ($_inp,$inp);
227
228	'&vpxor ($inout,$inout,$roundkey);'.
229	' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
230
231	'&vpxor ($inout,$inout,$iv);',
232
233	'&vaesenc ($inout,$inout,$roundkey);'.
234	' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
235
236	'&vaesenc ($inout,$inout,$roundkey);'.
237	' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
238
239	'&vaesenc ($inout,$inout,$roundkey);'.
240	' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
241
242	'&vaesenc ($inout,$inout,$roundkey);'.
243	' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
244
245	'&vaesenc ($inout,$inout,$roundkey);'.
246	' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
247
248	'&vaesenc ($inout,$inout,$roundkey);'.
249	' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
250
251	'&vaesenc ($inout,$inout,$roundkey);'.
252	' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
253
254	'&vaesenc ($inout,$inout,$roundkey);'.
255	' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
256
257	'&vaesenc ($inout,$inout,$roundkey);'.
258	' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
259
260	'&vaesenclast ($temp,$inout,$roundkey);'.
261	' &vaesenc ($inout,$inout,$roundkey);'.
262	' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
263
264	'&vpand ($iv,$temp,$mask10);'.
265	' &vaesenc ($inout,$inout,$roundkey);'.
266	' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
267
268	'&vaesenclast ($temp,$inout,$roundkey);'.
269	' &vaesenc ($inout,$inout,$roundkey);'.
270	' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
271
272	'&vpand ($temp,$temp,$mask12);'.
273	' &vaesenc ($inout,$inout,$roundkey);'.
274	'&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
275
276	'&vpor ($iv,$iv,$temp);'.
277	' &vaesenclast ($temp,$inout,$roundkey);'.
278	' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
279
280	## &mov ($inp,$_inp);
281	## &mov ($out,$_out);
282	## &vpand ($temp,$temp,$mask14);
283	## &vpor ($iv,$iv,$temp);
284	## &vmovdqu ($iv,($out,$inp);
285	## &lea (inp,16($inp));
286	);
287
288	my $a4=$T1;
289	my ($a,$b,$c,$d,$e,$f,$g,$h);
290
291	sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
	# Catch-all for undefined subs such as &vpxor(...) or &mov(...): the sub
	# name becomes the opcode and one line of assembly is appended to $code.
292	{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; # strip the package qualifier
293	my $arg = pop; # last Perl argument is emitted first
294	$arg = "\$$arg" if ($arg*1 eq $arg); # purely numeric argument becomes a $-prefixed immediate
295	$code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; # remaining operands follow in reversed order
296	}
297
298	sub body_00_15 () {
	# Returns a list of Perl source strings; callers eval them one at a time,
	# interleaving them with SIMD code.  Each string emits scalar round
	# instructions through AUTOLOAD; one entry per call is prefixed with the
	# next @aesni_cbc_block element, which weaves the AES-CBC steps in.
	# The final string swaps $a2/$a3, rotates @ROT and increments $i.
299	(
300	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
301
302	'&ror ($a0,$Sigma1[2]-$Sigma1[1])',
303	'&mov ($a,$a1)',
304	'&mov ($a4,$f)',
305
306	'&xor ($a0,$e)',
307	'&ror ($a1,$Sigma0[2]-$Sigma0[1])',
308	'&xor ($a4,$g)', # f^g
309
310	'&ror ($a0,$Sigma1[1]-$Sigma1[0])',
311	'&xor ($a1,$a)',
312	'&and ($a4,$e)', # (f^g)&e
313
	# the AES step string is concatenated in front of the xor below
314	@aesni_cbc_block[$aesni_cbc_idx++].
315	'&xor ($a0,$e)',
316	'&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
317	'&mov ($a2,$a)',
318
319	'&ror ($a1,$Sigma0[1]-$Sigma0[0])',
320	'&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
321	'&xor ($a2,$b)', # a^b, b^c in next round
322
323	'&ror ($a0,$Sigma1[0])', # Sigma1(e)
324	'&add ($h,$a4)', # h+=Ch(e,f,g)
325	'&and ($a3,$a2)', # (b^c)&(a^b)
326
327	'&xor ($a1,$a)',
328	'&add ($h,$a0)', # h+=Sigma1(e)
329	'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
330
331	'&add ($d,$h)', # d+=h
332	'&ror ($a1,$Sigma0[0])', # Sigma0(a)
333	'&add ($h,$a3)', # h+=Maj(a,b,c)
334
335	'&mov ($a0,$d)',
336	'&add ($a1,$h);'. # h+=Sigma0(a)
337	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
338	);
339	}
340
341	if ($avx) {{
342	######################################################################
343	# XOP code path
344	#
345	$code.=<<___; # XOP entry: save callee-saved GPRs, carve the aligned frame, stash the arguments
346	.type ${func}_xop,\@function,6
347	.align 64
348	${func}_xop:
349	.cfi_startproc
350	.Lxop_shortcut:
351	mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
352	mov %rsp,%rax # copy %rsp
353	.cfi_def_cfa_register %rax
354	push %rbx
355	.cfi_push %rbx
356	push %rbp
357	.cfi_push %rbp
358	push %r12
359	.cfi_push %r12
360	push %r13
361	.cfi_push %r13
362	push %r14
363	.cfi_push %r14
364	push %r15
365	.cfi_push %r15
366	sub \$`$framesz+$win64*16*10`,%rsp
367	and \$-64,%rsp # align stack frame
368
369	shl \$6,$len
370	sub $inp,$out # re-bias
371	sub $inp,$in0
372	add $inp,$len # end of input
373
374	#mov $inp,$_inp # saved later
375	mov $out,$_out
376	mov $len,$_end
377	#mov $key,$_key # remains resident in $inp register
378	mov $ivp,$_ivp
379	mov $ctx,$_ctx
380	mov $in0,$_in0
381	mov %rax,$_rsp
382	.cfi_cfa_expression $_rsp,deref,+8
383	___
384	$code.=<<___ if ($win64);
385	movaps %xmm6,`$framesz+16*0`(%rsp)
386	movaps %xmm7,`$framesz+16*1`(%rsp)
387	movaps %xmm8,`$framesz+16*2`(%rsp)
388	movaps %xmm9,`$framesz+16*3`(%rsp)
389	movaps %xmm10,`$framesz+16*4`(%rsp)
390	movaps %xmm11,`$framesz+16*5`(%rsp)
391	movaps %xmm12,`$framesz+16*6`(%rsp)
392	movaps %xmm13,`$framesz+16*7`(%rsp)
393	movaps %xmm14,`$framesz+16*8`(%rsp)
394	movaps %xmm15,`$framesz+16*9`(%rsp)
395	___
396	$code.=<<___; # XOP setup: load IV, hash state, key-length masks and the first round key
397	.Lprologue_xop:
398	vzeroall
399
400	mov $inp,%r12 # borrow $a4
401	lea 0x80($key),$inp # size optimization, reassign
402	lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
403	mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
404	mov $ctx,%r15 # borrow $a2
405	mov $in0,%rsi # borrow $a3
406	vmovdqu ($ivp),$iv # load IV
407	sub \$9,%r14
408
409	mov $SZ*0(%r15),$A
410	mov $SZ*1(%r15),$B
411	mov $SZ*2(%r15),$C
412	mov $SZ*3(%r15),$D
413	mov $SZ*4(%r15),$E
414	mov $SZ*5(%r15),$F
415	mov $SZ*6(%r15),$G
416	mov $SZ*7(%r15),$H
417
418	vmovdqa 0x00(%r13,%r14,8),$mask14
419	vmovdqa 0x10(%r13,%r14,8),$mask12
420	vmovdqa 0x20(%r13,%r14,8),$mask10
421	vmovdqu 0x00-0x80($inp),$roundkey
422	jmp .Lloop_xop
423	___
424	if ($SZ==4) { # SHA256
425	my @X = map("%xmm$_",(0..3));
426	my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
427
428	$code.=<<___; # XOP per-block head: load and byte-swap 64 bytes, pre-add the first table rows
429	.align 16
430	.Lloop_xop:
431	vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
432	vmovdqu 0x00(%rsi,%r12),@X[0]
433	vmovdqu 0x10(%rsi,%r12),@X[1]
434	vmovdqu 0x20(%rsi,%r12),@X[2]
435	vmovdqu 0x30(%rsi,%r12),@X[3]
436	vpshufb $t3,@X[0],@X[0]
437	lea $TABLE(%rip),$Tbl
438	vpshufb $t3,@X[1],@X[1]
439	vpshufb $t3,@X[2],@X[2]
440	vpaddd 0x00($Tbl),@X[0],$t0
441	vpshufb $t3,@X[3],@X[3]
442	vpaddd 0x20($Tbl),@X[1],$t1
443	vpaddd 0x40($Tbl),@X[2],$t2
444	vpaddd 0x60($Tbl),@X[3],$t3
445	vmovdqa $t0,0x00(%rsp)
446	mov $A,$a1
447	vmovdqa $t1,0x10(%rsp)
448	mov $B,$a3
449	vmovdqa $t2,0x20(%rsp)
450	xor $C,$a3 # magic
451	vmovdqa $t3,0x30(%rsp)
452	mov $E,$a0
453	jmp .Lxop_00_47
454
455	.align 16
456	.Lxop_00_47:
457	sub \$-16*2*$SZ,$Tbl # size optimization
458	vmovdqu (%r12),$inout # $a4
459	mov %r12,$_inp # $a4
460	___
461	sub XOP_256_00_47 () {
	# Emits one message-schedule update of @X[0] using XOP rotates, interleaved
	# with four scalar rounds produced by $body.
	# Arguments: $j (0..3, selects the table row and stack slot), $body (code
	# ref returning round-instruction strings), then the current @X order.
	# Table rows are 32 bytes apart (each 16-byte row is stored twice), hence
	# the 16*2*$j table offset against the 16*$j stack offset.
462	my $j = shift;
463	my $body = shift;
464	my @X = @_;
465	my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
466
467	&vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
468	eval(shift(@insns));
469	eval(shift(@insns));
470	&vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
471	eval(shift(@insns));
472	eval(shift(@insns));
473	&vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
474	eval(shift(@insns));
475	eval(shift(@insns));
476	&vpsrld ($t0,$t0,$sigma0[2]);
477	eval(shift(@insns));
478	eval(shift(@insns));
479	&vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
480	eval(shift(@insns));
481	eval(shift(@insns));
482	eval(shift(@insns));
483	eval(shift(@insns));
484	&vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
485	eval(shift(@insns));
486	eval(shift(@insns));
487	&vpxor ($t0,$t0,$t1);
488	eval(shift(@insns));
489	eval(shift(@insns));
490	eval(shift(@insns));
491	eval(shift(@insns));
492	&vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
493	eval(shift(@insns));
494	eval(shift(@insns));
495	&vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
496	eval(shift(@insns));
497	eval(shift(@insns));
498	&vpsrld ($t2,@X[3],$sigma1[2]);
499	eval(shift(@insns));
500	eval(shift(@insns));
501	&vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
502	eval(shift(@insns));
503	eval(shift(@insns));
504	&vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
505	eval(shift(@insns));
506	eval(shift(@insns));
507	&vpxor ($t3,$t3,$t2);
508	eval(shift(@insns));
509	eval(shift(@insns));
510	eval(shift(@insns));
511	eval(shift(@insns));
512	&vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
513	eval(shift(@insns));
514	eval(shift(@insns));
515	eval(shift(@insns));
516	eval(shift(@insns));
517	&vpsrldq ($t3,$t3,8);
518	eval(shift(@insns));
519	eval(shift(@insns));
520	eval(shift(@insns));
521	eval(shift(@insns));
522	&vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
523	eval(shift(@insns));
524	eval(shift(@insns));
525	eval(shift(@insns));
526	eval(shift(@insns));
527	&vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
528	eval(shift(@insns));
529	eval(shift(@insns));
530	&vpsrld ($t2,@X[0],$sigma1[2]);
531	eval(shift(@insns));
532	eval(shift(@insns));
533	&vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
534	eval(shift(@insns));
535	eval(shift(@insns));
536	&vpxor ($t3,$t3,$t2);
537	eval(shift(@insns));
538	eval(shift(@insns));
539	eval(shift(@insns));
540	eval(shift(@insns));
541	&vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
542	eval(shift(@insns));
543	eval(shift(@insns));
544	eval(shift(@insns));
545	eval(shift(@insns));
546	&vpslldq ($t3,$t3,8); # 22 instructions
547	eval(shift(@insns));
548	eval(shift(@insns));
549	eval(shift(@insns));
550	eval(shift(@insns));
551	&vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
552	eval(shift(@insns));
553	eval(shift(@insns));
554	eval(shift(@insns));
555	eval(shift(@insns));
556	&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
557	foreach (@insns) { eval; } # remaining instructions
558	&vmovdqa (16*$j."(%rsp)",$t2);
559	}
560
561	$aesni_cbc_idx=0; # restart the AES step sequence for the rounds generated below
	# Generate the 16 rounds that run alongside the schedule update; @X is
	# rotated after each group of four.
562	for ($i=0,$j=0; $j<4; $j++) {
563	&XOP_256_00_47($j,\&body_00_15,@X);
564	push(@X,shift(@X)); # rotate(@X)
565	}
566	&mov ("%r12",$_inp); # borrow $a4
567	&vpand ($temp,$temp,$mask14);
568	&mov ("%r15",$_out); # borrow $a2
569	&vpor ($iv,$iv,$temp);
570	&vmovdqu ("(%r15,%r12)",$iv); # write output
571	&lea ("%r12","16(%r12)"); # inp++
572
	# Loop back to .Lxop_00_47 while the tested byte of the table row that is
	# 16*2*$SZ bytes ahead of $Tbl is non-zero.
573	&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
574	&jne (".Lxop_00_47");
575
576	&vmovdqu ($inout,"(%r12)");
577	&mov ($_inp,"%r12");
578
	# Final 16 rounds: scalar round bodies only; body_00_15 advances $i itself.
579	$aesni_cbc_idx=0;
580	for ($i=0; $i<16; ) {
581	foreach(body_00_15()) { eval; }
582	}
583	}
584	$code.=<<___;
585	mov $_inp,%r12 # borrow $a4
586	mov $_out,%r13 # borrow $a0
587	mov $_ctx,%r15 # borrow $a2
588	mov $_in0,%rsi # borrow $a3
589
590	vpand $mask14,$temp,$temp
591	mov $a1,$A
592	vpor $temp,$iv,$iv
593	vmovdqu $iv,(%r13,%r12) # write output
594	lea 16(%r12),%r12 # inp++
595
596	add $SZ*0(%r15),$A
597	add $SZ*1(%r15),$B
598	add $SZ*2(%r15),$C
599	add $SZ*3(%r15),$D
600	add $SZ*4(%r15),$E
601	add $SZ*5(%r15),$F
602	add $SZ*6(%r15),$G
603	add $SZ*7(%r15),$H
604
605	cmp $_end,%r12
606
607	mov $A,$SZ*0(%r15)
608	mov $B,$SZ*1(%r15)
609	mov $C,$SZ*2(%r15)
610	mov $D,$SZ*3(%r15)
611	mov $E,$SZ*4(%r15)
612	mov $F,$SZ*5(%r15)
613	mov $G,$SZ*6(%r15)
614	mov $H,$SZ*7(%r15)
615
616	jb .Lloop_xop
617
618	mov $_ivp,$ivp
619	mov $_rsp,%rsi
620	.cfi_def_cfa %rsi,8
621	vmovdqu $iv,($ivp) # output IV
622	vzeroall
623	___
624	$code.=<<___ if ($win64);
625	movaps `$framesz+16*0`(%rsp),%xmm6
626	movaps `$framesz+16*1`(%rsp),%xmm7
627	movaps `$framesz+16*2`(%rsp),%xmm8
628	movaps `$framesz+16*3`(%rsp),%xmm9
629	movaps `$framesz+16*4`(%rsp),%xmm10
630	movaps `$framesz+16*5`(%rsp),%xmm11
631	movaps `$framesz+16*6`(%rsp),%xmm12
632	movaps `$framesz+16*7`(%rsp),%xmm13
633	movaps `$framesz+16*8`(%rsp),%xmm14
634	movaps `$framesz+16*9`(%rsp),%xmm15
635	___
636	$code.=<<___;
637	mov -48(%rsi),%r15
638	.cfi_restore %r15
639	mov -40(%rsi),%r14
640	.cfi_restore %r14
641	mov -32(%rsi),%r13
642	.cfi_restore %r13
643	mov -24(%rsi),%r12
644	.cfi_restore %r12
645	mov -16(%rsi),%rbp
646	.cfi_restore %rbp
647	mov -8(%rsi),%rbx
648	.cfi_restore %rbx
649	lea (%rsi),%rsp
650	.cfi_def_cfa_register %rsp
651	.Lepilogue_xop:
652	ret
653	.cfi_endproc
654	.size ${func}_xop,.-${func}_xop
655	___
656	######################################################################
657	# AVX+shrd code path
658	#
659	local *ror = sub { &shrd(@_[0],@_) };
660
661	$code.=<<___; # AVX entry: save callee-saved GPRs, carve the aligned frame, stash the arguments
662	.type ${func}_avx,\@function,6
663	.align 64
664	${func}_avx:
665	.cfi_startproc
666	.Lavx_shortcut:
667	mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
668	mov %rsp,%rax # copy %rsp
669	.cfi_def_cfa_register %rax
670	push %rbx
671	.cfi_push %rbx
672	push %rbp
673	.cfi_push %rbp
674	push %r12
675	.cfi_push %r12
676	push %r13
677	.cfi_push %r13
678	push %r14
679	.cfi_push %r14
680	push %r15
681	.cfi_push %r15
682	sub \$`$framesz+$win64*16*10`,%rsp
683	and \$-64,%rsp # align stack frame
684
685	shl \$6,$len
686	sub $inp,$out # re-bias
687	sub $inp,$in0
688	add $inp,$len # end of input
689
690	#mov $inp,$_inp # saved later
691	mov $out,$_out
692	mov $len,$_end
693	#mov $key,$_key # remains resident in $inp register
694	mov $ivp,$_ivp
695	mov $ctx,$_ctx
696	mov $in0,$_in0
697	mov %rax,$_rsp
698	.cfi_cfa_expression $_rsp,deref,+8
699	___
700	$code.=<<___ if ($win64);
701	movaps %xmm6,`$framesz+16*0`(%rsp)
702	movaps %xmm7,`$framesz+16*1`(%rsp)
703	movaps %xmm8,`$framesz+16*2`(%rsp)
704	movaps %xmm9,`$framesz+16*3`(%rsp)
705	movaps %xmm10,`$framesz+16*4`(%rsp)
706	movaps %xmm11,`$framesz+16*5`(%rsp)
707	movaps %xmm12,`$framesz+16*6`(%rsp)
708	movaps %xmm13,`$framesz+16*7`(%rsp)
709	movaps %xmm14,`$framesz+16*8`(%rsp)
710	movaps %xmm15,`$framesz+16*9`(%rsp)
711	___
712	$code.=<<___; # AVX setup: load IV, hash state, key-length masks and the first round key
713	.Lprologue_avx:
714	vzeroall
715
716	mov $inp,%r12 # borrow $a4
717	lea 0x80($key),$inp # size optimization, reassign
718	lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
719	mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
720	mov $ctx,%r15 # borrow $a2
721	mov $in0,%rsi # borrow $a3
722	vmovdqu ($ivp),$iv # load IV
723	sub \$9,%r14
724
725	mov $SZ*0(%r15),$A
726	mov $SZ*1(%r15),$B
727	mov $SZ*2(%r15),$C
728	mov $SZ*3(%r15),$D
729	mov $SZ*4(%r15),$E
730	mov $SZ*5(%r15),$F
731	mov $SZ*6(%r15),$G
732	mov $SZ*7(%r15),$H
733
734	vmovdqa 0x00(%r13,%r14,8),$mask14
735	vmovdqa 0x10(%r13,%r14,8),$mask12
736	vmovdqa 0x20(%r13,%r14,8),$mask10
737	vmovdqu 0x00-0x80($inp),$roundkey
738	___
739	if ($SZ==4) { # SHA256
740	my @X = map("%xmm$_",(0..3));
741	my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
742
743	$code.=<<___; # AVX per-block head: load and byte-swap 64 bytes, pre-add the first table rows
744	jmp .Lloop_avx
745	.align 16
746	.Lloop_avx:
747	vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
748	vmovdqu 0x00(%rsi,%r12),@X[0]
749	vmovdqu 0x10(%rsi,%r12),@X[1]
750	vmovdqu 0x20(%rsi,%r12),@X[2]
751	vmovdqu 0x30(%rsi,%r12),@X[3]
752	vpshufb $t3,@X[0],@X[0]
753	lea $TABLE(%rip),$Tbl
754	vpshufb $t3,@X[1],@X[1]
755	vpshufb $t3,@X[2],@X[2]
756	vpaddd 0x00($Tbl),@X[0],$t0
757	vpshufb $t3,@X[3],@X[3]
758	vpaddd 0x20($Tbl),@X[1],$t1
759	vpaddd 0x40($Tbl),@X[2],$t2
760	vpaddd 0x60($Tbl),@X[3],$t3
761	vmovdqa $t0,0x00(%rsp)
762	mov $A,$a1
763	vmovdqa $t1,0x10(%rsp)
764	mov $B,$a3
765	vmovdqa $t2,0x20(%rsp)
766	xor $C,$a3 # magic
767	vmovdqa $t3,0x30(%rsp)
768	mov $E,$a0
769	jmp .Lavx_00_47
770
771	.align 16
772	.Lavx_00_47:
773	sub \$-16*2*$SZ,$Tbl # size optimization
774	vmovdqu (%r12),$inout # $a4
775	mov %r12,$_inp # $a4
776	___
777	sub Xupdate_256_AVX () {
	# Returns the message-schedule update for @X[0] as a list of Perl source
	# strings, one SIMD instruction each; callers eval them one at a time so
	# scalar round instructions can be interleaved.  The strings refer to
	# @X, $t0..$t3, $SZ and the sigma tables of the scope they are eval'ed in.
778	(
779	'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
780	'&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
781	'&vpsrld ($t2,$t0,$sigma0[0]);',
782	'&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
783	'&vpsrld ($t3,$t0,$sigma0[2])',
784	'&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
785	'&vpxor ($t0,$t3,$t2)',
786	'&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
787	'&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
788	'&vpxor ($t0,$t0,$t1)',
789	'&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
790	'&vpxor ($t0,$t0,$t2)',
791	'&vpsrld ($t2,$t3,$sigma1[2]);',
792	'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
793	'&vpsrlq ($t3,$t3,$sigma1[0]);',
794	'&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
795	'&vpxor ($t2,$t2,$t3);',
796	'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
797	'&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
798	'&vpshufd ($t2,$t2,0b10000100)',
799	'&vpsrldq ($t2,$t2,8)',
800	'&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
801	'&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
802	'&vpsrld ($t2,$t3,$sigma1[2])',
803	'&vpsrlq ($t3,$t3,$sigma1[0])',
804	'&vpxor ($t2,$t2,$t3);',
805	'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
806	'&vpxor ($t2,$t2,$t3)',
807	'&vpshufd ($t2,$t2,0b11101000)',
808	'&vpslldq ($t2,$t2,8)',
809	'&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
810	);
811	}
812
813	sub AVX_256_00_47 () {
	# Emits one message-schedule update of @X[0] (from Xupdate_256_AVX),
	# following every SIMD instruction with three scalar round instructions
	# taken from four invocations of $body.
	# Arguments: $j (0..3, selects the table row and stack slot), $body (code
	# ref returning round-instruction strings), then the current @X order.
	# Table rows are 32 bytes apart, hence 16*2*$j against the 16*$j stack slot.
814	my $j = shift;
815	my $body = shift;
816	my @X = @_;
817	my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
818
819	foreach (Xupdate_256_AVX()) { # 29 instructions
820	eval;
821	eval(shift(@insns));
822	eval(shift(@insns));
823	eval(shift(@insns));
824	}
825	&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
826	foreach (@insns) { eval; } # remaining instructions
827	&vmovdqa (16*$j."(%rsp)",$t2);
828	}
829
830	$aesni_cbc_idx=0; # restart the AES step sequence for the rounds generated below
	# Generate the 16 rounds that run alongside the schedule update; @X is
	# rotated after each group of four.
831	for ($i=0,$j=0; $j<4; $j++) {
832	&AVX_256_00_47($j,\&body_00_15,@X);
833	push(@X,shift(@X)); # rotate(@X)
834	}
835	&mov ("%r12",$_inp); # borrow $a4
836	&vpand ($temp,$temp,$mask14);
837	&mov ("%r15",$_out); # borrow $a2
838	&vpor ($iv,$iv,$temp);
839	&vmovdqu ("(%r15,%r12)",$iv); # write output
840	&lea ("%r12","16(%r12)"); # inp++
841
	# Loop back to .Lavx_00_47 while the tested byte of the table row that is
	# 16*2*$SZ bytes ahead of $Tbl is non-zero.
842	&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
843	&jne (".Lavx_00_47");
844
845	&vmovdqu ($inout,"(%r12)");
846	&mov ($_inp,"%r12");
847
	# Final 16 rounds: scalar round bodies only; body_00_15 advances $i itself.
848	$aesni_cbc_idx=0;
849	for ($i=0; $i<16; ) {
850	foreach(body_00_15()) { eval; }
851	}
852
853	}
854	$code.=<<___;
855	mov $_inp,%r12 # borrow $a4
856	mov $_out,%r13 # borrow $a0
857	mov $_ctx,%r15 # borrow $a2
858	mov $_in0,%rsi # borrow $a3
859
860	vpand $mask14,$temp,$temp
861	mov $a1,$A
862	vpor $temp,$iv,$iv
863	vmovdqu $iv,(%r13,%r12) # write output
864	lea 16(%r12),%r12 # inp++
865
866	add $SZ*0(%r15),$A
867	add $SZ*1(%r15),$B
868	add $SZ*2(%r15),$C
869	add $SZ*3(%r15),$D
870	add $SZ*4(%r15),$E
871	add $SZ*5(%r15),$F
872	add $SZ*6(%r15),$G
873	add $SZ*7(%r15),$H
874
875	cmp $_end,%r12
876
877	mov $A,$SZ*0(%r15)
878	mov $B,$SZ*1(%r15)
879	mov $C,$SZ*2(%r15)
880	mov $D,$SZ*3(%r15)
881	mov $E,$SZ*4(%r15)
882	mov $F,$SZ*5(%r15)
883	mov $G,$SZ*6(%r15)
884	mov $H,$SZ*7(%r15)
885	jb .Lloop_avx
886
887	mov $_ivp,$ivp
888	mov $_rsp,%rsi
889	.cfi_def_cfa %rsi,8
890	vmovdqu $iv,($ivp) # output IV
891	vzeroall
892	___
893	$code.=<<___ if ($win64);
894	movaps `$framesz+16*0`(%rsp),%xmm6
895	movaps `$framesz+16*1`(%rsp),%xmm7
896	movaps `$framesz+16*2`(%rsp),%xmm8
897	movaps `$framesz+16*3`(%rsp),%xmm9
898	movaps `$framesz+16*4`(%rsp),%xmm10
899	movaps `$framesz+16*5`(%rsp),%xmm11
900	movaps `$framesz+16*6`(%rsp),%xmm12
901	movaps `$framesz+16*7`(%rsp),%xmm13
902	movaps `$framesz+16*8`(%rsp),%xmm14
903	movaps `$framesz+16*9`(%rsp),%xmm15
904	___
905	$code.=<<___;
906	mov -48(%rsi),%r15
907	.cfi_restore %r15
908	mov -40(%rsi),%r14
909	.cfi_restore %r14
910	mov -32(%rsi),%r13
911	.cfi_restore %r13
912	mov -24(%rsi),%r12
913	.cfi_restore %r12
914	mov -16(%rsi),%rbp
915	.cfi_restore %rbp
916	mov -8(%rsi),%rbx
917	.cfi_restore %rbx
918	lea (%rsi),%rsp
919	.cfi_def_cfa_register %rsp
920	.Lepilogue_avx:
921	ret
922	.cfi_endproc
923	.size ${func}_avx,.-${func}_avx
924	___
925
926	if ($avx>1) {{
927	######################################################################
928	# AVX2+BMI code path
929	#
930	my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
931	my $PUSH8=8*2*$SZ; # bytes the AVX2 path moves %rsp by each time it makes room for X[i]+K[i]
932	use integer; # the offset arithmetic in bodyx_00_15 relies on integer division
933
934	sub bodyx_00_15 () {
	# Returns one round as a list of Perl source strings for the AVX2+BMI
	# path; callers eval them one at a time, interleaved with SIMD code.  One
	# entry per call is prefixed with the next @aesni_cbc_block element.
	# The X[i]+K[i] operand offset is 32 bytes per group of 16/$SZ words plus
	# $SZ bytes per word within the group, wrapped modulo $PUSH8 and appended
	# to the caller's $base.
935	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
936	(
937	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
938
939	'&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
940	'&and ($a4,$e)', # f&e
941	'&rorx ($a0,$e,$Sigma1[2])',
942	'&rorx ($a2,$e,$Sigma1[1])',
943
944	'&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
945	'&lea ($h,"($h,$a4)")',
946	'&andn ($a4,$e,$g)', # ~e&g
947	'&xor ($a0,$a2)',
948
949	'&rorx ($a1,$e,$Sigma1[0])',
950	'&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
951	'&xor ($a0,$a1)', # Sigma1(e)
952	'&mov ($a2,$a)',
953
954	'&rorx ($a4,$a,$Sigma0[2])',
955	'&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
956	'&xor ($a2,$b)', # a^b, b^c in next round
957	'&rorx ($a1,$a,$Sigma0[1])',
958
959	'&rorx ($a0,$a,$Sigma0[0])',
960	'&lea ($d,"($d,$h)")', # d+=h
961	'&and ($a3,$a2)', # (b^c)&(a^b)
962	@aesni_cbc_block[$aesni_cbc_idx++].
963	'&xor ($a1,$a4)',
964
965	'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
966	'&xor ($a1,$a0)', # Sigma0(a)
967	'&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
968	'&mov ($a4,$e)', # copy of f in future
969
970	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
971	);
972	# and at the finish one has to $a+=$a1
973	}
974
975	$code.=<<___; # AVX2 entry: save callee-saved GPRs, carve the aligned frame, stash the arguments
976	.type ${func}_avx2,\@function,6
977	.align 64
978	${func}_avx2:
979	.cfi_startproc
980	.Lavx2_shortcut:
981	mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
982	mov %rsp,%rax # copy %rsp
983	.cfi_def_cfa_register %rax
984	push %rbx
985	.cfi_push %rbx
986	push %rbp
987	.cfi_push %rbp
988	push %r12
989	.cfi_push %r12
990	push %r13
991	.cfi_push %r13
992	push %r14
993	.cfi_push %r14
994	push %r15
995	.cfi_push %r15
996	sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
997	and \$-256*$SZ,%rsp # align stack frame
998	add \$`2*$SZ*($rounds-8)`,%rsp
999
1000	shl \$6,$len
1001	sub $inp,$out # re-bias
1002	sub $inp,$in0
1003	add $inp,$len # end of input
1004
1005	#mov $inp,$_inp # saved later
1006	#mov $out,$_out # kept in $offload
1007	mov $len,$_end
1008	#mov $key,$_key # remains resident in $inp register
1009	mov $ivp,$_ivp
1010	mov $ctx,$_ctx
1011	mov $in0,$_in0
1012	mov %rax,$_rsp
1013	.cfi_cfa_expression $_rsp,deref,+8
1014	___
1015	$code.=<<___ if ($win64);
1016	movaps %xmm6,`$framesz+16*0`(%rsp)
1017	movaps %xmm7,`$framesz+16*1`(%rsp)
1018	movaps %xmm8,`$framesz+16*2`(%rsp)
1019	movaps %xmm9,`$framesz+16*3`(%rsp)
1020	movaps %xmm10,`$framesz+16*4`(%rsp)
1021	movaps %xmm11,`$framesz+16*5`(%rsp)
1022	movaps %xmm12,`$framesz+16*6`(%rsp)
1023	movaps %xmm13,`$framesz+16*7`(%rsp)
1024	movaps %xmm14,`$framesz+16*8`(%rsp)
1025	movaps %xmm15,`$framesz+16*9`(%rsp)
1026	___
1027	$code.=<<___; # AVX2 setup: load IV, key-length masks, hash state and the first round key
1028	.Lprologue_avx2:
1029	vzeroall
1030
1031	mov $inp,%r13 # borrow $a0
1032	vpinsrq \$1,$out,$offload,$offload
1033	lea 0x80($key),$inp # size optimization, reassign
1034	lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
1035	mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
1036	mov $ctx,%r15 # borrow $a2
1037	mov $in0,%rsi # borrow $a3
1038	vmovdqu ($ivp),$iv # load IV
1039	lea -9(%r14),%r14
1040
1041	vmovdqa 0x00(%r12,%r14,8),$mask14
1042	vmovdqa 0x10(%r12,%r14,8),$mask12
1043	vmovdqa 0x20(%r12,%r14,8),$mask10
1044
1045	sub \$-16*$SZ,%r13 # inp++, size optimization
1046	mov $SZ*0(%r15),$A
1047	lea (%rsi,%r13),%r12 # borrow $a0
1048	mov $SZ*1(%r15),$B
1049	cmp $len,%r13 # $_end
1050	mov $SZ*2(%r15),$C
1051	cmove %rsp,%r12 # next block or random data
1052	mov $SZ*3(%r15),$D
1053	mov $SZ*4(%r15),$E
1054	mov $SZ*5(%r15),$F
1055	mov $SZ*6(%r15),$G
1056	mov $SZ*7(%r15),$H
1057	vmovdqu 0x00-0x80($inp),$roundkey
1058	___
1059	if ($SZ==4) { # SHA256
1060	my @X = map("%ymm$_",(0..3));
1061	my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1062
1063	$code.=<<___; # AVX2 per-iteration head: load two 64-byte blocks into ymm lanes, byte-swap, pre-add table rows
1064	jmp .Loop_avx2
1065	.align 16
1066	.Loop_avx2:
1067	vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1068	vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1069	vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1070	vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1071	vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1072
1073	vinserti128 \$1,(%r12),@X[0],@X[0]
1074	vinserti128 \$1,16(%r12),@X[1],@X[1]
1075	vpshufb $t3,@X[0],@X[0]
1076	vinserti128 \$1,32(%r12),@X[2],@X[2]
1077	vpshufb $t3,@X[1],@X[1]
1078	vinserti128 \$1,48(%r12),@X[3],@X[3]
1079
1080	lea $TABLE(%rip),$Tbl
1081	vpshufb $t3,@X[2],@X[2]
1082	lea -16*$SZ(%r13),%r13
1083	vpaddd 0x00($Tbl),@X[0],$t0
1084	vpshufb $t3,@X[3],@X[3]
1085	vpaddd 0x20($Tbl),@X[1],$t1
1086	vpaddd 0x40($Tbl),@X[2],$t2
1087	vpaddd 0x60($Tbl),@X[3],$t3
1088	vmovdqa $t0,0x00(%rsp)
1089	xor $a1,$a1
1090	vmovdqa $t1,0x20(%rsp)
1091	___
1092	$code.=<<___ if (!$win64);
1093	# temporarily use %rsi as frame pointer
1094	mov $_rsp,%rsi
1095	.cfi_def_cfa %rsi,8
1096	___
1097	$code.=<<___;
1098	lea -$PUSH8(%rsp),%rsp
1099	___
1100	$code.=<<___ if (!$win64);
1101	# the frame info is at $_rsp, but the stack is moving...
1102	# so a second frame pointer is saved at -8(%rsp)
1103	# that is in the red zone
1104	mov %rsi,-8(%rsp)
1105	.cfi_cfa_expression %rsp-8,deref,+8
1106	___
# Finish spilling the pre-added schedule, advance $Tbl by 16*2*$SZ bytes
# (written as a subtraction of the negative value, per the original
# "size optimization" note) and enter the .Lavx2_00_47 loop, which starts
# by loading the next 16 bytes at (%r13) into $inout and stashing %r13 in
# the low quadword of $offload.
$code.=<<___;
mov $B,$a3
vmovdqa $t2,0x00(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x20(%rsp)
mov $F,$a4
sub \$-16*2*$SZ,$Tbl # size optimization
jmp .Lavx2_00_47

.align 16
.Lavx2_00_47:
vmovdqu (%r13),$inout
vpinsrq \$0,%r13,$offload,$offload
___
1121
# Emit one quarter of an AVX2 message-schedule pass: the instructions
# returned by Xupdate_256_AVX() interleaved with four copies of the round
# body.
#
#   $j    - quarter index; selects the table offset (16*2*$j) and the stack
#           slot ((32*$j) % $PUSH8) that receives X[0]+K
#   $body - code ref returning a list of Perl snippets (strings)
#   @X    - current rotation of the message-schedule registers
#
# NOTE(review): every snippet is string-eval'ed inside this sub, so the
# snippets can see its lexicals ($j, $base, @X, @insns, ...). The names are
# presumably referenced from those snippets and must not be renamed or
# removed - confirm against bodyx_00_15/Xupdate_256_AVX.
sub AVX2_256_00_47 () {
    my $j = shift;
    my $body = shift;
    my @X = @_;
    my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
    my $base = "+2*$PUSH8(%rsp)";

    if (($j%2)==0) {
	# Every second quarter moves the stack down by $PUSH8 and, outside
	# Win64, re-copies the secondary frame pointer to -8(%rsp).
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
pushq $PUSH8-8(%rsp)
.cfi_cfa_expression %rsp,deref,+8
lea 8(%rsp),%rsp
.cfi_cfa_expression %rsp-8,deref,+8
___
    }
    foreach (Xupdate_256_AVX()) {		# 29 instructions
	eval;
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
    }
    # Table offset is 16*2*$j bytes; the multiplications had been lost.
    &vpaddd	($t2,$X[0],16*2*$j."($Tbl)");
    foreach (@insns) { eval; }		# remaining instructions
    &vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
# Emit the body of the .Lavx2_00_47 loop: four schedule quarters, the write
# of one ciphertext block, and the loop-back test on the table pointer.
# $i and $j are deliberately left as package variables: the eval'ed round
# snippets presumably read and advance $i (the 16-round loop below has no
# increment of its own).
$aesni_cbc_idx=0;
for ($i=0,$j=0; $j<4; $j++) {
    &AVX2_256_00_47($j,\&bodyx_00_15,@X);
    push(@X,shift(@X));			# rotate(@X)
}
    &vmovq	("%r13",$offload);		# borrow $a0
    &vpextrq	("%r15",$offload,1);		# borrow $a2
    &vpand	($temp,$temp,$mask14);
    &vpor	($iv,$iv,$temp);
    &vmovdqu	("(%r15,%r13)",$iv);		# write output
    &lea	("%r13","16(%r13)");		# inp++

    # Step the table pointer by 16*2*$SZ bytes (operators restored) and
    # loop until the byte tested at ($SZ-1)($Tbl) is zero.
    &lea	($Tbl,16*2*$SZ."($Tbl)");
    &cmpb	(($SZ-1)."($Tbl)",0);
    &jne	(".Lavx2_00_47");

    &vmovdqu	($inout,"(%r13)");
    &vpinsrq	($offload,$offload,"%r13",0);

# Last 16 rounds: no schedule update, operands come from the two most
# recently written stack frames.
$aesni_cbc_idx=0;
for ($i=0; $i<16; ) {
    my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
    foreach(bodyx_00_15()) { eval; }
}
1174	}
# After the rounds of the first block: recover the output/input pointers
# from $offload, reload the context pointer from its stack slot
# (2*$SZ*$rounds+5*8 above the moved %rsp), write the pending ciphertext
# block, fold the working variables into the hash state, and either finish
# (.Ldone_avx2) or fall into .Lower_avx2.
$code.=<<___;
vpextrq \$1,$offload,%r12 # $_out, borrow $a4
vmovq $offload,%r13 # $_inp, borrow $a0
mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
add $a1,$A
lea `2*$SZ*($rounds-8)`(%rsp),$Tbl

vpand $mask14,$temp,$temp
vpor $temp,$iv,$iv
vmovdqu $iv,(%r12,%r13) # write output
lea 16(%r13),%r13

add $SZ*0(%r15),$A
add $SZ*1(%r15),$B
add $SZ*2(%r15),$C
add $SZ*3(%r15),$D
add $SZ*4(%r15),$E
add $SZ*5(%r15),$F
add $SZ*6(%r15),$G
add $SZ*7(%r15),$H

mov $A,$SZ*0(%r15)
mov $B,$SZ*1(%r15)
mov $C,$SZ*2(%r15)
mov $D,$SZ*3(%r15)
mov $E,$SZ*4(%r15)
mov $F,$SZ*5(%r15)
mov $G,$SZ*6(%r15)
mov $H,$SZ*7(%r15)

cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
je .Ldone_avx2

xor $a1,$a1
mov $B,$a3
mov $F,$a4
xor $C,$a3 # magic
jmp .Lower_avx2
.align 16
.Lower_avx2:
vmovdqu (%r13),$inout
vpinsrq \$0,%r13,$offload,$offload
___
1218	$aesni_cbc_idx=0;
1219	for ($i=0; $i<16; ) {
1220	my $base="+16($Tbl)";
1221	foreach(bodyx_00_15()) { eval; }
1222	&lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1223	}
# Tail of the .Lower_avx2 loop and the common AVX2 exit.
# Writes the pending ciphertext block, walks $Tbl down the stack until it
# passes %rsp, restores %rsp to the frame base (2*$SZ*($rounds-8) above the
# moved stack pointer), updates the hash state and loops to .Loop_avx2
# while input remains. At .Ldone_avx2 the IV pointer and the saved stack
# pointer are read from the frame slots at 16*$SZ+4*8 and 16*$SZ+7*8.
$code.=<<___;
vmovq $offload,%r13 # borrow $a0
vpextrq \$1,$offload,%r15 # borrow $a2
vpand $mask14,$temp,$temp
vpor $temp,$iv,$iv
lea -$PUSH8($Tbl),$Tbl
vmovdqu $iv,(%r15,%r13) # write output
lea 16(%r13),%r13 # inp++
cmp %rsp,$Tbl
jae .Lower_avx2

mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
lea 16*$SZ(%r13),%r13
mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
add $a1,$A
lea `2*$SZ*($rounds-8)`(%rsp),%rsp

add $SZ*0(%r15),$A
add $SZ*1(%r15),$B
add $SZ*2(%r15),$C
add $SZ*3(%r15),$D
add $SZ*4(%r15),$E
add $SZ*5(%r15),$F
add $SZ*6(%r15),$G
lea (%rsi,%r13),%r12
add $SZ*7(%r15),$H

cmp $_end,%r13

mov $A,$SZ*0(%r15)
cmove %rsp,%r12 # next block or stale data
mov $B,$SZ*1(%r15)
mov $C,$SZ*2(%r15)
mov $D,$SZ*3(%r15)
mov $E,$SZ*4(%r15)
mov $F,$SZ*5(%r15)
mov $G,$SZ*6(%r15)
mov $H,$SZ*7(%r15)

jbe .Loop_avx2
lea (%rsp),$Tbl
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8

.Ldone_avx2:
mov 16*$SZ+4*8($Tbl),$ivp
mov 16*$SZ+7*8($Tbl),%rsi
.cfi_def_cfa %rsi,8
vmovdqu $iv,($ivp) # output IV
vzeroall
___
1276	$code.=<<___ if ($win64);
1277	movaps `$framesz+16*0`($Tbl),%xmm6
1278	movaps `$framesz+16*1`($Tbl),%xmm7
1279	movaps `$framesz+16*2`($Tbl),%xmm8
1280	movaps `$framesz+16*3`($Tbl),%xmm9
1281	movaps `$framesz+16*4`($Tbl),%xmm10
1282	movaps `$framesz+16*5`($Tbl),%xmm11
1283	movaps `$framesz+16*6`($Tbl),%xmm12
1284	movaps `$framesz+16*7`($Tbl),%xmm13
1285	movaps `$framesz+16*8`($Tbl),%xmm14
1286	movaps `$framesz+16*9`($Tbl),%xmm15
1287	___
1288	$code.=<<___;
1289	mov -48(%rsi),%r15
1290	.cfi_restore %r15
1291	mov -40(%rsi),%r14
1292	.cfi_restore %r14
1293	mov -32(%rsi),%r13
1294	.cfi_restore %r13
1295	mov -24(%rsi),%r12
1296	.cfi_restore %r12
1297	mov -16(%rsi),%rbp
1298	.cfi_restore %rbp
1299	mov -8(%rsi),%rbx
1300	.cfi_restore %rbx
1301	lea (%rsi),%rsp
1302	.cfi_def_cfa_register %rsp
1303	.Lepilogue_avx2:
1304	ret
1305	.cfi_endproc
1306	.size ${func}_avx2,.-${func}_avx2
1307	___
1308	}}
1309	}}
1310	{{
1311	my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1312
1313	my ($rounds,$Tbl)=("%r11d","%rbx");
1314
1315	my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1316	my @rndkey=("%xmm4","%xmm5");
1317	my $r=0;
1318	my $sn=0;
1319
1320	my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1321	my @MSG=map("%xmm$_",(10..13));
1322
# Emit the next slice of the interleaved AES-CBC encryption.
# $r counts slices emitted so far: $r/10 is the 16-byte block index within
# the current 64-byte chunk and $r%10 the step within that block.
#   step 0    - load the input block, store the previous block's
#               ciphertext (blocks 1..3 only), xor into the IV
#   step 1..8 - one "load next round key / aesenc" pair
#   step 9    - key-length dependent tail ending in aesenclast
# The two round-key registers are swapped after every slice.
my $aesenc=sub {
    use integer;
    my $block = $r/10;
    my $step  = $r%10;

    if ($step == 9) {
	$sn++;
	$code .= <<___;
cmp \$11,$rounds
jb .Laesenclast$sn
movups `32+16*($step+0)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($step+1)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
je .Laesenclast$sn
movups `32+16*($step+2)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($step+3)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
.Laesenclast$sn:
aesenclast $rndkey[0],$iv
movups 16-112($key),$rndkey[1] # forward reference
nop
___
    } else {
	if ($step == 0) {
	    $code .= <<___;
movups `16*$block`($in0),$in # load input
xorps $rndkey0,$in
___
	    if ($block) {
		$code .= <<___;
movups $iv,`16*($block-1)`($out,$in0) # write output
___
	    }
	    $code .= <<___;
xorps $in,$iv
___
	}
	# Steps 0..8 all end with the same key-load/aesenc pair.
	$code .= <<___;
movups `32+16*$step-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
___
    }

    $r++;
    unshift(@rndkey, pop(@rndkey));
};
1366
1367	if ($shaext) {
1368	my $Tbl="%rax";
1369
1370	$code.=<<___;
1371	.type ${func}_shaext,\@function,6
1372	.align 32
1373	${func}_shaext:
1374	.cfi_startproc
1375	mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1376	___
1377	$code.=<<___ if ($win64);
1378	lea `-8-10*16`(%rsp),%rsp
1379	movaps %xmm6,-8-10*16(%rax)
1380	movaps %xmm7,-8-9*16(%rax)
1381	movaps %xmm8,-8-8*16(%rax)
1382	movaps %xmm9,-8-7*16(%rax)
1383	movaps %xmm10,-8-6*16(%rax)
1384	movaps %xmm11,-8-5*16(%rax)
1385	movaps %xmm12,-8-4*16(%rax)
1386	movaps %xmm13,-8-3*16(%rax)
1387	movaps %xmm14,-8-2*16(%rax)
1388	movaps %xmm15,-8-1*16(%rax)
1389	.Lprologue_shaext:
1390	___
1391	$code.=<<___;
1392	lea K256+0x80(%rip),$Tbl
1393	movdqu ($ctx),$ABEF # DCBA
1394	movdqu 16($ctx),$CDGH # HGFE
1395	movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1396
1397	mov 240($key),$rounds
1398	sub $in0,$out
1399	movups ($key),$rndkey0 # $key[0]
1400	movups ($ivp),$iv # load IV
1401	movups 16($key),$rndkey[0] # forward reference
1402	lea 112($key),$key # size optimization
1403
1404	pshufd \$0x1b,$ABEF,$Wi # ABCD
1405	pshufd \$0xb1,$ABEF,$ABEF # CDAB
1406	pshufd \$0x1b,$CDGH,$CDGH # EFGH
1407	movdqa $TMP,$BSWAP # offload
1408	palignr \$8,$CDGH,$ABEF # ABEF
1409	punpcklqdq $Wi,$CDGH # CDGH
1410
1411	jmp .Loop_shaext
1412
1413	.align 16
1414	.Loop_shaext:
1415	movdqu ($inp),@MSG[0]
1416	movdqu 0x10($inp),@MSG[1]
1417	movdqu 0x20($inp),@MSG[2]
1418	pshufb $TMP,@MSG[0]
1419	movdqu 0x30($inp),@MSG[3]
1420
1421	movdqa 0*32-0x80($Tbl),$Wi
1422	paddd @MSG[0],$Wi
1423	pshufb $TMP,@MSG[1]
1424	movdqa $CDGH,$CDGH_SAVE # offload
1425	movdqa $ABEF,$ABEF_SAVE # offload
1426	___
1427	&$aesenc();
1428	$code.=<<___;
1429	sha256rnds2 $ABEF,$CDGH # 0-3
1430	pshufd \$0x0e,$Wi,$Wi
1431	___
1432	&$aesenc();
1433	$code.=<<___;
1434	sha256rnds2 $CDGH,$ABEF
1435
1436	movdqa 1*32-0x80($Tbl),$Wi
1437	paddd @MSG[1],$Wi
1438	pshufb $TMP,@MSG[2]
1439	lea 0x40($inp),$inp
1440	___
1441	&$aesenc();
1442	$code.=<<___;
1443	sha256rnds2 $ABEF,$CDGH # 4-7
1444	pshufd \$0x0e,$Wi,$Wi
1445	___
1446	&$aesenc();
1447	$code.=<<___;
1448	sha256rnds2 $CDGH,$ABEF
1449
1450	movdqa 2*32-0x80($Tbl),$Wi
1451	paddd @MSG[2],$Wi
1452	pshufb $TMP,@MSG[3]
1453	sha256msg1 @MSG[1],@MSG[0]
1454	___
1455	&$aesenc();
1456	$code.=<<___;
1457	sha256rnds2 $ABEF,$CDGH # 8-11
1458	pshufd \$0x0e,$Wi,$Wi
1459	movdqa @MSG[3],$TMP
1460	palignr \$4,@MSG[2],$TMP
1461	paddd $TMP,@MSG[0]
1462	___
1463	&$aesenc();
1464	$code.=<<___;
1465	sha256rnds2 $CDGH,$ABEF
1466
1467	movdqa 3*32-0x80($Tbl),$Wi
1468	paddd @MSG[3],$Wi
1469	sha256msg2 @MSG[3],@MSG[0]
1470	sha256msg1 @MSG[2],@MSG[1]
1471	___
1472	&$aesenc();
1473	$code.=<<___;
1474	sha256rnds2 $ABEF,$CDGH # 12-15
1475	pshufd \$0x0e,$Wi,$Wi
1476	___
1477	&$aesenc();
1478	$code.=<<___;
1479	movdqa @MSG[0],$TMP
1480	palignr \$4,@MSG[3],$TMP
1481	paddd $TMP,@MSG[1]
1482	sha256rnds2 $CDGH,$ABEF
1483	___
1484	for($i=4;$i<16-3;$i++) {
1485	&$aesenc() if (($r%10)==0);
1486	$code.=<<___;
1487	movdqa $i*32-0x80($Tbl),$Wi
1488	paddd @MSG[0],$Wi
1489	sha256msg2 @MSG[0],@MSG[1]
1490	sha256msg1 @MSG[3],@MSG[2]
1491	___
1492	&$aesenc();
1493	$code.=<<___;
1494	sha256rnds2 $ABEF,$CDGH # 16-19...
1495	pshufd \$0x0e,$Wi,$Wi
1496	movdqa @MSG[1],$TMP
1497	palignr \$4,@MSG[0],$TMP
1498	paddd $TMP,@MSG[2]
1499	___
1500	&$aesenc();
1501	&$aesenc() if ($r==19);
1502	$code.=<<___;
1503	sha256rnds2 $CDGH,$ABEF
1504	___
1505	push(@MSG,shift(@MSG));
1506	}
1507	$code.=<<___;
1508	movdqa 13*32-0x80($Tbl),$Wi
1509	paddd @MSG[0],$Wi
1510	sha256msg2 @MSG[0],@MSG[1]
1511	sha256msg1 @MSG[3],@MSG[2]
1512	___
1513	&$aesenc();
1514	$code.=<<___;
1515	sha256rnds2 $ABEF,$CDGH # 52-55
1516	pshufd \$0x0e,$Wi,$Wi
1517	movdqa @MSG[1],$TMP
1518	palignr \$4,@MSG[0],$TMP
1519	paddd $TMP,@MSG[2]
1520	___
1521	&$aesenc();
1522	&$aesenc();
1523	$code.=<<___;
1524	sha256rnds2 $CDGH,$ABEF
1525
1526	movdqa 14*32-0x80($Tbl),$Wi
1527	paddd @MSG[1],$Wi
1528	sha256msg2 @MSG[1],@MSG[2]
1529	movdqa $BSWAP,$TMP
1530	___
1531	&$aesenc();
1532	$code.=<<___;
1533	sha256rnds2 $ABEF,$CDGH # 56-59
1534	pshufd \$0x0e,$Wi,$Wi
1535	___
1536	&$aesenc();
1537	$code.=<<___;
1538	sha256rnds2 $CDGH,$ABEF
1539
1540	movdqa 15*32-0x80($Tbl),$Wi
1541	paddd @MSG[2],$Wi
1542	___
1543	&$aesenc();
1544	&$aesenc();
1545	$code.=<<___;
1546	sha256rnds2 $ABEF,$CDGH # 60-63
1547	pshufd \$0x0e,$Wi,$Wi
1548	___
1549	&$aesenc();
1550	$code.=<<___;
1551	sha256rnds2 $CDGH,$ABEF
1552	#pxor $CDGH,$rndkey0 # black magic
1553	___
1554	while ($r<40) { &$aesenc(); } # remaining aesenc's
1555	$code.=<<___;
1556	#xorps $CDGH,$rndkey0 # black magic
1557	paddd $CDGH_SAVE,$CDGH
1558	paddd $ABEF_SAVE,$ABEF
1559
1560	dec $len
1561	movups $iv,48($out,$in0) # write output
1562	lea 64($in0),$in0
1563	jnz .Loop_shaext
1564
1565	pshufd \$0xb1,$CDGH,$CDGH # DCHG
1566	pshufd \$0x1b,$ABEF,$TMP # FEBA
1567	pshufd \$0xb1,$ABEF,$ABEF # BAFE
1568	punpckhqdq $CDGH,$ABEF # DCBA
1569	palignr \$8,$TMP,$CDGH # HGFE
1570
1571	movups $iv,($ivp) # write IV
1572	movdqu $ABEF,($ctx)
1573	movdqu $CDGH,16($ctx)
1574	___
1575	$code.=<<___ if ($win64);
1576	movaps 0*16(%rsp),%xmm6
1577	movaps 1*16(%rsp),%xmm7
1578	movaps 2*16(%rsp),%xmm8
1579	movaps 3*16(%rsp),%xmm9
1580	movaps 4*16(%rsp),%xmm10
1581	movaps 5*16(%rsp),%xmm11
1582	movaps 6*16(%rsp),%xmm12
1583	movaps 7*16(%rsp),%xmm13
1584	movaps 8*16(%rsp),%xmm14
1585	movaps 9*16(%rsp),%xmm15
1586	lea 8+10*16(%rsp),%rsp
1587	.Lepilogue_shaext:
1588	___
1589	$code.=<<___;
1590	ret
1591	.cfi_endproc
1592	.size ${func}_shaext,.-${func}_shaext
1593	___
1594	}
1595	}}}}}
1596
1597	# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1598	# CONTEXT context,DISPATCHER_CONTEXT disp)
1599	if ($win64 && $avx) {
1600	$rec="%rcx";
1601	$frame="%rdx";
1602	$context="%r8";
1603	$disp="%r9";
1604
1605	$code.=<<___;
1606	.extern __imp_RtlVirtualUnwind
1607	.type se_handler,\@abi-omnipotent
1608	.align 16
1609	se_handler:
1610	push %rsi
1611	push %rdi
1612	push %rbx
1613	push %rbp
1614	push %r12
1615	push %r13
1616	push %r14
1617	push %r15
1618	pushfq
1619	sub \$64,%rsp
1620
1621	mov 120($context),%rax # pull context->Rax
1622	mov 248($context),%rbx # pull context->Rip
1623
1624	mov 8($disp),%rsi # disp->ImageBase
1625	mov 56($disp),%r11 # disp->HandlerData
1626
1627	mov 0(%r11),%r10d # HandlerData[0]
1628	lea (%rsi,%r10),%r10 # prologue label
1629	cmp %r10,%rbx # context->Rip<prologue label
1630	jb .Lin_prologue
1631
1632	mov 152($context),%rax # pull context->Rsp
1633
1634	mov 4(%r11),%r10d # HandlerData[1]
1635	lea (%rsi,%r10),%r10 # epilogue label
1636	cmp %r10,%rbx # context->Rip>=epilogue label
1637	jae .Lin_prologue
1638	___
1639	$code.=<<___ if ($shaext);
1640	lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1641	cmp %r10,%rbx
1642	jb .Lnot_in_shaext
1643
1644	lea (%rax),%rsi
1645	lea 512($context),%rdi # &context.Xmm6
1646	mov \$20,%ecx
1647	.long 0xa548f3fc # cld; rep movsq
1648	lea 168(%rax),%rax # adjust stack pointer
1649	jmp .Lin_prologue
1650	.Lnot_in_shaext:
1651	___
# Faults at or beyond .Lavx2_shortcut: the AVX2 body moves %rsp while it
# runs, so the handler re-derives the frame base from context->Rsp (held in
# %rax) by masking it down and adding the fixed 2*$SZ*($rounds-8) offset.
$code.=<<___ if ($avx>1);
lea .Lavx2_shortcut(%rip),%r10
cmp %r10,%rbx # context->Rip<avx2_shortcut
jb .Lnot_in_avx2

and \$-256*$SZ,%rax
add \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
# Common tail of se_handler plus the first .pdata entries.
# Reads the saved stack pointer from the frame slot at 16*$SZ+7*8, copies
# the callee-saved GPRs found below it into the CONTEXT record, copies the
# XMM save area at 16*$SZ+8*8 into context->Xmm6.., then hands over to
# RtlVirtualUnwind and returns ExceptionContinueSearch.
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+7*8(%rax),%rax # pull $_rsp

mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15

lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq

.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi

mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq

mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)

mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler

.section .pdata
.rva .LSEH_begin_${func}_xop
.rva .LSEH_end_${func}_xop
.rva .LSEH_info_${func}_xop

.rva .LSEH_begin_${func}_avx
.rva .LSEH_end_${func}_avx
.rva .LSEH_info_${func}_avx
___
1732	$code.=<<___ if ($avx>1);
1733	.rva .LSEH_begin_${func}_avx2
1734	.rva .LSEH_end_${func}_avx2
1735	.rva .LSEH_info_${func}_avx2
1736	___
1737	$code.=<<___ if ($shaext);
1738	.rva .LSEH_begin_${func}_shaext
1739	.rva .LSEH_end_${func}_shaext
1740	.rva .LSEH_info_${func}_shaext
1741	___
1742	$code.=<<___;
1743	.section .xdata
1744	.align 8
1745	.LSEH_info_${func}_xop:
1746	.byte 9,0,0,0
1747	.rva se_handler
1748	.rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1749
1750	.LSEH_info_${func}_avx:
1751	.byte 9,0,0,0
1752	.rva se_handler
1753	.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1754	___
1755	$code.=<<___ if ($avx>1);
1756	.LSEH_info_${func}_avx2:
1757	.byte 9,0,0,0
1758	.rva se_handler
1759	.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1760	___
1761	$code.=<<___ if ($shaext);
1762	.LSEH_info_${func}_shaext:
1763	.byte 9,0,0,0
1764	.rva se_handler
1765	.rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1766	___
1767	}
1768
1769	####################################################################
# Prepend a REX prefix byte to an opcode byte list when either register
# number is 8 or higher; leave the list untouched otherwise.
#
#   $opcode - reference to the array of opcode bytes, modified in place
#   $dst    - register number; >= 8 sets bit 0x04 of the prefix
#   $src    - register number; >= 8 sets bit 0x01 of the prefix
#
# The prefix, when emitted, is 0x40 OR-ed with the bits above.
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if ($dst >= 8);
    $rex |= 0x01 if ($src >= 8);
    unshift @$opcode, $rex|0x40 if ($rex);
}
1779
{
    # Third opcode byte (after 0x0f,0x38) for each supported mnemonic.
    my %opcodelet = (
	"sha256rnds2" => 0xcb,
	"sha256msg1"  => 0xcc,
	"sha256msg2"  => 0xcd	);

    # Translate one SHA-256 extension instruction into a raw ".byte"
    # sequence so the output does not depend on assembler support.
    #
    #   $instr - mnemonic as matched in the generated code
    #   $_[0]  - operand text, expected as "%xmmN,%xmmM"
    #
    # Returns ".byte\t<comma-separated bytes>" for a known mnemonic with
    # two XMM register operands; otherwise returns the instruction text
    # unchanged ("<mnemonic>\t<operands>").
    sub sha256op38 {
	my $instr = shift;

	if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	    # Copy the captures before calling anything else.
	    my ($src, $dst) = ($1, $2);
	    my @opcode = (0x0f, 0x38);
	    rex(\@opcode, $dst, $src);
	    push @opcode, $opcodelet{$instr};
	    push @opcode, 0xc0|($src&7)|(($dst&7)<<3);		# ModR/M
	    return ".byte\t".join(',',@opcode);
	} else {
	    return $instr."\t".$_[0];
	}
    }
}
1800
# Post-process and emit the generated assembly:
#  1. evaluate every `...` span as a Perl expression (constant folding);
#  2. hand each sha256* instruction - full mnemonic plus the whole operand
#     text to the end of the line - to sha256op38(). The "*" quantifiers
#     are required: without them the pattern can only capture "sha256" plus
#     one character and a single operand character, so nothing would ever
#     be encoded;
#  3. print the result and fail loudly if STDOUT cannot be flushed.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/aes/asm/aesni-sha256-x86_64.pl@ 102427

Download in other formats: