VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1k/crypto/aes/asm/aesni-sha256-x86_64.pl@90293

Last change on this file since 90293 was 90293, checked in by vboxsync, 4 years ago

openssl-1.1.1k: Applied and adjusted our OpenSSL changes to 1.1.1k. bugref:10072

File size: 43.3 KB
1#! /usr/bin/env perl
2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# January 2013
18#
19# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
21# that since AESNI-CBC encryption exhibits *very* low instruction-level
22# parallelism, interleaving it with another algorithm allows processor
23# resources to be utilized more fully, improving overall performance.
24# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25# the AESNI code is woven into them. As SHA256 dominates execution time,
26# stitch performance does not depend on AES key length. Below are
27# performance numbers in cycles per processed byte (less is better) for
28# standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
29# subroutine:
30#
31# AES-128/-192/-256+SHA256 this(**) gain
32# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
33# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
34# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
35# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
36# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
37# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54%
38# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60%
39#
40# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
41# Westmere is omitted from the loop, because the gain was not
42# estimated to be high enough to justify the effort;
43# (**) these are EVP-free results; results obtained with 'speed
44# -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
45# (***) these are SHAEXT results;
46
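# A standalone sketch (illustration only, not part of the generator) of how
# the gain column is derived: back-to-back execution costs the sum of the
# two cycles-per-byte figures, so e.g. the Sandy Bridge row works out to
# (5.05+11.6)/13.0 - 1, i.e. the quoted +28%.
for my $aes_cpb (5.05, 6.05, 7.05) {
	printf STDERR "gain: +%.0f%%\n", (($aes_cpb + 11.6)/13.0 - 1)*100;	# 28/36/43
}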
47$flavour = shift;
48$output = shift;
49if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
50
51$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
52
53$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
55( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
56die "can't locate x86_64-xlate.pl";
57
58if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
59 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
60 $avx = ($1>=2.19) + ($1>=2.22);
61}
62
63if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65 $avx = ($1>=2.09) + ($1>=2.10);
66}
67
68if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
70 $avx = ($1>=10) + ($1>=12);
71}
72
73if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
74 $avx = ($2>=3.0) + ($2>3.0);
75}
76
77$shaext=$avx; ### set to zero if compiling for 1.0.1
78$avx=1 if (!$shaext && $avx);
79
80open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
81*STDOUT=*OUT;
82
83$func="aesni_cbc_sha256_enc";
84$TABLE="K256";
85$SZ=4;
86@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
87 "%r8d","%r9d","%r10d","%r11d");
88($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
89@Sigma0=( 2,13,22);
90@Sigma1=( 6,11,25);
91@sigma0=( 7,18, 3);
92@sigma1=(17,19,10);
93$rounds=64;
94
95########################################################################
96# void aesni_cbc_sha256_enc(const void *inp,
97# void *out,
98# size_t length,
99# const AES_KEY *key,
100# unsigned char *iv,
101# SHA256_CTX *ctx,
102# const void *in0);
103($inp, $out, $len, $key, $ivp, $ctx, $in0) =
104("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
105
106$Tbl="%rbp";
107
108$_inp="16*$SZ+0*8(%rsp)";
109$_out="16*$SZ+1*8(%rsp)";
110$_end="16*$SZ+2*8(%rsp)";
111$_key="16*$SZ+3*8(%rsp)";
112$_ivp="16*$SZ+4*8(%rsp)";
113$_ctx="16*$SZ+5*8(%rsp)";
114$_in0="16*$SZ+6*8(%rsp)";
115$_rsp="`16*$SZ+7*8`(%rsp)";
116$framesz=16*$SZ+8*8;
117
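# A standalone sketch (illustration only) of the frame described by the
# offsets above: 16 message-schedule words first, then seven saved 8-byte
# pointer slots and the saved %rsp.
{
	my ($sz, @slots) = (4, qw(inp out end key ivp ctx in0 rsp));
	printf STDERR "X[0..15] at 0..%d\n", 16*$sz - 1;
	printf STDERR "%-3s at %d\n", $slots[$_], 16*$sz + 8*$_ for 0 .. $#slots;
	printf STDERR "framesz = %d bytes\n", 16*$sz + 8*8;
}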
118$code=<<___;
119.text
120
121.extern OPENSSL_ia32cap_P
122.globl $func
123.type $func,\@abi-omnipotent
124.align 16
125$func:
126.cfi_startproc
127___
128 if ($avx) {
129$code.=<<___;
130 lea OPENSSL_ia32cap_P(%rip),%r11
131 mov \$1,%eax
132 cmp \$0,`$win64?"%rcx":"%rdi"`
133 je .Lprobe
134 mov 0(%r11),%eax
135 mov 4(%r11),%r10
136___
137$code.=<<___ if ($shaext);
138 bt \$61,%r10 # check for SHA
139 jc ${func}_shaext
140___
141$code.=<<___;
142 mov %r10,%r11
143 shr \$32,%r11
144
145 test \$`1<<11`,%r10d # check for XOP
146 jnz ${func}_xop
147___
148$code.=<<___ if ($avx>1);
149 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
150 cmp \$`1<<8|1<<5|1<<3`,%r11d
151 je ${func}_avx2
152___
153$code.=<<___;
154 and \$`1<<28`,%r10d # check for AVX
155 jnz ${func}_avx
156 ud2
157___
158 }
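# A standalone sketch (illustration only) of the dispatch order emitted
# above, following the OPENSSL_ia32cap_P layout the tests rely on: word 1
# bit 11 = XOP, bit 28 = AVX; word 2 bits 3/5/8 = BMI1/AVX2/BMI2 and
# bit 29 = SHA extensions (bit 61 of the qword loaded from offset 4).
sub demo_pick_path {
	my ($w1, $w2) = @_;
	return "shaext" if $w2 & (1<<29);
	return "xop"    if $w1 & (1<<11);
	return "avx2"   if ($w2 & 0x128) == 0x128;	# 1<<8|1<<5|1<<3
	return "avx"    if $w1 & (1<<28);
	return "ud2";	# no SIMD path compiled in
}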
159$code.=<<___;
160 xor %eax,%eax
161 cmp \$0,`$win64?"%rcx":"%rdi"`
162 je .Lprobe
163 ud2
164.Lprobe:
165 ret
166.cfi_endproc
167.size $func,.-$func
168
169.align 64
170.type $TABLE,\@object
171$TABLE:
172 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
173 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
174 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
175 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
176 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
177 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
178 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
179 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
180 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
181 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
182 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
183 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
184 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
185 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
186 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
187 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
188 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
189 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
190 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
191 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
192 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
193 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
194 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
195 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
196 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
197 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
198 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
199 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
200 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
201 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
202 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
203 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
204
205 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
206 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
207 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
208 .long 0,0,0,0, 0,0,0,0
209 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
210.align 64
211___
212
213######################################################################
214# SIMD code paths
215#
216{{{
217($iv,$inout,$roundkey,$temp,
218 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
219
220$aesni_cbc_idx=0;
221@aesni_cbc_block = (
222## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
223## &vmovdqu ($inout,($inp));
224## &mov ($_inp,$inp);
225
226 '&vpxor ($inout,$inout,$roundkey);'.
227 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
228
229 '&vpxor ($inout,$inout,$iv);',
230
231 '&vaesenc ($inout,$inout,$roundkey);'.
232 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
233
234 '&vaesenc ($inout,$inout,$roundkey);'.
235 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
236
237 '&vaesenc ($inout,$inout,$roundkey);'.
238 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
239
240 '&vaesenc ($inout,$inout,$roundkey);'.
241 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
242
243 '&vaesenc ($inout,$inout,$roundkey);'.
244 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
245
246 '&vaesenc ($inout,$inout,$roundkey);'.
247 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
248
249 '&vaesenc ($inout,$inout,$roundkey);'.
250 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
251
252 '&vaesenc ($inout,$inout,$roundkey);'.
253 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
254
255 '&vaesenc ($inout,$inout,$roundkey);'.
256 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
257
258 '&vaesenclast ($temp,$inout,$roundkey);'.
259 ' &vaesenc ($inout,$inout,$roundkey);'.
260 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
261
262 '&vpand ($iv,$temp,$mask10);'.
263 ' &vaesenc ($inout,$inout,$roundkey);'.
264 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
265
266 '&vaesenclast ($temp,$inout,$roundkey);'.
267 ' &vaesenc ($inout,$inout,$roundkey);'.
268 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
269
270 '&vpand ($temp,$temp,$mask12);'.
271 ' &vaesenc ($inout,$inout,$roundkey);'.
272 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
273
274 '&vpor ($iv,$iv,$temp);'.
275 ' &vaesenclast ($temp,$inout,$roundkey);'.
276 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
277
278## &mov ($inp,$_inp);
279## &mov ($out,$_out);
280## &vpand ($temp,$temp,$mask14);
281## &vpor ($iv,$iv,$temp);
282## &vmovdqu ("($out,$inp)",$iv);
283## &lea ($inp,"16($inp)");
284);
285
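# A standalone sketch (illustration only): the sixteen snippets above carry
# exactly one CBC block through its xor/aesenc/aesenclast steps, and the
# round bodies below splice in one snippet per SHA-256 round, so every
# 16-round chunk of the hash completes one AES block.
my @demo_steps = map { "cbc step $_" } 1 .. 16;	# stand-ins for the list above
my $demo_idx = 0;
print STDERR "SHA round $_ <-> $demo_steps[$demo_idx++]\n" for 0 .. 15;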
286my $a4=$T1;
287my ($a,$b,$c,$d,$e,$f,$g,$h);
288
289sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
290{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
291 my $arg = pop;
292 $arg = "\$$arg" if ($arg*1 eq $arg);
293 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
294}
295
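# A standalone sketch (illustration only; the demo package name is made up)
# of what the AUTOLOAD thunk above does: any call to an undefined sub is
# intercepted and serialized as one AT&T-syntax instruction, with a numeric
# last argument becoming an immediate.
{
	package AutoloadDemo;
	our ($AUTOLOAD, $code);
	sub AUTOLOAD
	{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
	  my $arg = pop;
	  $arg = "\$$arg" if ($arg*1 eq $arg);
	  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
	}
	AutoloadDemo::ror("%r13d", 14);		# appends "\tror\t\$14,%r13d\n"
	print STDERR $code;
}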
296sub body_00_15 () {
297 (
298 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
299
300 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
301 '&mov ($a,$a1)',
302 '&mov ($a4,$f)',
303
304 '&xor ($a0,$e)',
305 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
306 '&xor ($a4,$g)', # f^g
307
308 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
309 '&xor ($a1,$a)',
310 '&and ($a4,$e)', # (f^g)&e
311
312 @aesni_cbc_block[$aesni_cbc_idx++].
313 '&xor ($a0,$e)',
314 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
315 '&mov ($a2,$a)',
316
317 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
318 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
319 '&xor ($a2,$b)', # a^b, b^c in next round
320
321 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
322 '&add ($h,$a4)', # h+=Ch(e,f,g)
323 '&and ($a3,$a2)', # (b^c)&(a^b)
324
325 '&xor ($a1,$a)',
326 '&add ($h,$a0)', # h+=Sigma1(e)
327 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
328
329 '&add ($d,$h)', # d+=h
330 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
331 '&add ($h,$a3)', # h+=Maj(a,b,c)
332
333 '&mov ($a0,$d)',
334 '&add ($a1,$h);'. # h+=Sigma0(a)
335 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
336 );
337}
338
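# A pure-Perl reference (standalone, illustration only) for what one round
# body computes, with the @Sigma0/@Sigma1 rotation counts defined above;
# the assembly merely staggers these operations in small groups so the AES
# snippets can be spliced in between.
sub demo_ror32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }
sub demo_round {
	my ($a,$b,$c,$d,$e,$f,$g,$h,$Kt,$Wt) = @_;
	my $S1  = demo_ror32($e,6) ^ demo_ror32($e,11) ^ demo_ror32($e,25);
	my $ch  = ($e & $f) ^ ((~$e) & $g);		# Ch(e,f,g) = ((f^g)&e)^g
	my $T1  = ($h + $S1 + $ch + $Kt + $Wt) & 0xffffffff;
	my $S0  = demo_ror32($a,2) ^ demo_ror32($a,13) ^ demo_ror32($a,22);
	my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);	# Maj(a,b,c)
	return ((($T1 + $S0 + $maj) & 0xffffffff),	# new a = T1+T2
		$a, $b, $c, (($d + $T1) & 0xffffffff),	# new e = d+T1
		$e, $f, $g);
}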
339if ($avx) {{
340######################################################################
341# XOP code path
342#
343$code.=<<___;
344.type ${func}_xop,\@function,6
345.align 64
346${func}_xop:
347.cfi_startproc
348.Lxop_shortcut:
349 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
350 mov %rsp,%rax # copy %rsp
351.cfi_def_cfa_register %rax
352 push %rbx
353.cfi_push %rbx
354 push %rbp
355.cfi_push %rbp
356 push %r12
357.cfi_push %r12
358 push %r13
359.cfi_push %r13
360 push %r14
361.cfi_push %r14
362 push %r15
363.cfi_push %r15
364 sub \$`$framesz+$win64*16*10`,%rsp
365 and \$-64,%rsp # align stack frame
366
367 shl \$6,$len
368 sub $inp,$out # re-bias
369 sub $inp,$in0
370 add $inp,$len # end of input
371
372 #mov $inp,$_inp # saved later
373 mov $out,$_out
374 mov $len,$_end
375 #mov $key,$_key # remains resident in $inp register
376 mov $ivp,$_ivp
377 mov $ctx,$_ctx
378 mov $in0,$_in0
379 mov %rax,$_rsp
380.cfi_cfa_expression $_rsp,deref,+8
381___
382$code.=<<___ if ($win64);
383 movaps %xmm6,`$framesz+16*0`(%rsp)
384 movaps %xmm7,`$framesz+16*1`(%rsp)
385 movaps %xmm8,`$framesz+16*2`(%rsp)
386 movaps %xmm9,`$framesz+16*3`(%rsp)
387 movaps %xmm10,`$framesz+16*4`(%rsp)
388 movaps %xmm11,`$framesz+16*5`(%rsp)
389 movaps %xmm12,`$framesz+16*6`(%rsp)
390 movaps %xmm13,`$framesz+16*7`(%rsp)
391 movaps %xmm14,`$framesz+16*8`(%rsp)
392 movaps %xmm15,`$framesz+16*9`(%rsp)
393___
394$code.=<<___;
395.Lprologue_xop:
396 vzeroall
397
398 mov $inp,%r12 # borrow $a4
399 lea 0x80($key),$inp # size optimization, reassign
400 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
401 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
402 mov $ctx,%r15 # borrow $a2
403 mov $in0,%rsi # borrow $a3
404 vmovdqu ($ivp),$iv # load IV
405 sub \$9,%r14
406
407 mov $SZ*0(%r15),$A
408 mov $SZ*1(%r15),$B
409 mov $SZ*2(%r15),$C
410 mov $SZ*3(%r15),$D
411 mov $SZ*4(%r15),$E
412 mov $SZ*5(%r15),$F
413 mov $SZ*6(%r15),$G
414 mov $SZ*7(%r15),$H
415
416 vmovdqa 0x00(%r13,%r14,8),$mask14
417 vmovdqa 0x10(%r13,%r14,8),$mask12
418 vmovdqa 0x20(%r13,%r14,8),$mask10
419 vmovdqu 0x00-0x80($inp),$roundkey
420 jmp .Lloop_xop
421___
422 if ($SZ==4) { # SHA256
423 my @X = map("%xmm$_",(0..3));
424 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
425
426$code.=<<___;
427.align 16
428.Lloop_xop:
429 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
430 vmovdqu 0x00(%rsi,%r12),@X[0]
431 vmovdqu 0x10(%rsi,%r12),@X[1]
432 vmovdqu 0x20(%rsi,%r12),@X[2]
433 vmovdqu 0x30(%rsi,%r12),@X[3]
434 vpshufb $t3,@X[0],@X[0]
435 lea $TABLE(%rip),$Tbl
436 vpshufb $t3,@X[1],@X[1]
437 vpshufb $t3,@X[2],@X[2]
438 vpaddd 0x00($Tbl),@X[0],$t0
439 vpshufb $t3,@X[3],@X[3]
440 vpaddd 0x20($Tbl),@X[1],$t1
441 vpaddd 0x40($Tbl),@X[2],$t2
442 vpaddd 0x60($Tbl),@X[3],$t3
443 vmovdqa $t0,0x00(%rsp)
444 mov $A,$a1
445 vmovdqa $t1,0x10(%rsp)
446 mov $B,$a3
447 vmovdqa $t2,0x20(%rsp)
448 xor $C,$a3 # magic
449 vmovdqa $t3,0x30(%rsp)
450 mov $E,$a0
451 jmp .Lxop_00_47
452
453.align 16
454.Lxop_00_47:
455 sub \$-16*2*$SZ,$Tbl # size optimization
456 vmovdqu (%r12),$inout # $a4
457 mov %r12,$_inp # $a4
458___
459sub XOP_256_00_47 () {
460my $j = shift;
461my $body = shift;
462my @X = @_;
463my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
464
465 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
466 eval(shift(@insns));
467 eval(shift(@insns));
468 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
469 eval(shift(@insns));
470 eval(shift(@insns));
471 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
472 eval(shift(@insns));
473 eval(shift(@insns));
474 &vpsrld ($t0,$t0,$sigma0[2]);
475 eval(shift(@insns));
476 eval(shift(@insns));
477 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
478 eval(shift(@insns));
479 eval(shift(@insns));
480 eval(shift(@insns));
481 eval(shift(@insns));
482 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
483 eval(shift(@insns));
484 eval(shift(@insns));
485 &vpxor ($t0,$t0,$t1);
486 eval(shift(@insns));
487 eval(shift(@insns));
488 eval(shift(@insns));
489 eval(shift(@insns));
490 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
491 eval(shift(@insns));
492 eval(shift(@insns));
493 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
494 eval(shift(@insns));
495 eval(shift(@insns));
496 &vpsrld ($t2,@X[3],$sigma1[2]);
497 eval(shift(@insns));
498 eval(shift(@insns));
499 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
500 eval(shift(@insns));
501 eval(shift(@insns));
502 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
503 eval(shift(@insns));
504 eval(shift(@insns));
505 &vpxor ($t3,$t3,$t2);
506 eval(shift(@insns));
507 eval(shift(@insns));
508 eval(shift(@insns));
509 eval(shift(@insns));
510 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
511 eval(shift(@insns));
512 eval(shift(@insns));
513 eval(shift(@insns));
514 eval(shift(@insns));
515 &vpsrldq ($t3,$t3,8);
516 eval(shift(@insns));
517 eval(shift(@insns));
518 eval(shift(@insns));
519 eval(shift(@insns));
520 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
521 eval(shift(@insns));
522 eval(shift(@insns));
523 eval(shift(@insns));
524 eval(shift(@insns));
525 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
526 eval(shift(@insns));
527 eval(shift(@insns));
528 &vpsrld ($t2,@X[0],$sigma1[2]);
529 eval(shift(@insns));
530 eval(shift(@insns));
531 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
532 eval(shift(@insns));
533 eval(shift(@insns));
534 &vpxor ($t3,$t3,$t2);
535 eval(shift(@insns));
536 eval(shift(@insns));
537 eval(shift(@insns));
538 eval(shift(@insns));
539 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
540 eval(shift(@insns));
541 eval(shift(@insns));
542 eval(shift(@insns));
543 eval(shift(@insns));
544 &vpslldq ($t3,$t3,8); # 22 instructions
545 eval(shift(@insns));
546 eval(shift(@insns));
547 eval(shift(@insns));
548 eval(shift(@insns));
549 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
550 eval(shift(@insns));
551 eval(shift(@insns));
552 eval(shift(@insns));
553 eval(shift(@insns));
554 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
555 foreach (@insns) { eval; } # remaining instructions
556 &vmovdqa (16*$j."(%rsp)",$t2);
557}
558
559 $aesni_cbc_idx=0;
560 for ($i=0,$j=0; $j<4; $j++) {
561 &XOP_256_00_47($j,\&body_00_15,@X);
562 push(@X,shift(@X)); # rotate(@X)
563 }
564 &mov ("%r12",$_inp); # borrow $a4
565 &vpand ($temp,$temp,$mask14);
566 &mov ("%r15",$_out); # borrow $a2
567 &vpor ($iv,$iv,$temp);
568 &vmovdqu ("(%r15,%r12)",$iv); # write output
569 &lea ("%r12","16(%r12)"); # inp++
570
571 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
572 &jne (".Lxop_00_47");
573
574 &vmovdqu ($inout,"(%r12)");
575 &mov ($_inp,"%r12");
576
577 $aesni_cbc_idx=0;
578 for ($i=0; $i<16; ) {
579 foreach(body_00_15()) { eval; }
580 }
581 }
582$code.=<<___;
583 mov $_inp,%r12 # borrow $a4
584 mov $_out,%r13 # borrow $a0
585 mov $_ctx,%r15 # borrow $a2
586 mov $_in0,%rsi # borrow $a3
587
588 vpand $mask14,$temp,$temp
589 mov $a1,$A
590 vpor $temp,$iv,$iv
591 vmovdqu $iv,(%r13,%r12) # write output
592 lea 16(%r12),%r12 # inp++
593
594 add $SZ*0(%r15),$A
595 add $SZ*1(%r15),$B
596 add $SZ*2(%r15),$C
597 add $SZ*3(%r15),$D
598 add $SZ*4(%r15),$E
599 add $SZ*5(%r15),$F
600 add $SZ*6(%r15),$G
601 add $SZ*7(%r15),$H
602
603 cmp $_end,%r12
604
605 mov $A,$SZ*0(%r15)
606 mov $B,$SZ*1(%r15)
607 mov $C,$SZ*2(%r15)
608 mov $D,$SZ*3(%r15)
609 mov $E,$SZ*4(%r15)
610 mov $F,$SZ*5(%r15)
611 mov $G,$SZ*6(%r15)
612 mov $H,$SZ*7(%r15)
613
614 jb .Lloop_xop
615
616 mov $_ivp,$ivp
617 mov $_rsp,%rsi
618.cfi_def_cfa %rsi,8
619 vmovdqu $iv,($ivp) # output IV
620 vzeroall
621___
622$code.=<<___ if ($win64);
623 movaps `$framesz+16*0`(%rsp),%xmm6
624 movaps `$framesz+16*1`(%rsp),%xmm7
625 movaps `$framesz+16*2`(%rsp),%xmm8
626 movaps `$framesz+16*3`(%rsp),%xmm9
627 movaps `$framesz+16*4`(%rsp),%xmm10
628 movaps `$framesz+16*5`(%rsp),%xmm11
629 movaps `$framesz+16*6`(%rsp),%xmm12
630 movaps `$framesz+16*7`(%rsp),%xmm13
631 movaps `$framesz+16*8`(%rsp),%xmm14
632 movaps `$framesz+16*9`(%rsp),%xmm15
633___
634$code.=<<___;
635 mov -48(%rsi),%r15
636.cfi_restore %r15
637 mov -40(%rsi),%r14
638.cfi_restore %r14
639 mov -32(%rsi),%r13
640.cfi_restore %r13
641 mov -24(%rsi),%r12
642.cfi_restore %r12
643 mov -16(%rsi),%rbp
644.cfi_restore %rbp
645 mov -8(%rsi),%rbx
646.cfi_restore %rbx
647 lea (%rsi),%rsp
648.cfi_def_cfa_register %rsp
649.Lepilogue_xop:
650 ret
651.cfi_endproc
652.size ${func}_xop,.-${func}_xop
653___
654######################################################################
655# AVX+shrd code path
656#
657local *ror = sub { &shrd(@_[0],@_) };
658
659$code.=<<___;
660.type ${func}_avx,\@function,6
661.align 64
662${func}_avx:
663.cfi_startproc
664.Lavx_shortcut:
665 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
666 mov %rsp,%rax # copy %rsp
667.cfi_def_cfa_register %rax
668 push %rbx
669.cfi_push %rbx
670 push %rbp
671.cfi_push %rbp
672 push %r12
673.cfi_push %r12
674 push %r13
675.cfi_push %r13
676 push %r14
677.cfi_push %r14
678 push %r15
679.cfi_push %r15
680 sub \$`$framesz+$win64*16*10`,%rsp
681 and \$-64,%rsp # align stack frame
682
683 shl \$6,$len
684 sub $inp,$out # re-bias
685 sub $inp,$in0
686 add $inp,$len # end of input
687
688 #mov $inp,$_inp # saved later
689 mov $out,$_out
690 mov $len,$_end
691 #mov $key,$_key # remains resident in $inp register
692 mov $ivp,$_ivp
693 mov $ctx,$_ctx
694 mov $in0,$_in0
695 mov %rax,$_rsp
696.cfi_cfa_expression $_rsp,deref,+8
697___
698$code.=<<___ if ($win64);
699 movaps %xmm6,`$framesz+16*0`(%rsp)
700 movaps %xmm7,`$framesz+16*1`(%rsp)
701 movaps %xmm8,`$framesz+16*2`(%rsp)
702 movaps %xmm9,`$framesz+16*3`(%rsp)
703 movaps %xmm10,`$framesz+16*4`(%rsp)
704 movaps %xmm11,`$framesz+16*5`(%rsp)
705 movaps %xmm12,`$framesz+16*6`(%rsp)
706 movaps %xmm13,`$framesz+16*7`(%rsp)
707 movaps %xmm14,`$framesz+16*8`(%rsp)
708 movaps %xmm15,`$framesz+16*9`(%rsp)
709___
710$code.=<<___;
711.Lprologue_avx:
712 vzeroall
713
714 mov $inp,%r12 # borrow $a4
715 lea 0x80($key),$inp # size optimization, reassign
716 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
717 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
718 mov $ctx,%r15 # borrow $a2
719 mov $in0,%rsi # borrow $a3
720 vmovdqu ($ivp),$iv # load IV
721 sub \$9,%r14
722
723 mov $SZ*0(%r15),$A
724 mov $SZ*1(%r15),$B
725 mov $SZ*2(%r15),$C
726 mov $SZ*3(%r15),$D
727 mov $SZ*4(%r15),$E
728 mov $SZ*5(%r15),$F
729 mov $SZ*6(%r15),$G
730 mov $SZ*7(%r15),$H
731
732 vmovdqa 0x00(%r13,%r14,8),$mask14
733 vmovdqa 0x10(%r13,%r14,8),$mask12
734 vmovdqa 0x20(%r13,%r14,8),$mask10
735 vmovdqu 0x00-0x80($inp),$roundkey
736___
737 if ($SZ==4) { # SHA256
738 my @X = map("%xmm$_",(0..3));
739 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
740
741$code.=<<___;
742 jmp .Lloop_avx
743.align 16
744.Lloop_avx:
745 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
746 vmovdqu 0x00(%rsi,%r12),@X[0]
747 vmovdqu 0x10(%rsi,%r12),@X[1]
748 vmovdqu 0x20(%rsi,%r12),@X[2]
749 vmovdqu 0x30(%rsi,%r12),@X[3]
750 vpshufb $t3,@X[0],@X[0]
751 lea $TABLE(%rip),$Tbl
752 vpshufb $t3,@X[1],@X[1]
753 vpshufb $t3,@X[2],@X[2]
754 vpaddd 0x00($Tbl),@X[0],$t0
755 vpshufb $t3,@X[3],@X[3]
756 vpaddd 0x20($Tbl),@X[1],$t1
757 vpaddd 0x40($Tbl),@X[2],$t2
758 vpaddd 0x60($Tbl),@X[3],$t3
759 vmovdqa $t0,0x00(%rsp)
760 mov $A,$a1
761 vmovdqa $t1,0x10(%rsp)
762 mov $B,$a3
763 vmovdqa $t2,0x20(%rsp)
764 xor $C,$a3 # magic
765 vmovdqa $t3,0x30(%rsp)
766 mov $E,$a0
767 jmp .Lavx_00_47
768
769.align 16
770.Lavx_00_47:
771 sub \$-16*2*$SZ,$Tbl # size optimization
772 vmovdqu (%r12),$inout # $a4
773 mov %r12,$_inp # $a4
774___
775sub Xupdate_256_AVX () {
776 (
777 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
778 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
779 '&vpsrld ($t2,$t0,$sigma0[0]);',
780 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
781 '&vpsrld ($t3,$t0,$sigma0[2])',
782 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
783 '&vpxor ($t0,$t3,$t2)',
784 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
785 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
786 '&vpxor ($t0,$t0,$t1)',
787 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
788 '&vpxor ($t0,$t0,$t2)',
789 '&vpsrld ($t2,$t3,$sigma1[2]);',
790 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
791 '&vpsrlq ($t3,$t3,$sigma1[0]);',
792 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
793 '&vpxor ($t2,$t2,$t3);',
794 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
795 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
796 '&vpshufd ($t2,$t2,0b10000100)',
797 '&vpsrldq ($t2,$t2,8)',
798 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
799 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
800 '&vpsrld ($t2,$t3,$sigma1[2])',
801 '&vpsrlq ($t3,$t3,$sigma1[0])',
802 '&vpxor ($t2,$t2,$t3);',
803 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
804 '&vpxor ($t2,$t2,$t3)',
805 '&vpshufd ($t2,$t2,0b11101000)',
806 '&vpslldq ($t2,$t2,8)',
807 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
808 );
809}
810
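# A pure-Perl reference (standalone, illustration only) for the message
# schedule that the shift/xor sequence above vectorizes four words at a
# time, using the @sigma0/@sigma1 counts from the top of the file:
# W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16].
sub demo_sigma0 { my $x = shift; (($x>>7 | $x<<25) ^ ($x>>18 | $x<<14) ^ ($x>>3))  & 0xffffffff }
sub demo_sigma1 { my $x = shift; (($x>>17 | $x<<15) ^ ($x>>19 | $x<<13) ^ ($x>>10)) & 0xffffffff }
sub demo_Wt {
	my ($W) = @_;	# ref to the 16 most recent words, W[t-16..t-1]
	(demo_sigma1($W->[14]) + $W->[9] + demo_sigma0($W->[1]) + $W->[0]) & 0xffffffff;
}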
811sub AVX_256_00_47 () {
812my $j = shift;
813my $body = shift;
814my @X = @_;
815my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
816
817 foreach (Xupdate_256_AVX()) { # 29 instructions
818 eval;
819 eval(shift(@insns));
820 eval(shift(@insns));
821 eval(shift(@insns));
822 }
823 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
824 foreach (@insns) { eval; } # remaining instructions
825 &vmovdqa (16*$j."(%rsp)",$t2);
826}
827
828 $aesni_cbc_idx=0;
829 for ($i=0,$j=0; $j<4; $j++) {
830 &AVX_256_00_47($j,\&body_00_15,@X);
831 push(@X,shift(@X)); # rotate(@X)
832 }
833 &mov ("%r12",$_inp); # borrow $a4
834 &vpand ($temp,$temp,$mask14);
835 &mov ("%r15",$_out); # borrow $a2
836 &vpor ($iv,$iv,$temp);
837 &vmovdqu ("(%r15,%r12)",$iv); # write output
838 &lea ("%r12","16(%r12)"); # inp++
839
840 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
841 &jne (".Lavx_00_47");
842
843 &vmovdqu ($inout,"(%r12)");
844 &mov ($_inp,"%r12");
845
846 $aesni_cbc_idx=0;
847 for ($i=0; $i<16; ) {
848 foreach(body_00_15()) { eval; }
849 }
850
851 }
852$code.=<<___;
853 mov $_inp,%r12 # borrow $a4
854 mov $_out,%r13 # borrow $a0
855 mov $_ctx,%r15 # borrow $a2
856 mov $_in0,%rsi # borrow $a3
857
858 vpand $mask14,$temp,$temp
859 mov $a1,$A
860 vpor $temp,$iv,$iv
861 vmovdqu $iv,(%r13,%r12) # write output
862 lea 16(%r12),%r12 # inp++
863
864 add $SZ*0(%r15),$A
865 add $SZ*1(%r15),$B
866 add $SZ*2(%r15),$C
867 add $SZ*3(%r15),$D
868 add $SZ*4(%r15),$E
869 add $SZ*5(%r15),$F
870 add $SZ*6(%r15),$G
871 add $SZ*7(%r15),$H
872
873 cmp $_end,%r12
874
875 mov $A,$SZ*0(%r15)
876 mov $B,$SZ*1(%r15)
877 mov $C,$SZ*2(%r15)
878 mov $D,$SZ*3(%r15)
879 mov $E,$SZ*4(%r15)
880 mov $F,$SZ*5(%r15)
881 mov $G,$SZ*6(%r15)
882 mov $H,$SZ*7(%r15)
883 jb .Lloop_avx
884
885 mov $_ivp,$ivp
886 mov $_rsp,%rsi
887.cfi_def_cfa %rsi,8
888 vmovdqu $iv,($ivp) # output IV
889 vzeroall
890___
891$code.=<<___ if ($win64);
892 movaps `$framesz+16*0`(%rsp),%xmm6
893 movaps `$framesz+16*1`(%rsp),%xmm7
894 movaps `$framesz+16*2`(%rsp),%xmm8
895 movaps `$framesz+16*3`(%rsp),%xmm9
896 movaps `$framesz+16*4`(%rsp),%xmm10
897 movaps `$framesz+16*5`(%rsp),%xmm11
898 movaps `$framesz+16*6`(%rsp),%xmm12
899 movaps `$framesz+16*7`(%rsp),%xmm13
900 movaps `$framesz+16*8`(%rsp),%xmm14
901 movaps `$framesz+16*9`(%rsp),%xmm15
902___
903$code.=<<___;
904 mov -48(%rsi),%r15
905.cfi_restore %r15
906 mov -40(%rsi),%r14
907.cfi_restore %r14
908 mov -32(%rsi),%r13
909.cfi_restore %r13
910 mov -24(%rsi),%r12
911.cfi_restore %r12
912 mov -16(%rsi),%rbp
913.cfi_restore %rbp
914 mov -8(%rsi),%rbx
915.cfi_restore %rbx
916 lea (%rsi),%rsp
917.cfi_def_cfa_register %rsp
918.Lepilogue_avx:
919 ret
920.cfi_endproc
921.size ${func}_avx,.-${func}_avx
922___
923
924if ($avx>1) {{
925######################################################################
926# AVX2+BMI code path
927#
928my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
929my $PUSH8=8*2*$SZ;
930use integer;
931
932sub bodyx_00_15 () {
933 # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
934 (
935 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
936
937 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
938 '&and ($a4,$e)', # f&e
939 '&rorx ($a0,$e,$Sigma1[2])',
940 '&rorx ($a2,$e,$Sigma1[1])',
941
942 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
943 '&lea ($h,"($h,$a4)")',
944 '&andn ($a4,$e,$g)', # ~e&g
945 '&xor ($a0,$a2)',
946
947 '&rorx ($a1,$e,$Sigma1[0])',
948 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
949 '&xor ($a0,$a1)', # Sigma1(e)
950 '&mov ($a2,$a)',
951
952 '&rorx ($a4,$a,$Sigma0[2])',
953 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
954 '&xor ($a2,$b)', # a^b, b^c in next round
955 '&rorx ($a1,$a,$Sigma0[1])',
956
957 '&rorx ($a0,$a,$Sigma0[0])',
958 '&lea ($d,"($d,$h)")', # d+=h
959 '&and ($a3,$a2)', # (b^c)&(a^b)
960 @aesni_cbc_block[$aesni_cbc_idx++].
961 '&xor ($a1,$a4)',
962
963 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
964 '&xor ($a1,$a0)', # Sigma0(a)
965 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
966 '&mov ($a4,$e)', # copy of f in future
967
968 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
969 );
970 # and at the finish one has to do $a+=$a1
971}
972
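# A standalone check (illustration only) of the identity the body above
# leans on: e&f and (~e)&g can never share a set bit, so the xor inside
# Ch(e,f,g) may be replaced by an addition, which fuses into lea.
for (1 .. 1000) {
	my ($e, $f, $g) = map { int rand 2**32 } 1 .. 3;
	my $xor = (($e & $f) ^ ((~$e) & $g)) & 0xffffffff;
	my $add = (($e & $f) + ((~$e) & $g)) & 0xffffffff;
	die "Ch identity broken" unless $xor == $add;
}
print STDERR "Ch-as-add identity holds\n";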
973$code.=<<___;
974.type ${func}_avx2,\@function,6
975.align 64
976${func}_avx2:
977.cfi_startproc
978.Lavx2_shortcut:
979 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
980 mov %rsp,%rax # copy %rsp
981.cfi_def_cfa_register %rax
982 push %rbx
983.cfi_push %rbx
984 push %rbp
985.cfi_push %rbp
986 push %r12
987.cfi_push %r12
988 push %r13
989.cfi_push %r13
990 push %r14
991.cfi_push %r14
992 push %r15
993.cfi_push %r15
994 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
995 and \$-256*$SZ,%rsp # align stack frame
996 add \$`2*$SZ*($rounds-8)`,%rsp
997
998 shl \$6,$len
999 sub $inp,$out # re-bias
1000 sub $inp,$in0
1001 add $inp,$len # end of input
1002
1003 #mov $inp,$_inp # saved later
1004 #mov $out,$_out # kept in $offload
1005 mov $len,$_end
1006 #mov $key,$_key # remains resident in $inp register
1007 mov $ivp,$_ivp
1008 mov $ctx,$_ctx
1009 mov $in0,$_in0
1010 mov %rax,$_rsp
1011.cfi_cfa_expression $_rsp,deref,+8
1012___
1013$code.=<<___ if ($win64);
1014 movaps %xmm6,`$framesz+16*0`(%rsp)
1015 movaps %xmm7,`$framesz+16*1`(%rsp)
1016 movaps %xmm8,`$framesz+16*2`(%rsp)
1017 movaps %xmm9,`$framesz+16*3`(%rsp)
1018 movaps %xmm10,`$framesz+16*4`(%rsp)
1019 movaps %xmm11,`$framesz+16*5`(%rsp)
1020 movaps %xmm12,`$framesz+16*6`(%rsp)
1021 movaps %xmm13,`$framesz+16*7`(%rsp)
1022 movaps %xmm14,`$framesz+16*8`(%rsp)
1023 movaps %xmm15,`$framesz+16*9`(%rsp)
1024___
1025$code.=<<___;
1026.Lprologue_avx2:
1027 vzeroall
1028
1029 mov $inp,%r13 # borrow $a0
1030 vpinsrq \$1,$out,$offload,$offload
1031 lea 0x80($key),$inp # size optimization, reassign
1032 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
1033 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
1034 mov $ctx,%r15 # borrow $a2
1035 mov $in0,%rsi # borrow $a3
1036 vmovdqu ($ivp),$iv # load IV
1037 lea -9(%r14),%r14
1038
1039 vmovdqa 0x00(%r12,%r14,8),$mask14
1040 vmovdqa 0x10(%r12,%r14,8),$mask12
1041 vmovdqa 0x20(%r12,%r14,8),$mask10
1042
1043 sub \$-16*$SZ,%r13 # inp++, size optimization
1044 mov $SZ*0(%r15),$A
1045 lea (%rsi,%r13),%r12 # borrow $a0
1046 mov $SZ*1(%r15),$B
1047 cmp $len,%r13 # $_end
1048 mov $SZ*2(%r15),$C
1049 cmove %rsp,%r12 # next block or random data
1050 mov $SZ*3(%r15),$D
1051 mov $SZ*4(%r15),$E
1052 mov $SZ*5(%r15),$F
1053 mov $SZ*6(%r15),$G
1054 mov $SZ*7(%r15),$H
1055 vmovdqu 0x00-0x80($inp),$roundkey
1056___
1057 if ($SZ==4) { # SHA256
1058 my @X = map("%ymm$_",(0..3));
1059 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1060
1061$code.=<<___;
1062 jmp .Loop_avx2
1063.align 16
1064.Loop_avx2:
1065 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1066 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1067 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1068 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1069 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1070
1071 vinserti128 \$1,(%r12),@X[0],@X[0]
1072 vinserti128 \$1,16(%r12),@X[1],@X[1]
1073 vpshufb $t3,@X[0],@X[0]
1074 vinserti128 \$1,32(%r12),@X[2],@X[2]
1075 vpshufb $t3,@X[1],@X[1]
1076 vinserti128 \$1,48(%r12),@X[3],@X[3]
1077
1078 lea $TABLE(%rip),$Tbl
1079 vpshufb $t3,@X[2],@X[2]
1080 lea -16*$SZ(%r13),%r13
1081 vpaddd 0x00($Tbl),@X[0],$t0
1082 vpshufb $t3,@X[3],@X[3]
1083 vpaddd 0x20($Tbl),@X[1],$t1
1084 vpaddd 0x40($Tbl),@X[2],$t2
1085 vpaddd 0x60($Tbl),@X[3],$t3
1086 vmovdqa $t0,0x00(%rsp)
1087 xor $a1,$a1
1088 vmovdqa $t1,0x20(%rsp)
1089___
1090$code.=<<___ if (!$win64);
1091# temporarily use %rsi as frame pointer
1092 mov $_rsp,%rsi
1093.cfi_def_cfa %rsi,8
1094___
1095$code.=<<___;
1096 lea -$PUSH8(%rsp),%rsp
1097___
1098$code.=<<___ if (!$win64);
1099# the frame info is at $_rsp, but the stack is moving...
1100# so a second frame pointer is saved at -8(%rsp)
1101# that is in the red zone
1102 mov %rsi,-8(%rsp)
1103.cfi_cfa_expression %rsp-8,deref,+8
1104___
1105$code.=<<___;
1106 mov $B,$a3
1107 vmovdqa $t2,0x00(%rsp)
1108 xor $C,$a3 # magic
1109 vmovdqa $t3,0x20(%rsp)
1110 mov $F,$a4
1111 sub \$-16*2*$SZ,$Tbl # size optimization
1112 jmp .Lavx2_00_47
1113
1114.align 16
1115.Lavx2_00_47:
1116 vmovdqu (%r13),$inout
1117 vpinsrq \$0,%r13,$offload,$offload
1118___
1119
1120sub AVX2_256_00_47 () {
1121my $j = shift;
1122my $body = shift;
1123my @X = @_;
1124my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1125my $base = "+2*$PUSH8(%rsp)";
1126
1127 if (($j%2)==0) {
1128 &lea ("%rsp","-$PUSH8(%rsp)");
1129$code.=<<___ if (!$win64);
1130.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
1131# copy secondary frame pointer to new location again at -8(%rsp)
1132 pushq $PUSH8-8(%rsp)
1133.cfi_cfa_expression %rsp,deref,+8
1134 lea 8(%rsp),%rsp
1135.cfi_cfa_expression %rsp-8,deref,+8
1136___
1137 }
1138 foreach (Xupdate_256_AVX()) { # 29 instructions
1139 eval;
1140 eval(shift(@insns));
1141 eval(shift(@insns));
1142 eval(shift(@insns));
1143 }
1144 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1145 foreach (@insns) { eval; } # remaining instructions
1146 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1147}
1148 $aesni_cbc_idx=0;
1149 for ($i=0,$j=0; $j<4; $j++) {
1150 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1151 push(@X,shift(@X)); # rotate(@X)
1152 }
1153 &vmovq ("%r13",$offload); # borrow $a0
1154 &vpextrq ("%r15",$offload,1); # borrow $a2
1155 &vpand ($temp,$temp,$mask14);
1156 &vpor ($iv,$iv,$temp);
1157 &vmovdqu ("(%r15,%r13)",$iv); # write output
1158 &lea ("%r13","16(%r13)"); # inp++
1159
1160 &lea ($Tbl,16*2*$SZ."($Tbl)");
1161 &cmpb (($SZ-1)."($Tbl)",0);
1162 &jne (".Lavx2_00_47");
1163
1164 &vmovdqu ($inout,"(%r13)");
1165 &vpinsrq ($offload,$offload,"%r13",0);
1166
1167 $aesni_cbc_idx=0;
1168 for ($i=0; $i<16; ) {
1169 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1170 foreach(bodyx_00_15()) { eval; }
1171 }
1172 }
1173$code.=<<___;
1174 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1175 vmovq $offload,%r13 # $_inp, borrow $a0
1176 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1177 add $a1,$A
1178 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1179
1180 vpand $mask14,$temp,$temp
1181 vpor $temp,$iv,$iv
1182 vmovdqu $iv,(%r12,%r13) # write output
1183 lea 16(%r13),%r13
1184
1185 add $SZ*0(%r15),$A
1186 add $SZ*1(%r15),$B
1187 add $SZ*2(%r15),$C
1188 add $SZ*3(%r15),$D
1189 add $SZ*4(%r15),$E
1190 add $SZ*5(%r15),$F
1191 add $SZ*6(%r15),$G
1192 add $SZ*7(%r15),$H
1193
1194 mov $A,$SZ*0(%r15)
1195 mov $B,$SZ*1(%r15)
1196 mov $C,$SZ*2(%r15)
1197 mov $D,$SZ*3(%r15)
1198 mov $E,$SZ*4(%r15)
1199 mov $F,$SZ*5(%r15)
1200 mov $G,$SZ*6(%r15)
1201 mov $H,$SZ*7(%r15)
1202
1203 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1204 je .Ldone_avx2
1205
1206 xor $a1,$a1
1207 mov $B,$a3
1208 mov $F,$a4
1209 xor $C,$a3 # magic
1210 jmp .Lower_avx2
1211.align 16
1212.Lower_avx2:
1213 vmovdqu (%r13),$inout
1214 vpinsrq \$0,%r13,$offload,$offload
1215___
1216 $aesni_cbc_idx=0;
1217 for ($i=0; $i<16; ) {
1218 my $base="+16($Tbl)";
1219 foreach(bodyx_00_15()) { eval; }
1220 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1221 }
1222$code.=<<___;
1223 vmovq $offload,%r13 # borrow $a0
1224 vpextrq \$1,$offload,%r15 # borrow $a2
1225 vpand $mask14,$temp,$temp
1226 vpor $temp,$iv,$iv
1227 lea -$PUSH8($Tbl),$Tbl
1228 vmovdqu $iv,(%r15,%r13) # write output
1229 lea 16(%r13),%r13 # inp++
1230 cmp %rsp,$Tbl
1231 jae .Lower_avx2
1232
1233 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1234 lea 16*$SZ(%r13),%r13
1235 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1236 add $a1,$A
1237 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1238
1239 add $SZ*0(%r15),$A
1240 add $SZ*1(%r15),$B
1241 add $SZ*2(%r15),$C
1242 add $SZ*3(%r15),$D
1243 add $SZ*4(%r15),$E
1244 add $SZ*5(%r15),$F
1245 add $SZ*6(%r15),$G
1246 lea (%rsi,%r13),%r12
1247 add $SZ*7(%r15),$H
1248
1249 cmp $_end,%r13
1250
1251 mov $A,$SZ*0(%r15)
1252 cmove %rsp,%r12 # next block or stale data
1253 mov $B,$SZ*1(%r15)
1254 mov $C,$SZ*2(%r15)
1255 mov $D,$SZ*3(%r15)
1256 mov $E,$SZ*4(%r15)
1257 mov $F,$SZ*5(%r15)
1258 mov $G,$SZ*6(%r15)
1259 mov $H,$SZ*7(%r15)
1260
1261 jbe .Loop_avx2
1262 lea (%rsp),$Tbl
1263# temporarily use $Tbl as index to $_rsp
1264# this avoids the need to save a secondary frame pointer at -8(%rsp)
1265.cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8
1266
1267.Ldone_avx2:
1268 mov 16*$SZ+4*8($Tbl),$ivp
1269 mov 16*$SZ+7*8($Tbl),%rsi
1270.cfi_def_cfa %rsi,8
1271 vmovdqu $iv,($ivp) # output IV
1272 vzeroall
1273___
1274$code.=<<___ if ($win64);
1275 movaps `$framesz+16*0`($Tbl),%xmm6
1276 movaps `$framesz+16*1`($Tbl),%xmm7
1277 movaps `$framesz+16*2`($Tbl),%xmm8
1278 movaps `$framesz+16*3`($Tbl),%xmm9
1279 movaps `$framesz+16*4`($Tbl),%xmm10
1280 movaps `$framesz+16*5`($Tbl),%xmm11
1281 movaps `$framesz+16*6`($Tbl),%xmm12
1282 movaps `$framesz+16*7`($Tbl),%xmm13
1283 movaps `$framesz+16*8`($Tbl),%xmm14
1284 movaps `$framesz+16*9`($Tbl),%xmm15
1285___
1286$code.=<<___;
1287 mov -48(%rsi),%r15
1288.cfi_restore %r15
1289 mov -40(%rsi),%r14
1290.cfi_restore %r14
1291 mov -32(%rsi),%r13
1292.cfi_restore %r13
1293 mov -24(%rsi),%r12
1294.cfi_restore %r12
1295 mov -16(%rsi),%rbp
1296.cfi_restore %rbp
1297 mov -8(%rsi),%rbx
1298.cfi_restore %rbx
1299 lea (%rsi),%rsp
1300.cfi_def_cfa_register %rsp
1301.Lepilogue_avx2:
1302 ret
1303.cfi_endproc
1304.size ${func}_avx2,.-${func}_avx2
1305___
1306}}
1307}}
1308{{
1309my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1310
1311my ($rounds,$Tbl)=("%r11d","%rbx");
1312
1313my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1314my @rndkey=("%xmm4","%xmm5");
1315my $r=0;
1316my $sn=0;
1317
1318my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1319my @MSG=map("%xmm$_",(10..13));
1320
1321my $aesenc=sub {
1322 use integer;
1323 my ($n,$k)=($r/10,$r%10);
1324 if ($k==0) {
1325 $code.=<<___;
1326 movups `16*$n`($in0),$in # load input
1327 xorps $rndkey0,$in
1328___
1329 $code.=<<___ if ($n);
1330 movups $iv,`16*($n-1)`($out,$in0) # write output
1331___
1332 $code.=<<___;
1333 xorps $in,$iv
1334 movups `32+16*$k-112`($key),$rndkey[1]
1335 aesenc $rndkey[0],$iv
1336___
1337 } elsif ($k==9) {
1338 $sn++;
1339 $code.=<<___;
1340 cmp \$11,$rounds
1341 jb .Laesenclast$sn
1342 movups `32+16*($k+0)-112`($key),$rndkey[1]
1343 aesenc $rndkey[0],$iv
1344 movups `32+16*($k+1)-112`($key),$rndkey[0]
1345 aesenc $rndkey[1],$iv
1346 je .Laesenclast$sn
1347 movups `32+16*($k+2)-112`($key),$rndkey[1]
1348 aesenc $rndkey[0],$iv
1349 movups `32+16*($k+3)-112`($key),$rndkey[0]
1350 aesenc $rndkey[1],$iv
1351.Laesenclast$sn:
1352 aesenclast $rndkey[0],$iv
1353 movups 16-112($key),$rndkey[1] # forward reference
1354 nop
1355___
1356 } else {
1357 $code.=<<___;
1358 movups `32+16*$k-112`($key),$rndkey[1]
1359 aesenc $rndkey[0],$iv
1360___
1361 }
1362 $r++; unshift(@rndkey,pop(@rndkey));
1363};
1364
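# A standalone sketch (illustration only) of the closure's pacing: $r
# counts aesenc emissions; each group of ten starts by loading and xoring
# a fresh input block ($k==0) and ends with the 10/12/14-round key-length
# tail plus aesenclast ($k==9). The SHA-extension path below interleaves
# these calls with its sha256rnds2 pairs, 40 calls per four AES blocks.
for my $r (0 .. 19) {
	my ($n, $k) = (int($r/10), $r%10);
	print STDERR "call $r: block $n, step $k",
	    $k==0 ? " (load+xor)" : $k==9 ? " (tail+aesenclast)" : "", "\n";
}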
1365if ($shaext) {
1366my $Tbl="%rax";
1367
1368$code.=<<___;
1369.type ${func}_shaext,\@function,6
1370.align 32
1371${func}_shaext:
1372.cfi_startproc
1373 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1374___
1375$code.=<<___ if ($win64);
1376 lea `-8-10*16`(%rsp),%rsp
1377 movaps %xmm6,-8-10*16(%rax)
1378 movaps %xmm7,-8-9*16(%rax)
1379 movaps %xmm8,-8-8*16(%rax)
1380 movaps %xmm9,-8-7*16(%rax)
1381 movaps %xmm10,-8-6*16(%rax)
1382 movaps %xmm11,-8-5*16(%rax)
1383 movaps %xmm12,-8-4*16(%rax)
1384 movaps %xmm13,-8-3*16(%rax)
1385 movaps %xmm14,-8-2*16(%rax)
1386 movaps %xmm15,-8-1*16(%rax)
1387.Lprologue_shaext:
1388___
1389$code.=<<___;
1390 lea K256+0x80(%rip),$Tbl
1391 movdqu ($ctx),$ABEF # DCBA
1392 movdqu 16($ctx),$CDGH # HGFE
1393 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1394
1395 mov 240($key),$rounds
1396 sub $in0,$out
1397 movups ($key),$rndkey0 # $key[0]
1398 movups ($ivp),$iv # load IV
1399 movups 16($key),$rndkey[0] # forward reference
1400 lea 112($key),$key # size optimization
1401
1402 pshufd \$0x1b,$ABEF,$Wi # ABCD
1403 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1404 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1405 movdqa $TMP,$BSWAP # offload
1406 palignr \$8,$CDGH,$ABEF # ABEF
1407 punpcklqdq $Wi,$CDGH # CDGH
1408
1409 jmp .Loop_shaext
1410
1411.align 16
1412.Loop_shaext:
1413 movdqu ($inp),@MSG[0]
1414 movdqu 0x10($inp),@MSG[1]
1415 movdqu 0x20($inp),@MSG[2]
1416 pshufb $TMP,@MSG[0]
1417 movdqu 0x30($inp),@MSG[3]
1418
1419 movdqa 0*32-0x80($Tbl),$Wi
1420 paddd @MSG[0],$Wi
1421 pshufb $TMP,@MSG[1]
1422 movdqa $CDGH,$CDGH_SAVE # offload
1423 movdqa $ABEF,$ABEF_SAVE # offload
1424___
1425 &$aesenc();
1426$code.=<<___;
1427 sha256rnds2 $ABEF,$CDGH # 0-3
1428 pshufd \$0x0e,$Wi,$Wi
1429___
1430 &$aesenc();
1431$code.=<<___;
1432 sha256rnds2 $CDGH,$ABEF
1433
1434 movdqa 1*32-0x80($Tbl),$Wi
1435 paddd @MSG[1],$Wi
1436 pshufb $TMP,@MSG[2]
1437 lea 0x40($inp),$inp
1438___
1439 &$aesenc();
1440$code.=<<___;
1441 sha256rnds2 $ABEF,$CDGH # 4-7
1442 pshufd \$0x0e,$Wi,$Wi
1443___
1444 &$aesenc();
1445$code.=<<___;
1446 sha256rnds2 $CDGH,$ABEF
1447
1448 movdqa 2*32-0x80($Tbl),$Wi
1449 paddd @MSG[2],$Wi
1450 pshufb $TMP,@MSG[3]
1451 sha256msg1 @MSG[1],@MSG[0]
1452___
1453 &$aesenc();
1454$code.=<<___;
1455 sha256rnds2 $ABEF,$CDGH # 8-11
1456 pshufd \$0x0e,$Wi,$Wi
1457 movdqa @MSG[3],$TMP
1458 palignr \$4,@MSG[2],$TMP
1459 paddd $TMP,@MSG[0]
1460___
1461 &$aesenc();
1462$code.=<<___;
1463 sha256rnds2 $CDGH,$ABEF
1464
1465 movdqa 3*32-0x80($Tbl),$Wi
1466 paddd @MSG[3],$Wi
1467 sha256msg2 @MSG[3],@MSG[0]
1468 sha256msg1 @MSG[2],@MSG[1]
1469___
1470 &$aesenc();
1471$code.=<<___;
1472 sha256rnds2 $ABEF,$CDGH # 12-15
1473 pshufd \$0x0e,$Wi,$Wi
1474___
1475 &$aesenc();
1476$code.=<<___;
1477 movdqa @MSG[0],$TMP
1478 palignr \$4,@MSG[3],$TMP
1479 paddd $TMP,@MSG[1]
1480 sha256rnds2 $CDGH,$ABEF
1481___
1482for($i=4;$i<16-3;$i++) {
1483 &$aesenc() if (($r%10)==0);
1484$code.=<<___;
1485 movdqa $i*32-0x80($Tbl),$Wi
1486 paddd @MSG[0],$Wi
1487 sha256msg2 @MSG[0],@MSG[1]
1488 sha256msg1 @MSG[3],@MSG[2]
1489___
1490 &$aesenc();
1491$code.=<<___;
1492 sha256rnds2 $ABEF,$CDGH # 16-19...
1493 pshufd \$0x0e,$Wi,$Wi
1494 movdqa @MSG[1],$TMP
1495 palignr \$4,@MSG[0],$TMP
1496 paddd $TMP,@MSG[2]
1497___
1498 &$aesenc();
1499 &$aesenc() if ($r==19);
1500$code.=<<___;
1501 sha256rnds2 $CDGH,$ABEF
1502___
1503 push(@MSG,shift(@MSG));
1504}
1505$code.=<<___;
1506 movdqa 13*32-0x80($Tbl),$Wi
1507 paddd @MSG[0],$Wi
1508 sha256msg2 @MSG[0],@MSG[1]
1509 sha256msg1 @MSG[3],@MSG[2]
1510___
1511 &$aesenc();
1512$code.=<<___;
1513 sha256rnds2 $ABEF,$CDGH # 52-55
1514 pshufd \$0x0e,$Wi,$Wi
1515 movdqa @MSG[1],$TMP
1516 palignr \$4,@MSG[0],$TMP
1517 paddd $TMP,@MSG[2]
1518___
1519 &$aesenc();
1520 &$aesenc();
1521$code.=<<___;
1522 sha256rnds2 $CDGH,$ABEF
1523
1524 movdqa 14*32-0x80($Tbl),$Wi
1525 paddd @MSG[1],$Wi
1526 sha256msg2 @MSG[1],@MSG[2]
1527 movdqa $BSWAP,$TMP
1528___
1529 &$aesenc();
1530$code.=<<___;
1531 sha256rnds2 $ABEF,$CDGH # 56-59
1532 pshufd \$0x0e,$Wi,$Wi
1533___
1534 &$aesenc();
1535$code.=<<___;
1536 sha256rnds2 $CDGH,$ABEF
1537
1538 movdqa 15*32-0x80($Tbl),$Wi
1539 paddd @MSG[2],$Wi
1540___
1541 &$aesenc();
1542 &$aesenc();
1543$code.=<<___;
1544 sha256rnds2 $ABEF,$CDGH # 60-63
1545 pshufd \$0x0e,$Wi,$Wi
1546___
1547 &$aesenc();
1548$code.=<<___;
1549 sha256rnds2 $CDGH,$ABEF
1550 #pxor $CDGH,$rndkey0 # black magic
1551___
1552 while ($r<40) { &$aesenc(); } # remaining aesenc's
1553$code.=<<___;
1554 #xorps $CDGH,$rndkey0 # black magic
1555 paddd $CDGH_SAVE,$CDGH
1556 paddd $ABEF_SAVE,$ABEF
1557
1558 dec $len
1559 movups $iv,48($out,$in0) # write output
1560 lea 64($in0),$in0
1561 jnz .Loop_shaext
1562
1563 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1564 pshufd \$0x1b,$ABEF,$TMP # FEBA
1565 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1566 punpckhqdq $CDGH,$ABEF # DCBA
1567 palignr \$8,$TMP,$CDGH # HGFE
1568
1569 movups $iv,($ivp) # write IV
1570 movdqu $ABEF,($ctx)
1571 movdqu $CDGH,16($ctx)
1572___
1573$code.=<<___ if ($win64);
1574 movaps 0*16(%rsp),%xmm6
1575 movaps 1*16(%rsp),%xmm7
1576 movaps 2*16(%rsp),%xmm8
1577 movaps 3*16(%rsp),%xmm9
1578 movaps 4*16(%rsp),%xmm10
1579 movaps 5*16(%rsp),%xmm11
1580 movaps 6*16(%rsp),%xmm12
1581 movaps 7*16(%rsp),%xmm13
1582 movaps 8*16(%rsp),%xmm14
1583 movaps 9*16(%rsp),%xmm15
1584 lea 8+10*16(%rsp),%rsp
1585.Lepilogue_shaext:
1586___
1587$code.=<<___;
1588 ret
1589.cfi_endproc
1590.size ${func}_shaext,.-${func}_shaext
1591___
1592}
1593}}}}}
1594
1595# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1596# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1597if ($win64 && $avx) {
1598$rec="%rcx";
1599$frame="%rdx";
1600$context="%r8";
1601$disp="%r9";
1602
1603$code.=<<___;
1604.extern __imp_RtlVirtualUnwind
1605.type se_handler,\@abi-omnipotent
1606.align 16
1607se_handler:
1608 push %rsi
1609 push %rdi
1610 push %rbx
1611 push %rbp
1612 push %r12
1613 push %r13
1614 push %r14
1615 push %r15
1616 pushfq
1617 sub \$64,%rsp
1618
1619 mov 120($context),%rax # pull context->Rax
1620 mov 248($context),%rbx # pull context->Rip
1621
1622 mov 8($disp),%rsi # disp->ImageBase
1623 mov 56($disp),%r11 # disp->HandlerData
1624
1625 mov 0(%r11),%r10d # HandlerData[0]
1626 lea (%rsi,%r10),%r10 # prologue label
1627 cmp %r10,%rbx # context->Rip<prologue label
1628 jb .Lin_prologue
1629
1630 mov 152($context),%rax # pull context->Rsp
1631
1632 mov 4(%r11),%r10d # HandlerData[1]
1633 lea (%rsi,%r10),%r10 # epilogue label
1634 cmp %r10,%rbx # context->Rip>=epilogue label
1635 jae .Lin_prologue
1636___
1637$code.=<<___ if ($shaext);
1638 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1639 cmp %r10,%rbx
1640 jb .Lnot_in_shaext
1641
1642 lea (%rax),%rsi
1643 lea 512($context),%rdi # &context.Xmm6
1644 mov \$20,%ecx
1645 .long 0xa548f3fc # cld; rep movsq
1646 lea 168(%rax),%rax # adjust stack pointer
1647 jmp .Lin_prologue
1648.Lnot_in_shaext:
1649___
1650$code.=<<___ if ($avx>1);
1651 lea .Lavx2_shortcut(%rip),%r10
1652 cmp %r10,%rbx # context->Rip<avx2_shortcut
1653 jb .Lnot_in_avx2
1654
1655 and \$-256*$SZ,%rax
1656 add \$`2*$SZ*($rounds-8)`,%rax
1657.Lnot_in_avx2:
1658___
1659$code.=<<___;
1660 mov %rax,%rsi # put aside Rsp
1661 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1662
1663 mov -8(%rax),%rbx
1664 mov -16(%rax),%rbp
1665 mov -24(%rax),%r12
1666 mov -32(%rax),%r13
1667 mov -40(%rax),%r14
1668 mov -48(%rax),%r15
1669 mov %rbx,144($context) # restore context->Rbx
1670 mov %rbp,160($context) # restore context->Rbp
1671 mov %r12,216($context) # restore context->R12
1672 mov %r13,224($context) # restore context->R13
1673 mov %r14,232($context) # restore context->R14
1674 mov %r15,240($context) # restore context->R15
1675
1676 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1677 lea 512($context),%rdi # &context.Xmm6
1678 mov \$20,%ecx
1679 .long 0xa548f3fc # cld; rep movsq
1680
1681.Lin_prologue:
1682 mov 8(%rax),%rdi
1683 mov 16(%rax),%rsi
1684 mov %rax,152($context) # restore context->Rsp
1685 mov %rsi,168($context) # restore context->Rsi
1686 mov %rdi,176($context) # restore context->Rdi
1687
1688 mov 40($disp),%rdi # disp->ContextRecord
1689 mov $context,%rsi # context
1690 mov \$154,%ecx # sizeof(CONTEXT)
1691 .long 0xa548f3fc # cld; rep movsq
1692
1693 mov $disp,%rsi
1694 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1695 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1696 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1697 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1698 mov 40(%rsi),%r10 # disp->ContextRecord
1699 lea 56(%rsi),%r11 # &disp->HandlerData
1700 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1701 mov %r10,32(%rsp) # arg5
1702 mov %r11,40(%rsp) # arg6
1703 mov %r12,48(%rsp) # arg7
1704 mov %rcx,56(%rsp) # arg8, (NULL)
1705 call *__imp_RtlVirtualUnwind(%rip)
1706
1707 mov \$1,%eax # ExceptionContinueSearch
1708 add \$64,%rsp
1709 popfq
1710 pop %r15
1711 pop %r14
1712 pop %r13
1713 pop %r12
1714 pop %rbp
1715 pop %rbx
1716 pop %rdi
1717 pop %rsi
1718 ret
1719.size se_handler,.-se_handler
1720
1721.section .pdata
1722 .rva .LSEH_begin_${func}_xop
1723 .rva .LSEH_end_${func}_xop
1724 .rva .LSEH_info_${func}_xop
1725
1726 .rva .LSEH_begin_${func}_avx
1727 .rva .LSEH_end_${func}_avx
1728 .rva .LSEH_info_${func}_avx
1729___
1730$code.=<<___ if ($avx>1);
1731 .rva .LSEH_begin_${func}_avx2
1732 .rva .LSEH_end_${func}_avx2
1733 .rva .LSEH_info_${func}_avx2
1734___
1735$code.=<<___ if ($shaext);
1736 .rva .LSEH_begin_${func}_shaext
1737 .rva .LSEH_end_${func}_shaext
1738 .rva .LSEH_info_${func}_shaext
1739___
1740$code.=<<___;
1741.section .xdata
1742.align 8
1743.LSEH_info_${func}_xop:
1744 .byte 9,0,0,0
1745 .rva se_handler
1746 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1747
1748.LSEH_info_${func}_avx:
1749 .byte 9,0,0,0
1750 .rva se_handler
1751 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1752___
1753$code.=<<___ if ($avx>1);
1754.LSEH_info_${func}_avx2:
1755 .byte 9,0,0,0
1756 .rva se_handler
1757 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1758___
1759$code.=<<___ if ($shaext);
1760.LSEH_info_${func}_shaext:
1761 .byte 9,0,0,0
1762 .rva se_handler
1763 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1764___
1765}
1766
1767####################################################################
1768sub rex {
1769 local *opcode=shift;
1770 my ($dst,$src)=@_;
1771 my $rex=0;
1772
1773 $rex|=0x04 if($dst>=8);
1774 $rex|=0x01 if($src>=8);
1775 unshift @opcode,$rex|0x40 if($rex);
1776}
1777
1778{
1779 my %opcodelet = (
1780 "sha256rnds2" => 0xcb,
1781 "sha256msg1" => 0xcc,
1782 "sha256msg2" => 0xcd );
1783
1784 sub sha256op38 {
1785 my $instr = shift;
1786
1787 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1788 my @opcode=(0x0f,0x38);
1789 rex(\@opcode,$2,$1);
1790 push @opcode,$opcodelet{$instr};
1791 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1792 return ".byte\t".join(',',@opcode);
1793 } else {
1794 return $instr."\t".@_[0];
1795 }
1796 }
1797}
1798
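# A standalone example (illustration only; the register pair is arbitrary)
# of what the fallback encoder above produces for assemblers that predate
# the SHA extensions: NP 0F 38 CB /r, with ModR/M = 0xc0|(dst&7)<<3|(src&7).
print STDERR sha256op38("sha256rnds2", "%xmm0,%xmm2"), "\n";
# prints ".byte\t15,56,203,208", i.e. 0x0f,0x38,0xcb,0xd0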
1799$code =~ s/\`([^\`]*)\`/eval $1/gem;
1800$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1801print $code;
1802close STDOUT or die "error closing STDOUT: $!";