VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1k/crypto/aes/asm/aesni-sha256-x86_64.pl@90293

Last change on this file since 90293 was 90293, checked in by vboxsync, 4 years ago

openssl-1.1.1k: Applied and adjusted our OpenSSL changes to 1.1.1k. bugref:10072

File size: 43.3 KB
1#! /usr/bin/env perl
2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# January 2013
18#
19# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
21# that since AESNI-CBC encryption exhibits *very* low instruction-level
22# parallelism, interleaving it with another algorithm allows processor
23# resources to be utilized more fully, improving overall performance.
24# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25# the AESNI code is woven into them. As SHA256 dominates execution time,
26# stitch performance does not depend on AES key length. Below are
27# performance numbers in cycles per processed byte (less is better) for
28# standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
29# subroutine:
30#
31# AES-128/-192/-256+SHA256 this(**) gain
32# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
33# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
34# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
35# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
36# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
37# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54%
38# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60%
39#
40# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
41# Westmere is omitted from the loop, because the gain was not
42# estimated to be high enough to justify the effort;
43# (**) these are EVP-free results; results obtained with 'speed
44# -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
45# (***) these are SHAEXT results;
46
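# A standalone sketch (illustration only, not part of the generator) of how
# the gain column is derived: back-to-back execution costs the sum of the
# two cycles-per-byte figures, so e.g. the Sandy Bridge row works out to
# (5.05+11.6)/13.0 - 1, i.e. the quoted +28%.
for my $aes_cpb (5.05, 6.05, 7.05) {
	printf STDERR "gain: +%.0f%%\n", (($aes_cpb + 11.6)/13.0 - 1)*100;	# 28/36/43
}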
47$flavour = shift;
48$output = shift;
49if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
50
51$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
52
53$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
55( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
56die "can't locate x86_64-xlate.pl";
57
58if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
59 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
60 $avx = ($1>=2.19) + ($1>=2.22);
61}
62
63if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65 $avx = ($1>=2.09) + ($1>=2.10);
66}
67
68if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
70 $avx = ($1>=10) + ($1>=12);
71}
72
73if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
74 $avx = ($2>=3.0) + ($2>3.0);
75}
76
77$shaext=$avx; ### set to zero if compiling for 1.0.1
78$avx=1 if (!$shaext && $avx);
79
80open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
81*STDOUT=*OUT;
82
83$func="aesni_cbc_sha256_enc";
84$TABLE="K256";
85$SZ=4;
86@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
87 "%r8d","%r9d","%r10d","%r11d");
88($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
89@Sigma0=( 2,13,22);
90@Sigma1=( 6,11,25);
91@sigma0=( 7,18, 3);
92@sigma1=(17,19,10);
93$rounds=64;
94
95########################################################################
96# void aesni_cbc_sha256_enc(const void *inp,
97# void *out,
98# size_t length,
99# const AES_KEY *key,
100# unsigned char *iv,
101# SHA256_CTX *ctx,
102# const void *in0);
103($inp, $out, $len, $key, $ivp, $ctx, $in0) =
104("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
105
106$Tbl="%rbp";
107
108$_inp="16*$SZ+0*8(%rsp)";
109$_out="16*$SZ+1*8(%rsp)";
110$_end="16*$SZ+2*8(%rsp)";
111$_key="16*$SZ+3*8(%rsp)";
112$_ivp="16*$SZ+4*8(%rsp)";
113$_ctx="16*$SZ+5*8(%rsp)";
114$_in0="16*$SZ+6*8(%rsp)";
115$_rsp="`16*$SZ+7*8`(%rsp)";
116$framesz=16*$SZ+8*8;
117
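# A standalone sketch (illustration only) of the frame described by the
# offsets above: 16 message-schedule words first, then seven saved 8-byte
# pointer slots and the saved %rsp.
{
	my ($sz, @slots) = (4, qw(inp out end key ivp ctx in0 rsp));
	printf STDERR "X[0..15] at 0..%d\n", 16*$sz - 1;
	printf STDERR "%-3s at %d\n", $slots[$_], 16*$sz + 8*$_ for 0 .. $#slots;
	printf STDERR "framesz = %d bytes\n", 16*$sz + 8*8;
}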
118$code=<<___;
119.text
120
121.extern OPENSSL_ia32cap_P
122.globl $func
123.type $func,\@abi-omnipotent
124.align 16
125$func:
126.cfi_startproc
127___
128 if ($avx) {
129$code.=<<___;
130 lea OPENSSL_ia32cap_P(%rip),%r11
131 mov \$1,%eax
132 cmp \$0,`$win64?"%rcx":"%rdi"`
133 je .Lprobe
134 mov 0(%r11),%eax
135 mov 4(%r11),%r10
136___
137$code.=<<___ if ($shaext);
138 bt \$61,%r10 # check for SHA
139 jc ${func}_shaext
140___
141$code.=<<___;
142 mov %r10,%r11
143 shr \$32,%r11
144
145 test \$`1<<11`,%r10d # check for XOP
146 jnz ${func}_xop
147___
148$code.=<<___ if ($avx>1);
149 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
150 cmp \$`1<<8|1<<5|1<<3`,%r11d
151 je ${func}_avx2
152___
153$code.=<<___;
154 and \$`1<<28`,%r10d # check for AVX
155 jnz ${func}_avx
156 ud2
157___
158 }
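# A standalone sketch (illustration only) of the dispatch order emitted
# above, following the OPENSSL_ia32cap_P layout the tests rely on: word 1
# bit 11 = XOP, bit 28 = AVX; word 2 bits 3/5/8 = BMI1/AVX2/BMI2 and
# bit 29 = SHA extensions (bit 61 of the qword loaded from offset 4).
sub demo_pick_path {
	my ($w1, $w2) = @_;
	return "shaext" if $w2 & (1<<29);
	return "xop"    if $w1 & (1<<11);
	return "avx2"   if ($w2 & 0x128) == 0x128;	# 1<<8|1<<5|1<<3
	return "avx"    if $w1 & (1<<28);
	return "ud2";	# no SIMD path compiled in
}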
159$code.=<<___;
160 xor %eax,%eax
161 cmp \$0,`$win64?"%rcx":"%rdi"`
162 je .Lprobe
163 ud2
164.Lprobe:
165 ret
166.cfi_endproc
167.size $func,.-$func
168
169.align 64
170.type $TABLE,\@object
171$TABLE:
172 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
173 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
174 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
175 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
176 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
177 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
178 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
179 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
180 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
181 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
182 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
183 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
184 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
185 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
186 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
187 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
188 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
189 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
190 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
191 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
192 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
193 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
194 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
195 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
196 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
197 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
198 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
199 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
200 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
201 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
202 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
203 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
204
205 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
206 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
207 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
208 .long 0,0,0,0, 0,0,0,0
209 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
210.align 64
211___
212
213######################################################################
214# SIMD code paths
215#
216{{{
217($iv,$inout,$roundkey,$temp,
218 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
219
220$aesni_cbc_idx=0;
221@aesni_cbc_block = (
222## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
223## &vmovdqu ($inout,($inp));
224## &mov ($_inp,$inp);
225
226 '&vpxor ($inout,$inout,$roundkey);'.
227 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
228
229 '&vpxor ($inout,$inout,$iv);',
230
231 '&vaesenc ($inout,$inout,$roundkey);'.
232 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
233
234 '&vaesenc ($inout,$inout,$roundkey);'.
235 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
236
237 '&vaesenc ($inout,$inout,$roundkey);'.
238 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
239
240 '&vaesenc ($inout,$inout,$roundkey);'.
241 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
242
243 '&vaesenc ($inout,$inout,$roundkey);'.
244 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
245
246 '&vaesenc ($inout,$inout,$roundkey);'.
247 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
248
249 '&vaesenc ($inout,$inout,$roundkey);'.
250 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
251
252 '&vaesenc ($inout,$inout,$roundkey);'.
253 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
254
255 '&vaesenc ($inout,$inout,$roundkey);'.
256 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
257
258 '&vaesenclast ($temp,$inout,$roundkey);'.
259 ' &vaesenc ($inout,$inout,$roundkey);'.
260 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
261
262 '&vpand ($iv,$temp,$mask10);'.
263 ' &vaesenc ($inout,$inout,$roundkey);'.
264 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
265
266 '&vaesenclast ($temp,$inout,$roundkey);'.
267 ' &vaesenc ($inout,$inout,$roundkey);'.
268 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
269
270 '&vpand ($temp,$temp,$mask12);'.
271 ' &vaesenc ($inout,$inout,$roundkey);'.
272 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
273
274 '&vpor ($iv,$iv,$temp);'.
275 ' &vaesenclast ($temp,$inout,$roundkey);'.
276 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
277
278## &mov ($inp,$_inp);
279## &mov ($out,$_out);
280## &vpand ($temp,$temp,$mask14);
281## &vpor ($iv,$iv,$temp);
282## &vmovdqu ("($out,$inp)",$iv);
283## &lea ($inp,"16($inp)");
284);
285
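# A standalone sketch (illustration only): the sixteen snippets above carry
# exactly one CBC block through its xor/aesenc/aesenclast steps, and the
# round bodies below splice in one snippet per SHA-256 round, so every
# 16-round chunk of the hash completes one AES block.
my @demo_steps = map { "cbc step $_" } 1 .. 16;	# stand-ins for the list above
my $demo_idx = 0;
print STDERR "SHA round $_ <-> $demo_steps[$demo_idx++]\n" for 0 .. 15;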
286my $a4=$T1;
287my ($a,$b,$c,$d,$e,$f,$g,$h);
288
289sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
290{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
291 my $arg = pop;
292 $arg = "\$$arg" if ($arg*1 eq $arg);
293 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
294}
295
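# A standalone sketch (illustration only; the demo package name is made up)
# of what the AUTOLOAD thunk above does: any call to an undefined sub is
# intercepted and serialized as one AT&T-syntax instruction, with a numeric
# last argument becoming an immediate.
{
	package AutoloadDemo;
	our ($AUTOLOAD, $code);
	sub AUTOLOAD
	{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
	  my $arg = pop;
	  $arg = "\$$arg" if ($arg*1 eq $arg);
	  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
	}
	AutoloadDemo::ror("%r13d", 14);		# appends "\tror\t\$14,%r13d\n"
	print STDERR $code;
}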
296sub body_00_15 () {
297 (
298 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
299
300 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
301 '&mov ($a,$a1)',
302 '&mov ($a4,$f)',
303
304 '&xor ($a0,$e)',
305 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
306 '&xor ($a4,$g)', # f^g
307
308 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
309 '&xor ($a1,$a)',
310 '&and ($a4,$e)', # (f^g)&e
311
312 @aesni_cbc_block[$aesni_cbc_idx++].
313 '&xor ($a0,$e)',
314 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
315 '&mov ($a2,$a)',
316
317 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
318 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
319 '&xor ($a2,$b)', # a^b, b^c in next round
320
321 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
322 '&add ($h,$a4)', # h+=Ch(e,f,g)
323 '&and ($a3,$a2)', # (b^c)&(a^b)
324
325 '&xor ($a1,$a)',
326 '&add ($h,$a0)', # h+=Sigma1(e)
327 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
328
329 '&add ($d,$h)', # d+=h
330 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
331 '&add ($h,$a3)', # h+=Maj(a,b,c)
332
333 '&mov ($a0,$d)',
334 '&add ($a1,$h);'. # h+=Sigma0(a)
335 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
336 );
337}
338
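# A pure-Perl reference (standalone, illustration only) for what one round
# body computes, with the @Sigma0/@Sigma1 rotation counts defined above;
# the assembly merely staggers these operations in small groups so the AES
# snippets can be spliced in between.
sub demo_ror32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }
sub demo_round {
	my ($a,$b,$c,$d,$e,$f,$g,$h,$Kt,$Wt) = @_;
	my $S1  = demo_ror32($e,6) ^ demo_ror32($e,11) ^ demo_ror32($e,25);
	my $ch  = ($e & $f) ^ ((~$e) & $g);		# Ch(e,f,g) = ((f^g)&e)^g
	my $T1  = ($h + $S1 + $ch + $Kt + $Wt) & 0xffffffff;
	my $S0  = demo_ror32($a,2) ^ demo_ror32($a,13) ^ demo_ror32($a,22);
	my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);	# Maj(a,b,c)
	return ((($T1 + $S0 + $maj) & 0xffffffff),	# new a = T1+T2
		$a, $b, $c, (($d + $T1) & 0xffffffff),	# new e = d+T1
		$e, $f, $g);
}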
339if ($avx) {{
340######################################################################
341# XOP code path
342#
343$code.=<<___;
344.type ${func}_xop,\@function,6
345.align 64
346${func}_xop:
347.cfi_startproc
348.Lxop_shortcut:
349 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
350 mov %rsp,%rax # copy %rsp
351.cfi_def_cfa_register %rax
352 push %rbx
353.cfi_push %rbx
354 push %rbp
355.cfi_push %rbp
356 push %r12
357.cfi_push %r12
358 push %r13
359.cfi_push %r13
360 push %r14
361.cfi_push %r14
362 push %r15
363.cfi_push %r15
364 sub \$`$framesz+$win64*16*10`,%rsp
365 and \$-64,%rsp # align stack frame
366
367 shl \$6,$len
368 sub $inp,$out # re-bias
369 sub $inp,$in0
370 add $inp,$len # end of input
371
372 #mov $inp,$_inp # saved later
373 mov $out,$_out
374 mov $len,$_end
375 #mov $key,$_key # remains resident in $inp register
376 mov $ivp,$_ivp
377 mov $ctx,$_ctx
378 mov $in0,$_in0
379 mov %rax,$_rsp
380.cfi_cfa_expression $_rsp,deref,+8
381___
382$code.=<<___ if ($win64);
383 movaps %xmm6,`$framesz+16*0`(%rsp)
384 movaps %xmm7,`$framesz+16*1`(%rsp)
385 movaps %xmm8,`$framesz+16*2`(%rsp)
386 movaps %xmm9,`$framesz+16*3`(%rsp)
387 movaps %xmm10,`$framesz+16*4`(%rsp)
388 movaps %xmm11,`$framesz+16*5`(%rsp)
389 movaps %xmm12,`$framesz+16*6`(%rsp)
390 movaps %xmm13,`$framesz+16*7`(%rsp)
391 movaps %xmm14,`$framesz+16*8`(%rsp)
392 movaps %xmm15,`$framesz+16*9`(%rsp)
393___
394$code.=<<___;
395.Lprologue_xop:
396 vzeroall
397
398 mov $inp,%r12 # borrow $a4
399 lea 0x80($key),$inp # size optimization, reassign
400 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
401 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
402 mov $ctx,%r15 # borrow $a2
403 mov $in0,%rsi # borrow $a3
404 vmovdqu ($ivp),$iv # load IV
405 sub \$9,%r14
406
407 mov $SZ*0(%r15),$A
408 mov $SZ*1(%r15),$B
409 mov $SZ*2(%r15),$C
410 mov $SZ*3(%r15),$D
411 mov $SZ*4(%r15),$E
412 mov $SZ*5(%r15),$F
413 mov $SZ*6(%r15),$G
414 mov $SZ*7(%r15),$H
415
416 vmovdqa 0x00(%r13,%r14,8),$mask14
417 vmovdqa 0x10(%r13,%r14,8),$mask12
418 vmovdqa 0x20(%r13,%r14,8),$mask10
419 vmovdqu 0x00-0x80($inp),$roundkey
420 jmp .Lloop_xop
421___
422 if ($SZ==4) { # SHA256
423 my @X = map("%xmm$_",(0..3));
424 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
425
426$code.=<<___;
427.align 16
428.Lloop_xop:
429 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
430 vmovdqu 0x00(%rsi,%r12),@X[0]
431 vmovdqu 0x10(%rsi,%r12),@X[1]
432 vmovdqu 0x20(%rsi,%r12),@X[2]
433 vmovdqu 0x30(%rsi,%r12),@X[3]
434 vpshufb $t3,@X[0],@X[0]
435 lea $TABLE(%rip),$Tbl
436 vpshufb $t3,@X[1],@X[1]
437 vpshufb $t3,@X[2],@X[2]
438 vpaddd 0x00($Tbl),@X[0],$t0
439 vpshufb $t3,@X[3],@X[3]
440 vpaddd 0x20($Tbl),@X[1],$t1
441 vpaddd 0x40($Tbl),@X[2],$t2
442 vpaddd 0x60($Tbl),@X[3],$t3
443 vmovdqa $t0,0x00(%rsp)
444 mov $A,$a1
445 vmovdqa $t1,0x10(%rsp)
446 mov $B,$a3
447 vmovdqa $t2,0x20(%rsp)
448 xor $C,$a3 # magic
449 vmovdqa $t3,0x30(%rsp)
450 mov $E,$a0
451 jmp .Lxop_00_47
452
453.align 16
454.Lxop_00_47:
455 sub \$-16*2*$SZ,$Tbl # size optimization
456 vmovdqu (%r12),$inout # $a4
457 mov %r12,$_inp # $a4
458___
459sub XOP_256_00_47 () {
460my $j = shift;
461my $body = shift;
462my @X = @_;
463my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
464
465 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
466 eval(shift(@insns));
467 eval(shift(@insns));
468 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
469 eval(shift(@insns));
470 eval(shift(@insns));
471 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
472 eval(shift(@insns));
473 eval(shift(@insns));
474 &vpsrld ($t0,$t0,$sigma0[2]);
475 eval(shift(@insns));
476 eval(shift(@insns));
477 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
478 eval(shift(@insns));
479 eval(shift(@insns));
480 eval(shift(@insns));
481 eval(shift(@insns));
482 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
483 eval(shift(@insns));
484 eval(shift(@insns));
485 &vpxor ($t0,$t0,$t1);
486 eval(shift(@insns));
487 eval(shift(@insns));
488 eval(shift(@insns));
489 eval(shift(@insns));
490 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
491 eval(shift(@insns));
492 eval(shift(@insns));
493 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
494 eval(shift(@insns));
495 eval(shift(@insns));
496 &vpsrld ($t2,@X[3],$sigma1[2]);
497 eval(shift(@insns));
498 eval(shift(@insns));
499 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
500 eval(shift(@insns));
501 eval(shift(@insns));
502 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
503 eval(shift(@insns));
504 eval(shift(@insns));
505 &vpxor ($t3,$t3,$t2);
506 eval(shift(@insns));
507 eval(shift(@insns));
508 eval(shift(@insns));
509 eval(shift(@insns));
510 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
511 eval(shift(@insns));
512 eval(shift(@insns));
513 eval(shift(@insns));
514 eval(shift(@insns));
515 &vpsrldq ($t3,$t3,8);
516 eval(shift(@insns));
517 eval(shift(@insns));
518 eval(shift(@insns));
519 eval(shift(@insns));
520 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
521 eval(shift(@insns));
522 eval(shift(@insns));
523 eval(shift(@insns));
524 eval(shift(@insns));
525 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
526 eval(shift(@insns));
527 eval(shift(@insns));
528 &vpsrld ($t2,@X[0],$sigma1[2]);
529 eval(shift(@insns));
530 eval(shift(@insns));
531 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
532 eval(shift(@insns));
533 eval(shift(@insns));
534 &vpxor ($t3,$t3,$t2);
535 eval(shift(@insns));
536 eval(shift(@insns));
537 eval(shift(@insns));
538 eval(shift(@insns));
539 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
540 eval(shift(@insns));
541 eval(shift(@insns));
542 eval(shift(@insns));
543 eval(shift(@insns));
544 &vpslldq ($t3,$t3,8); # 22 instructions
545 eval(shift(@insns));
546 eval(shift(@insns));
547 eval(shift(@insns));
548 eval(shift(@insns));
549 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
550 eval(shift(@insns));
551 eval(shift(@insns));
552 eval(shift(@insns));
553 eval(shift(@insns));
554 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
555 foreach (@insns) { eval; } # remaining instructions
556 &vmovdqa (16*$j."(%rsp)",$t2);
557}
558
559 $aesni_cbc_idx=0;
560 for ($i=0,$j=0; $j<4; $j++) {
561 &XOP_256_00_47($j,\&body_00_15,@X);
562 push(@X,shift(@X)); # rotate(@X)
563 }
564 &mov ("%r12",$_inp); # borrow $a4
565 &vpand ($temp,$temp,$mask14);
566 &mov ("%r15",$_out); # borrow $a2
567 &vpor ($iv,$iv,$temp);
568 &vmovdqu ("(%r15,%r12)",$iv); # write output
569 &lea ("%r12","16(%r12)"); # inp++
570
571 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
572 &jne (".Lxop_00_47");
573
574 &vmovdqu ($inout,"(%r12)");
575 &mov ($_inp,"%r12");
576
577 $aesni_cbc_idx=0;
578 for ($i=0; $i<16; ) {
579 foreach(body_00_15()) { eval; }
580 }
581 }
582$code.=<<___;
583 mov $_inp,%r12 # borrow $a4
584 mov $_out,%r13 # borrow $a0
585 mov $_ctx,%r15 # borrow $a2
586 mov $_in0,%rsi # borrow $a3
587
588 vpand $mask14,$temp,$temp
589 mov $a1,$A
590 vpor $temp,$iv,$iv
591 vmovdqu $iv,(%r13,%r12) # write output
592 lea 16(%r12),%r12 # inp++
593
594 add $SZ*0(%r15),$A
595 add $SZ*1(%r15),$B
596 add $SZ*2(%r15),$C
597 add $SZ*3(%r15),$D
598 add $SZ*4(%r15),$E
599 add $SZ*5(%r15),$F
600 add $SZ*6(%r15),$G
601 add $SZ*7(%r15),$H
602
603 cmp $_end,%r12
604
605 mov $A,$SZ*0(%r15)
606 mov $B,$SZ*1(%r15)
607 mov $C,$SZ*2(%r15)
608 mov $D,$SZ*3(%r15)
609 mov $E,$SZ*4(%r15)
610 mov $F,$SZ*5(%r15)
611 mov $G,$SZ*6(%r15)
612 mov $H,$SZ*7(%r15)
613
614 jb .Lloop_xop
615
616 mov $_ivp,$ivp
617 mov $_rsp,%rsi
618.cfi_def_cfa %rsi,8
619 vmovdqu $iv,($ivp) # output IV
620 vzeroall
621___
622$code.=<<___ if ($win64);
623 movaps `$framesz+16*0`(%rsp),%xmm6
624 movaps `$framesz+16*1`(%rsp),%xmm7
625 movaps `$framesz+16*2`(%rsp),%xmm8
626 movaps `$framesz+16*3`(%rsp),%xmm9
627 movaps `$framesz+16*4`(%rsp),%xmm10
628 movaps `$framesz+16*5`(%rsp),%xmm11
629 movaps `$framesz+16*6`(%rsp),%xmm12
630 movaps `$framesz+16*7`(%rsp),%xmm13
631 movaps `$framesz+16*8`(%rsp),%xmm14
632 movaps `$framesz+16*9`(%rsp),%xmm15
633___
634$code.=<<___;
635 mov -48(%rsi),%r15
636.cfi_restore %r15
637 mov -40(%rsi),%r14
638.cfi_restore %r14
639 mov -32(%rsi),%r13
640.cfi_restore %r13
641 mov -24(%rsi),%r12
642.cfi_restore %r12
643 mov -16(%rsi),%rbp
644.cfi_restore %rbp
645 mov -8(%rsi),%rbx
646.cfi_restore %rbx
647 lea (%rsi),%rsp
648.cfi_def_cfa_register %rsp
649.Lepilogue_xop:
650 ret
651.cfi_endproc
652.size ${func}_xop,.-${func}_xop
653___
654######################################################################
655# AVX+shrd code path
656#
657local *ror = sub { &shrd(@_[0],@_) };
658
659$code.=<<___;
660.type ${func}_avx,\@function,6
661.align 64
662${func}_avx:
663.cfi_startproc
664.Lavx_shortcut:
665 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
666 mov %rsp,%rax # copy %rsp
667.cfi_def_cfa_register %rax
668 push %rbx
669.cfi_push %rbx
670 push %rbp
671.cfi_push %rbp
672 push %r12
673.cfi_push %r12
674 push %r13
675.cfi_push %r13
676 push %r14
677.cfi_push %r14
678 push %r15
679.cfi_push %r15
680 sub \$`$framesz+$win64*16*10`,%rsp
681 and \$-64,%rsp # align stack frame
682
683 shl \$6,$len
684 sub $inp,$out # re-bias
685 sub $inp,$in0
686 add $inp,$len # end of input
687
688 #mov $inp,$_inp # saved later
689 mov $out,$_out
690 mov $len,$_end
691 #mov $key,$_key # remains resident in $inp register
692 mov $ivp,$_ivp
693 mov $ctx,$_ctx
694 mov $in0,$_in0
695 mov %rax,$_rsp
696.cfi_cfa_expression $_rsp,deref,+8
697___
698$code.=<<___ if ($win64);
699 movaps %xmm6,`$framesz+16*0`(%rsp)
700 movaps %xmm7,`$framesz+16*1`(%rsp)
701 movaps %xmm8,`$framesz+16*2`(%rsp)
702 movaps %xmm9,`$framesz+16*3`(%rsp)
703 movaps %xmm10,`$framesz+16*4`(%rsp)
704 movaps %xmm11,`$framesz+16*5`(%rsp)
705 movaps %xmm12,`$framesz+16*6`(%rsp)
706 movaps %xmm13,`$framesz+16*7`(%rsp)
707 movaps %xmm14,`$framesz+16*8`(%rsp)
708 movaps %xmm15,`$framesz+16*9`(%rsp)
709___
710$code.=<<___;
711.Lprologue_avx:
712 vzeroall
713
714 mov $inp,%r12 # borrow $a4
715 lea 0x80($key),$inp # size optimization, reassign
716 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
717 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
718 mov $ctx,%r15 # borrow $a2
719 mov $in0,%rsi # borrow $a3
720 vmovdqu ($ivp),$iv # load IV
721 sub \$9,%r14
722
723 mov $SZ*0(%r15),$A
724 mov $SZ*1(%r15),$B
725 mov $SZ*2(%r15),$C
726 mov $SZ*3(%r15),$D
727 mov $SZ*4(%r15),$E
728 mov $SZ*5(%r15),$F
729 mov $SZ*6(%r15),$G
730 mov $SZ*7(%r15),$H
731
732 vmovdqa 0x00(%r13,%r14,8),$mask14
733 vmovdqa 0x10(%r13,%r14,8),$mask12
734 vmovdqa 0x20(%r13,%r14,8),$mask10
735 vmovdqu 0x00-0x80($inp),$roundkey
736___
737 if ($SZ==4) { # SHA256
738 my @X = map("%xmm$_",(0..3));
739 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
740
741$code.=<<___;
742 jmp .Lloop_avx
743.align 16
744.Lloop_avx:
745 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
746 vmovdqu 0x00(%rsi,%r12),@X[0]
747 vmovdqu 0x10(%rsi,%r12),@X[1]
748 vmovdqu 0x20(%rsi,%r12),@X[2]
749 vmovdqu 0x30(%rsi,%r12),@X[3]
750 vpshufb $t3,@X[0],@X[0]
751 lea $TABLE(%rip),$Tbl
752 vpshufb $t3,@X[1],@X[1]
753 vpshufb $t3,@X[2],@X[2]
754 vpaddd 0x00($Tbl),@X[0],$t0
755 vpshufb $t3,@X[3],@X[3]
756 vpaddd 0x20($Tbl),@X[1],$t1
757 vpaddd 0x40($Tbl),@X[2],$t2
758 vpaddd 0x60($Tbl),@X[3],$t3
759 vmovdqa $t0,0x00(%rsp)
760 mov $A,$a1
761 vmovdqa $t1,0x10(%rsp)
762 mov $B,$a3
763 vmovdqa $t2,0x20(%rsp)
764 xor $C,$a3 # magic
765 vmovdqa $t3,0x30(%rsp)
766 mov $E,$a0
767 jmp .Lavx_00_47
768
769.align 16
770.Lavx_00_47:
771 sub \$-16*2*$SZ,$Tbl # size optimization
772 vmovdqu (%r12),$inout # $a4
773 mov %r12,$_inp # $a4
774___
775sub Xupdate_256_AVX () {
776 (
777 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
778 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
779 '&vpsrld ($t2,$t0,$sigma0[0]);',
780 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
781 '&vpsrld ($t3,$t0,$sigma0[2])',
782 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
783 '&vpxor ($t0,$t3,$t2)',
784 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
785 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
786 '&vpxor ($t0,$t0,$t1)',
787 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
788 '&vpxor ($t0,$t0,$t2)',
789 '&vpsrld ($t2,$t3,$sigma1[2]);',
790 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
791 '&vpsrlq ($t3,$t3,$sigma1[0]);',
792 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
793 '&vpxor ($t2,$t2,$t3);',
794 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
795 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
796 '&vpshufd ($t2,$t2,0b10000100)',
797 '&vpsrldq ($t2,$t2,8)',
798 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
799 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
800 '&vpsrld ($t2,$t3,$sigma1[2])',
801 '&vpsrlq ($t3,$t3,$sigma1[0])',
802 '&vpxor ($t2,$t2,$t3);',
803 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
804 '&vpxor ($t2,$t2,$t3)',
805 '&vpshufd ($t2,$t2,0b11101000)',
806 '&vpslldq ($t2,$t2,8)',
807 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
808 );
809}
810
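# A pure-Perl reference (standalone, illustration only) for the message
# schedule that the shift/xor sequence above vectorizes four words at a
# time, using the @sigma0/@sigma1 counts from the top of the file:
# W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16].
sub demo_sigma0 { my $x = shift; (($x>>7 | $x<<25) ^ ($x>>18 | $x<<14) ^ ($x>>3))  & 0xffffffff }
sub demo_sigma1 { my $x = shift; (($x>>17 | $x<<15) ^ ($x>>19 | $x<<13) ^ ($x>>10)) & 0xffffffff }
sub demo_Wt {
	my ($W) = @_;	# ref to the 16 most recent words, W[t-16..t-1]
	(demo_sigma1($W->[14]) + $W->[9] + demo_sigma0($W->[1]) + $W->[0]) & 0xffffffff;
}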
811sub AVX_256_00_47 () {
812my $j = shift;
813my $body = shift;
814my @X = @_;
815my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
816
817 foreach (Xupdate_256_AVX()) { # 29 instructions
818 eval;
819 eval(shift(@insns));
820 eval(shift(@insns));
821 eval(shift(@insns));
822 }
823 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
824 foreach (@insns) { eval; } # remaining instructions
825 &vmovdqa (16*$j."(%rsp)",$t2);
826}
827
828 $aesni_cbc_idx=0;
829 for ($i=0,$j=0; $j<4; $j++) {
830 &AVX_256_00_47($j,\&body_00_15,@X);
831 push(@X,shift(@X)); # rotate(@X)
832 }
833 &mov ("%r12",$_inp); # borrow $a4
834 &vpand ($temp,$temp,$mask14);
835 &mov ("%r15",$_out); # borrow $a2
836 &vpor ($iv,$iv,$temp);
837 &vmovdqu ("(%r15,%r12)",$iv); # write output
838 &lea ("%r12","16(%r12)"); # inp++
839
840 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
841 &jne (".Lavx_00_47");
842
843 &vmovdqu ($inout,"(%r12)");
844 &mov ($_inp,"%r12");
845
846 $aesni_cbc_idx=0;
847 for ($i=0; $i<16; ) {
848 foreach(body_00_15()) { eval; }
849 }
850
851 }
852$code.=<<___;
853 mov $_inp,%r12 # borrow $a4
854 mov $_out,%r13 # borrow $a0
855 mov $_ctx,%r15 # borrow $a2
856 mov $_in0,%rsi # borrow $a3
857
858 vpand $mask14,$temp,$temp
859 mov $a1,$A
860 vpor $temp,$iv,$iv
861 vmovdqu $iv,(%r13,%r12) # write output
862 lea 16(%r12),%r12 # inp++
863
864 add $SZ*0(%r15),$A
865 add $SZ*1(%r15),$B
866 add $SZ*2(%r15),$C
867 add $SZ*3(%r15),$D
868 add $SZ*4(%r15),$E
869 add $SZ*5(%r15),$F
870 add $SZ*6(%r15),$G
871 add $SZ*7(%r15),$H
872
873 cmp $_end,%r12
874
875 mov $A,$SZ*0(%r15)
876 mov $B,$SZ*1(%r15)
877 mov $C,$SZ*2(%r15)
878 mov $D,$SZ*3(%r15)
879 mov $E,$SZ*4(%r15)
880 mov $F,$SZ*5(%r15)
881 mov $G,$SZ*6(%r15)
882 mov $H,$SZ*7(%r15)
883 jb .Lloop_avx
884
885 mov $_ivp,$ivp
886 mov $_rsp,%rsi
887.cfi_def_cfa %rsi,8
888 vmovdqu $iv,($ivp) # output IV
889 vzeroall
890___
891$code.=<<___ if ($win64);
892 movaps `$framesz+16*0`(%rsp),%xmm6
893 movaps `$framesz+16*1`(%rsp),%xmm7
894 movaps `$framesz+16*2`(%rsp),%xmm8
895 movaps `$framesz+16*3`(%rsp),%xmm9
896 movaps `$framesz+16*4`(%rsp),%xmm10
897 movaps `$framesz+16*5`(%rsp),%xmm11
898 movaps `$framesz+16*6`(%rsp),%xmm12
899 movaps `$framesz+16*7`(%rsp),%xmm13
900 movaps `$framesz+16*8`(%rsp),%xmm14
901 movaps `$framesz+16*9`(%rsp),%xmm15
902___
903$code.=<<___;
904 mov -48(%rsi),%r15
905.cfi_restore %r15
906 mov -40(%rsi),%r14
907.cfi_restore %r14
908 mov -32(%rsi),%r13
909.cfi_restore %r13
910 mov -24(%rsi),%r12
911.cfi_restore %r12
912 mov -16(%rsi),%rbp
913.cfi_restore %rbp
914 mov -8(%rsi),%rbx
915.cfi_restore %rbx
916 lea (%rsi),%rsp
917.cfi_def_cfa_register %rsp
918.Lepilogue_avx:
919 ret
920.cfi_endproc
921.size ${func}_avx,.-${func}_avx
922___
923
924if ($avx>1) {{
925######################################################################
926# AVX2+BMI code path
927#
928my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
929my $PUSH8=8*2*$SZ;
930use integer;
931
932sub bodyx_00_15 () {
933 # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
934 (
935 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
936
937 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
938 '&and ($a4,$e)', # f&e
939 '&rorx ($a0,$e,$Sigma1[2])',
940 '&rorx ($a2,$e,$Sigma1[1])',
941
942 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
943 '&lea ($h,"($h,$a4)")',
944 '&andn ($a4,$e,$g)', # ~e&g
945 '&xor ($a0,$a2)',
946
947 '&rorx ($a1,$e,$Sigma1[0])',
948 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
949 '&xor ($a0,$a1)', # Sigma1(e)
950 '&mov ($a2,$a)',
951
952 '&rorx ($a4,$a,$Sigma0[2])',
953 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
954 '&xor ($a2,$b)', # a^b, b^c in next round
955 '&rorx ($a1,$a,$Sigma0[1])',
956
957 '&rorx ($a0,$a,$Sigma0[0])',
958 '&lea ($d,"($d,$h)")', # d+=h
959 '&and ($a3,$a2)', # (b^c)&(a^b)
960 @aesni_cbc_block[$aesni_cbc_idx++].
961 '&xor ($a1,$a4)',
962
963 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
964 '&xor ($a1,$a0)', # Sigma0(a)
965 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
966 '&mov ($a4,$e)', # copy of f in future
967
968 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
969 );
970 # and at the finish one has to do $a+=$a1
971}
972
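# A standalone check (illustration only) of the identity the body above
# leans on: e&f and (~e)&g can never share a set bit, so the xor inside
# Ch(e,f,g) may be replaced by an addition, which fuses into lea.
for (1 .. 1000) {
	my ($e, $f, $g) = map { int rand 2**32 } 1 .. 3;
	my $xor = (($e & $f) ^ ((~$e) & $g)) & 0xffffffff;
	my $add = (($e & $f) + ((~$e) & $g)) & 0xffffffff;
	die "Ch identity broken" unless $xor == $add;
}
print STDERR "Ch-as-add identity holds\n";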
973$code.=<<___;
974.type ${func}_avx2,\@function,6
975.align 64
976${func}_avx2:
977.cfi_startproc
978.Lavx2_shortcut:
979 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
980 mov %rsp,%rax # copy %rsp
981.cfi_def_cfa_register %rax
982 push %rbx
983.cfi_push %rbx
984 push %rbp
985.cfi_push %rbp
986 push %r12
987.cfi_push %r12
988 push %r13
989.cfi_push %r13
990 push %r14
991.cfi_push %r14
992 push %r15
993.cfi_push %r15
994 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
995 and \$-256*$SZ,%rsp # align stack frame
996 add \$`2*$SZ*($rounds-8)`,%rsp
997
998 shl \$6,$len
999 sub $inp,$out # re-bias
1000 sub $inp,$in0
1001 add $inp,$len # end of input
1002
1003 #mov $inp,$_inp # saved later
1004 #mov $out,$_out # kept in $offload
1005 mov $len,$_end
1006 #mov $key,$_key # remains resident in $inp register
1007 mov $ivp,$_ivp
1008 mov $ctx,$_ctx
1009 mov $in0,$_in0
1010 mov %rax,$_rsp
1011.cfi_cfa_expression $_rsp,deref,+8
1012___
1013$code.=<<___ if ($win64);
1014 movaps %xmm6,`$framesz+16*0`(%rsp)
1015 movaps %xmm7,`$framesz+16*1`(%rsp)
1016 movaps %xmm8,`$framesz+16*2`(%rsp)
1017 movaps %xmm9,`$framesz+16*3`(%rsp)
1018 movaps %xmm10,`$framesz+16*4`(%rsp)
1019 movaps %xmm11,`$framesz+16*5`(%rsp)
1020 movaps %xmm12,`$framesz+16*6`(%rsp)
1021 movaps %xmm13,`$framesz+16*7`(%rsp)
1022 movaps %xmm14,`$framesz+16*8`(%rsp)
1023 movaps %xmm15,`$framesz+16*9`(%rsp)
1024___
1025$code.=<<___;
1026.Lprologue_avx2:
1027 vzeroall
1028
1029 mov $inp,%r13 # borrow $a0
1030 vpinsrq \$1,$out,$offload,$offload
1031 lea 0x80($key),$inp # size optimization, reassign
1032 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
1033 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
1034 mov $ctx,%r15 # borrow $a2
1035 mov $in0,%rsi # borrow $a3
1036 vmovdqu ($ivp),$iv # load IV
1037 lea -9(%r14),%r14
1038
1039 vmovdqa 0x00(%r12,%r14,8),$mask14
1040 vmovdqa 0x10(%r12,%r14,8),$mask12
1041 vmovdqa 0x20(%r12,%r14,8),$mask10
1042
1043 sub \$-16*$SZ,%r13 # inp++, size optimization
1044 mov $SZ*0(%r15),$A
1045 lea (%rsi,%r13),%r12 # borrow $a0
1046 mov $SZ*1(%r15),$B
1047 cmp $len,%r13 # $_end
1048 mov $SZ*2(%r15),$C
1049 cmove %rsp,%r12 # next block or random data
1050 mov $SZ*3(%r15),$D
1051 mov $SZ*4(%r15),$E
1052 mov $SZ*5(%r15),$F
1053 mov $SZ*6(%r15),$G
1054 mov $SZ*7(%r15),$H
1055 vmovdqu 0x00-0x80($inp),$roundkey
1056___
1057 if ($SZ==4) { # SHA256
1058 my @X = map("%ymm$_",(0..3));
1059 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1060
1061$code.=<<___;
1062 jmp .Loop_avx2
1063.align 16
1064.Loop_avx2:
1065 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1066 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1067 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1068 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1069 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1070
1071 vinserti128 \$1,(%r12),@X[0],@X[0]
1072 vinserti128 \$1,16(%r12),@X[1],@X[1]
1073 vpshufb $t3,@X[0],@X[0]
1074 vinserti128 \$1,32(%r12),@X[2],@X[2]
1075 vpshufb $t3,@X[1],@X[1]
1076 vinserti128 \$1,48(%r12),@X[3],@X[3]
1077
1078 lea $TABLE(%rip),$Tbl
1079 vpshufb $t3,@X[2],@X[2]
1080 lea -16*$SZ(%r13),%r13
1081 vpaddd 0x00($Tbl),@X[0],$t0
1082 vpshufb $t3,@X[3],@X[3]
1083 vpaddd 0x20($Tbl),@X[1],$t1
1084 vpaddd 0x40($Tbl),@X[2],$t2
1085 vpaddd 0x60($Tbl),@X[3],$t3
1086 vmovdqa $t0,0x00(%rsp)
1087 xor $a1,$a1
1088 vmovdqa $t1,0x20(%rsp)
1089___
1090$code.=<<___ if (!$win64);
1091# temporarily use %rsi as frame pointer
1092 mov $_rsp,%rsi
1093.cfi_def_cfa %rsi,8
1094___
1095$code.=<<___;
1096 lea -$PUSH8(%rsp),%rsp
1097___
1098$code.=<<___ if (!$win64);
1099# the frame info is at $_rsp, but the stack is moving...
1100# so a second frame pointer is saved at -8(%rsp)
1101# that is in the red zone
1102 mov %rsi,-8(%rsp)
1103.cfi_cfa_expression %rsp-8,deref,+8
1104___
1105$code.=<<___;
1106 mov $B,$a3
1107 vmovdqa $t2,0x00(%rsp)
1108 xor $C,$a3 # magic
1109 vmovdqa $t3,0x20(%rsp)
1110 mov $F,$a4
1111 sub \$-16*2*$SZ,$Tbl # size optimization
1112 jmp .Lavx2_00_47
1113
1114.align 16
1115.Lavx2_00_47:
1116 vmovdqu (%r13),$inout
1117 vpinsrq \$0,%r13,$offload,$offload
1118___
1119
1120sub AVX2_256_00_47 () {
1121my $j = shift;
1122my $body = shift;
1123my @X = @_;
1124my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1125my $base = "+2*$PUSH8(%rsp)";
1126
1127 if (($j%2)==0) {
1128 &lea ("%rsp","-$PUSH8(%rsp)");
1129$code.=<<___ if (!$win64);
1130.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
1131# copy secondary frame pointer to new location again at -8(%rsp)
1132 pushq $PUSH8-8(%rsp)
1133.cfi_cfa_expression %rsp,deref,+8
1134 lea 8(%rsp),%rsp
1135.cfi_cfa_expression %rsp-8,deref,+8
1136___
1137 }
1138 foreach (Xupdate_256_AVX()) { # 29 instructions
1139 eval;
1140 eval(shift(@insns));
1141 eval(shift(@insns));
1142 eval(shift(@insns));
1143 }
1144 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1145 foreach (@insns) { eval; } # remaining instructions
1146 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1147}
1148 $aesni_cbc_idx=0;
1149 for ($i=0,$j=0; $j<4; $j++) {
1150 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1151 push(@X,shift(@X)); # rotate(@X)
1152 }
1153 &vmovq ("%r13",$offload); # borrow $a0
1154 &vpextrq ("%r15",$offload,1); # borrow $a2
1155 &vpand ($temp,$temp,$mask14);
1156 &vpor ($iv,$iv,$temp);
1157 &vmovdqu ("(%r15,%r13)",$iv); # write output
1158 &lea ("%r13","16(%r13)"); # inp++
1159
1160 &lea ($Tbl,16*2*$SZ."($Tbl)");
1161 &cmpb (($SZ-1)."($Tbl)",0);
1162 &jne (".Lavx2_00_47");
1163
1164 &vmovdqu ($inout,"(%r13)");
1165 &vpinsrq ($offload,$offload,"%r13",0);
1166
1167 $aesni_cbc_idx=0;
1168 for ($i=0; $i<16; ) {
1169 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1170 foreach(bodyx_00_15()) { eval; }
1171 }
1172 }
1173$code.=<<___;
1174 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1175 vmovq $offload,%r13 # $_inp, borrow $a0
1176 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1177 add $a1,$A
1178 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1179
1180 vpand $mask14,$temp,$temp
1181 vpor $temp,$iv,$iv
1182 vmovdqu $iv,(%r12,%r13) # write output
1183 lea 16(%r13),%r13
1184
1185 add $SZ*0(%r15),$A
1186 add $SZ*1(%r15),$B
1187 add $SZ*2(%r15),$C
1188 add $SZ*3(%r15),$D
1189 add $SZ*4(%r15),$E
1190 add $SZ*5(%r15),$F
1191 add $SZ*6(%r15),$G
1192 add $SZ*7(%r15),$H
1193
1194 mov $A,$SZ*0(%r15)
1195 mov $B,$SZ*1(%r15)
1196 mov $C,$SZ*2(%r15)
1197 mov $D,$SZ*3(%r15)
1198 mov $E,$SZ*4(%r15)
1199 mov $F,$SZ*5(%r15)
1200 mov $G,$SZ*6(%r15)
1201 mov $H,$SZ*7(%r15)
1202
1203 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1204 je .Ldone_avx2
1205
1206 xor $a1,$a1
1207 mov $B,$a3
1208 mov $F,$a4
1209 xor $C,$a3 # magic
1210 jmp .Lower_avx2
1211.align 16
1212.Lower_avx2:
1213 vmovdqu (%r13),$inout
1214 vpinsrq \$0,%r13,$offload,$offload
1215___
1216 $aesni_cbc_idx=0;
1217 for ($i=0; $i<16; ) {
1218 my $base="+16($Tbl)";
1219 foreach(bodyx_00_15()) { eval; }
1220 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1221 }
1222$code.=<<___;
1223 vmovq $offload,%r13 # borrow $a0
1224 vpextrq \$1,$offload,%r15 # borrow $a2
1225 vpand $mask14,$temp,$temp
1226 vpor $temp,$iv,$iv
1227 lea -$PUSH8($Tbl),$Tbl
1228 vmovdqu $iv,(%r15,%r13) # write output
1229 lea 16(%r13),%r13 # inp++
1230 cmp %rsp,$Tbl
1231 jae .Lower_avx2
1232
1233 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1234 lea 16*$SZ(%r13),%r13
1235 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1236 add $a1,$A
1237 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1238
1239 add $SZ*0(%r15),$A
1240 add $SZ*1(%r15),$B
1241 add $SZ*2(%r15),$C
1242 add $SZ*3(%r15),$D
1243 add $SZ*4(%r15),$E
1244 add $SZ*5(%r15),$F
1245 add $SZ*6(%r15),$G
1246 lea (%rsi,%r13),%r12
1247 add $SZ*7(%r15),$H
1248
1249 cmp $_end,%r13
1250
1251 mov $A,$SZ*0(%r15)
1252 cmove %rsp,%r12 # next block or stale data
1253 mov $B,$SZ*1(%r15)
1254 mov $C,$SZ*2(%r15)
1255 mov $D,$SZ*3(%r15)
1256 mov $E,$SZ*4(%r15)
1257 mov $F,$SZ*5(%r15)
1258 mov $G,$SZ*6(%r15)
1259 mov $H,$SZ*7(%r15)
1260
1261 jbe .Loop_avx2
1262 lea (%rsp),$Tbl
1263# temporarily use $Tbl as index to $_rsp
1264# this avoids the need to save a secondary frame pointer at -8(%rsp)
1265.cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8
1266
1267.Ldone_avx2:
1268 mov 16*$SZ+4*8($Tbl),$ivp
1269 mov 16*$SZ+7*8($Tbl),%rsi
1270.cfi_def_cfa %rsi,8
1271 vmovdqu $iv,($ivp) # output IV
1272 vzeroall
1273___
1274$code.=<<___ if ($win64);
1275 movaps `$framesz+16*0`($Tbl),%xmm6
1276 movaps `$framesz+16*1`($Tbl),%xmm7
1277 movaps `$framesz+16*2`($Tbl),%xmm8
1278 movaps `$framesz+16*3`($Tbl),%xmm9
1279 movaps `$framesz+16*4`($Tbl),%xmm10
1280 movaps `$framesz+16*5`($Tbl),%xmm11
1281 movaps `$framesz+16*6`($Tbl),%xmm12
1282 movaps `$framesz+16*7`($Tbl),%xmm13
1283 movaps `$framesz+16*8`($Tbl),%xmm14
1284 movaps `$framesz+16*9`($Tbl),%xmm15
1285___
1286$code.=<<___;
1287 mov -48(%rsi),%r15
1288.cfi_restore %r15
1289 mov -40(%rsi),%r14
1290.cfi_restore %r14
1291 mov -32(%rsi),%r13
1292.cfi_restore %r13
1293 mov -24(%rsi),%r12
1294.cfi_restore %r12
1295 mov -16(%rsi),%rbp
1296.cfi_restore %rbp
1297 mov -8(%rsi),%rbx
1298.cfi_restore %rbx
1299 lea (%rsi),%rsp
1300.cfi_def_cfa_register %rsp
1301.Lepilogue_avx2:
1302 ret
1303.cfi_endproc
1304.size ${func}_avx2,.-${func}_avx2
1305___
1306}}
1307}}
1308{{
1309my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1310
1311my ($rounds,$Tbl)=("%r11d","%rbx");
1312
1313my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1314my @rndkey=("%xmm4","%xmm5");
1315my $r=0;
1316my $sn=0;
1317
1318my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1319my @MSG=map("%xmm$_",(10..13));
1320
1321my $aesenc=sub {
1322 use integer;
1323 my ($n,$k)=($r/10,$r%10);
1324 if ($k==0) {
1325 $code.=<<___;
1326 movups `16*$n`($in0),$in # load input
1327 xorps $rndkey0,$in
1328___
1329 $code.=<<___ if ($n);
1330 movups $iv,`16*($n-1)`($out,$in0) # write output
1331___
1332 $code.=<<___;
1333 xorps $in,$iv
1334 movups `32+16*$k-112`($key),$rndkey[1]
1335 aesenc $rndkey[0],$iv
1336___
1337 } elsif ($k==9) {
1338 $sn++;
1339 $code.=<<___;
1340 cmp \$11,$rounds
1341 jb .Laesenclast$sn
1342 movups `32+16*($k+0)-112`($key),$rndkey[1]
1343 aesenc $rndkey[0],$iv
1344 movups `32+16*($k+1)-112`($key),$rndkey[0]
1345 aesenc $rndkey[1],$iv
1346 je .Laesenclast$sn
1347 movups `32+16*($k+2)-112`($key),$rndkey[1]
1348 aesenc $rndkey[0],$iv
1349 movups `32+16*($k+3)-112`($key),$rndkey[0]
1350 aesenc $rndkey[1],$iv
1351.Laesenclast$sn:
1352 aesenclast $rndkey[0],$iv
1353 movups 16-112($key),$rndkey[1] # forward reference
1354 nop
1355___
1356 } else {
1357 $code.=<<___;
1358 movups `32+16*$k-112`($key),$rndkey[1]
1359 aesenc $rndkey[0],$iv
1360___
1361 }
1362 $r++; unshift(@rndkey,pop(@rndkey));
1363};
1364
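# A standalone sketch (illustration only) of the closure's pacing: $r
# counts aesenc emissions; each group of ten starts by loading and xoring
# a fresh input block ($k==0) and ends with the 10/12/14-round key-length
# tail plus aesenclast ($k==9). The SHA-extension path below interleaves
# these calls with its sha256rnds2 pairs, 40 calls per four AES blocks.
for my $r (0 .. 19) {
	my ($n, $k) = (int($r/10), $r%10);
	print STDERR "call $r: block $n, step $k",
	    $k==0 ? " (load+xor)" : $k==9 ? " (tail+aesenclast)" : "", "\n";
}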
1365if ($shaext) {
1366my $Tbl="%rax";
1367
1368$code.=<<___;
1369.type ${func}_shaext,\@function,6
1370.align 32
1371${func}_shaext:
1372.cfi_startproc
1373 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1374___
1375$code.=<<___ if ($win64);
1376 lea `-8-10*16`(%rsp),%rsp
1377 movaps %xmm6,-8-10*16(%rax)
1378 movaps %xmm7,-8-9*16(%rax)
1379 movaps %xmm8,-8-8*16(%rax)
1380 movaps %xmm9,-8-7*16(%rax)
1381 movaps %xmm10,-8-6*16(%rax)
1382 movaps %xmm11,-8-5*16(%rax)
1383 movaps %xmm12,-8-4*16(%rax)
1384 movaps %xmm13,-8-3*16(%rax)
1385 movaps %xmm14,-8-2*16(%rax)
1386 movaps %xmm15,-8-1*16(%rax)
1387.Lprologue_shaext:
1388___
1389$code.=<<___;
1390 lea K256+0x80(%rip),$Tbl
1391 movdqu ($ctx),$ABEF # DCBA
1392 movdqu 16($ctx),$CDGH # HGFE
1393 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1394
1395 mov 240($key),$rounds
1396 sub $in0,$out
1397 movups ($key),$rndkey0 # $key[0]
1398 movups ($ivp),$iv # load IV
1399 movups 16($key),$rndkey[0] # forward reference
1400 lea 112($key),$key # size optimization
1401
1402 pshufd \$0x1b,$ABEF,$Wi # ABCD
1403 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1404 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1405 movdqa $TMP,$BSWAP # offload
1406 palignr \$8,$CDGH,$ABEF # ABEF
1407 punpcklqdq $Wi,$CDGH # CDGH
1408
1409 jmp .Loop_shaext
1410
1411.align 16
1412.Loop_shaext:
1413 movdqu ($inp),@MSG[0]
1414 movdqu 0x10($inp),@MSG[1]
1415 movdqu 0x20($inp),@MSG[2]
1416 pshufb $TMP,@MSG[0]
1417 movdqu 0x30($inp),@MSG[3]
1418
1419 movdqa 0*32-0x80($Tbl),$Wi
1420 paddd @MSG[0],$Wi
1421 pshufb $TMP,@MSG[1]
1422 movdqa $CDGH,$CDGH_SAVE # offload
1423 movdqa $ABEF,$ABEF_SAVE # offload
1424___
1425 &$aesenc();
1426$code.=<<___;
1427 sha256rnds2 $ABEF,$CDGH # 0-3
1428 pshufd \$0x0e,$Wi,$Wi
1429___
1430 &$aesenc();
1431$code.=<<___;
1432 sha256rnds2 $CDGH,$ABEF
1433
1434 movdqa 1*32-0x80($Tbl),$Wi
1435 paddd @MSG[1],$Wi
1436 pshufb $TMP,@MSG[2]
1437 lea 0x40($inp),$inp
1438___
1439 &$aesenc();
1440$code.=<<___;
1441 sha256rnds2 $ABEF,$CDGH # 4-7
1442 pshufd \$0x0e,$Wi,$Wi
1443___
1444 &$aesenc();
1445$code.=<<___;
1446 sha256rnds2 $CDGH,$ABEF
1447
1448 movdqa 2*32-0x80($Tbl),$Wi
1449 paddd @MSG[2],$Wi
1450 pshufb $TMP,@MSG[3]
1451 sha256msg1 @MSG[1],@MSG[0]
1452___
1453 &$aesenc();
1454$code.=<<___;
1455 sha256rnds2 $ABEF,$CDGH # 8-11
1456 pshufd \$0x0e,$Wi,$Wi
1457 movdqa @MSG[3],$TMP
1458 palignr \$4,@MSG[2],$TMP
1459 paddd $TMP,@MSG[0]
1460___
1461 &$aesenc();
1462$code.=<<___;
1463 sha256rnds2 $CDGH,$ABEF
1464
1465 movdqa 3*32-0x80($Tbl),$Wi
1466 paddd @MSG[3],$Wi
1467 sha256msg2 @MSG[3],@MSG[0]
1468 sha256msg1 @MSG[2],@MSG[1]
1469___
1470 &$aesenc();
1471$code.=<<___;
1472 sha256rnds2 $ABEF,$CDGH # 12-15
1473 pshufd \$0x0e,$Wi,$Wi
1474___
1475 &$aesenc();
1476$code.=<<___;
1477 movdqa @MSG[0],$TMP
1478 palignr \$4,@MSG[3],$TMP
1479 paddd $TMP,@MSG[1]
1480 sha256rnds2 $CDGH,$ABEF
1481___
1482for($i=4;$i<16-3;$i++) {
1483 &$aesenc() if (($r%10)==0);
1484$code.=<<___;
1485 movdqa $i*32-0x80($Tbl),$Wi
1486 paddd @MSG[0],$Wi
1487 sha256msg2 @MSG[0],@MSG[1]
1488 sha256msg1 @MSG[3],@MSG[2]
1489___
1490 &$aesenc();
1491$code.=<<___;
1492 sha256rnds2 $ABEF,$CDGH # 16-19...
1493 pshufd \$0x0e,$Wi,$Wi
1494 movdqa @MSG[1],$TMP
1495 palignr \$4,@MSG[0],$TMP
1496 paddd $TMP,@MSG[2]
1497___
1498 &$aesenc();
1499 &$aesenc() if ($r==19);
1500$code.=<<___;
1501 sha256rnds2 $CDGH,$ABEF
1502___
1503 push(@MSG,shift(@MSG));
1504}
1505$code.=<<___;
1506 movdqa 13*32-0x80($Tbl),$Wi
1507 paddd @MSG[0],$Wi
1508 sha256msg2 @MSG[0],@MSG[1]
1509 sha256msg1 @MSG[3],@MSG[2]
1510___
1511 &$aesenc();
1512$code.=<<___;
1513 sha256rnds2 $ABEF,$CDGH # 52-55
1514 pshufd \$0x0e,$Wi,$Wi
1515 movdqa @MSG[1],$TMP
1516 palignr \$4,@MSG[0],$TMP
1517 paddd $TMP,@MSG[2]
1518___
1519 &$aesenc();
1520 &$aesenc();
1521$code.=<<___;
1522 sha256rnds2 $CDGH,$ABEF
1523
1524 movdqa 14*32-0x80($Tbl),$Wi
1525 paddd @MSG[1],$Wi
1526 sha256msg2 @MSG[1],@MSG[2]
1527 movdqa $BSWAP,$TMP
1528___
1529 &$aesenc();
1530$code.=<<___;
1531 sha256rnds2 $ABEF,$CDGH # 56-59
1532 pshufd \$0x0e,$Wi,$Wi
1533___
1534 &$aesenc();
1535$code.=<<___;
1536 sha256rnds2 $CDGH,$ABEF
1537
1538 movdqa 15*32-0x80($Tbl),$Wi
1539 paddd @MSG[2],$Wi
1540___
1541 &$aesenc();
1542 &$aesenc();
1543$code.=<<___;
1544 sha256rnds2 $ABEF,$CDGH # 60-63
1545 pshufd \$0x0e,$Wi,$Wi
1546___
1547 &$aesenc();
1548$code.=<<___;
1549 sha256rnds2 $CDGH,$ABEF
1550 #pxor $CDGH,$rndkey0 # black magic
1551___
1552 while ($r<40) { &$aesenc(); } # remaining aesenc's
1553$code.=<<___;
1554 #xorps $CDGH,$rndkey0 # black magic
1555 paddd $CDGH_SAVE,$CDGH
1556 paddd $ABEF_SAVE,$ABEF
1557
1558 dec $len
1559 movups $iv,48($out,$in0) # write output
1560 lea 64($in0),$in0
1561 jnz .Loop_shaext
1562
1563 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1564 pshufd \$0x1b,$ABEF,$TMP # FEBA
1565 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1566 punpckhqdq $CDGH,$ABEF # DCBA
1567 palignr \$8,$TMP,$CDGH # HGFE
1568
1569 movups $iv,($ivp) # write IV
1570 movdqu $ABEF,($ctx)
1571 movdqu $CDGH,16($ctx)
1572___
1573$code.=<<___ if ($win64);
1574 movaps 0*16(%rsp),%xmm6
1575 movaps 1*16(%rsp),%xmm7
1576 movaps 2*16(%rsp),%xmm8
1577 movaps 3*16(%rsp),%xmm9
1578 movaps 4*16(%rsp),%xmm10
1579 movaps 5*16(%rsp),%xmm11
1580 movaps 6*16(%rsp),%xmm12
1581 movaps 7*16(%rsp),%xmm13
1582 movaps 8*16(%rsp),%xmm14
1583 movaps 9*16(%rsp),%xmm15
1584 lea 8+10*16(%rsp),%rsp
1585.Lepilogue_shaext:
1586___
1587$code.=<<___;
1588 ret
1589.cfi_endproc
1590.size ${func}_shaext,.-${func}_shaext
1591___
1592}
1593}}}}}
1594
1595# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1596# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1597if ($win64 && $avx) {
1598$rec="%rcx";
1599$frame="%rdx";
1600$context="%r8";
1601$disp="%r9";
1602
1603$code.=<<___;
1604.extern __imp_RtlVirtualUnwind
1605.type se_handler,\@abi-omnipotent
1606.align 16
1607se_handler:
1608 push %rsi
1609 push %rdi
1610 push %rbx
1611 push %rbp
1612 push %r12
1613 push %r13
1614 push %r14
1615 push %r15
1616 pushfq
1617 sub \$64,%rsp
1618
1619 mov 120($context),%rax # pull context->Rax
1620 mov 248($context),%rbx # pull context->Rip
1621
1622 mov 8($disp),%rsi # disp->ImageBase
1623 mov 56($disp),%r11 # disp->HandlerData
1624
1625 mov 0(%r11),%r10d # HandlerData[0]
1626 lea (%rsi,%r10),%r10 # prologue label
1627 cmp %r10,%rbx # context->Rip<prologue label
1628 jb .Lin_prologue
1629
1630 mov 152($context),%rax # pull context->Rsp
1631
1632 mov 4(%r11),%r10d # HandlerData[1]
1633 lea (%rsi,%r10),%r10 # epilogue label
1634 cmp %r10,%rbx # context->Rip>=epilogue label
1635 jae .Lin_prologue
1636___
1637$code.=<<___ if ($shaext);
1638 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1639 cmp %r10,%rbx
1640 jb .Lnot_in_shaext
1641
1642 lea (%rax),%rsi
1643 lea 512($context),%rdi # &context.Xmm6
1644 mov \$20,%ecx
1645 .long 0xa548f3fc # cld; rep movsq
1646 lea 168(%rax),%rax # adjust stack pointer
1647 jmp .Lin_prologue
1648.Lnot_in_shaext:
1649___
1650$code.=<<___ if ($avx>1);
1651 lea .Lavx2_shortcut(%rip),%r10
1652 cmp %r10,%rbx # context->Rip<avx2_shortcut
1653 jb .Lnot_in_avx2
1654
1655 and \$-256*$SZ,%rax
1656 add \$`2*$SZ*($rounds-8)`,%rax
1657.Lnot_in_avx2:
1658___
1659$code.=<<___;
1660 mov %rax,%rsi # put aside Rsp
1661 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1662
1663 mov -8(%rax),%rbx
1664 mov -16(%rax),%rbp
1665 mov -24(%rax),%r12
1666 mov -32(%rax),%r13
1667 mov -40(%rax),%r14
1668 mov -48(%rax),%r15
1669 mov %rbx,144($context) # restore context->Rbx
1670 mov %rbp,160($context) # restore context->Rbp
1671 mov %r12,216($context) # restore context->R12
1672 mov %r13,224($context) # restore context->R13
1673 mov %r14,232($context) # restore context->R14
1674 mov %r15,240($context) # restore context->R15
1675
1676 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1677 lea 512($context),%rdi # &context.Xmm6
1678 mov \$20,%ecx
1679 .long 0xa548f3fc # cld; rep movsq
1680
1681.Lin_prologue:
1682 mov 8(%rax),%rdi
1683 mov 16(%rax),%rsi
1684 mov %rax,152($context) # restore context->Rsp
1685 mov %rsi,168($context) # restore context->Rsi
1686 mov %rdi,176($context) # restore context->Rdi
1687
1688 mov 40($disp),%rdi # disp->ContextRecord
1689 mov $context,%rsi # context
1690 mov \$154,%ecx # sizeof(CONTEXT)
1691 .long 0xa548f3fc # cld; rep movsq
1692
1693 mov $disp,%rsi
1694 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1695 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1696 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1697 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1698 mov 40(%rsi),%r10 # disp->ContextRecord
1699 lea 56(%rsi),%r11 # &disp->HandlerData
1700 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1701 mov %r10,32(%rsp) # arg5
1702 mov %r11,40(%rsp) # arg6
1703 mov %r12,48(%rsp) # arg7
1704 mov %rcx,56(%rsp) # arg8, (NULL)
1705 call *__imp_RtlVirtualUnwind(%rip)
1706
1707 mov \$1,%eax # ExceptionContinueSearch
1708 add \$64,%rsp
1709 popfq
1710 pop %r15
1711 pop %r14
1712 pop %r13
1713 pop %r12
1714 pop %rbp
1715 pop %rbx
1716 pop %rdi
1717 pop %rsi
1718 ret
1719.size se_handler,.-se_handler
1720
1721.section .pdata
1722 .rva .LSEH_begin_${func}_xop
1723 .rva .LSEH_end_${func}_xop
1724 .rva .LSEH_info_${func}_xop
1725
1726 .rva .LSEH_begin_${func}_avx
1727 .rva .LSEH_end_${func}_avx
1728 .rva .LSEH_info_${func}_avx
1729___
1730$code.=<<___ if ($avx>1);
1731 .rva .LSEH_begin_${func}_avx2
1732 .rva .LSEH_end_${func}_avx2
1733 .rva .LSEH_info_${func}_avx2
1734___
1735$code.=<<___ if ($shaext);
1736 .rva .LSEH_begin_${func}_shaext
1737 .rva .LSEH_end_${func}_shaext
1738 .rva .LSEH_info_${func}_shaext
1739___
1740$code.=<<___;
1741.section .xdata
1742.align 8
1743.LSEH_info_${func}_xop:
1744 .byte 9,0,0,0
1745 .rva se_handler
1746 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1747
1748.LSEH_info_${func}_avx:
1749 .byte 9,0,0,0
1750 .rva se_handler
1751 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1752___
1753$code.=<<___ if ($avx>1);
1754.LSEH_info_${func}_avx2:
1755 .byte 9,0,0,0
1756 .rva se_handler
1757 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1758___
1759$code.=<<___ if ($shaext);
1760.LSEH_info_${func}_shaext:
1761 .byte 9,0,0,0
1762 .rva se_handler
1763 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1764___
1765}
1766
1767####################################################################
1768sub rex {
1769 local *opcode=shift;
1770 my ($dst,$src)=@_;
1771 my $rex=0;
1772
1773 $rex|=0x04 if($dst>=8);
1774 $rex|=0x01 if($src>=8);
1775 unshift @opcode,$rex|0x40 if($rex);
1776}
1777
1778{
1779 my %opcodelet = (
1780 "sha256rnds2" => 0xcb,
1781 "sha256msg1" => 0xcc,
1782 "sha256msg2" => 0xcd );
1783
1784 sub sha256op38 {
1785 my $instr = shift;
1786
1787 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1788 my @opcode=(0x0f,0x38);
1789 rex(\@opcode,$2,$1);
1790 push @opcode,$opcodelet{$instr};
1791 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1792 return ".byte\t".join(',',@opcode);
1793 } else {
1794 return $instr."\t".@_[0];
1795 }
1796 }
1797}
1798
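# A standalone example (illustration only; the register pair is arbitrary)
# of what the fallback encoder above produces for assemblers that predate
# the SHA extensions: NP 0F 38 CB /r, with ModR/M = 0xc0|(dst&7)<<3|(src&7).
print STDERR sha256op38("sha256rnds2", "%xmm0,%xmm2"), "\n";
# prints ".byte\t15,56,203,208", i.e. 0x0f,0x38,0xcb,0xd0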
1799$code =~ s/\`([^\`]*)\`/eval $1/gem;
1800$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1801print $code;
1802close STDOUT or die "error closing STDOUT: $!";