#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing each buffer's data in a designated lane of a SIMD register.
# n is naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Skylake	(18.9	+5.00=23.9)/n	7.70	8.17		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput, nor is there
#	an AES-NI-SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data; the result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life applications are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

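# As a companion to the SIMD code below, this is what one SHA-256 round
# computes in each 32-bit lane.  A plain-Perl reference sketch, added
# for illustration only (the names are ours and the sub is never called
# by the generator):
sub sha256_round_ref {
my ($a,$b,$c,$d,$e,$f,$g,$h,$K,$X)=@_;	# state words, round constant, schedule word
my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; };
my $Sigma1 = $rotr->($e,6)^$rotr->($e,11)^$rotr->($e,25);
my $Sigma0 = $rotr->($a,2)^$rotr->($a,13)^$rotr->($a,22);
my $Ch  = ($e&$f)^(~$e&$g);		# bitwise "if e then f else g"
my $Maj = ($a&$b)^($a&$c)^($b&$c);	# bitwise majority vote
my $T1 = ($h+$Sigma1+$Ch+$K+$X)&0xffffffff;
my $T2 = ($Sigma0+$Maj)&0xffffffff;
	return (($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}
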
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
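
# The context is stored "transposed": the A words of all lanes are
# adjacent, then all B words, and so on, so a single aligned load fills
# one SIMD register with the same state variable from every lane.  A
# hypothetical plain-Perl sketch of that interleaving (illustrative
# only; real callers lay the struct out in C):
sub interleave_ctx_ref {
my @states = @_;			# up to 8 refs to [A,B,C,D,E,F,G,H]
my @ctx;
	for my $v (0..7) {		# state variable: A=0 .. H=7
	    for my $lane (0..7) {
		$ctx[8*$v+$lane] = $states[$lane] ? $states[$lane][$v] : 0;
	    }
	}
	return @ctx;			# 64 words: A[8],B[8],...,H[8]
}
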
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;

sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}
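
# Xi_off maps a round index to one of 16 ring-buffer slots holding the
# message schedule, $REG_SZ bytes per slot.  %rax points 128 bytes into
# the area so displacements stay in signed-byte range; with $REG_SZ==32
# (AVX2) slots 8..15 fall past the first 256 bytes and are addressed
# off %rbx, which points 256 bytes above %rax.  A quick way to inspect
# the mapping (illustrative only):
#
#	$REG_SZ=16; print Xi_off($_),"\n" for (0..15);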

sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd		`4*$i`(@ptr[0]),$Xi
	movd		`4*$i`(@ptr[1]),$t1
	movd		`4*$i`(@ptr[2]),$t2
	movd		`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___ if ($i==15);
	movd		`4*$i`(@ptr[0]),$Xi
	lea		`16*4`(@ptr[0]),@ptr[0]
	movd		`4*$i`(@ptr[1]),$t1
	lea		`16*4`(@ptr[1]),@ptr[1]
	movd		`4*$i`(@ptr[2]),$t2
	lea		`16*4`(@ptr[2]),@ptr[2]
	movd		`4*$i`(@ptr[3]),$t3
	lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	movdqa	$e,$t1
	`"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	pandn	$g,$t1
	pand	$f,$axb
	pxor	$t2,$sigma

	`"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	pxor	$axb,$t1			# Ch(e,f,g)
	movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	pxor	$a,$axb				# a^b, b^c in next round

	`"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	pand	$axb,$bxc
	pxor	$sigma,$t2

	`"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}
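
# The Maj step above leans on the identity Maj(a,b,c) = b^((a^b)&(b^c)),
# i.e. Ch(a^b,c,b), which needs no third temporary because b^c is simply
# the previous round's a^b.  A one-off sanity check of that identity
# (illustrative only, never called by the generator):
sub maj_identity_ok {
my ($a,$b,$c)=@_;
my $maj = ($a&$b)^($a&$c)^($b&$c);	# textbook majority function
	return $maj == ($b^(($a^$b)&($b^$c)));
}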

sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	paddd	$sigma,$Xi			# Xi+=sigma0(X[i+1])
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma1(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}
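
# ROUND_16_XX expands the message schedule in place:
# X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]), indices taken
# modulo 16 in the ring buffer.  The two small sigma functions, as a
# plain-Perl reference sketch (illustrative only, not used by the
# generator):
sub sigmas_ref {
my $x = shift;				# one 32-bit schedule word
my $rotr = sub { my ($v,$n)=@_; (($v>>$n)|($v<<(32-$n)))&0xffffffff; };
my $sigma0 = $rotr->($x,7)^$rotr->($x,18)^($x>>3);
my $sigma1 = $rotr->($x,17)^$rotr->($x,19)^($x>>10);
	return ($sigma0,$sigma1);
}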

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));
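
# sha256rnds2 keeps the state split across two registers, ordered (high
# dword to low) A.B.E.F and C.D.G.H.  The punpck/pshufd sequence below
# builds that layout for two lanes at once from the column-interleaved
# context.  A hypothetical single-lane sketch of the repacking
# (illustrative only):
sub abef_cdgh_ref {
my ($A,$B,$C,$D,$E,$F,$G,$H)=@_;
	return ([$A,$B,$E,$F],		# ABEF register, high dword first
		[$C,$D,$G,$H]);		# CDGH register, high dword first
}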

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process a pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<2;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq	0x00-0x80($ctx),$ABEF0		# A1.A0
	movq	0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq	0x40-0x80($ctx),$CDGH0		# C1.C0
	movq	0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq	0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq	0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq	0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq	0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0		# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0		# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]	# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]	# H1.G1.H0.G0
	movdqa	K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa	$ABEF0,$ABEF1
	movdqa	$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0		# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0		# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1		# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1		# H1.G1.D1.C1

	pshufd	\$0b00011011,$ABEF0,$ABEF0
	pshufd	\$0b00011011,$CDGH0,$CDGH0
	pshufd	\$0b00011011,$ABEF1,$ABEF1
	pshufd	\$0b00011011,$CDGH1,$CDGH1
	jmp	.Loop_shaext

.align	32
.Loop_shaext:
	movdqu	0x00(@ptr[0]),@MSG0[0]
	movdqu	0x00(@ptr[1]),@MSG1[0]
	movdqu	0x10(@ptr[0]),@MSG0[1]
	movdqu	0x10(@ptr[1]),@MSG1[1]
	movdqu	0x20(@ptr[0]),@MSG0[2]
	pshufb	$TMPx,@MSG0[0]
	movdqu	0x20(@ptr[1]),@MSG1[2]
	pshufb	$TMPx,@MSG1[0]
	movdqu	0x30(@ptr[0]),@MSG0[3]
	lea	0x40(@ptr[0]),@ptr[0]
	movdqu	0x30(@ptr[1]),@MSG1[3]
	lea	0x40(@ptr[1]),@ptr[1]

	movdqa	0*16-0x80($Tbl),$Wi
	pshufb	$TMPx,@MSG0[1]
	paddd	@MSG0[0],$Wi
	pxor	$ABEF0,@MSG0[0]		# black magic
	movdqa	$Wi,$TMP0
	movdqa	0*16-0x80($Tbl),$TMP1
	pshufb	$TMPx,@MSG1[1]
	paddd	@MSG1[0],$TMP1
	movdqa	$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0	# 0-3
	pxor	$ABEF1,@MSG1[0]		# black magic
	movdqa	$TMP1,$Wi
	movdqa	$CDGH1,0x70(%rsp)
	sha256rnds2	$ABEF1,$CDGH1	# 0-3
	pshufd	\$0x0e,$TMP0,$Wi
	pxor	$ABEF0,@MSG0[0]		# black magic
	movdqa	$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	pxor	$ABEF1,@MSG1[0]		# black magic
	movdqa	$ABEF1,0x60(%rsp)
	movdqa	1*16-0x80($Tbl),$TMP0
	paddd	@MSG0[1],$TMP0
	pshufb	$TMPx,@MSG0[2]
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	movdqa	1*16-0x80($Tbl),$TMP1
	paddd	@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0	# 4-7
	movdqa	$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb	$TMPx,@MSG0[3]
	pshufb	$TMPx,@MSG1[2]
	prefetcht0	127(@ptr[1])
	sha256rnds2	$ABEF1,$CDGH1	# 4-7
	pshufd	\$0x0e,$TMP0,$Wi
	pshufb	$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	2*16-0x80($Tbl),$TMP0
	paddd	@MSG0[2],$TMP0
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	movdqa	2*16-0x80($Tbl),$TMP1
	paddd	@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0	# 8-11
	sha256msg1	@MSG1[1],@MSG1[0]
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[3],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 8-11
	pshufd	\$0x0e,$TMP0,$Wi
	palignr	\$4,@MSG0[2],$TMPx
	paddd	$TMPx,@MSG0[0]
	movdqa	@MSG1[3],$TMPx
	palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	3*16-0x80($Tbl),$TMP0
	paddd	@MSG0[3],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[2],@MSG1[1]

	movdqa	$TMP0,$Wi
	movdqa	3*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[0]
	paddd	@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0	# 12-15
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[0],$TMPx
	palignr	\$4,@MSG0[3],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 12-15
	sha256msg2	@MSG1[3],@MSG1[0]
	pshufd	\$0x0e,$TMP0,$Wi
	paddd	$TMPx,@MSG0[1]
	movdqa	@MSG1[0],$TMPx
	palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	4*16-0x80($Tbl),$TMP0
	paddd	@MSG0[0],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa	$TMP0,$Wi
	movdqa	$i*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[1]
	paddd	@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0	# 16-19...
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[1],$TMPx
	palignr	\$4,@MSG0[0],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 16-19...
	sha256msg2	@MSG1[0],@MSG1[1]
	pshufd	\$0x0e,$TMP0,$Wi
	paddd	$TMPx,@MSG0[2]
	movdqa	@MSG1[1],$TMPx
	palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	`($i+1)*16`-0x80($Tbl),$TMP0
	paddd	@MSG0[1],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
$code.=<<___;
	movdqa	$TMP0,$Wi
	movdqa	13*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[1]
	paddd	@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0	# 52-55
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[1],$TMPx
	palignr	\$4,@MSG0[0],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 52-55
	sha256msg2	@MSG1[0],@MSG1[1]
	pshufd	\$0x0e,$TMP0,$Wi
	paddd	$TMPx,@MSG0[2]
	movdqa	@MSG1[1],$TMPx
	palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	14*16-0x80($Tbl),$TMP0
	paddd	@MSG0[1],$TMP0
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	movdqa	14*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[2]
	paddd	@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0	# 56-59
	movdqa	$TMP1,$Wi
	mov	\$1,%ecx
	pxor	@MSG0[1],@MSG0[1]	# zero
	sha256rnds2	$ABEF1,$CDGH1	# 56-59
	sha256msg2	@MSG1[1],@MSG1[2]
	pshufd	\$0x0e,$TMP0,$Wi
	movdqa	15*16-0x80($Tbl),$TMP0
	paddd	@MSG0[2],$TMP0
	movq	(%rbx),@MSG0[2]		# pull counters
	nop
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	15*16-0x80($Tbl),$TMP1
	paddd	@MSG1[2],$TMP1
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	cmp	4*0(%rbx),%ecx		# examine counters
	cmovge	%rsp,@ptr[0]		# cancel input
	cmp	4*1(%rbx),%ecx
	cmovge	%rsp,@ptr[1]
	pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0	# 60-63
	movdqa	$TMP1,$Wi
	pshufd	\$0x55,@MSG0[2],@MSG1[1]
	movdqa	@MSG0[2],@MSG1[2]
	sha256rnds2	$ABEF1,$CDGH1	# 60-63
	pshufd	\$0x0e,$TMP0,$Wi
	pcmpgtd	@MSG0[1],@MSG1[0]
	pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	movdqa	K256_shaext-0x10(%rip),$TMPx
	sha256rnds2	$CDGH1,$ABEF1

	pand	@MSG1[0],$CDGH0
	pand	@MSG1[1],$CDGH1
	pand	@MSG1[0],$ABEF0
	pand	@MSG1[1],$ABEF1
	paddd	@MSG0[2],@MSG1[2]	# counters--

	paddd	0x50(%rsp),$CDGH0
	paddd	0x70(%rsp),$CDGH1
	paddd	0x40(%rsp),$ABEF0
	paddd	0x60(%rsp),$ABEF1

	movq	@MSG1[2],(%rbx)		# save counters
	dec	$num
	jnz	.Loop_shaext

	mov	`$REG_SZ*17+8`(%rsp),$num

	pshufd	\$0b00011011,$ABEF0,$ABEF0
	pshufd	\$0b00011011,$CDGH0,$CDGH0
	pshufd	\$0b00011011,$ABEF1,$ABEF1
	pshufd	\$0b00011011,$CDGH1,$CDGH1

	movdqa	$ABEF0,@MSG0[0]
	movdqa	$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0		# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]		# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0		# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]		# H1.H0.G1.G0

	movq	$ABEF0,0x00-0x80($ctx)	# A1.A0
	psrldq	\$8,$ABEF0
	movq	@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq	\$8,@MSG0[0]
	movq	$ABEF0,0x20-0x80($ctx)	# B1.B0
	movq	@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq	$CDGH0,0x40-0x80($ctx)	# C1.C0
	psrldq	\$8,$CDGH0
	movq	@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq	\$8,@MSG0[1]
	movq	$CDGH0,0x60-0x80($ctx)	# D1.D0
	movq	@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
						if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[1]),$t1
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[1]),$t1
	lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[4]),$t1
	vmovd		`4*$i`(@ptr[1]),$t2
	vmovd		`4*$i`(@ptr[5]),$t3
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[4]),$t1
	lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		`4*$i`(@ptr[1]),$t2
	lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		`4*$i`(@ptr[5]),$t3
	lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	vpaddd	$h,$Xi,$Xi		# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[0])"		if ($i==15)`
	vpslld	\$7,$e,$t3
	vpandn	$g,$e,$t1
	vpand	$f,$e,$axb		# borrow $axb
	`"prefetcht0	63(@ptr[1])"		if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h		# borrow $h
	vpxor	$t3,$sigma,$sigma	# Sigma1(e)
	`"prefetcht0	63(@ptr[2])"		if ($i==15)`
	vpslld	\$30,$a,$t2
	vpxor	$axb,$t1,$t1		# Ch(e,f,g)
	vpxor	$a,$b,$axb		# a^b, b^c in next round
	`"prefetcht0	63(@ptr[3])"		if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi		# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	`"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	vpaddd	$t1,$Xi,$Xi		# Xi+=Ch(e,f,g)
	vpand	$axb,$bxc,$bxc
	`"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	vpxor	$bxc,$b,$h		# h=Maj(a,b,c)=Ch(a^b,c,b)
	vpaddd	$Xi,$d,$d		# d+=Xi
	`"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma	# Sigma0(a)

	vpaddd	$Xi,$h,$h		# h+=Xi
	vpaddd	$sigma,$h,$h		# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}

sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi	# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb		# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma	# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	vpaddd	$sigma,$Xi,$Xi		# Xi+=sigma0(X[i+1])
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma	# sigma1(X[i+14])
	vpaddd	$sigma,$Xi,$Xi		# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc		# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx	# examine counters
	cmovge	$Tbl,@ptr[$i]		# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma		# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn		# mask value
	vpaddd	$Xn,$sigma,$sigma	# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)		# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
						if ($avx>1) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc		# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx	# examine counters
	cmovge	$Tbl,@ptr[$i]		# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma		# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn		# mask value
	vpaddd	$Xn,$sigma,$sigma	# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)		# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
						}	}}}
$code.=<<___;
.align	256
K256:
___
sub TABLE {
	foreach (@_) {
	    $code.=<<___;
		.long	$_,$_,$_,$_
		.long	$_,$_,$_,$_
___
	}
}
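
# Each constant is emitted 8 times (two rows of 4) so that a single
# 16-byte xmm or 32-byte ymm load broadcasts K[round] to every lane.
# The rounds address the table as `32*($i%8)-128`($Tbl), with $Tbl
# advanced by 32*8 after every 8 rounds so displacements stay within
# signed-byte range.  A one-off check of that addressing (illustrative
# only; the sub is ours and never called by the generator):
sub k256_disp_ok {
my $i = shift;				# round number, 0..63
my $bumps = int($i/8);			# `lea/add 32*8,$Tbl` executed so far
my $disp  = 32*($i%8)-128;		# displacement used in the round
	return 128+256*$bumps+$disp == 32*$i;	# == true byte offset of K[i]
}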
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`($context),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue		# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx	# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2	# HandlerData[]
___
}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @opcode,$rex|0x40	if ($rex);
}

sub sha256op38 {
my $instr = shift;
my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
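
# For assemblers without SHA-NI support, sha256op38() above emits the
# raw encoding: 0f 38 cb/cc/cd, a register-form ModR/M byte, and a REX
# prefix when either register is %xmm8..%xmm15.  A worked example
# (illustrative only):
#
#	sha256op38("sha256rnds2","%xmm0,%xmm1")
#	# returns ".byte\t0x0f,0x38,0xcb,0xc8"	# ModR/M = 0xc0|0|(1<<3)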

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";