
source: vbox/trunk/src/libs/openssl-3.1.3/crypto/sha/asm/sha512-x86_64.pl@102427

Last change on this file since 102427 was 101211, checked in by vboxsync, 17 months ago

openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527

  • Property svn:executable set to *
File size: 63.5 KB
1#! /usr/bin/env perl
2# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the License.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 - >40%. No magical
20# tricks, just straight implementation... I really wonder why gcc
21# [being armed with inline assembler] fails to generate as fast code.
22# The only thing which is cool about this module is that it's the very
23# same instruction sequence that is used for both SHA-256 and SHA-512. In
24# the former case the instructions operate on 32-bit operands, while in
25# the latter - on 64-bit ones. All I had to do was get one flavor right;
26# the other one passed the test right away :-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives you
29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32# Well, if you compare it to IA-64 implementation, which maintains
33# X[16] in register bank[!], tends to 4 instructions per CPU clock
34# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
35# issue Opteron pipeline and X[16] maintained in memory. So that *if*
36# there is a way to improve it, *then* the only way would be to try to
37# offload X[16] updates to SSE unit, but that would require "deeper"
38# loop unroll, which in turn would naturally cause size blow-up, not
39# to mention increased complexity! And once again, only *if* it's
40# actually possible to noticeably improve overall ILP, instruction
41# level parallelism, on a given CPU implementation in this case.
42#
43# Special note on Intel EM64T. While Opteron CPU exhibits perfect
44# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
45# [currently available] EM64T CPUs apparently are far from it. On the
46# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
47# sha256_block:-( This is presumably because 64-bit shifts/rotates
48# apparently are not atomic instructions, but implemented in microcode.
49#
50# May 2012.
51#
52# Optimization including one of Pavel Semjanov's ideas, alternative
53# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
54# unfortunately -2% SHA512 on P4 [which nobody should care about
55# that much].
56#
57# June 2012.
58#
59# Add SIMD code paths, see below for improvement coefficients. SSSE3
60# code path was not attempted for SHA512, because improvement is not
61# estimated to be high enough, noticeably less than 9%, to justify
62# the effort, not on pre-AVX processors. [The obvious exception is
63# VIA Nano, but it has a SHA512 instruction that is faster and
64# should be used instead.] For reference, the corresponding estimated
65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66# higher coefficients are observed on VIA Nano and Bulldozer has more
67# to do with specifics of their architecture [which is topic for
68# separate discussion].
69#
70# November 2012.
71#
72# Add AVX2 code path. Two consecutive input blocks are loaded to
73# 256-bit %ymm registers, with data from first block to least
74# significant 128-bit halves and data from second to most significant.
75# The data is then processed with same SIMD instruction sequence as
76# for AVX, but with %ymm as operands. Side effect is increased stack
77# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
78# code size increase.
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87#                  SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
88#
89# AMD K8          14.9    -           -               9.57    -
90# P4              17.3    -           -               30.8    -
91# Core 2          15.6    13.8(+13%)  -               9.97    -
92# Westmere        14.8    12.3(+19%)  -               9.58    -
93# Sandy Bridge    17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
94# Ivy Bridge      12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
95# Haswell         12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
96# Skylake         11.4    9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
97# Bulldozer       21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
98# Ryzen           11.0    9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
99# VIA Nano        23.0    16.5(+39%)  -               14.7    -
100# Atom            23.0    18.9(+22%)  -               14.7    -
101# Silvermont      27.4    20.6(+33%)  -               17.5    -
102# Knights L       27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
103# Goldmont        18.9    14.3(+32%)  4.16(+350%)     12.0    -
104#
105# (*)   whichever best applicable, including SHAEXT;
106# (**)  switch from ror to shrd stands for fair share of improvement;
107# (***) execution time is fully determined by remaining integer-only
108#       part, body_00_15; reducing the amount of SIMD instructions
109#       below certain limit makes no difference/sense; to conserve
110#       space SHA256 XOP code path is therefore omitted;
111
112# $output is the last argument if it looks like a file (it has an extension)
113# $flavour is the first argument if it doesn't look like a file
114$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
115$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
116
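# Typical invocation (an illustrative example, not part of the original
# header comments):
#
#   perl sha512-x86_64.pl elf sha512-x86_64.S
#
# i.e. the perlasm flavour comes first and the output file last; either may
# be omitted, and whether the output name contains "512" selects SHA-512
# versus SHA-256 code generation further below.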
117$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
118
119$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
120( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
121( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
122die "can't locate x86_64-xlate.pl";
123
124if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
125 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
126 $avx = ($1>=2.19) + ($1>=2.22);
127}
128
129if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
130 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
131 $avx = ($1>=2.09) + ($1>=2.10);
132}
133
134if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
135 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
136 $avx = ($1>=10) + ($1>=11);
137}
138
139if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
140 $avx = ($2>=3.0) + ($2>3.0);
141}
142
143$shaext=1; ### set to zero if compiling for 1.0.1
144$avx=1 if (!$shaext && $avx);
145
146open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
147 or die "can't call $xlate: $!";
148*STDOUT=*OUT;
149
150if ($output =~ /512/) {
151 $func="sha512_block_data_order";
152 $TABLE="K512";
153 $SZ=8;
154 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
155 "%r8", "%r9", "%r10","%r11");
156 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
157 @Sigma0=(28,34,39);
158 @Sigma1=(14,18,41);
159 @sigma0=(1, 8, 7);
160 @sigma1=(19,61, 6);
161 $rounds=80;
162} else {
163 $func="sha256_block_data_order";
164 $TABLE="K256";
165 $SZ=4;
166 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
167 "%r8d","%r9d","%r10d","%r11d");
168 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
169 @Sigma0=( 2,13,22);
170 @Sigma1=( 6,11,25);
171 @sigma0=( 7,18, 3);
172 @sigma1=(17,19,10);
173 $rounds=64;
174}
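# For reference, the rotation/shift constants selected above encode the
# FIPS 180-4 Sigma/sigma functions. The pure-Perl helpers below are an
# illustrative sketch only (SHA-256 flavour, 32-bit words); they are never
# called by this module and the _ref_* names are not part of the original
# code:
sub _ref_rotr32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
sub _ref_Sigma0 { my $x=shift; _ref_rotr32($x,2)^_ref_rotr32($x,13)^_ref_rotr32($x,22) }   # @Sigma0=(2,13,22)
sub _ref_Sigma1 { my $x=shift; _ref_rotr32($x,6)^_ref_rotr32($x,11)^_ref_rotr32($x,25) }   # @Sigma1=(6,11,25)
sub _ref_sigma0 { my $x=shift; _ref_rotr32($x,7)^_ref_rotr32($x,18)^($x>>3) }              # @sigma0=(7,18,3)
sub _ref_sigma1 { my $x=shift; _ref_rotr32($x,17)^_ref_rotr32($x,19)^($x>>10) }            # @sigma1=(17,19,10)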
175
176$ctx="%rdi"; # 1st arg, zapped by $a3
177$inp="%rsi"; # 2nd arg
178$Tbl="%rbp";
179
180$_ctx="16*$SZ+0*8(%rsp)";
181$_inp="16*$SZ+1*8(%rsp)";
182$_end="16*$SZ+2*8(%rsp)";
183$_rsp="`16*$SZ+3*8`(%rsp)";
184$framesz="16*$SZ+4*8";
185
186
187sub ROUND_00_15()
188{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
189 my $STRIDE=$SZ;
190 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
191
192$code.=<<___;
193 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
194 mov $f,$a2
195
196 xor $e,$a0
197 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
198 xor $g,$a2 # f^g
199
200 mov $T1,`$SZ*($i&0xf)`(%rsp)
201 xor $a,$a1
202 and $e,$a2 # (f^g)&e
203
204 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
205 add $h,$T1 # T1+=h
206 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
207
208 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
209 xor $e,$a0
210 add $a2,$T1 # T1+=Ch(e,f,g)
211
212 mov $a,$a2
213 add ($Tbl),$T1 # T1+=K[round]
214 xor $a,$a1
215
216 xor $b,$a2 # a^b, b^c in next round
217 ror \$$Sigma1[0],$a0 # Sigma1(e)
218 mov $b,$h
219
220 and $a2,$a3
221 ror \$$Sigma0[0],$a1 # Sigma0(a)
222 add $a0,$T1 # T1+=Sigma1(e)
223
224 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
225 add $T1,$d # d+=T1
226 add $T1,$h # h+=T1
227
228 lea $STRIDE($Tbl),$Tbl # round++
229___
230$code.=<<___ if ($i<15);
231 add $a1,$h # h+=Sigma0(a)
232___
233 ($a2,$a3) = ($a3,$a2);
234}
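# In formula form, one ROUND_00_15 invocation evaluates the standard SHA-2
# round (a restatement of the per-instruction comments above, not new logic):
#
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#   d  = d + T1
#   h  = T1 + Sigma0(a) + Maj(a,b,c)
#
# where Sigma0(a) is accumulated in $a1 and added to h either right away
# (i<15) or carried into the following round ("modulo-scheduled").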
235
236sub ROUND_16_XX()
237{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
238
239$code.=<<___;
240 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
241 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
242
243 mov $a0,$T1
244 ror \$`$sigma0[1]-$sigma0[0]`,$a0
245 add $a1,$a # modulo-scheduled h+=Sigma0(a)
246 mov $a2,$a1
247 ror \$`$sigma1[1]-$sigma1[0]`,$a2
248
249 xor $T1,$a0
250 shr \$$sigma0[2],$T1
251 ror \$$sigma0[0],$a0
252 xor $a1,$a2
253 shr \$$sigma1[2],$a1
254
255 ror \$$sigma1[0],$a2
256 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
257 xor $a1,$a2 # sigma1(X[(i+14)&0xf])
258 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
259
260 add `$SZ*($i&0xf)`(%rsp),$T1
261 mov $e,$a0
262 add $a2,$T1
263 mov $a,$a1
264___
265 &ROUND_00_15(@_);
266}
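# ROUND_16_XX prepends the message-schedule update to the round above; in
# formula form, with the same modulo-16 indices as the stack accesses:
#
#   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# where W[] is the 16-entry ring buffer kept at (%rsp).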
267
268$code=<<___;
269.text
270
271.extern OPENSSL_ia32cap_P
272.globl $func
273.type $func,\@function,3
274.align 16
275$func:
276.cfi_startproc
277___
278$code.=<<___ if ($SZ==4 || $avx);
279 lea OPENSSL_ia32cap_P(%rip),%r11
280 mov 0(%r11),%r9d
281 mov 4(%r11),%r10d
282 mov 8(%r11),%r11d
283___
284$code.=<<___ if ($SZ==4 && $shaext);
285 test \$`1<<29`,%r11d # check for SHA
286 jnz _shaext_shortcut
287___
288$code.=<<___ if ($avx && $SZ==8);
289 test \$`1<<11`,%r10d # check for XOP
290 jnz .Lxop_shortcut
291___
292$code.=<<___ if ($avx>1);
293 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
294 cmp \$`1<<8|1<<5|1<<3`,%r11d
295 je .Lavx2_shortcut
296___
297$code.=<<___ if ($avx);
298 and \$`1<<30`,%r9d # mask "Intel CPU" bit
299 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
300 or %r9d,%r10d
301 cmp \$`1<<28|1<<9|1<<30`,%r10d
302 je .Lavx_shortcut
303___
304$code.=<<___ if ($SZ==4);
305 test \$`1<<9`,%r10d
306 jnz .Lssse3_shortcut
307___
308$code.=<<___;
309 mov %rsp,%rax # copy %rsp
310.cfi_def_cfa_register %rax
311 push %rbx
312.cfi_push %rbx
313 push %rbp
314.cfi_push %rbp
315 push %r12
316.cfi_push %r12
317 push %r13
318.cfi_push %r13
319 push %r14
320.cfi_push %r14
321 push %r15
322.cfi_push %r15
323 shl \$4,%rdx # num*16
324 sub \$$framesz,%rsp
325 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
326 and \$-64,%rsp # align stack frame
327 mov $ctx,$_ctx # save ctx, 1st arg
328 mov $inp,$_inp # save inp, 2nd arg
329 mov %rdx,$_end # save end pointer, "3rd" arg
330 mov %rax,$_rsp # save copy of %rsp
331.cfi_cfa_expression $_rsp,deref,+8
332.Lprologue:
333
334 mov $SZ*0($ctx),$A
335 mov $SZ*1($ctx),$B
336 mov $SZ*2($ctx),$C
337 mov $SZ*3($ctx),$D
338 mov $SZ*4($ctx),$E
339 mov $SZ*5($ctx),$F
340 mov $SZ*6($ctx),$G
341 mov $SZ*7($ctx),$H
342 jmp .Lloop
343
344.align 16
345.Lloop:
346 mov $B,$a3
347 lea $TABLE(%rip),$Tbl
348 xor $C,$a3 # magic
349___
350 for($i=0;$i<16;$i++) {
351 $code.=" mov $SZ*$i($inp),$T1\n";
352 $code.=" mov @ROT[4],$a0\n";
353 $code.=" mov @ROT[0],$a1\n";
354 $code.=" bswap $T1\n";
355 &ROUND_00_15($i,@ROT);
356 unshift(@ROT,pop(@ROT));
357 }
358$code.=<<___;
359 jmp .Lrounds_16_xx
360.align 16
361.Lrounds_16_xx:
362___
363 for(;$i<32;$i++) {
364 &ROUND_16_XX($i,@ROT);
365 unshift(@ROT,pop(@ROT));
366 }
367
368$code.=<<___;
369 cmpb \$0,`$SZ-1`($Tbl)
370 jnz .Lrounds_16_xx
371
372 mov $_ctx,$ctx
373 add $a1,$A # modulo-scheduled h+=Sigma0(a)
374 lea 16*$SZ($inp),$inp
375
376 add $SZ*0($ctx),$A
377 add $SZ*1($ctx),$B
378 add $SZ*2($ctx),$C
379 add $SZ*3($ctx),$D
380 add $SZ*4($ctx),$E
381 add $SZ*5($ctx),$F
382 add $SZ*6($ctx),$G
383 add $SZ*7($ctx),$H
384
385 cmp $_end,$inp
386
387 mov $A,$SZ*0($ctx)
388 mov $B,$SZ*1($ctx)
389 mov $C,$SZ*2($ctx)
390 mov $D,$SZ*3($ctx)
391 mov $E,$SZ*4($ctx)
392 mov $F,$SZ*5($ctx)
393 mov $G,$SZ*6($ctx)
394 mov $H,$SZ*7($ctx)
395 jb .Lloop
396
397 mov $_rsp,%rsi
398.cfi_def_cfa %rsi,8
399 mov -48(%rsi),%r15
400.cfi_restore %r15
401 mov -40(%rsi),%r14
402.cfi_restore %r14
403 mov -32(%rsi),%r13
404.cfi_restore %r13
405 mov -24(%rsi),%r12
406.cfi_restore %r12
407 mov -16(%rsi),%rbp
408.cfi_restore %rbp
409 mov -8(%rsi),%rbx
410.cfi_restore %rbx
411 lea (%rsi),%rsp
412.cfi_def_cfa_register %rsp
413.Lepilogue:
414 ret
415.cfi_endproc
416.size $func,.-$func
417___
418
419if ($SZ==4) {
420$code.=<<___;
421.align 64
422.type $TABLE,\@object
423$TABLE:
424 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
425 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
426 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
427 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
428 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
429 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
430 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
431 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
432 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
433 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
434 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
435 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
436 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
437 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
438 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
439 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
440 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
441 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
442 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
443 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
444 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
445 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
446 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
447 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
448 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
449 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
450 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
451 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
452 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
453 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
454 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
455 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
456
457 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
458 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
459 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
460 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
461 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
462 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
463 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
464___
465} else {
466$code.=<<___;
467.align 64
468.type $TABLE,\@object
469$TABLE:
470 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
471 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
472 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
473 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
474 .quad 0x3956c25bf348b538,0x59f111f1b605d019
475 .quad 0x3956c25bf348b538,0x59f111f1b605d019
476 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
477 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
478 .quad 0xd807aa98a3030242,0x12835b0145706fbe
479 .quad 0xd807aa98a3030242,0x12835b0145706fbe
480 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
481 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
482 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
483 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
484 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
485 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
486 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
487 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
488 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
489 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
490 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
491 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
492 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
493 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
494 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
495 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
496 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
497 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
498 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
499 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
500 .quad 0x06ca6351e003826f,0x142929670a0e6e70
501 .quad 0x06ca6351e003826f,0x142929670a0e6e70
502 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
503 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
504 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
505 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
506 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
507 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
508 .quad 0x81c2c92e47edaee6,0x92722c851482353b
509 .quad 0x81c2c92e47edaee6,0x92722c851482353b
510 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
511 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
512 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
513 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
514 .quad 0xd192e819d6ef5218,0xd69906245565a910
515 .quad 0xd192e819d6ef5218,0xd69906245565a910
516 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
517 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
518 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
519 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
520 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
521 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
522 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
523 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
524 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
525 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
526 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
527 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
528 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
529 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
530 .quad 0x90befffa23631e28,0xa4506cebde82bde9
531 .quad 0x90befffa23631e28,0xa4506cebde82bde9
532 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
533 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
534 .quad 0xca273eceea26619c,0xd186b8c721c0c207
535 .quad 0xca273eceea26619c,0xd186b8c721c0c207
536 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
537 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
538 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
539 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
540 .quad 0x113f9804bef90dae,0x1b710b35131c471b
541 .quad 0x113f9804bef90dae,0x1b710b35131c471b
542 .quad 0x28db77f523047d84,0x32caab7b40c72493
543 .quad 0x28db77f523047d84,0x32caab7b40c72493
544 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
545 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
546 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
547 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
548 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
549 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
550
551 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
552 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
553 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
554___
555}
556
557######################################################################
558# SIMD code paths
559#
560if ($SZ==4 && $shaext) {{{
561######################################################################
562# Intel SHA Extensions implementation of SHA256 update function.
563#
564my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
565
566my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
567my @MSG=map("%xmm$_",(3..6));
568
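# Dataflow note (descriptive only): sha256rnds2 keeps the hash state split
# into ABEF and CDGH halves and implicitly takes the two round-constant-plus-
# message words from the low 64 bits of %xmm0 ($Wi); hence every 4-round
# group below issues sha256rnds2 twice, with a pshufd immediate 0x0e in
# between to move the upper quadword of $Wi down for rounds 2-3 of the group.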
569$code.=<<___;
570.type sha256_block_data_order_shaext,\@function,3
571.align 64
572sha256_block_data_order_shaext:
573_shaext_shortcut:
574.cfi_startproc
575___
576$code.=<<___ if ($win64);
577 lea `-8-5*16`(%rsp),%rsp
578 movaps %xmm6,-8-5*16(%rax)
579 movaps %xmm7,-8-4*16(%rax)
580 movaps %xmm8,-8-3*16(%rax)
581 movaps %xmm9,-8-2*16(%rax)
582 movaps %xmm10,-8-1*16(%rax)
583.Lprologue_shaext:
584___
585$code.=<<___;
586 lea K256+0x80(%rip),$Tbl
587 movdqu ($ctx),$ABEF # DCBA
588 movdqu 16($ctx),$CDGH # HGFE
589 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
590
591 pshufd \$0x1b,$ABEF,$Wi # ABCD
592 pshufd \$0xb1,$ABEF,$ABEF # CDAB
593 pshufd \$0x1b,$CDGH,$CDGH # EFGH
594 movdqa $TMP,$BSWAP # offload
595 palignr \$8,$CDGH,$ABEF # ABEF
596 punpcklqdq $Wi,$CDGH # CDGH
597 jmp .Loop_shaext
598
599.align 16
600.Loop_shaext:
601 movdqu ($inp),@MSG[0]
602 movdqu 0x10($inp),@MSG[1]
603 movdqu 0x20($inp),@MSG[2]
604 pshufb $TMP,@MSG[0]
605 movdqu 0x30($inp),@MSG[3]
606
607 movdqa 0*32-0x80($Tbl),$Wi
608 paddd @MSG[0],$Wi
609 pshufb $TMP,@MSG[1]
610 movdqa $CDGH,$CDGH_SAVE # offload
611 sha256rnds2 $ABEF,$CDGH # 0-3
612 pshufd \$0x0e,$Wi,$Wi
613 nop
614 movdqa $ABEF,$ABEF_SAVE # offload
615 sha256rnds2 $CDGH,$ABEF
616
617 movdqa 1*32-0x80($Tbl),$Wi
618 paddd @MSG[1],$Wi
619 pshufb $TMP,@MSG[2]
620 sha256rnds2 $ABEF,$CDGH # 4-7
621 pshufd \$0x0e,$Wi,$Wi
622 lea 0x40($inp),$inp
623 sha256msg1 @MSG[1],@MSG[0]
624 sha256rnds2 $CDGH,$ABEF
625
626 movdqa 2*32-0x80($Tbl),$Wi
627 paddd @MSG[2],$Wi
628 pshufb $TMP,@MSG[3]
629 sha256rnds2 $ABEF,$CDGH # 8-11
630 pshufd \$0x0e,$Wi,$Wi
631 movdqa @MSG[3],$TMP
632 palignr \$4,@MSG[2],$TMP
633 nop
634 paddd $TMP,@MSG[0]
635 sha256msg1 @MSG[2],@MSG[1]
636 sha256rnds2 $CDGH,$ABEF
637
638 movdqa 3*32-0x80($Tbl),$Wi
639 paddd @MSG[3],$Wi
640 sha256msg2 @MSG[3],@MSG[0]
641 sha256rnds2 $ABEF,$CDGH # 12-15
642 pshufd \$0x0e,$Wi,$Wi
643 movdqa @MSG[0],$TMP
644 palignr \$4,@MSG[3],$TMP
645 nop
646 paddd $TMP,@MSG[1]
647 sha256msg1 @MSG[3],@MSG[2]
648 sha256rnds2 $CDGH,$ABEF
649___
650for($i=4;$i<16-3;$i++) {
651$code.=<<___;
652 movdqa $i*32-0x80($Tbl),$Wi
653 paddd @MSG[0],$Wi
654 sha256msg2 @MSG[0],@MSG[1]
655 sha256rnds2 $ABEF,$CDGH # 16-19...
656 pshufd \$0x0e,$Wi,$Wi
657 movdqa @MSG[1],$TMP
658 palignr \$4,@MSG[0],$TMP
659 nop
660 paddd $TMP,@MSG[2]
661 sha256msg1 @MSG[0],@MSG[3]
662 sha256rnds2 $CDGH,$ABEF
663___
664 push(@MSG,shift(@MSG));
665}
666$code.=<<___;
667 movdqa 13*32-0x80($Tbl),$Wi
668 paddd @MSG[0],$Wi
669 sha256msg2 @MSG[0],@MSG[1]
670 sha256rnds2 $ABEF,$CDGH # 52-55
671 pshufd \$0x0e,$Wi,$Wi
672 movdqa @MSG[1],$TMP
673 palignr \$4,@MSG[0],$TMP
674 sha256rnds2 $CDGH,$ABEF
675 paddd $TMP,@MSG[2]
676
677 movdqa 14*32-0x80($Tbl),$Wi
678 paddd @MSG[1],$Wi
679 sha256rnds2 $ABEF,$CDGH # 56-59
680 pshufd \$0x0e,$Wi,$Wi
681 sha256msg2 @MSG[1],@MSG[2]
682 movdqa $BSWAP,$TMP
683 sha256rnds2 $CDGH,$ABEF
684
685 movdqa 15*32-0x80($Tbl),$Wi
686 paddd @MSG[2],$Wi
687 nop
688 sha256rnds2 $ABEF,$CDGH # 60-63
689 pshufd \$0x0e,$Wi,$Wi
690 dec $num
691 nop
692 sha256rnds2 $CDGH,$ABEF
693
694 paddd $CDGH_SAVE,$CDGH
695 paddd $ABEF_SAVE,$ABEF
696 jnz .Loop_shaext
697
698 pshufd \$0xb1,$CDGH,$CDGH # DCHG
699 pshufd \$0x1b,$ABEF,$TMP # FEBA
700 pshufd \$0xb1,$ABEF,$ABEF # BAFE
701 punpckhqdq $CDGH,$ABEF # DCBA
702 palignr \$8,$TMP,$CDGH # HGFE
703
704 movdqu $ABEF,($ctx)
705 movdqu $CDGH,16($ctx)
706___
707$code.=<<___ if ($win64);
708 movaps -8-5*16(%rax),%xmm6
709 movaps -8-4*16(%rax),%xmm7
710 movaps -8-3*16(%rax),%xmm8
711 movaps -8-2*16(%rax),%xmm9
712 movaps -8-1*16(%rax),%xmm10
713 mov %rax,%rsp
714.Lepilogue_shaext:
715___
716$code.=<<___;
717 ret
718.cfi_endproc
719.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
720___
721}}}
722{{{
723
724my $a4=$T1;
725my ($a,$b,$c,$d,$e,$f,$g,$h);
726
727sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
728{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
729 my $arg = pop;
730 $arg = "\$$arg" if ($arg*1 eq $arg);
731 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
732}
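# Illustrative example: with $a0 being "%r13d" (SHA-256 flavour), a call like
# &ror($a0,5) is caught by AUTOLOAD above and appends the line
# "ror $5,%r13d" (tab-separated) to $code, i.e. operands are emitted in
# AT&T source,destination order and bare numbers receive a '$' prefix.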
733
734sub body_00_15 () {
735 (
736 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
737
738 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
739 '&mov ($a,$a1)',
740 '&mov ($a4,$f)',
741
742 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
743 '&xor ($a0,$e)',
744 '&xor ($a4,$g)', # f^g
745
746 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
747 '&xor ($a1,$a)',
748 '&and ($a4,$e)', # (f^g)&e
749
750 '&xor ($a0,$e)',
751 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
752 '&mov ($a2,$a)',
753
754 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
755 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
756 '&xor ($a2,$b)', # a^b, b^c in next round
757
758 '&add ($h,$a4)', # h+=Ch(e,f,g)
759 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
760 '&and ($a3,$a2)', # (b^c)&(a^b)
761
762 '&xor ($a1,$a)',
763 '&add ($h,$a0)', # h+=Sigma1(e)
764 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
765
766 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
767 '&add ($d,$h)', # d+=h
768 '&add ($h,$a3)', # h+=Maj(a,b,c)
769
770 '&mov ($a0,$d)',
771 '&add ($a1,$h);'. # h+=Sigma0(a)
772 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
773 );
774}
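# body_00_15() returns one scalar instruction per list element rather than
# emitting them directly, so that the SIMD code paths below can interleave
# roughly three of these integer ops between consecutive SIMD instructions
# via the eval(shift(@insns)) pattern; the hand-scheduled variant of that
# interleave is what "squeeze extra 4% on Westmere and 19% on Atom" below
# refers to.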
775
776######################################################################
777# SSSE3 code path
778#
779if ($SZ==4) { # SHA256 only
780my @X = map("%xmm$_",(0..3));
781my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
782
783$code.=<<___;
784.type ${func}_ssse3,\@function,3
785.align 64
786${func}_ssse3:
787.cfi_startproc
788.Lssse3_shortcut:
789 mov %rsp,%rax # copy %rsp
790.cfi_def_cfa_register %rax
791 push %rbx
792.cfi_push %rbx
793 push %rbp
794.cfi_push %rbp
795 push %r12
796.cfi_push %r12
797 push %r13
798.cfi_push %r13
799 push %r14
800.cfi_push %r14
801 push %r15
802.cfi_push %r15
803 shl \$4,%rdx # num*16
804 sub \$`$framesz+$win64*16*4`,%rsp
805 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
806 and \$-64,%rsp # align stack frame
807 mov $ctx,$_ctx # save ctx, 1st arg
808 mov $inp,$_inp # save inp, 2nd arg
809 mov %rdx,$_end # save end pointer, "3rd" arg
810 mov %rax,$_rsp # save copy of %rsp
811.cfi_cfa_expression $_rsp,deref,+8
812___
813$code.=<<___ if ($win64);
814 movaps %xmm6,16*$SZ+32(%rsp)
815 movaps %xmm7,16*$SZ+48(%rsp)
816 movaps %xmm8,16*$SZ+64(%rsp)
817 movaps %xmm9,16*$SZ+80(%rsp)
818___
819$code.=<<___;
820.Lprologue_ssse3:
821
822 mov $SZ*0($ctx),$A
823 mov $SZ*1($ctx),$B
824 mov $SZ*2($ctx),$C
825 mov $SZ*3($ctx),$D
826 mov $SZ*4($ctx),$E
827 mov $SZ*5($ctx),$F
828 mov $SZ*6($ctx),$G
829 mov $SZ*7($ctx),$H
830___
831
832$code.=<<___;
833 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
834 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
835 jmp .Lloop_ssse3
836.align 16
837.Lloop_ssse3:
838 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
839 movdqu 0x00($inp),@X[0]
840 movdqu 0x10($inp),@X[1]
841 movdqu 0x20($inp),@X[2]
842 pshufb $t3,@X[0]
843 movdqu 0x30($inp),@X[3]
844 lea $TABLE(%rip),$Tbl
845 pshufb $t3,@X[1]
846 movdqa 0x00($Tbl),$t0
847 movdqa 0x20($Tbl),$t1
848 pshufb $t3,@X[2]
849 paddd @X[0],$t0
850 movdqa 0x40($Tbl),$t2
851 pshufb $t3,@X[3]
852 movdqa 0x60($Tbl),$t3
853 paddd @X[1],$t1
854 paddd @X[2],$t2
855 paddd @X[3],$t3
856 movdqa $t0,0x00(%rsp)
857 mov $A,$a1
858 movdqa $t1,0x10(%rsp)
859 mov $B,$a3
860 movdqa $t2,0x20(%rsp)
861 xor $C,$a3 # magic
862 movdqa $t3,0x30(%rsp)
863 mov $E,$a0
864 jmp .Lssse3_00_47
865
866.align 16
867.Lssse3_00_47:
868 sub \$`-16*2*$SZ`,$Tbl # size optimization
869___
870sub Xupdate_256_SSSE3 () {
871 (
872 '&movdqa ($t0,@X[1]);',
873 '&movdqa ($t3,@X[3])',
874 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
875 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
876 '&movdqa ($t1,$t0)',
877 '&movdqa ($t2,$t0);',
878 '&psrld ($t0,$sigma0[2])',
879 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
880 '&psrld ($t2,$sigma0[0])',
881 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
882 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
883 '&pxor ($t0,$t2)',
884 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
885 '&pxor ($t0,$t1)',
886 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
887 '&pxor ($t0,$t2);',
888 '&movdqa ($t2,$t3)',
889 '&pxor ($t0,$t1);', # sigma0(X[1..4])
890 '&psrld ($t3,$sigma1[2])',
891 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
892 '&psrlq ($t2,$sigma1[0])',
893 '&pxor ($t3,$t2);',
894 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
895 '&pxor ($t3,$t2)',
896 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
897 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
898 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
899 '&movdqa ($t2,$t3);',
900 '&psrld ($t3,$sigma1[2])',
901 '&psrlq ($t2,$sigma1[0])',
902 '&pxor ($t3,$t2);',
903 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
904 '&pxor ($t3,$t2);',
905 '&movdqa ($t2,16*2*$j."($Tbl)")',
906 '&pshufb ($t3,$t5)',
907 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
908 );
909}
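# Note (descriptive): the hand-interleaved variant below replaces the two
# trailing pshufb shuffles ($t4/$t5) of the formula above with equivalent
# pshufd + psrldq/pslldq sequences (see the commented-out pshufb lines),
# so the SSSE3 path never has to load those two shuffle-mask constants.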
910
911sub SSSE3_256_00_47 () {
912my $j = shift;
913my $body = shift;
914my @X = @_;
915my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
916
917 if (0) {
918 foreach (Xupdate_256_SSSE3()) { # 36 instructions
919 eval;
920 eval(shift(@insns));
921 eval(shift(@insns));
922 eval(shift(@insns));
923 }
924 } else { # squeeze extra 4% on Westmere and 19% on Atom
925 eval(shift(@insns)); #@
926 &movdqa ($t0,@X[1]);
927 eval(shift(@insns));
928 eval(shift(@insns));
929 &movdqa ($t3,@X[3]);
930 eval(shift(@insns)); #@
931 eval(shift(@insns));
932 eval(shift(@insns));
933 eval(shift(@insns)); #@
934 eval(shift(@insns));
935 &palignr ($t0,@X[0],$SZ); # X[1..4]
936 eval(shift(@insns));
937 eval(shift(@insns));
938 &palignr ($t3,@X[2],$SZ); # X[9..12]
939 eval(shift(@insns));
940 eval(shift(@insns));
941 eval(shift(@insns));
942 eval(shift(@insns)); #@
943 &movdqa ($t1,$t0);
944 eval(shift(@insns));
945 eval(shift(@insns));
946 &movdqa ($t2,$t0);
947 eval(shift(@insns)); #@
948 eval(shift(@insns));
949 &psrld ($t0,$sigma0[2]);
950 eval(shift(@insns));
951 eval(shift(@insns));
952 eval(shift(@insns));
953 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
954 eval(shift(@insns)); #@
955 eval(shift(@insns));
956 &psrld ($t2,$sigma0[0]);
957 eval(shift(@insns));
958 eval(shift(@insns));
959 &pshufd ($t3,@X[3],0b11111010); # X[14..15]
960 eval(shift(@insns));
961 eval(shift(@insns)); #@
962 &pslld ($t1,8*$SZ-$sigma0[1]);
963 eval(shift(@insns));
964 eval(shift(@insns));
965 &pxor ($t0,$t2);
966 eval(shift(@insns)); #@
967 eval(shift(@insns));
968 eval(shift(@insns));
969 eval(shift(@insns)); #@
970 &psrld ($t2,$sigma0[1]-$sigma0[0]);
971 eval(shift(@insns));
972 &pxor ($t0,$t1);
973 eval(shift(@insns));
974 eval(shift(@insns));
975 &pslld ($t1,$sigma0[1]-$sigma0[0]);
976 eval(shift(@insns));
977 eval(shift(@insns));
978 &pxor ($t0,$t2);
979 eval(shift(@insns));
980 eval(shift(@insns)); #@
981 &movdqa ($t2,$t3);
982 eval(shift(@insns));
983 eval(shift(@insns));
984 &pxor ($t0,$t1); # sigma0(X[1..4])
985 eval(shift(@insns)); #@
986 eval(shift(@insns));
987 eval(shift(@insns));
988 &psrld ($t3,$sigma1[2]);
989 eval(shift(@insns));
990 eval(shift(@insns));
991 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
992 eval(shift(@insns)); #@
993 eval(shift(@insns));
994 &psrlq ($t2,$sigma1[0]);
995 eval(shift(@insns));
996 eval(shift(@insns));
997 eval(shift(@insns));
998 &pxor ($t3,$t2);
999 eval(shift(@insns)); #@
1000 eval(shift(@insns));
1001 eval(shift(@insns));
1002 eval(shift(@insns)); #@
1003 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1004 eval(shift(@insns));
1005 eval(shift(@insns));
1006 &pxor ($t3,$t2);
1007 eval(shift(@insns)); #@
1008 eval(shift(@insns));
1009 eval(shift(@insns));
1010 #&pshufb ($t3,$t4); # sigma1(X[14..15])
1011 &pshufd ($t3,$t3,0b10000000);
1012 eval(shift(@insns));
1013 eval(shift(@insns));
1014 eval(shift(@insns));
1015 &psrldq ($t3,8);
1016 eval(shift(@insns));
1017 eval(shift(@insns)); #@
1018 eval(shift(@insns));
1019 eval(shift(@insns));
1020 eval(shift(@insns)); #@
1021 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1022 eval(shift(@insns));
1023 eval(shift(@insns));
1024 eval(shift(@insns));
1025 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
1026 eval(shift(@insns));
1027 eval(shift(@insns)); #@
1028 eval(shift(@insns));
1029 &movdqa ($t2,$t3);
1030 eval(shift(@insns));
1031 eval(shift(@insns));
1032 &psrld ($t3,$sigma1[2]);
1033 eval(shift(@insns));
1034 eval(shift(@insns)); #@
1035 &psrlq ($t2,$sigma1[0]);
1036 eval(shift(@insns));
1037 eval(shift(@insns));
1038 &pxor ($t3,$t2);
1039 eval(shift(@insns)); #@
1040 eval(shift(@insns));
1041 eval(shift(@insns));
1042 eval(shift(@insns)); #@
1043 eval(shift(@insns));
1044 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1045 eval(shift(@insns));
1046 eval(shift(@insns));
1047 eval(shift(@insns));
1048 &pxor ($t3,$t2);
1049 eval(shift(@insns));
1050 eval(shift(@insns));
1051 eval(shift(@insns)); #@
1052 #&pshufb ($t3,$t5);
1053 &pshufd ($t3,$t3,0b00001000);
1054 eval(shift(@insns));
1055 eval(shift(@insns));
1056 &movdqa ($t2,16*2*$j."($Tbl)");
1057 eval(shift(@insns)); #@
1058 eval(shift(@insns));
1059 &pslldq ($t3,8);
1060 eval(shift(@insns));
1061 eval(shift(@insns));
1062 eval(shift(@insns));
1063 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1064 eval(shift(@insns)); #@
1065 eval(shift(@insns));
1066 eval(shift(@insns));
1067 }
1068 &paddd ($t2,@X[0]);
1069 foreach (@insns) { eval; } # remaining instructions
1070 &movdqa (16*$j."(%rsp)",$t2);
1071}
1072
1073 for ($i=0,$j=0; $j<4; $j++) {
1074 &SSSE3_256_00_47($j,\&body_00_15,@X);
1075 push(@X,shift(@X)); # rotate(@X)
1076 }
1077 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1078 &jne (".Lssse3_00_47");
1079
1080 for ($i=0; $i<16; ) {
1081 foreach(body_00_15()) { eval; }
1082 }
1083$code.=<<___;
1084 mov $_ctx,$ctx
1085 mov $a1,$A
1086
1087 add $SZ*0($ctx),$A
1088 lea 16*$SZ($inp),$inp
1089 add $SZ*1($ctx),$B
1090 add $SZ*2($ctx),$C
1091 add $SZ*3($ctx),$D
1092 add $SZ*4($ctx),$E
1093 add $SZ*5($ctx),$F
1094 add $SZ*6($ctx),$G
1095 add $SZ*7($ctx),$H
1096
1097 cmp $_end,$inp
1098
1099 mov $A,$SZ*0($ctx)
1100 mov $B,$SZ*1($ctx)
1101 mov $C,$SZ*2($ctx)
1102 mov $D,$SZ*3($ctx)
1103 mov $E,$SZ*4($ctx)
1104 mov $F,$SZ*5($ctx)
1105 mov $G,$SZ*6($ctx)
1106 mov $H,$SZ*7($ctx)
1107 jb .Lloop_ssse3
1108
1109 mov $_rsp,%rsi
1110.cfi_def_cfa %rsi,8
1111___
1112$code.=<<___ if ($win64);
1113 movaps 16*$SZ+32(%rsp),%xmm6
1114 movaps 16*$SZ+48(%rsp),%xmm7
1115 movaps 16*$SZ+64(%rsp),%xmm8
1116 movaps 16*$SZ+80(%rsp),%xmm9
1117___
1118$code.=<<___;
1119 mov -48(%rsi),%r15
1120.cfi_restore %r15
1121 mov -40(%rsi),%r14
1122.cfi_restore %r14
1123 mov -32(%rsi),%r13
1124.cfi_restore %r13
1125 mov -24(%rsi),%r12
1126.cfi_restore %r12
1127 mov -16(%rsi),%rbp
1128.cfi_restore %rbp
1129 mov -8(%rsi),%rbx
1130.cfi_restore %rbx
1131 lea (%rsi),%rsp
1132.cfi_def_cfa_register %rsp
1133.Lepilogue_ssse3:
1134 ret
1135.cfi_endproc
1136.size ${func}_ssse3,.-${func}_ssse3
1137___
1138}
1139
1140if ($avx) {{
1141######################################################################
1142# XOP code path
1143#
1144if ($SZ==8) { # SHA512 only
1145$code.=<<___;
1146.type ${func}_xop,\@function,3
1147.align 64
1148${func}_xop:
1149.cfi_startproc
1150.Lxop_shortcut:
1151 mov %rsp,%rax # copy %rsp
1152.cfi_def_cfa_register %rax
1153 push %rbx
1154.cfi_push %rbx
1155 push %rbp
1156.cfi_push %rbp
1157 push %r12
1158.cfi_push %r12
1159 push %r13
1160.cfi_push %r13
1161 push %r14
1162.cfi_push %r14
1163 push %r15
1164.cfi_push %r15
1165 shl \$4,%rdx # num*16
1166 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1167 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1168 and \$-64,%rsp # align stack frame
1169 mov $ctx,$_ctx # save ctx, 1st arg
1170 mov $inp,$_inp # save inp, 2nd arg
1171 mov %rdx,$_end # save end pointer, "3rd" arg
1172 mov %rax,$_rsp # save copy of %rsp
1173.cfi_cfa_expression $_rsp,deref,+8
1174___
1175$code.=<<___ if ($win64);
1176 movaps %xmm6,16*$SZ+32(%rsp)
1177 movaps %xmm7,16*$SZ+48(%rsp)
1178 movaps %xmm8,16*$SZ+64(%rsp)
1179 movaps %xmm9,16*$SZ+80(%rsp)
1180___
1181$code.=<<___ if ($win64 && $SZ>4);
1182 movaps %xmm10,16*$SZ+96(%rsp)
1183 movaps %xmm11,16*$SZ+112(%rsp)
1184___
1185$code.=<<___;
1186.Lprologue_xop:
1187
1188 vzeroupper
1189 mov $SZ*0($ctx),$A
1190 mov $SZ*1($ctx),$B
1191 mov $SZ*2($ctx),$C
1192 mov $SZ*3($ctx),$D
1193 mov $SZ*4($ctx),$E
1194 mov $SZ*5($ctx),$F
1195 mov $SZ*6($ctx),$G
1196 mov $SZ*7($ctx),$H
1197 jmp .Lloop_xop
1198___
1199 if ($SZ==4) { # SHA256
1200 my @X = map("%xmm$_",(0..3));
1201 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1202
1203$code.=<<___;
1204.align 16
1205.Lloop_xop:
1206 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1207 vmovdqu 0x00($inp),@X[0]
1208 vmovdqu 0x10($inp),@X[1]
1209 vmovdqu 0x20($inp),@X[2]
1210 vmovdqu 0x30($inp),@X[3]
1211 vpshufb $t3,@X[0],@X[0]
1212 lea $TABLE(%rip),$Tbl
1213 vpshufb $t3,@X[1],@X[1]
1214 vpshufb $t3,@X[2],@X[2]
1215 vpaddd 0x00($Tbl),@X[0],$t0
1216 vpshufb $t3,@X[3],@X[3]
1217 vpaddd 0x20($Tbl),@X[1],$t1
1218 vpaddd 0x40($Tbl),@X[2],$t2
1219 vpaddd 0x60($Tbl),@X[3],$t3
1220 vmovdqa $t0,0x00(%rsp)
1221 mov $A,$a1
1222 vmovdqa $t1,0x10(%rsp)
1223 mov $B,$a3
1224 vmovdqa $t2,0x20(%rsp)
1225 xor $C,$a3 # magic
1226 vmovdqa $t3,0x30(%rsp)
1227 mov $E,$a0
1228 jmp .Lxop_00_47
1229
1230.align 16
1231.Lxop_00_47:
1232 sub \$`-16*2*$SZ`,$Tbl # size optimization
1233___
1234sub XOP_256_00_47 () {
1235my $j = shift;
1236my $body = shift;
1237my @X = @_;
1238my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1239
1240 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1241 eval(shift(@insns));
1242 eval(shift(@insns));
1243 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1244 eval(shift(@insns));
1245 eval(shift(@insns));
1246 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1247 eval(shift(@insns));
1248 eval(shift(@insns));
1249 &vpsrld ($t0,$t0,$sigma0[2]);
1250 eval(shift(@insns));
1251 eval(shift(@insns));
1252 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1253 eval(shift(@insns));
1254 eval(shift(@insns));
1255 eval(shift(@insns));
1256 eval(shift(@insns));
1257 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1258 eval(shift(@insns));
1259 eval(shift(@insns));
1260 &vpxor ($t0,$t0,$t1);
1261 eval(shift(@insns));
1262 eval(shift(@insns));
1263 eval(shift(@insns));
1264 eval(shift(@insns));
1265 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1266 eval(shift(@insns));
1267 eval(shift(@insns));
1268 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1269 eval(shift(@insns));
1270 eval(shift(@insns));
1271 &vpsrld ($t2,@X[3],$sigma1[2]);
1272 eval(shift(@insns));
1273 eval(shift(@insns));
1274 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1275 eval(shift(@insns));
1276 eval(shift(@insns));
1277 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1278 eval(shift(@insns));
1279 eval(shift(@insns));
1280 &vpxor ($t3,$t3,$t2);
1281 eval(shift(@insns));
1282 eval(shift(@insns));
1283 eval(shift(@insns));
1284 eval(shift(@insns));
1285 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1286 eval(shift(@insns));
1287 eval(shift(@insns));
1288 eval(shift(@insns));
1289 eval(shift(@insns));
1290 &vpsrldq ($t3,$t3,8);
1291 eval(shift(@insns));
1292 eval(shift(@insns));
1293 eval(shift(@insns));
1294 eval(shift(@insns));
1295 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1296 eval(shift(@insns));
1297 eval(shift(@insns));
1298 eval(shift(@insns));
1299 eval(shift(@insns));
1300 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1301 eval(shift(@insns));
1302 eval(shift(@insns));
1303 &vpsrld ($t2,@X[0],$sigma1[2]);
1304 eval(shift(@insns));
1305 eval(shift(@insns));
1306 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1307 eval(shift(@insns));
1308 eval(shift(@insns));
1309 &vpxor ($t3,$t3,$t2);
1310 eval(shift(@insns));
1311 eval(shift(@insns));
1312 eval(shift(@insns));
1313 eval(shift(@insns));
1314 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1315 eval(shift(@insns));
1316 eval(shift(@insns));
1317 eval(shift(@insns));
1318 eval(shift(@insns));
1319 &vpslldq ($t3,$t3,8); # 22 instructions
1320 eval(shift(@insns));
1321 eval(shift(@insns));
1322 eval(shift(@insns));
1323 eval(shift(@insns));
1324 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1325 eval(shift(@insns));
1326 eval(shift(@insns));
1327 eval(shift(@insns));
1328 eval(shift(@insns));
1329 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1330 foreach (@insns) { eval; } # remaining instructions
1331 &vmovdqa (16*$j."(%rsp)",$t2);
1332}
1333
1334 for ($i=0,$j=0; $j<4; $j++) {
1335 &XOP_256_00_47($j,\&body_00_15,@X);
1336 push(@X,shift(@X)); # rotate(@X)
1337 }
1338 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1339 &jne (".Lxop_00_47");
1340
1341 for ($i=0; $i<16; ) {
1342 foreach(body_00_15()) { eval; }
1343 }
1344
1345 } else { # SHA512
1346 my @X = map("%xmm$_",(0..7));
1347 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1348
1349$code.=<<___;
1350.align 16
1351.Lloop_xop:
1352 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1353 vmovdqu 0x00($inp),@X[0]
1354 lea $TABLE+0x80(%rip),$Tbl # size optimization
1355 vmovdqu 0x10($inp),@X[1]
1356 vmovdqu 0x20($inp),@X[2]
1357 vpshufb $t3,@X[0],@X[0]
1358 vmovdqu 0x30($inp),@X[3]
1359 vpshufb $t3,@X[1],@X[1]
1360 vmovdqu 0x40($inp),@X[4]
1361 vpshufb $t3,@X[2],@X[2]
1362 vmovdqu 0x50($inp),@X[5]
1363 vpshufb $t3,@X[3],@X[3]
1364 vmovdqu 0x60($inp),@X[6]
1365 vpshufb $t3,@X[4],@X[4]
1366 vmovdqu 0x70($inp),@X[7]
1367 vpshufb $t3,@X[5],@X[5]
1368 vpaddq -0x80($Tbl),@X[0],$t0
1369 vpshufb $t3,@X[6],@X[6]
1370 vpaddq -0x60($Tbl),@X[1],$t1
1371 vpshufb $t3,@X[7],@X[7]
1372 vpaddq -0x40($Tbl),@X[2],$t2
1373 vpaddq -0x20($Tbl),@X[3],$t3
1374 vmovdqa $t0,0x00(%rsp)
1375 vpaddq 0x00($Tbl),@X[4],$t0
1376 vmovdqa $t1,0x10(%rsp)
1377 vpaddq 0x20($Tbl),@X[5],$t1
1378 vmovdqa $t2,0x20(%rsp)
1379 vpaddq 0x40($Tbl),@X[6],$t2
1380 vmovdqa $t3,0x30(%rsp)
1381 vpaddq 0x60($Tbl),@X[7],$t3
1382 vmovdqa $t0,0x40(%rsp)
1383 mov $A,$a1
1384 vmovdqa $t1,0x50(%rsp)
1385 mov $B,$a3
1386 vmovdqa $t2,0x60(%rsp)
1387 xor $C,$a3 # magic
1388 vmovdqa $t3,0x70(%rsp)
1389 mov $E,$a0
1390 jmp .Lxop_00_47
1391
1392.align 16
1393.Lxop_00_47:
1394 add \$`16*2*$SZ`,$Tbl
1395___
1396sub XOP_512_00_47 () {
1397my $j = shift;
1398my $body = shift;
1399my @X = @_;
1400my @insns = (&$body,&$body); # 52 instructions
1401
1402 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1403 eval(shift(@insns));
1404 eval(shift(@insns));
1405 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1406 eval(shift(@insns));
1407 eval(shift(@insns));
1408 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1409 eval(shift(@insns));
1410 eval(shift(@insns));
1411 &vpsrlq ($t0,$t0,$sigma0[2]);
1412 eval(shift(@insns));
1413 eval(shift(@insns));
1414 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1415 eval(shift(@insns));
1416 eval(shift(@insns));
1417 eval(shift(@insns));
1418 eval(shift(@insns));
1419 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1420 eval(shift(@insns));
1421 eval(shift(@insns));
1422 &vpxor ($t0,$t0,$t1);
1423 eval(shift(@insns));
1424 eval(shift(@insns));
1425 eval(shift(@insns));
1426 eval(shift(@insns));
1427 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1428 eval(shift(@insns));
1429 eval(shift(@insns));
1430 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1431 eval(shift(@insns));
1432 eval(shift(@insns));
1433 &vpsrlq ($t2,@X[7],$sigma1[2]);
1434 eval(shift(@insns));
1435 eval(shift(@insns));
1436 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1437 eval(shift(@insns));
1438 eval(shift(@insns));
1439 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1440 eval(shift(@insns));
1441 eval(shift(@insns));
1442 &vpxor ($t3,$t3,$t2);
1443 eval(shift(@insns));
1444 eval(shift(@insns));
1445 eval(shift(@insns));
1446 eval(shift(@insns));
1447 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1448 eval(shift(@insns));
1449 eval(shift(@insns));
1450 eval(shift(@insns));
1451 eval(shift(@insns));
1452 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1453 eval(shift(@insns));
1454 eval(shift(@insns));
1455 eval(shift(@insns));
1456 eval(shift(@insns));
1457 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1458 foreach (@insns) { eval; } # remaining instructions
1459 &vmovdqa (16*$j."(%rsp)",$t2);
1460}
1461
1462 for ($i=0,$j=0; $j<8; $j++) {
1463 &XOP_512_00_47($j,\&body_00_15,@X);
1464 push(@X,shift(@X)); # rotate(@X)
1465 }
1466 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1467 &jne (".Lxop_00_47");
1468
1469 for ($i=0; $i<16; ) {
1470 foreach(body_00_15()) { eval; }
1471 }
1472}
1473$code.=<<___;
1474 mov $_ctx,$ctx
1475 mov $a1,$A
1476
1477 add $SZ*0($ctx),$A
1478 lea 16*$SZ($inp),$inp
1479 add $SZ*1($ctx),$B
1480 add $SZ*2($ctx),$C
1481 add $SZ*3($ctx),$D
1482 add $SZ*4($ctx),$E
1483 add $SZ*5($ctx),$F
1484 add $SZ*6($ctx),$G
1485 add $SZ*7($ctx),$H
1486
1487 cmp $_end,$inp
1488
1489 mov $A,$SZ*0($ctx)
1490 mov $B,$SZ*1($ctx)
1491 mov $C,$SZ*2($ctx)
1492 mov $D,$SZ*3($ctx)
1493 mov $E,$SZ*4($ctx)
1494 mov $F,$SZ*5($ctx)
1495 mov $G,$SZ*6($ctx)
1496 mov $H,$SZ*7($ctx)
1497 jb .Lloop_xop
1498
1499 mov $_rsp,%rsi
1500.cfi_def_cfa %rsi,8
1501 vzeroupper
1502___
1503$code.=<<___ if ($win64);
1504 movaps 16*$SZ+32(%rsp),%xmm6
1505 movaps 16*$SZ+48(%rsp),%xmm7
1506 movaps 16*$SZ+64(%rsp),%xmm8
1507 movaps 16*$SZ+80(%rsp),%xmm9
1508___
1509$code.=<<___ if ($win64 && $SZ>4);
1510 movaps 16*$SZ+96(%rsp),%xmm10
1511 movaps 16*$SZ+112(%rsp),%xmm11
1512___
1513$code.=<<___;
1514 mov -48(%rsi),%r15
1515.cfi_restore %r15
1516 mov -40(%rsi),%r14
1517.cfi_restore %r14
1518 mov -32(%rsi),%r13
1519.cfi_restore %r13
1520 mov -24(%rsi),%r12
1521.cfi_restore %r12
1522 mov -16(%rsi),%rbp
1523.cfi_restore %rbp
1524 mov -8(%rsi),%rbx
1525.cfi_restore %rbx
1526 lea (%rsi),%rsp
1527.cfi_def_cfa_register %rsp
1528.Lepilogue_xop:
1529 ret
1530.cfi_endproc
1531.size ${func}_xop,.-${func}_xop
1532___
1533}
1534######################################################################
1535# AVX+shrd code path
1536#
1537local *ror = sub { &shrd(@_[0],@_) };
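# From this point on &ror() is re-routed to shrd: with identical source and
# destination registers, "shrd $imm,%reg,%reg" produces the same result as
# "ror $imm,%reg", and this substitution is what footnote (**) in the
# performance table at the top of the file refers to.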
1538
1539$code.=<<___;
1540.type ${func}_avx,\@function,3
1541.align 64
1542${func}_avx:
1543.cfi_startproc
1544.Lavx_shortcut:
1545 mov %rsp,%rax # copy %rsp
1546.cfi_def_cfa_register %rax
1547 push %rbx
1548.cfi_push %rbx
1549 push %rbp
1550.cfi_push %rbp
1551 push %r12
1552.cfi_push %r12
1553 push %r13
1554.cfi_push %r13
1555 push %r14
1556.cfi_push %r14
1557 push %r15
1558.cfi_push %r15
1559 shl \$4,%rdx # num*16
1560 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1561 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1562 and \$-64,%rsp # align stack frame
1563 mov $ctx,$_ctx # save ctx, 1st arg
1564 mov $inp,$_inp # save inp, 2nd arg
1565 mov %rdx,$_end # save end pointer, "3rd" arg
1566 mov %rax,$_rsp # save copy of %rsp
1567.cfi_cfa_expression $_rsp,deref,+8
1568___
1569$code.=<<___ if ($win64);
1570 movaps %xmm6,16*$SZ+32(%rsp)
1571 movaps %xmm7,16*$SZ+48(%rsp)
1572 movaps %xmm8,16*$SZ+64(%rsp)
1573 movaps %xmm9,16*$SZ+80(%rsp)
1574___
1575$code.=<<___ if ($win64 && $SZ>4);
1576 movaps %xmm10,16*$SZ+96(%rsp)
1577 movaps %xmm11,16*$SZ+112(%rsp)
1578___
1579$code.=<<___;
1580.Lprologue_avx:
1581
1582 vzeroupper
1583 mov $SZ*0($ctx),$A
1584 mov $SZ*1($ctx),$B
1585 mov $SZ*2($ctx),$C
1586 mov $SZ*3($ctx),$D
1587 mov $SZ*4($ctx),$E
1588 mov $SZ*5($ctx),$F
1589 mov $SZ*6($ctx),$G
1590 mov $SZ*7($ctx),$H
1591___
1592 if ($SZ==4) { # SHA256
1593 my @X = map("%xmm$_",(0..3));
1594 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1595
1596$code.=<<___;
1597 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1598 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1599 jmp .Lloop_avx
1600.align 16
1601.Lloop_avx:
1602 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1603 vmovdqu 0x00($inp),@X[0]
1604 vmovdqu 0x10($inp),@X[1]
1605 vmovdqu 0x20($inp),@X[2]
1606 vmovdqu 0x30($inp),@X[3]
1607 vpshufb $t3,@X[0],@X[0]
1608 lea $TABLE(%rip),$Tbl
1609 vpshufb $t3,@X[1],@X[1]
1610 vpshufb $t3,@X[2],@X[2]
1611 vpaddd 0x00($Tbl),@X[0],$t0
1612 vpshufb $t3,@X[3],@X[3]
1613 vpaddd 0x20($Tbl),@X[1],$t1
1614 vpaddd 0x40($Tbl),@X[2],$t2
1615 vpaddd 0x60($Tbl),@X[3],$t3
1616 vmovdqa $t0,0x00(%rsp)
1617 mov $A,$a1
1618 vmovdqa $t1,0x10(%rsp)
1619 mov $B,$a3
1620 vmovdqa $t2,0x20(%rsp)
1621 xor $C,$a3 # magic
1622 vmovdqa $t3,0x30(%rsp)
1623 mov $E,$a0
1624 jmp .Lavx_00_47
1625
1626.align 16
1627.Lavx_00_47:
1628 sub \$`-16*2*$SZ`,$Tbl # size optimization
1629___
1630sub Xupdate_256_AVX () {
1631 (
1632 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1633 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1634 '&vpsrld ($t2,$t0,$sigma0[0]);',
1635 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1636 '&vpsrld ($t3,$t0,$sigma0[2])',
1637 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1638 '&vpxor ($t0,$t3,$t2)',
1639 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1640 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1641 '&vpxor ($t0,$t0,$t1)',
1642 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1643 '&vpxor ($t0,$t0,$t2)',
1644 '&vpsrld ($t2,$t3,$sigma1[2]);',
1645 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1646 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1647 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1648 '&vpxor ($t2,$t2,$t3);',
1649 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1650 '&vpxor ($t2,$t2,$t3)',
1651 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1652 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1653 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1654 '&vpsrld ($t2,$t3,$sigma1[2])',
1655 '&vpsrlq ($t3,$t3,$sigma1[0])',
1656 '&vpxor ($t2,$t2,$t3);',
1657 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1658 '&vpxor ($t2,$t2,$t3)',
1659 '&vpshufb ($t2,$t2,$t5)',
1660 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1661 );
1662}
1663
1664sub AVX_256_00_47 () {
1665my $j = shift;
1666my $body = shift;
1667my @X = @_;
1668my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1669
1670 foreach (Xupdate_256_AVX()) { # 29 instructions
1671 eval;
1672 eval(shift(@insns));
1673 eval(shift(@insns));
1674 eval(shift(@insns));
1675 }
1676 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1677 foreach (@insns) { eval; } # remaining instructions
1678 &vmovdqa (16*$j."(%rsp)",$t2);
1679}
1680
1681 for ($i=0,$j=0; $j<4; $j++) {
1682 &AVX_256_00_47($j,\&body_00_15,@X);
1683 push(@X,shift(@X)); # rotate(@X)
1684 }
1685 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1686 &jne (".Lavx_00_47");
1687
1688 for ($i=0; $i<16; ) {
1689 foreach(body_00_15()) { eval; }
1690 }
1691
1692 } else { # SHA512
1693 my @X = map("%xmm$_",(0..7));
1694 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1695
1696$code.=<<___;
1697 jmp .Lloop_avx
1698.align 16
1699.Lloop_avx:
1700 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1701 vmovdqu 0x00($inp),@X[0]
1702 lea $TABLE+0x80(%rip),$Tbl # size optimization
1703 vmovdqu 0x10($inp),@X[1]
1704 vmovdqu 0x20($inp),@X[2]
1705 vpshufb $t3,@X[0],@X[0]
1706 vmovdqu 0x30($inp),@X[3]
1707 vpshufb $t3,@X[1],@X[1]
1708 vmovdqu 0x40($inp),@X[4]
1709 vpshufb $t3,@X[2],@X[2]
1710 vmovdqu 0x50($inp),@X[5]
1711 vpshufb $t3,@X[3],@X[3]
1712 vmovdqu 0x60($inp),@X[6]
1713 vpshufb $t3,@X[4],@X[4]
1714 vmovdqu 0x70($inp),@X[7]
1715 vpshufb $t3,@X[5],@X[5]
1716 vpaddq -0x80($Tbl),@X[0],$t0
1717 vpshufb $t3,@X[6],@X[6]
1718 vpaddq -0x60($Tbl),@X[1],$t1
1719 vpshufb $t3,@X[7],@X[7]
1720 vpaddq -0x40($Tbl),@X[2],$t2
1721 vpaddq -0x20($Tbl),@X[3],$t3
1722 vmovdqa $t0,0x00(%rsp)
1723 vpaddq 0x00($Tbl),@X[4],$t0
1724 vmovdqa $t1,0x10(%rsp)
1725 vpaddq 0x20($Tbl),@X[5],$t1
1726 vmovdqa $t2,0x20(%rsp)
1727 vpaddq 0x40($Tbl),@X[6],$t2
1728 vmovdqa $t3,0x30(%rsp)
1729 vpaddq 0x60($Tbl),@X[7],$t3
1730 vmovdqa $t0,0x40(%rsp)
1731 mov $A,$a1
1732 vmovdqa $t1,0x50(%rsp)
1733 mov $B,$a3
1734 vmovdqa $t2,0x60(%rsp)
1735 xor $C,$a3 # magic
1736 vmovdqa $t3,0x70(%rsp)
1737 mov $E,$a0
1738 jmp .Lavx_00_47
1739
1740.align 16
1741.Lavx_00_47:
1742 add \$`16*2*$SZ`,$Tbl
1743___
1744sub Xupdate_512_AVX () {
1745 (
1746 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1747 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1748 '&vpsrlq ($t2,$t0,$sigma0[0])',
1749 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1750 '&vpsrlq ($t3,$t0,$sigma0[2])',
1751 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1752 '&vpxor ($t0,$t3,$t2)',
1753 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1754 '&vpxor ($t0,$t0,$t1)',
1755 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1756 '&vpxor ($t0,$t0,$t2)',
1757 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1758 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1759 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1760 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1761 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1762 '&vpxor ($t3,$t3,$t2)',
1763 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1764 '&vpxor ($t3,$t3,$t1)',
1765 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1766 '&vpxor ($t3,$t3,$t2)',
1767 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1768 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1769 );
1770}
1771
1772sub AVX_512_00_47 () {
1773my $j = shift;
1774my $body = shift;
1775my @X = @_;
1776my @insns = (&$body,&$body); # 52 instructions
1777
1778 foreach (Xupdate_512_AVX()) { # 23 instructions
1779 eval;
1780 eval(shift(@insns));
1781 eval(shift(@insns));
1782 }
1783 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1784 foreach (@insns) { eval; } # remaining instructions
1785 &vmovdqa (16*$j."(%rsp)",$t2);
1786}
1787
1788 for ($i=0,$j=0; $j<8; $j++) {
1789 &AVX_512_00_47($j,\&body_00_15,@X);
1790 push(@X,shift(@X)); # rotate(@X)
1791 }
1792 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1793 &jne (".Lavx_00_47");
1794
1795 for ($i=0; $i<16; ) {
1796 foreach(body_00_15()) { eval; }
1797 }
1798}
1799$code.=<<___;
1800 mov $_ctx,$ctx
1801 mov $a1,$A
1802
1803 add $SZ*0($ctx),$A
1804 lea 16*$SZ($inp),$inp
1805 add $SZ*1($ctx),$B
1806 add $SZ*2($ctx),$C
1807 add $SZ*3($ctx),$D
1808 add $SZ*4($ctx),$E
1809 add $SZ*5($ctx),$F
1810 add $SZ*6($ctx),$G
1811 add $SZ*7($ctx),$H
1812
1813 cmp $_end,$inp
1814
1815 mov $A,$SZ*0($ctx)
1816 mov $B,$SZ*1($ctx)
1817 mov $C,$SZ*2($ctx)
1818 mov $D,$SZ*3($ctx)
1819 mov $E,$SZ*4($ctx)
1820 mov $F,$SZ*5($ctx)
1821 mov $G,$SZ*6($ctx)
1822 mov $H,$SZ*7($ctx)
1823 jb .Lloop_avx
1824
1825 mov $_rsp,%rsi
1826.cfi_def_cfa %rsi,8
1827 vzeroupper
1828___
1829$code.=<<___ if ($win64);
1830 movaps 16*$SZ+32(%rsp),%xmm6
1831 movaps 16*$SZ+48(%rsp),%xmm7
1832 movaps 16*$SZ+64(%rsp),%xmm8
1833 movaps 16*$SZ+80(%rsp),%xmm9
1834___
1835$code.=<<___ if ($win64 && $SZ>4);
1836 movaps 16*$SZ+96(%rsp),%xmm10
1837 movaps 16*$SZ+112(%rsp),%xmm11
1838___
1839$code.=<<___;
1840 mov -48(%rsi),%r15
1841.cfi_restore %r15
1842 mov -40(%rsi),%r14
1843.cfi_restore %r14
1844 mov -32(%rsi),%r13
1845.cfi_restore %r13
1846 mov -24(%rsi),%r12
1847.cfi_restore %r12
1848 mov -16(%rsi),%rbp
1849.cfi_restore %rbp
1850 mov -8(%rsi),%rbx
1851.cfi_restore %rbx
1852 lea (%rsi),%rsp
1853.cfi_def_cfa_register %rsp
1854.Lepilogue_avx:
1855 ret
1856.cfi_endproc
1857.size ${func}_avx,.-${func}_avx
1858___
1859
1860if ($avx>1) {{
1861######################################################################
1862# AVX2+BMI code path
1863#
1864my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1865my $PUSH8=8*2*$SZ;
1866use integer;
1867
1868sub bodyx_00_15 () {
1869 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
1870 (
1871 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1872
1873 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1874 '&and ($a4,$e)', # f&e
1875 '&rorx ($a0,$e,$Sigma1[2])',
1876 '&rorx ($a2,$e,$Sigma1[1])',
1877
1878 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1879 '&lea ($h,"($h,$a4)")',
1880 '&andn ($a4,$e,$g)', # ~e&g
1881 '&xor ($a0,$a2)',
1882
1883 '&rorx ($a1,$e,$Sigma1[0])',
1884 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1885 '&xor ($a0,$a1)', # Sigma1(e)
1886 '&mov ($a2,$a)',
1887
1888 '&rorx ($a4,$a,$Sigma0[2])',
1889 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1890 '&xor ($a2,$b)', # a^b, b^c in next round
1891 '&rorx ($a1,$a,$Sigma0[1])',
1892
1893 '&rorx ($a0,$a,$Sigma0[0])',
1894 '&lea ($d,"($d,$h)")', # d+=h
1895 '&and ($a3,$a2)', # (b^c)&(a^b)
1896 '&xor ($a1,$a4)',
1897
1898 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1899 '&xor ($a1,$a0)', # Sigma0(a)
1900 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1901 '&mov ($a4,$e)', # copy of f in future
1902
1903 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1904 );
1905 # and at the finish one has to $a+=$a1
1906}
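# Unlike body_00_15 above, this BMI flavour computes Ch(e,f,g) as
# (e AND f) + (NOT e AND g) using andn; the two terms never have a bit set
# in the same position, so adding them into $h is equivalent to the usual
# ((f^g)&e)^g form used by the scalar path.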
1907
1908$code.=<<___;
1909.type ${func}_avx2,\@function,3
1910.align 64
1911${func}_avx2:
1912.cfi_startproc
1913.Lavx2_shortcut:
1914 mov %rsp,%rax # copy %rsp
1915.cfi_def_cfa_register %rax
1916 push %rbx
1917.cfi_push %rbx
1918 push %rbp
1919.cfi_push %rbp
1920 push %r12
1921.cfi_push %r12
1922 push %r13
1923.cfi_push %r13
1924 push %r14
1925.cfi_push %r14
1926 push %r15
1927.cfi_push %r15
1928 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1929 shl \$4,%rdx # num*16
1930 and \$-256*$SZ,%rsp # align stack frame
1931 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1932 add \$`2*$SZ*($rounds-8)`,%rsp
1933 mov $ctx,$_ctx # save ctx, 1st arg
1934 mov $inp,$_inp # save inp, 2nd arg
1935 mov %rdx,$_end # save end pointer, "3rd" arg
1936 mov %rax,$_rsp # save copy of %rsp
1937.cfi_cfa_expression $_rsp,deref,+8
1938___
1939$code.=<<___ if ($win64);
1940 movaps %xmm6,16*$SZ+32(%rsp)
1941 movaps %xmm7,16*$SZ+48(%rsp)
1942 movaps %xmm8,16*$SZ+64(%rsp)
1943 movaps %xmm9,16*$SZ+80(%rsp)
1944___
1945$code.=<<___ if ($win64 && $SZ>4);
1946 movaps %xmm10,16*$SZ+96(%rsp)
1947 movaps %xmm11,16*$SZ+112(%rsp)
1948___
1949$code.=<<___;
1950.Lprologue_avx2:
1951
1952 vzeroupper
1953 sub \$-16*$SZ,$inp # inp++, size optimization
1954 mov $SZ*0($ctx),$A
1955 mov $inp,%r12 # borrow $T1
1956 mov $SZ*1($ctx),$B
1957 cmp %rdx,$inp # $_end
1958 mov $SZ*2($ctx),$C
1959 cmove %rsp,%r12 # next block or random data
1960 mov $SZ*3($ctx),$D
1961 mov $SZ*4($ctx),$E
1962 mov $SZ*5($ctx),$F
1963 mov $SZ*6($ctx),$G
1964 mov $SZ*7($ctx),$H
1965___
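# The AVX2 path computes the message schedule for two blocks per .Loop_avx2
# iteration: the low 128-bit lanes of the ymm registers hold the current
# block, while vinserti128 loads the following block (or, via the cmove
# above, harmless stack data when processing the last block) from %r12 into
# the high lanes.  The high-lane X[i]+K[i] values stashed on the stack are
# consumed later by .Lower_avx2.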
1966 if ($SZ==4) { # SHA256
1967 my @X = map("%ymm$_",(0..3));
1968 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1969
1970$code.=<<___;
1971 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1972 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1973 jmp .Loop_avx2
1974.align 16
1975.Loop_avx2:
1976 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1977 vmovdqu -16*$SZ+0($inp),%xmm0
1978 vmovdqu -16*$SZ+16($inp),%xmm1
1979 vmovdqu -16*$SZ+32($inp),%xmm2
1980 vmovdqu -16*$SZ+48($inp),%xmm3
1981 #mov $inp,$_inp # offload $inp
1982 vinserti128 \$1,(%r12),@X[0],@X[0]
1983 vinserti128 \$1,16(%r12),@X[1],@X[1]
1984 vpshufb $t3,@X[0],@X[0]
1985 vinserti128 \$1,32(%r12),@X[2],@X[2]
1986 vpshufb $t3,@X[1],@X[1]
1987 vinserti128 \$1,48(%r12),@X[3],@X[3]
1988
1989 lea $TABLE(%rip),$Tbl
1990 vpshufb $t3,@X[2],@X[2]
1991 vpaddd 0x00($Tbl),@X[0],$t0
1992 vpshufb $t3,@X[3],@X[3]
1993 vpaddd 0x20($Tbl),@X[1],$t1
1994 vpaddd 0x40($Tbl),@X[2],$t2
1995 vpaddd 0x60($Tbl),@X[3],$t3
1996 vmovdqa $t0,0x00(%rsp)
1997 xor $a1,$a1
1998 vmovdqa $t1,0x20(%rsp)
1999___
2000$code.=<<___ if (!$win64);
2001# temporarily use %rdi as frame pointer
2002 mov $_rsp,%rdi
2003.cfi_def_cfa %rdi,8
2004___
2005$code.=<<___;
2006 lea -$PUSH8(%rsp),%rsp
2007___
2008$code.=<<___ if (!$win64);
2009# the frame info is at $_rsp, but the stack is moving...
2010# so a second frame pointer is saved at -8(%rsp)
2011# that is in the red zone
2012 mov %rdi,-8(%rsp)
2013.cfi_cfa_expression %rsp-8,deref,+8
2014___
2015$code.=<<___;
2016 mov $B,$a3
2017 vmovdqa $t2,0x00(%rsp)
2018 xor $C,$a3 # magic
2019 vmovdqa $t3,0x20(%rsp)
2020 mov $F,$a4
2021 sub \$-16*2*$SZ,$Tbl # size optimization
2022 jmp .Lavx2_00_47
2023
2024.align 16
2025.Lavx2_00_47:
2026___
2027
2028sub AVX2_256_00_47 () {
2029my $j = shift;
2030my $body = shift;
2031my @X = @_;
2032my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
2033my $base = "+2*$PUSH8(%rsp)";
2034
2035 if (($j%2)==0) {
2036 &lea ("%rsp","-$PUSH8(%rsp)");
2037$code.=<<___ if (!$win64);
2038.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
2039# copy secondary frame pointer to new location again at -8(%rsp)
2040 pushq $PUSH8-8(%rsp)
2041.cfi_cfa_expression %rsp,deref,+8
2042 lea 8(%rsp),%rsp
2043.cfi_cfa_expression %rsp-8,deref,+8
2044___
2045 }
2046
2047 foreach (Xupdate_256_AVX()) { # 29 instructions
2048 eval;
2049 eval(shift(@insns));
2050 eval(shift(@insns));
2051 eval(shift(@insns));
2052 }
2053 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
2054 foreach (@insns) { eval; } # remaining instructions
2055 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2056}
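# Each AVX2_256_00_47 call emits one ymm message-schedule step
# (Xupdate_256_AVX, 29 instructions) interleaved with four scalar round
# bodies (96 instructions, roughly three of them per schedule instruction),
# so vector and integer work overlap; the fresh X[i]+K[i] values are then
# stored back to the stack slot from which later rounds will read them.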
2057
2058 for ($i=0,$j=0; $j<4; $j++) {
2059 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
2060 push(@X,shift(@X)); # rotate(@X)
2061 }
2062 &lea ($Tbl,16*2*$SZ."($Tbl)");
2063 &cmpb (($SZ-1)."($Tbl)",0);
2064 &jne (".Lavx2_00_47");
2065
2066 for ($i=0; $i<16; ) {
2067 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2068 foreach(bodyx_00_15()) { eval; }
2069 }
2070 } else { # SHA512
2071 my @X = map("%ymm$_",(0..7));
2072 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
2073
2074$code.=<<___;
2075 jmp .Loop_avx2
2076.align 16
2077.Loop_avx2:
2078 vmovdqu -16*$SZ($inp),%xmm0
2079 vmovdqu -16*$SZ+16($inp),%xmm1
2080 vmovdqu -16*$SZ+32($inp),%xmm2
2081 lea $TABLE+0x80(%rip),$Tbl # size optimization
2082 vmovdqu -16*$SZ+48($inp),%xmm3
2083 vmovdqu -16*$SZ+64($inp),%xmm4
2084 vmovdqu -16*$SZ+80($inp),%xmm5
2085 vmovdqu -16*$SZ+96($inp),%xmm6
2086 vmovdqu -16*$SZ+112($inp),%xmm7
2087 #mov $inp,$_inp # offload $inp
2088 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
2089 vinserti128 \$1,(%r12),@X[0],@X[0]
2090 vinserti128 \$1,16(%r12),@X[1],@X[1]
2091 vpshufb $t2,@X[0],@X[0]
2092 vinserti128 \$1,32(%r12),@X[2],@X[2]
2093 vpshufb $t2,@X[1],@X[1]
2094 vinserti128 \$1,48(%r12),@X[3],@X[3]
2095 vpshufb $t2,@X[2],@X[2]
2096 vinserti128 \$1,64(%r12),@X[4],@X[4]
2097 vpshufb $t2,@X[3],@X[3]
2098 vinserti128 \$1,80(%r12),@X[5],@X[5]
2099 vpshufb $t2,@X[4],@X[4]
2100 vinserti128 \$1,96(%r12),@X[6],@X[6]
2101 vpshufb $t2,@X[5],@X[5]
2102 vinserti128 \$1,112(%r12),@X[7],@X[7]
2103
2104 vpaddq -0x80($Tbl),@X[0],$t0
2105 vpshufb $t2,@X[6],@X[6]
2106 vpaddq -0x60($Tbl),@X[1],$t1
2107 vpshufb $t2,@X[7],@X[7]
2108 vpaddq -0x40($Tbl),@X[2],$t2
2109 vpaddq -0x20($Tbl),@X[3],$t3
2110 vmovdqa $t0,0x00(%rsp)
2111 vpaddq 0x00($Tbl),@X[4],$t0
2112 vmovdqa $t1,0x20(%rsp)
2113 vpaddq 0x20($Tbl),@X[5],$t1
2114 vmovdqa $t2,0x40(%rsp)
2115 vpaddq 0x40($Tbl),@X[6],$t2
2116 vmovdqa $t3,0x60(%rsp)
2117___
2118$code.=<<___ if (!$win64);
2119# temporarily use %rdi as frame pointer
2120 mov $_rsp,%rdi
2121.cfi_def_cfa %rdi,8
2122___
2123$code.=<<___;
2124 lea -$PUSH8(%rsp),%rsp
2125___
2126$code.=<<___ if (!$win64);
2127# the frame info is at $_rsp, but the stack is moving...
2128# so a second frame pointer is saved at -8(%rsp)
2129# that is in the red zone
2130 mov %rdi,-8(%rsp)
2131.cfi_cfa_expression %rsp-8,deref,+8
2132___
2133$code.=<<___;
2134 vpaddq 0x60($Tbl),@X[7],$t3
2135 vmovdqa $t0,0x00(%rsp)
2136 xor $a1,$a1
2137 vmovdqa $t1,0x20(%rsp)
2138 mov $B,$a3
2139 vmovdqa $t2,0x40(%rsp)
2140 xor $C,$a3 # magic
2141 vmovdqa $t3,0x60(%rsp)
2142 mov $F,$a4
2143 add \$16*2*$SZ,$Tbl
2144 jmp .Lavx2_00_47
2145
2146.align 16
2147.Lavx2_00_47:
2148___
2149
2150sub AVX2_512_00_47 () {
2151my $j = shift;
2152my $body = shift;
2153my @X = @_;
2154my @insns = (&$body,&$body); # 48 instructions
2155my $base = "+2*$PUSH8(%rsp)";
2156
2157 if (($j%4)==0) {
2158 &lea ("%rsp","-$PUSH8(%rsp)");
2159$code.=<<___ if (!$win64);
2160.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
2161# copy secondary frame pointer to new location again at -8(%rsp)
2162 pushq $PUSH8-8(%rsp)
2163.cfi_cfa_expression %rsp,deref,+8
2164 lea 8(%rsp),%rsp
2165.cfi_cfa_expression %rsp-8,deref,+8
2166___
2167 }
2168
2169 foreach (Xupdate_512_AVX()) { # 23 instructions
2170 eval;
2171 if ($_ !~ /\;$/) {
2172 eval(shift(@insns));
2173 eval(shift(@insns));
2174 eval(shift(@insns));
2175 }
2176 }
2177 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
2178 foreach (@insns) { eval; } # remaining instructions
2179 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2180}
2181
2182 for ($i=0,$j=0; $j<8; $j++) {
2183 &AVX2_512_00_47($j,\&bodyx_00_15,@X);
2184 push(@X,shift(@X)); # rotate(@X)
2185 }
2186 &lea ($Tbl,16*2*$SZ."($Tbl)");
2187 &cmpb (($SZ-1-0x80)."($Tbl)",0);
2188 &jne (".Lavx2_00_47");
2189
2190 for ($i=0; $i<16; ) {
2191 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2192 foreach(bodyx_00_15()) { eval; }
2193 }
2194}
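# From here on the AVX2 tail is shared by the SHA-256 and SHA-512 paths:
# $ctx is reloaded from its save slot ($_ctx), the working variables are
# folded into the hash state for the first block of the pair, and then,
# unless the final odd block has just been processed, .Lower_avx2 replays
# the rounds for the second block from the stashed high-lane X[i]+K[i]
# values before deciding whether to start another two-block iteration.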
2195$code.=<<___;
2196 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2197 add $a1,$A
2198 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2199 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
2200
2201 add $SZ*0($ctx),$A
2202 add $SZ*1($ctx),$B
2203 add $SZ*2($ctx),$C
2204 add $SZ*3($ctx),$D
2205 add $SZ*4($ctx),$E
2206 add $SZ*5($ctx),$F
2207 add $SZ*6($ctx),$G
2208 add $SZ*7($ctx),$H
2209
2210 mov $A,$SZ*0($ctx)
2211 mov $B,$SZ*1($ctx)
2212 mov $C,$SZ*2($ctx)
2213 mov $D,$SZ*3($ctx)
2214 mov $E,$SZ*4($ctx)
2215 mov $F,$SZ*5($ctx)
2216 mov $G,$SZ*6($ctx)
2217 mov $H,$SZ*7($ctx)
2218
2219 cmp `$PUSH8+2*8`($Tbl),$inp # $_end
2220 je .Ldone_avx2
2221
2222 xor $a1,$a1
2223 mov $B,$a3
2224 xor $C,$a3 # magic
2225 mov $F,$a4
2226 jmp .Lower_avx2
2227.align 16
2228.Lower_avx2:
2229___
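# .Lower_avx2 handles the second block of the pair: $Tbl now steps back down
# through the stacked schedule frames, and $base points 16 bytes into each
# 32-byte slot, i.e. at the X[i]+K[i] values that were produced in the high
# ymm lanes during the pass above.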
2230 for ($i=0; $i<8; ) {
2231 my $base="+16($Tbl)";
2232 foreach(bodyx_00_15()) { eval; }
2233 }
2234$code.=<<___;
2235 lea -$PUSH8($Tbl),$Tbl
2236 cmp %rsp,$Tbl
2237 jae .Lower_avx2
2238
2239 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2240 add $a1,$A
2241 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2242 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
2243# restore frame pointer to original location at $_rsp
2244.cfi_cfa_expression $_rsp,deref,+8
2245
2246 add $SZ*0($ctx),$A
2247 add $SZ*1($ctx),$B
2248 add $SZ*2($ctx),$C
2249 add $SZ*3($ctx),$D
2250 add $SZ*4($ctx),$E
2251 add $SZ*5($ctx),$F
2252 lea `2*16*$SZ`($inp),$inp # inp+=2
2253 add $SZ*6($ctx),$G
2254 mov $inp,%r12
2255 add $SZ*7($ctx),$H
2256 cmp $_end,$inp
2257
2258 mov $A,$SZ*0($ctx)
2259 cmove %rsp,%r12 # next block or stale data
2260 mov $B,$SZ*1($ctx)
2261 mov $C,$SZ*2($ctx)
2262 mov $D,$SZ*3($ctx)
2263 mov $E,$SZ*4($ctx)
2264 mov $F,$SZ*5($ctx)
2265 mov $G,$SZ*6($ctx)
2266 mov $H,$SZ*7($ctx)
2267
2268 jbe .Loop_avx2
2269 lea (%rsp),$Tbl
2270# temporarily use $Tbl as index to $_rsp
2271# this avoids the need to save a secondary frame pointer at -8(%rsp)
2272.cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8
2273
2274.Ldone_avx2:
2275 mov `16*$SZ+3*8`($Tbl),%rsi
2276.cfi_def_cfa %rsi,8
2277 vzeroupper
2278___
2279$code.=<<___ if ($win64);
2280 movaps 16*$SZ+32($Tbl),%xmm6
2281 movaps 16*$SZ+48($Tbl),%xmm7
2282 movaps 16*$SZ+64($Tbl),%xmm8
2283 movaps 16*$SZ+80($Tbl),%xmm9
2284___
2285$code.=<<___ if ($win64 && $SZ>4);
2286 movaps 16*$SZ+96($Tbl),%xmm10
2287 movaps 16*$SZ+112($Tbl),%xmm11
2288___
2289$code.=<<___;
2290 mov -48(%rsi),%r15
2291.cfi_restore %r15
2292 mov -40(%rsi),%r14
2293.cfi_restore %r14
2294 mov -32(%rsi),%r13
2295.cfi_restore %r13
2296 mov -24(%rsi),%r12
2297.cfi_restore %r12
2298 mov -16(%rsi),%rbp
2299.cfi_restore %rbp
2300 mov -8(%rsi),%rbx
2301.cfi_restore %rbx
2302 lea (%rsi),%rsp
2303.cfi_def_cfa_register %rsp
2304.Lepilogue_avx2:
2305 ret
2306.cfi_endproc
2307.size ${func}_avx2,.-${func}_avx2
2308___
2309}}
2310}}}}}
2311
2312# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2313# CONTEXT *context,DISPATCHER_CONTEXT *disp)
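# se_handler is the language-specific handler referenced from the .xdata
# records below: given the faulting RIP it checks whether the prologue of
# the interrupted flavour has completed, recovers the original %rsp saved at
# $_rsp, restores the caller's non-volatile GPRs from just below it and, for
# the SIMD flavours, copies the saved %xmm6+ registers back into the CONTEXT
# before handing the unwind over to RtlVirtualUnwind.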
2314if ($win64) {
2315$rec="%rcx";
2316$frame="%rdx";
2317$context="%r8";
2318$disp="%r9";
2319
2320$code.=<<___;
2321.extern __imp_RtlVirtualUnwind
2322.type se_handler,\@abi-omnipotent
2323.align 16
2324se_handler:
2325 push %rsi
2326 push %rdi
2327 push %rbx
2328 push %rbp
2329 push %r12
2330 push %r13
2331 push %r14
2332 push %r15
2333 pushfq
2334 sub \$64,%rsp
2335
2336 mov 120($context),%rax # pull context->Rax
2337 mov 248($context),%rbx # pull context->Rip
2338
2339 mov 8($disp),%rsi # disp->ImageBase
2340	mov	56($disp),%r11		# disp->HandlerData
2341
2342 mov 0(%r11),%r10d # HandlerData[0]
2343 lea (%rsi,%r10),%r10 # prologue label
2344 cmp %r10,%rbx # context->Rip<prologue label
2345 jb .Lin_prologue
2346
2347 mov 152($context),%rax # pull context->Rsp
2348
2349 mov 4(%r11),%r10d # HandlerData[1]
2350 lea (%rsi,%r10),%r10 # epilogue label
2351 cmp %r10,%rbx # context->Rip>=epilogue label
2352 jae .Lin_prologue
2353___
2354$code.=<<___ if ($avx>1);
2355 lea .Lavx2_shortcut(%rip),%r10
2356 cmp %r10,%rbx # context->Rip<avx2_shortcut
2357 jb .Lnot_in_avx2
2358
2359 and \$-256*$SZ,%rax
2360 add \$`2*$SZ*($rounds-8)`,%rax
2361.Lnot_in_avx2:
2362___
2363$code.=<<___;
2364 mov %rax,%rsi # put aside Rsp
2365 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2366
2367 mov -8(%rax),%rbx
2368 mov -16(%rax),%rbp
2369 mov -24(%rax),%r12
2370 mov -32(%rax),%r13
2371 mov -40(%rax),%r14
2372 mov -48(%rax),%r15
2373 mov %rbx,144($context) # restore context->Rbx
2374 mov %rbp,160($context) # restore context->Rbp
2375 mov %r12,216($context) # restore context->R12
2376 mov %r13,224($context) # restore context->R13
2377 mov %r14,232($context) # restore context->R14
2378 mov %r15,240($context) # restore context->R15
2379
2380 lea .Lepilogue(%rip),%r10
2381 cmp %r10,%rbx
2382 jb .Lin_prologue # non-AVX code
2383
2384 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2385 lea 512($context),%rdi # &context.Xmm6
2386 mov \$`$SZ==4?8:12`,%ecx
2387 .long 0xa548f3fc # cld; rep movsq
2388
2389.Lin_prologue:
2390 mov 8(%rax),%rdi
2391 mov 16(%rax),%rsi
2392 mov %rax,152($context) # restore context->Rsp
2393 mov %rsi,168($context) # restore context->Rsi
2394 mov %rdi,176($context) # restore context->Rdi
2395
2396 mov 40($disp),%rdi # disp->ContextRecord
2397 mov $context,%rsi # context
2398 mov \$154,%ecx # sizeof(CONTEXT)
2399 .long 0xa548f3fc # cld; rep movsq
2400
2401 mov $disp,%rsi
2402 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2403 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2404 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2405 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2406 mov 40(%rsi),%r10 # disp->ContextRecord
2407 lea 56(%rsi),%r11 # &disp->HandlerData
2408 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2409 mov %r10,32(%rsp) # arg5
2410 mov %r11,40(%rsp) # arg6
2411 mov %r12,48(%rsp) # arg7
2412 mov %rcx,56(%rsp) # arg8, (NULL)
2413 call *__imp_RtlVirtualUnwind(%rip)
2414
2415 mov \$1,%eax # ExceptionContinueSearch
2416 add \$64,%rsp
2417 popfq
2418 pop %r15
2419 pop %r14
2420 pop %r13
2421 pop %r12
2422 pop %rbp
2423 pop %rbx
2424 pop %rdi
2425 pop %rsi
2426 ret
2427.size se_handler,.-se_handler
2428___
2429
2430$code.=<<___ if ($SZ==4 && $shaext);
2431.type shaext_handler,\@abi-omnipotent
2432.align 16
2433shaext_handler:
2434 push %rsi
2435 push %rdi
2436 push %rbx
2437 push %rbp
2438 push %r12
2439 push %r13
2440 push %r14
2441 push %r15
2442 pushfq
2443 sub \$64,%rsp
2444
2445 mov 120($context),%rax # pull context->Rax
2446 mov 248($context),%rbx # pull context->Rip
2447
2448 lea .Lprologue_shaext(%rip),%r10
2449	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
2450 jb .Lin_prologue
2451
2452 lea .Lepilogue_shaext(%rip),%r10
2453	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
2454 jae .Lin_prologue
2455
2456 lea -8-5*16(%rax),%rsi
2457 lea 512($context),%rdi # &context.Xmm6
2458 mov \$10,%ecx
2459 .long 0xa548f3fc # cld; rep movsq
2460
2461 jmp .Lin_prologue
2462.size shaext_handler,.-shaext_handler
2463___
2464
2465$code.=<<___;
2466.section .pdata
2467.align 4
2468 .rva .LSEH_begin_$func
2469 .rva .LSEH_end_$func
2470 .rva .LSEH_info_$func
2471___
2472$code.=<<___ if ($SZ==4 && $shaext);
2473 .rva .LSEH_begin_${func}_shaext
2474 .rva .LSEH_end_${func}_shaext
2475 .rva .LSEH_info_${func}_shaext
2476___
2477$code.=<<___ if ($SZ==4);
2478 .rva .LSEH_begin_${func}_ssse3
2479 .rva .LSEH_end_${func}_ssse3
2480 .rva .LSEH_info_${func}_ssse3
2481___
2482$code.=<<___ if ($avx && $SZ==8);
2483 .rva .LSEH_begin_${func}_xop
2484 .rva .LSEH_end_${func}_xop
2485 .rva .LSEH_info_${func}_xop
2486___
2487$code.=<<___ if ($avx);
2488 .rva .LSEH_begin_${func}_avx
2489 .rva .LSEH_end_${func}_avx
2490 .rva .LSEH_info_${func}_avx
2491___
2492$code.=<<___ if ($avx>1);
2493 .rva .LSEH_begin_${func}_avx2
2494 .rva .LSEH_end_${func}_avx2
2495 .rva .LSEH_info_${func}_avx2
2496___
2497$code.=<<___;
2498.section .xdata
2499.align 8
2500.LSEH_info_$func:
2501 .byte 9,0,0,0
2502 .rva se_handler
2503 .rva .Lprologue,.Lepilogue # HandlerData[]
2504___
2505$code.=<<___ if ($SZ==4 && $shaext);
2506.LSEH_info_${func}_shaext:
2507 .byte 9,0,0,0
2508 .rva shaext_handler
2509___
2510$code.=<<___ if ($SZ==4);
2511.LSEH_info_${func}_ssse3:
2512 .byte 9,0,0,0
2513 .rva se_handler
2514 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2515___
2516$code.=<<___ if ($avx && $SZ==8);
2517.LSEH_info_${func}_xop:
2518 .byte 9,0,0,0
2519 .rva se_handler
2520 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2521___
2522$code.=<<___ if ($avx);
2523.LSEH_info_${func}_avx:
2524 .byte 9,0,0,0
2525 .rva se_handler
2526 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2527___
2528$code.=<<___ if ($avx>1);
2529.LSEH_info_${func}_avx2:
2530 .byte 9,0,0,0
2531 .rva se_handler
2532 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2533___
2534}
2535
2536sub sha256op38 {
2537 my $instr = shift;
2538 my %opcodelet = (
2539 "sha256rnds2" => 0xcb,
2540 "sha256msg1" => 0xcc,
2541 "sha256msg2" => 0xcd );
2542
2543 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2544 my @opcode=(0x0f,0x38);
2545 push @opcode,$opcodelet{$instr};
2546 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2547 return ".byte\t".join(',',@opcode);
2548 } else {
2549 return $instr."\t".@_[0];
2550 }
2551}
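# sha256op38() keeps the module buildable with assemblers that predate the
# SHA extension: register-register forms of sha256rnds2/sha256msg1/sha256msg2
# are emitted as raw 0x0f,0x38,<opcode>,<ModR/M> byte sequences, while any
# other operand form is passed through verbatim for the assembler to handle.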
2552
2553foreach (split("\n",$code)) {
2554 s/\`([^\`]*)\`/eval $1/geo;
2555
2556 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2557
2558 print $_,"\n";
2559}
2560close STDOUT or die "error closing STDOUT: $!";