VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/sha/asm/sha512-x86_64.pl@91772

Last change on this file was revision 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

  • Property svn:executable set to *
File size: 63.3 KB
1#! /usr/bin/env perl
2# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the OpenSSL license.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 - >40%. No magical
20# tricks, just straight implementation... I really wonder why gcc
21# [being armed with inline assembler] fails to generate as fast code.
22# The only thing which is cool about this module is that the very
23# same instruction sequence is used for both SHA-256 and SHA-512. In
24# the former case the instructions operate on 32-bit operands, while in
25# the latter - on 64-bit ones. All I had to do was get one flavor right;
26# the other one passed the test right away:-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives you
29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32# Well, if you compare it to the IA-64 implementation, which maintains
33# X[16] in the register bank[!], approaches 4 instructions per CPU clock
34# cycle and runs in 1003 cycles, 1275 is a very good result for the
35# 3-way issue Opteron pipeline with X[16] maintained in memory. So that *if*
36# there is a way to improve it, *then* the only way would be to try to
37# offload X[16] updates to SSE unit, but that would require "deeper"
38# loop unroll, which in turn would naturally cause size blow-up, not
39# to mention increased complexity! And once again, only *if* it's
40# actually possible to noticeably improve overall ILP, instruction
41# level parallelism, on a given CPU implementation in this case.
42#
43# Special note on Intel EM64T. While Opteron CPU exhibits perfect
44# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
45# [currently available] EM64T CPUs apparently are far from it. On the
46# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
47# sha256_block:-( This is presumably because 64-bit shifts/rotates
48# are not atomic instructions, but are implemented in microcode.
49#
50# May 2012.
51#
52# Optimization, including one of Pavel Semjanov's ideas, an alternative
53# Maj, resulted in a >=5% improvement on most CPUs; on P4 it is +20% for
54# SHA256 but unfortunately -2% for SHA512 [which nobody should care
55# about that much].
56#
57# June 2012.
58#
59# Add SIMD code paths, see below for improvement coefficients. SSSE3
60# code path was not attempted for SHA512, because improvement is not
61# estimated to be high enough, noticeably less than 9%, to justify
62# the effort, at least not on pre-AVX processors. [The obvious exception
63# is VIA Nano, but it has a SHA512 instruction that is faster and
64# should be used instead.] For reference, the corresponding estimated
65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66# higher coefficients are observed on VIA Nano and Bulldozer has more
67# to do with specifics of their architecture [which is topic for
68# separate discussion].
69#
70# November 2012.
71#
72# Add AVX2 code path. Two consecutive input blocks are loaded to
73# 256-bit %ymm registers, with data from the first block in the least
74# significant 128-bit halves and data from the second in the most significant.
75# The data is then processed with the same SIMD instruction sequence as
76# for AVX, but with %ymm as operands. Side effect is increased stack
77# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
78# code size increase.
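# (Roughly speaking: each %ymm register then holds the same four schedule
# words of two consecutive blocks - the first block in the low 128-bit
# lane, the second in the high lane, assembled with vinserti128 - so one
# 256-bit instruction advances the message schedule for both blocks at
# once, while the scalar rounds later consume the two pre-added W+K
# halves from the enlarged stack frame one block after the other.)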
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87#                SHA256  SSSE3       AVX/XOP(*)       SHA512  AVX/XOP(*)
88#
89# AMD K8         14.9    -           -                9.57    -
90# P4             17.3    -           -                30.8    -
91# Core 2         15.6    13.8(+13%)  -                9.97    -
92# Westmere       14.8    12.3(+19%)  -                9.58    -
93# Sandy Bridge   17.4    14.2(+23%)  11.6(+50%(**))   11.2    8.10(+38%(**))
94# Ivy Bridge     12.6    10.5(+20%)  10.3(+22%)       8.17    7.22(+13%)
95# Haswell        12.2    9.28(+31%)  7.80(+56%)       7.66    5.40(+42%)
96# Skylake        11.4    9.03(+26%)  7.70(+48%)       7.25    5.20(+40%)
97# Bulldozer      21.1    13.6(+54%)  13.6(+54%(***))  13.5    8.58(+57%)
98# Ryzen          11.0    9.02(+22%)  2.05(+440%)      7.05    5.67(+20%)
99# VIA Nano       23.0    16.5(+39%)  -                14.7    -
100# Atom           23.0    18.9(+22%)  -                14.7    -
101# Silvermont     27.4    20.6(+33%)  -                17.5    -
102# Knights L      27.4    21.0(+30%)  19.6(+40%)       17.5    12.8(+37%)
103# Goldmont       18.9    14.3(+32%)  4.16(+350%)      12.0    -
104#
105# (*)   whichever best applicable, including SHAEXT;
106# (**)  switch from ror to shrd stands for fair share of improvement;
107# (***) execution time is fully determined by remaining integer-only
108#       part, body_00_15; reducing the amount of SIMD instructions
109#       below certain limit makes no difference/sense; to conserve
110#       space SHA256 XOP code path is therefore omitted;
111
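# The cycle counts above translate to throughput as
#     MBps_per_GHz = block_size_in_bytes * 1000 / cycles_per_block,
# i.e. 64*1000/1005 is ~63.7 for SHA-256 and 128*1000/1275 is ~100 for
# SHA-512.  A minimal pure-Perl helper expressing that arithmetic
# (illustrative only, never called by this generator):
sub __MBps_per_GHz { my ($block_bytes,$cycles) = @_; return $block_bytes*1000/$cycles; }
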
112$flavour = shift;
113$output = shift;
114if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
115
116$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
117
118$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
119( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
120( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
121die "can't locate x86_64-xlate.pl";
122
123if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
124 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
125 $avx = ($1>=2.19) + ($1>=2.22);
126}
127
128if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
129 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
130 $avx = ($1>=2.09) + ($1>=2.10);
131}
132
133if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
134 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
135 $avx = ($1>=10) + ($1>=11);
136}
137
138if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
139 $avx = ($2>=3.0) + ($2>3.0);
140}
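# At this point $avx ends up 0 (no usable AVX support detected in the
# assembler/compiler), 1 (the AVX path, plus the XOP path for SHA-512)
# or 2 (additionally the AVX2 path), depending on the tool versions
# probed above.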
141
142$shaext=1; ### set to zero if compiling for 1.0.1
143$avx=1 if (!$shaext && $avx);
144
145open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
146*STDOUT=*OUT;
147
148if ($output =~ /512/) {
149 $func="sha512_block_data_order";
150 $TABLE="K512";
151 $SZ=8;
152 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
153 "%r8", "%r9", "%r10","%r11");
154 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
155 @Sigma0=(28,34,39);
156 @Sigma1=(14,18,41);
157 @sigma0=(1, 8, 7);
158 @sigma1=(19,61, 6);
159 $rounds=80;
160} else {
161 $func="sha256_block_data_order";
162 $TABLE="K256";
163 $SZ=4;
164 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
165 "%r8d","%r9d","%r10d","%r11d");
166 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
167 @Sigma0=( 2,13,22);
168 @Sigma1=( 6,11,25);
169 @sigma0=( 7,18, 3);
170 @sigma1=(17,19,10);
171 $rounds=64;
172}
173
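# For reference, the @Sigma*/@sigma* triplets above are the rotate (and,
# for the small sigmas, shift) amounts of the FIPS 180-4 functions.  A
# minimal pure-Perl sketch of the big-Sigma functions for the 32-bit
# (SHA-256) flavour, assuming a 64-bit perl (illustrative only, never
# called by this generator):
sub __ror32 { my ($x,$n) = @_; return (($x >> $n) | ($x << (32-$n))) & 0xffffffff; }
sub __Sigma0_256 { my $x = shift; return __ror32($x,2) ^ __ror32($x,13) ^ __ror32($x,22); }  # @Sigma0=(2,13,22)
sub __Sigma1_256 { my $x = shift; return __ror32($x,6) ^ __ror32($x,11) ^ __ror32($x,25); }  # @Sigma1=(6,11,25)
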
174$ctx="%rdi"; # 1st arg, zapped by $a3
175$inp="%rsi"; # 2nd arg
176$Tbl="%rbp";
177
178$_ctx="16*$SZ+0*8(%rsp)";
179$_inp="16*$SZ+1*8(%rsp)";
180$_end="16*$SZ+2*8(%rsp)";
181$_rsp="`16*$SZ+3*8`(%rsp)";
182$framesz="16*$SZ+4*8";
183
184
185sub ROUND_00_15()
186{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
187 my $STRIDE=$SZ;
188 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
189
190$code.=<<___;
191 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
192 mov $f,$a2
193
194 xor $e,$a0
195 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
196 xor $g,$a2 # f^g
197
198 mov $T1,`$SZ*($i&0xf)`(%rsp)
199 xor $a,$a1
200 and $e,$a2 # (f^g)&e
201
202 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
203 add $h,$T1 # T1+=h
204 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
205
206 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
207 xor $e,$a0
208 add $a2,$T1 # T1+=Ch(e,f,g)
209
210 mov $a,$a2
211 add ($Tbl),$T1 # T1+=K[round]
212 xor $a,$a1
213
214 xor $b,$a2 # a^b, b^c in next round
215 ror \$$Sigma1[0],$a0 # Sigma1(e)
216 mov $b,$h
217
218 and $a2,$a3
219 ror \$$Sigma0[0],$a1 # Sigma0(a)
220 add $a0,$T1 # T1+=Sigma1(e)
221
222 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
223 add $T1,$d # d+=T1
224 add $T1,$h # h+=T1
225
226 lea $STRIDE($Tbl),$Tbl # round++
227___
228$code.=<<___ if ($i<15);
229 add $a1,$h # h+=Sigma0(a)
230___
231 ($a2,$a3) = ($a3,$a2);
232}
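# Two things worth spelling out about the round above.  First, Sigma1(e)
# and Sigma0(a) are computed by rotating by the *differences* of the
# rotation amounts and XOR-ing the original value in between, e.g. for
# SHA-512
#     ror(ror(ror(e,41-18)^e,18-14)^e,14) = ror(e,41)^ror(e,18)^ror(e,14)
# which needs only one live register per Sigma.  Second, the register
# rotated into $a3 enters each round already holding b^c (it was computed
# as a^b one round earlier), so Maj(a,b,c) is obtained as ((a^b)&(b^c))^b,
# i.e. Ch(a^b,c,b).  A minimal pure-Perl cross-check of the two boolean
# identities relied upon (illustrative only, never called by this
# generator):
sub __check_ch_maj_identities {
    for my $t (0..7) {
        my ($a,$b,$c) = (($t>>2)&1,($t>>1)&1,$t&1);
        die "Ch"  unless ((($b^$c)&$a)^$c) == (($a&$b)^((~$a&1)&$c));          # Ch(a,b,c)
        die "Maj" unless ((($a^$b)&($b^$c))^$b) == (($a&$b)^($a&$c)^($b&$c));  # Maj(a,b,c)
    }
    return 1;
}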
233
234sub ROUND_16_XX()
235{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
236
237$code.=<<___;
238 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
239 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
240
241 mov $a0,$T1
242 ror \$`$sigma0[1]-$sigma0[0]`,$a0
243 add $a1,$a # modulo-scheduled h+=Sigma0(a)
244 mov $a2,$a1
245 ror \$`$sigma1[1]-$sigma1[0]`,$a2
246
247 xor $T1,$a0
248 shr \$$sigma0[2],$T1
249 ror \$$sigma0[0],$a0
250 xor $a1,$a2
251 shr \$$sigma1[2],$a1
252
253 ror \$$sigma1[0],$a2
254 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
255 xor $a1,$a2 # sigma1(X[(i+14)&0xf])
256 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
257
258 add `$SZ*($i&0xf)`(%rsp),$T1
259 mov $e,$a0
260 add $a2,$T1
261 mov $a,$a1
262___
263 &ROUND_00_15(@_);
264}
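# For reference, ROUND_16_XX above implements the standard message
# schedule on the 16-word circular buffer kept on the stack:
#     X[i&0xf] += sigma0(X[(i+1)&0xf]) + X[(i+9)&0xf] + sigma1(X[(i+14)&0xf])
# i.e. W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2]).
# Pure-Perl sketch of the small-sigma functions for the 32-bit flavour,
# reusing the __ror32 helper above (illustrative only, never called):
sub __sigma0_256 { my $x = shift; return __ror32($x,7)  ^ __ror32($x,18) ^ ($x >> 3);  }  # @sigma0=(7,18,3)
sub __sigma1_256 { my $x = shift; return __ror32($x,17) ^ __ror32($x,19) ^ ($x >> 10); }  # @sigma1=(17,19,10)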
265
266$code=<<___;
267.text
268
269.extern OPENSSL_ia32cap_P
270.globl $func
271.type $func,\@function,3
272.align 16
273$func:
274.cfi_startproc
275___
276$code.=<<___ if ($SZ==4 || $avx);
277 lea OPENSSL_ia32cap_P(%rip),%r11
278 mov 0(%r11),%r9d
279 mov 4(%r11),%r10d
280 mov 8(%r11),%r11d
281___
282$code.=<<___ if ($SZ==4 && $shaext);
283 test \$`1<<29`,%r11d # check for SHA
284 jnz _shaext_shortcut
285___
286$code.=<<___ if ($avx && $SZ==8);
287 test \$`1<<11`,%r10d # check for XOP
288 jnz .Lxop_shortcut
289___
290$code.=<<___ if ($avx>1);
291 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
292 cmp \$`1<<8|1<<5|1<<3`,%r11d
293 je .Lavx2_shortcut
294___
295$code.=<<___ if ($avx);
296 and \$`1<<30`,%r9d # mask "Intel CPU" bit
297 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
298 or %r9d,%r10d
299 cmp \$`1<<28|1<<9|1<<30`,%r10d
300 je .Lavx_shortcut
301___
302$code.=<<___ if ($SZ==4);
303 test \$`1<<9`,%r10d
304 jnz .Lssse3_shortcut
305___
306$code.=<<___;
307 mov %rsp,%rax # copy %rsp
308.cfi_def_cfa_register %rax
309 push %rbx
310.cfi_push %rbx
311 push %rbp
312.cfi_push %rbp
313 push %r12
314.cfi_push %r12
315 push %r13
316.cfi_push %r13
317 push %r14
318.cfi_push %r14
319 push %r15
320.cfi_push %r15
321 shl \$4,%rdx # num*16
322 sub \$$framesz,%rsp
323 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
324 and \$-64,%rsp # align stack frame
325 mov $ctx,$_ctx # save ctx, 1st arg
326 mov $inp,$_inp # save inp, 2nd arg
327 mov %rdx,$_end # save end pointer, "3rd" arg
328 mov %rax,$_rsp # save copy of %rsp
329.cfi_cfa_expression $_rsp,deref,+8
330.Lprologue:
331
332 mov $SZ*0($ctx),$A
333 mov $SZ*1($ctx),$B
334 mov $SZ*2($ctx),$C
335 mov $SZ*3($ctx),$D
336 mov $SZ*4($ctx),$E
337 mov $SZ*5($ctx),$F
338 mov $SZ*6($ctx),$G
339 mov $SZ*7($ctx),$H
340 jmp .Lloop
341
342.align 16
343.Lloop:
344 mov $B,$a3
345 lea $TABLE(%rip),$Tbl
346 xor $C,$a3 # magic
347___
348 for($i=0;$i<16;$i++) {
349 $code.=" mov $SZ*$i($inp),$T1\n";
350 $code.=" mov @ROT[4],$a0\n";
351 $code.=" mov @ROT[0],$a1\n";
352 $code.=" bswap $T1\n";
353 &ROUND_00_15($i,@ROT);
354 unshift(@ROT,pop(@ROT));
355 }
356$code.=<<___;
357 jmp .Lrounds_16_xx
358.align 16
359.Lrounds_16_xx:
360___
361 for(;$i<32;$i++) {
362 &ROUND_16_XX($i,@ROT);
363 unshift(@ROT,pop(@ROT));
364 }
365
366$code.=<<___;
367 cmpb \$0,`$SZ-1`($Tbl)
368 jnz .Lrounds_16_xx
369
370 mov $_ctx,$ctx
371 add $a1,$A # modulo-scheduled h+=Sigma0(a)
372 lea 16*$SZ($inp),$inp
373
374 add $SZ*0($ctx),$A
375 add $SZ*1($ctx),$B
376 add $SZ*2($ctx),$C
377 add $SZ*3($ctx),$D
378 add $SZ*4($ctx),$E
379 add $SZ*5($ctx),$F
380 add $SZ*6($ctx),$G
381 add $SZ*7($ctx),$H
382
383 cmp $_end,$inp
384
385 mov $A,$SZ*0($ctx)
386 mov $B,$SZ*1($ctx)
387 mov $C,$SZ*2($ctx)
388 mov $D,$SZ*3($ctx)
389 mov $E,$SZ*4($ctx)
390 mov $F,$SZ*5($ctx)
391 mov $G,$SZ*6($ctx)
392 mov $H,$SZ*7($ctx)
393 jb .Lloop
394
395 mov $_rsp,%rsi
396.cfi_def_cfa %rsi,8
397 mov -48(%rsi),%r15
398.cfi_restore %r15
399 mov -40(%rsi),%r14
400.cfi_restore %r14
401 mov -32(%rsi),%r13
402.cfi_restore %r13
403 mov -24(%rsi),%r12
404.cfi_restore %r12
405 mov -16(%rsi),%rbp
406.cfi_restore %rbp
407 mov -8(%rsi),%rbx
408.cfi_restore %rbx
409 lea (%rsi),%rsp
410.cfi_def_cfa_register %rsp
411.Lepilogue:
412 ret
413.cfi_endproc
414.size $func,.-$func
415___
416
417if ($SZ==4) {
418$code.=<<___;
419.align 64
420.type $TABLE,\@object
421$TABLE:
422 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
423 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
424 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
425 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
426 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
427 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
428 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
429 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
430 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
431 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
432 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
433 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
434 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
435 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
436 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
437 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
438 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
439 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
440 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
441 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
442 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
443 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
444 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
445 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
446 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
447 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
448 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
449 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
450 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
451 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
452 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
453 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
454
455 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
456 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
457 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
458 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
459 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
460 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
461 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
462___
463} else {
464$code.=<<___;
465.align 64
466.type $TABLE,\@object
467$TABLE:
468 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
469 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
470 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
471 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
472 .quad 0x3956c25bf348b538,0x59f111f1b605d019
473 .quad 0x3956c25bf348b538,0x59f111f1b605d019
474 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
475 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
476 .quad 0xd807aa98a3030242,0x12835b0145706fbe
477 .quad 0xd807aa98a3030242,0x12835b0145706fbe
478 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
479 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
480 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
481 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
482 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
483 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
484 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
485 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
486 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
487 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
488 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
489 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
490 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
491 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
492 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
493 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
494 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
495 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
496 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
497 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
498 .quad 0x06ca6351e003826f,0x142929670a0e6e70
499 .quad 0x06ca6351e003826f,0x142929670a0e6e70
500 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
501 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
502 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
503 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
504 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
505 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
506 .quad 0x81c2c92e47edaee6,0x92722c851482353b
507 .quad 0x81c2c92e47edaee6,0x92722c851482353b
508 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
509 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
510 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
511 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
512 .quad 0xd192e819d6ef5218,0xd69906245565a910
513 .quad 0xd192e819d6ef5218,0xd69906245565a910
514 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
515 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
516 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
517 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
518 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
519 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
520 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
521 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
522 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
523 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
524 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
525 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
526 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
527 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
528 .quad 0x90befffa23631e28,0xa4506cebde82bde9
529 .quad 0x90befffa23631e28,0xa4506cebde82bde9
530 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
531 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
532 .quad 0xca273eceea26619c,0xd186b8c721c0c207
533 .quad 0xca273eceea26619c,0xd186b8c721c0c207
534 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
535 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
536 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
537 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
538 .quad 0x113f9804bef90dae,0x1b710b35131c471b
539 .quad 0x113f9804bef90dae,0x1b710b35131c471b
540 .quad 0x28db77f523047d84,0x32caab7b40c72493
541 .quad 0x28db77f523047d84,0x32caab7b40c72493
542 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
543 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
544 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
545 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
546 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
547 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
548
549 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
550 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
551 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
552___
553}
554
555######################################################################
556# SIMD code paths
557#
558if ($SZ==4 && $shaext) {{{
559######################################################################
560# Intel SHA Extensions implementation of SHA256 update function.
561#
562my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
563
564my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
565my @MSG=map("%xmm$_",(3..6));
566
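# A few informal notes on the SHA-NI flow below: $Wi has to live in
# %xmm0 because sha256rnds2 takes its two pre-added W[t]+K[t] inputs
# from the low dwords of %xmm0 implicitly; each sha256rnds2 performs two
# rounds, and the pshufd \$0x0e shuffle moves the next W+K pair into the
# low half for the following two rounds.  sha256msg1/sha256msg2 carry
# out the sigma0/sigma1 halves of the message-schedule update for four
# words at a time, interleaved with the round instructions.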
567$code.=<<___;
568.type sha256_block_data_order_shaext,\@function,3
569.align 64
570sha256_block_data_order_shaext:
571_shaext_shortcut:
572.cfi_startproc
573___
574$code.=<<___ if ($win64);
575 lea `-8-5*16`(%rsp),%rsp
576 movaps %xmm6,-8-5*16(%rax)
577 movaps %xmm7,-8-4*16(%rax)
578 movaps %xmm8,-8-3*16(%rax)
579 movaps %xmm9,-8-2*16(%rax)
580 movaps %xmm10,-8-1*16(%rax)
581.Lprologue_shaext:
582___
583$code.=<<___;
584 lea K256+0x80(%rip),$Tbl
585 movdqu ($ctx),$ABEF # DCBA
586 movdqu 16($ctx),$CDGH # HGFE
587 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
588
589 pshufd \$0x1b,$ABEF,$Wi # ABCD
590 pshufd \$0xb1,$ABEF,$ABEF # CDAB
591 pshufd \$0x1b,$CDGH,$CDGH # EFGH
592 movdqa $TMP,$BSWAP # offload
593 palignr \$8,$CDGH,$ABEF # ABEF
594 punpcklqdq $Wi,$CDGH # CDGH
595 jmp .Loop_shaext
596
597.align 16
598.Loop_shaext:
599 movdqu ($inp),@MSG[0]
600 movdqu 0x10($inp),@MSG[1]
601 movdqu 0x20($inp),@MSG[2]
602 pshufb $TMP,@MSG[0]
603 movdqu 0x30($inp),@MSG[3]
604
605 movdqa 0*32-0x80($Tbl),$Wi
606 paddd @MSG[0],$Wi
607 pshufb $TMP,@MSG[1]
608 movdqa $CDGH,$CDGH_SAVE # offload
609 sha256rnds2 $ABEF,$CDGH # 0-3
610 pshufd \$0x0e,$Wi,$Wi
611 nop
612 movdqa $ABEF,$ABEF_SAVE # offload
613 sha256rnds2 $CDGH,$ABEF
614
615 movdqa 1*32-0x80($Tbl),$Wi
616 paddd @MSG[1],$Wi
617 pshufb $TMP,@MSG[2]
618 sha256rnds2 $ABEF,$CDGH # 4-7
619 pshufd \$0x0e,$Wi,$Wi
620 lea 0x40($inp),$inp
621 sha256msg1 @MSG[1],@MSG[0]
622 sha256rnds2 $CDGH,$ABEF
623
624 movdqa 2*32-0x80($Tbl),$Wi
625 paddd @MSG[2],$Wi
626 pshufb $TMP,@MSG[3]
627 sha256rnds2 $ABEF,$CDGH # 8-11
628 pshufd \$0x0e,$Wi,$Wi
629 movdqa @MSG[3],$TMP
630 palignr \$4,@MSG[2],$TMP
631 nop
632 paddd $TMP,@MSG[0]
633 sha256msg1 @MSG[2],@MSG[1]
634 sha256rnds2 $CDGH,$ABEF
635
636 movdqa 3*32-0x80($Tbl),$Wi
637 paddd @MSG[3],$Wi
638 sha256msg2 @MSG[3],@MSG[0]
639 sha256rnds2 $ABEF,$CDGH # 12-15
640 pshufd \$0x0e,$Wi,$Wi
641 movdqa @MSG[0],$TMP
642 palignr \$4,@MSG[3],$TMP
643 nop
644 paddd $TMP,@MSG[1]
645 sha256msg1 @MSG[3],@MSG[2]
646 sha256rnds2 $CDGH,$ABEF
647___
648for($i=4;$i<16-3;$i++) {
649$code.=<<___;
650 movdqa $i*32-0x80($Tbl),$Wi
651 paddd @MSG[0],$Wi
652 sha256msg2 @MSG[0],@MSG[1]
653 sha256rnds2 $ABEF,$CDGH # 16-19...
654 pshufd \$0x0e,$Wi,$Wi
655 movdqa @MSG[1],$TMP
656 palignr \$4,@MSG[0],$TMP
657 nop
658 paddd $TMP,@MSG[2]
659 sha256msg1 @MSG[0],@MSG[3]
660 sha256rnds2 $CDGH,$ABEF
661___
662 push(@MSG,shift(@MSG));
663}
664$code.=<<___;
665 movdqa 13*32-0x80($Tbl),$Wi
666 paddd @MSG[0],$Wi
667 sha256msg2 @MSG[0],@MSG[1]
668 sha256rnds2 $ABEF,$CDGH # 52-55
669 pshufd \$0x0e,$Wi,$Wi
670 movdqa @MSG[1],$TMP
671 palignr \$4,@MSG[0],$TMP
672 sha256rnds2 $CDGH,$ABEF
673 paddd $TMP,@MSG[2]
674
675 movdqa 14*32-0x80($Tbl),$Wi
676 paddd @MSG[1],$Wi
677 sha256rnds2 $ABEF,$CDGH # 56-59
678 pshufd \$0x0e,$Wi,$Wi
679 sha256msg2 @MSG[1],@MSG[2]
680 movdqa $BSWAP,$TMP
681 sha256rnds2 $CDGH,$ABEF
682
683 movdqa 15*32-0x80($Tbl),$Wi
684 paddd @MSG[2],$Wi
685 nop
686 sha256rnds2 $ABEF,$CDGH # 60-63
687 pshufd \$0x0e,$Wi,$Wi
688 dec $num
689 nop
690 sha256rnds2 $CDGH,$ABEF
691
692 paddd $CDGH_SAVE,$CDGH
693 paddd $ABEF_SAVE,$ABEF
694 jnz .Loop_shaext
695
696 pshufd \$0xb1,$CDGH,$CDGH # DCHG
697 pshufd \$0x1b,$ABEF,$TMP # FEBA
698 pshufd \$0xb1,$ABEF,$ABEF # BAFE
699 punpckhqdq $CDGH,$ABEF # DCBA
700 palignr \$8,$TMP,$CDGH # HGFE
701
702 movdqu $ABEF,($ctx)
703 movdqu $CDGH,16($ctx)
704___
705$code.=<<___ if ($win64);
706 movaps -8-5*16(%rax),%xmm6
707 movaps -8-4*16(%rax),%xmm7
708 movaps -8-3*16(%rax),%xmm8
709 movaps -8-2*16(%rax),%xmm9
710 movaps -8-1*16(%rax),%xmm10
711 mov %rax,%rsp
712.Lepilogue_shaext:
713___
714$code.=<<___;
715 ret
716.cfi_endproc
717.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
718___
719}}}
720{{{
721
722my $a4=$T1;
723my ($a,$b,$c,$d,$e,$f,$g,$h);
724
725sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
726{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
727 my $arg = pop;
728 $arg = "\$$arg" if ($arg*1 eq $arg);
729 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
730}
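# For example (illustrative): with the 32-bit flavour a call such as
#     &ror ($a0,$Sigma1[0]);
# falls through to the thunk above and appends "ror $6,%r13d" (tab
# separated) to $code - the popped last argument becomes the first
# (AT&T-order) operand and is given a '$' prefix because it is numeric.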
731
732sub body_00_15 () {
733 (
734 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
735
736 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
737 '&mov ($a,$a1)',
738 '&mov ($a4,$f)',
739
740 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
741 '&xor ($a0,$e)',
742 '&xor ($a4,$g)', # f^g
743
744 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
745 '&xor ($a1,$a)',
746 '&and ($a4,$e)', # (f^g)&e
747
748 '&xor ($a0,$e)',
749 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
750 '&mov ($a2,$a)',
751
752 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
753 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
754 '&xor ($a2,$b)', # a^b, b^c in next round
755
756 '&add ($h,$a4)', # h+=Ch(e,f,g)
757 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
758 '&and ($a3,$a2)', # (b^c)&(a^b)
759
760 '&xor ($a1,$a)',
761 '&add ($h,$a0)', # h+=Sigma1(e)
762 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
763
764 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
765 '&add ($d,$h)', # d+=h
766 '&add ($h,$a3)', # h+=Maj(a,b,c)
767
768 '&mov ($a0,$d)',
769 '&add ($a1,$h);'. # h+=Sigma0(a)
770 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
771 );
772}
773
774######################################################################
775# SSSE3 code path
776#
777if ($SZ==4) { # SHA256 only
778my @X = map("%xmm$_",(0..3));
779my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
780
781$code.=<<___;
782.type ${func}_ssse3,\@function,3
783.align 64
784${func}_ssse3:
785.cfi_startproc
786.Lssse3_shortcut:
787 mov %rsp,%rax # copy %rsp
788.cfi_def_cfa_register %rax
789 push %rbx
790.cfi_push %rbx
791 push %rbp
792.cfi_push %rbp
793 push %r12
794.cfi_push %r12
795 push %r13
796.cfi_push %r13
797 push %r14
798.cfi_push %r14
799 push %r15
800.cfi_push %r15
801 shl \$4,%rdx # num*16
802 sub \$`$framesz+$win64*16*4`,%rsp
803 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
804 and \$-64,%rsp # align stack frame
805 mov $ctx,$_ctx # save ctx, 1st arg
806 mov $inp,$_inp # save inp, 2nd arg
807 mov %rdx,$_end # save end pointer, "3rd" arg
808 mov %rax,$_rsp # save copy of %rsp
809.cfi_cfa_expression $_rsp,deref,+8
810___
811$code.=<<___ if ($win64);
812 movaps %xmm6,16*$SZ+32(%rsp)
813 movaps %xmm7,16*$SZ+48(%rsp)
814 movaps %xmm8,16*$SZ+64(%rsp)
815 movaps %xmm9,16*$SZ+80(%rsp)
816___
817$code.=<<___;
818.Lprologue_ssse3:
819
820 mov $SZ*0($ctx),$A
821 mov $SZ*1($ctx),$B
822 mov $SZ*2($ctx),$C
823 mov $SZ*3($ctx),$D
824 mov $SZ*4($ctx),$E
825 mov $SZ*5($ctx),$F
826 mov $SZ*6($ctx),$G
827 mov $SZ*7($ctx),$H
828___
829
830$code.=<<___;
831 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
832 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
833 jmp .Lloop_ssse3
834.align 16
835.Lloop_ssse3:
836 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
837 movdqu 0x00($inp),@X[0]
838 movdqu 0x10($inp),@X[1]
839 movdqu 0x20($inp),@X[2]
840 pshufb $t3,@X[0]
841 movdqu 0x30($inp),@X[3]
842 lea $TABLE(%rip),$Tbl
843 pshufb $t3,@X[1]
844 movdqa 0x00($Tbl),$t0
845 movdqa 0x20($Tbl),$t1
846 pshufb $t3,@X[2]
847 paddd @X[0],$t0
848 movdqa 0x40($Tbl),$t2
849 pshufb $t3,@X[3]
850 movdqa 0x60($Tbl),$t3
851 paddd @X[1],$t1
852 paddd @X[2],$t2
853 paddd @X[3],$t3
854 movdqa $t0,0x00(%rsp)
855 mov $A,$a1
856 movdqa $t1,0x10(%rsp)
857 mov $B,$a3
858 movdqa $t2,0x20(%rsp)
859 xor $C,$a3 # magic
860 movdqa $t3,0x30(%rsp)
861 mov $E,$a0
862 jmp .Lssse3_00_47
863
864.align 16
865.Lssse3_00_47:
866 sub \$`-16*2*$SZ`,$Tbl # size optimization
867___
868sub Xupdate_256_SSSE3 () {
869 (
870 '&movdqa ($t0,@X[1]);',
871 '&movdqa ($t3,@X[3])',
872 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
873 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
874 '&movdqa ($t1,$t0)',
875 '&movdqa ($t2,$t0);',
876 '&psrld ($t0,$sigma0[2])',
877 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
878 '&psrld ($t2,$sigma0[0])',
879 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
880 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
881 '&pxor ($t0,$t2)',
882 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
883 '&pxor ($t0,$t1)',
884 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
885 '&pxor ($t0,$t2);',
886 '&movdqa ($t2,$t3)',
887 '&pxor ($t0,$t1);', # sigma0(X[1..4])
888 '&psrld ($t3,$sigma1[2])',
889 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
890 '&psrlq ($t2,$sigma1[0])',
891 '&pxor ($t3,$t2);',
892 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
893 '&pxor ($t3,$t2)',
894 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
895 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
896 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
897 '&movdqa ($t2,$t3);',
898 '&psrld ($t3,$sigma1[2])',
899 '&psrlq ($t2,$sigma1[0])',
900 '&pxor ($t3,$t2);',
901 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
902 '&pxor ($t3,$t2);',
903 '&movdqa ($t2,16*2*$j."($Tbl)")',
904 '&pshufb ($t3,$t5)',
905 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
906 );
907}
908
909sub SSSE3_256_00_47 () {
910my $j = shift;
911my $body = shift;
912my @X = @_;
913my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
914
915 if (0) {
916 foreach (Xupdate_256_SSSE3()) { # 36 instructions
917 eval;
918 eval(shift(@insns));
919 eval(shift(@insns));
920 eval(shift(@insns));
921 }
922 } else { # squeeze extra 4% on Westmere and 19% on Atom
923 eval(shift(@insns)); #@
924 &movdqa ($t0,@X[1]);
925 eval(shift(@insns));
926 eval(shift(@insns));
927 &movdqa ($t3,@X[3]);
928 eval(shift(@insns)); #@
929 eval(shift(@insns));
930 eval(shift(@insns));
931 eval(shift(@insns)); #@
932 eval(shift(@insns));
933 &palignr ($t0,@X[0],$SZ); # X[1..4]
934 eval(shift(@insns));
935 eval(shift(@insns));
936 &palignr ($t3,@X[2],$SZ); # X[9..12]
937 eval(shift(@insns));
938 eval(shift(@insns));
939 eval(shift(@insns));
940 eval(shift(@insns)); #@
941 &movdqa ($t1,$t0);
942 eval(shift(@insns));
943 eval(shift(@insns));
944 &movdqa ($t2,$t0);
945 eval(shift(@insns)); #@
946 eval(shift(@insns));
947 &psrld ($t0,$sigma0[2]);
948 eval(shift(@insns));
949 eval(shift(@insns));
950 eval(shift(@insns));
951 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
952 eval(shift(@insns)); #@
953 eval(shift(@insns));
954 &psrld ($t2,$sigma0[0]);
955 eval(shift(@insns));
956 eval(shift(@insns));
957 &pshufd ($t3,@X[3],0b11111010); # X[14..15]
958 eval(shift(@insns));
959 eval(shift(@insns)); #@
960 &pslld ($t1,8*$SZ-$sigma0[1]);
961 eval(shift(@insns));
962 eval(shift(@insns));
963 &pxor ($t0,$t2);
964 eval(shift(@insns)); #@
965 eval(shift(@insns));
966 eval(shift(@insns));
967 eval(shift(@insns)); #@
968 &psrld ($t2,$sigma0[1]-$sigma0[0]);
969 eval(shift(@insns));
970 &pxor ($t0,$t1);
971 eval(shift(@insns));
972 eval(shift(@insns));
973 &pslld ($t1,$sigma0[1]-$sigma0[0]);
974 eval(shift(@insns));
975 eval(shift(@insns));
976 &pxor ($t0,$t2);
977 eval(shift(@insns));
978 eval(shift(@insns)); #@
979 &movdqa ($t2,$t3);
980 eval(shift(@insns));
981 eval(shift(@insns));
982 &pxor ($t0,$t1); # sigma0(X[1..4])
983 eval(shift(@insns)); #@
984 eval(shift(@insns));
985 eval(shift(@insns));
986 &psrld ($t3,$sigma1[2]);
987 eval(shift(@insns));
988 eval(shift(@insns));
989 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
990 eval(shift(@insns)); #@
991 eval(shift(@insns));
992 &psrlq ($t2,$sigma1[0]);
993 eval(shift(@insns));
994 eval(shift(@insns));
995 eval(shift(@insns));
996 &pxor ($t3,$t2);
997 eval(shift(@insns)); #@
998 eval(shift(@insns));
999 eval(shift(@insns));
1000 eval(shift(@insns)); #@
1001 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1002 eval(shift(@insns));
1003 eval(shift(@insns));
1004 &pxor ($t3,$t2);
1005 eval(shift(@insns)); #@
1006 eval(shift(@insns));
1007 eval(shift(@insns));
1008 #&pshufb ($t3,$t4); # sigma1(X[14..15])
1009 &pshufd ($t3,$t3,0b10000000);
1010 eval(shift(@insns));
1011 eval(shift(@insns));
1012 eval(shift(@insns));
1013 &psrldq ($t3,8);
1014 eval(shift(@insns));
1015 eval(shift(@insns)); #@
1016 eval(shift(@insns));
1017 eval(shift(@insns));
1018 eval(shift(@insns)); #@
1019 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1020 eval(shift(@insns));
1021 eval(shift(@insns));
1022 eval(shift(@insns));
1023 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
1024 eval(shift(@insns));
1025 eval(shift(@insns)); #@
1026 eval(shift(@insns));
1027 &movdqa ($t2,$t3);
1028 eval(shift(@insns));
1029 eval(shift(@insns));
1030 &psrld ($t3,$sigma1[2]);
1031 eval(shift(@insns));
1032 eval(shift(@insns)); #@
1033 &psrlq ($t2,$sigma1[0]);
1034 eval(shift(@insns));
1035 eval(shift(@insns));
1036 &pxor ($t3,$t2);
1037 eval(shift(@insns)); #@
1038 eval(shift(@insns));
1039 eval(shift(@insns));
1040 eval(shift(@insns)); #@
1041 eval(shift(@insns));
1042 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1043 eval(shift(@insns));
1044 eval(shift(@insns));
1045 eval(shift(@insns));
1046 &pxor ($t3,$t2);
1047 eval(shift(@insns));
1048 eval(shift(@insns));
1049 eval(shift(@insns)); #@
1050 #&pshufb ($t3,$t5);
1051 &pshufd ($t3,$t3,0b00001000);
1052 eval(shift(@insns));
1053 eval(shift(@insns));
1054 &movdqa ($t2,16*2*$j."($Tbl)");
1055 eval(shift(@insns)); #@
1056 eval(shift(@insns));
1057 &pslldq ($t3,8);
1058 eval(shift(@insns));
1059 eval(shift(@insns));
1060 eval(shift(@insns));
1061 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1062 eval(shift(@insns)); #@
1063 eval(shift(@insns));
1064 eval(shift(@insns));
1065 }
1066 &paddd ($t2,@X[0]);
1067 foreach (@insns) { eval; } # remaining instructions
1068 &movdqa (16*$j."(%rsp)",$t2);
1069}
1070
1071 for ($i=0,$j=0; $j<4; $j++) {
1072 &SSSE3_256_00_47($j,\&body_00_15,@X);
1073 push(@X,shift(@X)); # rotate(@X)
1074 }
1075 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1076 &jne (".Lssse3_00_47");
1077
1078 for ($i=0; $i<16; ) {
1079 foreach(body_00_15()) { eval; }
1080 }
1081$code.=<<___;
1082 mov $_ctx,$ctx
1083 mov $a1,$A
1084
1085 add $SZ*0($ctx),$A
1086 lea 16*$SZ($inp),$inp
1087 add $SZ*1($ctx),$B
1088 add $SZ*2($ctx),$C
1089 add $SZ*3($ctx),$D
1090 add $SZ*4($ctx),$E
1091 add $SZ*5($ctx),$F
1092 add $SZ*6($ctx),$G
1093 add $SZ*7($ctx),$H
1094
1095 cmp $_end,$inp
1096
1097 mov $A,$SZ*0($ctx)
1098 mov $B,$SZ*1($ctx)
1099 mov $C,$SZ*2($ctx)
1100 mov $D,$SZ*3($ctx)
1101 mov $E,$SZ*4($ctx)
1102 mov $F,$SZ*5($ctx)
1103 mov $G,$SZ*6($ctx)
1104 mov $H,$SZ*7($ctx)
1105 jb .Lloop_ssse3
1106
1107 mov $_rsp,%rsi
1108.cfi_def_cfa %rsi,8
1109___
1110$code.=<<___ if ($win64);
1111 movaps 16*$SZ+32(%rsp),%xmm6
1112 movaps 16*$SZ+48(%rsp),%xmm7
1113 movaps 16*$SZ+64(%rsp),%xmm8
1114 movaps 16*$SZ+80(%rsp),%xmm9
1115___
1116$code.=<<___;
1117 mov -48(%rsi),%r15
1118.cfi_restore %r15
1119 mov -40(%rsi),%r14
1120.cfi_restore %r14
1121 mov -32(%rsi),%r13
1122.cfi_restore %r13
1123 mov -24(%rsi),%r12
1124.cfi_restore %r12
1125 mov -16(%rsi),%rbp
1126.cfi_restore %rbp
1127 mov -8(%rsi),%rbx
1128.cfi_restore %rbx
1129 lea (%rsi),%rsp
1130.cfi_def_cfa_register %rsp
1131.Lepilogue_ssse3:
1132 ret
1133.cfi_endproc
1134.size ${func}_ssse3,.-${func}_ssse3
1135___
1136}
1137
1138if ($avx) {{
1139######################################################################
1140# XOP code path
1141#
1142if ($SZ==8) { # SHA512 only
1143$code.=<<___;
1144.type ${func}_xop,\@function,3
1145.align 64
1146${func}_xop:
1147.cfi_startproc
1148.Lxop_shortcut:
1149 mov %rsp,%rax # copy %rsp
1150.cfi_def_cfa_register %rax
1151 push %rbx
1152.cfi_push %rbx
1153 push %rbp
1154.cfi_push %rbp
1155 push %r12
1156.cfi_push %r12
1157 push %r13
1158.cfi_push %r13
1159 push %r14
1160.cfi_push %r14
1161 push %r15
1162.cfi_push %r15
1163 shl \$4,%rdx # num*16
1164 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1165 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1166 and \$-64,%rsp # align stack frame
1167 mov $ctx,$_ctx # save ctx, 1st arg
1168 mov $inp,$_inp # save inp, 2nd arg
1169 mov %rdx,$_end # save end pointer, "3rd" arg
1170 mov %rax,$_rsp # save copy of %rsp
1171.cfi_cfa_expression $_rsp,deref,+8
1172___
1173$code.=<<___ if ($win64);
1174 movaps %xmm6,16*$SZ+32(%rsp)
1175 movaps %xmm7,16*$SZ+48(%rsp)
1176 movaps %xmm8,16*$SZ+64(%rsp)
1177 movaps %xmm9,16*$SZ+80(%rsp)
1178___
1179$code.=<<___ if ($win64 && $SZ>4);
1180 movaps %xmm10,16*$SZ+96(%rsp)
1181 movaps %xmm11,16*$SZ+112(%rsp)
1182___
1183$code.=<<___;
1184.Lprologue_xop:
1185
1186 vzeroupper
1187 mov $SZ*0($ctx),$A
1188 mov $SZ*1($ctx),$B
1189 mov $SZ*2($ctx),$C
1190 mov $SZ*3($ctx),$D
1191 mov $SZ*4($ctx),$E
1192 mov $SZ*5($ctx),$F
1193 mov $SZ*6($ctx),$G
1194 mov $SZ*7($ctx),$H
1195 jmp .Lloop_xop
1196___
1197 if ($SZ==4) { # SHA256
1198 my @X = map("%xmm$_",(0..3));
1199 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1200
1201$code.=<<___;
1202.align 16
1203.Lloop_xop:
1204 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1205 vmovdqu 0x00($inp),@X[0]
1206 vmovdqu 0x10($inp),@X[1]
1207 vmovdqu 0x20($inp),@X[2]
1208 vmovdqu 0x30($inp),@X[3]
1209 vpshufb $t3,@X[0],@X[0]
1210 lea $TABLE(%rip),$Tbl
1211 vpshufb $t3,@X[1],@X[1]
1212 vpshufb $t3,@X[2],@X[2]
1213 vpaddd 0x00($Tbl),@X[0],$t0
1214 vpshufb $t3,@X[3],@X[3]
1215 vpaddd 0x20($Tbl),@X[1],$t1
1216 vpaddd 0x40($Tbl),@X[2],$t2
1217 vpaddd 0x60($Tbl),@X[3],$t3
1218 vmovdqa $t0,0x00(%rsp)
1219 mov $A,$a1
1220 vmovdqa $t1,0x10(%rsp)
1221 mov $B,$a3
1222 vmovdqa $t2,0x20(%rsp)
1223 xor $C,$a3 # magic
1224 vmovdqa $t3,0x30(%rsp)
1225 mov $E,$a0
1226 jmp .Lxop_00_47
1227
1228.align 16
1229.Lxop_00_47:
1230 sub \$`-16*2*$SZ`,$Tbl # size optimization
1231___
1232sub XOP_256_00_47 () {
1233my $j = shift;
1234my $body = shift;
1235my @X = @_;
1236my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1237
1238 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1239 eval(shift(@insns));
1240 eval(shift(@insns));
1241 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1242 eval(shift(@insns));
1243 eval(shift(@insns));
1244 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1245 eval(shift(@insns));
1246 eval(shift(@insns));
1247 &vpsrld ($t0,$t0,$sigma0[2]);
1248 eval(shift(@insns));
1249 eval(shift(@insns));
1250 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1251 eval(shift(@insns));
1252 eval(shift(@insns));
1253 eval(shift(@insns));
1254 eval(shift(@insns));
1255 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1256 eval(shift(@insns));
1257 eval(shift(@insns));
1258 &vpxor ($t0,$t0,$t1);
1259 eval(shift(@insns));
1260 eval(shift(@insns));
1261 eval(shift(@insns));
1262 eval(shift(@insns));
1263 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1264 eval(shift(@insns));
1265 eval(shift(@insns));
1266 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1267 eval(shift(@insns));
1268 eval(shift(@insns));
1269 &vpsrld ($t2,@X[3],$sigma1[2]);
1270 eval(shift(@insns));
1271 eval(shift(@insns));
1272 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1273 eval(shift(@insns));
1274 eval(shift(@insns));
1275 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1276 eval(shift(@insns));
1277 eval(shift(@insns));
1278 &vpxor ($t3,$t3,$t2);
1279 eval(shift(@insns));
1280 eval(shift(@insns));
1281 eval(shift(@insns));
1282 eval(shift(@insns));
1283 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1284 eval(shift(@insns));
1285 eval(shift(@insns));
1286 eval(shift(@insns));
1287 eval(shift(@insns));
1288 &vpsrldq ($t3,$t3,8);
1289 eval(shift(@insns));
1290 eval(shift(@insns));
1291 eval(shift(@insns));
1292 eval(shift(@insns));
1293 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1294 eval(shift(@insns));
1295 eval(shift(@insns));
1296 eval(shift(@insns));
1297 eval(shift(@insns));
1298 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1299 eval(shift(@insns));
1300 eval(shift(@insns));
1301 &vpsrld ($t2,@X[0],$sigma1[2]);
1302 eval(shift(@insns));
1303 eval(shift(@insns));
1304 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1305 eval(shift(@insns));
1306 eval(shift(@insns));
1307 &vpxor ($t3,$t3,$t2);
1308 eval(shift(@insns));
1309 eval(shift(@insns));
1310 eval(shift(@insns));
1311 eval(shift(@insns));
1312 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1313 eval(shift(@insns));
1314 eval(shift(@insns));
1315 eval(shift(@insns));
1316 eval(shift(@insns));
1317 &vpslldq ($t3,$t3,8); # 22 instructions
1318 eval(shift(@insns));
1319 eval(shift(@insns));
1320 eval(shift(@insns));
1321 eval(shift(@insns));
1322 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1323 eval(shift(@insns));
1324 eval(shift(@insns));
1325 eval(shift(@insns));
1326 eval(shift(@insns));
1327 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1328 foreach (@insns) { eval; } # remaining instructions
1329 &vmovdqa (16*$j."(%rsp)",$t2);
1330}
1331
1332 for ($i=0,$j=0; $j<4; $j++) {
1333 &XOP_256_00_47($j,\&body_00_15,@X);
1334 push(@X,shift(@X)); # rotate(@X)
1335 }
1336 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1337 &jne (".Lxop_00_47");
1338
1339 for ($i=0; $i<16; ) {
1340 foreach(body_00_15()) { eval; }
1341 }
1342
1343 } else { # SHA512
1344 my @X = map("%xmm$_",(0..7));
1345 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1346
1347$code.=<<___;
1348.align 16
1349.Lloop_xop:
1350 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1351 vmovdqu 0x00($inp),@X[0]
1352 lea $TABLE+0x80(%rip),$Tbl # size optimization
1353 vmovdqu 0x10($inp),@X[1]
1354 vmovdqu 0x20($inp),@X[2]
1355 vpshufb $t3,@X[0],@X[0]
1356 vmovdqu 0x30($inp),@X[3]
1357 vpshufb $t3,@X[1],@X[1]
1358 vmovdqu 0x40($inp),@X[4]
1359 vpshufb $t3,@X[2],@X[2]
1360 vmovdqu 0x50($inp),@X[5]
1361 vpshufb $t3,@X[3],@X[3]
1362 vmovdqu 0x60($inp),@X[6]
1363 vpshufb $t3,@X[4],@X[4]
1364 vmovdqu 0x70($inp),@X[7]
1365 vpshufb $t3,@X[5],@X[5]
1366 vpaddq -0x80($Tbl),@X[0],$t0
1367 vpshufb $t3,@X[6],@X[6]
1368 vpaddq -0x60($Tbl),@X[1],$t1
1369 vpshufb $t3,@X[7],@X[7]
1370 vpaddq -0x40($Tbl),@X[2],$t2
1371 vpaddq -0x20($Tbl),@X[3],$t3
1372 vmovdqa $t0,0x00(%rsp)
1373 vpaddq 0x00($Tbl),@X[4],$t0
1374 vmovdqa $t1,0x10(%rsp)
1375 vpaddq 0x20($Tbl),@X[5],$t1
1376 vmovdqa $t2,0x20(%rsp)
1377 vpaddq 0x40($Tbl),@X[6],$t2
1378 vmovdqa $t3,0x30(%rsp)
1379 vpaddq 0x60($Tbl),@X[7],$t3
1380 vmovdqa $t0,0x40(%rsp)
1381 mov $A,$a1
1382 vmovdqa $t1,0x50(%rsp)
1383 mov $B,$a3
1384 vmovdqa $t2,0x60(%rsp)
1385 xor $C,$a3 # magic
1386 vmovdqa $t3,0x70(%rsp)
1387 mov $E,$a0
1388 jmp .Lxop_00_47
1389
1390.align 16
1391.Lxop_00_47:
1392 add \$`16*2*$SZ`,$Tbl
1393___
1394sub XOP_512_00_47 () {
1395my $j = shift;
1396my $body = shift;
1397my @X = @_;
1398my @insns = (&$body,&$body); # 52 instructions
1399
1400 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1401 eval(shift(@insns));
1402 eval(shift(@insns));
1403 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1404 eval(shift(@insns));
1405 eval(shift(@insns));
1406 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1407 eval(shift(@insns));
1408 eval(shift(@insns));
1409 &vpsrlq ($t0,$t0,$sigma0[2]);
1410 eval(shift(@insns));
1411 eval(shift(@insns));
1412 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1413 eval(shift(@insns));
1414 eval(shift(@insns));
1415 eval(shift(@insns));
1416 eval(shift(@insns));
1417 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1418 eval(shift(@insns));
1419 eval(shift(@insns));
1420 &vpxor ($t0,$t0,$t1);
1421 eval(shift(@insns));
1422 eval(shift(@insns));
1423 eval(shift(@insns));
1424 eval(shift(@insns));
1425 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1426 eval(shift(@insns));
1427 eval(shift(@insns));
1428 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1429 eval(shift(@insns));
1430 eval(shift(@insns));
1431 &vpsrlq ($t2,@X[7],$sigma1[2]);
1432 eval(shift(@insns));
1433 eval(shift(@insns));
1434 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1435 eval(shift(@insns));
1436 eval(shift(@insns));
1437 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1438 eval(shift(@insns));
1439 eval(shift(@insns));
1440 &vpxor ($t3,$t3,$t2);
1441 eval(shift(@insns));
1442 eval(shift(@insns));
1443 eval(shift(@insns));
1444 eval(shift(@insns));
1445 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1446 eval(shift(@insns));
1447 eval(shift(@insns));
1448 eval(shift(@insns));
1449 eval(shift(@insns));
1450 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1451 eval(shift(@insns));
1452 eval(shift(@insns));
1453 eval(shift(@insns));
1454 eval(shift(@insns));
1455 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1456 foreach (@insns) { eval; } # remaining instructions
1457 &vmovdqa (16*$j."(%rsp)",$t2);
1458}
1459
1460 for ($i=0,$j=0; $j<8; $j++) {
1461 &XOP_512_00_47($j,\&body_00_15,@X);
1462 push(@X,shift(@X)); # rotate(@X)
1463 }
1464 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1465 &jne (".Lxop_00_47");
1466
1467 for ($i=0; $i<16; ) {
1468 foreach(body_00_15()) { eval; }
1469 }
1470}
1471$code.=<<___;
1472 mov $_ctx,$ctx
1473 mov $a1,$A
1474
1475 add $SZ*0($ctx),$A
1476 lea 16*$SZ($inp),$inp
1477 add $SZ*1($ctx),$B
1478 add $SZ*2($ctx),$C
1479 add $SZ*3($ctx),$D
1480 add $SZ*4($ctx),$E
1481 add $SZ*5($ctx),$F
1482 add $SZ*6($ctx),$G
1483 add $SZ*7($ctx),$H
1484
1485 cmp $_end,$inp
1486
1487 mov $A,$SZ*0($ctx)
1488 mov $B,$SZ*1($ctx)
1489 mov $C,$SZ*2($ctx)
1490 mov $D,$SZ*3($ctx)
1491 mov $E,$SZ*4($ctx)
1492 mov $F,$SZ*5($ctx)
1493 mov $G,$SZ*6($ctx)
1494 mov $H,$SZ*7($ctx)
1495 jb .Lloop_xop
1496
1497 mov $_rsp,%rsi
1498.cfi_def_cfa %rsi,8
1499 vzeroupper
1500___
1501$code.=<<___ if ($win64);
1502 movaps 16*$SZ+32(%rsp),%xmm6
1503 movaps 16*$SZ+48(%rsp),%xmm7
1504 movaps 16*$SZ+64(%rsp),%xmm8
1505 movaps 16*$SZ+80(%rsp),%xmm9
1506___
1507$code.=<<___ if ($win64 && $SZ>4);
1508 movaps 16*$SZ+96(%rsp),%xmm10
1509 movaps 16*$SZ+112(%rsp),%xmm11
1510___
1511$code.=<<___;
1512 mov -48(%rsi),%r15
1513.cfi_restore %r15
1514 mov -40(%rsi),%r14
1515.cfi_restore %r14
1516 mov -32(%rsi),%r13
1517.cfi_restore %r13
1518 mov -24(%rsi),%r12
1519.cfi_restore %r12
1520 mov -16(%rsi),%rbp
1521.cfi_restore %rbp
1522 mov -8(%rsi),%rbx
1523.cfi_restore %rbx
1524 lea (%rsi),%rsp
1525.cfi_def_cfa_register %rsp
1526.Lepilogue_xop:
1527 ret
1528.cfi_endproc
1529.size ${func}_xop,.-${func}_xop
1530___
1531}
1532######################################################################
1533# AVX+shrd code path
1534#
1535local *ror = sub { &shrd(@_[0],@_) };
1536
1537$code.=<<___;
1538.type ${func}_avx,\@function,3
1539.align 64
1540${func}_avx:
1541.cfi_startproc
1542.Lavx_shortcut:
1543 mov %rsp,%rax # copy %rsp
1544.cfi_def_cfa_register %rax
1545 push %rbx
1546.cfi_push %rbx
1547 push %rbp
1548.cfi_push %rbp
1549 push %r12
1550.cfi_push %r12
1551 push %r13
1552.cfi_push %r13
1553 push %r14
1554.cfi_push %r14
1555 push %r15
1556.cfi_push %r15
1557 shl \$4,%rdx # num*16
1558 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1559 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1560 and \$-64,%rsp # align stack frame
1561 mov $ctx,$_ctx # save ctx, 1st arg
1562 mov $inp,$_inp # save inp, 2nd arg
1563 mov %rdx,$_end # save end pointer, "3rd" arg
1564 mov %rax,$_rsp # save copy of %rsp
1565.cfi_cfa_expression $_rsp,deref,+8
1566___
1567$code.=<<___ if ($win64);
1568 movaps %xmm6,16*$SZ+32(%rsp)
1569 movaps %xmm7,16*$SZ+48(%rsp)
1570 movaps %xmm8,16*$SZ+64(%rsp)
1571 movaps %xmm9,16*$SZ+80(%rsp)
1572___
1573$code.=<<___ if ($win64 && $SZ>4);
1574 movaps %xmm10,16*$SZ+96(%rsp)
1575 movaps %xmm11,16*$SZ+112(%rsp)
1576___
1577$code.=<<___;
1578.Lprologue_avx:
1579
1580 vzeroupper
1581 mov $SZ*0($ctx),$A
1582 mov $SZ*1($ctx),$B
1583 mov $SZ*2($ctx),$C
1584 mov $SZ*3($ctx),$D
1585 mov $SZ*4($ctx),$E
1586 mov $SZ*5($ctx),$F
1587 mov $SZ*6($ctx),$G
1588 mov $SZ*7($ctx),$H
1589___
1590 if ($SZ==4) { # SHA256
1591 my @X = map("%xmm$_",(0..3));
1592 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1593
1594$code.=<<___;
1595 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1596 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1597 jmp .Lloop_avx
1598.align 16
1599.Lloop_avx:
1600 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1601 vmovdqu 0x00($inp),@X[0]
1602 vmovdqu 0x10($inp),@X[1]
1603 vmovdqu 0x20($inp),@X[2]
1604 vmovdqu 0x30($inp),@X[3]
1605 vpshufb $t3,@X[0],@X[0]
1606 lea $TABLE(%rip),$Tbl
1607 vpshufb $t3,@X[1],@X[1]
1608 vpshufb $t3,@X[2],@X[2]
1609 vpaddd 0x00($Tbl),@X[0],$t0
1610 vpshufb $t3,@X[3],@X[3]
1611 vpaddd 0x20($Tbl),@X[1],$t1
1612 vpaddd 0x40($Tbl),@X[2],$t2
1613 vpaddd 0x60($Tbl),@X[3],$t3
1614 vmovdqa $t0,0x00(%rsp)
1615 mov $A,$a1
1616 vmovdqa $t1,0x10(%rsp)
1617 mov $B,$a3
1618 vmovdqa $t2,0x20(%rsp)
1619 xor $C,$a3 # magic
1620 vmovdqa $t3,0x30(%rsp)
1621 mov $E,$a0
1622 jmp .Lavx_00_47
1623
1624.align 16
1625.Lavx_00_47:
1626 sub \$`-16*2*$SZ`,$Tbl # size optimization
1627___
1628sub Xupdate_256_AVX () {
1629 (
1630 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1631 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1632 '&vpsrld ($t2,$t0,$sigma0[0]);',
1633 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1634 '&vpsrld ($t3,$t0,$sigma0[2])',
1635 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1636 '&vpxor ($t0,$t3,$t2)',
1637 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1638 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1639 '&vpxor ($t0,$t0,$t1)',
1640 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1641 '&vpxor ($t0,$t0,$t2)',
1642 '&vpsrld ($t2,$t3,$sigma1[2]);',
1643 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1644 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1645 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1646 '&vpxor ($t2,$t2,$t3);',
1647 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1648 '&vpxor ($t2,$t2,$t3)',
1649 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1650 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1651 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1652 '&vpsrld ($t2,$t3,$sigma1[2])',
1653 '&vpsrlq ($t3,$t3,$sigma1[0])',
1654 '&vpxor ($t2,$t2,$t3);',
1655 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1656 '&vpxor ($t2,$t2,$t3)',
1657 '&vpshufb ($t2,$t2,$t5)',
1658 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1659 );
1660}
1661
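# In words: each Xupdate_256_AVX pass advances the schedule by four
# words, @X[0..3] += X[9..12] + sigma0(X[1..4]) + sigma1(X[14..17]).
# Because sigma1's inputs include the two words being produced in the
# same step, the sigma1 half is applied in two stages: first to
# X[14..15] (updating X[0..1]), then to the freshly formed X[16..17]
# (updating X[2..3]); $t4/$t5 hold the pshufb masks that place those
# two-word results in the right lanes.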
1662sub AVX_256_00_47 () {
1663my $j = shift;
1664my $body = shift;
1665my @X = @_;
1666my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1667
1668 foreach (Xupdate_256_AVX()) { # 29 instructions
1669 eval;
1670 eval(shift(@insns));
1671 eval(shift(@insns));
1672 eval(shift(@insns));
1673 }
1674 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1675 foreach (@insns) { eval; } # remaining instructions
1676 &vmovdqa (16*$j."(%rsp)",$t2);
1677}
1678
1679 for ($i=0,$j=0; $j<4; $j++) {
1680 &AVX_256_00_47($j,\&body_00_15,@X);
1681 push(@X,shift(@X)); # rotate(@X)
1682 }
1683 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1684 &jne (".Lavx_00_47");
1685
1686 for ($i=0; $i<16; ) {
1687 foreach(body_00_15()) { eval; }
1688 }
1689
1690 } else { # SHA512
1691 my @X = map("%xmm$_",(0..7));
1692 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1693
1694$code.=<<___;
1695 jmp .Lloop_avx
1696.align 16
1697.Lloop_avx:
1698 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1699 vmovdqu 0x00($inp),@X[0]
1700 lea $TABLE+0x80(%rip),$Tbl # size optimization
1701 vmovdqu 0x10($inp),@X[1]
1702 vmovdqu 0x20($inp),@X[2]
1703 vpshufb $t3,@X[0],@X[0]
1704 vmovdqu 0x30($inp),@X[3]
1705 vpshufb $t3,@X[1],@X[1]
1706 vmovdqu 0x40($inp),@X[4]
1707 vpshufb $t3,@X[2],@X[2]
1708 vmovdqu 0x50($inp),@X[5]
1709 vpshufb $t3,@X[3],@X[3]
1710 vmovdqu 0x60($inp),@X[6]
1711 vpshufb $t3,@X[4],@X[4]
1712 vmovdqu 0x70($inp),@X[7]
1713 vpshufb $t3,@X[5],@X[5]
1714 vpaddq -0x80($Tbl),@X[0],$t0
1715 vpshufb $t3,@X[6],@X[6]
1716 vpaddq -0x60($Tbl),@X[1],$t1
1717 vpshufb $t3,@X[7],@X[7]
1718 vpaddq -0x40($Tbl),@X[2],$t2
1719 vpaddq -0x20($Tbl),@X[3],$t3
1720 vmovdqa $t0,0x00(%rsp)
1721 vpaddq 0x00($Tbl),@X[4],$t0
1722 vmovdqa $t1,0x10(%rsp)
1723 vpaddq 0x20($Tbl),@X[5],$t1
1724 vmovdqa $t2,0x20(%rsp)
1725 vpaddq 0x40($Tbl),@X[6],$t2
1726 vmovdqa $t3,0x30(%rsp)
1727 vpaddq 0x60($Tbl),@X[7],$t3
1728 vmovdqa $t0,0x40(%rsp)
1729 mov $A,$a1
1730 vmovdqa $t1,0x50(%rsp)
1731 mov $B,$a3
1732 vmovdqa $t2,0x60(%rsp)
1733 xor $C,$a3 # magic
1734 vmovdqa $t3,0x70(%rsp)
1735 mov $E,$a0
1736 jmp .Lavx_00_47
1737
1738.align 16
1739.Lavx_00_47:
1740 add \$`16*2*$SZ`,$Tbl
1741___
1742sub Xupdate_512_AVX () {
1743 (
1744 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1745 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1746 '&vpsrlq ($t2,$t0,$sigma0[0])',
1747 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1748 '&vpsrlq ($t3,$t0,$sigma0[2])',
1749 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1750 '&vpxor ($t0,$t3,$t2)',
1751 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1752 '&vpxor ($t0,$t0,$t1)',
1753 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1754 '&vpxor ($t0,$t0,$t2)',
1755 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1756 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1757 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1758 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1759 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1760 '&vpxor ($t3,$t3,$t2)',
1761 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1762 '&vpxor ($t3,$t3,$t1)',
1763 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1764 '&vpxor ($t3,$t3,$t2)',
1765 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1766 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1767 );
1768}
1769
1770sub AVX_512_00_47 () {
1771my $j = shift;
1772my $body = shift;
1773my @X = @_;
1774my @insns = (&$body,&$body); # 52 instructions
1775
1776 foreach (Xupdate_512_AVX()) { # 23 instructions
1777 eval;
1778 eval(shift(@insns));
1779 eval(shift(@insns));
1780 }
1781 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1782 foreach (@insns) { eval; } # remaining instructions
1783 &vmovdqa (16*$j."(%rsp)",$t2);
1784}
1785
1786 for ($i=0,$j=0; $j<8; $j++) {
1787 &AVX_512_00_47($j,\&body_00_15,@X);
1788 push(@X,shift(@X)); # rotate(@X)
1789 }
1790 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1791 &jne (".Lavx_00_47");
1792
1793 for ($i=0; $i<16; ) {
1794 foreach(body_00_15()) { eval; }
1795 }
1796}
1797$code.=<<___;
1798 mov $_ctx,$ctx
1799 mov $a1,$A
1800
1801 add $SZ*0($ctx),$A
1802 lea 16*$SZ($inp),$inp
1803 add $SZ*1($ctx),$B
1804 add $SZ*2($ctx),$C
1805 add $SZ*3($ctx),$D
1806 add $SZ*4($ctx),$E
1807 add $SZ*5($ctx),$F
1808 add $SZ*6($ctx),$G
1809 add $SZ*7($ctx),$H
1810
1811 cmp $_end,$inp
1812
1813 mov $A,$SZ*0($ctx)
1814 mov $B,$SZ*1($ctx)
1815 mov $C,$SZ*2($ctx)
1816 mov $D,$SZ*3($ctx)
1817 mov $E,$SZ*4($ctx)
1818 mov $F,$SZ*5($ctx)
1819 mov $G,$SZ*6($ctx)
1820 mov $H,$SZ*7($ctx)
1821 jb .Lloop_avx
1822
1823 mov $_rsp,%rsi
1824.cfi_def_cfa %rsi,8
1825 vzeroupper
1826___
1827$code.=<<___ if ($win64);
1828 movaps 16*$SZ+32(%rsp),%xmm6
1829 movaps 16*$SZ+48(%rsp),%xmm7
1830 movaps 16*$SZ+64(%rsp),%xmm8
1831 movaps 16*$SZ+80(%rsp),%xmm9
1832___
1833$code.=<<___ if ($win64 && $SZ>4);
1834 movaps 16*$SZ+96(%rsp),%xmm10
1835 movaps 16*$SZ+112(%rsp),%xmm11
1836___
1837$code.=<<___;
1838 mov -48(%rsi),%r15
1839.cfi_restore %r15
1840 mov -40(%rsi),%r14
1841.cfi_restore %r14
1842 mov -32(%rsi),%r13
1843.cfi_restore %r13
1844 mov -24(%rsi),%r12
1845.cfi_restore %r12
1846 mov -16(%rsi),%rbp
1847.cfi_restore %rbp
1848 mov -8(%rsi),%rbx
1849.cfi_restore %rbx
1850 lea (%rsi),%rsp
1851.cfi_def_cfa_register %rsp
1852.Lepilogue_avx:
1853 ret
1854.cfi_endproc
1855.size ${func}_avx,.-${func}_avx
1856___
1857
1858if ($avx>1) {{
1859######################################################################
1860# AVX2+BMI code path
1861#
1862my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1863my $PUSH8=8*2*$SZ;
1864use integer;
1865
1866sub bodyx_00_15 () {
1867 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
1868 (
1869 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1870
1871 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1872 '&and ($a4,$e)', # f&e
1873 '&rorx ($a0,$e,$Sigma1[2])',
1874 '&rorx ($a2,$e,$Sigma1[1])',
1875
1876 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1877 '&lea ($h,"($h,$a4)")',
1878 '&andn ($a4,$e,$g)', # ~e&g
1879 '&xor ($a0,$a2)',
1880
1881 '&rorx ($a1,$e,$Sigma1[0])',
1882 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1883 '&xor ($a0,$a1)', # Sigma1(e)
1884 '&mov ($a2,$a)',
1885
1886 '&rorx ($a4,$a,$Sigma0[2])',
1887 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1888 '&xor ($a2,$b)', # a^b, b^c in next round
1889 '&rorx ($a1,$a,$Sigma0[1])',
1890
1891 '&rorx ($a0,$a,$Sigma0[0])',
1892 '&lea ($d,"($d,$h)")', # d+=h
1893 '&and ($a3,$a2)', # (b^c)&(a^b)
1894 '&xor ($a1,$a4)',
1895
1896 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1897 '&xor ($a1,$a0)', # Sigma0(a)
1898 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1899 '&mov ($a4,$e)', # copy of f in future
1900
1901 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1902 );
1903 # and at the finish one has to $a+=$a1
1904}
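# Informal notes on the instruction selection above: rorx (BMI2) is a
# non-destructive rotate that leaves the flags untouched, so the Sigma
# rotations add no extra register copies or flag dependencies; andn
# (BMI1) delivers ~e&g directly; and since e&f and ~e&g never have a bit
# set in common, Ch(e,f,g) can be accumulated with lea/add instead of
# xor, letting it fold straight into the running sum in $h.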
1905
1906$code.=<<___;
1907.type ${func}_avx2,\@function,3
1908.align 64
1909${func}_avx2:
1910.cfi_startproc
1911.Lavx2_shortcut:
1912 mov %rsp,%rax # copy %rsp
1913.cfi_def_cfa_register %rax
1914 push %rbx
1915.cfi_push %rbx
1916 push %rbp
1917.cfi_push %rbp
1918 push %r12
1919.cfi_push %r12
1920 push %r13
1921.cfi_push %r13
1922 push %r14
1923.cfi_push %r14
1924 push %r15
1925.cfi_push %r15
1926 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1927 shl \$4,%rdx # num*16
1928 and \$-256*$SZ,%rsp # align stack frame
1929 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1930 add \$`2*$SZ*($rounds-8)`,%rsp
1931 mov $ctx,$_ctx # save ctx, 1st arg
1932 mov $inp,$_inp # save inp, 2nd arg
1933 mov %rdx,$_end # save end pointer, "3rd" arg
1934 mov %rax,$_rsp # save copy of %rsp
1935.cfi_cfa_expression $_rsp,deref,+8
1936___
1937$code.=<<___ if ($win64);
1938 movaps %xmm6,16*$SZ+32(%rsp)
1939 movaps %xmm7,16*$SZ+48(%rsp)
1940 movaps %xmm8,16*$SZ+64(%rsp)
1941 movaps %xmm9,16*$SZ+80(%rsp)
1942___
1943$code.=<<___ if ($win64 && $SZ>4);
1944 movaps %xmm10,16*$SZ+96(%rsp)
1945 movaps %xmm11,16*$SZ+112(%rsp)
1946___
1947$code.=<<___;
1948.Lprologue_avx2:
1949
1950 vzeroupper
1951 sub \$-16*$SZ,$inp # inp++, size optimization
1952 mov $SZ*0($ctx),$A
1953 mov $inp,%r12 # borrow $T1
1954 mov $SZ*1($ctx),$B
1955 cmp %rdx,$inp # $_end
1956 mov $SZ*2($ctx),$C
1957 cmove %rsp,%r12 # next block or random data
1958 mov $SZ*3($ctx),$D
1959 mov $SZ*4($ctx),$E
1960 mov $SZ*5($ctx),$F
1961 mov $SZ*6($ctx),$G
1962 mov $SZ*7($ctx),$H
1963___
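# Editorial note: each .Loop_avx2 iteration schedules two 16*$SZ-byte blocks
# at once, the block at $inp in the low 128-bit lanes and the block at %r12
# in the high lanes (filled in by the vinserti128 pairs).  The cmove above
# points %r12 at the stack when no second block exists, so the high lane
# still reads valid memory; its results are never consumed in that case,
# because the code reaches .Ldone_avx2 before the .Lower_avx2 pass.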
1964 if ($SZ==4) { # SHA256
1965 my @X = map("%ymm$_",(0..3));
1966 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1967
1968$code.=<<___;
1969 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1970 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1971 jmp .Loop_avx2
1972.align 16
1973.Loop_avx2:
1974 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1975 vmovdqu -16*$SZ+0($inp),%xmm0
1976 vmovdqu -16*$SZ+16($inp),%xmm1
1977 vmovdqu -16*$SZ+32($inp),%xmm2
1978 vmovdqu -16*$SZ+48($inp),%xmm3
1979 #mov $inp,$_inp # offload $inp
1980 vinserti128 \$1,(%r12),@X[0],@X[0]
1981 vinserti128 \$1,16(%r12),@X[1],@X[1]
1982 vpshufb $t3,@X[0],@X[0]
1983 vinserti128 \$1,32(%r12),@X[2],@X[2]
1984 vpshufb $t3,@X[1],@X[1]
1985 vinserti128 \$1,48(%r12),@X[3],@X[3]
1986
1987 lea $TABLE(%rip),$Tbl
1988 vpshufb $t3,@X[2],@X[2]
1989 vpaddd 0x00($Tbl),@X[0],$t0
1990 vpshufb $t3,@X[3],@X[3]
1991 vpaddd 0x20($Tbl),@X[1],$t1
1992 vpaddd 0x40($Tbl),@X[2],$t2
1993 vpaddd 0x60($Tbl),@X[3],$t3
1994 vmovdqa $t0,0x00(%rsp)
1995 xor $a1,$a1
1996 vmovdqa $t1,0x20(%rsp)
1997___
1998$code.=<<___ if (!$win64);
1999# temporarily use %rdi as frame pointer
2000 mov $_rsp,%rdi
2001.cfi_def_cfa %rdi,8
2002___
2003$code.=<<___;
2004 lea -$PUSH8(%rsp),%rsp
2005___
2006$code.=<<___ if (!$win64);
2007# the frame info is at $_rsp, but the stack is moving...
2008# so a second frame pointer is saved at -8(%rsp)
2009# that is in the red zone
2010 mov %rdi,-8(%rsp)
2011.cfi_cfa_expression %rsp-8,deref,+8
2012___
2013$code.=<<___;
2014 mov $B,$a3
2015 vmovdqa $t2,0x00(%rsp)
2016 xor $C,$a3 # magic
2017 vmovdqa $t3,0x20(%rsp)
2018 mov $F,$a4
2019 sub \$-16*2*$SZ,$Tbl # size optimization
2020 jmp .Lavx2_00_47
2021
2022.align 16
2023.Lavx2_00_47:
2024___
2025
2026sub AVX2_256_00_47 () {
2027my $j = shift;
2028my $body = shift;
2029my @X = @_;
2030my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
2031my $base = "+2*$PUSH8(%rsp)";
2032
2033 if (($j%2)==0) {
2034 &lea ("%rsp","-$PUSH8(%rsp)");
2035$code.=<<___ if (!$win64);
2036.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
2037# copy secondary frame pointer to new location again at -8(%rsp)
2038 pushq $PUSH8-8(%rsp)
2039.cfi_cfa_expression %rsp,deref,+8
2040 lea 8(%rsp),%rsp
2041.cfi_cfa_expression %rsp-8,deref,+8
2042___
2043 }
2044
2045 foreach (Xupdate_256_AVX()) { # 29 instructions
2046 eval;
2047 eval(shift(@insns));
2048 eval(shift(@insns));
2049 eval(shift(@insns));
2050 }
2051 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
2052 foreach (@insns) { eval; } # remaining instructions
2053 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2054}
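# Editorial note on the addressing above (SHA-256 case, $SZ==4, $PUSH8==64):
# each &vmovdqa parks one 32-byte row of prepared X[i]+K[i] words, four
# dwords for the low-lane block in bytes 0..15 and four for the high-lane
# block in bytes 16..31.  bodyx_00_15 reads word i back at
#	(32*int(i/4) + 4*(i%4)) % 64	relative to $base,
# i.e. 0,4,8,12 for i=0..3 and 32,36,40,44 for i=4..7, while %rsp slides down
# by $PUSH8 on every even $j, so a two-row window of prepared values stays
# live.  The .Lower_avx2 pass replays the same offsets with a $base that is
# 16 bytes higher, which selects the high-lane copies.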
2055
2056 for ($i=0,$j=0; $j<4; $j++) {
2057 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
2058 push(@X,shift(@X)); # rotate(@X)
2059 }
2060 &lea ($Tbl,16*2*$SZ."($Tbl)");
2061 &cmpb (($SZ-1)."($Tbl)",0);
2062 &jne (".Lavx2_00_47");
2063
2064 for ($i=0; $i<16; ) {
2065 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2066 foreach(bodyx_00_15()) { eval; }
2067 }
2068 } else { # SHA512
2069 my @X = map("%ymm$_",(0..7));
2070 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
2071
2072$code.=<<___;
2073 jmp .Loop_avx2
2074.align 16
2075.Loop_avx2:
2076 vmovdqu -16*$SZ($inp),%xmm0
2077 vmovdqu -16*$SZ+16($inp),%xmm1
2078 vmovdqu -16*$SZ+32($inp),%xmm2
2079 lea $TABLE+0x80(%rip),$Tbl # size optimization
2080 vmovdqu -16*$SZ+48($inp),%xmm3
2081 vmovdqu -16*$SZ+64($inp),%xmm4
2082 vmovdqu -16*$SZ+80($inp),%xmm5
2083 vmovdqu -16*$SZ+96($inp),%xmm6
2084 vmovdqu -16*$SZ+112($inp),%xmm7
2085 #mov $inp,$_inp # offload $inp
2086 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
2087 vinserti128 \$1,(%r12),@X[0],@X[0]
2088 vinserti128 \$1,16(%r12),@X[1],@X[1]
2089 vpshufb $t2,@X[0],@X[0]
2090 vinserti128 \$1,32(%r12),@X[2],@X[2]
2091 vpshufb $t2,@X[1],@X[1]
2092 vinserti128 \$1,48(%r12),@X[3],@X[3]
2093 vpshufb $t2,@X[2],@X[2]
2094 vinserti128 \$1,64(%r12),@X[4],@X[4]
2095 vpshufb $t2,@X[3],@X[3]
2096 vinserti128 \$1,80(%r12),@X[5],@X[5]
2097 vpshufb $t2,@X[4],@X[4]
2098 vinserti128 \$1,96(%r12),@X[6],@X[6]
2099 vpshufb $t2,@X[5],@X[5]
2100 vinserti128 \$1,112(%r12),@X[7],@X[7]
2101
2102 vpaddq -0x80($Tbl),@X[0],$t0
2103 vpshufb $t2,@X[6],@X[6]
2104 vpaddq -0x60($Tbl),@X[1],$t1
2105 vpshufb $t2,@X[7],@X[7]
2106 vpaddq -0x40($Tbl),@X[2],$t2
2107 vpaddq -0x20($Tbl),@X[3],$t3
2108 vmovdqa $t0,0x00(%rsp)
2109 vpaddq 0x00($Tbl),@X[4],$t0
2110 vmovdqa $t1,0x20(%rsp)
2111 vpaddq 0x20($Tbl),@X[5],$t1
2112 vmovdqa $t2,0x40(%rsp)
2113 vpaddq 0x40($Tbl),@X[6],$t2
2114 vmovdqa $t3,0x60(%rsp)
2115___
2116$code.=<<___ if (!$win64);
2117# temporarily use %rdi as frame pointer
2118 mov $_rsp,%rdi
2119.cfi_def_cfa %rdi,8
2120___
2121$code.=<<___;
2122 lea -$PUSH8(%rsp),%rsp
2123___
2124$code.=<<___ if (!$win64);
2125# the frame info is at $_rsp, but the stack is moving...
2126# so a second frame pointer is saved at -8(%rsp)
2127# that is in the red zone
2128 mov %rdi,-8(%rsp)
2129.cfi_cfa_expression %rsp-8,deref,+8
2130___
2131$code.=<<___;
2132 vpaddq 0x60($Tbl),@X[7],$t3
2133 vmovdqa $t0,0x00(%rsp)
2134 xor $a1,$a1
2135 vmovdqa $t1,0x20(%rsp)
2136 mov $B,$a3
2137 vmovdqa $t2,0x40(%rsp)
2138 xor $C,$a3 # magic
2139 vmovdqa $t3,0x60(%rsp)
2140 mov $F,$a4
2141 add \$16*2*$SZ,$Tbl
2142 jmp .Lavx2_00_47
2143
2144.align 16
2145.Lavx2_00_47:
2146___
2147
2148sub AVX2_512_00_47 () {
2149my $j = shift;
2150my $body = shift;
2151my @X = @_;
2152my @insns = (&$body,&$body); # 48 instructions
2153my $base = "+2*$PUSH8(%rsp)";
2154
2155 if (($j%4)==0) {
2156 &lea ("%rsp","-$PUSH8(%rsp)");
2157$code.=<<___ if (!$win64);
2158.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
2159# copy secondary frame pointer to new location again at -8(%rsp)
2160 pushq $PUSH8-8(%rsp)
2161.cfi_cfa_expression %rsp,deref,+8
2162 lea 8(%rsp),%rsp
2163.cfi_cfa_expression %rsp-8,deref,+8
2164___
2165 }
2166
2167 foreach (Xupdate_512_AVX()) { # 23 instructions
2168 eval;
2169 if ($_ !~ /\;$/) {
2170 eval(shift(@insns));
2171 eval(shift(@insns));
2172 eval(shift(@insns));
2173 }
2174 }
2175 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
2176 foreach (@insns) { eval; } # remaining instructions
2177 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2178}
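# Editorial note: the SHA-512 flavour follows the same scheme with eight ymm
# registers of four 64-bit words each; every call above stores one 32-byte
# row of X[i]+K[i], so the stack window only advances by $PUSH8 (128 bytes
# here) once every four calls instead of every other one.  $Tbl was also
# biased by +0x80 when it was loaded, so the vpaddq displacements
# -0x80..0x60 all fit in a signed byte; that is the "size optimization"
# noted above.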
2179
2180 for ($i=0,$j=0; $j<8; $j++) {
2181 &AVX2_512_00_47($j,\&bodyx_00_15,@X);
2182 push(@X,shift(@X)); # rotate(@X)
2183 }
2184 &lea ($Tbl,16*2*$SZ."($Tbl)");
2185 &cmpb (($SZ-1-0x80)."($Tbl)",0);
2186 &jne (".Lavx2_00_47");
2187
2188 for ($i=0; $i<16; ) {
2189 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2190 foreach(bodyx_00_15()) { eval; }
2191 }
2192}
2193$code.=<<___;
2194 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2195 add $a1,$A
2196 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2197 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
2198
2199 add $SZ*0($ctx),$A
2200 add $SZ*1($ctx),$B
2201 add $SZ*2($ctx),$C
2202 add $SZ*3($ctx),$D
2203 add $SZ*4($ctx),$E
2204 add $SZ*5($ctx),$F
2205 add $SZ*6($ctx),$G
2206 add $SZ*7($ctx),$H
2207
2208 mov $A,$SZ*0($ctx)
2209 mov $B,$SZ*1($ctx)
2210 mov $C,$SZ*2($ctx)
2211 mov $D,$SZ*3($ctx)
2212 mov $E,$SZ*4($ctx)
2213 mov $F,$SZ*5($ctx)
2214 mov $G,$SZ*6($ctx)
2215 mov $H,$SZ*7($ctx)
2216
2217 cmp `$PUSH8+2*8`($Tbl),$inp # $_end
2218 je .Ldone_avx2
2219
2220 xor $a1,$a1
2221 mov $B,$a3
2222 xor $C,$a3 # magic
2223 mov $F,$a4
2224 jmp .Lower_avx2
2225.align 16
2226.Lower_avx2:
2227___
2228 for ($i=0; $i<8; ) {
2229 my $base="+16($Tbl)";
2230 foreach(bodyx_00_15()) { eval; }
2231 }
2232$code.=<<___;
2233 lea -$PUSH8($Tbl),$Tbl
2234 cmp %rsp,$Tbl
2235 jae .Lower_avx2
2236
2237 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2238 add $a1,$A
2239 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2240 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
2241# restore frame pointer to original location at $_rsp
2242.cfi_cfa_expression $_rsp,deref,+8
2243
2244 add $SZ*0($ctx),$A
2245 add $SZ*1($ctx),$B
2246 add $SZ*2($ctx),$C
2247 add $SZ*3($ctx),$D
2248 add $SZ*4($ctx),$E
2249 add $SZ*5($ctx),$F
2250 lea `2*16*$SZ`($inp),$inp # inp+=2
2251 add $SZ*6($ctx),$G
2252 mov $inp,%r12
2253 add $SZ*7($ctx),$H
2254 cmp $_end,$inp
2255
2256 mov $A,$SZ*0($ctx)
2257 cmove %rsp,%r12 # next block or stale data
2258 mov $B,$SZ*1($ctx)
2259 mov $C,$SZ*2($ctx)
2260 mov $D,$SZ*3($ctx)
2261 mov $E,$SZ*4($ctx)
2262 mov $F,$SZ*5($ctx)
2263 mov $G,$SZ*6($ctx)
2264 mov $H,$SZ*7($ctx)
2265
2266 jbe .Loop_avx2
2267 lea (%rsp),$Tbl
2268# temporarily use $Tbl as index to $_rsp
2269# this avoids the need to save a secondary frame pointer at -8(%rsp)
2270.cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8
2271
2272.Ldone_avx2:
2273 mov `16*$SZ+3*8`($Tbl),%rsi
2274.cfi_def_cfa %rsi,8
2275 vzeroupper
2276___
2277$code.=<<___ if ($win64);
2278 movaps 16*$SZ+32($Tbl),%xmm6
2279 movaps 16*$SZ+48($Tbl),%xmm7
2280 movaps 16*$SZ+64($Tbl),%xmm8
2281 movaps 16*$SZ+80($Tbl),%xmm9
2282___
2283$code.=<<___ if ($win64 && $SZ>4);
2284 movaps 16*$SZ+96($Tbl),%xmm10
2285 movaps 16*$SZ+112($Tbl),%xmm11
2286___
2287$code.=<<___;
2288 mov -48(%rsi),%r15
2289.cfi_restore %r15
2290 mov -40(%rsi),%r14
2291.cfi_restore %r14
2292 mov -32(%rsi),%r13
2293.cfi_restore %r13
2294 mov -24(%rsi),%r12
2295.cfi_restore %r12
2296 mov -16(%rsi),%rbp
2297.cfi_restore %rbp
2298 mov -8(%rsi),%rbx
2299.cfi_restore %rbx
2300 lea (%rsi),%rsp
2301.cfi_def_cfa_register %rsp
2302.Lepilogue_avx2:
2303 ret
2304.cfi_endproc
2305.size ${func}_avx2,.-${func}_avx2
2306___
2307}}
2308}}}}}
2309
2310# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2311# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2312if ($win64) {
2313$rec="%rcx";
2314$frame="%rdx";
2315$context="%r8";
2316$disp="%r9";
2317
2318$code.=<<___;
2319.extern __imp_RtlVirtualUnwind
2320.type se_handler,\@abi-omnipotent
2321.align 16
2322se_handler:
2323 push %rsi
2324 push %rdi
2325 push %rbx
2326 push %rbp
2327 push %r12
2328 push %r13
2329 push %r14
2330 push %r15
2331 pushfq
2332 sub \$64,%rsp
2333
2334 mov 120($context),%rax # pull context->Rax
2335 mov 248($context),%rbx # pull context->Rip
2336
2337 mov 8($disp),%rsi # disp->ImageBase
2338	mov	56($disp),%r11		# disp->HandlerData
2339
2340 mov 0(%r11),%r10d # HandlerData[0]
2341 lea (%rsi,%r10),%r10 # prologue label
2342 cmp %r10,%rbx # context->Rip<prologue label
2343 jb .Lin_prologue
2344
2345 mov 152($context),%rax # pull context->Rsp
2346
2347 mov 4(%r11),%r10d # HandlerData[1]
2348 lea (%rsi,%r10),%r10 # epilogue label
2349 cmp %r10,%rbx # context->Rip>=epilogue label
2350 jae .Lin_prologue
2351___
2352$code.=<<___ if ($avx>1);
2353 lea .Lavx2_shortcut(%rip),%r10
2354 cmp %r10,%rbx # context->Rip<avx2_shortcut
2355 jb .Lnot_in_avx2
2356
2357 and \$-256*$SZ,%rax
2358 add \$`2*$SZ*($rounds-8)`,%rax
2359.Lnot_in_avx2:
2360___
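# Editorial note: the block above re-derives the frame base for a fault taken
# inside ${func}_avx2, whose prologue realigns %rsp (and \$-256*$SZ) and then
# re-biases it; repeating the same arithmetic on context->Rsp lets the
# "pull $_rsp" load below find the caller's saved stack pointer at the usual
# 16*$SZ+3*8 offset.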
2361$code.=<<___;
2362 mov %rax,%rsi # put aside Rsp
2363 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2364
2365 mov -8(%rax),%rbx
2366 mov -16(%rax),%rbp
2367 mov -24(%rax),%r12
2368 mov -32(%rax),%r13
2369 mov -40(%rax),%r14
2370 mov -48(%rax),%r15
2371 mov %rbx,144($context) # restore context->Rbx
2372 mov %rbp,160($context) # restore context->Rbp
2373 mov %r12,216($context) # restore context->R12
2374 mov %r13,224($context) # restore context->R13
2375 mov %r14,232($context) # restore context->R14
2376 mov %r15,240($context) # restore context->R15
2377
2378 lea .Lepilogue(%rip),%r10
2379 cmp %r10,%rbx
2380 jb .Lin_prologue # non-AVX code
2381
2382 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2383 lea 512($context),%rdi # &context.Xmm6
2384 mov \$`$SZ==4?8:12`,%ecx
2385 .long 0xa548f3fc # cld; rep movsq
2386
2387.Lin_prologue:
2388 mov 8(%rax),%rdi
2389 mov 16(%rax),%rsi
2390 mov %rax,152($context) # restore context->Rsp
2391 mov %rsi,168($context) # restore context->Rsi
2392 mov %rdi,176($context) # restore context->Rdi
2393
2394 mov 40($disp),%rdi # disp->ContextRecord
2395 mov $context,%rsi # context
2396	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
2397 .long 0xa548f3fc # cld; rep movsq
2398
2399 mov $disp,%rsi
2400 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2401 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2402 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2403 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2404 mov 40(%rsi),%r10 # disp->ContextRecord
2405 lea 56(%rsi),%r11 # &disp->HandlerData
2406 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2407 mov %r10,32(%rsp) # arg5
2408 mov %r11,40(%rsp) # arg6
2409 mov %r12,48(%rsp) # arg7
2410 mov %rcx,56(%rsp) # arg8, (NULL)
2411 call *__imp_RtlVirtualUnwind(%rip)
2412
2413 mov \$1,%eax # ExceptionContinueSearch
2414 add \$64,%rsp
2415 popfq
2416 pop %r15
2417 pop %r14
2418 pop %r13
2419 pop %r12
2420 pop %rbp
2421 pop %rbx
2422 pop %rdi
2423 pop %rsi
2424 ret
2425.size se_handler,.-se_handler
2426___
2427
2428$code.=<<___ if ($SZ==4 && $shaext);
2429.type shaext_handler,\@abi-omnipotent
2430.align 16
2431shaext_handler:
2432 push %rsi
2433 push %rdi
2434 push %rbx
2435 push %rbp
2436 push %r12
2437 push %r13
2438 push %r14
2439 push %r15
2440 pushfq
2441 sub \$64,%rsp
2442
2443 mov 120($context),%rax # pull context->Rax
2444 mov 248($context),%rbx # pull context->Rip
2445
2446 lea .Lprologue_shaext(%rip),%r10
2447	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
2448 jb .Lin_prologue
2449
2450 lea .Lepilogue_shaext(%rip),%r10
2451	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
2452 jae .Lin_prologue
2453
2454 lea -8-5*16(%rax),%rsi
2455 lea 512($context),%rdi # &context.Xmm6
2456 mov \$10,%ecx
2457 .long 0xa548f3fc # cld; rep movsq
2458
2459 jmp .Lin_prologue
2460.size shaext_handler,.-shaext_handler
2461___
2462
2463$code.=<<___;
2464.section .pdata
2465.align 4
2466 .rva .LSEH_begin_$func
2467 .rva .LSEH_end_$func
2468 .rva .LSEH_info_$func
2469___
2470$code.=<<___ if ($SZ==4 && $shaext);
2471 .rva .LSEH_begin_${func}_shaext
2472 .rva .LSEH_end_${func}_shaext
2473 .rva .LSEH_info_${func}_shaext
2474___
2475$code.=<<___ if ($SZ==4);
2476 .rva .LSEH_begin_${func}_ssse3
2477 .rva .LSEH_end_${func}_ssse3
2478 .rva .LSEH_info_${func}_ssse3
2479___
2480$code.=<<___ if ($avx && $SZ==8);
2481 .rva .LSEH_begin_${func}_xop
2482 .rva .LSEH_end_${func}_xop
2483 .rva .LSEH_info_${func}_xop
2484___
2485$code.=<<___ if ($avx);
2486 .rva .LSEH_begin_${func}_avx
2487 .rva .LSEH_end_${func}_avx
2488 .rva .LSEH_info_${func}_avx
2489___
2490$code.=<<___ if ($avx>1);
2491 .rva .LSEH_begin_${func}_avx2
2492 .rva .LSEH_end_${func}_avx2
2493 .rva .LSEH_info_${func}_avx2
2494___
2495$code.=<<___;
2496.section .xdata
2497.align 8
2498.LSEH_info_$func:
2499 .byte 9,0,0,0
2500 .rva se_handler
2501 .rva .Lprologue,.Lepilogue # HandlerData[]
2502___
2503$code.=<<___ if ($SZ==4 && $shaext);
2504.LSEH_info_${func}_shaext:
2505 .byte 9,0,0,0
2506 .rva shaext_handler
2507___
2508$code.=<<___ if ($SZ==4);
2509.LSEH_info_${func}_ssse3:
2510 .byte 9,0,0,0
2511 .rva se_handler
2512 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2513___
2514$code.=<<___ if ($avx && $SZ==8);
2515.LSEH_info_${func}_xop:
2516 .byte 9,0,0,0
2517 .rva se_handler
2518 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2519___
2520$code.=<<___ if ($avx);
2521.LSEH_info_${func}_avx:
2522 .byte 9,0,0,0
2523 .rva se_handler
2524 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2525___
2526$code.=<<___ if ($avx>1);
2527.LSEH_info_${func}_avx2:
2528 .byte 9,0,0,0
2529 .rva se_handler
2530 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2531___
2532}
2533
2534sub sha256op38 {
2535 my $instr = shift;
2536 my %opcodelet = (
2537 "sha256rnds2" => 0xcb,
2538 "sha256msg1" => 0xcc,
2539 "sha256msg2" => 0xcd );
2540
2541    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2542 my @opcode=(0x0f,0x38);
2543 push @opcode,$opcodelet{$instr};
2544 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2545 return ".byte\t".join(',',@opcode);
2546 } else {
2547	return $instr."\t".$_[0];
2548 }
2549}
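# Editorial worked example, not used by the generator: the SHA-NI opcodes are
# NP 0F 38 CB/CC/CD /r, so for an assembler that does not know them
#	sha256rnds2	%xmm0,%xmm1
# is emitted by sha256op38() as
#	.byte	0x0f,0x38,0xcb,0xc8
# where 0xc8 is ModR/M 11 001 000: mod=11 (register operands), reg=%xmm1 (the
# destination), r/m=%xmm0 (the source); %xmm0 is also the instruction's
# implicit third operand.  The regex limits the rewrite to %xmm0..%xmm7
# because no REX prefix is emitted for higher registers.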
2550
2551foreach (split("\n",$code)) {
2552 s/\`([^\`]*)\`/eval $1/geo;
2553
2554 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2555
2556 print $_,"\n";
2557}
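# Editorial note: the loop above is the final filter.  The first substitution
# evaluates any `...` fragment left in $code as Perl, so, assuming the
# SHA-256 build ($SZ==4, $rounds==64), an operand written in the templates as
# `2*$SZ*($rounds-8)` reaches the filter as `2*4*(64-8)` and is printed as
# the constant 448; the second substitution routes every sha256* mnemonic
# through sha256op38() so the register forms it recognizes assemble even
# without SHA-NI support.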
2558close STDOUT or die "error closing STDOUT: $!";