1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # June 2011
|
---|
18 | #
|
---|
19 | # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
|
---|
20 | # in http://download.intel.com/design/intarch/papers/323686.pdf, is
|
---|
21 | # that since AESNI-CBC encrypt exhibits *very* low instruction-level
|
---|
22 | # parallelism, interleaving it with another algorithm would allow us to
|
---|
23 | # utilize processor resources better and achieve better performance.
|
---|
24 | # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
|
---|
25 | # AESNI code is weaved into it. Below are performance numbers in
|
---|
26 | # cycles per processed byte, less is better, for standalone AESNI-CBC
|
---|
27 | # encrypt, sum of the latter and standalone SHA1, and "stitched"
|
---|
28 | # subroutine:
|
---|
29 | #
|
---|
30 | # AES-128-CBC +SHA1 stitch gain
|
---|
31 | # Westmere 3.77[+5.3] 9.07 6.55 +38%
|
---|
32 | # Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
|
---|
33 | # Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
|
---|
34 | # Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
|
---|
35 | # Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%)
|
---|
36 | # Bulldozer 5.77[+6.0] 11.72 6.37 +84%
|
---|
37 | # Ryzen(**) 2.71[+1.93] 4.64 2.74 +69%
|
---|
38 | # Goldmont(**) 3.82[+1.70] 5.52 4.20 +31%
|
---|
39 | #
|
---|
40 | # AES-192-CBC
|
---|
41 | # Westmere 4.51 9.81 6.80 +44%
|
---|
42 | # Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%)
|
---|
43 | # Ivy Bridge 6.05 10.65 6.07 +75%
|
---|
44 | # Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%)
|
---|
45 | # Bulldozer 6.89 12.84 6.96 +84%
|
---|
46 | #
|
---|
47 | # AES-256-CBC
|
---|
48 | # Westmere 5.25 10.55 7.21 +46%
|
---|
49 | # Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
|
---|
50 | # Ivy Bridge 7.05 11.65 7.12 +64%
|
---|
51 | # Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
|
---|
52 | # Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%)
|
---|
53 | # Bulldozer 8.00 13.95 8.25 +69%
|
---|
54 | # Ryzen(**) 3.71 5.64 3.72 +52%
|
---|
55 | # Goldmont(**) 5.35 7.05 5.76 +22%
|
---|
56 | #
|
---|
57 | # (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
|
---|
58 | # background information. Above numbers in parentheses are SSSE3
|
---|
59 | # results collected on AVX-capable CPU, i.e. apply on OSes that
|
---|
60 | # don't support AVX.
|
---|
61 | # (**) SHAEXT results.
|
---|
62 | #
|
---|
63 | # Needless to mention that it makes no sense to implement "stitched"
|
---|
64 | # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
|
---|
65 | # fully utilize parallelism, so stitching would not give any gain
|
---|
66 | # anyway. Well, there might be some, e.g. because of better cache
|
---|
67 | # locality... For reference, here are performance results for
|
---|
68 | # standalone AESNI-CBC decrypt:
|
---|
69 | #
|
---|
70 | # AES-128-CBC AES-192-CBC AES-256-CBC
|
---|
71 | # Westmere 1.25 1.50 1.75
|
---|
72 | # Sandy Bridge 0.74 0.91 1.09
|
---|
73 | # Ivy Bridge 0.74 0.90 1.11
|
---|
74 | # Haswell 0.63 0.76 0.88
|
---|
75 | # Bulldozer 0.70 0.85 0.99
|
---|
76 |
|
---|
77 | # And indeed:
|
---|
78 | #
|
---|
79 | # AES-256-CBC +SHA1 stitch gain
|
---|
80 | # Westmere 1.75 7.20 6.68 +7.8%
|
---|
81 | # Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%)
|
---|
82 | # Ivy Bridge 1.11 5.70 5.45 +4.6%
|
---|
83 | # Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%)
|
---|
84 | # Bulldozer 0.99 6.95 5.95 +17%(**)
|
---|
85 | #
|
---|
86 | # (*) Tiny improvement coefficient on Haswell is because we compare
|
---|
87 | # AVX1 stitch to sum with AVX2 SHA1.
|
---|
88 | # (**) Execution is fully dominated by integer code sequence and
|
---|
89 | # SIMD still hardly shows [in single-process benchmark;-]
|
---|
90 |
|
---|
# Command line is "[flavour] [output]". A first argument that contains a
# dot is taken to be the output file name, leaving the flavour undefined.
($flavour, $output) = (shift, shift);
if ($flavour =~ /\./) {
    $output = $flavour;
    undef $flavour;
}

# Win64 conventions are selected by flavour, or implied by an .asm output.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Directory part of this script's own path (trailing separator included).
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;

# The translator is looked up next to this script first and then under
# ../../perlasm/; generation cannot proceed without it.
$xlate = "${dir}x86_64-xlate.pl";
$xlate = "${dir}../../perlasm/x86_64-xlate.pl" unless (-f $xlate);
die "can't locate x86_64-xlate.pl" unless (-f $xlate);
101 |
|
---|
# Decide whether the assembler in use can encode AVX instructions; $avx
# gates emission of the AVX code path. Any one of these probes enables it:
#   - GNU assembler reached through $ENV{CC}, version >= 2.19
#   - NASM  (Win64 only, flavour or $ENV{ASM} names nasm), version >= 2.09
#   - ml64  (Win64 only, flavour masm or $ENV{ASM} names ml64), version >= 10
#   - clang/LLVM reached through $ENV{CC}, version >= 3.0
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
# The major version may have more than one digit (clang 10 and later), and
# the banner may carry a vendor prefix ("Apple clang version ...",
# "Ubuntu clang version ..."), so the match is neither anchored to the
# start of the output nor limited to a single leading digit.
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1`
		=~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ &&
	   $2>=3.0);
112 |
|
---|
$shaext=1;	### set to zero if compiling for 1.0.1

# Emission of the stitched AES-256-CBC decrypt + SHA1 routine is disabled.
$stitched_decrypt=0;

# Everything printed to STDOUT from here on is piped through the perlasm
# translator, which writes the final assembly to $output. Fail loudly if
# the translator cannot be started instead of silently losing the output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
119 |
|
---|
120 | # void aesni_cbc_sha1_enc(const void *inp,
|
---|
121 | # void *out,
|
---|
122 | # size_t length,
|
---|
123 | # const AES_KEY *key,
|
---|
124 | # unsigned char *iv,
|
---|
125 | # SHA_CTX *ctx,
|
---|
126 | # const void *in0);
|
---|
127 |
|
---|
# Public entry point aesni_cbc_sha1_enc: inspects OPENSSL_ia32cap_P and
# tail-jumps to the SHAEXT, AVX or SSSE3 implementation. The SHAEXT and
# AVX checks are only emitted when the corresponding code path is built.
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P

.globl aesni_cbc_sha1_enc
.type aesni_cbc_sha1_enc,\@abi-omnipotent
.align 32
aesni_cbc_sha1_enc:
# caller should check for SSSE3 and AES-NI bits
mov OPENSSL_ia32cap_P+0(%rip),%r10d
mov OPENSSL_ia32cap_P+4(%rip),%r11
___

if ($shaext) {
    $code.=<<___;
bt \$61,%r11 # check SHA bit
jc aesni_cbc_sha1_enc_shaext
___
}

if ($avx) {
    $code.=<<___;
and \$`1<<28`,%r11d # mask AVX bit
and \$`1<<30`,%r10d # mask "Intel CPU" bit
or %r11d,%r10d
cmp \$`1<<28|1<<30`,%r10d
je aesni_cbc_sha1_enc_avx
___
}

# Fall back to the SSSE3 code path.
$code.=<<___;
jmp aesni_cbc_sha1_enc_ssse3
ret
.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___
156 |
|
---|
# Symbolic names for the registers holding the routine's arguments; $inp
# (the 7th argument) is loaded from the stack by the prologue.
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=qw(%rdi %rsi %rdx %rcx %r8 %r9 %r10);

my $Xi=4;					# message schedule position
my @X=map("%xmm$_",(4..7,0..3));		# message schedule window
my @Tx=map("%xmm$_",(8..10));			# SIMD temporaries
($A,$B,$C,$D,$E)=qw(%eax %ebx %ecx %edx %ebp);	# size optimization
my @V=($A,$B,$C,$D,$E);				# SHA1 working variables
my @T=("%esi","%edi");				# scalar temporaries
my ($j,$jj,$r,$sn,$rx)=(0) x 5;			# generator counters
my $K_XX_XX="%r11";				# pointer to the constants table
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));	# for enc
my @rndkey=("%xmm14","%xmm15");			# for enc
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec

if (1) {
    # Reassign for Atom Silvermont. The goal is to minimize amount of
    # instructions with more than 3 prefix bytes, or in more practical
    # terms to keep AES-NI *and* SSSE3 instructions to upper half of the
    # register bank.
    @X=map("%xmm$_",(8..11,4..7));
    @Tx=map("%xmm$_",(12,13,3));
    ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
    @rndkey=("%xmm0","%xmm1");
}
179 |
|
---|
# Thunk for [simplified] 32-bit style perlasm: a call to any undefined
# sub, e.g. &mov(dst,src), appends "\tmov\tsrc,dst\n" to $code. Operands
# are given destination first and emitted in reverse order; a purely
# numeric last operand is emitted as an immediate with a '$' prefix.
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip the package name
    my $last = pop;
    $last = "\$$last" if ($last*1 eq $last);	# numeric -> immediate
    my @operands = ($last, reverse @_);
    $code .= "\t$mnemonic\t".join(',',@operands)."\n";
}

# Code references for the rotates, called from the round templates as
# &$_rol(...) and &$_ror(...).
my ($_rol,$_ror)=(sub { &rol(@_) }, sub { &ror(@_) });
189 |
|
---|
# Prologue of aesni_cbc_sha1_enc_ssse3: fetch the 7th argument, save the
# callee-saved registers and reserve the stack frame.
$code.=<<___;
.type aesni_cbc_sha1_enc_ssse3,\@function,6
.align 32
aesni_cbc_sha1_enc_ssse3:
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
#shr \$6,$len # debugging artefact
#jz .Lepilogue_ssse3 # debugging artefact
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
lea `-104-($win64?10*16:0)`(%rsp),%rsp
#mov $in0,$inp # debugging artefact
#lea 64(%rsp),$ctx # debugging artefact
___

# On Win64 %xmm6-%xmm15 are additionally spilled above the frame.
if ($win64) {
    $code.=<<___;
movaps %xmm6,96+0(%rsp)
movaps %xmm7,96+16(%rsp)
movaps %xmm8,96+32(%rsp)
movaps %xmm9,96+48(%rsp)
movaps %xmm10,96+64(%rsp)
movaps %xmm11,96+80(%rsp)
movaps %xmm12,96+96(%rsp)
movaps %xmm13,96+112(%rsp)
movaps %xmm14,96+128(%rsp)
movaps %xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
}

# Move the first four arguments to %r12-%r15, load the IV and keep the IV
# pointer in the frame.
$code.=<<___;
mov $in0,%r12 # reassign arguments
mov $out,%r13
mov $len,%r14
lea 112($key),%r15 # size optimization
movdqu ($ivp),$iv # load IV
mov $ivp,88(%rsp) # save $ivp
___

# From here on the generator refers to the arguments by their new homes.
# The 32-bit view of the register that held $ivp carries the round count.
($in0,$out,$len,$key)=map("%r$_",(12..15));
my $rounds=$ivp."d";

# Load the SHA1 context, byte-swap the first 64-byte block, stash X[]+K
# on the stack for the integer unit and preload the first two round keys.
$code.=<<___;
shl \$6,$len
sub $in0,$out
mov 240-112($key),$rounds
add $inp,$len # end of input

lea K_XX_XX(%rip),$K_XX_XX
mov 0($ctx),$A # load context
mov 4($ctx),$B
mov 8($ctx),$C
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
mov $C,@T[1]
xor $D,@T[1]
and @T[1],@T[0]

movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
movdqa 0($K_XX_XX),@Tx[1] # K_00_19
movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
movdqu 16($inp),@X[-3&7]
movdqu 32($inp),@X[-2&7]
movdqu 48($inp),@X[-1&7]
pshufb @Tx[2],@X[-4&7] # byte swap
pshufb @Tx[2],@X[-3&7]
pshufb @Tx[2],@X[-2&7]
add \$64,$inp
paddd @Tx[1],@X[-4&7] # add K_00_19
pshufb @Tx[2],@X[-1&7]
paddd @Tx[1],@X[-3&7]
paddd @Tx[1],@X[-2&7]
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
psubd @Tx[1],@X[-4&7] # restore X[]
movdqa @X[-3&7],16(%rsp)
psubd @Tx[1],@X[-3&7]
movdqa @X[-2&7],32(%rsp)
psubd @Tx[1],@X[-2&7]
movups -112($key),$rndkey0 # $key[0]
movups 16-112($key),$rndkey[0] # forward reference
jmp .Loop_ssse3
___
271 |
|
---|
# Emits the next slice of the AES-CBC encryption. Calls are counted in $r
# and grouped in tens: $n = $r/10 selects the 16-byte block and $k = $r%10
# the step within it. Step 0 loads the input block (and stores the
# previous ciphertext block when there is one), steps 1..8 each emit one
# aesenc, and step 9 finishes the block with a run-time branch on $rounds.
# @rndkey is rotated on every call so the two round-key registers
# alternate.
my $aesenc=sub {
    use integer;
    my $n=$r/10;
    my $k=$r%10;

    if ($k==9) {
	$sn++;		# unique suffix for this block's local label
	$code.=<<___;
cmp \$11,$rounds
jb .Laesenclast$sn
movups `32+16*($k+0)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+1)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
je .Laesenclast$sn
movups `32+16*($k+2)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+3)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
.Laesenclast$sn:
aesenclast $rndkey[0],$iv
movups 16-112($key),$rndkey[1] # forward reference
___
    }
    elsif ($k==0) {
	$code.=<<___;
movups `16*$n`($in0),$in # load input
xorps $rndkey0,$in
___
	if ($n) {
	    $code.=<<___;
movups $iv,`16*($n-1)`($out,$in0) # write output
___
	}
	$code.=<<___;
xorps $in,$iv
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
___
    }
    else {
	$code.=<<___;
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
___
    }

    $r++;
    unshift(@rndkey,pop(@rndkey));
};
314 |
|
---|
# Emits one SIMD message-schedule update for the 16..31 range, interleaved
# with the scalar instructions of four rounds. $body is a code reference
# returning the list of Perl snippets for one round; it is called four
# times and the snippets are eval'ed one by one between the SIMD
# instructions, so the order of statements below *is* the instruction
# schedule. Rotates @X and @Tx and advances $Xi on the way out.
sub Xupdate_ssse3_16_31()	# recall that $Xi starts with 4
{
    use integer;
    my $body = shift;
    my @insns;
    push(@insns,&$body()) for (1..4);		# 40 instructions
    my ($a,$b,$c,$d,$e);			# assigned by the eval'ed snippets

    eval(shift(@insns));
    &pshufd	($X[0],$X[-4&7],0xee);		# was &movdqa (@X[0],@X[-3&7]);
    eval(shift(@insns));
    &movdqa	($Tx[0],$X[-1&7]);
    &paddd	($Tx[1],$X[-1&7]);
    eval(shift(@insns)) for (1..2);

    &punpcklqdq	($X[0],$X[-3&7]);		# compose "X[-14]" in "X[0]"
    eval(shift(@insns)) for (1..3);
    &psrldq	($Tx[0],4);			# "X[-3]", 3 dwords
    eval(shift(@insns)) for (1..2);

    &pxor	($X[0],$X[-4&7]);		# "X[0]"^="X[-16]"
    eval(shift(@insns)) for (1..2);
    &pxor	($Tx[0],$X[-2&7]);		# "X[-3]"^"X[-8]"
    eval(shift(@insns)) for (1..3);

    &pxor	($X[0],$Tx[0]);			# "X[0]"^="X[-3]"^"X[-8]"
    eval(shift(@insns)) for (1..2);
    &movdqa	((16*(($Xi-1)&3))."(%rsp)",$Tx[1]);	# X[]+K xfer to IALU
    eval(shift(@insns)) for (1..2);

    &movdqa	($Tx[2],$X[0]);
    eval(shift(@insns)) for (1..3);
    &movdqa	($Tx[0],$X[0]);
    eval(shift(@insns));

    &pslldq	($Tx[2],12);			# "X[0]"<<96, extract one dword
    &paddd	($X[0],$X[0]);
    eval(shift(@insns)) for (1..2);

    &psrld	($Tx[0],31);
    eval(shift(@insns)) for (1..3);
    &movdqa	($Tx[1],$Tx[2]);
    eval(shift(@insns)) for (1..2);

    &psrld	($Tx[2],30);
    eval(shift(@insns)) for (1..2);
    &por	($X[0],$Tx[0]);			# "X[0]"<<<=1
    eval(shift(@insns)) for (1..3);

    &pslld	($Tx[1],2);
    &pxor	($X[0],$Tx[2]);
    eval(shift(@insns));
    &movdqa	($Tx[2],(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
    eval(shift(@insns)) for (1..3);

    &pxor	($X[0],$Tx[1]);			# "X[0]"^=("X[0]">>96)<<<2
    # prepare the operand the first Xupdate_ssse3_32_79 call expects
    &pshufd	($Tx[1],$X[-1&7],0xee) if ($Xi==7);

    eval($_) for (@insns);			# remaining instructions [if any]

    $Xi++;
    push(@X,shift(@X));				# "rotate" X[]
    push(@Tx,shift(@Tx));
}
396 |
|
---|
# Emits one SIMD message-schedule update for the 32..79 range, interleaved
# with the scalar instructions of four rounds obtained from $body. Round
# templates differ in length and in where their rotates sit, hence the
# look-ahead tests on upcoming snippets (/_ror/, /_rol/). The order of
# statements below *is* the instruction schedule. Rotates @X and @Tx and
# advances $Xi on the way out.
sub Xupdate_ssse3_32_79()
{
    use integer;
    my $body = shift;
    my @insns;
    push(@insns,&$body()) for (1..4);		# 32 to 44 instructions
    my ($a,$b,$c,$d,$e);			# assigned by the eval'ed snippets

    eval(shift(@insns))		if ($Xi==8);
    &pxor	($X[0],$X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
    eval(shift(@insns))		if ($Xi==8);
    eval(shift(@insns)) for (1..2);
    eval(shift(@insns))		if ($insns[1] =~ /_ror/);
    eval(shift(@insns))		if ($insns[0] =~ /_ror/);
    &punpcklqdq	($Tx[0],$X[-1&7]);		# compose "X[-6]"
    eval(shift(@insns)) for (1..2);

    &pxor	($X[0],$X[-7&7]);		# "X[0]"^="X[-28]"
    eval(shift(@insns)) for (1..2);
    # "perpetuate" K_XX_XX, or load the next one on every fifth update
    &movdqa	($Tx[2],($Xi%5) ? $Tx[1] : (16*($Xi/5))."($K_XX_XX)");
    eval(shift(@insns));
    &paddd	($Tx[1],$X[-1&7]);
    eval(shift(@insns));

    &pxor	($X[0],$Tx[0]);			# "X[0]"^="X[-6]"
    eval(shift(@insns)) for (1..4);
    eval(shift(@insns))		if ($insns[0] =~ /_ror/);

    &movdqa	($Tx[0],$X[0]);
    eval(shift(@insns)) for (1..2);
    &movdqa	((16*(($Xi-1)&3))."(%rsp)",$Tx[1]);	# X[]+K xfer to IALU
    eval(shift(@insns)) for (1..3);

    &pslld	($X[0],2);
    eval(shift(@insns)) for (1..2);
    &psrld	($Tx[0],30);
    eval(shift(@insns))		if ($insns[0] =~ /_rol/);
    eval(shift(@insns)) for (1..3);

    &por	($X[0],$Tx[0]);			# "X[0]"<<<=2
    eval(shift(@insns)) for (1..2);
    eval(shift(@insns))		if ($insns[1] =~ /_rol/);
    eval(shift(@insns))		if ($insns[0] =~ /_rol/);
    &pshufd	($Tx[1],$X[-1&7],0xee)	if ($Xi<19);	# was &movdqa (@Tx[1],@X[0])
    eval(shift(@insns)) for (1..6);

    eval($_) for (@insns);			# remaining instructions

    $Xi++;
    push(@X,shift(@X));				# "rotate" X[]
    push(@Tx,shift(@Tx));
}
468 |
|
---|
# Emits the last X[]+K transfer of the current block together with four
# rounds from $body, then compares $inp with $len and jumps to the label
# passed as second argument when they are equal. Otherwise it starts on
# the next block: reloads the byte-swap mask and K_00_19, loads 64 bytes
# of input, byte-swaps the first vector and resets $Xi to 0.
sub Xuplast_ssse3_80()
{
    use integer;
    my ($body,$done_label) = @_;
    my @insns;
    push(@insns,&$body()) for (1..4);		# 32 instructions
    my ($a,$b,$c,$d,$e);			# assigned by the eval'ed snippets

    eval(shift(@insns)) for (1..4);
    &paddd	($Tx[1],$X[-1&7]);
    eval(shift(@insns)) for (1..2);

    &movdqa	((16*(($Xi-1)&3))."(%rsp)",$Tx[1]);	# X[]+K xfer IALU

    eval($_) for (@insns);			# remaining instructions

    &cmp	($inp,$len);
    &je		($done_label);

    unshift(@Tx,pop(@Tx));

    &movdqa	($Tx[2],"64($K_XX_XX)");	# pbswap mask
    &movdqa	($Tx[1],"0($K_XX_XX)");		# K_00_19
    &movdqu	($X[-4&7],"0($inp)");		# load input
    &movdqu	($X[-3&7],"16($inp)");
    &movdqu	($X[-2&7],"32($inp)");
    &movdqu	($X[-1&7],"48($inp)");
    &pshufb	($X[-4&7],$Tx[2]);		# byte swap
    &add	($inp,64);

    $Xi=0;
}
503 |
|
---|
# Emits four rounds from $body while preparing one vector of the next
# input block: byte-swap the following vector, add K to the current one,
# store X[]+K on the stack for the integer unit and subtract K again.
# Advances $Xi by one.
sub Xloop_ssse3()
{
    use integer;
    my $body = shift;
    my @insns;
    push(@insns,&$body()) for (1..4);		# 32 instructions
    my ($a,$b,$c,$d,$e);			# assigned by the eval'ed snippets
    my $cur = ($Xi-4)&7;			# vector handled by this call

    eval(shift(@insns)) for (1..3);
    &pshufb	($X[($Xi-3)&7],$Tx[2]);
    eval(shift(@insns)) for (1..4);
    &paddd	($X[$cur],$Tx[1]);
    eval(shift(@insns)) for (1..4);
    &movdqa	((16*$Xi)."(%rsp)",$X[$cur]);	# X[]+K xfer to IALU
    eval(shift(@insns)) for (1..4);
    &psubd	($X[$cur],$Tx[1]);

    eval($_) for (@insns);
    $Xi++;
}
533 |
|
---|
# Emits four rounds from $body back to back, with no SIMD work in between.
sub Xtail_ssse3()
{
    use integer;
    my $body = shift;
    my @insns;
    push(@insns,&$body()) for (1..4);		# 32 instructions
    my ($a,$b,$c,$d,$e);			# assigned by the eval'ed snippets

    eval($_) for (@insns);
}
542 |
|
---|
# Round template for rounds 0..19. Each element is a Perl snippet that the
# Xupdate_*/Xloop/Xtail schedulers eval one at a time; the last snippet
# also advances $j and rotates @V and @T for the next round.
my @body_00_19 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror ($b,$j?7:2);',			# $b>>>2
	'&xor (@T[0],$d);',
	'&mov (@T[1],$a);',			# $b for next round

	'&add ($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor ($b,$c);',			# $c^$d for next round

	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&and (@T[1],$b);',			# ($b&($c^$d)) for next round

	'&xor ($b,$c);',			# restore $b
	'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

# Returns the snippets for the next round of the 0..19 range, with a call
# to $aesenc appended to one of them whenever this round is due to carry
# an AES step. $rx counts rounds handed out so far; once it reaches 19 the
# request is forwarded to body_20_39.
sub body_00_19 () {	# ((c^d)&b)^d
    use integer;
    # on start @T[0]=(c^d)&b
    return &body_20_39() if ($rx==19);
    $rx++;

    my @round = @body_00_19;
    my $count = scalar(@round);

    # 12 aesencs per these 20 rounds
    my $slot = (($jj+1)*12/20)*20*$count/12;
    if ($jj==$slot/$count) {
	$round[$slot%$count] .= '&$aesenc();';
    }
    $jj++;

    return @round;
}
575 |
|
---|
# Round template for rounds 20..39 (it is also used for rounds 60..79).
# Each element is a Perl snippet that the schedulers eval one at a time;
# the last snippet also advances $j and rotates @V and @T.
my @body_20_39 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor (@T[0],$d) if($j==19);'.
	'&xor (@T[0],$c) if($j> 19);',		# ($b^$d^$c)
	'&mov (@T[1],$a);',			# $b for next round

	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&xor (@T[1],$c) if ($j< 79);',		# $b^$d for next round

	'&$_ror ($b,7);',			# $b>>>2
	'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

# Returns the snippets for the next round of this range, with a call to
# $aesenc appended to one of them whenever this round is due to carry an
# AES step (never on the round that makes $rx equal to 20). Once $rx
# reaches 39 the request is forwarded to body_40_59.
sub body_20_39 () {	# b^d^c
    use integer;
    # on entry @T[0]=b^d
    return &body_40_59() if ($rx==39);
    $rx++;

    my @round = @body_20_39;
    my $count = scalar(@round);

    # 8 aesencs per these 20 rounds
    my $slot = (($jj+1)*8/20)*20*$count/8;
    if ($jj==$slot/$count && $rx!=20) {
	$round[$slot%$count] .= '&$aesenc();';
    }
    $jj++;

    return @round;
}
606 |
|
---|
# Round template for rounds 40..59. Each element is a Perl snippet that
# the schedulers eval one at a time; the last snippet also advances $j and
# rotates @V and @T.
my @body_40_59 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&and (@T[0],$c) if ($j>=40);',		# (b^c)&(c^d)
	'&xor ($c,$d) if ($j>=40);',		# restore $c

	'&$_ror ($b,7);',			# $b>>>2
	'&mov (@T[1],$a);',			# $b for next round
	'&xor (@T[0],$c);',

	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&xor (@T[1],$c) if ($j==59);'.
	'&xor (@T[1],$b) if ($j< 59);',		# b^c for next round

	'&xor ($b,$c) if ($j< 59);',		# c^d for next round
	'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

# Returns the snippets for the next round of the 40..59 range, with a call
# to $aesenc appended to one of them whenever this round is due to carry
# an AES step (never on the round that makes $rx equal to 40).
sub body_40_59 () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)
    $rx++;

    use integer;
    my @round = @body_40_59;
    my $count = scalar(@round);

    # 12 aesencs per these 20 rounds
    my $slot = (($jj+1)*12/20)*20*$count/12;
    if ($jj==$slot/$count && $rx!=40) {
	$round[$slot%$count] .= '&$aesenc();';
    }
    $jj++;

    return @round;
}
# Body of the SSSE3 code path: one pass over the 80 SHA1 rounds for a
# 64-byte block, with the AES-CBC steps woven in by the round templates.
$code.=<<___;
.align 32
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19)	for (1..4);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39)	for (1..5);
	&Xupdate_ssse3_32_79(\&body_40_59)	for (1..5);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3");	# can jump to "done"

	# The remaining rounds are generated twice, once for the "more input"
	# path and once for the "done" path, so the generator state is
	# snapshotted here and restored before the second copy.
	($saved_j,$saved_r)=($j,$r);
	@saved_V=@V;
	@saved_rndkey=@rndkey;

	&Xloop_ssse3(\&body_20_39)		for (1..3);

# More input follows: store the last ciphertext block, fold the working
# variables into the context and go round again.
$code.=<<___;
movups $iv,48($out,$in0) # write output
lea 64($in0),$in0

add 0($ctx),$A # update context
add 4($ctx),@T[0]
add 8($ctx),$C
add 12($ctx),$D
mov $A,0($ctx)
add 16($ctx),$E
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
mov $C,@T[1]
mov $D,12($ctx)
xor $D,@T[1]
mov $E,16($ctx)
and @T[1],@T[0]
jmp .Loop_ssse3

.Ldone_ssse3:
___
	# Second copy of the remaining rounds, reached when input is exhausted.
	$jj=$j=$saved_j;
	$r=$saved_r;
	@V=@saved_V;
	@rndkey=@saved_rndkey;

	&Xtail_ssse3(\&body_20_39)		for (1..3);
698 |
|
---|
# Epilogue of aesni_cbc_sha1_enc_ssse3: store the last ciphertext block,
# fold the working variables into the SHA1 context and write the IV back
# through the pointer saved by the prologue.
$code.=<<___;
movups $iv,48($out,$in0) # write output
mov 88(%rsp),$ivp # restore $ivp

add 0($ctx),$A # update context
add 4($ctx),@T[0]
add 8($ctx),$C
mov $A,0($ctx)
add 12($ctx),$D
mov @T[0],4($ctx)
add 16($ctx),$E
mov $C,8($ctx)
mov $D,12($ctx)
mov $E,16($ctx)
movups $iv,($ivp) # write IV
___

# On Win64 reload %xmm6-%xmm15 from where the prologue spilled them.
if ($win64) {
    $code.=<<___;
movaps 96+0(%rsp),%xmm6
movaps 96+16(%rsp),%xmm7
movaps 96+32(%rsp),%xmm8
movaps 96+48(%rsp),%xmm9
movaps 96+64(%rsp),%xmm10
movaps 96+80(%rsp),%xmm11
movaps 96+96(%rsp),%xmm12
movaps 96+112(%rsp),%xmm13
movaps 96+128(%rsp),%xmm14
movaps 96+144(%rsp),%xmm15
___
}

# Restore the callee-saved registers, release the frame and return.
$code.=<<___;
lea `104+($win64?10*16:0)`(%rsp),%rsi
mov 0(%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lepilogue_ssse3:
ret
.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___
740 |
|
---|
741 | if ($stitched_decrypt) {{{
|
---|
742 | # reset
|
---|
743 | ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
|
---|
744 | $j=$jj=$r=$rx=0;
|
---|
745 | $Xi=4;
|
---|
746 |
|
---|
747 | # reassign for Atom Silvermont (see above)
|
---|
748 | ($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
|
---|
749 | @X=map("%xmm$_",(8..13,6,7));
|
---|
750 | @Tx=map("%xmm$_",(14,15,5));
|
---|
751 |
|
---|
752 | my @aes256_dec = (
|
---|
753 | '&movdqu($inout0,"0x00($in0)");',
|
---|
754 | '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);',
|
---|
755 | '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);',
|
---|
756 | '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);',
|
---|
757 |
|
---|
758 | '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");',
|
---|
759 | '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3]
|
---|
760 | undef,undef
|
---|
761 | );
|
---|
762 | for ($i=0;$i<13;$i++) {
|
---|
763 | push (@aes256_dec,(
|
---|
764 | '&aesdec ($inout0,$rndkey0);',
|
---|
765 | '&aesdec ($inout1,$rndkey0);',
|
---|
766 | '&aesdec ($inout2,$rndkey0);',
|
---|
767 | '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
|
---|
768 | ));
|
---|
769 | push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
|
---|
770 | push (@aes256_dec,(undef,undef)) if ($i==5);
|
---|
771 | }
|
---|
772 | push(@aes256_dec,(
|
---|
773 | '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");',
|
---|
774 | '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");',
|
---|
775 | '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");',
|
---|
776 | '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");',
|
---|
777 |
|
---|
778 | '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");',
|
---|
779 | '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);',
|
---|
780 | '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);',
|
---|
781 | '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);',
|
---|
782 |
|
---|
783 | '&movups ("0x30($out,$in0)",$inout3);'
|
---|
784 | ));
|
---|
785 |
|
---|
# Decrypt flavour of the 0..19 round template: returns a copy of
# @body_00_19 with the AES snippet scheduled for round $rx (if any) put in
# front, and advances $rx. Once $rx reaches 19 the request is forwarded to
# body_20_39_dec.
sub body_00_19_dec () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39_dec() if ($rx==19);

    my @round = @body_00_19;
    my $aes_step = $aes256_dec[$rx];

    unshift(@round,$aes_step) if ($aes_step);
    $rx++;

    return @round;
}
797 |
|
---|
# Decrypt flavour of the 20..39 round template: returns a copy of
# @body_20_39 with the AES snippet scheduled for round $rx (if any) put in
# front, and advances $rx. Once $rx reaches 39 the request is forwarded to
# body_40_59_dec.
sub body_20_39_dec () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59_dec() if ($rx==39);

    my @round = @body_20_39;
    my $aes_step = $aes256_dec[$rx];

    unshift(@round,$aes_step) if ($aes_step);
    $rx++;

    return @round;
}
809 |
|
---|
# Decrypt flavour of the 40..59 round template: returns a copy of
# @body_40_59 with the AES snippet scheduled for round $rx (if any) put in
# front, and advances $rx.
sub body_40_59_dec () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)
    my @round = @body_40_59;
    my $aes_step = $aes256_dec[$rx];

    unshift(@round,$aes_step) if ($aes_step);
    $rx++;

    return @round;
}
820 |
|
---|
# Public entry point aesni256_cbc_sha1_dec: load the first two 32-bit words
# of OPENSSL_ia32cap_P into %r10d/%r11d so an implementation can be chosen.
821 | $code.=<<___;
|
---|
822 | .globl aesni256_cbc_sha1_dec
|
---|
823 | .type aesni256_cbc_sha1_dec,\@abi-omnipotent
|
---|
824 | .align 32
|
---|
825 | aesni256_cbc_sha1_dec:
|
---|
826 | # caller should check for SSSE3 and AES-NI bits
|
---|
827 | mov OPENSSL_ia32cap_P+0(%rip),%r10d
|
---|
828 | mov OPENSSL_ia32cap_P+4(%rip),%r11d
|
---|
829 | ___
|
---|
# Emitted only when $avx is set: keep bit 28 of word 1 and bit 30 of word 0
# and branch to the AVX routine when both are set.
830 | $code.=<<___ if ($avx);
|
---|
831 | and \$`1<<28`,%r11d # mask AVX bit
|
---|
832 | and \$`1<<30`,%r10d # mask "Intel CPU" bit
|
---|
833 | or %r11d,%r10d
|
---|
834 | cmp \$`1<<28|1<<30`,%r10d
|
---|
835 | je aesni256_cbc_sha1_dec_avx
|
---|
836 | ___
|
---|
# Otherwise jump to the SSSE3 routine.  The same heredoc then opens
# aesni256_cbc_sha1_dec_ssse3: fetch the 7th argument from the stack into
# $inp, push rbx/rbp/r12-r15 and reserve a 104-byte frame (plus 10*16 bytes
# on Win64).
837 | $code.=<<___;
|
---|
838 | jmp aesni256_cbc_sha1_dec_ssse3
|
---|
839 | ret
|
---|
840 | .size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec
|
---|
841 |
|
---|
842 | .type aesni256_cbc_sha1_dec_ssse3,\@function,6
|
---|
843 | .align 32
|
---|
844 | aesni256_cbc_sha1_dec_ssse3:
|
---|
845 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument
|
---|
846 | push %rbx
|
---|
847 | push %rbp
|
---|
848 | push %r12
|
---|
849 | push %r13
|
---|
850 | push %r14
|
---|
851 | push %r15
|
---|
852 | lea `-104-($win64?10*16:0)`(%rsp),%rsp
|
---|
853 | ___
|
---|
# Win64 only: spill %xmm6-%xmm15 into the frame from 96(%rsp) upwards and
# emit the .Lprologue_dec_ssse3 label.
854 | $code.=<<___ if ($win64);
|
---|
855 | movaps %xmm6,96+0(%rsp)
|
---|
856 | movaps %xmm7,96+16(%rsp)
|
---|
857 | movaps %xmm8,96+32(%rsp)
|
---|
858 | movaps %xmm9,96+48(%rsp)
|
---|
859 | movaps %xmm10,96+64(%rsp)
|
---|
860 | movaps %xmm11,96+80(%rsp)
|
---|
861 | movaps %xmm12,96+96(%rsp)
|
---|
862 | movaps %xmm13,96+112(%rsp)
|
---|
863 | movaps %xmm14,96+128(%rsp)
|
---|
864 | movaps %xmm15,96+144(%rsp)
|
---|
865 | .Lprologue_dec_ssse3:
|
---|
866 | ___
|
---|
# Copy in0/out/len into %r12-%r14, point %r15 at key+112 and load the IV
# into @X[3].
867 | $code.=<<___;
|
---|
868 | mov $in0,%r12 # reassign arguments
|
---|
869 | mov $out,%r13
|
---|
870 | mov $len,%r14
|
---|
871 | lea 112($key),%r15 # size optimization
|
---|
872 | movdqu ($ivp),@X[3] # load IV
|
---|
873 | #mov $ivp,88(%rsp) # save $ivp
|
---|
874 | ___
|
---|
# From here on the Perl-side names refer to the relocated registers.
875 | ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
|
---|
# Turn $len (count of 64-byte blocks) into an end-of-input pointer and $out
# into an offset relative to $in0; load the SHA1 state into $A..$E, seed
# @T[0] with (c^d)&b, byte-swap the first 64-byte block, store X[]+K for the
# first three vectors on the stack, load round key 0 and enter the loop.
876 | $code.=<<___;
|
---|
877 | shl \$6,$len
|
---|
878 | sub $in0,$out
|
---|
879 | add $inp,$len # end of input
|
---|
880 |
|
---|
881 | lea K_XX_XX(%rip),$K_XX_XX
|
---|
882 | mov 0($ctx),$A # load context
|
---|
883 | mov 4($ctx),$B
|
---|
884 | mov 8($ctx),$C
|
---|
885 | mov 12($ctx),$D
|
---|
886 | mov $B,@T[0] # magic seed
|
---|
887 | mov 16($ctx),$E
|
---|
888 | mov $C,@T[1]
|
---|
889 | xor $D,@T[1]
|
---|
890 | and @T[1],@T[0]
|
---|
891 |
|
---|
892 | movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
|
---|
893 | movdqa 0($K_XX_XX),@Tx[1] # K_00_19
|
---|
894 | movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
|
---|
895 | movdqu 16($inp),@X[-3&7]
|
---|
896 | movdqu 32($inp),@X[-2&7]
|
---|
897 | movdqu 48($inp),@X[-1&7]
|
---|
898 | pshufb @Tx[2],@X[-4&7] # byte swap
|
---|
899 | add \$64,$inp
|
---|
900 | pshufb @Tx[2],@X[-3&7]
|
---|
901 | pshufb @Tx[2],@X[-2&7]
|
---|
902 | pshufb @Tx[2],@X[-1&7]
|
---|
903 | paddd @Tx[1],@X[-4&7] # add K_00_19
|
---|
904 | paddd @Tx[1],@X[-3&7]
|
---|
905 | paddd @Tx[1],@X[-2&7]
|
---|
906 | movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
|
---|
907 | psubd @Tx[1],@X[-4&7] # restore X[]
|
---|
908 | movdqa @X[-3&7],16(%rsp)
|
---|
909 | psubd @Tx[1],@X[-3&7]
|
---|
910 | movdqa @X[-2&7],32(%rsp)
|
---|
911 | psubd @Tx[1],@X[-2&7]
|
---|
912 | movdqu -112($key),$rndkey0 # $key[0]
|
---|
913 | jmp .Loop_dec_ssse3
|
---|
914 |
|
---|
915 | .align 32
|
---|
916 | .Loop_dec_ssse3:
|
---|
917 | ___
|
---|
# Round schedule for one 64-byte block.  The Xupdate_ssse3_* helpers are
# defined outside this chunk; presumably each call consumes four rounds of
# snippets from its callback, as the AVX counterparts do (TODO confirm).
# The *_dec callbacks switch round function themselves when $rx reaches 19
# and 39, which is why body_00_19_dec is still passed on the fifth call.
918 | &Xupdate_ssse3_16_31(\&body_00_19_dec);
|
---|
919 | &Xupdate_ssse3_16_31(\&body_00_19_dec);
|
---|
920 | &Xupdate_ssse3_16_31(\&body_00_19_dec);
|
---|
921 | &Xupdate_ssse3_16_31(\&body_00_19_dec);
|
---|
922 | &Xupdate_ssse3_32_79(\&body_00_19_dec);
|
---|
923 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
924 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
925 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
926 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
927 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
928 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
929 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
930 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
931 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
932 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
933 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
934 | &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"
|
---|
935 |
|
---|
# Snapshot the generator state.  The code emitted after .Ldone_dec_ssse3
# covers the same rounds as the three Xloop calls below, so $j, @V and $rx
# are rewound to these values before that tail is generated.
936 | $saved_j=$j; @saved_V=@V;
|
---|
937 | $saved_rx=$rx;
|
---|
938 |
|
---|
# Remaining rounds on the "more input follows" path (Xloop_ssse3 is defined
# outside this chunk).
939 | &Xloop_ssse3(\&body_20_39_dec);
|
---|
940 | &Xloop_ssse3(\&body_20_39_dec);
|
---|
941 | &Xloop_ssse3(\&body_20_39_dec);
|
---|
942 |
|
---|
943 | eval(@aes256_dec[-1]); # last store
|
---|
# Advance $in0 by one block, add the working variables into the SHA1 context
# at $ctx, re-seed @T[0]/@T[1] for the next block and loop.
944 | $code.=<<___;
|
---|
945 | lea 64($in0),$in0
|
---|
946 |
|
---|
947 | add 0($ctx),$A # update context
|
---|
948 | add 4($ctx),@T[0]
|
---|
949 | add 8($ctx),$C
|
---|
950 | add 12($ctx),$D
|
---|
951 | mov $A,0($ctx)
|
---|
952 | add 16($ctx),$E
|
---|
953 | mov @T[0],4($ctx)
|
---|
954 | mov @T[0],$B # magic seed
|
---|
955 | mov $C,8($ctx)
|
---|
956 | mov $C,@T[1]
|
---|
957 | mov $D,12($ctx)
|
---|
958 | xor $D,@T[1]
|
---|
959 | mov $E,16($ctx)
|
---|
960 | and @T[1],@T[0]
|
---|
961 | jmp .Loop_dec_ssse3
|
---|
962 |
|
---|
963 | .Ldone_dec_ssse3:
|
---|
964 | ___
|
---|
# Rewind the generator to the snapshot taken before the Xloop calls: the
# "done" path emits the same rounds again via Xtail_ssse3 (defined outside
# this chunk).
965 | $jj=$j=$saved_j; @V=@saved_V;
|
---|
966 | $rx=$saved_rx;
|
---|
967 |
|
---|
968 | &Xtail_ssse3(\&body_20_39_dec);
|
---|
969 | &Xtail_ssse3(\&body_20_39_dec);
|
---|
970 | &Xtail_ssse3(\&body_20_39_dec);
|
---|
971 |
|
---|
972 | eval(@aes256_dec[-1]); # last store
|
---|
# Final context update and write-back of the chaining value in @X[3] to ($ivp).
973 | $code.=<<___;
|
---|
974 | add 0($ctx),$A # update context
|
---|
975 | add 4($ctx),@T[0]
|
---|
976 | add 8($ctx),$C
|
---|
977 | mov $A,0($ctx)
|
---|
978 | add 12($ctx),$D
|
---|
979 | mov @T[0],4($ctx)
|
---|
980 | add 16($ctx),$E
|
---|
981 | mov $C,8($ctx)
|
---|
982 | mov $D,12($ctx)
|
---|
983 | mov $E,16($ctx)
|
---|
984 | movups @X[3],($ivp) # write IV
|
---|
985 | ___
|
---|
# Win64 only: reload %xmm6-%xmm15 from the slots filled by the prologue.
986 | $code.=<<___ if ($win64);
|
---|
987 | movaps 96+0(%rsp),%xmm6
|
---|
988 | movaps 96+16(%rsp),%xmm7
|
---|
989 | movaps 96+32(%rsp),%xmm8
|
---|
990 | movaps 96+48(%rsp),%xmm9
|
---|
991 | movaps 96+64(%rsp),%xmm10
|
---|
992 | movaps 96+80(%rsp),%xmm11
|
---|
993 | movaps 96+96(%rsp),%xmm12
|
---|
994 | movaps 96+112(%rsp),%xmm13
|
---|
995 | movaps 96+128(%rsp),%xmm14
|
---|
996 | movaps 96+144(%rsp),%xmm15
|
---|
997 | ___
|
---|
# Epilogue: %rsi is pointed at the six saved GPRs above the frame; they are
# reloaded in reverse push order and %rsp is restored before returning.
998 | $code.=<<___;
|
---|
999 | lea `104+($win64?10*16:0)`(%rsp),%rsi
|
---|
1000 | mov 0(%rsi),%r15
|
---|
1001 | mov 8(%rsi),%r14
|
---|
1002 | mov 16(%rsi),%r13
|
---|
1003 | mov 24(%rsi),%r12
|
---|
1004 | mov 32(%rsi),%rbp
|
---|
1005 | mov 40(%rsi),%rbx
|
---|
1006 | lea 48(%rsi),%rsp
|
---|
1007 | .Lepilogue_dec_ssse3:
|
---|
1008 | ret
|
---|
1009 | .size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
|
---|
1010 | ___
|
---|
1011 | }}}
|
---|
1012 | $j=$jj=$r=$rx=0;
|
---|
1013 |
|
---|
1014 | if ($avx) {
|
---|
# Register names for the AVX code path.  The first six are the incoming
# arguments; $inp names %r10, which the prologue fills with the 7th argument
# read from the stack.
1015 | my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
|
---|
1016 |
|
---|
# $Xi indexes the message schedule; the Xupdate/Xloop helpers derive stack
# slots and the K_XX_XX row from it.
1017 | my $Xi=4;
|
---|
# Eight-entry window of message-schedule vectors, rotated by the Xupdate helpers.
1018 | my @X=map("%xmm$_",(4..7,0..3));
|
---|
# Three scratch vectors; $Kx below aliases the last one.
1019 | my @Tx=map("%xmm$_",(8..10));
|
---|
1020 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
|
---|
1021 | my @T=("%esi","%edi");
|
---|
1022 | my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
|
---|
# Pair of round-key registers; $aesenc swaps the two names after every round.
1023 | my @rndkey=("%xmm14","%xmm15");
|
---|
1024 | my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
|
---|
# One-element slice assigned to a scalar: $Kx is "%xmm10".
1025 | my $Kx=@Tx[2];
|
---|
1026 |
|
---|
# Rotates emitted as double shifts: the register is passed to shld/shrd as
# both the first and the second operand, followed by the count.
1027 | my $_rol=sub { &shld(@_[0],@_) };
|
---|
1028 | my $_ror=sub { &shrd(@_[0],@_) };
|
---|
1029 |
|
---|
# Open aesni_cbc_sha1_enc_avx: fetch the 7th argument into $inp, push
# rbx/rbp/r12-r15 and reserve a 104-byte frame (plus 10*16 bytes on Win64).
1030 | $code.=<<___;
|
---|
1031 | .type aesni_cbc_sha1_enc_avx,\@function,6
|
---|
1032 | .align 32
|
---|
1033 | aesni_cbc_sha1_enc_avx:
|
---|
1034 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument
|
---|
1035 | #shr \$6,$len # debugging artefact
|
---|
1036 | #jz .Lepilogue_avx # debugging artefact
|
---|
1037 | push %rbx
|
---|
1038 | push %rbp
|
---|
1039 | push %r12
|
---|
1040 | push %r13
|
---|
1041 | push %r14
|
---|
1042 | push %r15
|
---|
1043 | lea `-104-($win64?10*16:0)`(%rsp),%rsp
|
---|
1044 | #mov $in0,$inp # debugging artefact
|
---|
1045 | #lea 64(%rsp),$ctx # debugging artefact
|
---|
1046 | ___
|
---|
# Win64 only: spill %xmm6-%xmm15 and emit the .Lprologue_avx label.
1047 | $code.=<<___ if ($win64);
|
---|
1048 | movaps %xmm6,96+0(%rsp)
|
---|
1049 | movaps %xmm7,96+16(%rsp)
|
---|
1050 | movaps %xmm8,96+32(%rsp)
|
---|
1051 | movaps %xmm9,96+48(%rsp)
|
---|
1052 | movaps %xmm10,96+64(%rsp)
|
---|
1053 | movaps %xmm11,96+80(%rsp)
|
---|
1054 | movaps %xmm12,96+96(%rsp)
|
---|
1055 | movaps %xmm13,96+112(%rsp)
|
---|
1056 | movaps %xmm14,96+128(%rsp)
|
---|
1057 | movaps %xmm15,96+144(%rsp)
|
---|
1058 | .Lprologue_avx:
|
---|
1059 | ___
|
---|
# Relocate in0/out/len to %r12-%r14, point %r15 at key+112, load the IV and
# park $ivp at 88(%rsp); its register is reused for $rounds below.
1060 | $code.=<<___;
|
---|
1061 | vzeroall
|
---|
1062 | mov $in0,%r12 # reassign arguments
|
---|
1063 | mov $out,%r13
|
---|
1064 | mov $len,%r14
|
---|
1065 | lea 112($key),%r15 # size optimization
|
---|
1066 | vmovdqu ($ivp),$iv # load IV
|
---|
1067 | mov $ivp,88(%rsp) # save $ivp
|
---|
1068 | ___
|
---|
1069 | ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
|
---|
# $rounds is the 32-bit form of the register that held $ivp.
1070 | my $rounds="${ivp}d";
|
---|
# Turn $len into an end-of-input pointer and $out into an offset from $in0,
# read the round count from offset 240 of the key schedule, load the SHA1
# state, byte-swap the first block, stage X[]+K on the stack and preload the
# first two round keys.
1071 | $code.=<<___;
|
---|
1072 | shl \$6,$len
|
---|
1073 | sub $in0,$out
|
---|
1074 | mov 240-112($key),$rounds
|
---|
1075 | add $inp,$len # end of input
|
---|
1076 |
|
---|
1077 | lea K_XX_XX(%rip),$K_XX_XX
|
---|
1078 | mov 0($ctx),$A # load context
|
---|
1079 | mov 4($ctx),$B
|
---|
1080 | mov 8($ctx),$C
|
---|
1081 | mov 12($ctx),$D
|
---|
1082 | mov $B,@T[0] # magic seed
|
---|
1083 | mov 16($ctx),$E
|
---|
1084 | mov $C,@T[1]
|
---|
1085 | xor $D,@T[1]
|
---|
1086 | and @T[1],@T[0]
|
---|
1087 |
|
---|
1088 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
|
---|
1089 | vmovdqa 0($K_XX_XX),$Kx # K_00_19
|
---|
1090 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
|
---|
1091 | vmovdqu 16($inp),@X[-3&7]
|
---|
1092 | vmovdqu 32($inp),@X[-2&7]
|
---|
1093 | vmovdqu 48($inp),@X[-1&7]
|
---|
1094 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
|
---|
1095 | add \$64,$inp
|
---|
1096 | vpshufb @X[2],@X[-3&7],@X[-3&7]
|
---|
1097 | vpshufb @X[2],@X[-2&7],@X[-2&7]
|
---|
1098 | vpshufb @X[2],@X[-1&7],@X[-1&7]
|
---|
1099 | vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
|
---|
1100 | vpaddd $Kx,@X[-3&7],@X[1]
|
---|
1101 | vpaddd $Kx,@X[-2&7],@X[2]
|
---|
1102 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
|
---|
1103 | vmovdqa @X[1],16(%rsp)
|
---|
1104 | vmovdqa @X[2],32(%rsp)
|
---|
1105 | vmovups -112($key),$rndkey[1] # $key[0]
|
---|
1106 | vmovups 16-112($key),$rndkey[0] # forward reference
|
---|
1107 | jmp .Loop_avx
|
---|
1108 | ___
|
---|
1109 |
|
---|
# Emit one step of AVX AES-CBC encryption per call.  The call counter $r is
# split into $n (index of the 16-byte block, $r/10) and $k (step within that
# block, $r%10).  Every call ends by swapping the two @rndkey names so that
# $rndkey[0] always names the key the next vaesenc needs.
1110 | my $aesenc=sub {
|
---|
1111 | use integer;
|
---|
1112 | my ($n,$k)=($r/10,$r%10);
|
---|
# Step 0: load input block $n, XOR it with round key 0 and the running $iv,
# store the previous ciphertext block (when $n>0) and run the first round.
1113 | if ($k==0) {
|
---|
1114 | $code.=<<___;
|
---|
1115 | vmovdqu `16*$n`($in0),$in # load input
|
---|
1116 | vpxor $rndkey[1],$in,$in
|
---|
1117 | ___
|
---|
1118 | $code.=<<___ if ($n);
|
---|
1119 | vmovups $iv,`16*($n-1)`($out,$in0) # write output
|
---|
1120 | ___
|
---|
1121 | $code.=<<___;
|
---|
1122 | vpxor $in,$iv,$iv
|
---|
1123 | vaesenc $rndkey[0],$iv,$iv
|
---|
1124 | vmovups `32+16*$k-112`($key),$rndkey[1]
|
---|
1125 | ___
|
---|
# Step 9: finish the block.  $rounds is compared with 11 once: below 11 goes
# straight to the last round, equal to 11 runs two extra rounds, otherwise
# four extra rounds run.  $sn makes the local label unique.  Afterwards round
# keys 0 and 1 are reloaded for the next block.
1126 | } elsif ($k==9) {
|
---|
1127 | $sn++;
|
---|
1128 | $code.=<<___;
|
---|
1129 | cmp \$11,$rounds
|
---|
1130 | jb .Lvaesenclast$sn
|
---|
1131 | vaesenc $rndkey[0],$iv,$iv
|
---|
1132 | vmovups `32+16*($k+0)-112`($key),$rndkey[1]
|
---|
1133 | vaesenc $rndkey[1],$iv,$iv
|
---|
1134 | vmovups `32+16*($k+1)-112`($key),$rndkey[0]
|
---|
1135 | je .Lvaesenclast$sn
|
---|
1136 | vaesenc $rndkey[0],$iv,$iv
|
---|
1137 | vmovups `32+16*($k+2)-112`($key),$rndkey[1]
|
---|
1138 | vaesenc $rndkey[1],$iv,$iv
|
---|
1139 | vmovups `32+16*($k+3)-112`($key),$rndkey[0]
|
---|
1140 | .Lvaesenclast$sn:
|
---|
1141 | vaesenclast $rndkey[0],$iv,$iv
|
---|
1142 | vmovups -112($key),$rndkey[0]
|
---|
1143 | vmovups 16-112($key),$rndkey[1] # forward reference
|
---|
1144 | ___
|
---|
# Steps 1..8: one round plus the load of the key for the following round.
1145 | } else {
|
---|
1146 | $code.=<<___;
|
---|
1147 | vaesenc $rndkey[0],$iv,$iv
|
---|
1148 | vmovups `32+16*$k-112`($key),$rndkey[1]
|
---|
1149 | ___
|
---|
1150 | }
|
---|
1151 | $r++; unshift(@rndkey,pop(@rndkey));
|
---|
1152 | };
|
---|
1153 |
|
---|
# Emit the AVX message-schedule update for one vector of X[16..31] (per the
# name), interleaved with four rounds of scalar snippets.
# Argument: a code reference returning one round's list of snippet strings.
# The snippets are string-eval'ed one or more at a time between the vector
# instructions, so the statement order below is the instruction schedule.
1154 | sub Xupdate_avx_16_31() # recall that $Xi starts with 4
|
---|
1155 | { use integer;
|
---|
1156 | my $body = shift;
|
---|
1157 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
|
---|
# NOTE(review): presumably assigned by the eval'ed snippets, which are
# defined outside this chunk - confirm before renaming.
1158 | my ($a,$b,$c,$d,$e);
|
---|
1159 |
|
---|
1160 | eval(shift(@insns));
|
---|
1161 | eval(shift(@insns));
|
---|
1162 | &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
|
---|
1163 | eval(shift(@insns));
|
---|
1164 | eval(shift(@insns));
|
---|
1165 |
|
---|
1166 | &vpaddd (@Tx[1],$Kx,@X[-1&7]);
|
---|
1167 | eval(shift(@insns));
|
---|
1168 | eval(shift(@insns));
|
---|
1169 | &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
|
---|
1170 | eval(shift(@insns));
|
---|
1171 | eval(shift(@insns));
|
---|
1172 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
|
---|
1173 | eval(shift(@insns));
|
---|
1174 | eval(shift(@insns));
|
---|
1175 |
|
---|
1176 | &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
|
---|
1177 | eval(shift(@insns));
|
---|
1178 | eval(shift(@insns));
|
---|
1179 | eval(shift(@insns));
|
---|
1180 | eval(shift(@insns));
|
---|
1181 |
|
---|
1182 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
|
---|
1183 | eval(shift(@insns));
|
---|
1184 | eval(shift(@insns));
|
---|
1185 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
|
---|
1186 | eval(shift(@insns));
|
---|
1187 | eval(shift(@insns));
|
---|
1188 |
|
---|
1189 | &vpsrld (@Tx[0],@X[0],31);
|
---|
1190 | eval(shift(@insns));
|
---|
1191 | eval(shift(@insns));
|
---|
1192 | eval(shift(@insns));
|
---|
1193 | eval(shift(@insns));
|
---|
1194 |
|
---|
1195 | &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword
|
---|
1196 | &vpaddd (@X[0],@X[0],@X[0]);
|
---|
1197 | eval(shift(@insns));
|
---|
1198 | eval(shift(@insns));
|
---|
1199 | eval(shift(@insns));
|
---|
1200 | eval(shift(@insns));
|
---|
1201 |
|
---|
1202 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
|
---|
1203 | &vpsrld (@Tx[0],@Tx[1],30);
|
---|
1204 | eval(shift(@insns));
|
---|
1205 | eval(shift(@insns));
|
---|
1206 | eval(shift(@insns));
|
---|
1207 | eval(shift(@insns));
|
---|
1208 |
|
---|
1209 | &vpslld (@Tx[1],@Tx[1],2);
|
---|
1210 | &vpxor (@X[0],@X[0],@Tx[0]);
|
---|
1211 | eval(shift(@insns));
|
---|
1212 | eval(shift(@insns));
|
---|
1213 | eval(shift(@insns));
|
---|
1214 | eval(shift(@insns));
|
---|
1215 |
|
---|
1216 | &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
|
---|
1217 | eval(shift(@insns));
|
---|
1218 | eval(shift(@insns));
|
---|
# Every fifth vector switches $Kx to the next 16-byte row of K_XX_XX.
1219 | &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
|
---|
1220 | eval(shift(@insns));
|
---|
1221 | eval(shift(@insns));
|
---|
1222 |
|
---|
1223 |
|
---|
1224 | foreach (@insns) { eval; } # remaining instructions [if any]
|
---|
1225 |
|
---|
1226 | $Xi++; push(@X,shift(@X)); # "rotate" X[]
|
---|
1227 | }
|
---|
1228 |
|
---|
# Emit the AVX message-schedule update for one vector of X[32..79] (per the
# name), interleaved with four rounds of scalar snippets.
# Argument: a code reference returning one round's list of snippet strings.
# The statement order below is the instruction schedule of the output.
1229 | sub Xupdate_avx_32_79()
|
---|
1230 | { use integer;
|
---|
1231 | my $body = shift;
|
---|
1232 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
|
---|
# NOTE(review): presumably assigned by the eval'ed snippets, which are
# defined outside this chunk - confirm before renaming.
1233 | my ($a,$b,$c,$d,$e);
|
---|
1234 |
|
---|
1235 | &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
|
---|
1236 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
|
---|
1237 | eval(shift(@insns)); # body_20_39
|
---|
1238 | eval(shift(@insns));
|
---|
1239 | eval(shift(@insns));
|
---|
1240 | eval(shift(@insns)); # rol
|
---|
1241 |
|
---|
1242 | &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
|
---|
1243 | eval(shift(@insns));
|
---|
# Consume one more snippet here unless the next one is a rotate.
1244 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
|
---|
1245 | &vpaddd (@Tx[1],$Kx,@X[-1&7]);
|
---|
# Every fifth vector switches $Kx to the next 16-byte row of K_XX_XX.
1246 | &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0);
|
---|
1247 | eval(shift(@insns)); # ror
|
---|
1248 | eval(shift(@insns));
|
---|
1249 |
|
---|
1250 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
|
---|
1251 | eval(shift(@insns)); # body_20_39
|
---|
1252 | eval(shift(@insns));
|
---|
1253 | eval(shift(@insns));
|
---|
1254 | eval(shift(@insns)); # rol
|
---|
1255 |
|
---|
1256 | &vpsrld (@Tx[0],@X[0],30);
|
---|
1257 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
|
---|
1258 | eval(shift(@insns));
|
---|
1259 | eval(shift(@insns));
|
---|
1260 | eval(shift(@insns)); # ror
|
---|
1261 | eval(shift(@insns));
|
---|
1262 |
|
---|
1263 | &vpslld (@X[0],@X[0],2);
|
---|
1264 | eval(shift(@insns)); # body_20_39
|
---|
1265 | eval(shift(@insns));
|
---|
1266 | eval(shift(@insns));
|
---|
1267 | eval(shift(@insns)); # rol
|
---|
1268 | eval(shift(@insns));
|
---|
1269 | eval(shift(@insns));
|
---|
1270 | eval(shift(@insns)); # ror
|
---|
1271 | eval(shift(@insns));
|
---|
1272 |
|
---|
1273 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
|
---|
1274 | eval(shift(@insns)); # body_20_39
|
---|
1275 | eval(shift(@insns));
|
---|
1276 | eval(shift(@insns));
|
---|
1277 | eval(shift(@insns)); # rol
|
---|
1278 | eval(shift(@insns));
|
---|
1279 | eval(shift(@insns));
|
---|
1280 | eval(shift(@insns)); # rol
|
---|
1281 | eval(shift(@insns));
|
---|
1282 |
|
---|
1283 | foreach (@insns) { eval; } # remaining instructions
|
---|
1284 |
|
---|
1285 | $Xi++; push(@X,shift(@X)); # "rotate" X[]
|
---|
1286 | }
|
---|
1287 |
|
---|
# Emit the last schedule step of a block: stage the final X[]+K vector on the
# stack, run four rounds of snippets, then compare $inp with $len and branch
# to the "done" label when they are equal.  Otherwise start on the next
# 64-byte block: reload the byte-swap mask and K_00_19, load the four input
# vectors, byte-swap the first one and advance $inp.
# Arguments: code reference for the round snippets, then the label to jump to.
1288 | sub Xuplast_avx_80()
|
---|
1289 | { use integer;
|
---|
1290 | my $body = shift;
|
---|
1291 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
|
---|
# NOTE(review): presumably assigned by the eval'ed snippets - confirm.
1292 | my ($a,$b,$c,$d,$e);
|
---|
1293 |
|
---|
1294 | eval(shift(@insns));
|
---|
1295 | &vpaddd (@Tx[1],$Kx,@X[-1&7]);
|
---|
1296 | eval(shift(@insns));
|
---|
1297 | eval(shift(@insns));
|
---|
1298 | eval(shift(@insns));
|
---|
1299 | eval(shift(@insns));
|
---|
1300 |
|
---|
1301 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
|
---|
1302 |
|
---|
1303 | foreach (@insns) { eval; } # remaining instructions
|
---|
1304 |
|
---|
1305 | &cmp ($inp,$len);
|
---|
# $body was shifted off above, so this shift yields the label argument.
1306 | &je (shift);
|
---|
1307 |
|
---|
1308 | &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask
|
---|
1309 | &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19
|
---|
1310 | &vmovdqu(@X[-4&7],"0($inp)"); # load input
|
---|
1311 | &vmovdqu(@X[-3&7],"16($inp)");
|
---|
1312 | &vmovdqu(@X[-2&7],"32($inp)");
|
---|
1313 | &vmovdqu(@X[-1&7],"48($inp)");
|
---|
1314 | &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap
|
---|
1315 | &add ($inp,64);
|
---|
1316 |
|
---|
# Restart the schedule index; Xloop_avx counts up from 0.
1317 | $Xi=0;
|
---|
1318 | }
|
---|
1319 |
|
---|
# Emit four rounds of snippets for the current block while preparing the next
# one: byte-swap input vector ($Xi-3)&7, add $Kx to vector ($Xi-4)&7 and store
# the sum at 16*$Xi(%rsp).  $Xi is advanced by one per call.
# Argument: a code reference returning one round's list of snippet strings.
1320 | sub Xloop_avx()
|
---|
1321 | { use integer;
|
---|
1322 | my $body = shift;
|
---|
1323 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
|
---|
# NOTE(review): presumably assigned by the eval'ed snippets - confirm.
1324 | my ($a,$b,$c,$d,$e);
|
---|
1325 |
|
---|
1326 | eval(shift(@insns));
|
---|
1327 | eval(shift(@insns));
|
---|
1328 | &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
|
---|
1329 | eval(shift(@insns));
|
---|
1330 | eval(shift(@insns));
|
---|
1331 | &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx);
|
---|
1332 | eval(shift(@insns));
|
---|
1333 | eval(shift(@insns));
|
---|
1334 | eval(shift(@insns));
|
---|
1335 | eval(shift(@insns));
|
---|
1336 | &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU
|
---|
1337 | eval(shift(@insns));
|
---|
1338 | eval(shift(@insns));
|
---|
1339 |
|
---|
1340 | foreach (@insns) { eval; }
|
---|
1341 | $Xi++;
|
---|
1342 | }
|
---|
1343 |
|
---|
# Emit four rounds of snippets with no vector work interleaved; used on the
# "done" path, where there is no further input block to prepare.
# Argument: a code reference returning one round's list of snippet strings.
1344 | sub Xtail_avx()
|
---|
1345 | { use integer;
|
---|
1346 | my $body = shift;
|
---|
1347 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
|
---|
# NOTE(review): presumably assigned by the eval'ed snippets - confirm.
1348 | my ($a,$b,$c,$d,$e);
|
---|
1349 |
|
---|
1350 | foreach (@insns) { eval; }
|
---|
1351 | }
|
---|
1352 |
|
---|
# Loop head for the AVX encrypt routine.
1353 | $code.=<<___;
|
---|
1354 | .align 32
|
---|
1355 | .Loop_avx:
|
---|
1356 | ___
|
---|
# Round schedule for one 64-byte block: 17 helper calls of four rounds each
# here; the remaining 12 rounds come from the three Xloop_avx or Xtail_avx
# calls that follow.  The body_* callbacks are defined outside this chunk.
1357 | &Xupdate_avx_16_31(\&body_00_19);
|
---|
1358 | &Xupdate_avx_16_31(\&body_00_19);
|
---|
1359 | &Xupdate_avx_16_31(\&body_00_19);
|
---|
1360 | &Xupdate_avx_16_31(\&body_00_19);
|
---|
1361 | &Xupdate_avx_32_79(\&body_00_19);
|
---|
1362 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1363 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1364 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1365 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1366 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1367 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1368 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1369 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1370 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1371 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1372 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1373 | &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done"
|
---|
1374 |
|
---|
# Snapshot the generator state ($j, @V, the $aesenc counter $r and the
# @rndkey order): the "done" path re-emits the same rounds and restores
# these values first.
1375 | $saved_j=$j; @saved_V=@V;
|
---|
1376 | $saved_r=$r; @saved_rndkey=@rndkey;
|
---|
1377 |
|
---|
# Remaining rounds on the "more input follows" path.
1378 | &Xloop_avx(\&body_20_39);
|
---|
1379 | &Xloop_avx(\&body_20_39);
|
---|
1380 | &Xloop_avx(\&body_20_39);
|
---|
1381 |
|
---|
# Store the fourth ciphertext block, advance $in0 by 64, add the working
# variables into the SHA1 context, re-seed @T[0]/@T[1] and loop.
1382 | $code.=<<___;
|
---|
1383 | vmovups $iv,48($out,$in0) # write output
|
---|
1384 | lea 64($in0),$in0
|
---|
1385 |
|
---|
1386 | add 0($ctx),$A # update context
|
---|
1387 | add 4($ctx),@T[0]
|
---|
1388 | add 8($ctx),$C
|
---|
1389 | add 12($ctx),$D
|
---|
1390 | mov $A,0($ctx)
|
---|
1391 | add 16($ctx),$E
|
---|
1392 | mov @T[0],4($ctx)
|
---|
1393 | mov @T[0],$B # magic seed
|
---|
1394 | mov $C,8($ctx)
|
---|
1395 | mov $C,@T[1]
|
---|
1396 | mov $D,12($ctx)
|
---|
1397 | xor $D,@T[1]
|
---|
1398 | mov $E,16($ctx)
|
---|
1399 | and @T[1],@T[0]
|
---|
1400 | jmp .Loop_avx
|
---|
1401 |
|
---|
1402 | .Ldone_avx:
|
---|
1403 | ___
|
---|
# Rewind the generator to the snapshot taken before the Xloop_avx calls; the
# "done" path emits the same rounds again without input prefetch.
1404 | $jj=$j=$saved_j; @V=@saved_V;
|
---|
1405 | $r=$saved_r; @rndkey=@saved_rndkey;
|
---|
1406 |
|
---|
1407 | &Xtail_avx(\&body_20_39);
|
---|
1408 | &Xtail_avx(\&body_20_39);
|
---|
1409 | &Xtail_avx(\&body_20_39);
|
---|
1410 |
|
---|
# Store the last ciphertext block, fetch $ivp back from 88(%rsp), update the
# SHA1 context, write the final $iv out and clear the vector registers.
1411 | $code.=<<___;
|
---|
1412 | vmovups $iv,48($out,$in0) # write output
|
---|
1413 | mov 88(%rsp),$ivp # restore $ivp
|
---|
1414 |
|
---|
1415 | add 0($ctx),$A # update context
|
---|
1416 | add 4($ctx),@T[0]
|
---|
1417 | add 8($ctx),$C
|
---|
1418 | mov $A,0($ctx)
|
---|
1419 | add 12($ctx),$D
|
---|
1420 | mov @T[0],4($ctx)
|
---|
1421 | add 16($ctx),$E
|
---|
1422 | mov $C,8($ctx)
|
---|
1423 | mov $D,12($ctx)
|
---|
1424 | mov $E,16($ctx)
|
---|
1425 | vmovups $iv,($ivp) # write IV
|
---|
1426 | vzeroall
|
---|
1427 | ___
|
---|
# Win64 only: reload %xmm6-%xmm15 from the slots filled by the prologue.
1428 | $code.=<<___ if ($win64);
|
---|
1429 | movaps 96+0(%rsp),%xmm6
|
---|
1430 | movaps 96+16(%rsp),%xmm7
|
---|
1431 | movaps 96+32(%rsp),%xmm8
|
---|
1432 | movaps 96+48(%rsp),%xmm9
|
---|
1433 | movaps 96+64(%rsp),%xmm10
|
---|
1434 | movaps 96+80(%rsp),%xmm11
|
---|
1435 | movaps 96+96(%rsp),%xmm12
|
---|
1436 | movaps 96+112(%rsp),%xmm13
|
---|
1437 | movaps 96+128(%rsp),%xmm14
|
---|
1438 | movaps 96+144(%rsp),%xmm15
|
---|
1439 | ___
|
---|
# Epilogue: reload the six saved GPRs in reverse push order and restore %rsp.
1440 | $code.=<<___;
|
---|
1441 | lea `104+($win64?10*16:0)`(%rsp),%rsi
|
---|
1442 | mov 0(%rsi),%r15
|
---|
1443 | mov 8(%rsi),%r14
|
---|
1444 | mov 16(%rsi),%r13
|
---|
1445 | mov 24(%rsi),%r12
|
---|
1446 | mov 32(%rsi),%rbp
|
---|
1447 | mov 40(%rsi),%rbx
|
---|
1448 | lea 48(%rsi),%rsp
|
---|
1449 | .Lepilogue_avx:
|
---|
1450 | ret
|
---|
1451 | .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
|
---|
1452 | ___
|
---|
1453 |
|
---|
1454 | if ($stitched_decrypt) {{{
|
---|
1455 | # reset
|
---|
# Point the argument names back at the incoming registers and restart the
# generator counters before the AVX decrypt routine is produced.
1456 | ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
|
---|
1457 |
|
---|
1458 | $j=$jj=$r=$rx=0;
|
---|
1459 | $Xi=4;
|
---|
1460 |
|
---|
# AVX flavour of the @aes256_dec schedule: one entry per SHA1 round, indexed
# by $rx in body_*_dec; undef entries are rounds with nothing interleaved.
# First group: XOR the four input blocks with round key 0, load round key 1
# and save @X[2] at 64(%rsp).
1461 | @aes256_dec = (
|
---|
1462 | '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
|
---|
1463 | '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
|
---|
1464 | '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
|
---|
1465 | '&vpxor ($inout3,$rndkey0,"0x30($in0)");',
|
---|
1466 |
|
---|
1467 | '&vmovups($rndkey0,"16-112($key)");',
|
---|
1468 | '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3]
|
---|
1469 | undef,undef
|
---|
1470 | );
|
---|
# Thirteen middle rounds: four vaesdec per round, the fourth also loading the
# next round key.  Idle slots are inserted after rounds 3..5 and 11..12, with
# an additional pair after round 5.
1471 | for ($i=0;$i<13;$i++) {
|
---|
1472 | push (@aes256_dec,(
|
---|
1473 | '&vaesdec ($inout0,$inout0,$rndkey0);',
|
---|
1474 | '&vaesdec ($inout1,$inout1,$rndkey0);',
|
---|
1475 | '&vaesdec ($inout2,$inout2,$rndkey0);',
|
---|
1476 | '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
|
---|
1477 | ));
|
---|
1478 | push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
|
---|
1479 | push (@aes256_dec,(undef,undef)) if ($i==5);
|
---|
1480 | }
|
---|
# Final round, CBC chaining XORs and stores; the last element is emitted
# separately via eval(@aes256_dec[-1]).
1481 | push(@aes256_dec,(
|
---|
1482 | '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");',
|
---|
1483 | '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");',
|
---|
1484 | '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");',
|
---|
1485 | '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");',
|
---|
1486 |
|
---|
1487 | '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");',
|
---|
1488 | '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);',
|
---|
1489 | '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);',
|
---|
1490 | '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);',
|
---|
1491 |
|
---|
1492 | '&vmovups ("0x30($out,$in0)",$inout3);'
|
---|
1493 | ));
|
---|
1494 |
|
---|
# Open aesni256_cbc_sha1_dec_avx: fetch the 7th argument into $inp, push
# rbx/rbp/r12-r15 and reserve a 104-byte frame (plus 10*16 bytes on Win64).
1495 | $code.=<<___;
|
---|
1496 | .type aesni256_cbc_sha1_dec_avx,\@function,6
|
---|
1497 | .align 32
|
---|
1498 | aesni256_cbc_sha1_dec_avx:
|
---|
1499 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument
|
---|
1500 | push %rbx
|
---|
1501 | push %rbp
|
---|
1502 | push %r12
|
---|
1503 | push %r13
|
---|
1504 | push %r14
|
---|
1505 | push %r15
|
---|
1506 | lea `-104-($win64?10*16:0)`(%rsp),%rsp
|
---|
1507 | ___
|
---|
# Win64 only: spill %xmm6-%xmm15 and emit the .Lprologue_dec_avx label.
1508 | $code.=<<___ if ($win64);
|
---|
1509 | movaps %xmm6,96+0(%rsp)
|
---|
1510 | movaps %xmm7,96+16(%rsp)
|
---|
1511 | movaps %xmm8,96+32(%rsp)
|
---|
1512 | movaps %xmm9,96+48(%rsp)
|
---|
1513 | movaps %xmm10,96+64(%rsp)
|
---|
1514 | movaps %xmm11,96+80(%rsp)
|
---|
1515 | movaps %xmm12,96+96(%rsp)
|
---|
1516 | movaps %xmm13,96+112(%rsp)
|
---|
1517 | movaps %xmm14,96+128(%rsp)
|
---|
1518 | movaps %xmm15,96+144(%rsp)
|
---|
1519 | .Lprologue_dec_avx:
|
---|
1520 | ___
|
---|
# Relocate in0/out/len to %r12-%r14, point %r15 at key+112 and load the IV
# into @X[3].
1521 | $code.=<<___;
|
---|
1522 | vzeroall
|
---|
1523 | mov $in0,%r12 # reassign arguments
|
---|
1524 | mov $out,%r13
|
---|
1525 | mov $len,%r14
|
---|
1526 | lea 112($key),%r15 # size optimization
|
---|
1527 | vmovdqu ($ivp),@X[3] # load IV
|
---|
1528 | ___
|
---|
1529 | ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
|
---|
# Turn $len into an end-of-input pointer and $out into an offset from $in0,
# load the SHA1 state, byte-swap the first block, stage X[]+K on the stack,
# load round key 0 and enter the loop.
1530 | $code.=<<___;
|
---|
1531 | shl \$6,$len
|
---|
1532 | sub $in0,$out
|
---|
1533 | add $inp,$len # end of input
|
---|
1534 |
|
---|
1535 | lea K_XX_XX(%rip),$K_XX_XX
|
---|
1536 | mov 0($ctx),$A # load context
|
---|
1537 | mov 4($ctx),$B
|
---|
1538 | mov 8($ctx),$C
|
---|
1539 | mov 12($ctx),$D
|
---|
1540 | mov $B,@T[0] # magic seed
|
---|
1541 | mov 16($ctx),$E
|
---|
1542 | mov $C,@T[1]
|
---|
1543 | xor $D,@T[1]
|
---|
1544 | and @T[1],@T[0]
|
---|
1545 |
|
---|
1546 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
|
---|
1547 | vmovdqa 0($K_XX_XX),$Kx # K_00_19
|
---|
1548 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
|
---|
1549 | vmovdqu 16($inp),@X[-3&7]
|
---|
1550 | vmovdqu 32($inp),@X[-2&7]
|
---|
1551 | vmovdqu 48($inp),@X[-1&7]
|
---|
1552 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
|
---|
1553 | add \$64,$inp
|
---|
1554 | vpshufb @X[2],@X[-3&7],@X[-3&7]
|
---|
1555 | vpshufb @X[2],@X[-2&7],@X[-2&7]
|
---|
1556 | vpshufb @X[2],@X[-1&7],@X[-1&7]
|
---|
1557 | vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
|
---|
1558 | vpaddd $Kx,@X[-3&7],@X[1]
|
---|
1559 | vpaddd $Kx,@X[-2&7],@X[2]
|
---|
1560 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
|
---|
1561 | vmovdqa @X[1],16(%rsp)
|
---|
1562 | vmovdqa @X[2],32(%rsp)
|
---|
1563 | vmovups -112($key),$rndkey0 # $key[0]
|
---|
1564 | jmp .Loop_dec_avx
|
---|
1565 |
|
---|
1566 | .align 32
|
---|
1567 | .Loop_dec_avx:
|
---|
1568 | ___
|
---|
# Round schedule for one 64-byte block: 17 helper calls of four rounds each
# here; the remaining 12 rounds come from the Xloop_avx or Xtail_avx calls
# that follow.  The *_dec callbacks switch round function themselves when $rx
# reaches 19 and 39, which is why body_00_19_dec is still passed on the fifth
# call.
1569 | &Xupdate_avx_16_31(\&body_00_19_dec);
|
---|
1570 | &Xupdate_avx_16_31(\&body_00_19_dec);
|
---|
1571 | &Xupdate_avx_16_31(\&body_00_19_dec);
|
---|
1572 | &Xupdate_avx_16_31(\&body_00_19_dec);
|
---|
1573 | &Xupdate_avx_32_79(\&body_00_19_dec);
|
---|
1574 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1575 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1576 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1577 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1578 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1579 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1580 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1581 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1582 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1583 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1584 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1585 | &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done"
|
---|
1586 |
|
---|
# Snapshot the generator state; the "done" path re-emits the same rounds and
# restores $j, @V and $rx first.
1587 | $saved_j=$j; @saved_V=@V;
|
---|
1588 | $saved_rx=$rx;
|
---|
1589 |
|
---|
# Remaining rounds on the "more input follows" path.
1590 | &Xloop_avx(\&body_20_39_dec);
|
---|
1591 | &Xloop_avx(\&body_20_39_dec);
|
---|
1592 | &Xloop_avx(\&body_20_39_dec);
|
---|
1593 |
|
---|
1594 | eval(@aes256_dec[-1]); # last store
|
---|
# Advance $in0 by one block, add the working variables into the SHA1 context,
# re-seed @T[0]/@T[1] and loop.
1595 | $code.=<<___;
|
---|
1596 | lea 64($in0),$in0
|
---|
1597 |
|
---|
1598 | add 0($ctx),$A # update context
|
---|
1599 | add 4($ctx),@T[0]
|
---|
1600 | add 8($ctx),$C
|
---|
1601 | add 12($ctx),$D
|
---|
1602 | mov $A,0($ctx)
|
---|
1603 | add 16($ctx),$E
|
---|
1604 | mov @T[0],4($ctx)
|
---|
1605 | mov @T[0],$B # magic seed
|
---|
1606 | mov $C,8($ctx)
|
---|
1607 | mov $C,@T[1]
|
---|
1608 | mov $D,12($ctx)
|
---|
1609 | xor $D,@T[1]
|
---|
1610 | mov $E,16($ctx)
|
---|
1611 | and @T[1],@T[0]
|
---|
1612 | jmp .Loop_dec_avx
|
---|
1613 |
|
---|
1614 | .Ldone_dec_avx:
|
---|
1615 | ___
|
---|
# Rewind the generator to the snapshot taken before the Xloop_avx calls; the
# "done" path emits the same rounds again without input prefetch.
1616 | $jj=$j=$saved_j; @V=@saved_V;
|
---|
1617 | $rx=$saved_rx;
|
---|
1618 |
|
---|
1619 | &Xtail_avx(\&body_20_39_dec);
|
---|
1620 | &Xtail_avx(\&body_20_39_dec);
|
---|
1621 | &Xtail_avx(\&body_20_39_dec);
|
---|
1622 |
|
---|
1623 | eval(@aes256_dec[-1]); # last store
|
---|
# Final context update, write-back of the chaining value in @X[3] to ($ivp),
# then clear the vector registers.
1624 | $code.=<<___;
|
---|
1625 |
|
---|
1626 | add 0($ctx),$A # update context
|
---|
1627 | add 4($ctx),@T[0]
|
---|
1628 | add 8($ctx),$C
|
---|
1629 | mov $A,0($ctx)
|
---|
1630 | add 12($ctx),$D
|
---|
1631 | mov @T[0],4($ctx)
|
---|
1632 | add 16($ctx),$E
|
---|
1633 | mov $C,8($ctx)
|
---|
1634 | mov $D,12($ctx)
|
---|
1635 | mov $E,16($ctx)
|
---|
1636 | vmovups @X[3],($ivp) # write IV
|
---|
1637 | vzeroall
|
---|
1638 | ___
|
---|
# Win64 only: reload %xmm6-%xmm15 from the slots filled by the prologue.
1639 | $code.=<<___ if ($win64);
|
---|
1640 | movaps 96+0(%rsp),%xmm6
|
---|
1641 | movaps 96+16(%rsp),%xmm7
|
---|
1642 | movaps 96+32(%rsp),%xmm8
|
---|
1643 | movaps 96+48(%rsp),%xmm9
|
---|
1644 | movaps 96+64(%rsp),%xmm10
|
---|
1645 | movaps 96+80(%rsp),%xmm11
|
---|
1646 | movaps 96+96(%rsp),%xmm12
|
---|
1647 | movaps 96+112(%rsp),%xmm13
|
---|
1648 | movaps 96+128(%rsp),%xmm14
|
---|
1649 | movaps 96+144(%rsp),%xmm15
|
---|
1650 | ___
|
---|
# Epilogue: reload the six saved GPRs in reverse push order and restore %rsp.
1651 | $code.=<<___;
|
---|
1652 | lea `104+($win64?10*16:0)`(%rsp),%rsi
|
---|
1653 | mov 0(%rsi),%r15
|
---|
1654 | mov 8(%rsi),%r14
|
---|
1655 | mov 16(%rsi),%r13
|
---|
1656 | mov 24(%rsi),%r12
|
---|
1657 | mov 32(%rsi),%rbp
|
---|
1658 | mov 40(%rsi),%rbx
|
---|
1659 | lea 48(%rsi),%rsp
|
---|
1660 | .Lepilogue_dec_avx:
|
---|
1661 | ret
|
---|
1662 | .size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
|
---|
1663 | ___
|
---|
1664 | }}}
|
---|
1665 | }
|
---|
# Constant pool shared by all code paths.  Layout, 16 bytes per row: the four
# SHA1 round constants (each replicated across four lanes) at offsets
# 0/16/32/48, the pshufb byte-swap mask at offset 64, and a 16-byte reversal
# mask at offset 0x50 that the SHA-extension path loads as K_XX_XX+0x50.
# The ident string follows.
1666 | $code.=<<___;
|
---|
1667 | .align 64
|
---|
1668 | K_XX_XX:
|
---|
1669 | .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
|
---|
1670 | .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
|
---|
1671 | .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
|
---|
1672 | .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
|
---|
1673 | .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
|
---|
1674 | .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
|
---|
1675 |
|
---|
1676 | .asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
1677 | .align 64
|
---|
1678 | ___
|
---|
# Generator for aesni_cbc_sha1_enc_shaext, emitted only when $shaext is
# set.  The assembly is built by alternating two things: heredoc chunks
# holding SHA-1 instructions (sha1rnds4/sha1nexte/sha1msg1/sha1msg2) and
# calls through the code reference $aesenc, which is defined outside
# this section and is invoked between the chunks.  The order of heredocs
# and &$aesenc() calls is therefore significant.
if ($shaext) {{{
# Names for the general-purpose registers used by the generated code;
# $inp is loaded from the stack as the 7th argument below.
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";

# XMM register names for the AES side.
($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
# Counter tested by the "while($r<40)" loop further down; presumably
# advanced by $aesenc, which is not visible here -- TODO confirm.
$r=0;

# XMM register names for the SHA-1 side: %xmm7..%xmm12 for state and
# temporaries, %xmm3..%xmm6 for the four message vectors.
my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));

# Function header; the stack offset of the 7th argument depends on $win64.
$code.=<<___;
.type aesni_cbc_sha1_enc_shaext,\@function,6
.align 32
aesni_cbc_sha1_enc_shaext:
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
___
# Win64 only: reserve 8+10*16 bytes and save %xmm6..%xmm15 at negative
# offsets from %rax.  NOTE(review): %rax is not assigned anywhere in the
# visible code; presumably it holds the entry stack pointer, set up
# outside this file's text -- confirm.
$code.=<<___ if ($win64);
lea `-8-10*16`(%rsp),%rsp
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
movaps %xmm9,-8-7*16(%rax)
movaps %xmm10,-8-6*16(%rax)
movaps %xmm11,-8-5*16(%rax)
movaps %xmm12,-8-4*16(%rax)
movaps %xmm13,-8-3*16(%rax)
movaps %xmm14,-8-2*16(%rax)
movaps %xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
# Set-up before the loop: load the hash state from $ctx, the swap mask
# from K_XX_XX+0x50, the round count, the first key material and the IV.
# "sub $in0,$out" turns $out into a displacement from $in0, which is how
# the output store "48($out,$in0)" at the bottom of the loop addresses it.
$code.=<<___;
movdqu ($ctx),$ABCD
movd 16($ctx),$E
movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap

mov 240($key),$rounds
sub $in0,$out
movups ($key),$rndkey0 # $key[0]
movups ($ivp),$iv # load IV
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization

pshufd \$0b00011011,$ABCD,$ABCD # flip word order
pshufd \$0b00011011,$E,$E # flip word order
jmp .Loop_shaext

.align 16
.Loop_shaext:
___
# Loop body.  Each pass consumes 0x40 bytes from $inp; the message words
# are loaded and byte-swapped with $BSWAP while $E and $ABCD are copied
# to their *_SAVE registers.
&$aesenc();
$code.=<<___;
movdqu ($inp),@MSG[0]
movdqa $E,$E_SAVE # offload $E
pshufb $BSWAP,@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqa $ABCD,$ABCD_SAVE # offload $ABCD
___
&$aesenc();
$code.=<<___;
pshufb $BSWAP,@MSG[1]

paddd @MSG[0],$E
movdqu 0x20($inp),@MSG[2]
lea 0x40($inp),$inp
pxor $E_SAVE,@MSG[0] # black magic
___
&$aesenc();
$code.=<<___;
pxor $E_SAVE,@MSG[0] # black magic
movdqa $ABCD,$E_
pshufb $BSWAP,@MSG[2]
sha1rnds4 \$0,$E,$ABCD # 0-3
sha1nexte @MSG[1],$E_
___
&$aesenc();
$code.=<<___;
sha1msg1 @MSG[1],@MSG[0]
movdqu -0x10($inp),@MSG[3]
movdqa $ABCD,$E
pshufb $BSWAP,@MSG[3]
___
&$aesenc();
$code.=<<___;
sha1rnds4 \$0,$E_,$ABCD # 4-7
sha1nexte @MSG[2],$E
pxor @MSG[2],@MSG[0]
sha1msg1 @MSG[2],@MSG[1]
___
&$aesenc();

# Fourteen unrolled steps ($i = 2..15).  The sha1rnds4 immediate is
# int($i/5): 0 for $i 2..4, 1 for 5..9, 2 for 10..14 and 3 for 15.
# After each step the roles of $E/$E_ are swapped and @MSG is rotated by
# one, so the same heredoc text names different registers each time.
# The "# 8-11" text inside the heredoc is emitted unchanged for every step.
for($i=2;$i<20-4;$i++) {
$code.=<<___;
movdqa $ABCD,$E_
sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11
sha1nexte @MSG[3],$E_
___
&$aesenc();
$code.=<<___;
sha1msg2 @MSG[3],@MSG[0]
pxor @MSG[3],@MSG[1]
sha1msg1 @MSG[3],@MSG[2]
___
($E,$E_)=($E_,$E);
push(@MSG,shift(@MSG));

&$aesenc();
}
# Four closing sha1rnds4 steps, all with immediate 3; the message
# schedule instructions taper off here.
$code.=<<___;
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 64-67
sha1nexte @MSG[3],$E_
sha1msg2 @MSG[3],@MSG[0]
pxor @MSG[3],@MSG[1]
___
&$aesenc();
$code.=<<___;
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 68-71
sha1nexte @MSG[0],$E
sha1msg2 @MSG[0],@MSG[1]
___
&$aesenc();
$code.=<<___;
movdqa $E_SAVE,@MSG[0]
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 72-75
sha1nexte @MSG[1],$E_
___
&$aesenc();
$code.=<<___;
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 76-79
sha1nexte $MSG[0],$E
___
# Keep calling $aesenc until $r reaches 40.
while($r<40) { &$aesenc(); } # remaining aesenc's
# Loop tail: $len is decremented once per pass and the loop repeats while
# it is non-zero; $in0 advances by 64 per pass.  After the loop the word
# order is flipped back and the IV and hash state are written out.
$code.=<<___;
dec $len

paddd $ABCD_SAVE,$ABCD
movups $iv,48($out,$in0) # write output
lea 64($in0),$in0
jnz .Loop_shaext

pshufd \$0b00011011,$ABCD,$ABCD
pshufd \$0b00011011,$E,$E
movups $iv,($ivp) # write IV
movdqu $ABCD,($ctx)
movd $E,16($ctx)
___
# Win64 only: reload %xmm6..%xmm15 from the offsets used in the prologue
# and restore the stack pointer from %rax.
$code.=<<___ if ($win64);
movaps -8-10*16(%rax),%xmm6
movaps -8-9*16(%rax),%xmm7
movaps -8-8*16(%rax),%xmm8
movaps -8-7*16(%rax),%xmm9
movaps -8-6*16(%rax),%xmm10
movaps -8-5*16(%rax),%xmm11
movaps -8-4*16(%rax),%xmm12
movaps -8-3*16(%rax),%xmm13
movaps -8-2*16(%rax),%xmm14
movaps -8-1*16(%rax),%xmm15
mov %rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
ret
.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
}}}
# Win64 structured-exception support, emitted only when $win64 is set.
# It consists of the handler routine ssse3_handler followed by the
# .pdata/.xdata tables that attach it to the generated functions.
#
# The handler compares context->Rip with the two label addresses stored
# in HandlerData[] (prologue and epilogue).  Outside that range it goes
# straight to the common tail.  Inside it, 20 quadwords are copied from
# the faulting frame to offset 512 of the CONTEXT (commented below as
# &context.Xmm6) and the recorded stack pointer is adjusted.  The common
# tail then copies the CONTEXT into disp->ContextRecord, calls
# RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Register names for the handler's four arguments.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# Handler entry: save registers, then test Rip against the prologue and
# epilogue labels taken from HandlerData[0] and HandlerData[1].
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type ssse3_handler,\@abi-omnipotent
.align 16
ssse3_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData

mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail

mov 152($context),%rax # pull context->Rsp

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
___
# Extra branch when the SHA-extension function exists: if Rip is at or
# above aesni_cbc_sha1_enc_shaext, the XMM save area starts at (%rax)
# and the frame is 168 bytes (8+10*16, matching that function's Win64
# prologue); no general-purpose registers are reloaded on this path.
$code.=<<___ if ($shaext);
lea aesni_cbc_sha1_enc_shaext(%rip),%r10
cmp %r10,%rbx
jb .Lseh_no_shaext

lea (%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
lea 168(%rax),%rax # adjust stack pointer
jmp .Lcommon_seh_tail
.Lseh_no_shaext:
___
# Default path: the XMM save area starts at 96(%rax); six general-purpose
# registers are then read from 104+10*16(%rax) onwards and written back
# into the CONTEXT.  The common tail and the handler's own epilogue follow,
# and after them the .pdata entry for the ssse3 function.
$code.=<<___;
lea 96(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
lea `104+10*16`(%rax),%rax # adjust stack pointer

mov 0(%rax),%r15
mov 8(%rax),%r14
mov 16(%rax),%r13
mov 24(%rax),%r12
mov 32(%rax),%rbp
mov 40(%rax),%rbx
lea 48(%rax),%rax
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15

.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi

mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq

mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)

mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size ssse3_handler,.-ssse3_handler

.section .pdata
.align 4
.rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
.rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
.rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
___
# Additional .pdata entries, one begin/end/info triple per optional
# function, emitted only when the matching code path was generated.
$code.=<<___ if ($avx);
.rva .LSEH_begin_aesni_cbc_sha1_enc_avx
.rva .LSEH_end_aesni_cbc_sha1_enc_avx
.rva .LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
.rva .LSEH_begin_aesni_cbc_sha1_enc_shaext
.rva .LSEH_end_aesni_cbc_sha1_enc_shaext
.rva .LSEH_info_aesni_cbc_sha1_enc_shaext
___
# .xdata records: each names ssse3_handler and supplies the prologue and
# epilogue labels the handler reads as HandlerData[0] and HandlerData[1].
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
.rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
.byte 9,0,0,0
.rva ssse3_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
.byte 9,0,0,0
.rva ssse3_handler
.rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
___
}
2002 |
|
---|
2003 | ####################################################################
|
---|
# rex(\@opcode, $dst, $src)
#
# Prepends a REX prefix byte to the opcode array referenced by the first
# argument when either register number is 8 or higher: bit 0x04 is set
# for $dst, bit 0x01 for $src, and the byte is completed with 0x40.
# When both numbers are below 8 the array is left untouched.
sub rex {
    my ($opcode_ref, $dst, $src) = @_;

    my $prefix = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    unshift @{$opcode_ref}, $prefix | 0x40 if ($prefix);
}
2013 |
|
---|
# sha1rnds4($operands)
#
# Turns the operand text of a sha1rnds4 instruction into a ".byte"
# directive.  The operands must look like "$imm,%xmmS,%xmmD"; the bytes
# are 0x0f,0x3a,0xcc (with a prefix from rex() placed in front when a
# register number is 8 or higher), a ModR/M byte built from the two
# register numbers, and the immediate.  Operand text of any other shape
# is returned as "sha1rnds4\t<operands>".
sub sha1rnds4 {
    my $operands = $_[0];

    # Not the "$imm,%xmm,%xmm" form: hand the instruction back as text.
    return "sha1rnds4\t" . $operands
        unless ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/);

    my ($imm, $src, $dst) = ($1, $2, $3);

    my @opcode = (0x0f, 0x3a, 0xcc);
    rex(\@opcode, $dst, $src);
    push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M

    # An immediate starting with "0" goes through oct(); anything else is
    # appended exactly as written.
    push @opcode, ($imm =~ /^0/ ? oct($imm) : $imm);

    return ".byte\t" . join(',', @opcode);
}
2026 |
|
---|
# sha1op38($instr, $operands)
#
# Encodes sha1nexte, sha1msg1 or sha1msg2 with two %xmm operands as a
# ".byte" directive: 0x0f,0x38 (with a prefix from rex() placed in front
# when a register number is 8 or higher), the per-instruction byte from
# the table below, then a ModR/M byte built from the register numbers.
# An unlisted mnemonic, or operands that are not "%xmmS,%xmmD", yields
# "<instr>\t<operands>" instead.
sub sha1op38 {
    my ($instr, $operands) = @_;

    my %opcodelet = (
        "sha1nexte" => 0xc8,
        "sha1msg1"  => 0xc9,
        "sha1msg2"  => 0xca,
    );
    my $selector = $opcodelet{$instr};

    if (defined($selector)
        && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($src, $dst) = ($1, $2);

        my @opcode = (0x0f, 0x38);
        rex(\@opcode, $dst, $src);
        push @opcode, $selector, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }

    return $instr . "\t" . $operands;
}
2044 |
|
---|
# aesni($line)
#
# Encodes "aesenc", "aesenclast", "aesdec" or "aesdeclast" with two %xmm
# operands as a ".byte" directive: 0x66, an optional prefix from rex()
# when a register number is 8 or higher, 0x0f,0x38, the per-instruction
# byte from the table below, and a ModR/M byte built from the register
# numbers.
#
# Every other input is returned unchanged.  That includes an "aes*"
# mnemonic that is not in the table: returning undef there (as this
# routine used to) makes the substitution that calls it replace the
# whole instruction with an empty string, silently dropping it from the
# output.  Handing the text back matches what sha1rnds4() and sha1op38()
# do for input they cannot encode.
sub aesni {
    my $line = shift;

    if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        if (defined(my $selector = $opcodelet{$mnemonic})) {
            my @opcode = (0x0f, 0x38);
            rex(\@opcode, $dst, $src);
            push @opcode, $selector, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
            unshift @opcode, 0x66;    # goes in front of any REX byte
            return ".byte\t" . join(',', @opcode);
        }
    }

    return $line;
}
2062 |
|
---|
# Final pass: walk the accumulated assembly text in $code line by line,
# rewrite it, and print it.
foreach (split("\n",$code)) {
    # Replace each `...` span with the result of evaluating it as Perl.
    s/\`([^\`]*)\`/eval $1/geo;

    # Hand SHA and AES instructions to the encoders defined above.  The
    # alternatives are tried in order and at most one of them applies.
    s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
    s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or
    s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

    print $_,"\n";
}
# Buffered write errors only surface when the handle is closed, so a
# failed close must stop the script with a non-zero status instead of
# leaving truncated output behind a successful exit.
close STDOUT or die "error closing STDOUT: $!";