1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # June 2011
|
---|
18 | #
|
---|
19 | # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
|
---|
20 | # in http://download.intel.com/design/intarch/papers/323686.pdf, is
|
---|
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
|
---|
22 | # parallelism, interleaving it with another algorithm would allow to
|
---|
23 | # utilize processor resources better and achieve better performance.
|
---|
24 | # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
|
---|
25 | # AESNI code is weaved into it. Below are performance numbers in
|
---|
26 | # cycles per processed byte, less is better, for standalone AESNI-CBC
|
---|
27 | # encrypt, sum of the latter and standalone SHA1, and "stitched"
|
---|
28 | # subroutine:
|
---|
29 | #
|
---|
30 | # AES-128-CBC +SHA1 stitch gain
|
---|
31 | # Westmere 3.77[+5.3] 9.07 6.55 +38%
|
---|
32 | # Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
|
---|
33 | # Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
|
---|
34 | # Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
|
---|
35 | # Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%)
|
---|
36 | # Bulldozer 5.77[+6.0] 11.72 6.37 +84%
|
---|
37 | # Ryzen(**) 2.71[+1.93] 4.64 2.74 +69%
|
---|
38 | # Goldmont(**) 3.82[+1.70] 5.52 4.20 +31%
|
---|
39 | #
|
---|
40 | # AES-192-CBC
|
---|
41 | # Westmere 4.51 9.81 6.80 +44%
|
---|
42 | # Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%)
|
---|
43 | # Ivy Bridge 6.05 10.65 6.07 +75%
|
---|
44 | # Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%)
|
---|
45 | # Bulldozer 6.89 12.84 6.96 +84%
|
---|
46 | #
|
---|
47 | # AES-256-CBC
|
---|
48 | # Westmere 5.25 10.55 7.21 +46%
|
---|
49 | # Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
|
---|
50 | # Ivy Bridge 7.05 11.65 7.12 +64%
|
---|
51 | # Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
|
---|
52 | # Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%)
|
---|
53 | # Bulldozer 8.00 13.95 8.25 +69%
|
---|
54 | # Ryzen(**) 3.71 5.64 3.72 +52%
|
---|
55 | # Goldmont(**) 5.35 7.05 5.76 +22%
|
---|
56 | #
|
---|
# (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
|
---|
58 | # background information. Above numbers in parentheses are SSSE3
|
---|
59 | # results collected on AVX-capable CPU, i.e. apply on OSes that
|
---|
60 | # don't support AVX.
|
---|
61 | # (**) SHAEXT results.
|
---|
62 | #
|
---|
63 | # Needless to mention that it makes no sense to implement "stitched"
|
---|
64 | # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
|
---|
65 | # fully utilize parallelism, so stitching would not give any gain
|
---|
66 | # anyway. Well, there might be some, e.g. because of better cache
|
---|
67 | # locality... For reference, here are performance results for
|
---|
68 | # standalone AESNI-CBC decrypt:
|
---|
69 | #
|
---|
70 | # AES-128-CBC AES-192-CBC AES-256-CBC
|
---|
71 | # Westmere 1.25 1.50 1.75
|
---|
72 | # Sandy Bridge 0.74 0.91 1.09
|
---|
73 | # Ivy Bridge 0.74 0.90 1.11
|
---|
74 | # Haswell 0.63 0.76 0.88
|
---|
75 | # Bulldozer 0.70 0.85 0.99
|
---|
76 |
|
---|
77 | # And indeed:
|
---|
78 | #
|
---|
79 | # AES-256-CBC +SHA1 stitch gain
|
---|
80 | # Westmere 1.75 7.20 6.68 +7.8%
|
---|
81 | # Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%)
|
---|
82 | # Ivy Bridge 1.11 5.70 5.45 +4.6%
|
---|
83 | # Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%)
|
---|
84 | # Bulldozer 0.99 6.95 5.95 +17%(**)
|
---|
85 | #
|
---|
86 | # (*) Tiny improvement coefficient on Haswell is because we compare
|
---|
87 | # AVX1 stitch to sum with AVX2 SHA1.
|
---|
88 | # (**) Execution is fully dominated by integer code sequence and
|
---|
89 | # SIMD still hardly shows [in single-process benchmark;-]
|
---|
90 |
|
---|
# Command line is "[flavour] [output]".  The trailing argument is taken as
# the output file when it ends in an extension; the leading argument is taken
# as the assembler flavour when it contains no dot at all.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# Win64 calling convention is selected by flavour or by an .asm output name.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Locate the perlasm translator: next to this script first, then in the
# shared perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
$xlate = "${dir}x86_64-xlate.pl";
unless (-f $xlate) {
    $xlate = "${dir}../../perlasm/x86_64-xlate.pl";
    die "can't locate x86_64-xlate.pl" unless (-f $xlate);
}

# Enable the AVX code path only if the assembler in use reports a version
# that passes the checks below.  Probes run in this order and stop at the
# first one that succeeds.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1>=2.19) {
    $avx = 1;
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.09) {
    $avx = 1;
}
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    `ml64 2>&1` =~ /Version ([0-9]+)\./ && $1>=10) {
    $avx = 1;
}
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0) {
    $avx = 1;
}

$shaext = 1;		### set to zero if compiling for 1.0.1

$stitched_decrypt = 0;

# Everything printed to STDOUT from here on is piped through the translator.
open(OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"")
    || die "can't call $xlate: $!";
*STDOUT = *OUT;
|
---|
121 |
|
---|
# void aesni_cbc_sha1_enc(const void *inp,
#                         void *out,
#                         size_t length,
#                         const AES_KEY *key,
#                         unsigned char *iv,
#                         SHA_CTX *ctx,
#                         const void *in0);
#
# Public entry point.  It holds no cipher or hash code of its own: it loads
# words from OPENSSL_ia32cap_P and jumps to one of the implementations
# (_shaext, _avx or _ssse3), leaving the arguments untouched.
# Note that the register names declared further down call the first
# argument $in0 and the stack-passed seventh argument $inp.

# Function header and capability loads shared by all dispatch tests.
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P

.globl aesni_cbc_sha1_enc
.type aesni_cbc_sha1_enc,\@abi-omnipotent
.align 32
aesni_cbc_sha1_enc:
.cfi_startproc
	# caller should check for SSSE3 and AES-NI bits
	mov OPENSSL_ia32cap_P+0(%rip),%r10d
	mov OPENSSL_ia32cap_P+4(%rip),%r11
___
# Emitted only when $shaext is set: bit 61 of the 64-bit value loaded from
# offset 4 selects the _shaext routine.
$code.=<<___ if ($shaext);
	bt \$61,%r11 # check SHA bit
	jc aesni_cbc_sha1_enc_shaext
___
# Emitted only when the assembler supports AVX ($avx): the _avx routine is
# taken only if both masked bits (1<<28 and 1<<30) are set.
$code.=<<___ if ($avx);
	and \$`1<<28`,%r11d # mask AVX bit
	and \$`1<<30`,%r10d # mask "Intel CPU" bit
	or %r11d,%r10d
	cmp \$`1<<28|1<<30`,%r10d
	je aesni_cbc_sha1_enc_avx
___
# Fallback: SSSE3 routine.  The ret after the unconditional jmp is never
# reached.
$code.=<<___;
	jmp aesni_cbc_sha1_enc_ssse3
	ret
.cfi_endproc
.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___
|
---|
160 |
|
---|
# Argument registers on entry to the worker routines (the seventh argument is
# loaded into $inp by the prologue).
my ($in0, $out, $len, $key, $ivp, $ctx, $inp) = qw(%rdi %rsi %rdx %rcx %r8 %r9 %r10);

# Generator state: $Xi indexes the message-schedule window, the remaining
# counters track rounds and AES steps emitted so far.
my $Xi = 4;
my ($j, $jj, $r, $sn, $rx) = (0) x 5;

# SHA1 working variables live in 32-bit registers (size optimization); @T
# holds the two scratch registers used by the round bodies.
($A, $B, $C, $D, $E) = qw(%eax %ebx %ecx %edx %ebp);
my @V = ($A, $B, $C, $D, $E);
my @T = qw(%esi %edi);
my $K_XX_XX = "%r11";

# Default SIMD register assignment ...
my @X  = map { "%xmm$_" } 4..7, 0..3;
my @Tx = map { "%xmm$_" } 8..10;
my ($rndkey0, $iv, $in) = map { "%xmm$_" } 11..13;		# for enc
my @rndkey = qw(%xmm14 %xmm15);					# for enc
my ($inout0, $inout1, $inout2, $inout3) = map { "%xmm$_" } 12..15;	# for dec

# ... which is unconditionally replaced by the Atom Silvermont friendly one.
{
    # The goal is to minimize amount of instructions with more than
    # 3 prefix bytes. Or in more practical terms to keep AES-NI *and*
    # SSSE3 instructions to upper half of the register bank.
    @X  = map { "%xmm$_" } 8..11, 4..7;
    @Tx = map { "%xmm$_" } 12, 13, 3;
    ($iv, $in, $rndkey0) = map { "%xmm$_" } 2, 14, 15;
    @rndkey = qw(%xmm0 %xmm1);
}
|
---|
183 |
|
---|
# Catch-all thunk in the [simplified] 32-bit perlasm style: a call to any
# undefined sub, e.g. &mov($dst,$src), appends one line of assembly to $code.
# The sub name becomes the mnemonic and the operands are written in reverse
# order; a last operand that is purely numeric is emitted as an immediate
# with a '$' prefix.
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;		# drop package qualifier
    my $last = pop;
    $last = "\$$last" if ($last*1 eq $last);		# number => immediate
    my @operands = ($last, reverse @_);
    $code .= "\t$mnemonic\t" . join(',', @operands) . "\n";
}

# Rotate emitters, reached through code references; the round bodies invoke
# them as &$_rol(...) and &$_ror(...).
my ($_rol, $_ror) = (sub { &rol(@_) }, sub { &ror(@_) });
|
---|
193 |
|
---|
# aesni_cbc_sha1_enc_ssse3: prologue of the SSSE3 code path.
#
# Stack frame, as used by the code emitted below and by the generators:
#   0..63(%rsp)   X[]+K values handed from the SIMD unit to the integer rounds
#   88(%rsp)      saved IV pointer
#   96+...(%rsp)  %xmm6-%xmm15 save area (Win64 only)

# Fetch the stack-passed seventh argument, save the callee-saved integer
# registers and reserve the frame.
$code.=<<___;
.type aesni_cbc_sha1_enc_ssse3,\@function,6
.align 32
aesni_cbc_sha1_enc_ssse3:
.cfi_startproc
	mov `($win64?56:8)`(%rsp),$inp # load 7th argument
	#shr \$6,$len # debugging artefact
	#jz .Lepilogue_ssse3 # debugging artefact
	push %rbx
.cfi_push %rbx
	push %rbp
.cfi_push %rbp
	push %r12
.cfi_push %r12
	push %r13
.cfi_push %r13
	push %r14
.cfi_push %r14
	push %r15
.cfi_push %r15
	lea `-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset `104+($win64?10*16:0)`
	#mov $in0,$inp # debugging artefact
	#lea 64(%rsp),$ctx # debugging artefact
___
# Win64 only: preserve %xmm6-%xmm15 in the extra 10*16 bytes of the frame.
$code.=<<___ if ($win64);
	movaps %xmm6,96+0(%rsp)
	movaps %xmm7,96+16(%rsp)
	movaps %xmm8,96+32(%rsp)
	movaps %xmm9,96+48(%rsp)
	movaps %xmm10,96+64(%rsp)
	movaps %xmm11,96+80(%rsp)
	movaps %xmm12,96+96(%rsp)
	movaps %xmm13,96+112(%rsp)
	movaps %xmm14,96+128(%rsp)
	movaps %xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
# Move the arguments into %r12-%r15 (the key pointer is biased by 112),
# load the IV and remember where to write it back.
$code.=<<___;
	mov $in0,%r12 # reassign arguments
	mov $out,%r13
	mov $len,%r14
	lea 112($key),%r15 # size optimization
	movdqu ($ivp),$iv # load IV
	mov $ivp,88(%rsp) # save $ivp
___
# From here on the Perl-side names refer to the new homes of the arguments.
($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
# The IV pointer was saved to the stack above, so the 32-bit form of its
# register is reused to hold the value loaded from 240-112($key).
my $rounds="${ivp}d";
# Turn the block count into an end-of-input pointer, make $out relative to
# $in0, load the five SHA1 state words and prepare the first 64 input bytes:
# byte-swap them, add K_00_19 and store the first three X[]+K vectors.
$code.=<<___;
	shl \$6,$len
	sub $in0,$out
	mov 240-112($key),$rounds
	add $inp,$len # end of input

	lea K_XX_XX(%rip),$K_XX_XX
	mov 0($ctx),$A # load context
	mov 4($ctx),$B
	mov 8($ctx),$C
	mov 12($ctx),$D
	mov $B,@T[0] # magic seed
	mov 16($ctx),$E
	mov $C,@T[1]
	xor $D,@T[1]
	and @T[1],@T[0]

	movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
	movdqa 0($K_XX_XX),@Tx[1] # K_00_19
	movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
	movdqu 16($inp),@X[-3&7]
	movdqu 32($inp),@X[-2&7]
	movdqu 48($inp),@X[-1&7]
	pshufb @Tx[2],@X[-4&7] # byte swap
	pshufb @Tx[2],@X[-3&7]
	pshufb @Tx[2],@X[-2&7]
	add \$64,$inp
	paddd @Tx[1],@X[-4&7] # add K_00_19
	pshufb @Tx[2],@X[-1&7]
	paddd @Tx[1],@X[-3&7]
	paddd @Tx[1],@X[-2&7]
	movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
	psubd @Tx[1],@X[-4&7] # restore X[]
	movdqa @X[-3&7],16(%rsp)
	psubd @Tx[1],@X[-3&7]
	movdqa @X[-2&7],32(%rsp)
	psubd @Tx[1],@X[-2&7]
	movups -112($key),$rndkey0 # $key[0]
	movups 16-112($key),$rndkey[0] # forward reference
	jmp .Loop_ssse3
___
|
---|
283 |
|
---|
# Emit the next step of the CBC encryption.  The closure is called from the
# SHA1 round snippets (they append '&$aesenc();'), so the AES instructions end
# up scattered among the SHA1 ones.  $r counts the calls: every ten calls
# handle one 16-byte block, $n being the block index and $k the step within
# the block.  The two @rndkey registers alternate: each step loads the key
# for the following step while consuming the one loaded previously.
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      # First step of a block: fetch the plaintext and pre-xor round key 0.
      $code.=<<___;
	movups `16*$n`($in0),$in # load input
	xorps $rndkey0,$in
___
      # For every block but the first, store the previous block's result,
      # which is still held in $iv.
      $code.=<<___ if ($n);
	movups $iv,`16*($n-1)`($out,$in0) # write output
___
      # CBC chaining followed by the first AES round.
      $code.=<<___;
	xorps $in,$iv
	movups `32+16*$k-112`($key),$rndkey[1]
	aesenc $rndkey[0],$iv
___
    } elsif ($k==9) {
      # Last step of a block: the number of remaining rounds depends on
      # $rounds (below 11, equal to 11, or above), so branch to the final
      # round accordingly.  $sn makes the local label unique per block.
      # NOTE(review): presumably these are the AES-128/192/256 cases -- confirm.
      $sn++;
      $code.=<<___;
	cmp \$11,$rounds
	jb .Laesenclast$sn
	movups `32+16*($k+0)-112`($key),$rndkey[1]
	aesenc $rndkey[0],$iv
	movups `32+16*($k+1)-112`($key),$rndkey[0]
	aesenc $rndkey[1],$iv
	je .Laesenclast$sn
	movups `32+16*($k+2)-112`($key),$rndkey[1]
	aesenc $rndkey[0],$iv
	movups `32+16*($k+3)-112`($key),$rndkey[0]
	aesenc $rndkey[1],$iv
.Laesenclast$sn:
	aesenclast $rndkey[0],$iv
	movups 16-112($key),$rndkey[1] # forward reference
___
    } else {
      # Middle steps: one AES round, with the next round key preloaded.
      $code.=<<___;
	movups `32+16*$k-112`($key),$rndkey[1]
	aesenc $rndkey[0],$iv
___
    }
    # Advance the step counter and swap the two round-key registers.
    $r++; unshift(@rndkey,pop(@rndkey));
};
|
---|
326 |
|
---|
# Emit one message-schedule update (a vector of four X[] words) for the
# 16..31 range, interleaved with four scalar SHA1 rounds.
#
# Argument: reference to a round generator (e.g. \&body_00_19).  It is called
# four times; each call returns a list of Perl snippets that are eval'ed one
# at a time between the SIMD instructions emitted here.  The snippets assign
# to the lexicals $a..$e declared below, so those names must stay as they are.
# On exit $Xi is advanced and the @X and @Tx register lists are rotated.
sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns)); # ror
	&pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa (@Tx[0],@X[-1&7]);
	  &paddd (@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns)); # rol
	 eval(shift(@insns));
	&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns)); # ror
	&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns)); # rol
	  &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa (@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns)); # ror
	&movdqa (@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
	&paddd (@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld (@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns)); # rol
	 eval(shift(@insns));
	&movdqa (@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld (@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns)); # ror
	&por (@X[0],@Tx[0]); # "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld (@Tx[1],2);
	&pxor (@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
	 eval(shift(@insns)); # rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
	# On the last call of this kind ($Xi==7) pre-compute the operand that
	# the first Xupdate_ssse3_32_79 call expects in @Tx[0] after rotation.
	&pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; } # remaining instructions [if any]

  $Xi++; push(@X,shift(@X)); # "rotate" X[]
  push(@Tx,shift(@Tx));
}
|
---|
408 |
|
---|
# Emit one message-schedule update (a vector of four X[] words) for the
# 32..79 range, interleaved with four scalar SHA1 rounds.
#
# Argument: reference to a round generator; it is called four times and the
# returned snippets are eval'ed between the SIMD instructions.  Because the
# generators return lists of different lengths, several evals below are
# conditional: they peek at the upcoming snippet text (/_ror/, /_rol/) to keep
# the interleave aligned.  The snippets assign to the lexicals $a..$e.
# On exit $Xi is advanced and the @X and @Tx register lists are rotated.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns)) if ($Xi==8);
	&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns)) if ($Xi==8);
	 eval(shift(@insns)); # body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns)) if (@insns[1] =~ /_ror/);
	 eval(shift(@insns)) if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns)); # rol

	&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	# The round constant changes every fifth call: either copy the current
	# one or load the next from the K_XX_XX table.
	if ($Xi%5) {
	  &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else { # ... or load next one
	  &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	 eval(shift(@insns)); # ror
	  &paddd (@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
	 eval(shift(@insns)); # body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns)); # rol
	 eval(shift(@insns)) if (@insns[0] =~ /_ror/);

	&movdqa (@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
	 eval(shift(@insns)); # ror
	 eval(shift(@insns));
	 eval(shift(@insns)); # body_20_39

	&pslld (@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld (@Tx[0],30);
	 eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns)); # ror

	&por (@X[0],@Tx[0]); # "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns)); # body_20_39
	 eval(shift(@insns)) if (@insns[1] =~ /_rol/);
	 eval(shift(@insns)) if (@insns[0] =~ /_rol/);
	# Prepare the operand for the next call; skipped once $Xi reaches 19.
	  &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns)); # rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns)); # rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; } # remaining instructions

  $Xi++; push(@X,shift(@X)); # "rotate" X[]
  push(@Tx,shift(@Tx));
}
|
---|
480 |
|
---|
# Emit four scalar rounds together with the final X[]+K store of the current
# 64-byte block, then the end-of-input test and the start of loading the next
# block.
#
# Arguments: a round generator reference (called four times) and the label to
# jump to when $inp has reached $len, i.e. no further input remains.
# The code after the conditional jump reloads the byte-swap mask and K_00_19,
# loads the next 64 input bytes, byte-swaps the first vector and advances
# $inp.  $Xi is reset to 0 for the Xloop_ssse3 calls that follow.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd (@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU

	 foreach (@insns) { eval; } # remaining instructions

	&cmp ($inp,$len);
	&je (shift);		# second argument: the "done" label

	unshift(@Tx,pop(@Tx));

	&movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask
	&movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
	&movdqu (@X[-4&7],"0($inp)"); # load input
	&movdqu (@X[-3&7],"16($inp)");
	&movdqu (@X[-2&7],"32($inp)");
	&movdqu (@X[-1&7],"48($inp)");
	&pshufb (@X[-4&7],@Tx[2]); # byte swap
	&add ($inp,64);

  $Xi=0;
}
|
---|
515 |
|
---|
# Emit four scalar rounds while preparing one vector of the next input block:
# byte-swap vector $Xi-3, add K to vector $Xi-4, store the sum at
# 16*$Xi(%rsp) for the integer rounds and subtract K again to restore X[].
#
# Argument: a round generator reference (called four times).  $Xi is
# incremented on exit; @X and @Tx are not rotated here.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb (@X[($Xi-3)&7],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd (@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd (@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}
|
---|
545 |
|
---|
# Emit four scalar rounds with no SIMD work in between.  The round generator
# passed as the only argument is called four times and every snippet it
# returns is evaluated in order.  The snippets assign to $a..$e, which is why
# those lexicals are declared here.
sub Xtail_ssse3()
{ use integer;
  my $round_gen = shift;
  my @snippets;
  push(@snippets, &$round_gen) for (1..4); # 32 instructions
  my ($a,$b,$c,$d,$e);

  for my $snippet (@snippets) { eval $snippet; }
}
|
---|
554 |
|
---|
# Perl snippets that emit one SHA1 round of the 0..19 kind.  Each list element
# is eval'ed separately by the Xupdate/Xloop/Xtail generators; the first one
# binds $a..$e to the current register assignment in @V and the last one
# advances $j and rotates @V and @T for the next round.
my @body_00_19 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror ($b,$j?7:2);', # $b>>>2
	'&xor (@T[0],$d);',
	'&mov (@T[1],$a);', # $b for next round

	'&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor ($b,$c);', # $c^$d for next round

	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&and (@T[1],$b);', # ($b&($c^$d)) for next round

	'&xor ($b,$c);', # restore $b
	'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

# Round generator for rounds 0..19: returns a copy of @body_00_19, with a call
# to $aesenc appended to one of the snippets whenever an AES step is due.
# $rx counts generator calls; once it has reached 19 the call is handed over
# to body_20_39, whose snippets contain the $j==19 special case.
sub body_00_19 () { # ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39() if ($rx==19); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_00_19;

	# $n snippets per round.  $k is the global snippet index at which the
	# next AES step falls when 12 of them are spread over 20 rounds; the
	# call is attached only if that index lies within the current round $jj.
	$n = scalar(@r);
	$k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
	$jj++;

    return @r;
}
|
---|
587 |
|
---|
# Perl snippets that emit one SHA1 round of the 20..39 kind (also used for the
# rounds after 59, see the calls in the main sequence).  The $j tests inside
# the strings are evaluated when the snippet runs, so the same list covers the
# transition rounds ($j==19) and the very last round ($j==79).
my @body_20_39 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor (@T[0],$d) if($j==19);'.
	'&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c)
	'&mov (@T[1],$a);', # $b for next round

	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round

	'&$_ror ($b,7);', # $b>>>2
	'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

# Round generator for rounds 20..39: returns a copy of @body_20_39, with a
# call to $aesenc appended to one of the snippets whenever an AES step is due.
# Once $rx has reached 39 the call is handed over to body_40_59.
sub body_20_39 () { # b^d^c
    # on entry @T[0]=b^d
    return &body_40_59() if ($rx==39); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_20_39;

	# Same placement scheme as in body_00_19, at a rate of 8 per 20 rounds.
	# No AES step is attached on the call that brings $rx to 20, i.e. the
	# one delegated from body_00_19.
	$n = scalar(@r);
	$k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20);
	$jj++;

    return @r;
}
|
---|
618 |
|
---|
# Perl snippets that emit one SHA1 round of the 40..59 kind.  The $j tests
# inside the strings are evaluated when the snippet runs and handle the entry
# into this range (first two conditionals) and the hand-over to the following
# range at $j==59.
my @body_40_59 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d)
	'&xor ($c,$d) if ($j>=40);', # restore $c

	'&$_ror ($b,7);', # $b>>>2
	'&mov (@T[1],$a);', # $b for next round
	'&xor (@T[0],$c);',

	'&$_rol ($a,5);',
	'&add ($e,@T[0]);',
	'&xor (@T[1],$c) if ($j==59);'.
	'&xor (@T[1],$b) if ($j< 59);', # b^c for next round

	'&xor ($b,$c) if ($j< 59);', # c^d for next round
	'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

# Round generator for rounds 40..59: returns a copy of @body_40_59, with a
# call to $aesenc appended to one of the snippets whenever an AES step is due.
# Unlike the two generators above it never delegates.
sub body_40_59 () { # ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)
    $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_40_59;

	# Same placement scheme as in body_00_19 (12 per 20 rounds).  No AES
	# step is attached on the call that brings $rx to 40, i.e. the one
	# delegated from body_20_39.
	$n = scalar(@r);
	$k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40);
	$jj++;

    return @r;
}
|
---|
# Body of aesni_cbc_sha1_enc_ssse3: one pass of the loop handles 64 bytes.
$code.=<<___;
.align 32
.Loop_ssse3:
___
	# 16 schedule updates of four rounds each (rounds 0..63).  The
	# generators switch to the next round kind by themselves when their
	# range is exhausted, which is why the last call of each group already
	# names the following generator.
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done"

				# The remaining rounds are emitted twice: once
				# for the loop path and once after .Ldone_ssse3.
				# Remember the generator state so that the
				# second copy starts from the same point.
				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	# Loop path: last 12 rounds, overlapped with preparing the next block.
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

# Store the fourth output block of this pass, advance the input pointer, add
# the working variables into the SHA1 context, rebuild the value the first
# round expects in @T[0] and go round again.
$code.=<<___;
	movups $iv,48($out,$in0) # write output
	lea 64($in0),$in0

	add 0($ctx),$A # update context
	add 4($ctx),@T[0]
	add 8($ctx),$C
	add 12($ctx),$D
	mov $A,0($ctx)
	add 16($ctx),$E
	mov @T[0],4($ctx)
	mov @T[0],$B # magic seed
	mov $C,8($ctx)
	mov $C,@T[1]
	mov $D,12($ctx)
	xor $D,@T[1]
	mov $E,16($ctx)
	and @T[1],@T[0]
	jmp .Loop_ssse3

.Ldone_ssse3:
___
				# Rewind the generator state to the point saved
				# above for the second copy of the last rounds.
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r; @rndkey=@saved_rndkey;

	# Exit path: last 12 rounds with no further input to prepare.
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

# Store the final output block, update the SHA1 context and write the last
# ciphertext block back through the saved IV pointer.
$code.=<<___;
	movups $iv,48($out,$in0) # write output
	mov 88(%rsp),$ivp # restore $ivp

	add 0($ctx),$A # update context
	add 4($ctx),@T[0]
	add 8($ctx),$C
	mov $A,0($ctx)
	add 12($ctx),$D
	mov @T[0],4($ctx)
	add 16($ctx),$E
	mov $C,8($ctx)
	mov $D,12($ctx)
	mov $E,16($ctx)
	movups $iv,($ivp) # write IV
___
# Win64 only: restore %xmm6-%xmm15 from the frame.
$code.=<<___ if ($win64);
	movaps 96+0(%rsp),%xmm6
	movaps 96+16(%rsp),%xmm7
	movaps 96+32(%rsp),%xmm8
	movaps 96+48(%rsp),%xmm9
	movaps 96+64(%rsp),%xmm10
	movaps 96+80(%rsp),%xmm11
	movaps 96+96(%rsp),%xmm12
	movaps 96+112(%rsp),%xmm13
	movaps 96+128(%rsp),%xmm14
	movaps 96+144(%rsp),%xmm15
___
# Epilogue: point %rsi at the saved integer registers, reload them in the
# reverse order of the pushes and release the frame.
$code.=<<___;
	lea `104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa %rsi,56
	mov 0(%rsi),%r15
.cfi_restore %r15
	mov 8(%rsi),%r14
.cfi_restore %r14
	mov 16(%rsi),%r13
.cfi_restore %r13
	mov 24(%rsi),%r12
.cfi_restore %r12
	mov 32(%rsi),%rbp
.cfi_restore %rbp
	mov 40(%rsi),%rbx
.cfi_restore %rbx
	lea 48(%rsi),%rsp
.cfi_def_cfa %rsp,8
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___
|
---|
761 |
|
---|
762 | if ($stitched_decrypt) {{{
|
---|
763 | # reset
|
---|
764 | ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
|
---|
765 | $j=$jj=$r=$rx=0;
|
---|
766 | $Xi=4;
|
---|
767 |
|
---|
768 | # reassign for Atom Silvermont (see above)
|
---|
769 | ($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
|
---|
770 | @X=map("%xmm$_",(8..13,6,7));
|
---|
771 | @Tx=map("%xmm$_",(14,15,5));
|
---|
772 |
|
---|
773 | my @aes256_dec = (
|
---|
774 | '&movdqu($inout0,"0x00($in0)");',
|
---|
775 | '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);',
|
---|
776 | '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);',
|
---|
777 | '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);',
|
---|
778 |
|
---|
779 | '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");',
|
---|
780 | '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3]
|
---|
781 | undef,undef
|
---|
782 | );
|
---|
783 | for ($i=0;$i<13;$i++) {
|
---|
784 | push (@aes256_dec,(
|
---|
785 | '&aesdec ($inout0,$rndkey0);',
|
---|
786 | '&aesdec ($inout1,$rndkey0);',
|
---|
787 | '&aesdec ($inout2,$rndkey0);',
|
---|
788 | '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
|
---|
789 | ));
|
---|
790 | push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
|
---|
791 | push (@aes256_dec,(undef,undef)) if ($i==5);
|
---|
792 | }
|
---|
793 | push(@aes256_dec,(
|
---|
794 | '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");',
|
---|
795 | '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");',
|
---|
796 | '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");',
|
---|
797 | '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");',
|
---|
798 |
|
---|
799 | '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");',
|
---|
800 | '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);',
|
---|
801 | '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);',
|
---|
802 | '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);',
|
---|
803 |
|
---|
804 | '&movups ("0x30($out,$in0)",$inout3);'
|
---|
805 | ));
|
---|
806 |
|
---|
# Round generator for the stitched-decrypt variant, rounds 0..19.  Returns the
# @body_00_19 snippets, preceded by the AES decrypt snippet scheduled for
# round $rx when @aes256_dec has one.  Once $rx has reached 19 the call is
# handed over to body_20_39_dec.
sub body_00_19_dec () { # ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    if ($rx==19) { return &body_20_39_dec(); }

    my @round = @body_00_19;
    my $aes_step = $aes256_dec[$rx];

    unshift(@round, $aes_step) if ($aes_step);
    $rx++;

    return @round;
}
|
---|
818 |
|
---|
# Round generator for the parity rounds, F=b^d^c, stitched with the AES
# decrypt schedule.  On entry @T[0]=b^d.
#
# Returns the instruction strings for one round: the @body_20_39 template,
# preceded by the @aes256_dec entry for round $rx when that slot is
# populated.  Advances $rx by one.  When $rx is 39 the call is handed over
# to body_40_59_dec instead.
sub body_20_39_dec () {
    if ($rx==39) {
	return &body_40_59_dec();
    }

    my $aes_insn = $aes256_dec[$rx];	# false for an empty slot
    $rx++;

    my @round = $aes_insn ? ($aes_insn,@body_20_39) : @body_20_39;

    return @round;
}
830 |
|
---|
# Round generator for SHA1 rounds 40..59, F=((b^c)&(c^d))^c, stitched with
# the AES decrypt schedule.  On entry @T[0]=(b^c), (c^=d).
#
# Returns the instruction strings for one round: the @body_40_59 template,
# preceded by the @aes256_dec entry for round $rx when that slot is
# populated.  Advances $rx by one.
sub body_40_59_dec () {
    my $aes_insn = $aes256_dec[$rx];	# false for an empty slot
    $rx++;

    my @round = $aes_insn ? ($aes_insn,@body_40_59) : @body_40_59;

    return @round;
}
841 |
|
---|
# Public entry point aesni256_cbc_sha1_dec: a dispatcher that loads the
# first two words of OPENSSL_ia32cap_P into %r10d/%r11d.
$code.=<<___;
.globl	aesni256_cbc_sha1_dec
.type	aesni256_cbc_sha1_dec,\@abi-omnipotent
.align	32
aesni256_cbc_sha1_dec:
.cfi_startproc
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
___
# Only when the AVX flavour is generated: branch to it if both the AVX bit
# and the "Intel CPU" bit are set.
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni256_cbc_sha1_dec_avx
___
# Otherwise jump to the SSSE3 flavour (the ret after the unconditional jmp
# is never reached).  The same heredoc then opens
# aesni256_cbc_sha1_dec_ssse3: it loads the 7th argument from the stack into
# $inp, pushes %rbx, %rbp and %r12-%r15 and reserves 104 bytes of frame,
# plus 10*16 bytes on Win64.
$code.=<<___;
	jmp	aesni256_cbc_sha1_dec_ssse3
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec

.type	aesni256_cbc_sha1_dec_ssse3,\@function,6
.align	32
aesni256_cbc_sha1_dec_ssse3:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
___
885 | $code.=<<___ if ($win64);
|
---|
886 | movaps %xmm6,96+0(%rsp)
|
---|
887 | movaps %xmm7,96+16(%rsp)
|
---|
888 | movaps %xmm8,96+32(%rsp)
|
---|
889 | movaps %xmm9,96+48(%rsp)
|
---|
890 | movaps %xmm10,96+64(%rsp)
|
---|
891 | movaps %xmm11,96+80(%rsp)
|
---|
892 | movaps %xmm12,96+96(%rsp)
|
---|
893 | movaps %xmm13,96+112(%rsp)
|
---|
894 | movaps %xmm14,96+128(%rsp)
|
---|
895 | movaps %xmm15,96+144(%rsp)
|
---|
896 | .Lprologue_dec_ssse3:
|
---|
897 | ___
|
---|
898 | $code.=<<___;
|
---|
899 | mov $in0,%r12 # reassign arguments
|
---|
900 | mov $out,%r13
|
---|
901 | mov $len,%r14
|
---|
902 | lea 112($key),%r15 # size optimization
|
---|
903 | movdqu ($ivp),@X[3] # load IV
|
---|
904 | #mov $ivp,88(%rsp) # save $ivp
|
---|
905 | ___
|
---|
906 | ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
|
---|
907 | $code.=<<___;
|
---|
908 | shl \$6,$len
|
---|
909 | sub $in0,$out
|
---|
910 | add $inp,$len # end of input
|
---|
911 |
|
---|
912 | lea K_XX_XX(%rip),$K_XX_XX
|
---|
913 | mov 0($ctx),$A # load context
|
---|
914 | mov 4($ctx),$B
|
---|
915 | mov 8($ctx),$C
|
---|
916 | mov 12($ctx),$D
|
---|
917 | mov $B,@T[0] # magic seed
|
---|
918 | mov 16($ctx),$E
|
---|
919 | mov $C,@T[1]
|
---|
920 | xor $D,@T[1]
|
---|
921 | and @T[1],@T[0]
|
---|
922 |
|
---|
923 | movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
|
---|
924 | movdqa 0($K_XX_XX),@Tx[1] # K_00_19
|
---|
925 | movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
|
---|
926 | movdqu 16($inp),@X[-3&7]
|
---|
927 | movdqu 32($inp),@X[-2&7]
|
---|
928 | movdqu 48($inp),@X[-1&7]
|
---|
929 | pshufb @Tx[2],@X[-4&7] # byte swap
|
---|
930 | add \$64,$inp
|
---|
931 | pshufb @Tx[2],@X[-3&7]
|
---|
932 | pshufb @Tx[2],@X[-2&7]
|
---|
933 | pshufb @Tx[2],@X[-1&7]
|
---|
934 | paddd @Tx[1],@X[-4&7] # add K_00_19
|
---|
935 | paddd @Tx[1],@X[-3&7]
|
---|
936 | paddd @Tx[1],@X[-2&7]
|
---|
937 | movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
|
---|
938 | psubd @Tx[1],@X[-4&7] # restore X[]
|
---|
939 | movdqa @X[-3&7],16(%rsp)
|
---|
940 | psubd @Tx[1],@X[-3&7]
|
---|
941 | movdqa @X[-2&7],32(%rsp)
|
---|
942 | psubd @Tx[1],@X[-2&7]
|
---|
943 | movdqu -112($key),$rndkey0 # $key[0]
|
---|
944 | jmp .Loop_dec_ssse3
|
---|
945 |
|
---|
946 | .align 32
|
---|
947 | .Loop_dec_ssse3:
|
---|
948 | ___
|
---|
# Body of .Loop_dec_ssse3: 16 message-schedule updates of four SHA1 rounds
# each, then the final update which may branch to .Ldone_dec_ssse3.
for my $pass (1..4) {
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
}
&Xupdate_ssse3_32_79(\&body_00_19_dec);
for my $pass (1..5) {
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
}
for my $pass (1..5) {
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
}
&Xupdate_ssse3_32_79(\&body_20_39_dec);
&Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3");	# can jump to "done"

# snapshot the generator state; the "done" path is generated from the same
# point further down
@saved_V=@V;
$saved_j=$j;
$saved_rx=$rx;

for my $pass (1..3) {
	&Xloop_ssse3(\&body_20_39_dec);
}

eval($aes256_dec[-1]);			# last store
975 | $code.=<<___;
|
---|
976 | lea 64($in0),$in0
|
---|
977 |
|
---|
978 | add 0($ctx),$A # update context
|
---|
979 | add 4($ctx),@T[0]
|
---|
980 | add 8($ctx),$C
|
---|
981 | add 12($ctx),$D
|
---|
982 | mov $A,0($ctx)
|
---|
983 | add 16($ctx),$E
|
---|
984 | mov @T[0],4($ctx)
|
---|
985 | mov @T[0],$B # magic seed
|
---|
986 | mov $C,8($ctx)
|
---|
987 | mov $C,@T[1]
|
---|
988 | mov $D,12($ctx)
|
---|
989 | xor $D,@T[1]
|
---|
990 | mov $E,16($ctx)
|
---|
991 | and @T[1],@T[0]
|
---|
992 | jmp .Loop_dec_ssse3
|
---|
993 |
|
---|
994 | .Ldone_dec_ssse3:
|
---|
995 | ___
|
---|
# .Ldone_dec_ssse3 path: rewind the generator state to the snapshot taken
# before the Xloop sequence and emit the same rounds through Xtail instead.
@V=@saved_V;
$rx=$saved_rx;
$j=$saved_j;
$jj=$j;

for my $pass (1..3) {
	&Xtail_ssse3(\&body_20_39_dec);
}

eval($aes256_dec[-1]);			# last store
1004 | $code.=<<___;
|
---|
1005 | add 0($ctx),$A # update context
|
---|
1006 | add 4($ctx),@T[0]
|
---|
1007 | add 8($ctx),$C
|
---|
1008 | mov $A,0($ctx)
|
---|
1009 | add 12($ctx),$D
|
---|
1010 | mov @T[0],4($ctx)
|
---|
1011 | add 16($ctx),$E
|
---|
1012 | mov $C,8($ctx)
|
---|
1013 | mov $D,12($ctx)
|
---|
1014 | mov $E,16($ctx)
|
---|
1015 | movups @X[3],($ivp) # write IV
|
---|
1016 | ___
|
---|
1017 | $code.=<<___ if ($win64);
|
---|
1018 | movaps 96+0(%rsp),%xmm6
|
---|
1019 | movaps 96+16(%rsp),%xmm7
|
---|
1020 | movaps 96+32(%rsp),%xmm8
|
---|
1021 | movaps 96+48(%rsp),%xmm9
|
---|
1022 | movaps 96+64(%rsp),%xmm10
|
---|
1023 | movaps 96+80(%rsp),%xmm11
|
---|
1024 | movaps 96+96(%rsp),%xmm12
|
---|
1025 | movaps 96+112(%rsp),%xmm13
|
---|
1026 | movaps 96+128(%rsp),%xmm14
|
---|
1027 | movaps 96+144(%rsp),%xmm15
|
---|
1028 | ___
|
---|
# Epilogue of aesni256_cbc_sha1_dec_ssse3: point %rsi at the saved
# general-purpose registers above the local frame, restore %r15..%rbx from
# it, release the frame and return.
#
# The CFA is re-based with .cfi_def_cfa, the same directive the AVX
# epilogues use; ".cfi_cfa_def" is not a CFI directive.
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_dec_ssse3:
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
1051 | }}}
|
---|
1052 | $j=$jj=$r=$rx=0;
|
---|
1053 |
|
---|
1054 | if ($avx) {
|
---|
1055 | my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
|
---|
1056 |
|
---|
1057 | my $Xi=4;
|
---|
1058 | my @X=map("%xmm$_",(4..7,0..3));
|
---|
1059 | my @Tx=map("%xmm$_",(8..10));
|
---|
1060 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
|
---|
1061 | my @T=("%esi","%edi");
|
---|
1062 | my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
|
---|
1063 | my @rndkey=("%xmm14","%xmm15");
|
---|
1064 | my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
|
---|
1065 | my $Kx=@Tx[2];
|
---|
1066 |
|
---|
1067 | my $_rol=sub { &shld(@_[0],@_) };
|
---|
1068 | my $_ror=sub { &shrd(@_[0],@_) };
|
---|
1069 |
|
---|
1070 | $code.=<<___;
|
---|
1071 | .type aesni_cbc_sha1_enc_avx,\@function,6
|
---|
1072 | .align 32
|
---|
1073 | aesni_cbc_sha1_enc_avx:
|
---|
1074 | .cfi_startproc
|
---|
1075 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument
|
---|
1076 | #shr \$6,$len # debugging artefact
|
---|
1077 | #jz .Lepilogue_avx # debugging artefact
|
---|
1078 | push %rbx
|
---|
1079 | .cfi_push %rbx
|
---|
1080 | push %rbp
|
---|
1081 | .cfi_push %rbp
|
---|
1082 | push %r12
|
---|
1083 | .cfi_push %r12
|
---|
1084 | push %r13
|
---|
1085 | .cfi_push %r13
|
---|
1086 | push %r14
|
---|
1087 | .cfi_push %r14
|
---|
1088 | push %r15
|
---|
1089 | .cfi_push %r15
|
---|
1090 | lea `-104-($win64?10*16:0)`(%rsp),%rsp
|
---|
1091 | .cfi_adjust_cfa_offset `104+($win64?10*16:0)`
|
---|
1092 | #mov $in0,$inp # debugging artefact
|
---|
1093 | #lea 64(%rsp),$ctx # debugging artefact
|
---|
1094 | ___
|
---|
1095 | $code.=<<___ if ($win64);
|
---|
1096 | movaps %xmm6,96+0(%rsp)
|
---|
1097 | movaps %xmm7,96+16(%rsp)
|
---|
1098 | movaps %xmm8,96+32(%rsp)
|
---|
1099 | movaps %xmm9,96+48(%rsp)
|
---|
1100 | movaps %xmm10,96+64(%rsp)
|
---|
1101 | movaps %xmm11,96+80(%rsp)
|
---|
1102 | movaps %xmm12,96+96(%rsp)
|
---|
1103 | movaps %xmm13,96+112(%rsp)
|
---|
1104 | movaps %xmm14,96+128(%rsp)
|
---|
1105 | movaps %xmm15,96+144(%rsp)
|
---|
1106 | .Lprologue_avx:
|
---|
1107 | ___
|
---|
1108 | $code.=<<___;
|
---|
1109 | vzeroall
|
---|
1110 | mov $in0,%r12 # reassign arguments
|
---|
1111 | mov $out,%r13
|
---|
1112 | mov $len,%r14
|
---|
1113 | lea 112($key),%r15 # size optimization
|
---|
1114 | vmovdqu ($ivp),$iv # load IV
|
---|
1115 | mov $ivp,88(%rsp) # save $ivp
|
---|
1116 | ___
|
---|
1117 | ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
|
---|
1118 | my $rounds="${ivp}d";
|
---|
1119 | $code.=<<___;
|
---|
1120 | shl \$6,$len
|
---|
1121 | sub $in0,$out
|
---|
1122 | mov 240-112($key),$rounds
|
---|
1123 | add $inp,$len # end of input
|
---|
1124 |
|
---|
1125 | lea K_XX_XX(%rip),$K_XX_XX
|
---|
1126 | mov 0($ctx),$A # load context
|
---|
1127 | mov 4($ctx),$B
|
---|
1128 | mov 8($ctx),$C
|
---|
1129 | mov 12($ctx),$D
|
---|
1130 | mov $B,@T[0] # magic seed
|
---|
1131 | mov 16($ctx),$E
|
---|
1132 | mov $C,@T[1]
|
---|
1133 | xor $D,@T[1]
|
---|
1134 | and @T[1],@T[0]
|
---|
1135 |
|
---|
1136 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
|
---|
1137 | vmovdqa 0($K_XX_XX),$Kx # K_00_19
|
---|
1138 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
|
---|
1139 | vmovdqu 16($inp),@X[-3&7]
|
---|
1140 | vmovdqu 32($inp),@X[-2&7]
|
---|
1141 | vmovdqu 48($inp),@X[-1&7]
|
---|
1142 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
|
---|
1143 | add \$64,$inp
|
---|
1144 | vpshufb @X[2],@X[-3&7],@X[-3&7]
|
---|
1145 | vpshufb @X[2],@X[-2&7],@X[-2&7]
|
---|
1146 | vpshufb @X[2],@X[-1&7],@X[-1&7]
|
---|
1147 | vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
|
---|
1148 | vpaddd $Kx,@X[-3&7],@X[1]
|
---|
1149 | vpaddd $Kx,@X[-2&7],@X[2]
|
---|
1150 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
|
---|
1151 | vmovdqa @X[1],16(%rsp)
|
---|
1152 | vmovdqa @X[2],32(%rsp)
|
---|
1153 | vmovups -112($key),$rndkey[1] # $key[0]
|
---|
1154 | vmovups 16-112($key),$rndkey[0] # forward reference
|
---|
1155 | jmp .Loop_avx
|
---|
1156 | ___
|
---|
1157 |
|
---|
# Emit one step of the AVX CBC-encrypt pipeline into $code.  The global
# counter $r selects the step: $n=$r/10 is the 16-byte block index within
# the current chunk and $k=$r%10 the step within that block.
#   $k==0     load input block $n, xor it with the key held in $rndkey[1],
#             store the previous block's result (when $n is non-zero),
#             fold the input into $iv and run the first vaesenc;
#   $k==9     compare $rounds with 11 and run zero, two or four more
#             vaesenc rounds before vaesenclast, then reload the keys at
#             -112($key) and 16-112($key);
#   otherwise one vaesenc plus a load of the following round key.
# Each call increments $r and swaps the two @rndkey registers.
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	vmovdqu		`16*$n`($in0),$in		# load input
	vpxor		$rndkey[1],$in,$in
___
      $code.=<<___ if ($n);
	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	vpxor		$in,$iv,$iv
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;		# makes the .Lvaesenclast label unique per call
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
	je		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
	vaesenclast	$rndkey[0],$iv,$iv
	vmovups		-112($key),$rndkey[0]
	vmovups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));	# next step; swap key registers
};
1201 |
|
---|
# Emit the AVX message-schedule update for one X[] vector, interleaved with
# four rounds' worth of scalar instructions.
#
# The single argument is a round-generator code reference.  It is called
# four times and the returned strings are string-eval'ed one or more at a
# time between the vector instructions; whatever is left over is flushed at
# the end.  The X[]+K value is stored to the stack slot selected by
# ($Xi-1)&3, $Kx is reloaded from the constant table when $Xi is a multiple
# of 5, and finally $Xi is advanced and @X rotated by one position.
#
# NOTE(review): $a..$e are not used directly here; presumably the eval'ed
# round strings assign to them, so they must stay in scope - confirm
# against the @body_* templates.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[1],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpsrld	(@Tx[0],@Tx[1],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[1],@Tx[1],2);
	&vpxor	(@X[0],@X[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
1276 |
|
---|
# Emit the AVX message-schedule update used after the first four updates
# (see the drivers), interleaved with four rounds' worth of scalar
# instructions.
#
# The single argument is a round-generator code reference.  It is called
# four times and the returned strings are string-eval'ed between the vector
# instructions; leftovers are flushed at the end.  The X[]+K value is stored
# to the stack slot selected by ($Xi-1)&3, $Kx is reloaded from the constant
# table when $Xi is a multiple of 5, and finally $Xi is advanced and @X
# rotated by one position.
#
# NOTE(review): $a..$e are not used directly here; presumably the eval'ed
# round strings assign to them, so they must stay in scope - confirm
# against the @body_* templates.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 # consume one more string only if it is not a rotate
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(16*($Xi/5))."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
1335 |
|
---|
# Emit the last X[]+K transfer of a block together with four rounds' worth
# of scalar instructions, then the end-of-input test and the start of the
# next block's input load.
#
# Arguments: a round-generator code reference and the label to jump to when
# $inp equals $len (end of input).  After the conditional jump the byte-swap
# mask and K_00_19 are reloaded, 64 bytes are loaded from $inp into four
# X[] registers, the first of them is byte-swapped, $inp is advanced by 64
# and $Xi is reset to 0.
#
# NOTE(review): $a..$e are not used directly here; presumably the eval'ed
# round strings assign to them - confirm against the @body_* templates.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(shift);			# second argument: the "done" label

	&vmovdqa(@Tx[1],"64($K_XX_XX)");	# pbswap mask
	&vmovdqa($Kx,"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@Tx[1]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
1367 |
|
---|
# Emit four rounds' worth of scalar instructions while preparing the next
# block's input: byte-swap X[($Xi-3)&7], add $Kx to X[($Xi-4)&7] and store
# the sum to the stack slot at 16*$Xi(%rsp).  Advances $Xi by one.
#
# The single argument is a round-generator code reference; it is called
# four times and the returned strings are string-eval'ed in order.
#
# NOTE(review): $a..$e are not used directly here; presumably the eval'ed
# round strings assign to them - confirm against the @body_* templates.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@Tx[0],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }		# remaining instructions
  $Xi++;
}
1391 |
|
---|
# Emit four rounds' worth of scalar instructions with no vector work in
# between; used on the "done" path where there is no further input to
# prepare.  The single argument is a round-generator code reference.
#
# NOTE(review): $a..$e are declared for the benefit of the eval'ed round
# strings, which presumably assign to them - confirm against the @body_*
# templates.
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my ($a,$b,$c,$d,$e);
  my @insns;

	# collect the strings for four consecutive rounds ...
	for my $round (1..4) {
	    push(@insns,&$body);
	}

	# ... and emit them back to back
	eval($_) foreach (@insns);
}
1400 |
|
---|
1401 | $code.=<<___;
|
---|
1402 | .align 32
|
---|
1403 | .Loop_avx:
|
---|
1404 | ___
|
---|
# Body of .Loop_avx: 16 message-schedule updates of four SHA1 rounds each,
# then the final update which may branch to .Ldone_avx.
for my $pass (1..4) {
	&Xupdate_avx_16_31(\&body_00_19);
}
&Xupdate_avx_32_79(\&body_00_19);
for my $pass (1..5) {
	&Xupdate_avx_32_79(\&body_20_39);
}
for my $pass (1..5) {
	&Xupdate_avx_32_79(\&body_40_59);
}
&Xupdate_avx_32_79(\&body_20_39);
&Xuplast_avx_80(\&body_20_39,".Ldone_avx");	# can jump to "done"

# snapshot the generator state; the "done" path is generated from the same
# point further down
@saved_V=@V;
$saved_j=$j;
@saved_rndkey=@rndkey;
$saved_r=$r;

for my $pass (1..3) {
	&Xloop_avx(\&body_20_39);
}
1429 |
|
---|
1430 | $code.=<<___;
|
---|
1431 | vmovups $iv,48($out,$in0) # write output
|
---|
1432 | lea 64($in0),$in0
|
---|
1433 |
|
---|
1434 | add 0($ctx),$A # update context
|
---|
1435 | add 4($ctx),@T[0]
|
---|
1436 | add 8($ctx),$C
|
---|
1437 | add 12($ctx),$D
|
---|
1438 | mov $A,0($ctx)
|
---|
1439 | add 16($ctx),$E
|
---|
1440 | mov @T[0],4($ctx)
|
---|
1441 | mov @T[0],$B # magic seed
|
---|
1442 | mov $C,8($ctx)
|
---|
1443 | mov $C,@T[1]
|
---|
1444 | mov $D,12($ctx)
|
---|
1445 | xor $D,@T[1]
|
---|
1446 | mov $E,16($ctx)
|
---|
1447 | and @T[1],@T[0]
|
---|
1448 | jmp .Loop_avx
|
---|
1449 |
|
---|
1450 | .Ldone_avx:
|
---|
1451 | ___
|
---|
# .Ldone_avx path: rewind the generator state to the snapshot taken before
# the Xloop sequence and emit the same rounds through Xtail instead.
@V=@saved_V;
@rndkey=@saved_rndkey;
$r=$saved_r;
$j=$saved_j;
$jj=$j;

for my $pass (1..3) {
	&Xtail_avx(\&body_20_39);
}
1458 |
|
---|
1459 | $code.=<<___;
|
---|
1460 | vmovups $iv,48($out,$in0) # write output
|
---|
1461 | mov 88(%rsp),$ivp # restore $ivp
|
---|
1462 |
|
---|
1463 | add 0($ctx),$A # update context
|
---|
1464 | add 4($ctx),@T[0]
|
---|
1465 | add 8($ctx),$C
|
---|
1466 | mov $A,0($ctx)
|
---|
1467 | add 12($ctx),$D
|
---|
1468 | mov @T[0],4($ctx)
|
---|
1469 | add 16($ctx),$E
|
---|
1470 | mov $C,8($ctx)
|
---|
1471 | mov $D,12($ctx)
|
---|
1472 | mov $E,16($ctx)
|
---|
1473 | vmovups $iv,($ivp) # write IV
|
---|
1474 | vzeroall
|
---|
1475 | ___
|
---|
1476 | $code.=<<___ if ($win64);
|
---|
1477 | movaps 96+0(%rsp),%xmm6
|
---|
1478 | movaps 96+16(%rsp),%xmm7
|
---|
1479 | movaps 96+32(%rsp),%xmm8
|
---|
1480 | movaps 96+48(%rsp),%xmm9
|
---|
1481 | movaps 96+64(%rsp),%xmm10
|
---|
1482 | movaps 96+80(%rsp),%xmm11
|
---|
1483 | movaps 96+96(%rsp),%xmm12
|
---|
1484 | movaps 96+112(%rsp),%xmm13
|
---|
1485 | movaps 96+128(%rsp),%xmm14
|
---|
1486 | movaps 96+144(%rsp),%xmm15
|
---|
1487 | ___
|
---|
1488 | $code.=<<___;
|
---|
1489 | lea `104+($win64?10*16:0)`(%rsp),%rsi
|
---|
1490 | .cfi_def_cfa %rsi,56
|
---|
1491 | mov 0(%rsi),%r15
|
---|
1492 | .cfi_restore %r15
|
---|
1493 | mov 8(%rsi),%r14
|
---|
1494 | .cfi_restore %r14
|
---|
1495 | mov 16(%rsi),%r13
|
---|
1496 | .cfi_restore %r13
|
---|
1497 | mov 24(%rsi),%r12
|
---|
1498 | .cfi_restore %r12
|
---|
1499 | mov 32(%rsi),%rbp
|
---|
1500 | .cfi_restore %rbp
|
---|
1501 | mov 40(%rsi),%rbx
|
---|
1502 | .cfi_restore %rbx
|
---|
1503 | lea 48(%rsi),%rsp
|
---|
1504 | .cfi_def_cfa %rsp,8
|
---|
1505 | .Lepilogue_avx:
|
---|
1506 | ret
|
---|
1507 | .cfi_endproc
|
---|
1508 | .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
|
---|
1509 | ___
|
---|
1510 |
|
---|
1511 | if ($stitched_decrypt) {{{
|
---|
# reset argument registers and generator state for the AVX decrypt flavour
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$j=$jj=$r=$rx=0;
$Xi=4;

# AES-256-CBC decrypt schedule in its three-operand AVX form.  Same layout
# as the SSSE3 one: one slot per SHA1 round, undef where nothing is
# stitched in, plus a trailing "last store" entry.
@aes256_dec = (
	# load four ciphertext blocks xor-ed with round key 0
	'&vpxor ($inout0,$rndkey0,"0x00($in0)");',
	'&vpxor ($inout1,$rndkey0,"0x10($in0)");',
	'&vpxor ($inout2,$rndkey0,"0x20($in0)");',
	'&vpxor ($inout3,$rndkey0,"0x30($in0)");',

	'&vmovups($rndkey0,"16-112($key)");',
	'&vmovups("64(%rsp)",@X[2]);',		# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    my $next_key = 16*($i+2)-112;	# displacement of the following round key
    my $padding  = 0;			# number of empty slots after this round
    $padding += 2	if (($i>=3 && $i<=5) || $i>=11);
    $padding += 2	if ($i==5);

    push (@aes256_dec,
	'&vaesdec ($inout0,$inout0,$rndkey0);',
	'&vaesdec ($inout1,$inout1,$rndkey0);',
	'&vaesdec ($inout2,$inout2,$rndkey0);',
	'&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.$next_key.'($key)");',
	(undef) x $padding
	);
}
push(@aes256_dec,
	# final round; reload the ciphertext, it is the CBC chaining input
	'&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");',
	'&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");',
	'&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");',
	'&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");',

	# CBC xor with the saved IV / previous ciphertext, store plaintext
	'&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");',
	'&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);',
	'&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);',
	'&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);',

	# emitted by the drivers as @aes256_dec[-1]
	'&vmovups ("0x30($out,$in0)",$inout3);'
	);
1551 |
|
---|
1552 | $code.=<<___;
|
---|
1553 | .type aesni256_cbc_sha1_dec_avx,\@function,6
|
---|
1554 | .align 32
|
---|
1555 | aesni256_cbc_sha1_dec_avx:
|
---|
1556 | .cfi_startproc
|
---|
1557 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument
|
---|
1558 | push %rbx
|
---|
1559 | .cfi_push %rbx
|
---|
1560 | push %rbp
|
---|
1561 | .cfi_push %rbp
|
---|
1562 | push %r12
|
---|
1563 | .cfi_push %r12
|
---|
1564 | push %r13
|
---|
1565 | .cfi_push %r13
|
---|
1566 | push %r14
|
---|
1567 | .cfi_push %r14
|
---|
1568 | push %r15
|
---|
1569 | .cfi_push %r15
|
---|
1570 | lea `-104-($win64?10*16:0)`(%rsp),%rsp
|
---|
1571 | .cfi_adjust_cfa_offset `104+($win64?10*16:0)`
|
---|
1572 | ___
|
---|
1573 | $code.=<<___ if ($win64);
|
---|
1574 | movaps %xmm6,96+0(%rsp)
|
---|
1575 | movaps %xmm7,96+16(%rsp)
|
---|
1576 | movaps %xmm8,96+32(%rsp)
|
---|
1577 | movaps %xmm9,96+48(%rsp)
|
---|
1578 | movaps %xmm10,96+64(%rsp)
|
---|
1579 | movaps %xmm11,96+80(%rsp)
|
---|
1580 | movaps %xmm12,96+96(%rsp)
|
---|
1581 | movaps %xmm13,96+112(%rsp)
|
---|
1582 | movaps %xmm14,96+128(%rsp)
|
---|
1583 | movaps %xmm15,96+144(%rsp)
|
---|
1584 | .Lprologue_dec_avx:
|
---|
1585 | ___
|
---|
1586 | $code.=<<___;
|
---|
1587 | vzeroall
|
---|
1588 | mov $in0,%r12 # reassign arguments
|
---|
1589 | mov $out,%r13
|
---|
1590 | mov $len,%r14
|
---|
1591 | lea 112($key),%r15 # size optimization
|
---|
1592 | vmovdqu ($ivp),@X[3] # load IV
|
---|
1593 | ___
|
---|
1594 | ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
|
---|
1595 | $code.=<<___;
|
---|
1596 | shl \$6,$len
|
---|
1597 | sub $in0,$out
|
---|
1598 | add $inp,$len # end of input
|
---|
1599 |
|
---|
1600 | lea K_XX_XX(%rip),$K_XX_XX
|
---|
1601 | mov 0($ctx),$A # load context
|
---|
1602 | mov 4($ctx),$B
|
---|
1603 | mov 8($ctx),$C
|
---|
1604 | mov 12($ctx),$D
|
---|
1605 | mov $B,@T[0] # magic seed
|
---|
1606 | mov 16($ctx),$E
|
---|
1607 | mov $C,@T[1]
|
---|
1608 | xor $D,@T[1]
|
---|
1609 | and @T[1],@T[0]
|
---|
1610 |
|
---|
1611 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
|
---|
1612 | vmovdqa 0($K_XX_XX),$Kx # K_00_19
|
---|
1613 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
|
---|
1614 | vmovdqu 16($inp),@X[-3&7]
|
---|
1615 | vmovdqu 32($inp),@X[-2&7]
|
---|
1616 | vmovdqu 48($inp),@X[-1&7]
|
---|
1617 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
|
---|
1618 | add \$64,$inp
|
---|
1619 | vpshufb @X[2],@X[-3&7],@X[-3&7]
|
---|
1620 | vpshufb @X[2],@X[-2&7],@X[-2&7]
|
---|
1621 | vpshufb @X[2],@X[-1&7],@X[-1&7]
|
---|
1622 | vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
|
---|
1623 | vpaddd $Kx,@X[-3&7],@X[1]
|
---|
1624 | vpaddd $Kx,@X[-2&7],@X[2]
|
---|
1625 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
|
---|
1626 | vmovdqa @X[1],16(%rsp)
|
---|
1627 | vmovdqa @X[2],32(%rsp)
|
---|
1628 | vmovups -112($key),$rndkey0 # $key[0]
|
---|
1629 | jmp .Loop_dec_avx
|
---|
1630 |
|
---|
1631 | .align 32
|
---|
1632 | .Loop_dec_avx:
|
---|
1633 | ___
|
---|
# Body of .Loop_dec_avx: 16 message-schedule updates of four SHA1 rounds
# each, then the final update which may branch to .Ldone_dec_avx.
for my $pass (1..4) {
	&Xupdate_avx_16_31(\&body_00_19_dec);
}
&Xupdate_avx_32_79(\&body_00_19_dec);
for my $pass (1..5) {
	&Xupdate_avx_32_79(\&body_20_39_dec);
}
for my $pass (1..5) {
	&Xupdate_avx_32_79(\&body_40_59_dec);
}
&Xupdate_avx_32_79(\&body_20_39_dec);
&Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx");	# can jump to "done"

# snapshot the generator state; the "done" path is generated from the same
# point further down
@saved_V=@V;
$saved_j=$j;
$saved_rx=$rx;

for my $pass (1..3) {
	&Xloop_avx(\&body_20_39_dec);
}

eval($aes256_dec[-1]);			# last store
1660 | $code.=<<___;
|
---|
1661 | lea 64($in0),$in0
|
---|
1662 |
|
---|
1663 | add 0($ctx),$A # update context
|
---|
1664 | add 4($ctx),@T[0]
|
---|
1665 | add 8($ctx),$C
|
---|
1666 | add 12($ctx),$D
|
---|
1667 | mov $A,0($ctx)
|
---|
1668 | add 16($ctx),$E
|
---|
1669 | mov @T[0],4($ctx)
|
---|
1670 | mov @T[0],$B # magic seed
|
---|
1671 | mov $C,8($ctx)
|
---|
1672 | mov $C,@T[1]
|
---|
1673 | mov $D,12($ctx)
|
---|
1674 | xor $D,@T[1]
|
---|
1675 | mov $E,16($ctx)
|
---|
1676 | and @T[1],@T[0]
|
---|
1677 | jmp .Loop_dec_avx
|
---|
1678 |
|
---|
1679 | .Ldone_dec_avx:
|
---|
1680 | ___
|
---|
1681 | $jj=$j=$saved_j; @V=@saved_V;
|
---|
1682 | $rx=$saved_rx;
|
---|
1683 |
|
---|
1684 | &Xtail_avx(\&body_20_39_dec);
|
---|
1685 | &Xtail_avx(\&body_20_39_dec);
|
---|
1686 | &Xtail_avx(\&body_20_39_dec);
|
---|
1687 |
|
---|
1688 | eval(@aes256_dec[-1]); # last store
|
---|
1689 | $code.=<<___;
|
---|
1690 |
|
---|
1691 | add 0($ctx),$A # update context
|
---|
1692 | add 4($ctx),@T[0]
|
---|
1693 | add 8($ctx),$C
|
---|
1694 | mov $A,0($ctx)
|
---|
1695 | add 12($ctx),$D
|
---|
1696 | mov @T[0],4($ctx)
|
---|
1697 | add 16($ctx),$E
|
---|
1698 | mov $C,8($ctx)
|
---|
1699 | mov $D,12($ctx)
|
---|
1700 | mov $E,16($ctx)
|
---|
1701 | vmovups @X[3],($ivp) # write IV
|
---|
1702 | vzeroall
|
---|
1703 | ___
|
---|
1704 | $code.=<<___ if ($win64);
|
---|
1705 | movaps 96+0(%rsp),%xmm6
|
---|
1706 | movaps 96+16(%rsp),%xmm7
|
---|
1707 | movaps 96+32(%rsp),%xmm8
|
---|
1708 | movaps 96+48(%rsp),%xmm9
|
---|
1709 | movaps 96+64(%rsp),%xmm10
|
---|
1710 | movaps 96+80(%rsp),%xmm11
|
---|
1711 | movaps 96+96(%rsp),%xmm12
|
---|
1712 | movaps 96+112(%rsp),%xmm13
|
---|
1713 | movaps 96+128(%rsp),%xmm14
|
---|
1714 | movaps 96+144(%rsp),%xmm15
|
---|
1715 | ___
|
---|
1716 | $code.=<<___;
|
---|
1717 | lea `104+($win64?10*16:0)`(%rsp),%rsi
|
---|
1718 | .cfi_def_cfa %rsi,56
|
---|
1719 | mov 0(%rsi),%r15
|
---|
1720 | .cfi_restore %r15
|
---|
1721 | mov 8(%rsi),%r14
|
---|
1722 | .cfi_restore %r14
|
---|
1723 | mov 16(%rsi),%r13
|
---|
1724 | .cfi_restore %r13
|
---|
1725 | mov 24(%rsi),%r12
|
---|
1726 | .cfi_restore %r12
|
---|
1727 | mov 32(%rsi),%rbp
|
---|
1728 | .cfi_restore %rbp
|
---|
1729 | mov 40(%rsi),%rbx
|
---|
1730 | .cfi_restore %rbx
|
---|
1731 | lea 48(%rsi),%rsp
|
---|
1732 | .cfi_def_cfa %rsp,8
|
---|
1733 | .Lepilogue_dec_avx:
|
---|
1734 | ret
|
---|
1735 | .cfi_endproc
|
---|
1736 | .size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
|
---|
1737 | ___
|
---|
1738 | }}}
|
---|
1739 | }
|
---|
# Constant pool appended to the generated code under the label K_XX_XX.
# It holds the four SHA-1 round constants, each repeated in four 32-bit
# lanes (one 16-byte row per constant), followed by two 16-byte shuffle
# masks and the module identification string. The rows are 16 bytes each,
# so the .long mask sits at offset 0x40 and the .byte mask at offset 0x50;
# the latter is what the SHA-extension path loads from K_XX_XX+0x50 as its
# "byte-n-word swap" mask.
1740 | $code.=<<___;
|
---|
1741 | .align 64
|
---|
1742 | K_XX_XX:
|
---|
1743 | .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
|
---|
1744 | .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
|
---|
1745 | .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
|
---|
1746 | .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
|
---|
1747 | .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
|
---|
1748 | .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
|
---|
1749 |
|
---|
1750 | .asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
1751 | .align 64
|
---|
1752 | ___
|
---|
# SHA-extension code path. Generates aesni_cbc_sha1_enc_shaext, in which the
# SHA-1 rounds are carried out by sha1rnds4/sha1nexte/sha1msg1/sha1msg2 and
# calls to &$aesenc() are interleaved between small groups of them.
# NOTE(review): $aesenc is defined outside this block; judging by the
# "remaining aesenc's" remark near the end it emits the next step of the
# AES-CBC encrypt sequence and advances $r -- confirm against its definition.
1753 | if ($shaext) {{{
|
---|
# Argument registers. $inp is not a register argument: the prologue below
# loads it from the stack ("load 7th argument").
1754 | ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
|
---|
1755 |
|
---|
1756 | $rounds="%r11d";
|
---|
1757 |
|
---|
# These are package variables rather than lexicals.
# NOTE(review): presumably because &$aesenc reads them -- confirm.
1758 | ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
|
---|
1759 | @rndkey=("%xmm0","%xmm1");
|
---|
# Step counter consulted by the "while($r<40)" drain loop further down.
1760 | $r=0;
|
---|
1761 |
|
---|
# XMM register names for the byte-swap mask, the SHA-1 state (ABCD and two
# alternating E registers) and the per-iteration saved copies of that state.
1762 | my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
|
---|
# Four message registers; the array is rotated as the rounds progress.
1763 | my @MSG=map("%xmm$_",(3..6));
|
---|
1764 |
|
---|
1765 | $code.=<<___;
|
---|
1766 | .type aesni_cbc_sha1_enc_shaext,\@function,6
|
---|
1767 | .align 32
|
---|
1768 | aesni_cbc_sha1_enc_shaext:
|
---|
1769 | .cfi_startproc
|
---|
1770 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument
|
---|
1771 | ___
|
---|
# Win64 only: reserve 8+10*16 bytes of stack and save %xmm6-%xmm15 there.
# The stores are addressed relative to %rax.
# NOTE(review): %rax is not set anywhere in this block -- presumably the
# function prologue produced by the translator leaves the entry %rsp in it;
# confirm.
1772 | $code.=<<___ if ($win64);
|
---|
1773 | lea `-8-10*16`(%rsp),%rsp
|
---|
1774 | movaps %xmm6,-8-10*16(%rax)
|
---|
1775 | movaps %xmm7,-8-9*16(%rax)
|
---|
1776 | movaps %xmm8,-8-8*16(%rax)
|
---|
1777 | movaps %xmm9,-8-7*16(%rax)
|
---|
1778 | movaps %xmm10,-8-6*16(%rax)
|
---|
1779 | movaps %xmm11,-8-5*16(%rax)
|
---|
1780 | movaps %xmm12,-8-4*16(%rax)
|
---|
1781 | movaps %xmm13,-8-3*16(%rax)
|
---|
1782 | movaps %xmm14,-8-2*16(%rax)
|
---|
1783 | movaps %xmm15,-8-1*16(%rax)
|
---|
1784 | .Lprologue_shaext:
|
---|
1785 | ___
|
---|
# Set-up before the main loop: load the SHA-1 state from $ctx and the IV
# from $ivp, fetch the first two round keys, bias $key by 112 and turn $out
# into an offset relative to $in0 (the output store later uses the
# ($out,$in0) addressing form).
1786 | $code.=<<___;
|
---|
1787 | movdqu ($ctx),$ABCD
|
---|
1788 | movd 16($ctx),$E
|
---|
1789 | movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap
|
---|
1790 |
|
---|
1791 | mov 240($key),$rounds
|
---|
1792 | sub $in0,$out
|
---|
1793 | movups ($key),$rndkey0 # $key[0]
|
---|
1794 | movups ($ivp),$iv # load IV
|
---|
1795 | movups 16($key),$rndkey[0] # forward reference
|
---|
1796 | lea 112($key),$key # size optimization
|
---|
1797 |
|
---|
1798 | pshufd \$0b00011011,$ABCD,$ABCD # flip word order
|
---|
1799 | pshufd \$0b00011011,$E,$E # flip word order
|
---|
1800 | jmp .Loop_shaext
|
---|
1801 |
|
---|
1802 | .align 16
|
---|
1803 | .Loop_shaext:
|
---|
1804 | ___
|
---|
# Loop body. Each heredoc below is a small group of SHA-1 instructions; an
# &$aesenc() call is emitted between consecutive groups.
1805 | &$aesenc();
|
---|
1806 | $code.=<<___;
|
---|
1807 | movdqu ($inp),@MSG[0]
|
---|
1808 | movdqa $E,$E_SAVE # offload $E
|
---|
1809 | pshufb $BSWAP,@MSG[0]
|
---|
1810 | movdqu 0x10($inp),@MSG[1]
|
---|
1811 | movdqa $ABCD,$ABCD_SAVE # offload $ABCD
|
---|
1812 | ___
|
---|
1813 | &$aesenc();
|
---|
# The "pxor $E_SAVE,@MSG[0]" at the end of this group is repeated at the
# start of the next one; applying the same XOR twice leaves @MSG[0] as it
# was.
1814 | $code.=<<___;
|
---|
1815 | pshufb $BSWAP,@MSG[1]
|
---|
1816 |
|
---|
1817 | paddd @MSG[0],$E
|
---|
1818 | movdqu 0x20($inp),@MSG[2]
|
---|
1819 | lea 0x40($inp),$inp
|
---|
1820 | pxor $E_SAVE,@MSG[0] # black magic
|
---|
1821 | ___
|
---|
1822 | &$aesenc();
|
---|
1823 | $code.=<<___;
|
---|
1824 | pxor $E_SAVE,@MSG[0] # black magic
|
---|
1825 | movdqa $ABCD,$E_
|
---|
1826 | pshufb $BSWAP,@MSG[2]
|
---|
1827 | sha1rnds4 \$0,$E,$ABCD # 0-3
|
---|
1828 | sha1nexte @MSG[1],$E_
|
---|
1829 | ___
|
---|
1830 | &$aesenc();
|
---|
1831 | $code.=<<___;
|
---|
1832 | sha1msg1 @MSG[1],@MSG[0]
|
---|
1833 | movdqu -0x10($inp),@MSG[3]
|
---|
1834 | movdqa $ABCD,$E
|
---|
1835 | pshufb $BSWAP,@MSG[3]
|
---|
1836 | ___
|
---|
1837 | &$aesenc();
|
---|
1838 | $code.=<<___;
|
---|
1839 | sha1rnds4 \$0,$E_,$ABCD # 4-7
|
---|
1840 | sha1nexte @MSG[2],$E
|
---|
1841 | pxor @MSG[2],@MSG[0]
|
---|
1842 | sha1msg1 @MSG[2],@MSG[1]
|
---|
1843 | ___
|
---|
1844 | &$aesenc();
|
---|
1845 |
|
---|
# Rounds 8-63: fourteen iterations ($i = 2..15) of four rounds each. The
# sha1rnds4 immediate is int($i/5), so it changes whenever $i crosses a
# multiple of five, i.e. every 20 rounds. The "# 8-11" remark inside the
# body describes the first iteration only.
1846 | for($i=2;$i<20-4;$i++) {
|
---|
1847 | $code.=<<___;
|
---|
1848 | movdqa $ABCD,$E_
|
---|
1849 | sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11
|
---|
1850 | sha1nexte @MSG[3],$E_
|
---|
1851 | ___
|
---|
1852 | &$aesenc();
|
---|
1853 | $code.=<<___;
|
---|
1854 | sha1msg2 @MSG[3],@MSG[0]
|
---|
1855 | pxor @MSG[3],@MSG[1]
|
---|
1856 | sha1msg1 @MSG[3],@MSG[2]
|
---|
1857 | ___
|
---|
# Swap the names of the two E registers and rotate the message register
# names, so the next iteration's text refers to the right registers without
# any data movement being emitted.
1858 | ($E,$E_)=($E_,$E);
|
---|
1859 | push(@MSG,shift(@MSG));
|
---|
1860 |
|
---|
1861 | &$aesenc();
|
---|
1862 | }
|
---|
# Rounds 64-79 are written out by hand.
1863 | $code.=<<___;
|
---|
1864 | movdqa $ABCD,$E_
|
---|
1865 | sha1rnds4 \$3,$E,$ABCD # 64-67
|
---|
1866 | sha1nexte @MSG[3],$E_
|
---|
1867 | sha1msg2 @MSG[3],@MSG[0]
|
---|
1868 | pxor @MSG[3],@MSG[1]
|
---|
1869 | ___
|
---|
1870 | &$aesenc();
|
---|
1871 | $code.=<<___;
|
---|
1872 | movdqa $ABCD,$E
|
---|
1873 | sha1rnds4 \$3,$E_,$ABCD # 68-71
|
---|
1874 | sha1nexte @MSG[0],$E
|
---|
1875 | sha1msg2 @MSG[0],@MSG[1]
|
---|
1876 | ___
|
---|
1877 | &$aesenc();
|
---|
# @MSG[0] is reloaded with the E value saved at the top of the iteration; it
# is then the source operand of the final sha1nexte below.
1878 | $code.=<<___;
|
---|
1879 | movdqa $E_SAVE,@MSG[0]
|
---|
1880 | movdqa $ABCD,$E_
|
---|
1881 | sha1rnds4 \$3,$E,$ABCD # 72-75
|
---|
1882 | sha1nexte @MSG[1],$E_
|
---|
1883 | ___
|
---|
1884 | &$aesenc();
|
---|
# "$MSG[0]" here interpolates the same array element that "@MSG[0]" does in
# the other heredocs.
1885 | $code.=<<___;
|
---|
1886 | movdqa $ABCD,$E
|
---|
1887 | sha1rnds4 \$3,$E_,$ABCD # 76-79
|
---|
1888 | sha1nexte $MSG[0],$E
|
---|
1889 | ___
|
---|
1890 | while($r<40) { &$aesenc(); } # remaining aesenc's
|
---|
# Loop bookkeeping: add the saved ABCD back in, store $iv at offset 48 of
# the current 64-byte chunk, advance $in0 by 64 and repeat while $len is
# non-zero. After the loop the word order is flipped back and the IV and
# SHA-1 state are written out.
1891 | $code.=<<___;
|
---|
1892 | dec $len
|
---|
1893 |
|
---|
1894 | paddd $ABCD_SAVE,$ABCD
|
---|
1895 | movups $iv,48($out,$in0) # write output
|
---|
1896 | lea 64($in0),$in0
|
---|
1897 | jnz .Loop_shaext
|
---|
1898 |
|
---|
1899 | pshufd \$0b00011011,$ABCD,$ABCD
|
---|
1900 | pshufd \$0b00011011,$E,$E
|
---|
1901 | movups $iv,($ivp) # write IV
|
---|
1902 | movdqu $ABCD,($ctx)
|
---|
1903 | movd $E,16($ctx)
|
---|
1904 | ___
|
---|
# Win64 only: reload %xmm6-%xmm15 from the save area and restore %rsp from
# %rax.
1905 | $code.=<<___ if ($win64);
|
---|
1906 | movaps -8-10*16(%rax),%xmm6
|
---|
1907 | movaps -8-9*16(%rax),%xmm7
|
---|
1908 | movaps -8-8*16(%rax),%xmm8
|
---|
1909 | movaps -8-7*16(%rax),%xmm9
|
---|
1910 | movaps -8-6*16(%rax),%xmm10
|
---|
1911 | movaps -8-5*16(%rax),%xmm11
|
---|
1912 | movaps -8-4*16(%rax),%xmm12
|
---|
1913 | movaps -8-3*16(%rax),%xmm13
|
---|
1914 | movaps -8-2*16(%rax),%xmm14
|
---|
1915 | movaps -8-1*16(%rax),%xmm15
|
---|
1916 | mov %rax,%rsp
|
---|
1917 | .Lepilogue_shaext:
|
---|
1918 | ___
|
---|
1919 | $code.=<<___;
|
---|
1920 | ret
|
---|
1921 | .cfi_endproc
|
---|
1922 | .size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
|
---|
1923 | ___
|
---|
1924 | }}}
|
---|
1925 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
---|
1926 | # CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
---|
# Win64 structured-exception-handling support. Emits ssse3_handler, the
# handler named by every .xdata record below, followed by the .pdata and
# .xdata records themselves. Only the three *_enc_* entry points (ssse3,
# avx, shaext) are registered in this block.
1927 | if ($win64) {
|
---|
# Registers in which the handler's four arguments arrive (see the prototype
# in the comment preceding this block).
1928 | $rec="%rcx";
|
---|
1929 | $frame="%rdx";
|
---|
1930 | $context="%r8";
|
---|
1931 | $disp="%r9";
|
---|
1932 |
|
---|
# Handler entry: push the registers it is going to use and reserve 64 bytes
# of stack, then compare context->Rip with the two labels stored in
# HandlerData[] (prologue, epilogue). If Rip lies below the prologue label
# or at/after the epilogue label, the register-restoring code is skipped and
# control goes straight to .Lcommon_seh_tail.
1933 | $code.=<<___;
|
---|
1934 | .extern __imp_RtlVirtualUnwind
|
---|
1935 | .type ssse3_handler,\@abi-omnipotent
|
---|
1936 | .align 16
|
---|
1937 | ssse3_handler:
|
---|
1938 | push %rsi
|
---|
1939 | push %rdi
|
---|
1940 | push %rbx
|
---|
1941 | push %rbp
|
---|
1942 | push %r12
|
---|
1943 | push %r13
|
---|
1944 | push %r14
|
---|
1945 | push %r15
|
---|
1946 | pushfq
|
---|
1947 | sub \$64,%rsp
|
---|
1948 |
|
---|
1949 | mov 120($context),%rax # pull context->Rax
|
---|
1950 | mov 248($context),%rbx # pull context->Rip
|
---|
1951 |
|
---|
1952 | mov 8($disp),%rsi # disp->ImageBase
|
---|
1953 | mov 56($disp),%r11 # disp->HandlerData
|
---|
1954 |
|
---|
1955 | mov 0(%r11),%r10d # HandlerData[0]
|
---|
1956 | lea (%rsi,%r10),%r10 # prologue label
|
---|
1957 | cmp %r10,%rbx # context->Rip<prologue label
|
---|
1958 | jb .Lcommon_seh_tail
|
---|
1959 |
|
---|
1960 | mov 152($context),%rax # pull context->Rsp
|
---|
1961 |
|
---|
1962 | mov 4(%r11),%r10d # HandlerData[1]
|
---|
1963 | lea (%rsi,%r10),%r10 # epilogue label
|
---|
1964 | cmp %r10,%rbx # context->Rip>=epilogue label
|
---|
1965 | jae .Lcommon_seh_tail
|
---|
1966 | ___
|
---|
# Frames of aesni_cbc_sha1_enc_shaext (Rip at or above that function's entry
# point): copy 20 quadwords starting at (%rax) into context->Xmm6 onwards
# and step %rax past the 168-byte (8+10*16) area that function's Win64
# prologue reserves. No general-purpose registers are restored on this path.
1967 | $code.=<<___ if ($shaext);
|
---|
1968 | lea aesni_cbc_sha1_enc_shaext(%rip),%r10
|
---|
1969 | cmp %r10,%rbx
|
---|
1970 | jb .Lseh_no_shaext
|
---|
1971 |
|
---|
1972 | lea (%rax),%rsi
|
---|
1973 | lea 512($context),%rdi # &context.Xmm6
|
---|
1974 | mov \$20,%ecx
|
---|
1975 | .long 0xa548f3fc # cld; rep movsq
|
---|
1976 | lea 168(%rax),%rax # adjust stack pointer
|
---|
1977 | jmp .Lcommon_seh_tail
|
---|
1978 | .Lseh_no_shaext:
|
---|
1979 | ___
|
---|
# All other frames: the XMM save area starts at 96(%rax); 104+10*16 bytes
# above %rax sit six saved general-purpose registers (%r15, %r14, %r13,
# %r12, %rbp, %rbx), which are written back into the CONTEXT. From
# .Lcommon_seh_tail on, the code is shared by every path: it stores Rsp, Rsi
# and Rdi into the CONTEXT, copies the CONTEXT to disp->ContextRecord (the
# 154 loaded into %ecx is a quadword count for "rep movsq", i.e. 1232
# bytes), calls RtlVirtualUnwind and returns ExceptionContinueSearch.
1980 | $code.=<<___;
|
---|
1981 | lea 96(%rax),%rsi
|
---|
1982 | lea 512($context),%rdi # &context.Xmm6
|
---|
1983 | mov \$20,%ecx
|
---|
1984 | .long 0xa548f3fc # cld; rep movsq
|
---|
1985 | lea `104+10*16`(%rax),%rax # adjust stack pointer
|
---|
1986 |
|
---|
1987 | mov 0(%rax),%r15
|
---|
1988 | mov 8(%rax),%r14
|
---|
1989 | mov 16(%rax),%r13
|
---|
1990 | mov 24(%rax),%r12
|
---|
1991 | mov 32(%rax),%rbp
|
---|
1992 | mov 40(%rax),%rbx
|
---|
1993 | lea 48(%rax),%rax
|
---|
1994 | mov %rbx,144($context) # restore context->Rbx
|
---|
1995 | mov %rbp,160($context) # restore context->Rbp
|
---|
1996 | mov %r12,216($context) # restore context->R12
|
---|
1997 | mov %r13,224($context) # restore context->R13
|
---|
1998 | mov %r14,232($context) # restore context->R14
|
---|
1999 | mov %r15,240($context) # restore context->R15
|
---|
2000 |
|
---|
2001 | .Lcommon_seh_tail:
|
---|
2002 | mov 8(%rax),%rdi
|
---|
2003 | mov 16(%rax),%rsi
|
---|
2004 | mov %rax,152($context) # restore context->Rsp
|
---|
2005 | mov %rsi,168($context) # restore context->Rsi
|
---|
2006 | mov %rdi,176($context) # restore context->Rdi
|
---|
2007 |
|
---|
2008 | mov 40($disp),%rdi # disp->ContextRecord
|
---|
2009 | mov $context,%rsi # context
|
---|
2010 | mov \$154,%ecx # sizeof(CONTEXT)
|
---|
2011 | .long 0xa548f3fc # cld; rep movsq
|
---|
2012 |
|
---|
2013 | mov $disp,%rsi
|
---|
2014 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
---|
2015 | mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
---|
2016 | mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
---|
2017 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
---|
2018 | mov 40(%rsi),%r10 # disp->ContextRecord
|
---|
2019 | lea 56(%rsi),%r11 # &disp->HandlerData
|
---|
2020 | lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
---|
2021 | mov %r10,32(%rsp) # arg5
|
---|
2022 | mov %r11,40(%rsp) # arg6
|
---|
2023 | mov %r12,48(%rsp) # arg7
|
---|
2024 | mov %rcx,56(%rsp) # arg8, (NULL)
|
---|
2025 | call *__imp_RtlVirtualUnwind(%rip)
|
---|
2026 |
|
---|
2027 | mov \$1,%eax # ExceptionContinueSearch
|
---|
2028 | add \$64,%rsp
|
---|
2029 | popfq
|
---|
2030 | pop %r15
|
---|
2031 | pop %r14
|
---|
2032 | pop %r13
|
---|
2033 | pop %r12
|
---|
2034 | pop %rbp
|
---|
2035 | pop %rbx
|
---|
2036 | pop %rdi
|
---|
2037 | pop %rsi
|
---|
2038 | ret
|
---|
2039 | .size ssse3_handler,.-ssse3_handler
|
---|
2040 |
|
---|
2041 | .section .pdata
|
---|
2042 | .align 4
|
---|
2043 | .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
|
---|
2044 | .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
|
---|
2045 | .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
|
---|
2046 | ___
|
---|
# .pdata triplet (begin, end, info) for the AVX path, emitted only when that
# path is generated.
2047 | $code.=<<___ if ($avx);
|
---|
2048 | .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
|
---|
2049 | .rva .LSEH_end_aesni_cbc_sha1_enc_avx
|
---|
2050 | .rva .LSEH_info_aesni_cbc_sha1_enc_avx
|
---|
2051 | ___
|
---|
# Likewise for the SHA-extension path.
2052 | $code.=<<___ if ($shaext);
|
---|
2053 | .rva .LSEH_begin_aesni_cbc_sha1_enc_shaext
|
---|
2054 | .rva .LSEH_end_aesni_cbc_sha1_enc_shaext
|
---|
2055 | .rva .LSEH_info_aesni_cbc_sha1_enc_shaext
|
---|
2056 | ___
|
---|
# .xdata: one record per function. Each names ssse3_handler and supplies the
# function's own prologue/epilogue label pair as HandlerData[], which is
# what the handler compares context->Rip against.
2057 | $code.=<<___;
|
---|
2058 | .section .xdata
|
---|
2059 | .align 8
|
---|
2060 | .LSEH_info_aesni_cbc_sha1_enc_ssse3:
|
---|
2061 | .byte 9,0,0,0
|
---|
2062 | .rva ssse3_handler
|
---|
2063 | .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
|
---|
2064 | ___
|
---|
2065 | $code.=<<___ if ($avx);
|
---|
2066 | .LSEH_info_aesni_cbc_sha1_enc_avx:
|
---|
2067 | .byte 9,0,0,0
|
---|
2068 | .rva ssse3_handler
|
---|
2069 | .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
|
---|
2070 | ___
|
---|
2071 | $code.=<<___ if ($shaext);
|
---|
2072 | .LSEH_info_aesni_cbc_sha1_enc_shaext:
|
---|
2073 | .byte 9,0,0,0
|
---|
2074 | .rva ssse3_handler
|
---|
2075 | .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
|
---|
2076 | ___
|
---|
2077 | }
|
---|
2078 |
|
---|
2079 | ####################################################################
|
---|
# Prepend a REX prefix byte to an instruction encoding when one is needed.
#
#   $opcode - reference to the array of opcode bytes; modified in place
#   $dst    - number of the register that callers place in ModR/M bits 5:3
#   $src    - number of the register that callers place in ModR/M bits 2:0
#
# Register numbers 8 and above do not fit in the three-bit ModR/M fields,
# so the missing high bit is carried by the prefix: 0x04 (REX.R) for $dst
# and 0x01 (REX.B) for $src, on top of the fixed 0x40 pattern. When both
# registers are below 8 the array is left untouched.
#
# The array is reached through a lexical reference rather than by aliasing
# a package typeglob, so no package variable is involved.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04 if($dst>=8);
    $rex|=0x01 if($src>=8);
    unshift @$opcode,$rex|0x40 if($rex);
}
|
---|
2089 |
|
---|
# Turn the operand text of a sha1rnds4 instruction into a raw ".byte"
# directive (presumably so the output does not depend on the assembler
# knowing the mnemonic).
#
#   $_[0] - operand text following the mnemonic, expected in the form
#           "$imm,%xmmS,%xmmD"
#
# Returns ".byte\t" followed by the comma-separated encoding
# [REX] 0x0f 0x3a 0xcc ModR/M imm8, with every byte printed in decimal.
# If the operands are not in the immediate/register/register form, the
# instruction is returned as text ("sha1rnds4\t" plus the operands) for the
# assembler to deal with.
sub sha1rnds4 {
    my ($args)=@_;

    if ($args =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($imm,$src,$dst)=($1,$2,$3);
        my @opcode=(0x0f,0x3a,0xcc);

        rex(\@opcode,$dst,$src);
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);   # ModR/M
        # An immediate with a leading zero (0x.., 0b.., octal or plain 0)
        # goes through oct(); anything else is emitted as written.
        push @opcode,$imm=~/^0/?oct($imm):$imm;
        return ".byte\t".join(',',@opcode);
    } else {
        return "sha1rnds4\t".$args;
    }
}
|
---|
2102 |
|
---|
# Turn one of the two-operand SHA-1 instructions that share the 0x0f 0x38
# opcode prefix into a raw ".byte" directive.
#
#   $instr - mnemonic: "sha1nexte", "sha1msg1" or "sha1msg2"
#   $args  - operand text, expected in the form "%xmmS,%xmmD"
#
# Returns ".byte\t" followed by the comma-separated encoding
# [REX] 0x0f 0x38 <opcode> ModR/M, with every byte printed in decimal.
# If the mnemonic is not in the table or the operands are not two XMM
# registers, the instruction is returned as text ($instr, a tab, $args).
sub sha1op38 {
    my ($instr,$args)=@_;
    my %opcodelet = (
        "sha1nexte" => 0xc8,
        "sha1msg1"  => 0xc9,
        "sha1msg2"  => 0xca );

    if (defined($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($src,$dst)=($1,$2);
        my @opcode=(0x0f,0x38);

        rex(\@opcode,$dst,$src);
        push @opcode,$opcodelet{$instr};
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);   # ModR/M
        return ".byte\t".join(',',@opcode);
    } else {
        return $instr."\t".$args;
    }
}
|
---|
2120 |
|
---|
# Turn a register-to-register AES-NI instruction into a raw ".byte"
# directive.
#
#   $line - instruction text, e.g. "aesenc %xmm1,%xmm2"
#
# For aesenc, aesenclast, aesdec and aesdeclast with two XMM register
# operands, returns ".byte\t" followed by the comma-separated encoding
# 0x66 [REX] 0x0f 0x38 <opcode> ModR/M, with every byte printed in decimal.
# Anything else is returned unchanged. That includes other "aes*" mnemonics:
# returning undef for them would make the caller's substitution replace the
# instruction with an empty string and silently drop it from the output,
# whereas passing the text through leaves the decision to the assembler, as
# the SHA-1 encoders do.
sub aesni {
    my $line=shift;

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic,$src,$dst)=($1,$2,$3);
        my %opcodelet = (
            "aesenc" => 0xdc,   "aesenclast" => 0xdd,
            "aesdec" => 0xde,   "aesdeclast" => 0xdf
        );

        # No encoding known for this mnemonic: hand it on untouched.
        return $line if (!defined($opcodelet{$mnemonic}));

        my @opcode=(0x0f,0x38);
        rex(\@opcode,$dst,$src);
        push @opcode,$opcodelet{$mnemonic},0xc0|($src&7)|(($dst&7)<<3); # ModR/M
        # The 0x66 prefix is added last so that it ends up ahead of any REX
        # byte rex() has just inserted.
        unshift @opcode,0x66;
        return ".byte\t".join(',',@opcode);
    }
    return $line;
}
|
---|
2138 |
|
---|
# Final pass: walk the accumulated $code line by line, post-process each
# line and write it to STDOUT.
for (split(/\n/,$code)) {
    # Replace every `...` span with the value of the Perl expression in it.
    s/\`([^\`]*)\`/eval $1/geo;

    # Hand-encode SHA and AES-NI instructions. The rewrites are tried in
    # this order and the first one that applies ends the chain.
    s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo
        || s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo
        || s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

    print "$_\n";
}
# Buffered write errors only show up here, so a failed close is fatal.
close STDOUT or die "error closing STDOUT: $!";
|
---|