1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # June 2011
|
---|
18 | #
|
---|
19 | # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
|
---|
20 | # in http://download.intel.com/design/intarch/papers/323686.pdf, is
|
---|
21 | # that since AESNI-CBC encrypt exhibit *very* low instruction-level
|
---|
22 | # parallelism, interleaving it with another algorithm would allow to
|
---|
23 | # utilize processor resources better and achieve better performance.
|
---|
24 | # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
|
---|
25 | # AESNI code is weaved into it. Below are performance numbers in
|
---|
26 | # cycles per processed byte, less is better, for standalone AESNI-CBC
|
---|
27 | # encrypt, sum of the latter and standalone SHA1, and "stitched"
|
---|
28 | # subroutine:
|
---|
29 | #
|
---|
30 | # AES-128-CBC +SHA1 stitch gain
|
---|
31 | # Westmere 3.77[+5.3] 9.07 6.55 +38%
|
---|
32 | # Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
|
---|
33 | # Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
|
---|
34 | # Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
|
---|
35 | # Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%)
|
---|
36 | # Bulldozer 5.77[+6.0] 11.72 6.37 +84%
|
---|
37 | # Ryzen(**) 2.71[+1.93] 4.64 2.74 +69%
|
---|
38 | # Goldmont(**) 3.82[+1.70] 5.52 4.20 +31%
|
---|
39 | #
|
---|
40 | # AES-192-CBC
|
---|
41 | # Westmere 4.51 9.81 6.80 +44%
|
---|
42 | # Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%)
|
---|
43 | # Ivy Bridge 6.05 10.65 6.07 +75%
|
---|
44 | # Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%)
|
---|
45 | # Bulldozer 6.89 12.84 6.96 +84%
|
---|
46 | #
|
---|
47 | # AES-256-CBC
|
---|
48 | # Westmere 5.25 10.55 7.21 +46%
|
---|
49 | # Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
|
---|
50 | # Ivy Bridge 7.05 11.65 7.12 +64%
|
---|
51 | # Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
|
---|
52 | # Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%)
|
---|
53 | # Bulldozer 8.00 13.95 8.25 +69%
|
---|
54 | # Ryzen(**) 3.71 5.64 3.72 +52%
|
---|
55 | # Goldmont(**) 5.35 7.05 5.76 +22%
|
---|
56 | #
|
---|
57 | # (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
|
---|
58 | # background information. Above numbers in parentheses are SSSE3
|
---|
59 | # results collected on AVX-capable CPU, i.e. apply on OSes that
|
---|
60 | # don't support AVX.
|
---|
61 | # (**) SHAEXT results.
|
---|
62 | #
|
---|
63 | # Needless to mention that it makes no sense to implement "stitched"
|
---|
64 | # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
|
---|
65 | # fully utilize parallelism, so stitching would not give any gain
|
---|
66 | # anyway. Well, there might be some, e.g. because of better cache
|
---|
67 | # locality... For reference, here are performance results for
|
---|
68 | # standalone AESNI-CBC decrypt:
|
---|
69 | #
|
---|
70 | # AES-128-CBC AES-192-CBC AES-256-CBC
|
---|
71 | # Westmere 1.25 1.50 1.75
|
---|
72 | # Sandy Bridge 0.74 0.91 1.09
|
---|
73 | # Ivy Bridge 0.74 0.90 1.11
|
---|
74 | # Haswell 0.63 0.76 0.88
|
---|
75 | # Bulldozer 0.70 0.85 0.99
|
---|
76 |
|
---|
77 | # And indeed:
|
---|
78 | #
|
---|
79 | # AES-256-CBC +SHA1 stitch gain
|
---|
80 | # Westmere 1.75 7.20 6.68 +7.8%
|
---|
81 | # Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%)
|
---|
82 | # Ivy Bridge 1.11 5.70 5.45 +4.6%
|
---|
83 | # Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%)
|
---|
84 | # Bulldozer 0.99 6.95 5.95 +17%(**)
|
---|
85 | #
|
---|
86 | # (*) Tiny improvement coefficient on Haswell is because we compare
|
---|
87 | # AVX1 stitch to sum with AVX2 SHA1.
|
---|
88 | # (**) Execution is fully dominated by integer code sequence and
|
---|
89 | # SIMD still hardly shows [in single-process benchmark;-]
|
---|
90 |
|
---|
# Command-line handling, standard for perlasm generators: first argument is
# the "flavour" (elf/macosx/mingw64/nasm/masm/...), second the output file.
# A single argument containing a dot is taken as the output file name.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 needs a different ABI (shadow space, callee-saved XMM registers);
# detected from the flavour or an .asm output suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator relative to this script, trying the
# local directory first and then the shared crypto/perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
101 |
|
---|
# Probe the toolchain: AVX code is only emitted when the assembler is new
# enough to encode it (GNU as >= 2.19, nasm >= 2.09, ml64 >= 10, or a
# clang/LLVM-based driver >= 3.0).  Probes run external tools via backticks
# and inspect their version banners.
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0);

# Emit the SHA extension (SHA-NI) code path as well.
$shaext=1;	### set to zero if compiling for 1.0.1

# Stitched *decrypt* is disabled by default; per the commentary above,
# CBC decrypt already exhibits enough instruction-level parallelism.
$stitched_decrypt=0;
116 |
|
---|
# Pipe all generated code through the perlasm translator into the output
# file.  A piped open can fail (e.g. interpreter not runnable); dying here
# prevents silently producing an empty/missing output file.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
119 |
|
---|
# void aesni_cbc_sha1_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA_CTX *ctx,
#			const void *in0);

# Entry-point dispatcher: selects the SHAEXT, AVX or SSSE3 implementation
# at run time from the OPENSSL_ia32cap_P capability vector.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	aesni_cbc_sha1_enc
.type	aesni_cbc_sha1_enc,\@abi-omnipotent
.align	32
aesni_cbc_sha1_enc:
.cfi_startproc
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11
___
# Prefer the SHA-NI path when the capability bit is set.
$code.=<<___ if ($shaext);
	bt	\$61,%r11		# check SHA bit
	jc	aesni_cbc_sha1_enc_shaext
___
# AVX path is only taken on CPUs that report both AVX and "Intel CPU".
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni_cbc_sha1_enc_avx
___
$code.=<<___;
	jmp	aesni_cbc_sha1_enc_ssse3
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___
158 |
|
---|
# Argument registers per the System V AMD64 calling convention; the 7th
# argument ($inp) arrives on the stack and is loaded into %r10 later.
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;				# index into the message schedule
my @X=map("%xmm$_",(4..7,0..3));	# rotating window over X[] words
my @Tx=map("%xmm$_",(8..10));		# SIMD temporaries / K constants
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");			# rotating scalar temporaries
my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;	# round/aesenc/label counters
my $K_XX_XX="%r11";			# pointer to round-constant table
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));		# for enc
my @rndkey=("%xmm14","%xmm15");				# for enc
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec

if (1) {	# reassign for Atom Silvermont
    # The goal is to minimize amount of instructions with more than
    # 3 prefix bytes. Or in more practical terms to keep AES-NI *and*
    # SSSE3 instructions to upper half of the register bank.
    @X=map("%xmm$_",(8..11,4..7));
    @Tx=map("%xmm$_",(12,13,3));
    ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
    @rndkey=("%xmm0","%xmm1");
}
181 |
|
---|
# Any unknown &mnemonic(...) call lands here and is appended to $code as an
# assembler instruction named after the called sub.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
  # A purely numeric last argument becomes an immediate ("$n" in AT&T syntax).
  $arg = "\$$arg" if ($arg*1 eq $arg);
  # Operands are emitted in reverse order (perlasm dst-first -> AT&T src-first).
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Scalar rotate helpers used by the round bodies (indirected through
# coderefs so they can be swapped out for other code paths).
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
191 |
|
---|
# SSSE3 flavour: function prologue, Win64 XMM save area, load of the SHA1
# state and first 64-byte message block (byte-swapped, with K_00_19 added
# and spilled to the stack for the scalar rounds).
$code.=<<___;
.type	aesni_cbc_sha1_enc_ssse3,\@function,6
.align	32
aesni_cbc_sha1_enc_ssse3:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_ssse3		# debugging artefact
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
# Win64 ABI: %xmm6-%xmm15 are non-volatile and must be saved/restored.
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
# From here on the arguments live in callee-saved registers; the register
# that carried $ivp is reused (as a dword) for the AES round count.
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movups	-112($key),$rndkey0	# $key[0]
	movups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_ssse3
___
281 |
|
---|
# Emit one AES-CBC encrypt step, to be interleaved into the SHA1 rounds.
# $r counts emitted steps; ($n,$k)=($r/10,$r%10) give block number and
# position within the block.  $k==0 starts a new 16-byte block (load input,
# xor with round key 0 and the running IV/ciphertext, store previous block);
# $k==9 finishes the block, branching on $rounds to handle 10/12/14 AES
# rounds (AES-128/192/256); intermediate $k emit one plain aesenc each.
# Round-key registers in @rndkey are rotated after every step.
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;	# unique label suffix per block
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
  $r++;	unshift(@rndkey,pop(@rndkey));
};
324 |
|
---|
# SHA1 message-schedule update for rounds 16..31: computes four new X[]
# words per call with dword-wide SIMD (compose X[-14]/X[-3] terms, xor,
# rotate left by 1, plus the "X[0]>>96 <<<2" correction) and stores the
# next X[]+K transfer to the stack.  The SIMD instructions are interleaved
# with scalar round instructions pulled one at a time from @insns; the
# "# ror"/"# rol" markers note which scalar instruction is emitted next
# so SIMD ops land next to compatible scalar ops.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd	(@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
406 |
|
---|
# SHA1 message-schedule update for rounds 32..79: computes four new X[]
# words per call from X[-32]^X[-28]^X[-6]^X[-16], rotated left by 2, and
# stores the next X[]+K transfer.  The K constant is either "perpetuated"
# or reloaded from the table every fifth call.  Scalar round instructions
# are interleaved from @insns; the /_ror/ and /_rol/ look-aheads keep
# rotate instructions paired with suitable SIMD ops.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
478 |
|
---|
# Final schedule step of an 80-round pass: add K to the last X[] group and
# store the transfer, then compare $inp against $len and jump to the
# caller-supplied label (second argument) when input is exhausted.
# Otherwise the next 64-byte block is loaded, the first xmm is byte-swapped,
# and $Xi is reset for the next pass.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(shift);			# jump to "done" label

	unshift(@Tx,pop(@Tx));

	&movdqa	(@Tx[2],"64($K_XX_XX)");	# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@Tx[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}
513 |
|
---|
# Rounds 0..15 of the *next* block, overlapped with the tail rounds of the
# current one: byte-swap one input xmm, add K_00_19, spill X[]+K to the
# stack for the scalar rounds, and undo the add to keep the raw X[] value.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}
543 |
|
---|
# Tail rounds: expand the round-body generator four times (32 scalar
# instructions) and emit every one of them, with no message-schedule
# (X[]) work attached -- used once no further W[] values are needed.
sub Xtail_ssse3()
{ use integer;
  my $gen = shift;				# round-body generator
  my @stitched = (&$gen,&$gen,&$gen,&$gen);	# 32 instruction strings
  my ($a,$b,$c,$d,$e);				# referenced by the eval'ed strings

  foreach my $insn (@stitched) { eval $insn; }
}
552 |
|
---|
# Scalar round body for rounds 0..19 (F=Ch: ((c^d)&b)^d), expressed as a
# list of code-string fragments so SIMD/AES instructions can be spliced
# between them.  Instead of moving data, @V and @T are rotated at the end
# of each round ("$j++; unshift(...)").
my @body_00_19 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&xor	(@T[0],$d);',
	'&mov	(@T[1],$a);',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	($b,$c);',	# $c^$d for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&and	(@T[1],$b);',	# ($b&($c^$d)) for next round

	'&xor	($b,$c);',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);

# Return the fragment list for one 0..19 round; hands off to body_20_39 at
# the boundary ($rx==19) and appends exactly one &$aesenc() call per round
# at a rate of 12 aesenc slots spread over these 20 rounds.
sub body_00_19 () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39() if ($rx==19); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_00_19;

	$n = scalar(@r);
	$k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;

    return @r;
}
585 |
|
---|
# Scalar round body for rounds 20..39 and 60..79 (F=Parity: b^d^c);
# same fragment-list scheme as @body_00_19.
my @body_20_39 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19);',	# ($b^$d^$c)
	'&mov	(@T[1],$a);',	# $b for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j< 79);',	# $b^$d for next round

	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);

# Return the fragment list for one 20..39 round; hands off to body_40_59
# at the boundary ($rx==39) and splices in aesenc calls at a rate of 8
# per 20 rounds (none at $rx==20, right after the 0..19 hand-off).
sub body_20_39 () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59() if ($rx==39); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_20_39;

	$n = scalar(@r);
	$k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=20);
	$jj++;

    return @r;
}
616 |
|
---|
# Scalar round body for rounds 40..59 (F=Maj, computed as ((b^c)&(c^d))^c);
# same fragment-list scheme as @body_00_19.
my @body_40_59 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40);',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40);',	# restore $c

	'&$_ror	($b,7);',	# $b>>>2
	'&mov	(@T[1],$a);',	# $b for next round
	'&xor	(@T[0],$c);',

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59);',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59);',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);

# Return the fragment list for one 40..59 round, splicing in aesenc calls
# at a rate of 12 per 20 rounds (none at $rx==40, right after hand-off).
sub body_40_59 () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)
    $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_40_59;

	$n = scalar(@r);
	$k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=40);
	$jj++;

    return @r;
}
# Emit the main 80-round SSSE3 loop body.  The generator state is saved
# before the three Xloop groups so that the .Ldone_ssse3 tail can be
# regenerated from the same point with Xtail (no next-block preload).
$code.=<<___;
.align	32
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3");	# can jump to "done"

	$saved_j=$j; @saved_V=@V;
	$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.Ldone_ssse3:
___
# Rewind generator state to the save point and emit the same three
# 4-round groups as pure tail rounds (no next-block load).
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	$iv,($ivp)			# write IV
___
# Win64: restore the callee-saved XMM registers saved in the prologue.
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___
759 |
|
---|
# Optional stitched AES-256-CBC *decrypt* + SHA1 (disabled by default via
# $stitched_decrypt=0; see the performance commentary at the top).
if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$j=$jj=$r=$rx=0;
$Xi=4;

# reassign for Atom Silvermont (see above)
($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
@X=map("%xmm$_",(8..13,6,7));
@Tx=map("%xmm$_",(14,15,5));

# Schedule of AES-256-CBC decrypt steps with four blocks in flight, as
# code strings indexed by SHA1 round ($rx); undef entries are rounds with
# no AES work.  Built as: load+whiten 4 blocks, 13 rounds of 4x aesdec
# (with the next round key loaded on the 4th), then aesdeclast, CBC xor
# with the previous ciphertexts and store.
my @aes256_dec = (
	'&movdqu($inout0,"0x00($in0)");',
	'&movdqu($inout1,"0x10($in0)");	&pxor	($inout0,$rndkey0);',
	'&movdqu($inout2,"0x20($in0)");	&pxor	($inout1,$rndkey0);',
	'&movdqu($inout3,"0x30($in0)");	&pxor	($inout2,$rndkey0);',

	'&pxor	($inout3,$rndkey0);	&movups	($rndkey0,"16-112($key)");',
	'&movaps("64(%rsp)",@X[2]);',	# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
	'&aesdec	($inout0,$rndkey0);',
	'&aesdec	($inout1,$rndkey0);',
	'&aesdec	($inout2,$rndkey0);',
	'&aesdec	($inout3,$rndkey0);	&movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
	));
    # Pad with AES-free rounds to spread the work across the SHA1 rounds.
    push (@aes256_dec,(undef,undef))	if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))	if ($i==5);
}
push(@aes256_dec,(
	'&aesdeclast	($inout0,$rndkey0);	&movups	(@X[0],"0x00($in0)");',
	'&aesdeclast	($inout1,$rndkey0);	&movups	(@X[1],"0x10($in0)");',
	'&aesdeclast	($inout2,$rndkey0);	&movups	(@X[2],"0x20($in0)");',
	'&aesdeclast	($inout3,$rndkey0);	&movups	(@X[3],"0x30($in0)");',

	'&xorps		($inout0,"64(%rsp)");	&movdqu	($rndkey0,"-112($key)");',
	'&xorps		($inout1,@X[0]);	&movups	("0x00($out,$in0)",$inout0);',
	'&xorps		($inout2,@X[1]);	&movups	("0x10($out,$in0)",$inout1);',
	'&xorps		($inout3,@X[2]);	&movups	("0x20($out,$in0)",$inout2);',

	'&movups	("0x30($out,$in0)",$inout3);'
	));
804 |
|
---|
# Decrypt flavour of rounds 0..19: same scalar round fragments as the
# encrypt path, with one entry of the @aes256_dec schedule (when present
# for this round) spliced in front instead of an aesenc.
sub body_00_19_dec () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39_dec() if ($rx==19);

    my @seq = @body_00_19;
    my $aes = $aes256_dec[$rx++];
    unshift(@seq, $aes) if ($aes);

    return @seq;
}
816 |
|
---|
# Decrypt flavour of rounds 20..39: scalar fragments from @body_20_39 with
# the per-round @aes256_dec step (if any) prepended.
sub body_20_39_dec () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59_dec() if ($rx==39);

    my @seq = @body_20_39;
    my $aes = $aes256_dec[$rx++];
    unshift(@seq, $aes) if ($aes);

    return @seq;
}
828 |
|
---|
sub body_40_59_dec () {	# ((b^c)&(c^d))^c
    # SHA-1 round body for rounds 40-59 with AES-256-CBC decrypt
    # instructions interleaved.  On entry @T[0]=(b^c), (c^=d).

    my @r=@body_40_59;

    # Prepend one AES decrypt snippet, if scheduled for this slot.
    # Scalar element access instead of a one-element slice; same
    # behaviour, but clean under "use warnings".
    unshift (@r,$aes256_dec[$rx])	if ($aes256_dec[$rx]);
    $rx++;

    return @r;
}
|
---|
839 |
|
---|
840 | $code.=<<___;
|
---|
841 | .globl aesni256_cbc_sha1_dec
|
---|
842 | .type aesni256_cbc_sha1_dec,\@abi-omnipotent
|
---|
843 | .align 32
|
---|
844 | aesni256_cbc_sha1_dec:
|
---|
845 | .cfi_startproc
|
---|
846 | # caller should check for SSSE3 and AES-NI bits
|
---|
847 | mov OPENSSL_ia32cap_P+0(%rip),%r10d
|
---|
848 | mov OPENSSL_ia32cap_P+4(%rip),%r11d
|
---|
849 | ___
|
---|
850 | $code.=<<___ if ($avx);
|
---|
851 | and \$`1<<28`,%r11d # mask AVX bit
|
---|
852 | and \$`1<<30`,%r10d # mask "Intel CPU" bit
|
---|
853 | or %r11d,%r10d
|
---|
854 | cmp \$`1<<28|1<<30`,%r10d
|
---|
855 | je aesni256_cbc_sha1_dec_avx
|
---|
856 | ___
|
---|
857 | $code.=<<___;
|
---|
858 | jmp aesni256_cbc_sha1_dec_ssse3
|
---|
859 | ret
|
---|
860 | .cfi_endproc
|
---|
861 | .size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec
|
---|
862 |
|
---|
863 | .type aesni256_cbc_sha1_dec_ssse3,\@function,6
|
---|
864 | .align 32
|
---|
865 | aesni256_cbc_sha1_dec_ssse3:
|
---|
866 | .cfi_startproc
|
---|
867 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument
|
---|
868 | push %rbx
|
---|
869 | .cfi_push %rbx
|
---|
870 | push %rbp
|
---|
871 | .cfi_push %rbp
|
---|
872 | push %r12
|
---|
873 | .cfi_push %r12
|
---|
874 | push %r13
|
---|
875 | .cfi_push %r13
|
---|
876 | push %r14
|
---|
877 | .cfi_push %r14
|
---|
878 | push %r15
|
---|
879 | .cfi_push %r15
|
---|
880 | lea `-104-($win64?10*16:0)`(%rsp),%rsp
|
---|
881 | .cfi_adjust_cfa_offset `104+($win64?10*16:0)`
|
---|
882 | ___
|
---|
883 | $code.=<<___ if ($win64);
|
---|
884 | movaps %xmm6,96+0(%rsp)
|
---|
885 | movaps %xmm7,96+16(%rsp)
|
---|
886 | movaps %xmm8,96+32(%rsp)
|
---|
887 | movaps %xmm9,96+48(%rsp)
|
---|
888 | movaps %xmm10,96+64(%rsp)
|
---|
889 | movaps %xmm11,96+80(%rsp)
|
---|
890 | movaps %xmm12,96+96(%rsp)
|
---|
891 | movaps %xmm13,96+112(%rsp)
|
---|
892 | movaps %xmm14,96+128(%rsp)
|
---|
893 | movaps %xmm15,96+144(%rsp)
|
---|
894 | .Lprologue_dec_ssse3:
|
---|
895 | ___
|
---|
896 | $code.=<<___;
|
---|
897 | mov $in0,%r12 # reassign arguments
|
---|
898 | mov $out,%r13
|
---|
899 | mov $len,%r14
|
---|
900 | lea 112($key),%r15 # size optimization
|
---|
901 | movdqu ($ivp),@X[3] # load IV
|
---|
902 | #mov $ivp,88(%rsp) # save $ivp
|
---|
903 | ___
|
---|
904 | ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
|
---|
905 | $code.=<<___;
|
---|
906 | shl \$6,$len
|
---|
907 | sub $in0,$out
|
---|
908 | add $inp,$len # end of input
|
---|
909 |
|
---|
910 | lea K_XX_XX(%rip),$K_XX_XX
|
---|
911 | mov 0($ctx),$A # load context
|
---|
912 | mov 4($ctx),$B
|
---|
913 | mov 8($ctx),$C
|
---|
914 | mov 12($ctx),$D
|
---|
915 | mov $B,@T[0] # magic seed
|
---|
916 | mov 16($ctx),$E
|
---|
917 | mov $C,@T[1]
|
---|
918 | xor $D,@T[1]
|
---|
919 | and @T[1],@T[0]
|
---|
920 |
|
---|
921 | movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
|
---|
922 | movdqa 0($K_XX_XX),@Tx[1] # K_00_19
|
---|
923 | movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
|
---|
924 | movdqu 16($inp),@X[-3&7]
|
---|
925 | movdqu 32($inp),@X[-2&7]
|
---|
926 | movdqu 48($inp),@X[-1&7]
|
---|
927 | pshufb @Tx[2],@X[-4&7] # byte swap
|
---|
928 | add \$64,$inp
|
---|
929 | pshufb @Tx[2],@X[-3&7]
|
---|
930 | pshufb @Tx[2],@X[-2&7]
|
---|
931 | pshufb @Tx[2],@X[-1&7]
|
---|
932 | paddd @Tx[1],@X[-4&7] # add K_00_19
|
---|
933 | paddd @Tx[1],@X[-3&7]
|
---|
934 | paddd @Tx[1],@X[-2&7]
|
---|
935 | movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
|
---|
936 | psubd @Tx[1],@X[-4&7] # restore X[]
|
---|
937 | movdqa @X[-3&7],16(%rsp)
|
---|
938 | psubd @Tx[1],@X[-3&7]
|
---|
939 | movdqa @X[-2&7],32(%rsp)
|
---|
940 | psubd @Tx[1],@X[-2&7]
|
---|
941 | movdqu -112($key),$rndkey0 # $key[0]
|
---|
942 | jmp .Loop_dec_ssse3
|
---|
943 |
|
---|
944 | .align 32
|
---|
945 | .Loop_dec_ssse3:
|
---|
946 | ___
|
---|
947 | &Xupdate_ssse3_16_31(\&body_00_19_dec);
|
---|
948 | &Xupdate_ssse3_16_31(\&body_00_19_dec);
|
---|
949 | &Xupdate_ssse3_16_31(\&body_00_19_dec);
|
---|
950 | &Xupdate_ssse3_16_31(\&body_00_19_dec);
|
---|
951 | &Xupdate_ssse3_32_79(\&body_00_19_dec);
|
---|
952 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
953 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
954 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
955 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
956 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
957 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
958 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
959 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
960 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
961 | &Xupdate_ssse3_32_79(\&body_40_59_dec);
|
---|
962 | &Xupdate_ssse3_32_79(\&body_20_39_dec);
|
---|
963 | &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"
|
---|
964 |
|
---|
965 | $saved_j=$j; @saved_V=@V;
|
---|
966 | $saved_rx=$rx;
|
---|
967 |
|
---|
968 | &Xloop_ssse3(\&body_20_39_dec);
|
---|
969 | &Xloop_ssse3(\&body_20_39_dec);
|
---|
970 | &Xloop_ssse3(\&body_20_39_dec);
|
---|
971 |
|
---|
972 | eval(@aes256_dec[-1]); # last store
|
---|
973 | $code.=<<___;
|
---|
974 | lea 64($in0),$in0
|
---|
975 |
|
---|
976 | add 0($ctx),$A # update context
|
---|
977 | add 4($ctx),@T[0]
|
---|
978 | add 8($ctx),$C
|
---|
979 | add 12($ctx),$D
|
---|
980 | mov $A,0($ctx)
|
---|
981 | add 16($ctx),$E
|
---|
982 | mov @T[0],4($ctx)
|
---|
983 | mov @T[0],$B # magic seed
|
---|
984 | mov $C,8($ctx)
|
---|
985 | mov $C,@T[1]
|
---|
986 | mov $D,12($ctx)
|
---|
987 | xor $D,@T[1]
|
---|
988 | mov $E,16($ctx)
|
---|
989 | and @T[1],@T[0]
|
---|
990 | jmp .Loop_dec_ssse3
|
---|
991 |
|
---|
992 | .Ldone_dec_ssse3:
|
---|
993 | ___
|
---|
994 | $jj=$j=$saved_j; @V=@saved_V;
|
---|
995 | $rx=$saved_rx;
|
---|
996 |
|
---|
997 | &Xtail_ssse3(\&body_20_39_dec);
|
---|
998 | &Xtail_ssse3(\&body_20_39_dec);
|
---|
999 | &Xtail_ssse3(\&body_20_39_dec);
|
---|
1000 |
|
---|
1001 | eval(@aes256_dec[-1]); # last store
|
---|
1002 | $code.=<<___;
|
---|
1003 | add 0($ctx),$A # update context
|
---|
1004 | add 4($ctx),@T[0]
|
---|
1005 | add 8($ctx),$C
|
---|
1006 | mov $A,0($ctx)
|
---|
1007 | add 12($ctx),$D
|
---|
1008 | mov @T[0],4($ctx)
|
---|
1009 | add 16($ctx),$E
|
---|
1010 | mov $C,8($ctx)
|
---|
1011 | mov $D,12($ctx)
|
---|
1012 | mov $E,16($ctx)
|
---|
1013 | movups @X[3],($ivp) # write IV
|
---|
1014 | ___
|
---|
1015 | $code.=<<___ if ($win64);
|
---|
1016 | movaps 96+0(%rsp),%xmm6
|
---|
1017 | movaps 96+16(%rsp),%xmm7
|
---|
1018 | movaps 96+32(%rsp),%xmm8
|
---|
1019 | movaps 96+48(%rsp),%xmm9
|
---|
1020 | movaps 96+64(%rsp),%xmm10
|
---|
1021 | movaps 96+80(%rsp),%xmm11
|
---|
1022 | movaps 96+96(%rsp),%xmm12
|
---|
1023 | movaps 96+112(%rsp),%xmm13
|
---|
1024 | movaps 96+128(%rsp),%xmm14
|
---|
1025 | movaps 96+144(%rsp),%xmm15
|
---|
1026 | ___
|
---|
# Epilogue of aesni256_cbc_sha1_dec_ssse3: restore callee-saved
# registers from the frame and return.
# Fix: the CFA directives were spelled ".cfi_cfa_def", which is not a
# recognized assembler directive (GNU as uses ".cfi_def_cfa"); the
# corrected spelling matches the other epilogues in this file
# (aesni_cbc_sha1_enc_avx, aesni256_cbc_sha1_dec_avx).
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_dec_ssse3:
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
|
---|
1049 | }}}
|
---|
1050 | $j=$jj=$r=$rx=0;
|
---|
1051 |
|
---|
1052 | if ($avx) {
|
---|
1053 | my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
|
---|
1054 |
|
---|
1055 | my $Xi=4;
|
---|
1056 | my @X=map("%xmm$_",(4..7,0..3));
|
---|
1057 | my @Tx=map("%xmm$_",(8..10));
|
---|
1058 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
|
---|
1059 | my @T=("%esi","%edi");
|
---|
1060 | my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
|
---|
1061 | my @rndkey=("%xmm14","%xmm15");
|
---|
1062 | my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
|
---|
1063 | my $Kx=@Tx[2];
|
---|
1064 |
|
---|
1065 | my $_rol=sub { &shld(@_[0],@_) };
|
---|
1066 | my $_ror=sub { &shrd(@_[0],@_) };
|
---|
1067 |
|
---|
1068 | $code.=<<___;
|
---|
1069 | .type aesni_cbc_sha1_enc_avx,\@function,6
|
---|
1070 | .align 32
|
---|
1071 | aesni_cbc_sha1_enc_avx:
|
---|
1072 | .cfi_startproc
|
---|
1073 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument
|
---|
1074 | #shr \$6,$len # debugging artefact
|
---|
1075 | #jz .Lepilogue_avx # debugging artefact
|
---|
1076 | push %rbx
|
---|
1077 | .cfi_push %rbx
|
---|
1078 | push %rbp
|
---|
1079 | .cfi_push %rbp
|
---|
1080 | push %r12
|
---|
1081 | .cfi_push %r12
|
---|
1082 | push %r13
|
---|
1083 | .cfi_push %r13
|
---|
1084 | push %r14
|
---|
1085 | .cfi_push %r14
|
---|
1086 | push %r15
|
---|
1087 | .cfi_push %r15
|
---|
1088 | lea `-104-($win64?10*16:0)`(%rsp),%rsp
|
---|
1089 | .cfi_adjust_cfa_offset `104+($win64?10*16:0)`
|
---|
1090 | #mov $in0,$inp # debugging artefact
|
---|
1091 | #lea 64(%rsp),$ctx # debugging artefact
|
---|
1092 | ___
|
---|
1093 | $code.=<<___ if ($win64);
|
---|
1094 | movaps %xmm6,96+0(%rsp)
|
---|
1095 | movaps %xmm7,96+16(%rsp)
|
---|
1096 | movaps %xmm8,96+32(%rsp)
|
---|
1097 | movaps %xmm9,96+48(%rsp)
|
---|
1098 | movaps %xmm10,96+64(%rsp)
|
---|
1099 | movaps %xmm11,96+80(%rsp)
|
---|
1100 | movaps %xmm12,96+96(%rsp)
|
---|
1101 | movaps %xmm13,96+112(%rsp)
|
---|
1102 | movaps %xmm14,96+128(%rsp)
|
---|
1103 | movaps %xmm15,96+144(%rsp)
|
---|
1104 | .Lprologue_avx:
|
---|
1105 | ___
|
---|
1106 | $code.=<<___;
|
---|
1107 | vzeroall
|
---|
1108 | mov $in0,%r12 # reassign arguments
|
---|
1109 | mov $out,%r13
|
---|
1110 | mov $len,%r14
|
---|
1111 | lea 112($key),%r15 # size optimization
|
---|
1112 | vmovdqu ($ivp),$iv # load IV
|
---|
1113 | mov $ivp,88(%rsp) # save $ivp
|
---|
1114 | ___
|
---|
1115 | ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
|
---|
1116 | my $rounds="${ivp}d";
|
---|
1117 | $code.=<<___;
|
---|
1118 | shl \$6,$len
|
---|
1119 | sub $in0,$out
|
---|
1120 | mov 240-112($key),$rounds
|
---|
1121 | add $inp,$len # end of input
|
---|
1122 |
|
---|
1123 | lea K_XX_XX(%rip),$K_XX_XX
|
---|
1124 | mov 0($ctx),$A # load context
|
---|
1125 | mov 4($ctx),$B
|
---|
1126 | mov 8($ctx),$C
|
---|
1127 | mov 12($ctx),$D
|
---|
1128 | mov $B,@T[0] # magic seed
|
---|
1129 | mov 16($ctx),$E
|
---|
1130 | mov $C,@T[1]
|
---|
1131 | xor $D,@T[1]
|
---|
1132 | and @T[1],@T[0]
|
---|
1133 |
|
---|
1134 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
|
---|
1135 | vmovdqa 0($K_XX_XX),$Kx # K_00_19
|
---|
1136 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
|
---|
1137 | vmovdqu 16($inp),@X[-3&7]
|
---|
1138 | vmovdqu 32($inp),@X[-2&7]
|
---|
1139 | vmovdqu 48($inp),@X[-1&7]
|
---|
1140 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
|
---|
1141 | add \$64,$inp
|
---|
1142 | vpshufb @X[2],@X[-3&7],@X[-3&7]
|
---|
1143 | vpshufb @X[2],@X[-2&7],@X[-2&7]
|
---|
1144 | vpshufb @X[2],@X[-1&7],@X[-1&7]
|
---|
1145 | vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
|
---|
1146 | vpaddd $Kx,@X[-3&7],@X[1]
|
---|
1147 | vpaddd $Kx,@X[-2&7],@X[2]
|
---|
1148 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
|
---|
1149 | vmovdqa @X[1],16(%rsp)
|
---|
1150 | vmovdqa @X[2],32(%rsp)
|
---|
1151 | vmovups -112($key),$rndkey[1] # $key[0]
|
---|
1152 | vmovups 16-112($key),$rndkey[0] # forward reference
|
---|
1153 | jmp .Loop_avx
|
---|
1154 | ___
|
---|
1155 |
|
---|
# Generator closure: emits ONE AES-CBC encrypt step per invocation,
# driven by the global instruction counter $r.  Ten calls (slot
# $k = $r%10 = 0..9) cover one 16-byte block $n = $r/10:
#   k==0  : load plaintext block $n, XOR with round key 0, store the
#           previous ciphertext block (if any), CBC-chain into $iv
#           and start round 1;
#   k==1-8: one vaesenc round each, prefetching the next round key;
#   k==9  : finish the key schedule -- conditionally runs the two
#           (AES-192) or four (AES-256) extra rounds depending on
#           $rounds, then vaesenclast and reload of round keys 0/1
#           for the next block.
# NOTE(review): assumes $rounds was loaded from the 240-byte offset
# of the key schedule earlier in the prologue -- confirm at call site.
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);	# block index, round slot within block
    if ($k==0) {
      $code.=<<___;
	vmovdqu		`16*$n`($in0),$in		# load input
	vpxor		$rndkey[1],$in,$in
___
      $code.=<<___ if ($n);
	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	vpxor		$in,$iv,$iv
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;			# unique suffix for the local label
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
	je		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
	vaesenclast	$rndkey[0],$iv,$iv
	vmovups		-112($key),$rndkey[0]
	vmovups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    }
  $r++;	unshift(@rndkey,pop(@rndkey));	# advance slot, rotate key regs
};
|
---|
1199 |
|
---|
# Message-schedule update for SHA-1 rounds 16..31 (AVX path).
# Computes four new schedule dwords
#	X[i] = rol(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1)
# into @X[0], stores X[]+K on the stack for the integer rounds, and
# interleaves the vector work with ~40 scalar round-body instructions
# pulled from the $body generator.  The interleaving order is part of
# the scheduling and must not be rearranged.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

  eval(shift(@insns));
  eval(shift(@insns));
  &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
  eval(shift(@insns));
  eval(shift(@insns));

  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
  eval(shift(@insns));
  eval(shift(@insns));
  &vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
  eval(shift(@insns));
  eval(shift(@insns));
  &vpxor	(@X[0],@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
  eval(shift(@insns));
  eval(shift(@insns));

  &vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));

  &vpxor	(@X[0],@X[0],@Tx[0]);	# "X[0]"^="X[-3]"^"X[-8]"
  eval(shift(@insns));
  eval(shift(@insns));
  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
  eval(shift(@insns));
  eval(shift(@insns));

  &vpsrld	(@Tx[0],@X[0],31);
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));

  &vpslldq(@Tx[1],@X[0],12);		# "X[0]"<<96, extract one dword
  &vpaddd	(@X[0],@X[0],@X[0]);
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));

  &vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
  &vpsrld	(@Tx[0],@Tx[1],30);
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));

  &vpslld	(@Tx[1],@Tx[1],2);
  &vpxor	(@X[0],@X[0],@Tx[0]);
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));

  &vpxor	(@X[0],@X[0],@Tx[1]);	# "X[0]"^=("X[0]">>96)<<<2
  eval(shift(@insns));
  eval(shift(@insns));
  # every fifth schedule update advances to the next round constant
  &vmovdqa	($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
  eval(shift(@insns));
  eval(shift(@insns));


  foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
|
---|
1274 |
|
---|
# Message-schedule update for SHA-1 rounds 32..79 (AVX path).
# Uses the rol-by-2 shortcut: four dwords of
#	X[i] = rol(X[i-6] ^ X[i-16] ^ X[i-28] ^ X[i-32], 2)
# at once, interleaved with 32 to 48 scalar round-body instructions
# from $body.  The interleaving order is part of the scheduling and
# must not be rearranged.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

  &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
  &vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
  eval(shift(@insns));		# body_20_39
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));		# rol

  &vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
  eval(shift(@insns));
  # skip a slot when a rotate is next (keeps the rotate paired below)
  eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
  &vmovdqa	($Kx,eval(16*($Xi/5))."($K_XX_XX)")	if ($Xi%5==0);
  eval(shift(@insns));		# ror
  eval(shift(@insns));

  &vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
  eval(shift(@insns));		# body_20_39
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));		# rol

  &vpsrld	(@Tx[0],@X[0],30);
  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));		# ror
  eval(shift(@insns));

  &vpslld	(@X[0],@X[0],2);
  eval(shift(@insns));		# body_20_39
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));		# rol
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));		# ror
  eval(shift(@insns));

  &vpor	(@X[0],@X[0],@Tx[0]);	# "X[0]"<<<=2
  eval(shift(@insns));		# body_20_39
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));		# rol
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));		# rol
  eval(shift(@insns));

  foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
|
---|
1333 |
|
---|
# Final schedule transfer (rounds 76..79) plus loop-boundary logic:
# park the last X[]+K group on the stack, then compare $inp against
# $len and jump to the caller-supplied "done" label if all input has
# been consumed; otherwise reload the byte-swap mask and K_00_19,
# fetch the next 64-byte block, and reset $Xi for the next pass.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

  eval(shift(@insns));
  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));

  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

  foreach (@insns) { eval; }		# remaining instructions

  &cmp	($inp,$len);
  &je	(shift);			# caller's "done" label

  &vmovdqa(@Tx[1],"64($K_XX_XX)");	# pbswap mask
  &vmovdqa($Kx,"0($K_XX_XX)");		# K_00_19
  &vmovdqu(@X[-4&7],"0($inp)");		# load input
  &vmovdqu(@X[-3&7],"16($inp)");
  &vmovdqu(@X[-2&7],"32($inp)");
  &vmovdqu(@X[-1&7],"48($inp)");
  &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]);	# byte swap
  &add	($inp,64);

  $Xi=0;
}
|
---|
1365 |
|
---|
# Start-of-block rounds (AVX path): byte-swap one freshly loaded
# input xmm, add the current round constant, and stash X[]+K on the
# stack for the integer rounds -- interleaved with 32 scalar
# round-body instructions.  Called three times per block right after
# Xuplast_avx_80 has loaded the next 64 bytes (the fourth xmm was
# already swapped there).
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

  eval(shift(@insns));
  eval(shift(@insns));
  &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
  eval(shift(@insns));
  eval(shift(@insns));
  &vpaddd	(@Tx[0],@X[($Xi-4)&7],$Kx);
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]);	# X[]+K xfer to IALU
  eval(shift(@insns));
  eval(shift(@insns));

  foreach (@insns) { eval; }
  $Xi++;
}
|
---|
1389 |
|
---|
sub Xtail_avx()
{ use integer;
  # Drain the tail of the round schedule: collect four rounds' worth
  # of instruction snippets from the round-body generator and emit
  # them all, with no message-schedule work interleaved.
  my $gen = shift;
  my @queue = map { &$gen } (1..4);	# 32 instructions
  my ($a,$b,$c,$d,$e);			# lexicals the snippets may bind

  for my $snippet (@queue) { eval $snippet; }
}
|
---|
1398 |
|
---|
1399 | $code.=<<___;
|
---|
1400 | .align 32
|
---|
1401 | .Loop_avx:
|
---|
1402 | ___
|
---|
1403 | &Xupdate_avx_16_31(\&body_00_19);
|
---|
1404 | &Xupdate_avx_16_31(\&body_00_19);
|
---|
1405 | &Xupdate_avx_16_31(\&body_00_19);
|
---|
1406 | &Xupdate_avx_16_31(\&body_00_19);
|
---|
1407 | &Xupdate_avx_32_79(\&body_00_19);
|
---|
1408 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1409 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1410 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1411 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1412 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1413 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1414 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1415 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1416 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1417 | &Xupdate_avx_32_79(\&body_40_59);
|
---|
1418 | &Xupdate_avx_32_79(\&body_20_39);
|
---|
1419 | &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done"
|
---|
1420 |
|
---|
1421 | $saved_j=$j; @saved_V=@V;
|
---|
1422 | $saved_r=$r; @saved_rndkey=@rndkey;
|
---|
1423 |
|
---|
1424 | &Xloop_avx(\&body_20_39);
|
---|
1425 | &Xloop_avx(\&body_20_39);
|
---|
1426 | &Xloop_avx(\&body_20_39);
|
---|
1427 |
|
---|
1428 | $code.=<<___;
|
---|
1429 | vmovups $iv,48($out,$in0) # write output
|
---|
1430 | lea 64($in0),$in0
|
---|
1431 |
|
---|
1432 | add 0($ctx),$A # update context
|
---|
1433 | add 4($ctx),@T[0]
|
---|
1434 | add 8($ctx),$C
|
---|
1435 | add 12($ctx),$D
|
---|
1436 | mov $A,0($ctx)
|
---|
1437 | add 16($ctx),$E
|
---|
1438 | mov @T[0],4($ctx)
|
---|
1439 | mov @T[0],$B # magic seed
|
---|
1440 | mov $C,8($ctx)
|
---|
1441 | mov $C,@T[1]
|
---|
1442 | mov $D,12($ctx)
|
---|
1443 | xor $D,@T[1]
|
---|
1444 | mov $E,16($ctx)
|
---|
1445 | and @T[1],@T[0]
|
---|
1446 | jmp .Loop_avx
|
---|
1447 |
|
---|
1448 | .Ldone_avx:
|
---|
1449 | ___
|
---|
1450 | $jj=$j=$saved_j; @V=@saved_V;
|
---|
1451 | $r=$saved_r; @rndkey=@saved_rndkey;
|
---|
1452 |
|
---|
1453 | &Xtail_avx(\&body_20_39);
|
---|
1454 | &Xtail_avx(\&body_20_39);
|
---|
1455 | &Xtail_avx(\&body_20_39);
|
---|
1456 |
|
---|
1457 | $code.=<<___;
|
---|
1458 | vmovups $iv,48($out,$in0) # write output
|
---|
1459 | mov 88(%rsp),$ivp # restore $ivp
|
---|
1460 |
|
---|
1461 | add 0($ctx),$A # update context
|
---|
1462 | add 4($ctx),@T[0]
|
---|
1463 | add 8($ctx),$C
|
---|
1464 | mov $A,0($ctx)
|
---|
1465 | add 12($ctx),$D
|
---|
1466 | mov @T[0],4($ctx)
|
---|
1467 | add 16($ctx),$E
|
---|
1468 | mov $C,8($ctx)
|
---|
1469 | mov $D,12($ctx)
|
---|
1470 | mov $E,16($ctx)
|
---|
1471 | vmovups $iv,($ivp) # write IV
|
---|
1472 | vzeroall
|
---|
1473 | ___
|
---|
1474 | $code.=<<___ if ($win64);
|
---|
1475 | movaps 96+0(%rsp),%xmm6
|
---|
1476 | movaps 96+16(%rsp),%xmm7
|
---|
1477 | movaps 96+32(%rsp),%xmm8
|
---|
1478 | movaps 96+48(%rsp),%xmm9
|
---|
1479 | movaps 96+64(%rsp),%xmm10
|
---|
1480 | movaps 96+80(%rsp),%xmm11
|
---|
1481 | movaps 96+96(%rsp),%xmm12
|
---|
1482 | movaps 96+112(%rsp),%xmm13
|
---|
1483 | movaps 96+128(%rsp),%xmm14
|
---|
1484 | movaps 96+144(%rsp),%xmm15
|
---|
1485 | ___
|
---|
1486 | $code.=<<___;
|
---|
1487 | lea `104+($win64?10*16:0)`(%rsp),%rsi
|
---|
1488 | .cfi_def_cfa %rsi,56
|
---|
1489 | mov 0(%rsi),%r15
|
---|
1490 | .cfi_restore %r15
|
---|
1491 | mov 8(%rsi),%r14
|
---|
1492 | .cfi_restore %r14
|
---|
1493 | mov 16(%rsi),%r13
|
---|
1494 | .cfi_restore %r13
|
---|
1495 | mov 24(%rsi),%r12
|
---|
1496 | .cfi_restore %r12
|
---|
1497 | mov 32(%rsi),%rbp
|
---|
1498 | .cfi_restore %rbp
|
---|
1499 | mov 40(%rsi),%rbx
|
---|
1500 | .cfi_restore %rbx
|
---|
1501 | lea 48(%rsi),%rsp
|
---|
1502 | .cfi_def_cfa %rsp,8
|
---|
1503 | .Lepilogue_avx:
|
---|
1504 | ret
|
---|
1505 | .cfi_endproc
|
---|
1506 | .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
|
---|
1507 | ___
|
---|
1508 |
|
---|
1509 | if ($stitched_decrypt) {{{
|
---|
1510 | # reset
|
---|
1511 | ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
|
---|
1512 |
|
---|
1513 | $j=$jj=$r=$rx=0;
|
---|
1514 | $Xi=4;
|
---|
1515 |
|
---|
1516 | @aes256_dec = (
|
---|
1517 | '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
|
---|
1518 | '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
|
---|
1519 | '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
|
---|
1520 | '&vpxor ($inout3,$rndkey0,"0x30($in0)");',
|
---|
1521 |
|
---|
1522 | '&vmovups($rndkey0,"16-112($key)");',
|
---|
1523 | '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3]
|
---|
1524 | undef,undef
|
---|
1525 | );
|
---|
1526 | for ($i=0;$i<13;$i++) {
|
---|
1527 | push (@aes256_dec,(
|
---|
1528 | '&vaesdec ($inout0,$inout0,$rndkey0);',
|
---|
1529 | '&vaesdec ($inout1,$inout1,$rndkey0);',
|
---|
1530 | '&vaesdec ($inout2,$inout2,$rndkey0);',
|
---|
1531 | '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
|
---|
1532 | ));
|
---|
1533 | push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
|
---|
1534 | push (@aes256_dec,(undef,undef)) if ($i==5);
|
---|
1535 | }
|
---|
1536 | push(@aes256_dec,(
|
---|
1537 | '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");',
|
---|
1538 | '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");',
|
---|
1539 | '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");',
|
---|
1540 | '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");',
|
---|
1541 |
|
---|
1542 | '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");',
|
---|
1543 | '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);',
|
---|
1544 | '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);',
|
---|
1545 | '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);',
|
---|
1546 |
|
---|
1547 | '&vmovups ("0x30($out,$in0)",$inout3);'
|
---|
1548 | ));
|
---|
1549 |
|
---|
1550 | $code.=<<___;
|
---|
1551 | .type aesni256_cbc_sha1_dec_avx,\@function,6
|
---|
1552 | .align 32
|
---|
1553 | aesni256_cbc_sha1_dec_avx:
|
---|
1554 | .cfi_startproc
|
---|
1555 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument
|
---|
1556 | push %rbx
|
---|
1557 | .cfi_push %rbx
|
---|
1558 | push %rbp
|
---|
1559 | .cfi_push %rbp
|
---|
1560 | push %r12
|
---|
1561 | .cfi_push %r12
|
---|
1562 | push %r13
|
---|
1563 | .cfi_push %r13
|
---|
1564 | push %r14
|
---|
1565 | .cfi_push %r14
|
---|
1566 | push %r15
|
---|
1567 | .cfi_push %r15
|
---|
1568 | lea `-104-($win64?10*16:0)`(%rsp),%rsp
|
---|
1569 | .cfi_adjust_cfa_offset `104+($win64?10*16:0)`
|
---|
1570 | ___
|
---|
1571 | $code.=<<___ if ($win64);
|
---|
1572 | movaps %xmm6,96+0(%rsp)
|
---|
1573 | movaps %xmm7,96+16(%rsp)
|
---|
1574 | movaps %xmm8,96+32(%rsp)
|
---|
1575 | movaps %xmm9,96+48(%rsp)
|
---|
1576 | movaps %xmm10,96+64(%rsp)
|
---|
1577 | movaps %xmm11,96+80(%rsp)
|
---|
1578 | movaps %xmm12,96+96(%rsp)
|
---|
1579 | movaps %xmm13,96+112(%rsp)
|
---|
1580 | movaps %xmm14,96+128(%rsp)
|
---|
1581 | movaps %xmm15,96+144(%rsp)
|
---|
1582 | .Lprologue_dec_avx:
|
---|
1583 | ___
|
---|
1584 | $code.=<<___;
|
---|
1585 | vzeroall
|
---|
1586 | mov $in0,%r12 # reassign arguments
|
---|
1587 | mov $out,%r13
|
---|
1588 | mov $len,%r14
|
---|
1589 | lea 112($key),%r15 # size optimization
|
---|
1590 | vmovdqu ($ivp),@X[3] # load IV
|
---|
1591 | ___
|
---|
1592 | ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
|
---|
1593 | $code.=<<___;
|
---|
1594 | shl \$6,$len
|
---|
1595 | sub $in0,$out
|
---|
1596 | add $inp,$len # end of input
|
---|
1597 |
|
---|
1598 | lea K_XX_XX(%rip),$K_XX_XX
|
---|
1599 | mov 0($ctx),$A # load context
|
---|
1600 | mov 4($ctx),$B
|
---|
1601 | mov 8($ctx),$C
|
---|
1602 | mov 12($ctx),$D
|
---|
1603 | mov $B,@T[0] # magic seed
|
---|
1604 | mov 16($ctx),$E
|
---|
1605 | mov $C,@T[1]
|
---|
1606 | xor $D,@T[1]
|
---|
1607 | and @T[1],@T[0]
|
---|
1608 |
|
---|
1609 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
|
---|
1610 | vmovdqa 0($K_XX_XX),$Kx # K_00_19
|
---|
1611 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
|
---|
1612 | vmovdqu 16($inp),@X[-3&7]
|
---|
1613 | vmovdqu 32($inp),@X[-2&7]
|
---|
1614 | vmovdqu 48($inp),@X[-1&7]
|
---|
1615 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
|
---|
1616 | add \$64,$inp
|
---|
1617 | vpshufb @X[2],@X[-3&7],@X[-3&7]
|
---|
1618 | vpshufb @X[2],@X[-2&7],@X[-2&7]
|
---|
1619 | vpshufb @X[2],@X[-1&7],@X[-1&7]
|
---|
1620 | vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
|
---|
1621 | vpaddd $Kx,@X[-3&7],@X[1]
|
---|
1622 | vpaddd $Kx,@X[-2&7],@X[2]
|
---|
1623 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
|
---|
1624 | vmovdqa @X[1],16(%rsp)
|
---|
1625 | vmovdqa @X[2],32(%rsp)
|
---|
1626 | vmovups -112($key),$rndkey0 # $key[0]
|
---|
1627 | jmp .Loop_dec_avx
|
---|
1628 |
|
---|
1629 | .align 32
|
---|
1630 | .Loop_dec_avx:
|
---|
1631 | ___
|
---|
1632 | &Xupdate_avx_16_31(\&body_00_19_dec);
|
---|
1633 | &Xupdate_avx_16_31(\&body_00_19_dec);
|
---|
1634 | &Xupdate_avx_16_31(\&body_00_19_dec);
|
---|
1635 | &Xupdate_avx_16_31(\&body_00_19_dec);
|
---|
1636 | &Xupdate_avx_32_79(\&body_00_19_dec);
|
---|
1637 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1638 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1639 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1640 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1641 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1642 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1643 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1644 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1645 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1646 | &Xupdate_avx_32_79(\&body_40_59_dec);
|
---|
1647 | &Xupdate_avx_32_79(\&body_20_39_dec);
|
---|
1648 | &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done"
|
---|
1649 |
|
---|
1650 | $saved_j=$j; @saved_V=@V;
|
---|
1651 | $saved_rx=$rx;
|
---|
1652 |
|
---|
1653 | &Xloop_avx(\&body_20_39_dec);
|
---|
1654 | &Xloop_avx(\&body_20_39_dec);
|
---|
1655 | &Xloop_avx(\&body_20_39_dec);
|
---|
1656 |
|
---|
1657 | eval(@aes256_dec[-1]); # last store
|
---|
1658 | $code.=<<___;
|
---|
1659 | lea 64($in0),$in0
|
---|
1660 |
|
---|
1661 | add 0($ctx),$A # update context
|
---|
1662 | add 4($ctx),@T[0]
|
---|
1663 | add 8($ctx),$C
|
---|
1664 | add 12($ctx),$D
|
---|
1665 | mov $A,0($ctx)
|
---|
1666 | add 16($ctx),$E
|
---|
1667 | mov @T[0],4($ctx)
|
---|
1668 | mov @T[0],$B # magic seed
|
---|
1669 | mov $C,8($ctx)
|
---|
1670 | mov $C,@T[1]
|
---|
1671 | mov $D,12($ctx)
|
---|
1672 | xor $D,@T[1]
|
---|
1673 | mov $E,16($ctx)
|
---|
1674 | and @T[1],@T[0]
|
---|
1675 | jmp .Loop_dec_avx
|
---|
1676 |
|
---|
1677 | .Ldone_dec_avx:
|
---|
1678 | ___
|
---|
1679 | $jj=$j=$saved_j; @V=@saved_V;
|
---|
1680 | $rx=$saved_rx;
|
---|
1681 |
|
---|
1682 | &Xtail_avx(\&body_20_39_dec);
|
---|
1683 | &Xtail_avx(\&body_20_39_dec);
|
---|
1684 | &Xtail_avx(\&body_20_39_dec);
|
---|
1685 |
|
---|
1686 | eval(@aes256_dec[-1]); # last store
|
---|
1687 | $code.=<<___;
|
---|
1688 |
|
---|
1689 | add 0($ctx),$A # update context
|
---|
1690 | add 4($ctx),@T[0]
|
---|
1691 | add 8($ctx),$C
|
---|
1692 | mov $A,0($ctx)
|
---|
1693 | add 12($ctx),$D
|
---|
1694 | mov @T[0],4($ctx)
|
---|
1695 | add 16($ctx),$E
|
---|
1696 | mov $C,8($ctx)
|
---|
1697 | mov $D,12($ctx)
|
---|
1698 | mov $E,16($ctx)
|
---|
1699 | vmovups @X[3],($ivp) # write IV
|
---|
1700 | vzeroall
|
---|
1701 | ___
|
---|
1702 | $code.=<<___ if ($win64);
|
---|
1703 | movaps 96+0(%rsp),%xmm6
|
---|
1704 | movaps 96+16(%rsp),%xmm7
|
---|
1705 | movaps 96+32(%rsp),%xmm8
|
---|
1706 | movaps 96+48(%rsp),%xmm9
|
---|
1707 | movaps 96+64(%rsp),%xmm10
|
---|
1708 | movaps 96+80(%rsp),%xmm11
|
---|
1709 | movaps 96+96(%rsp),%xmm12
|
---|
1710 | movaps 96+112(%rsp),%xmm13
|
---|
1711 | movaps 96+128(%rsp),%xmm14
|
---|
1712 | movaps 96+144(%rsp),%xmm15
|
---|
1713 | ___
|
---|
1714 | $code.=<<___;
|
---|
1715 | lea `104+($win64?10*16:0)`(%rsp),%rsi
|
---|
1716 | .cfi_def_cfa %rsi,56
|
---|
1717 | mov 0(%rsi),%r15
|
---|
1718 | .cfi_restore %r15
|
---|
1719 | mov 8(%rsi),%r14
|
---|
1720 | .cfi_restore %r14
|
---|
1721 | mov 16(%rsi),%r13
|
---|
1722 | .cfi_restore %r13
|
---|
1723 | mov 24(%rsi),%r12
|
---|
1724 | .cfi_restore %r12
|
---|
1725 | mov 32(%rsi),%rbp
|
---|
1726 | .cfi_restore %rbp
|
---|
1727 | mov 40(%rsi),%rbx
|
---|
1728 | .cfi_restore %rbx
|
---|
1729 | lea 48(%rsi),%rsp
|
---|
1730 | .cfi_def_cfa %rsp,8
|
---|
1731 | .Lepilogue_dec_avx:
|
---|
1732 | ret
|
---|
1733 | .cfi_endproc
|
---|
1734 | .size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
|
---|
1735 | ___
|
---|
1736 | }}}
|
---|
1737 | }
|
---|
# SHA-1 round constants (broadcast 4x for SIMD), the pshufb byte-swap
# mask, and a byte-reversal table, shared by all code paths and
# addressed as K_XX_XX(%rip) plus fixed offsets (e.g. +0x50 below).
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
|
---|
# SHA-NI (SHA extensions) code path: generates aesni_cbc_sha1_enc_shaext,
# which interleaves AES-CBC encryption with hardware sha1rnds4/sha1nexte/
# sha1msg1/sha1msg2 rounds, processing one 64-byte SHA-1 block per
# iteration of .Loop_shaext.
if ($shaext) {{{
# Argument registers per the unified calling convention used by this file.
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";

($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
$r=0;

# SHA-1 state lives in ABCD/E; *_SAVE hold the previous chaining value,
# @MSG is the rotating 4-register message schedule window.
my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.type	aesni_cbc_sha1_enc_shaext,\@function,6
.align	32
aesni_cbc_sha1_enc_shaext:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
# NOTE(review): the saves below address the frame via %rax, which must
# hold the entry-time %rsp (the epilogue restores with "mov %rax,%rsp"
# and the SEH handler unwinds by 168 = 8+10*16 bytes).  No
# "mov %rsp,%rax" is visible in this chunk — confirm it is emitted
# before this point.
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0x50(%rip),$BSWAP	# byte-n-word swap

	mov	240($key),$rounds
	sub	$in0,$out
	movups	($key),$rndkey0		# $key[0]
	movups	($ivp),$iv		# load IV
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization

	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	pshufd	\$0b00011011,$E,$E	# flip word order
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
___
# &$aesenc() is defined earlier in the file (not visible in this chunk);
# each call presumably emits one AES-CBC round interleaved into the SHA-1
# stream and advances $r — TODO confirm against its definition.
	&$aesenc();
$code.=<<___;
	movdqu	($inp),@MSG[0]
	movdqa	$E,$E_SAVE		# offload $E
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqa	$ABCD,$ABCD_SAVE	# offload $ABCD
___
	&$aesenc();
$code.=<<___;
	pshufb	$BSWAP,@MSG[1]

	paddd	@MSG[0],$E
	movdqu	0x20($inp),@MSG[2]
	lea	0x40($inp),$inp
	pxor	$E_SAVE,@MSG[0]		# black magic
___
	&$aesenc();
# Note: the second "pxor $E_SAVE,@MSG[0]" below undoes the one above;
# the pair is present in the original ("black magic") and preserved.
$code.=<<___;
	pxor	$E_SAVE,@MSG[0]		# black magic
	movdqa	$ABCD,$E_
	pshufb	$BSWAP,@MSG[2]
	sha1rnds4	\$0,$E,$ABCD	# 0-3
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqu	-0x10($inp),@MSG[3]
	movdqa	$ABCD,$E
	pshufb	$BSWAP,@MSG[3]
___
	&$aesenc();
$code.=<<___;
	sha1rnds4	\$0,$E_,$ABCD	# 4-7
	sha1nexte	@MSG[2],$E
	pxor	@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
___
	&$aesenc();

# Middle rounds 8..63: 14 four-round groups.  The sha1rnds4 immediate
# int($i/5) selects the round function/constant group; $E/$E_ ping-pong
# and @MSG rotates one register per group.
for($i=2;$i<20-4;$i++) {
$code.=<<___;
	movdqa	$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 8-11
	sha1nexte	@MSG[3],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg2	@MSG[3],@MSG[0]
	pxor	@MSG[3],@MSG[1]
	sha1msg1	@MSG[3],@MSG[2]
___
	($E,$E_)=($E_,$E);
	push(@MSG,shift(@MSG));

	&$aesenc();
}
# Final four groups (rounds 64-79), then fold in the saved chaining value.
$code.=<<___;
	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD	# 64-67
	sha1nexte	@MSG[3],$E_
	sha1msg2	@MSG[3],@MSG[0]
	pxor	@MSG[3],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD	# 68-71
	sha1nexte	@MSG[0],$E
	sha1msg2	@MSG[0],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa	$E_SAVE,@MSG[0]
	movdqa	$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD	# 72-75
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	movdqa	$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD	# 76-79
	sha1nexte	$MSG[0],$E
___
	while($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	dec	$len

	paddd	$ABCD_SAVE,$ABCD
	movups	$iv,48($out,$in0)	# write output
	lea	64($in0),$in0
	jnz	.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movups	$iv,($ivp)		# write IV
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
# Win64 epilogue: restore the non-volatile XMM registers saved above.
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
}}}
|
---|
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# Win64 SEH unwind handler shared by the SSSE3, AVX and SHAEXT entry
# points.  Using the prologue/epilogue labels in HandlerData[] it decides
# whether the faulting RIP is inside the protected body; if so it copies
# the saved XMM registers back into the CONTEXT, recovers the saved
# non-volatile GPRs, and hands off to RtlVirtualUnwind.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
___
# The shaext entry point uses a different frame layout (10 XMM saves
# directly below the return address, 168-byte frame), so it is detected
# by code address and unwound on a dedicated path.
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha1_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lseh_no_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lcommon_seh_tail
.Lseh_no_shaext:
___
$code.=<<___;
	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
# .pdata ranges for the optional entry points, emitted only when the
# corresponding code path was generated.
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
# .xdata: one UNWIND_INFO record per entry point; all share ssse3_handler
# and pass their prologue/epilogue labels as HandlerData[].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}
|
---|
2076 |
|
---|
2077 | ####################################################################
|
---|
# Prepend a REX prefix byte to the opcode byte list (passed by reference)
# when either operand is an extended register (xmm8-15).  Emits nothing
# for the xmm0-7-only case, where no prefix is required.
sub rex {
    my ($opcode, $dst, $src) = @_;

    my $rex = 0;
    $rex |= 0x04 if $dst >= 8;	# REX.R
    $rex |= 0x01 if $src >= 8;	# REX.B
    unshift @$opcode, 0x40 | $rex if $rex;
}
|
---|
2087 |
|
---|
# Assemble "sha1rnds4 $imm,%xmmS,%xmmD" into raw .byte form (opcode
# 0f 3a cc, ModR/M, imm8) for assemblers that predate the SHA-NI
# mnemonics; anything the regex cannot parse falls through unchanged.
#
# Input:   $_[0] — the instruction's operand string, e.g. '$3,%xmm1,%xmm2'.
# Returns: a ".byte\t..." directive, or "sha1rnds4\t<operands>" verbatim.
#
# Fix: use scalar element $_[0] instead of the one-element slice @_[0],
# which draws "Scalar value @_[0] better written as $_[0]" under warnings.
sub sha1rnds4 {
    if ($_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$3,$2);		# REX prefix if either register is xmm8-15
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	# immediate may be decimal, octal ("0...") or hex ("0x...")
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$_[0];
    }
}
|
---|
2100 |
|
---|
# Assemble the two-operand SHA-NI instructions (sha1nexte/sha1msg1/
# sha1msg2, opcode map 0f 38) into raw .byte form for old assemblers.
#
# Inputs:  $instr — mnemonic; $_[0] (after shift) — operand string,
#          e.g. '%xmm1,%xmm2'.
# Returns: a ".byte\t..." directive, or "<mnemonic>\t<operands>" verbatim
#          for unknown mnemonics or non-register operands.
#
# Fix: use scalar element $_[0] instead of the one-element slice @_[0]
# (warns "Scalar value @_[0] better written as $_[0]" under warnings);
# applied to both the match and the fallback concatenation.
sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);		# REX prefix if either register is xmm8-15
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
|
---|
2118 |
|
---|
# Assemble register-to-register AES-NI instructions (aesenc/aesenclast/
# aesdec/aesdeclast) into raw .byte form (66 0f 38 <op> ModR/M) for
# assemblers without AES-NI support.  Returns undef for an aes* mnemonic
# it does not know, and the line untouched when the operands are not a
# plain xmm,xmm pair.
sub aesni {
    my $line = shift;
    my @bytes = (0x0f,0x38);

    if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($op,$src,$dst) = ($1,$2,$3);
	my %op_byte = (
	    "aesenc" => 0xdc,	"aesenclast" => 0xdd,
	    "aesdec" => 0xde,	"aesdeclast" => 0xdf,
	);
	return undef unless defined $op_byte{$op};
	rex(\@bytes,$dst,$src);			# REX if either reg is xmm8-15
	push @bytes, $op_byte{$op}, 0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	unshift @bytes, 0x66;			# mandatory operand-size prefix
	return ".byte\t".join(',',@bytes);
    }
    return $line;
}
|
---|
2136 |
|
---|
# Final pass: expand `...` interpolations, rewrite SHA-NI/AES-NI
# mnemonics into .byte sequences for old assemblers (first matching
# rewrite wins), and stream the result to stdout.
for my $asm_line (split /\n/, $code) {
    $asm_line =~ s/\`([^\`]*)\`/eval $1/ge;

    $asm_line =~ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/ge
	or $asm_line =~ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/ge
	or $asm_line =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/ge;

    print $asm_line, "\n";
}
close STDOUT or die "error closing STDOUT: $!";
|
---|