VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/aes/asm/aesni-x86_64.pl@ 91772

Last change on this file since 91772 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

File size: 126.8 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
21#
22# Performance.
23#
24# Given aes(enc|dec) instructions' latency asymptotic performance for
25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with 128-bit key. And given their throughput asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being
28# asymptotic limit it's not something you commonly achieve in reality,
29# but how close does one get? Below are results collected for
30# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
33# 16-byte 64-byte 256-byte 1-KB 8-KB
34# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
35# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
36# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
37# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
38# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
39# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with specially crafted speed.c benchmark
45# in order to compare them with results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes in points *not* represented in the above table.
50#
51# Looking at the results for 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because implementation
54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the most optimal way to go.
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to AES unit the way it's done in CBC mode. There is
59# nothing one can do and the result appears optimal. CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
62# interleaved with CBC-MAC. This provides ~30% improvement over
63# "straightforward" CCM implementation with CTR and CBC-MAC performed
64# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# CTR curve doesn't follow this pattern and is "slowest" changing one
72# with "256-byte" result being 87% of "8-KB." This is because overhead
73# in CTR mode is most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82# are a tad smaller, because the above mentioned penalty biases all
83# results by same constant value. In similar way function call
84# overhead affects small-block performance, as well as OFB and CFB
85# results. Differences are not large, most common coefficients are
86# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
87# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
88
89# January 2011
90#
91# While Westmere processor features 6 cycles latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge. Therefore this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
116#
117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In which case asymptotic limit for such modes
120# can be obtained by dividing above mentioned numbers by AES
121# instructions' interleave factor. Westmere can execute at most 3
122# instructions at a time, meaning that optimal interleave factor is 3,
123# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increase of interleave factor does
125# not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying
128# interleave factor affect the performance? Here is table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor 3x 6x 8x
132# theoretical asymptotic limit 1.67 0.83 0.625
133# measured performance for 8KB block 1.05 0.86 0.84
134#
135# "as if" interleave factor 4.7x 5.8x 6.0x
136#
137# Further data for other parallelizable modes:
138#
139# CBC decrypt 1.16 0.93 0.74
140# CTR 1.14 0.91 0.74
141#
142# Well, given 3x column it's probably inappropriate to call the limit
143# asymptotic, if it can be surpassed, isn't it? What happens there?
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. Processor overlaps not only the
146# additional instructions with AES ones, but even AES instructions
147# processing adjacent triplets of independent blocks. In the 6x case
148# additional instructions still claim disproportionally small amount
149# of additional cycles, but in 8x case number of instructions must be
150# a tad too high for out-of-order logic to cope with, and AES unit
151# remains underutilized... As you can see 8x interleave is hardly
152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153# utilizes 6x interleave because of limited register bank capacity.
154#
155# Higher interleave factors do have negative impact on Westmere
156# performance. While for ECB mode it's negligible ~1.5%, other
157# parallelizables perform ~5% worse, which is outweighed by ~25%
158# improvement on Sandy Bridge. To balance regression on Westmere
159# CTR mode was implemented with 6x aesenc interleave factor.
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
172######################################################################
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
176# CBC en-/decrypt CTR XTS ECB OCB
177# Westmere 3.77/1.25 1.25 1.25 1.26
178# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
179# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
180# Skylake 2.62/0.63 0.63 0.63 0.63
181# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
182# Knights L 2.54/0.77 0.78 0.85 - 1.50
183# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
184# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
185# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
186#
187# (*) Atom Silvermont ECB result is suboptimal because of penalties
188# incurred by operations on %xmm8-15. As ECB is not considered
189# critical, nothing was done to mitigate the problem.
190
191$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
192 # generates drop-in replacement for
193 # crypto/aes/asm/aes-x86_64.pl:-)
194
# Command line: [flavour] output. If the first argument contains a dot it
# is taken to be the output file name and $flavour is left undefined.
195$flavour = shift;
196$output = shift;
197if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
198
# Win64 conventions are selected for nasm/masm/mingw64 flavours, or when
# the output file name ends in ".asm".
199$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
200
# Locate x86_64-xlate.pl: first in this script's own directory, then in
# ../../perlasm/ relative to it.
201$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
202( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
203( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
204die "can't locate x86_64-xlate.pl";
205
# Pipe everything printed to STDOUT through the translator. Fail loudly if
# the child cannot be started; otherwise the generated code would be lost
# without any diagnostic.
206open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
207*STDOUT=*OUT;
208
# Mnemonic used for every round-key load in the emitted code. Both arms of
# the conditional currently yield "movups", so the $PREFIX test has no
# effect on the result.
209$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
# First four integer argument registers for the selected calling
# convention; used by the blocks that emit \@abi-omnipotent entry points.
210@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
211 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
212
# $code accumulates the assembly text appended by the sections below.
213$code=".text\n";
214$code.=".extern OPENSSL_ia32cap_P\n";
215
216$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
217# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
218$inp="%rdi";
219$out="%rsi";
220$len="%rdx";
221$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
222$ivp="%r8"; # cbc, ctr, ...
223
224$rnds_="%r10d"; # backup copy for $rounds
225$key_="%r11"; # backup copy for $key
226
227# %xmm register layout
228$rndkey0="%xmm0"; $rndkey1="%xmm1";
229$inout0="%xmm2"; $inout1="%xmm3";
230$inout2="%xmm4"; $inout3="%xmm5";
231$inout4="%xmm6"; $inout5="%xmm7";
232$inout6="%xmm8"; $inout7="%xmm9";
233
# Note: $in2/$in1/$in0/$iv name the same registers as $inout4..$inout7
# (%xmm6..%xmm9), so the two sets cannot be live at the same time.
234$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
235$in0="%xmm8"; $iv="%xmm9";
236
237
238# Inline version of internal aesni_[en|de]crypt1.
239#
240# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
241# cycles which take care of loop variables...
# $sn is private to aesni_generate1 and counts its expansions, so that
# every inlined copy gets a unique loop label.
242{ my $sn;
# aesni_generate1($p, $key, $rounds [, $inout [, $ivec]])
#
# Appends an inline single-block cipher sequence to $code.
#   $p      - "enc" or "dec"; spliced into the aes${p}/aes${p}last
#             mnemonics and into the loop label
#   $key    - register pointing at the key schedule; the emitted code
#             advances it
#   $rounds - register holding the round counter; the emitted code
#             decrements it to zero
#   $inout  - data register, defaults to $inout0
#   $ivec   - optional register; when given, round key 0 is xor-ed into
#             it first and the result is then xor-ed into $inout
243sub aesni_generate1 {
244my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
245++$sn;
# Load the first two round keys.
246$code.=<<___;
247 $movkey ($key),$rndkey0
248 $movkey 16($key),$rndkey1
249___
# Round-0 whitening, with $ivec folded in when one was supplied.
250$code.=<<___ if (defined($ivec));
251 xorps $rndkey0,$ivec
252 lea 32($key),$key
253 xorps $ivec,$inout
254___
# Round-0 whitening without $ivec.
255$code.=<<___ if (!defined($ivec));
256 lea 32($key),$key
257 xorps $rndkey0,$inout
258___
# One aes${p} per iteration while $rounds counts down, then the final
# aes${p}last with the last key loaded inside the loop.
259$code.=<<___;
260.Loop_${p}1_$sn:
261 aes${p} $rndkey1,$inout
262 dec $rounds
263 $movkey ($key),$rndkey1
264 lea 16($key),$key
265 jnz .Loop_${p}1_$sn # loop body is 16 bytes
266 aes${p}last $rndkey1,$inout
267___
268}}
269# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
270#
# Emits the public single-block entry points ${PREFIX}_encrypt and
# ${PREFIX}_decrypt. Inside this block $inp/$out/$key are lexicals taken
# from the first three entries of @_4args, so they follow the Win64 or Unix
# argument order rather than the file-level defaults; $rounds is still the
# file-level register.
271{ my ($inp,$out,$key) = @_4args;
272
# Entry label, load of the 16-byte input block and of the round count
# stored at offset 240 of the key structure.
273$code.=<<___;
274.globl ${PREFIX}_encrypt
275.type ${PREFIX}_encrypt,\@abi-omnipotent
276.align 16
277${PREFIX}_encrypt:
278.cfi_startproc
279 movups ($inp),$inout0 # load input
280 mov 240($key),$rounds # key->rounds
281___
# Inline the one-block encryption of $inout0.
282 &aesni_generate1("enc",$key,$rounds);
# Store the result and zero the round-key and data registers; then the
# same prologue for the decrypt entry point.
283$code.=<<___;
284 pxor $rndkey0,$rndkey0 # clear register bank
285 pxor $rndkey1,$rndkey1
286 movups $inout0,($out) # output
287 pxor $inout0,$inout0
288 ret
289.cfi_endproc
290.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
291
292.globl ${PREFIX}_decrypt
293.type ${PREFIX}_decrypt,\@abi-omnipotent
294.align 16
295${PREFIX}_decrypt:
296.cfi_startproc
297 movups ($inp),$inout0 # load input
298 mov 240($key),$rounds # key->rounds
299___
# Inline the one-block decryption of $inout0.
300 &aesni_generate1("dec",$key,$rounds);
# Store the result and zero the round-key and data registers.
301$code.=<<___;
302 pxor $rndkey0,$rndkey0 # clear register bank
303 pxor $rndkey1,$rndkey1
304 movups $inout0,($out) # output
305 pxor $inout0,$inout0
306 ret
307.cfi_endproc
308.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
309___
310}
311
312
313# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
314# factor. Why were 3x subroutines originally used in loops? Even though
315# aes[enc|dec] latency was originally 6, it could be scheduled only
316# every *2nd* cycle. Thus 3x interleave was the one providing optimal
317# utilization, i.e. when subroutine's throughput is virtually same as
318# of non-interleaved subroutine [for number of input blocks up to 3].
319# This is why it originally made no sense to implement 2x subroutine.
320# But times change and it became appropriate to spend extra 192 bytes
321# on 2x subroutine on Atom Silvermont account. For processors that
322# can schedule aes[enc|dec] every cycle, the optimal interleave factor
323# equals the corresponding instruction's latency. 8x is optimal for
324# * Bridge and "super-optimal" for other Intel CPUs...
325
# aesni_generate2($dir)
#
# Appends the private routine _aesni_${dir}rypt2 to $code; it processes
# $inout0 and $inout1 in parallel. With $dir of "enc" or "dec" the label
# becomes _aesni_encrypt2 / _aesni_decrypt2.
#
# Loop scheme of the emitted code: $rounds is multiplied by 16, $key is
# advanced by 32 plus that amount, and %rax is negated and used as an
# index that climbs by 32 per iteration until it reaches zero. Each
# iteration applies two rounds (keys in $rndkey1, then $rndkey0). %rax is
# written literally in the template, which matches the file-level
# $rounds="%eax".
326sub aesni_generate2 {
# Lexical $dir ("enc"/"dec") shadows the file-level $dir path variable.
327my $dir=shift;
328# As already mentioned it takes in $key and $rounds, which are *not*
329# preserved. $inout[0-1] is cipher/clear text...
330$code.=<<___;
331.type _aesni_${dir}rypt2,\@abi-omnipotent
332.align 16
333_aesni_${dir}rypt2:
334.cfi_startproc
335 $movkey ($key),$rndkey0
336 shl \$4,$rounds
337 $movkey 16($key),$rndkey1
338 xorps $rndkey0,$inout0
339 xorps $rndkey0,$inout1
340 $movkey 32($key),$rndkey0
341 lea 32($key,$rounds),$key
342 neg %rax # $rounds
343 add \$16,%rax
344
345.L${dir}_loop2:
346 aes${dir} $rndkey1,$inout0
347 aes${dir} $rndkey1,$inout1
348 $movkey ($key,%rax),$rndkey1
349 add \$32,%rax
350 aes${dir} $rndkey0,$inout0
351 aes${dir} $rndkey0,$inout1
352 $movkey -16($key,%rax),$rndkey0
353 jnz .L${dir}_loop2
354
355 aes${dir} $rndkey1,$inout0
356 aes${dir} $rndkey1,$inout1
357 aes${dir}last $rndkey0,$inout0
358 aes${dir}last $rndkey0,$inout1
359 ret
360.cfi_endproc
361.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
362___
363}
# aesni_generate3($dir)
#
# Appends the private routine _aesni_${dir}rypt3 to $code; it processes
# $inout0..$inout2 in parallel. $dir is "enc" or "dec".
#
# The emitted code uses the negative-index scheme: $rounds is multiplied
# by 16, $key is moved past the schedule, and %rax (negated) climbs by 32
# per iteration until zero, two rounds being applied per iteration.
364sub aesni_generate3 {
# Lexical $dir ("enc"/"dec") shadows the file-level $dir path variable.
365my $dir=shift;
366# As already mentioned it takes in $key and $rounds, which are *not*
367# preserved. $inout[0-2] is cipher/clear text...
368$code.=<<___;
369.type _aesni_${dir}rypt3,\@abi-omnipotent
370.align 16
371_aesni_${dir}rypt3:
372.cfi_startproc
373 $movkey ($key),$rndkey0
374 shl \$4,$rounds
375 $movkey 16($key),$rndkey1
376 xorps $rndkey0,$inout0
377 xorps $rndkey0,$inout1
378 xorps $rndkey0,$inout2
379 $movkey 32($key),$rndkey0
380 lea 32($key,$rounds),$key
381 neg %rax # $rounds
382 add \$16,%rax
383
384.L${dir}_loop3:
385 aes${dir} $rndkey1,$inout0
386 aes${dir} $rndkey1,$inout1
387 aes${dir} $rndkey1,$inout2
388 $movkey ($key,%rax),$rndkey1
389 add \$32,%rax
390 aes${dir} $rndkey0,$inout0
391 aes${dir} $rndkey0,$inout1
392 aes${dir} $rndkey0,$inout2
393 $movkey -16($key,%rax),$rndkey0
394 jnz .L${dir}_loop3
395
396 aes${dir} $rndkey1,$inout0
397 aes${dir} $rndkey1,$inout1
398 aes${dir} $rndkey1,$inout2
399 aes${dir}last $rndkey0,$inout0
400 aes${dir}last $rndkey0,$inout1
401 aes${dir}last $rndkey0,$inout2
402 ret
403.cfi_endproc
404.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
405___
406}
407# 4x interleave is implemented to improve small block performance,
408# most notably [and naturally] 4 block by ~30%. One can argue that one
409# should have implemented 5x as well, but improvement would be <20%,
410# so it's not worth it...
# aesni_generate4($dir)
#
# Appends the private routine _aesni_${dir}rypt4 to $code; it processes
# $inout0..$inout3 in parallel. $dir is "enc" or "dec".
#
# The emitted code uses the negative-index scheme: $rounds is multiplied
# by 16, $key is moved past the schedule, and %rax (negated) climbs by 32
# per iteration until zero, two rounds being applied per iteration.
# NOTE(review): the raw ".byte 0x0f,0x1f,0x00" before the loop is
# presumably a 3-byte NOP used as padding - confirm against the
# instruction encoding tables.
411sub aesni_generate4 {
# Lexical $dir ("enc"/"dec") shadows the file-level $dir path variable.
412my $dir=shift;
413# As already mentioned it takes in $key and $rounds, which are *not*
414# preserved. $inout[0-3] is cipher/clear text...
415$code.=<<___;
416.type _aesni_${dir}rypt4,\@abi-omnipotent
417.align 16
418_aesni_${dir}rypt4:
419.cfi_startproc
420 $movkey ($key),$rndkey0
421 shl \$4,$rounds
422 $movkey 16($key),$rndkey1
423 xorps $rndkey0,$inout0
424 xorps $rndkey0,$inout1
425 xorps $rndkey0,$inout2
426 xorps $rndkey0,$inout3
427 $movkey 32($key),$rndkey0
428 lea 32($key,$rounds),$key
429 neg %rax # $rounds
430 .byte 0x0f,0x1f,0x00
431 add \$16,%rax
432
433.L${dir}_loop4:
434 aes${dir} $rndkey1,$inout0
435 aes${dir} $rndkey1,$inout1
436 aes${dir} $rndkey1,$inout2
437 aes${dir} $rndkey1,$inout3
438 $movkey ($key,%rax),$rndkey1
439 add \$32,%rax
440 aes${dir} $rndkey0,$inout0
441 aes${dir} $rndkey0,$inout1
442 aes${dir} $rndkey0,$inout2
443 aes${dir} $rndkey0,$inout3
444 $movkey -16($key,%rax),$rndkey0
445 jnz .L${dir}_loop4
446
447 aes${dir} $rndkey1,$inout0
448 aes${dir} $rndkey1,$inout1
449 aes${dir} $rndkey1,$inout2
450 aes${dir} $rndkey1,$inout3
451 aes${dir}last $rndkey0,$inout0
452 aes${dir}last $rndkey0,$inout1
453 aes${dir}last $rndkey0,$inout2
454 aes${dir}last $rndkey0,$inout3
455 ret
456.cfi_endproc
457.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
458___
459}
# aesni_generate6($dir)
#
# Appends the private routine _aesni_${dir}rypt6 to $code; it processes
# $inout0..$inout5 in parallel. $dir is "enc" or "dec".
#
# Unlike the 2x/3x/4x variants, the emitted prologue interleaves the
# round-0 xor of the later blocks with the first aes${dir} round of
# $inout0..$inout2, and then jumps into the middle of the loop at
# .L${dir}_loop6_enter so that only $inout3..$inout5 receive that round
# there. The loop itself uses the same negative %rax index that climbs by
# 32 per iteration until zero.
460sub aesni_generate6 {
# Lexical $dir ("enc"/"dec") shadows the file-level $dir path variable.
461my $dir=shift;
462# As already mentioned it takes in $key and $rounds, which are *not*
463# preserved. $inout[0-5] is cipher/clear text...
464$code.=<<___;
465.type _aesni_${dir}rypt6,\@abi-omnipotent
466.align 16
467_aesni_${dir}rypt6:
468.cfi_startproc
469 $movkey ($key),$rndkey0
470 shl \$4,$rounds
471 $movkey 16($key),$rndkey1
472 xorps $rndkey0,$inout0
473 pxor $rndkey0,$inout1
474 pxor $rndkey0,$inout2
475 aes${dir} $rndkey1,$inout0
476 lea 32($key,$rounds),$key
477 neg %rax # $rounds
478 aes${dir} $rndkey1,$inout1
479 pxor $rndkey0,$inout3
480 pxor $rndkey0,$inout4
481 aes${dir} $rndkey1,$inout2
482 pxor $rndkey0,$inout5
483 $movkey ($key,%rax),$rndkey0
484 add \$16,%rax
485 jmp .L${dir}_loop6_enter
486.align 16
487.L${dir}_loop6:
488 aes${dir} $rndkey1,$inout0
489 aes${dir} $rndkey1,$inout1
490 aes${dir} $rndkey1,$inout2
491.L${dir}_loop6_enter:
492 aes${dir} $rndkey1,$inout3
493 aes${dir} $rndkey1,$inout4
494 aes${dir} $rndkey1,$inout5
495 $movkey ($key,%rax),$rndkey1
496 add \$32,%rax
497 aes${dir} $rndkey0,$inout0
498 aes${dir} $rndkey0,$inout1
499 aes${dir} $rndkey0,$inout2
500 aes${dir} $rndkey0,$inout3
501 aes${dir} $rndkey0,$inout4
502 aes${dir} $rndkey0,$inout5
503 $movkey -16($key,%rax),$rndkey0
504 jnz .L${dir}_loop6
505
506 aes${dir} $rndkey1,$inout0
507 aes${dir} $rndkey1,$inout1
508 aes${dir} $rndkey1,$inout2
509 aes${dir} $rndkey1,$inout3
510 aes${dir} $rndkey1,$inout4
511 aes${dir} $rndkey1,$inout5
512 aes${dir}last $rndkey0,$inout0
513 aes${dir}last $rndkey0,$inout1
514 aes${dir}last $rndkey0,$inout2
515 aes${dir}last $rndkey0,$inout3
516 aes${dir}last $rndkey0,$inout4
517 aes${dir}last $rndkey0,$inout5
518 ret
519.cfi_endproc
520.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
521___
522}
# aesni_generate8($dir)
#
# Appends the private routine _aesni_${dir}rypt8 to $code; it processes
# $inout0..$inout7 in parallel. $dir is "enc" or "dec".
#
# The emitted prologue applies the first aes${dir} round to $inout0 and
# $inout1 while the remaining blocks are still being xor-ed with round
# key 0, then jumps to .L${dir}_loop8_inner so that only $inout2..$inout7
# receive that round there. The label .L${dir}_loop8_enter is defined in
# the loop body but is not jumped to from within this routine. The loop
# uses the same negative %rax index that climbs by 32 per iteration until
# zero.
523sub aesni_generate8 {
# Lexical $dir ("enc"/"dec") shadows the file-level $dir path variable.
524my $dir=shift;
525# As already mentioned it takes in $key and $rounds, which are *not*
526# preserved. $inout[0-7] is cipher/clear text...
527$code.=<<___;
528.type _aesni_${dir}rypt8,\@abi-omnipotent
529.align 16
530_aesni_${dir}rypt8:
531.cfi_startproc
532 $movkey ($key),$rndkey0
533 shl \$4,$rounds
534 $movkey 16($key),$rndkey1
535 xorps $rndkey0,$inout0
536 xorps $rndkey0,$inout1
537 pxor $rndkey0,$inout2
538 pxor $rndkey0,$inout3
539 pxor $rndkey0,$inout4
540 lea 32($key,$rounds),$key
541 neg %rax # $rounds
542 aes${dir} $rndkey1,$inout0
543 pxor $rndkey0,$inout5
544 pxor $rndkey0,$inout6
545 aes${dir} $rndkey1,$inout1
546 pxor $rndkey0,$inout7
547 $movkey ($key,%rax),$rndkey0
548 add \$16,%rax
549 jmp .L${dir}_loop8_inner
550.align 16
551.L${dir}_loop8:
552 aes${dir} $rndkey1,$inout0
553 aes${dir} $rndkey1,$inout1
554.L${dir}_loop8_inner:
555 aes${dir} $rndkey1,$inout2
556 aes${dir} $rndkey1,$inout3
557 aes${dir} $rndkey1,$inout4
558 aes${dir} $rndkey1,$inout5
559 aes${dir} $rndkey1,$inout6
560 aes${dir} $rndkey1,$inout7
561.L${dir}_loop8_enter:
562 $movkey ($key,%rax),$rndkey1
563 add \$32,%rax
564 aes${dir} $rndkey0,$inout0
565 aes${dir} $rndkey0,$inout1
566 aes${dir} $rndkey0,$inout2
567 aes${dir} $rndkey0,$inout3
568 aes${dir} $rndkey0,$inout4
569 aes${dir} $rndkey0,$inout5
570 aes${dir} $rndkey0,$inout6
571 aes${dir} $rndkey0,$inout7
572 $movkey -16($key,%rax),$rndkey0
573 jnz .L${dir}_loop8
574
575 aes${dir} $rndkey1,$inout0
576 aes${dir} $rndkey1,$inout1
577 aes${dir} $rndkey1,$inout2
578 aes${dir} $rndkey1,$inout3
579 aes${dir} $rndkey1,$inout4
580 aes${dir} $rndkey1,$inout5
581 aes${dir} $rndkey1,$inout6
582 aes${dir} $rndkey1,$inout7
583 aes${dir}last $rndkey0,$inout0
584 aes${dir}last $rndkey0,$inout1
585 aes${dir}last $rndkey0,$inout2
586 aes${dir}last $rndkey0,$inout3
587 aes${dir}last $rndkey0,$inout4
588 aes${dir}last $rndkey0,$inout5
589 aes${dir}last $rndkey0,$inout6
590 aes${dir}last $rndkey0,$inout7
591 ret
592.cfi_endproc
593.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
594___
595}
# Instantiate the private N-block helpers. The "dec" variants are always
# emitted; the "enc" variants only when $PREFIX is "aesni".
596&aesni_generate2("enc") if ($PREFIX eq "aesni");
597&aesni_generate2("dec");
598&aesni_generate3("enc") if ($PREFIX eq "aesni");
599&aesni_generate3("dec");
600&aesni_generate4("enc") if ($PREFIX eq "aesni");
601&aesni_generate4("dec");
602&aesni_generate6("enc") if ($PREFIX eq "aesni");
603&aesni_generate6("dec");
604&aesni_generate8("enc") if ($PREFIX eq "aesni");
605&aesni_generate8("dec");
606
607
608if ($PREFIX eq "aesni") {
609########################################################################
610# void aesni_ecb_encrypt (const void *in, void *out,
611# size_t length, const AES_KEY *key,
612# int enc);
# Emits aesni_ecb_encrypt. Overall shape of the generated routine:
#   - $len is rounded down to a multiple of 16; nothing is done if the
#     result is zero
#   - the 5th argument (%r8d) selects the direction: zero jumps to the
#     decrypt path, non-zero falls through to encrypt
#   - inputs of at least 8 blocks go through an 8-block loop built on
#     _aesni_[en|de]crypt8; $key/$rounds are restored from $key_/$rnds_
#     around every call because the helpers modify them
#   - the remaining 1..7 blocks are dispatched to the 1x inline code or
#     the 2x/3x/4x/6x/8x helpers; 5 and 7 blocks use the 6x and 8x helpers
#     with the spare data register zeroed first
#   - the decrypt path additionally zeroes the data registers after
#     storing the output
613$code.=<<___;
614.globl aesni_ecb_encrypt
615.type aesni_ecb_encrypt,\@function,5
616.align 16
617aesni_ecb_encrypt:
618.cfi_startproc
619___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6..%xmm9, which
# this routine uses as $inout4..$inout7 (presumably required because
# those registers are callee-saved under the Win64 ABI - confirm).
620$code.=<<___ if ($win64);
621 lea -0x58(%rsp),%rsp
622 movaps %xmm6,(%rsp) # offload $inout4..7
623 movaps %xmm7,0x10(%rsp)
624 movaps %xmm8,0x20(%rsp)
625 movaps %xmm9,0x30(%rsp)
626.Lecb_enc_body:
627___
# Length check, direction dispatch, the 8-block encrypt loop and the
# start of the encrypt tail dispatch, up to the one-block case.
628$code.=<<___;
629 and \$-16,$len # if ($len<16)
630 jz .Lecb_ret # return
631
632 mov 240($key),$rounds # key->rounds
633 $movkey ($key),$rndkey0
634 mov $key,$key_ # backup $key
635 mov $rounds,$rnds_ # backup $rounds
636 test %r8d,%r8d # 5th argument
637 jz .Lecb_decrypt
638#--------------------------- ECB ENCRYPT ------------------------------#
639 cmp \$0x80,$len # if ($len<8*16)
640 jb .Lecb_enc_tail # short input
641
642 movdqu ($inp),$inout0 # load 8 input blocks
643 movdqu 0x10($inp),$inout1
644 movdqu 0x20($inp),$inout2
645 movdqu 0x30($inp),$inout3
646 movdqu 0x40($inp),$inout4
647 movdqu 0x50($inp),$inout5
648 movdqu 0x60($inp),$inout6
649 movdqu 0x70($inp),$inout7
650 lea 0x80($inp),$inp # $inp+=8*16
651 sub \$0x80,$len # $len-=8*16 (can be zero)
652 jmp .Lecb_enc_loop8_enter
653.align 16
654.Lecb_enc_loop8:
655 movups $inout0,($out) # store 8 output blocks
656 mov $key_,$key # restore $key
657 movdqu ($inp),$inout0 # load 8 input blocks
658 mov $rnds_,$rounds # restore $rounds
659 movups $inout1,0x10($out)
660 movdqu 0x10($inp),$inout1
661 movups $inout2,0x20($out)
662 movdqu 0x20($inp),$inout2
663 movups $inout3,0x30($out)
664 movdqu 0x30($inp),$inout3
665 movups $inout4,0x40($out)
666 movdqu 0x40($inp),$inout4
667 movups $inout5,0x50($out)
668 movdqu 0x50($inp),$inout5
669 movups $inout6,0x60($out)
670 movdqu 0x60($inp),$inout6
671 movups $inout7,0x70($out)
672 lea 0x80($out),$out # $out+=8*16
673 movdqu 0x70($inp),$inout7
674 lea 0x80($inp),$inp # $inp+=8*16
675.Lecb_enc_loop8_enter:
676
677 call _aesni_encrypt8
678
679 sub \$0x80,$len
680 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
681
682 movups $inout0,($out) # store 8 output blocks
683 mov $key_,$key # restore $key
684 movups $inout1,0x10($out)
685 mov $rnds_,$rounds # restore $rounds
686 movups $inout2,0x20($out)
687 movups $inout3,0x30($out)
688 movups $inout4,0x40($out)
689 movups $inout5,0x50($out)
690 movups $inout6,0x60($out)
691 movups $inout7,0x70($out)
692 lea 0x80($out),$out # $out+=8*16
693 add \$0x80,$len # restore real remaining $len
694 jz .Lecb_ret # done if ($len==0)
695
696.Lecb_enc_tail: # $len is less than 8*16
697 movups ($inp),$inout0
698 cmp \$0x20,$len
699 jb .Lecb_enc_one
700 movups 0x10($inp),$inout1
701 je .Lecb_enc_two
702 movups 0x20($inp),$inout2
703 cmp \$0x40,$len
704 jb .Lecb_enc_three
705 movups 0x30($inp),$inout3
706 je .Lecb_enc_four
707 movups 0x40($inp),$inout4
708 cmp \$0x60,$len
709 jb .Lecb_enc_five
710 movups 0x50($inp),$inout5
711 je .Lecb_enc_six
712 movdqu 0x60($inp),$inout6
713 xorps $inout7,$inout7
714 call _aesni_encrypt8
715 movups $inout0,($out) # store 7 output blocks
716 movups $inout1,0x10($out)
717 movups $inout2,0x20($out)
718 movups $inout3,0x30($out)
719 movups $inout4,0x40($out)
720 movups $inout5,0x50($out)
721 movups $inout6,0x60($out)
722 jmp .Lecb_ret
723.align 16
724.Lecb_enc_one:
725___
# One-block encrypt case is inlined rather than called.
726 &aesni_generate1("enc",$key,$rounds);
# Remaining encrypt tail cases (2..6 blocks), then the whole decrypt path
# up to its one-block case.
727$code.=<<___;
728 movups $inout0,($out) # store one output block
729 jmp .Lecb_ret
730.align 16
731.Lecb_enc_two:
732 call _aesni_encrypt2
733 movups $inout0,($out) # store 2 output blocks
734 movups $inout1,0x10($out)
735 jmp .Lecb_ret
736.align 16
737.Lecb_enc_three:
738 call _aesni_encrypt3
739 movups $inout0,($out) # store 3 output blocks
740 movups $inout1,0x10($out)
741 movups $inout2,0x20($out)
742 jmp .Lecb_ret
743.align 16
744.Lecb_enc_four:
745 call _aesni_encrypt4
746 movups $inout0,($out) # store 4 output blocks
747 movups $inout1,0x10($out)
748 movups $inout2,0x20($out)
749 movups $inout3,0x30($out)
750 jmp .Lecb_ret
751.align 16
752.Lecb_enc_five:
753 xorps $inout5,$inout5
754 call _aesni_encrypt6
755 movups $inout0,($out) # store 5 output blocks
756 movups $inout1,0x10($out)
757 movups $inout2,0x20($out)
758 movups $inout3,0x30($out)
759 movups $inout4,0x40($out)
760 jmp .Lecb_ret
761.align 16
762.Lecb_enc_six:
763 call _aesni_encrypt6
764 movups $inout0,($out) # store 6 output blocks
765 movups $inout1,0x10($out)
766 movups $inout2,0x20($out)
767 movups $inout3,0x30($out)
768 movups $inout4,0x40($out)
769 movups $inout5,0x50($out)
770 jmp .Lecb_ret
771
772#--------------------------- ECB DECRYPT ------------------------------#
773.align 16
774.Lecb_decrypt:
775 cmp \$0x80,$len # if ($len<8*16)
776 jb .Lecb_dec_tail # short input
777
778 movdqu ($inp),$inout0 # load 8 input blocks
779 movdqu 0x10($inp),$inout1
780 movdqu 0x20($inp),$inout2
781 movdqu 0x30($inp),$inout3
782 movdqu 0x40($inp),$inout4
783 movdqu 0x50($inp),$inout5
784 movdqu 0x60($inp),$inout6
785 movdqu 0x70($inp),$inout7
786 lea 0x80($inp),$inp # $inp+=8*16
787 sub \$0x80,$len # $len-=8*16 (can be zero)
788 jmp .Lecb_dec_loop8_enter
789.align 16
790.Lecb_dec_loop8:
791 movups $inout0,($out) # store 8 output blocks
792 mov $key_,$key # restore $key
793 movdqu ($inp),$inout0 # load 8 input blocks
794 mov $rnds_,$rounds # restore $rounds
795 movups $inout1,0x10($out)
796 movdqu 0x10($inp),$inout1
797 movups $inout2,0x20($out)
798 movdqu 0x20($inp),$inout2
799 movups $inout3,0x30($out)
800 movdqu 0x30($inp),$inout3
801 movups $inout4,0x40($out)
802 movdqu 0x40($inp),$inout4
803 movups $inout5,0x50($out)
804 movdqu 0x50($inp),$inout5
805 movups $inout6,0x60($out)
806 movdqu 0x60($inp),$inout6
807 movups $inout7,0x70($out)
808 lea 0x80($out),$out # $out+=8*16
809 movdqu 0x70($inp),$inout7
810 lea 0x80($inp),$inp # $inp+=8*16
811.Lecb_dec_loop8_enter:
812
813 call _aesni_decrypt8
814
815 $movkey ($key_),$rndkey0
816 sub \$0x80,$len
817 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
818
819 movups $inout0,($out) # store 8 output blocks
820 pxor $inout0,$inout0 # clear register bank
821 mov $key_,$key # restore $key
822 movups $inout1,0x10($out)
823 pxor $inout1,$inout1
824 mov $rnds_,$rounds # restore $rounds
825 movups $inout2,0x20($out)
826 pxor $inout2,$inout2
827 movups $inout3,0x30($out)
828 pxor $inout3,$inout3
829 movups $inout4,0x40($out)
830 pxor $inout4,$inout4
831 movups $inout5,0x50($out)
832 pxor $inout5,$inout5
833 movups $inout6,0x60($out)
834 pxor $inout6,$inout6
835 movups $inout7,0x70($out)
836 pxor $inout7,$inout7
837 lea 0x80($out),$out # $out+=8*16
838 add \$0x80,$len # restore real remaining $len
839 jz .Lecb_ret # done if ($len==0)
840
841.Lecb_dec_tail:
842 movups ($inp),$inout0
843 cmp \$0x20,$len
844 jb .Lecb_dec_one
845 movups 0x10($inp),$inout1
846 je .Lecb_dec_two
847 movups 0x20($inp),$inout2
848 cmp \$0x40,$len
849 jb .Lecb_dec_three
850 movups 0x30($inp),$inout3
851 je .Lecb_dec_four
852 movups 0x40($inp),$inout4
853 cmp \$0x60,$len
854 jb .Lecb_dec_five
855 movups 0x50($inp),$inout5
856 je .Lecb_dec_six
857 movups 0x60($inp),$inout6
858 $movkey ($key),$rndkey0
859 xorps $inout7,$inout7
860 call _aesni_decrypt8
861 movups $inout0,($out) # store 7 output blocks
862 pxor $inout0,$inout0 # clear register bank
863 movups $inout1,0x10($out)
864 pxor $inout1,$inout1
865 movups $inout2,0x20($out)
866 pxor $inout2,$inout2
867 movups $inout3,0x30($out)
868 pxor $inout3,$inout3
869 movups $inout4,0x40($out)
870 pxor $inout4,$inout4
871 movups $inout5,0x50($out)
872 pxor $inout5,$inout5
873 movups $inout6,0x60($out)
874 pxor $inout6,$inout6
875 pxor $inout7,$inout7
876 jmp .Lecb_ret
877.align 16
878.Lecb_dec_one:
879___
# One-block decrypt case is inlined rather than called.
880 &aesni_generate1("dec",$key,$rounds);
# Remaining decrypt tail cases (2..6 blocks) and the common exit label,
# where the round-key registers are zeroed.
881$code.=<<___;
882 movups $inout0,($out) # store one output block
883 pxor $inout0,$inout0 # clear register bank
884 jmp .Lecb_ret
885.align 16
886.Lecb_dec_two:
887 call _aesni_decrypt2
888 movups $inout0,($out) # store 2 output blocks
889 pxor $inout0,$inout0 # clear register bank
890 movups $inout1,0x10($out)
891 pxor $inout1,$inout1
892 jmp .Lecb_ret
893.align 16
894.Lecb_dec_three:
895 call _aesni_decrypt3
896 movups $inout0,($out) # store 3 output blocks
897 pxor $inout0,$inout0 # clear register bank
898 movups $inout1,0x10($out)
899 pxor $inout1,$inout1
900 movups $inout2,0x20($out)
901 pxor $inout2,$inout2
902 jmp .Lecb_ret
903.align 16
904.Lecb_dec_four:
905 call _aesni_decrypt4
906 movups $inout0,($out) # store 4 output blocks
907 pxor $inout0,$inout0 # clear register bank
908 movups $inout1,0x10($out)
909 pxor $inout1,$inout1
910 movups $inout2,0x20($out)
911 pxor $inout2,$inout2
912 movups $inout3,0x30($out)
913 pxor $inout3,$inout3
914 jmp .Lecb_ret
915.align 16
916.Lecb_dec_five:
917 xorps $inout5,$inout5
918 call _aesni_decrypt6
919 movups $inout0,($out) # store 5 output blocks
920 pxor $inout0,$inout0 # clear register bank
921 movups $inout1,0x10($out)
922 pxor $inout1,$inout1
923 movups $inout2,0x20($out)
924 pxor $inout2,$inout2
925 movups $inout3,0x30($out)
926 pxor $inout3,$inout3
927 movups $inout4,0x40($out)
928 pxor $inout4,$inout4
929 pxor $inout5,$inout5
930 jmp .Lecb_ret
931.align 16
932.Lecb_dec_six:
933 call _aesni_decrypt6
934 movups $inout0,($out) # store 6 output blocks
935 pxor $inout0,$inout0 # clear register bank
936 movups $inout1,0x10($out)
937 pxor $inout1,$inout1
938 movups $inout2,0x20($out)
939 pxor $inout2,$inout2
940 movups $inout3,0x30($out)
941 pxor $inout3,$inout3
942 movups $inout4,0x40($out)
943 pxor $inout4,$inout4
944 movups $inout5,0x50($out)
945 pxor $inout5,$inout5
946
947.Lecb_ret:
948 xorps $rndkey0,$rndkey0 # %xmm0
949 pxor $rndkey1,$rndkey1
950___
# Win64 only: restore %xmm6..%xmm9 and overwrite each saved slot on the
# stack with %xmm0, which was zeroed just above at .Lecb_ret.
951$code.=<<___ if ($win64);
952 movaps (%rsp),%xmm6
953 movaps %xmm0,(%rsp) # clear stack
954 movaps 0x10(%rsp),%xmm7
955 movaps %xmm0,0x10(%rsp)
956 movaps 0x20(%rsp),%xmm8
957 movaps %xmm0,0x20(%rsp)
958 movaps 0x30(%rsp),%xmm9
959 movaps %xmm0,0x30(%rsp)
960 lea 0x58(%rsp),%rsp
961.Lecb_enc_ret:
962___
# Common return and symbol size directive.
963$code.=<<___;
964 ret
965.cfi_endproc
966.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
967___
968
969
970{
971######################################################################
972# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
973# size_t blocks, const AES_KEY *key,
974# const char *ivec,char *cmac);
975#
976# Handles only complete blocks, operates on 64-bit counter and
977# does not update *ivec! Nor does it finalize CMAC value
978# (see engine/eng_aesni.c for details)
979#
980{
# Register names private to the two CCM64 routines that follow.
981my $cmac="%r9"; # 6th argument: address of the 16-byte CMAC block (loaded on entry, stored on exit)
982
# Constant loaded from .Lincrement64; added to $iv with paddq once per block.
983my $increment="%xmm9";
# Counter block loaded from ($ivp); held in pshufb-shuffled form between blocks.
984my $iv="%xmm6";
# Shuffle mask loaded from .Lbswap_mask; applied with pshufb to $iv and to each new counter.
985my $bswap_mask="%xmm7";
986
# aesni_ccm64_encrypt_blocks: CCM bulk encryption over whole 16-byte blocks.
# For every block the counter and the running CMAC go through AES side by
# side (two interleaved aesenc streams).  The output block is
# input ^ E(counter); the CMAC register absorbs the input block before it is
# encrypted.  The final CMAC is written back to ($cmac); nothing is stored
# through $ivp.
987$code.=<<___;
988.globl aesni_ccm64_encrypt_blocks
989.type aesni_ccm64_encrypt_blocks,\@function,6
990.align 16
991aesni_ccm64_encrypt_blocks:
992.cfi_startproc
993___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9, which this
# routine overwrites (they are callee-saved under the Win64 calling convention).
994$code.=<<___ if ($win64);
995 lea -0x58(%rsp),%rsp
996 movaps %xmm6,(%rsp) # $iv
997 movaps %xmm7,0x10(%rsp) # $bswap_mask
998 movaps %xmm8,0x20(%rsp) # $in0
999 movaps %xmm9,0x30(%rsp) # $increment
1000.Lccm64_enc_body:
1001___
# Main body.
#  Setup: load key->rounds, the counter block from ($ivp) and the running
#  CMAC from ($cmac).  The unmodified counter becomes the first AES input
#  ($inout0) while $iv keeps a shuffled copy that paddq can step.
#  .Lccm64_enc_outer handles one block per iteration: the counter ($inout0)
#  and CMAC^input ($inout1) are encrypted together in .Lccm64_enc2_loop, the
#  output block input^E(counter) is stored, and the next counter is produced
#  by shuffling the incremented $iv back into $inout0.
#  After the last block the working registers are zeroed and the CMAC is
#  written to ($cmac).
1002$code.=<<___;
1003 mov 240($key),$rounds # key->rounds
1004 movdqu ($ivp),$iv
1005 movdqa .Lincrement64(%rip),$increment
1006 movdqa .Lbswap_mask(%rip),$bswap_mask
1007
1008 shl \$4,$rounds
1009 mov \$16,$rnds_
1010 lea 0($key),$key_
1011 movdqu ($cmac),$inout1
1012 movdqa $iv,$inout0
1013 lea 32($key,$rounds),$key # end of key schedule
1014 pshufb $bswap_mask,$iv
1015 sub %rax,%r10 # twisted $rounds
1016 jmp .Lccm64_enc_outer
1017.align 16
1018.Lccm64_enc_outer:
1019 $movkey ($key_),$rndkey0
1020 mov %r10,%rax
1021 movups ($inp),$in0 # load inp
1022
1023 xorps $rndkey0,$inout0 # counter
1024 $movkey 16($key_),$rndkey1
1025 xorps $in0,$rndkey0
1026 xorps $rndkey0,$inout1 # cmac^=inp
1027 $movkey 32($key_),$rndkey0
1028
1029.Lccm64_enc2_loop:
1030 aesenc $rndkey1,$inout0
1031 aesenc $rndkey1,$inout1
1032 $movkey ($key,%rax),$rndkey1
1033 add \$32,%rax
1034 aesenc $rndkey0,$inout0
1035 aesenc $rndkey0,$inout1
1036 $movkey -16($key,%rax),$rndkey0
1037 jnz .Lccm64_enc2_loop
1038 aesenc $rndkey1,$inout0
1039 aesenc $rndkey1,$inout1
1040 paddq $increment,$iv
1041 dec $len # $len-- ($len is in blocks)
1042 aesenclast $rndkey0,$inout0
1043 aesenclast $rndkey0,$inout1
1044
1045 lea 16($inp),$inp
1046 xorps $inout0,$in0 # inp ^= E(iv)
1047 movdqa $iv,$inout0
1048 movups $in0,($out) # save output
1049 pshufb $bswap_mask,$inout0
1050 lea 16($out),$out # $out+=16
1051 jnz .Lccm64_enc_outer # loop if ($len!=0)
1052
1053 pxor $rndkey0,$rndkey0 # clear register bank
1054 pxor $rndkey1,$rndkey1
1055 pxor $inout0,$inout0
1056 movups $inout1,($cmac) # store resulting mac
1057 pxor $inout1,$inout1
1058 pxor $in0,$in0
1059 pxor $iv,$iv
1060___
# Win64 only: restore %xmm6-%xmm9 and overwrite every save slot with %xmm0
# (NOTE(review): relies on $rndkey0, zeroed just above, being %xmm0) so no
# working values stay on the stack; then release the 0x58-byte area.
1061$code.=<<___ if ($win64);
1062 movaps (%rsp),%xmm6
1063 movaps %xmm0,(%rsp) # clear stack
1064 movaps 0x10(%rsp),%xmm7
1065 movaps %xmm0,0x10(%rsp)
1066 movaps 0x20(%rsp),%xmm8
1067 movaps %xmm0,0x20(%rsp)
1068 movaps 0x30(%rsp),%xmm9
1069 movaps %xmm0,0x30(%rsp)
1070 lea 0x58(%rsp),%rsp
1071.Lccm64_enc_ret:
1072___
# Return and close the function's CFI/size bookkeeping.
1073$code.=<<___;
1074 ret
1075.cfi_endproc
1076.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1077___
1078######################################################################
# aesni_ccm64_decrypt_blocks: CCM bulk decryption over whole 16-byte blocks.
# Each output block is input ^ E(counter).  The CMAC is computed over the
# *output* blocks, so the CMAC update for block N is pipelined with the
# counter encryption for block N+1.  The final CMAC is written back to
# ($cmac); nothing is stored through $ivp.
1079$code.=<<___;
1080.globl aesni_ccm64_decrypt_blocks
1081.type aesni_ccm64_decrypt_blocks,\@function,6
1082.align 16
1083aesni_ccm64_decrypt_blocks:
1084.cfi_startproc
1085___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9, which this
# routine overwrites (they are callee-saved under the Win64 calling convention).
# The %xmm8 slot is annotated with $in0, matching the encrypt routine; the
# previous annotation named a variable that the visible code never uses, and
# inside this interpolating heredoc that reads an unset Perl variable.
1086$code.=<<___ if ($win64);
1087 lea -0x58(%rsp),%rsp
1088 movaps %xmm6,(%rsp) # $iv
1089 movaps %xmm7,0x10(%rsp) # $bswap_mask
1090 movaps %xmm8,0x20(%rsp) # $in0
1091 movaps %xmm9,0x30(%rsp) # $increment
1092.Lccm64_dec_body:
1093___
# Setup: load key->rounds, the counter block from ($ivp) and the running CMAC
# from ($cmac); back up the round count and key pointer in $rnds_ / $key_, and
# keep a shuffled copy of the counter in $iv for paddq stepping.
1094$code.=<<___;
1095 mov 240($key),$rounds # key->rounds
1096 movups ($ivp),$iv
1097 movdqu ($cmac),$inout1
1098 movdqa .Lincrement64(%rip),$increment
1099 movdqa .Lbswap_mask(%rip),$bswap_mask
1100
1101 movaps $iv,$inout0
1102 mov $rounds,$rnds_
1103 mov $key,$key_
1104 pshufb $bswap_mask,$iv
1105___
# Encrypt the very first counter block on its own so the loop below can start
# with E(counter) already available.
# NOTE(review): presumably the helper operates on $inout0 when no register is
# passed -- confirm against aesni_generate1.
1106 &aesni_generate1("enc",$key,$rounds);
# .Lccm64_dec_outer: output = input ^ E(counter) is stored first.  If more
# blocks remain, that output is xor-ed into the CMAC, and the CMAC is then
# encrypted in the same .Lccm64_dec2_loop pass as the next counter block while
# the next input block is loaded.  .Lccm64_dec_break is reached once the last
# block has been stored; its output is still held in $in0.
1107$code.=<<___;
1108 shl \$4,$rnds_
1109 mov \$16,$rounds
1110 movups ($inp),$in0 # load inp
1111 paddq $increment,$iv
1112 lea 16($inp),$inp # $inp+=16
1113 sub %r10,%rax # twisted $rounds
1114 lea 32($key_,$rnds_),$key # end of key schedule
1115 mov %rax,%r10
1116 jmp .Lccm64_dec_outer
1117.align 16
1118.Lccm64_dec_outer:
1119 xorps $inout0,$in0 # inp ^= E(iv)
1120 movdqa $iv,$inout0
1121 movups $in0,($out) # save output
1122 lea 16($out),$out # $out+=16
1123 pshufb $bswap_mask,$inout0
1124
1125 sub \$1,$len # $len-- ($len is in blocks)
1126 jz .Lccm64_dec_break # if ($len==0) break
1127
1128 $movkey ($key_),$rndkey0
1129 mov %r10,%rax
1130 $movkey 16($key_),$rndkey1
1131 xorps $rndkey0,$in0
1132 xorps $rndkey0,$inout0
1133 xorps $in0,$inout1 # cmac^=out
1134 $movkey 32($key_),$rndkey0
1135 jmp .Lccm64_dec2_loop
1136.align 16
1137.Lccm64_dec2_loop:
1138 aesenc $rndkey1,$inout0
1139 aesenc $rndkey1,$inout1
1140 $movkey ($key,%rax),$rndkey1
1141 add \$32,%rax
1142 aesenc $rndkey0,$inout0
1143 aesenc $rndkey0,$inout1
1144 $movkey -16($key,%rax),$rndkey0
1145 jnz .Lccm64_dec2_loop
1146 movups ($inp),$in0 # load input
1147 paddq $increment,$iv
1148 aesenc $rndkey1,$inout0
1149 aesenc $rndkey1,$inout1
1150 aesenclast $rndkey0,$inout0
1151 aesenclast $rndkey0,$inout1
1152 lea 16($inp),$inp # $inp+=16
1153 jmp .Lccm64_dec_outer
1154
1155.align 16
1156.Lccm64_dec_break:
1157 #xorps $in0,$inout1 # cmac^=out
1158 mov 240($key_),$rounds
1159___
# Finish the CMAC: one more single-block encryption of $inout1, using the
# saved original key pointer $key_ and the reloaded round count.
# NOTE(review): the extra $in0 argument presumably makes the helper xor the
# last output block into $inout1 first (cf. the commented-out xorps above) --
# confirm against aesni_generate1.
1160 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Zero the working registers and store the resulting CMAC to ($cmac).
1161$code.=<<___;
1162 pxor $rndkey0,$rndkey0 # clear register bank
1163 pxor $rndkey1,$rndkey1
1164 pxor $inout0,$inout0
1165 movups $inout1,($cmac) # store resulting mac
1166 pxor $inout1,$inout1
1167 pxor $in0,$in0
1168 pxor $iv,$iv
1169___
# Win64 only: restore %xmm6-%xmm9 and overwrite every save slot with %xmm0
# (NOTE(review): relies on $rndkey0, zeroed just above, being %xmm0); then
# release the 0x58-byte area.
1170$code.=<<___ if ($win64);
1171 movaps (%rsp),%xmm6
1172 movaps %xmm0,(%rsp) # clear stack
1173 movaps 0x10(%rsp),%xmm7
1174 movaps %xmm0,0x10(%rsp)
1175 movaps 0x20(%rsp),%xmm8
1176 movaps %xmm0,0x20(%rsp)
1177 movaps 0x30(%rsp),%xmm9
1178 movaps %xmm0,0x30(%rsp)
1179 lea 0x58(%rsp),%rsp
1180.Lccm64_dec_ret:
1181___
# Return and close the function's CFI/size bookkeeping.
1182$code.=<<___;
1183 ret
1184.cfi_endproc
1185.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1186___
1187}
1188
1189######################################################################
1190# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1191# size_t blocks, const AES_KEY *key,
1192# const char *ivec);
1193#
1194# Handles only complete blocks, operates on 32-bit counter and
1195# does not update *ivec! (see crypto/modes/ctr128.c for details)
1196#
1197# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1198# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1199# Keywords are full unroll and modulo-schedule counter calculations
1200# with zero-round key xor.
1201{
# Names local to aesni_ctr32_encrypt_blocks:
#   $in0..$in5   %xmm10-%xmm15, used for input blocks and as scratch;
#   $key0        %ebp, caches the last 32-bit word of the round-0 key;
#   $ctr         the register named by $ivp with a "d" suffix; after the IV
#                has been read it holds the byte-swapped 32-bit counter;
#   $frame_size  0x80 bytes for eight 16-byte counter blocks, plus 160 bytes
#                on Win64 for saving %xmm6-%xmm15.
1202my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1203my ($key0,$ctr)=("%ebp","${ivp}d");
1204my $frame_size = 0x80 + ($win64?160:0);
1205
# Entry point and single-block fast path: when $len is exactly 1 the block is
# handled without a stack frame.  The IV goes to $inout0, the input block to
# $inout1 and key->rounds to %edx.
1206$code.=<<___;
1207.globl aesni_ctr32_encrypt_blocks
1208.type aesni_ctr32_encrypt_blocks,\@function,5
1209.align 16
1210aesni_ctr32_encrypt_blocks:
1211.cfi_startproc
1212 cmp \$1,$len
1213 jne .Lctr32_bulk
1214
1215 # handle single block without allocating stack frame,
1216 # useful when handling edges
1217 movups ($ivp),$inout0
1218 movups ($inp),$inout1
1219 mov 240($key),%edx # key->rounds
1220___
# Emit the single-block encryption sequence (helper defined elsewhere in this
# file), with the round count taken from %edx.
1221 &aesni_generate1("enc",$key,"%edx");
# Fast path tail: xor the encrypted counter with the input, store it, scrub
# the registers used and jump to the shared epilogue.
# .Lctr32_bulk then sets up the frame: $key_ keeps the entry %rsp as frame
# pointer, %rbp is pushed, $frame_size bytes are reserved and %rsp is aligned
# down to 16 bytes.
1222$code.=<<___;
1223 pxor $rndkey0,$rndkey0 # clear register bank
1224 pxor $rndkey1,$rndkey1
1225 xorps $inout1,$inout0
1226 pxor $inout1,$inout1
1227 movups $inout0,($out)
1228 xorps $inout0,$inout0
1229 jmp .Lctr32_epilogue
1230
1231.align 16
1232.Lctr32_bulk:
1233 lea (%rsp),$key_ # use $key_ as frame pointer
1234.cfi_def_cfa_register $key_
1235 push %rbp
1236.cfi_push %rbp
1237 sub \$$frame_size,%rsp
1238 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1239___
# Win64 only: save %xmm6-%xmm15 at fixed negative offsets from the frame
# pointer (callee-saved under the Win64 calling convention).
1240$code.=<<___ if ($win64);
1241 movaps %xmm6,-0xa8($key_) # offload everything
1242 movaps %xmm7,-0x98($key_)
1243 movaps %xmm8,-0x88($key_)
1244 movaps %xmm9,-0x78($key_)
1245 movaps %xmm10,-0x68($key_)
1246 movaps %xmm11,-0x58($key_)
1247 movaps %xmm12,-0x48($key_)
1248 movaps %xmm13,-0x38($key_)
1249 movaps %xmm14,-0x28($key_)
1250 movaps %xmm15,-0x18($key_)
1251.Lctr32_body:
1252___
# Bulk path, part 1.
#  * Build eight counter blocks, already xor-ed with the round-0 key, at
#    0x00..0x70(%rsp); only the last 32-bit word differs between them.
#  * Inputs shorter than 8 blocks go straight to .Lctr32_tail.
#  * OPENSSL_ia32cap_P picks the loop: MOVBE without XSAVE selects the
#    6-blocks-per-iteration .Lctr32_loop6 (it stores counters with movbe);
#    everything else uses the 8-blocks-per-iteration .Lctr32_loop8.
#  * $len is biased (by -6 or -8) so the loop exit test is a plain borrow
#    check; the bias is undone before entering the tail.
#  * This chunk ends inside .Lctr32_loop8, after the first aesenc round and
#    the loading of the next two round keys.
1253$code.=<<___;
1254
1255 # 8 16-byte words on top of stack are counter values
1256 # xor-ed with zero-round key
1257
1258 movdqu ($ivp),$inout0
1259 movdqu ($key),$rndkey0
1260 mov 12($ivp),$ctr # counter LSB
1261 pxor $rndkey0,$inout0
1262 mov 12($key),$key0 # 0-round key LSB
1263 movdqa $inout0,0x00(%rsp) # populate counter block
1264 bswap $ctr
1265 movdqa $inout0,$inout1
1266 movdqa $inout0,$inout2
1267 movdqa $inout0,$inout3
1268 movdqa $inout0,0x40(%rsp)
1269 movdqa $inout0,0x50(%rsp)
1270 movdqa $inout0,0x60(%rsp)
1271 mov %rdx,%r10 # about to borrow %rdx
1272 movdqa $inout0,0x70(%rsp)
1273
1274 lea 1($ctr),%rax
1275 lea 2($ctr),%rdx
1276 bswap %eax
1277 bswap %edx
1278 xor $key0,%eax
1279 xor $key0,%edx
1280 pinsrd \$3,%eax,$inout1
1281 lea 3($ctr),%rax
1282 movdqa $inout1,0x10(%rsp)
1283 pinsrd \$3,%edx,$inout2
1284 bswap %eax
1285 mov %r10,%rdx # restore %rdx
1286 lea 4($ctr),%r10
1287 movdqa $inout2,0x20(%rsp)
1288 xor $key0,%eax
1289 bswap %r10d
1290 pinsrd \$3,%eax,$inout3
1291 xor $key0,%r10d
1292 movdqa $inout3,0x30(%rsp)
1293 lea 5($ctr),%r9
1294 mov %r10d,0x40+12(%rsp)
1295 bswap %r9d
1296 lea 6($ctr),%r10
1297 mov 240($key),$rounds # key->rounds
1298 xor $key0,%r9d
1299 bswap %r10d
1300 mov %r9d,0x50+12(%rsp)
1301 xor $key0,%r10d
1302 lea 7($ctr),%r9
1303 mov %r10d,0x60+12(%rsp)
1304 bswap %r9d
1305 mov OPENSSL_ia32cap_P+4(%rip),%r10d
1306 xor $key0,%r9d
1307 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
1308 mov %r9d,0x70+12(%rsp)
1309
1310 $movkey 0x10($key),$rndkey1
1311
1312 movdqa 0x40(%rsp),$inout4
1313 movdqa 0x50(%rsp),$inout5
1314
1315 cmp \$8,$len # $len is in blocks
1316 jb .Lctr32_tail # short input if ($len<8)
1317
1318 sub \$6,$len # $len is biased by -6
1319 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
1320 je .Lctr32_6x # [which denotes Atom Silvermont]
1321
1322 lea 0x80($key),$key # size optimization
1323 sub \$2,$len # $len is biased by -8
1324 jmp .Lctr32_loop8
1325
1326.align 16
1327.Lctr32_6x:
1328 shl \$4,$rounds
1329 mov \$48,$rnds_
1330 bswap $key0
1331 lea 32($key,$rounds),$key # end of key schedule
1332 sub %rax,%r10 # twisted $rounds
1333 jmp .Lctr32_loop6
1334
1335.align 16
1336.Lctr32_loop6:
1337 add \$6,$ctr # next counter value
1338 $movkey -48($key,$rnds_),$rndkey0
1339 aesenc $rndkey1,$inout0
1340 mov $ctr,%eax
1341 xor $key0,%eax
1342 aesenc $rndkey1,$inout1
1343 movbe %eax,`0x00+12`(%rsp) # store next counter value
1344 lea 1($ctr),%eax
1345 aesenc $rndkey1,$inout2
1346 xor $key0,%eax
1347 movbe %eax,`0x10+12`(%rsp)
1348 aesenc $rndkey1,$inout3
1349 lea 2($ctr),%eax
1350 xor $key0,%eax
1351 aesenc $rndkey1,$inout4
1352 movbe %eax,`0x20+12`(%rsp)
1353 lea 3($ctr),%eax
1354 aesenc $rndkey1,$inout5
1355 $movkey -32($key,$rnds_),$rndkey1
1356 xor $key0,%eax
1357
1358 aesenc $rndkey0,$inout0
1359 movbe %eax,`0x30+12`(%rsp)
1360 lea 4($ctr),%eax
1361 aesenc $rndkey0,$inout1
1362 xor $key0,%eax
1363 movbe %eax,`0x40+12`(%rsp)
1364 aesenc $rndkey0,$inout2
1365 lea 5($ctr),%eax
1366 xor $key0,%eax
1367 aesenc $rndkey0,$inout3
1368 movbe %eax,`0x50+12`(%rsp)
1369 mov %r10,%rax # mov $rnds_,$rounds
1370 aesenc $rndkey0,$inout4
1371 aesenc $rndkey0,$inout5
1372 $movkey -16($key,$rnds_),$rndkey0
1373
1374 call .Lenc_loop6
1375
1376 movdqu ($inp),$inout6 # load 6 input blocks
1377 movdqu 0x10($inp),$inout7
1378 movdqu 0x20($inp),$in0
1379 movdqu 0x30($inp),$in1
1380 movdqu 0x40($inp),$in2
1381 movdqu 0x50($inp),$in3
1382 lea 0x60($inp),$inp # $inp+=6*16
1383 $movkey -64($key,$rnds_),$rndkey1
1384 pxor $inout0,$inout6 # inp^=E(ctr)
1385 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
1386 pxor $inout1,$inout7
1387 movaps 0x10(%rsp),$inout1
1388 pxor $inout2,$in0
1389 movaps 0x20(%rsp),$inout2
1390 pxor $inout3,$in1
1391 movaps 0x30(%rsp),$inout3
1392 pxor $inout4,$in2
1393 movaps 0x40(%rsp),$inout4
1394 pxor $inout5,$in3
1395 movaps 0x50(%rsp),$inout5
1396 movdqu $inout6,($out) # store 6 output blocks
1397 movdqu $inout7,0x10($out)
1398 movdqu $in0,0x20($out)
1399 movdqu $in1,0x30($out)
1400 movdqu $in2,0x40($out)
1401 movdqu $in3,0x50($out)
1402 lea 0x60($out),$out # $out+=6*16
1403
1404 sub \$6,$len
1405 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
1406
1407 add \$6,$len # restore real remaining $len
1408 jz .Lctr32_done # done if ($len==0)
1409
1410 lea -48($rnds_),$rounds
1411 lea -80($key,$rnds_),$key # restore $key
1412 neg $rounds
1413 shr \$4,$rounds # restore $rounds
1414 jmp .Lctr32_tail
1415
1416.align 32
1417.Lctr32_loop8:
1418 add \$8,$ctr # next counter value
1419 movdqa 0x60(%rsp),$inout6
1420 aesenc $rndkey1,$inout0
1421 mov $ctr,%r9d
1422 movdqa 0x70(%rsp),$inout7
1423 aesenc $rndkey1,$inout1
1424 bswap %r9d
1425 $movkey 0x20-0x80($key),$rndkey0
1426 aesenc $rndkey1,$inout2
1427 xor $key0,%r9d
1428 nop
1429 aesenc $rndkey1,$inout3
1430 mov %r9d,0x00+12(%rsp) # store next counter value
1431 lea 1($ctr),%r9
1432 aesenc $rndkey1,$inout4
1433 aesenc $rndkey1,$inout5
1434 aesenc $rndkey1,$inout6
1435 aesenc $rndkey1,$inout7
1436 $movkey 0x30-0x80($key),$rndkey1
1437___
# Unrolled middle of .Lctr32_loop8, one emitted chunk per $i = 2..7.  Each
# chunk applies one aesenc round to all eight blocks, alternating between
# $rndkey0 (even $i) and $rndkey1 (odd $i), and reloads that same register
# with the round key two rounds ahead.  Interleaved with the rounds, it
# finishes the counter word for stack slot $i-1 (bswap, xor with $key0,
# store) and starts the one for slot $i (lea).
1438for($i=2;$i<8;$i++) {
1439my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1440$code.=<<___;
1441 bswap %r9d
1442 aesenc $rndkeyx,$inout0
1443 aesenc $rndkeyx,$inout1
1444 xor $key0,%r9d
1445 .byte 0x66,0x90
1446 aesenc $rndkeyx,$inout2
1447 aesenc $rndkeyx,$inout3
1448 mov %r9d,`0x10*($i-1)`+12(%rsp)
1449 lea $i($ctr),%r9
1450 aesenc $rndkeyx,$inout4
1451 aesenc $rndkeyx,$inout5
1452 aesenc $rndkeyx,$inout6
1453 aesenc $rndkeyx,$inout7
1454 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1455___
1456}
# Bulk path, part 2.
#  * Rest of .Lctr32_loop8: the last counter word (slot 7) is stored, then
#    the comparison of the round count with 11 decides whether zero, two or
#    four additional aesenc rounds run before .Lctr32_enc_done.
#  * .Lctr32_enc_done: the final round is folded with the xor against the
#    input by using aesenclast with (input ^ last round key) as its operand;
#    meanwhile the next eight counter blocks are reloaded from the stack.
#  * .Lctr32_tail: handles the 1..7 leftover blocks.  Fewer than 4 go to
#    .Lctr32_loop3, exactly 4 to .Lctr32_loop4, and 5..7 encrypt seven
#    counters via .Lenc_loop8_enter and store only as many as requested.
#  * .Lctr32_done begins here by zeroing %xmm0-%xmm5 and $key0.
1457$code.=<<___;
1458 bswap %r9d
1459 aesenc $rndkey0,$inout0
1460 aesenc $rndkey0,$inout1
1461 aesenc $rndkey0,$inout2
1462 xor $key0,%r9d
1463 movdqu 0x00($inp),$in0 # start loading input
1464 aesenc $rndkey0,$inout3
1465 mov %r9d,0x70+12(%rsp)
1466 cmp \$11,$rounds
1467 aesenc $rndkey0,$inout4
1468 aesenc $rndkey0,$inout5
1469 aesenc $rndkey0,$inout6
1470 aesenc $rndkey0,$inout7
1471 $movkey 0xa0-0x80($key),$rndkey0
1472
1473 jb .Lctr32_enc_done
1474
1475 aesenc $rndkey1,$inout0
1476 aesenc $rndkey1,$inout1
1477 aesenc $rndkey1,$inout2
1478 aesenc $rndkey1,$inout3
1479 aesenc $rndkey1,$inout4
1480 aesenc $rndkey1,$inout5
1481 aesenc $rndkey1,$inout6
1482 aesenc $rndkey1,$inout7
1483 $movkey 0xb0-0x80($key),$rndkey1
1484
1485 aesenc $rndkey0,$inout0
1486 aesenc $rndkey0,$inout1
1487 aesenc $rndkey0,$inout2
1488 aesenc $rndkey0,$inout3
1489 aesenc $rndkey0,$inout4
1490 aesenc $rndkey0,$inout5
1491 aesenc $rndkey0,$inout6
1492 aesenc $rndkey0,$inout7
1493 $movkey 0xc0-0x80($key),$rndkey0
1494 je .Lctr32_enc_done
1495
1496 aesenc $rndkey1,$inout0
1497 aesenc $rndkey1,$inout1
1498 aesenc $rndkey1,$inout2
1499 aesenc $rndkey1,$inout3
1500 aesenc $rndkey1,$inout4
1501 aesenc $rndkey1,$inout5
1502 aesenc $rndkey1,$inout6
1503 aesenc $rndkey1,$inout7
1504 $movkey 0xd0-0x80($key),$rndkey1
1505
1506 aesenc $rndkey0,$inout0
1507 aesenc $rndkey0,$inout1
1508 aesenc $rndkey0,$inout2
1509 aesenc $rndkey0,$inout3
1510 aesenc $rndkey0,$inout4
1511 aesenc $rndkey0,$inout5
1512 aesenc $rndkey0,$inout6
1513 aesenc $rndkey0,$inout7
1514 $movkey 0xe0-0x80($key),$rndkey0
1515 jmp .Lctr32_enc_done
1516
1517.align 16
1518.Lctr32_enc_done:
1519 movdqu 0x10($inp),$in1
1520 pxor $rndkey0,$in0 # input^=round[last]
1521 movdqu 0x20($inp),$in2
1522 pxor $rndkey0,$in1
1523 movdqu 0x30($inp),$in3
1524 pxor $rndkey0,$in2
1525 movdqu 0x40($inp),$in4
1526 pxor $rndkey0,$in3
1527 movdqu 0x50($inp),$in5
1528 pxor $rndkey0,$in4
1529 pxor $rndkey0,$in5
1530 aesenc $rndkey1,$inout0
1531 aesenc $rndkey1,$inout1
1532 aesenc $rndkey1,$inout2
1533 aesenc $rndkey1,$inout3
1534 aesenc $rndkey1,$inout4
1535 aesenc $rndkey1,$inout5
1536 aesenc $rndkey1,$inout6
1537 aesenc $rndkey1,$inout7
1538 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1539 lea 0x80($inp),$inp # $inp+=8*16
1540
1541 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1542 pxor $rndkey0,$rndkey1 # borrowed $rndkey
1543 movdqu 0x70-0x80($inp),$in0
1544 aesenclast $in1,$inout1
1545 pxor $rndkey0,$in0
1546 movdqa 0x00(%rsp),$in1 # load next counter block
1547 aesenclast $in2,$inout2
1548 aesenclast $in3,$inout3
1549 movdqa 0x10(%rsp),$in2
1550 movdqa 0x20(%rsp),$in3
1551 aesenclast $in4,$inout4
1552 aesenclast $in5,$inout5
1553 movdqa 0x30(%rsp),$in4
1554 movdqa 0x40(%rsp),$in5
1555 aesenclast $rndkey1,$inout6
1556 movdqa 0x50(%rsp),$rndkey0
1557 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
1558 aesenclast $in0,$inout7
1559
1560 movups $inout0,($out) # store 8 output blocks
1561 movdqa $in1,$inout0
1562 movups $inout1,0x10($out)
1563 movdqa $in2,$inout1
1564 movups $inout2,0x20($out)
1565 movdqa $in3,$inout2
1566 movups $inout3,0x30($out)
1567 movdqa $in4,$inout3
1568 movups $inout4,0x40($out)
1569 movdqa $in5,$inout4
1570 movups $inout5,0x50($out)
1571 movdqa $rndkey0,$inout5
1572 movups $inout6,0x60($out)
1573 movups $inout7,0x70($out)
1574 lea 0x80($out),$out # $out+=8*16
1575
1576 sub \$8,$len
1577 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1578
1579 add \$8,$len # restore real remaining $len
1580 jz .Lctr32_done # done if ($len==0)
1581 lea -0x80($key),$key
1582
1583.Lctr32_tail:
1584 # note that at this point $inout0..5 are populated with
1585 # counter values xor-ed with 0-round key
1586 lea 16($key),$key
1587 cmp \$4,$len
1588 jb .Lctr32_loop3
1589 je .Lctr32_loop4
1590
1591 # if ($len>4) compute 7 E(counter)
1592 shl \$4,$rounds
1593 movdqa 0x60(%rsp),$inout6
1594 pxor $inout7,$inout7
1595
1596 $movkey 16($key),$rndkey0
1597 aesenc $rndkey1,$inout0
1598 aesenc $rndkey1,$inout1
1599 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1600 neg %rax
1601 aesenc $rndkey1,$inout2
1602 add \$16,%rax # prepare for .Lenc_loop8_enter
1603 movups ($inp),$in0
1604 aesenc $rndkey1,$inout3
1605 aesenc $rndkey1,$inout4
1606 movups 0x10($inp),$in1 # pre-load input
1607 movups 0x20($inp),$in2
1608 aesenc $rndkey1,$inout5
1609 aesenc $rndkey1,$inout6
1610
1611 call .Lenc_loop8_enter
1612
1613 movdqu 0x30($inp),$in3
1614 pxor $in0,$inout0
1615 movdqu 0x40($inp),$in0
1616 pxor $in1,$inout1
1617 movdqu $inout0,($out) # store output
1618 pxor $in2,$inout2
1619 movdqu $inout1,0x10($out)
1620 pxor $in3,$inout3
1621 movdqu $inout2,0x20($out)
1622 pxor $in0,$inout4
1623 movdqu $inout3,0x30($out)
1624 movdqu $inout4,0x40($out)
1625 cmp \$6,$len
1626 jb .Lctr32_done # $len was 5, stop store
1627
1628 movups 0x50($inp),$in1
1629 xorps $in1,$inout5
1630 movups $inout5,0x50($out)
1631 je .Lctr32_done # $len was 6, stop store
1632
1633 movups 0x60($inp),$in2
1634 xorps $in2,$inout6
1635 movups $inout6,0x60($out)
1636 jmp .Lctr32_done # $len was 7, stop store
1637
1638.align 32
1639.Lctr32_loop4:
1640 aesenc $rndkey1,$inout0
1641 lea 16($key),$key
1642 dec $rounds
1643 aesenc $rndkey1,$inout1
1644 aesenc $rndkey1,$inout2
1645 aesenc $rndkey1,$inout3
1646 $movkey ($key),$rndkey1
1647 jnz .Lctr32_loop4
1648 aesenclast $rndkey1,$inout0
1649 aesenclast $rndkey1,$inout1
1650 movups ($inp),$in0 # load input
1651 movups 0x10($inp),$in1
1652 aesenclast $rndkey1,$inout2
1653 aesenclast $rndkey1,$inout3
1654 movups 0x20($inp),$in2
1655 movups 0x30($inp),$in3
1656
1657 xorps $in0,$inout0
1658 movups $inout0,($out) # store output
1659 xorps $in1,$inout1
1660 movups $inout1,0x10($out)
1661 pxor $in2,$inout2
1662 movdqu $inout2,0x20($out)
1663 pxor $in3,$inout3
1664 movdqu $inout3,0x30($out)
1665 jmp .Lctr32_done # $len was 4, stop store
1666
1667.align 32
1668.Lctr32_loop3:
1669 aesenc $rndkey1,$inout0
1670 lea 16($key),$key
1671 dec $rounds
1672 aesenc $rndkey1,$inout1
1673 aesenc $rndkey1,$inout2
1674 $movkey ($key),$rndkey1
1675 jnz .Lctr32_loop3
1676 aesenclast $rndkey1,$inout0
1677 aesenclast $rndkey1,$inout1
1678 aesenclast $rndkey1,$inout2
1679
1680 movups ($inp),$in0 # load input
1681 xorps $in0,$inout0
1682 movups $inout0,($out) # store output
1683 cmp \$2,$len
1684 jb .Lctr32_done # $len was 1, stop store
1685
1686 movups 0x10($inp),$in1
1687 xorps $in1,$inout1
1688 movups $inout1,0x10($out)
1689 je .Lctr32_done # $len was 2, stop store
1690
1691 movups 0x20($inp),$in2
1692 xorps $in2,$inout2
1693 movups $inout2,0x20($out) # $len was 3, stop store
1694
1695.Lctr32_done:
1696 xorps %xmm0,%xmm0 # clear register bank
1697 xor $key0,$key0
1698 pxor %xmm1,%xmm1
1699 pxor %xmm2,%xmm2
1700 pxor %xmm3,%xmm3
1701 pxor %xmm4,%xmm4
1702 pxor %xmm5,%xmm5
1703___
# Non-Win64 cleanup: zero %xmm6-%xmm15 as well, and overwrite the eight
# counter blocks at 0x00..0x70(%rsp) with the zeroed %xmm0.
1704$code.=<<___ if (!$win64);
1705 pxor %xmm6,%xmm6
1706 pxor %xmm7,%xmm7
1707 movaps %xmm0,0x00(%rsp) # clear stack
1708 pxor %xmm8,%xmm8
1709 movaps %xmm0,0x10(%rsp)
1710 pxor %xmm9,%xmm9
1711 movaps %xmm0,0x20(%rsp)
1712 pxor %xmm10,%xmm10
1713 movaps %xmm0,0x30(%rsp)
1714 pxor %xmm11,%xmm11
1715 movaps %xmm0,0x40(%rsp)
1716 pxor %xmm12,%xmm12
1717 movaps %xmm0,0x50(%rsp)
1718 pxor %xmm13,%xmm13
1719 movaps %xmm0,0x60(%rsp)
1720 pxor %xmm14,%xmm14
1721 movaps %xmm0,0x70(%rsp)
1722 pxor %xmm15,%xmm15
1723___
# Win64 cleanup: restore the caller's %xmm6-%xmm15 from the save area,
# overwriting each save slot with the zeroed %xmm0 right after it is read,
# then wipe the eight counter blocks at 0x00..0x70(%rsp).
1724$code.=<<___ if ($win64);
1725 movaps -0xa8($key_),%xmm6
1726 movaps %xmm0,-0xa8($key_) # clear stack
1727 movaps -0x98($key_),%xmm7
1728 movaps %xmm0,-0x98($key_)
1729 movaps -0x88($key_),%xmm8
1730 movaps %xmm0,-0x88($key_)
1731 movaps -0x78($key_),%xmm9
1732 movaps %xmm0,-0x78($key_)
1733 movaps -0x68($key_),%xmm10
1734 movaps %xmm0,-0x68($key_)
1735 movaps -0x58($key_),%xmm11
1736 movaps %xmm0,-0x58($key_)
1737 movaps -0x48($key_),%xmm12
1738 movaps %xmm0,-0x48($key_)
1739 movaps -0x38($key_),%xmm13
1740 movaps %xmm0,-0x38($key_)
1741 movaps -0x28($key_),%xmm14
1742 movaps %xmm0,-0x28($key_)
1743 movaps -0x18($key_),%xmm15
1744 movaps %xmm0,-0x18($key_)
1745 movaps %xmm0,0x00(%rsp)
1746 movaps %xmm0,0x10(%rsp)
1747 movaps %xmm0,0x20(%rsp)
1748 movaps %xmm0,0x30(%rsp)
1749 movaps %xmm0,0x40(%rsp)
1750 movaps %xmm0,0x50(%rsp)
1751 movaps %xmm0,0x60(%rsp)
1752 movaps %xmm0,0x70(%rsp)
1753___
# Frame teardown: reload the pushed %rbp from just below the frame pointer and
# restore %rsp from $key_.  .Lctr32_epilogue is also the landing point of the
# single-block fast path, which never built a frame.
1754$code.=<<___;
1755 mov -8($key_),%rbp
1756.cfi_restore %rbp
1757 lea ($key_),%rsp
1758.cfi_def_cfa_register %rsp
1759.Lctr32_epilogue:
1760 ret
1761.cfi_endproc
1762.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1763___
1764}
1765
1766
1767######################################################################
1768# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1769# const AES_KEY *key1, const AES_KEY *key2,
1770# const unsigned char iv[16]);
1771#
1772{
1773my @tweak=map("%xmm$_",(10..15));
1774my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1775my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1776my $frame_size = 0x70 + ($win64?160:0);
1777my $key_ = "%rbp"; # override so that we can use %r11 as FP
1778
1779$code.=<<___;
1780.globl aesni_xts_encrypt
1781.type aesni_xts_encrypt,\@function,6
1782.align 16
1783aesni_xts_encrypt:
1784.cfi_startproc
1785 lea (%rsp),%r11 # frame pointer
1786.cfi_def_cfa_register %r11
1787 push %rbp
1788.cfi_push %rbp
1789 sub \$$frame_size,%rsp
1790 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1791___
1792$code.=<<___ if ($win64);
1793 movaps %xmm6,-0xa8(%r11) # offload everything
1794 movaps %xmm7,-0x98(%r11)
1795 movaps %xmm8,-0x88(%r11)
1796 movaps %xmm9,-0x78(%r11)
1797 movaps %xmm10,-0x68(%r11)
1798 movaps %xmm11,-0x58(%r11)
1799 movaps %xmm12,-0x48(%r11)
1800 movaps %xmm13,-0x38(%r11)
1801 movaps %xmm14,-0x28(%r11)
1802 movaps %xmm15,-0x18(%r11)
1803.Lxts_enc_body:
1804___
1805$code.=<<___;
1806 movups ($ivp),$inout0 # load clear-text tweak
1807 mov 240(%r8),$rounds # key2->rounds
1808 mov 240($key),$rnds_ # key1->rounds
1809___
1810 # generate the tweak
1811 &aesni_generate1("enc",$key2,$rounds,$inout0);
1812$code.=<<___;
1813 $movkey ($key),$rndkey0 # zero round key
1814 mov $key,$key_ # backup $key
1815 mov $rnds_,$rounds # backup $rounds
1816 shl \$4,$rnds_
1817 mov $len,$len_ # backup $len
1818 and \$-16,$len
1819
1820 $movkey 16($key,$rnds_),$rndkey1 # last round key
1821
1822 movdqa .Lxts_magic(%rip),$twmask
1823 movdqa $inout0,@tweak[5]
1824 pshufd \$0x5f,$inout0,$twres
1825 pxor $rndkey0,$rndkey1
1826___
1827 # alternative tweak calculation algorithm is based on suggestions
1828 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1829 # and should help in the future...
1830 for ($i=0;$i<4;$i++) {
1831 $code.=<<___;
1832 movdqa $twres,$twtmp
1833 paddd $twres,$twres
1834 movdqa @tweak[5],@tweak[$i]
1835 psrad \$31,$twtmp # broadcast upper bits
1836 paddq @tweak[5],@tweak[5]
1837 pand $twmask,$twtmp
1838 pxor $rndkey0,@tweak[$i]
1839 pxor $twtmp,@tweak[5]
1840___
1841 }
1842$code.=<<___;
1843 movdqa @tweak[5],@tweak[4]
1844 psrad \$31,$twres
1845 paddq @tweak[5],@tweak[5]
1846 pand $twmask,$twres
1847 pxor $rndkey0,@tweak[4]
1848 pxor $twres,@tweak[5]
1849 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1850
1851 sub \$16*6,$len
1852 jc .Lxts_enc_short # if $len-=6*16 borrowed
1853
1854 mov \$16+96,$rounds
1855 lea 32($key_,$rnds_),$key # end of key schedule
1856 sub %r10,%rax # twisted $rounds
1857 $movkey 16($key_),$rndkey1
1858 mov %rax,%r10 # backup twisted $rounds
1859 lea .Lxts_magic(%rip),%r8
1860 jmp .Lxts_enc_grandloop
1861
1862.align 32
1863.Lxts_enc_grandloop:
1864 movdqu `16*0`($inp),$inout0 # load input
1865 movdqa $rndkey0,$twmask
1866 movdqu `16*1`($inp),$inout1
1867 pxor @tweak[0],$inout0 # input^=tweak^round[0]
1868 movdqu `16*2`($inp),$inout2
1869 pxor @tweak[1],$inout1
1870 aesenc $rndkey1,$inout0
1871 movdqu `16*3`($inp),$inout3
1872 pxor @tweak[2],$inout2
1873 aesenc $rndkey1,$inout1
1874 movdqu `16*4`($inp),$inout4
1875 pxor @tweak[3],$inout3
1876 aesenc $rndkey1,$inout2
1877 movdqu `16*5`($inp),$inout5
1878 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1879 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
1880 pxor @tweak[4],$inout4
1881 aesenc $rndkey1,$inout3
1882 $movkey 32($key_),$rndkey0
1883 lea `16*6`($inp),$inp
1884 pxor $twmask,$inout5
1885
1886 pxor $twres,@tweak[0] # calculate tweaks^round[last]
1887 aesenc $rndkey1,$inout4
1888 pxor $twres,@tweak[1]
1889 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
1890 aesenc $rndkey1,$inout5
1891 $movkey 48($key_),$rndkey1
1892 pxor $twres,@tweak[2]
1893
1894 aesenc $rndkey0,$inout0
1895 pxor $twres,@tweak[3]
1896 movdqa @tweak[1],`16*1`(%rsp)
1897 aesenc $rndkey0,$inout1
1898 pxor $twres,@tweak[4]
1899 movdqa @tweak[2],`16*2`(%rsp)
1900 aesenc $rndkey0,$inout2
1901 aesenc $rndkey0,$inout3
1902 pxor $twres,$twmask
1903 movdqa @tweak[4],`16*4`(%rsp)
1904 aesenc $rndkey0,$inout4
1905 aesenc $rndkey0,$inout5
1906 $movkey 64($key_),$rndkey0
1907 movdqa $twmask,`16*5`(%rsp)
1908 pshufd \$0x5f,@tweak[5],$twres
1909 jmp .Lxts_enc_loop6
1910.align 32
1911.Lxts_enc_loop6:
1912 aesenc $rndkey1,$inout0
1913 aesenc $rndkey1,$inout1
1914 aesenc $rndkey1,$inout2
1915 aesenc $rndkey1,$inout3
1916 aesenc $rndkey1,$inout4
1917 aesenc $rndkey1,$inout5
1918 $movkey -64($key,%rax),$rndkey1
1919 add \$32,%rax
1920
1921 aesenc $rndkey0,$inout0
1922 aesenc $rndkey0,$inout1
1923 aesenc $rndkey0,$inout2
1924 aesenc $rndkey0,$inout3
1925 aesenc $rndkey0,$inout4
1926 aesenc $rndkey0,$inout5
1927 $movkey -80($key,%rax),$rndkey0
1928 jnz .Lxts_enc_loop6
1929
1930 movdqa (%r8),$twmask # start calculating next tweak
1931 movdqa $twres,$twtmp
1932 paddd $twres,$twres
1933 aesenc $rndkey1,$inout0
1934 paddq @tweak[5],@tweak[5]
1935 psrad \$31,$twtmp
1936 aesenc $rndkey1,$inout1
1937 pand $twmask,$twtmp
1938 $movkey ($key_),@tweak[0] # load round[0]
1939 aesenc $rndkey1,$inout2
1940 aesenc $rndkey1,$inout3
1941 aesenc $rndkey1,$inout4
1942 pxor $twtmp,@tweak[5]
1943 movaps @tweak[0],@tweak[1] # copy round[0]
1944 aesenc $rndkey1,$inout5
1945 $movkey -64($key),$rndkey1
1946
1947 movdqa $twres,$twtmp
1948 aesenc $rndkey0,$inout0
1949 paddd $twres,$twres
1950 pxor @tweak[5],@tweak[0]
1951 aesenc $rndkey0,$inout1
1952 psrad \$31,$twtmp
1953 paddq @tweak[5],@tweak[5]
1954 aesenc $rndkey0,$inout2
1955 aesenc $rndkey0,$inout3
1956 pand $twmask,$twtmp
1957 movaps @tweak[1],@tweak[2]
1958 aesenc $rndkey0,$inout4
1959 pxor $twtmp,@tweak[5]
1960 movdqa $twres,$twtmp
1961 aesenc $rndkey0,$inout5
1962 $movkey -48($key),$rndkey0
1963
1964 paddd $twres,$twres
1965 aesenc $rndkey1,$inout0
1966 pxor @tweak[5],@tweak[1]
1967 psrad \$31,$twtmp
1968 aesenc $rndkey1,$inout1
1969 paddq @tweak[5],@tweak[5]
1970 pand $twmask,$twtmp
1971 aesenc $rndkey1,$inout2
1972 aesenc $rndkey1,$inout3
1973 movdqa @tweak[3],`16*3`(%rsp)
1974 pxor $twtmp,@tweak[5]
1975 aesenc $rndkey1,$inout4
1976 movaps @tweak[2],@tweak[3]
1977 movdqa $twres,$twtmp
1978 aesenc $rndkey1,$inout5
1979 $movkey -32($key),$rndkey1
1980
1981 paddd $twres,$twres
1982 aesenc $rndkey0,$inout0
1983 pxor @tweak[5],@tweak[2]
1984 psrad \$31,$twtmp
1985 aesenc $rndkey0,$inout1
1986 paddq @tweak[5],@tweak[5]
1987 pand $twmask,$twtmp
1988 aesenc $rndkey0,$inout2
1989 aesenc $rndkey0,$inout3
1990 aesenc $rndkey0,$inout4
1991 pxor $twtmp,@tweak[5]
1992 movaps @tweak[3],@tweak[4]
1993 aesenc $rndkey0,$inout5
1994
1995 movdqa $twres,$rndkey0
1996 paddd $twres,$twres
1997 aesenc $rndkey1,$inout0
1998 pxor @tweak[5],@tweak[3]
1999 psrad \$31,$rndkey0
2000 aesenc $rndkey1,$inout1
2001 paddq @tweak[5],@tweak[5]
2002 pand $twmask,$rndkey0
2003 aesenc $rndkey1,$inout2
2004 aesenc $rndkey1,$inout3
2005 pxor $rndkey0,@tweak[5]
2006 $movkey ($key_),$rndkey0
2007 aesenc $rndkey1,$inout4
2008 aesenc $rndkey1,$inout5
2009 $movkey 16($key_),$rndkey1
2010
2011 pxor @tweak[5],@tweak[4]
2012 aesenclast `16*0`(%rsp),$inout0
2013 psrad \$31,$twres
2014 paddq @tweak[5],@tweak[5]
2015 aesenclast `16*1`(%rsp),$inout1
2016 aesenclast `16*2`(%rsp),$inout2
2017 pand $twmask,$twres
2018 mov %r10,%rax # restore $rounds
2019 aesenclast `16*3`(%rsp),$inout3
2020 aesenclast `16*4`(%rsp),$inout4
2021 aesenclast `16*5`(%rsp),$inout5
2022 pxor $twres,@tweak[5]
2023
2024 lea `16*6`($out),$out # $out+=6*16
2025 movups $inout0,`-16*6`($out) # store 6 output blocks
2026 movups $inout1,`-16*5`($out)
2027 movups $inout2,`-16*4`($out)
2028 movups $inout3,`-16*3`($out)
2029 movups $inout4,`-16*2`($out)
2030 movups $inout5,`-16*1`($out)
2031 sub \$16*6,$len
2032 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
2033
2034 mov \$16+96,$rounds
2035 sub $rnds_,$rounds
2036 mov $key_,$key # restore $key
2037 shr \$4,$rounds # restore original value
2038
2039.Lxts_enc_short:
2040 # at the point @tweak[0..5] are populated with tweak values
2041 mov $rounds,$rnds_ # backup $rounds
2042 pxor $rndkey0,@tweak[0]
2043 add \$16*6,$len # restore real remaining $len
2044 jz .Lxts_enc_done # done if ($len==0)
2045
2046 pxor $rndkey0,@tweak[1]
2047 cmp \$0x20,$len
2048 jb .Lxts_enc_one # $len is 1*16
2049 pxor $rndkey0,@tweak[2]
2050 je .Lxts_enc_two # $len is 2*16
2051
2052 pxor $rndkey0,@tweak[3]
2053 cmp \$0x40,$len
2054 jb .Lxts_enc_three # $len is 3*16
2055 pxor $rndkey0,@tweak[4]
2056 je .Lxts_enc_four # $len is 4*16
2057
2058 movdqu ($inp),$inout0 # $len is 5*16
2059 movdqu 16*1($inp),$inout1
2060 movdqu 16*2($inp),$inout2
2061 pxor @tweak[0],$inout0
2062 movdqu 16*3($inp),$inout3
2063 pxor @tweak[1],$inout1
2064 movdqu 16*4($inp),$inout4
2065 lea 16*5($inp),$inp # $inp+=5*16
2066 pxor @tweak[2],$inout2
2067 pxor @tweak[3],$inout3
2068 pxor @tweak[4],$inout4
2069 pxor $inout5,$inout5
2070
2071 call _aesni_encrypt6
2072
2073 xorps @tweak[0],$inout0
2074 movdqa @tweak[5],@tweak[0]
2075 xorps @tweak[1],$inout1
2076 xorps @tweak[2],$inout2
2077 movdqu $inout0,($out) # store 5 output blocks
2078 xorps @tweak[3],$inout3
2079 movdqu $inout1,16*1($out)
2080 xorps @tweak[4],$inout4
2081 movdqu $inout2,16*2($out)
2082 movdqu $inout3,16*3($out)
2083 movdqu $inout4,16*4($out)
2084 lea 16*5($out),$out # $out+=5*16
2085 jmp .Lxts_enc_done
2086
2087.align 16
2088.Lxts_enc_one:
2089 movups ($inp),$inout0
2090 lea 16*1($inp),$inp # inp+=1*16
2091 xorps @tweak[0],$inout0
2092___
2093 &aesni_generate1("enc",$key,$rounds);
2094$code.=<<___;
2095 xorps @tweak[0],$inout0
2096 movdqa @tweak[1],@tweak[0]
2097 movups $inout0,($out) # store one output block
2098 lea 16*1($out),$out # $out+=1*16
2099 jmp .Lxts_enc_done
2100
2101.align 16
2102.Lxts_enc_two:
2103 movups ($inp),$inout0
2104 movups 16($inp),$inout1
2105 lea 32($inp),$inp # $inp+=2*16
2106 xorps @tweak[0],$inout0
2107 xorps @tweak[1],$inout1
2108
2109 call _aesni_encrypt2
2110
2111 xorps @tweak[0],$inout0
2112 movdqa @tweak[2],@tweak[0]
2113 xorps @tweak[1],$inout1
2114 movups $inout0,($out) # store 2 output blocks
2115 movups $inout1,16*1($out)
2116 lea 16*2($out),$out # $out+=2*16
2117 jmp .Lxts_enc_done
2118
2119.align 16
2120.Lxts_enc_three:
2121 movups ($inp),$inout0
2122 movups 16*1($inp),$inout1
2123 movups 16*2($inp),$inout2
2124 lea 16*3($inp),$inp # $inp+=3*16
2125 xorps @tweak[0],$inout0
2126 xorps @tweak[1],$inout1
2127 xorps @tweak[2],$inout2
2128
2129 call _aesni_encrypt3
2130
2131 xorps @tweak[0],$inout0
2132 movdqa @tweak[3],@tweak[0]
2133 xorps @tweak[1],$inout1
2134 xorps @tweak[2],$inout2
2135 movups $inout0,($out) # store 3 output blocks
2136 movups $inout1,16*1($out)
2137 movups $inout2,16*2($out)
2138 lea 16*3($out),$out # $out+=3*16
2139 jmp .Lxts_enc_done
2140
2141.align 16
2142.Lxts_enc_four:
2143 movups ($inp),$inout0
2144 movups 16*1($inp),$inout1
2145 movups 16*2($inp),$inout2
2146 xorps @tweak[0],$inout0
2147 movups 16*3($inp),$inout3
2148 lea 16*4($inp),$inp # $inp+=4*16
2149 xorps @tweak[1],$inout1
2150 xorps @tweak[2],$inout2
2151 xorps @tweak[3],$inout3
2152
2153 call _aesni_encrypt4
2154
2155 pxor @tweak[0],$inout0
2156 movdqa @tweak[4],@tweak[0]
2157 pxor @tweak[1],$inout1
2158 pxor @tweak[2],$inout2
2159 movdqu $inout0,($out) # store 4 output blocks
2160 pxor @tweak[3],$inout3
2161 movdqu $inout1,16*1($out)
2162 movdqu $inout2,16*2($out)
2163 movdqu $inout3,16*3($out)
2164 lea 16*4($out),$out # $out+=4*16
2165 jmp .Lxts_enc_done
2166
2167.align 16
2168.Lxts_enc_done:
2169 and \$15,$len_ # see if $len%16 is 0
2170 jz .Lxts_enc_ret
2171 mov $len_,$len
2172
2173.Lxts_enc_steal:
2174 movzb ($inp),%eax # borrow $rounds ...
2175 movzb -16($out),%ecx # ... and $key
2176 lea 1($inp),$inp
2177 mov %al,-16($out)
2178 mov %cl,0($out)
2179 lea 1($out),$out
2180 sub \$1,$len
2181 jnz .Lxts_enc_steal
2182
2183 sub $len_,$out # rewind $out
2184 mov $key_,$key # restore $key
2185 mov $rnds_,$rounds # restore $rounds
2186
2187 movups -16($out),$inout0
2188 xorps @tweak[0],$inout0
2189___
2190 &aesni_generate1("enc",$key,$rounds);
2191$code.=<<___;
2192 xorps @tweak[0],$inout0
2193 movups $inout0,-16($out)
2194
2195.Lxts_enc_ret:
2196 xorps %xmm0,%xmm0 # clear register bank
2197 pxor %xmm1,%xmm1
2198 pxor %xmm2,%xmm2
2199 pxor %xmm3,%xmm3
2200 pxor %xmm4,%xmm4
2201 pxor %xmm5,%xmm5
2202___
2203$code.=<<___ if (!$win64);
2204 pxor %xmm6,%xmm6
2205 pxor %xmm7,%xmm7
2206 movaps %xmm0,0x00(%rsp) # clear stack
2207 pxor %xmm8,%xmm8
2208 movaps %xmm0,0x10(%rsp)
2209 pxor %xmm9,%xmm9
2210 movaps %xmm0,0x20(%rsp)
2211 pxor %xmm10,%xmm10
2212 movaps %xmm0,0x30(%rsp)
2213 pxor %xmm11,%xmm11
2214 movaps %xmm0,0x40(%rsp)
2215 pxor %xmm12,%xmm12
2216 movaps %xmm0,0x50(%rsp)
2217 pxor %xmm13,%xmm13
2218 movaps %xmm0,0x60(%rsp)
2219 pxor %xmm14,%xmm14
2220 pxor %xmm15,%xmm15
2221___
2222$code.=<<___ if ($win64);
2223 movaps -0xa8(%r11),%xmm6
2224 movaps %xmm0,-0xa8(%r11) # clear stack
2225 movaps -0x98(%r11),%xmm7
2226 movaps %xmm0,-0x98(%r11)
2227 movaps -0x88(%r11),%xmm8
2228 movaps %xmm0,-0x88(%r11)
2229 movaps -0x78(%r11),%xmm9
2230 movaps %xmm0,-0x78(%r11)
2231 movaps -0x68(%r11),%xmm10
2232 movaps %xmm0,-0x68(%r11)
2233 movaps -0x58(%r11),%xmm11
2234 movaps %xmm0,-0x58(%r11)
2235 movaps -0x48(%r11),%xmm12
2236 movaps %xmm0,-0x48(%r11)
2237 movaps -0x38(%r11),%xmm13
2238 movaps %xmm0,-0x38(%r11)
2239 movaps -0x28(%r11),%xmm14
2240 movaps %xmm0,-0x28(%r11)
2241 movaps -0x18(%r11),%xmm15
2242 movaps %xmm0,-0x18(%r11)
2243 movaps %xmm0,0x00(%rsp)
2244 movaps %xmm0,0x10(%rsp)
2245 movaps %xmm0,0x20(%rsp)
2246 movaps %xmm0,0x30(%rsp)
2247 movaps %xmm0,0x40(%rsp)
2248 movaps %xmm0,0x50(%rsp)
2249 movaps %xmm0,0x60(%rsp)
2250___
2251$code.=<<___;
2252 mov -8(%r11),%rbp
2253.cfi_restore %rbp
2254 lea (%r11),%rsp
2255.cfi_def_cfa_register %rsp
2256.Lxts_enc_epilogue:
2257 ret
2258.cfi_endproc
2259.size aesni_xts_encrypt,.-aesni_xts_encrypt
2260___
2261
2262$code.=<<___;
# aesni_xts_decrypt: XTS-mode decryption entry point.
# The tweak block loaded from ($ivp) is first encrypted with $key2. After
# that $len bytes are decrypted from $inp to $out using $key: six blocks per
# pass in .Lxts_dec_grandloop, one to five blocks in the .Lxts_dec_short
# tails, and, when $len%16 is not zero, a final partial block through the
# stealing code at .Lxts_dec_done2 and .Lxts_dec_steal.
# Vector registers and the stack scratch area are wiped before returning.
2263.globl aesni_xts_decrypt
2264.type aesni_xts_decrypt,\@function,6
2265.align 16
2266aesni_xts_decrypt:
2267.cfi_startproc
2268 lea (%rsp),%r11 # frame pointer: entry %rsp, used by the epilogue
2269.cfi_def_cfa_register %r11
2270 push %rbp
2271.cfi_push %rbp
2272 sub \$$frame_size,%rsp # reserve the stack scratch area
2273 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
2274___
2275$code.=<<___ if ($win64); # Win64 only: preserve xmm6-xmm15, restored in the epilogue
2276 movaps %xmm6,-0xa8(%r11) # offload everything
2277 movaps %xmm7,-0x98(%r11)
2278 movaps %xmm8,-0x88(%r11)
2279 movaps %xmm9,-0x78(%r11)
2280 movaps %xmm10,-0x68(%r11)
2281 movaps %xmm11,-0x58(%r11)
2282 movaps %xmm12,-0x48(%r11)
2283 movaps %xmm13,-0x38(%r11)
2284 movaps %xmm14,-0x28(%r11)
2285 movaps %xmm15,-0x18(%r11)
2286.Lxts_dec_body:
2287___
2288$code.=<<___;
2289 movups ($ivp),$inout0 # load clear-text tweak
2290 mov 240($key2),$rounds # key2->rounds
2291 mov 240($key),$rnds_ # key1->rounds
2292___
2293 # generate the tweak: "enc" pass over $inout0 with $key2 (code emitted by aesni_generate1)
2294 &aesni_generate1("enc",$key2,$rounds,$inout0);
2295$code.=<<___;
2296 xor %eax,%eax # if ($len%16) len-=16;
2297 test \$15,$len
2298 setnz %al
2299 shl \$4,%rax
2300 sub %rax,$len # one full block is held back for the stealing tail
2301
2302 $movkey ($key),$rndkey0 # zero round key
2303 mov $key,$key_ # backup $key
2304 mov $rnds_,$rounds # backup $rounds
2305 shl \$4,$rnds_ # rounds*16: byte offset into the key schedule
2306 mov $len,$len_ # backup $len
2307 and \$-16,$len # whole blocks only, tail length stays in $len_
2308
2309 $movkey 16($key,$rnds_),$rndkey1 # last round key
2310
2311 movdqa .Lxts_magic(%rip),$twmask # mask applied to the broadcast carry bits
2312 movdqa $inout0,@tweak[5] # @tweak[5] is the running tweak
2313 pshufd \$0x5f,$inout0,$twres # shuffled copy used to derive the carries
2314 pxor $rndkey0,$rndkey1 # round[0]^round[last]
2315___
2316 for ($i=0;$i<4;$i++) { # emitted four times: fills @tweak[0..3]
2317 $code.=<<___;
2318 movdqa $twres,$twtmp
2319 paddd $twres,$twres
2320 movdqa @tweak[5],@tweak[$i] # hand out the current tweak
2321 psrad \$31,$twtmp # broadcast upper bits
2322 paddq @tweak[5],@tweak[5] # double both 64-bit halves
2323 pand $twmask,$twtmp
2324 pxor $rndkey0,@tweak[$i] # stored tweak is pre-xored with round[0]
2325 pxor $twtmp,@tweak[5] # fold the carries in: next tweak
2326___
2327 }
2328$code.=<<___;
2329 movdqa @tweak[5],@tweak[4]
2330 psrad \$31,$twres
2331 paddq @tweak[5],@tweak[5]
2332 pand $twmask,$twres
2333 pxor $rndkey0,@tweak[4] # @tweak[0..4] carry round[0], @tweak[5] does not
2334 pxor $twres,@tweak[5]
2335 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2336
2337 sub \$16*6,$len
2338 jc .Lxts_dec_short # if $len-=6*16 borrowed
2339
2340 mov \$16+96,$rounds # with the sub below: 16+96-16*rounds
2341 lea 32($key_,$rnds_),$key # end of key schedule
2342 sub %r10,%rax # twisted $rounds
2343 $movkey 16($key_),$rndkey1
2344 mov %rax,%r10 # backup twisted $rounds
2345 lea .Lxts_magic(%rip),%r8 # %r8 points at the mask for reloads in the loop
2346 jmp .Lxts_dec_grandloop
2347
2348.align 32
2349.Lxts_dec_grandloop: # six blocks per pass
2350 movdqu `16*0`($inp),$inout0 # load input
2351 movdqa $rndkey0,$twmask # $twmask temporarily carries round[0]
2352 movdqu `16*1`($inp),$inout1
2353 pxor @tweak[0],$inout0 # input^=tweak^round[0]
2354 movdqu `16*2`($inp),$inout2
2355 pxor @tweak[1],$inout1
2356 aesdec $rndkey1,$inout0
2357 movdqu `16*3`($inp),$inout3
2358 pxor @tweak[2],$inout2
2359 aesdec $rndkey1,$inout1
2360 movdqu `16*4`($inp),$inout4
2361 pxor @tweak[3],$inout3
2362 aesdec $rndkey1,$inout2
2363 movdqu `16*5`($inp),$inout5
2364 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2365 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
2366 pxor @tweak[4],$inout4
2367 aesdec $rndkey1,$inout3
2368 $movkey 32($key_),$rndkey0
2369 lea `16*6`($inp),$inp
2370 pxor $twmask,$inout5 # input^=tweak[5]^round[0]
2371
2372 pxor $twres,@tweak[0] # calculate tweaks^round[last]
2373 aesdec $rndkey1,$inout4
2374 pxor $twres,@tweak[1]
2375 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
2376 aesdec $rndkey1,$inout5
2377 $movkey 48($key_),$rndkey1
2378 pxor $twres,@tweak[2]
2379
2380 aesdec $rndkey0,$inout0
2381 pxor $twres,@tweak[3] # stored to its stack slot later, after loop6
2382 movdqa @tweak[1],`16*1`(%rsp)
2383 aesdec $rndkey0,$inout1
2384 pxor $twres,@tweak[4]
2385 movdqa @tweak[2],`16*2`(%rsp)
2386 aesdec $rndkey0,$inout2
2387 aesdec $rndkey0,$inout3
2388 pxor $twres,$twmask # tweak[5]^round[last]
2389 movdqa @tweak[4],`16*4`(%rsp)
2390 aesdec $rndkey0,$inout4
2391 aesdec $rndkey0,$inout5
2392 $movkey 64($key_),$rndkey0
2393 movdqa $twmask,`16*5`(%rsp)
2394 pshufd \$0x5f,@tweak[5],$twres # carry source for the next six tweaks
2395 jmp .Lxts_dec_loop6
2396.align 32
2397.Lxts_dec_loop6: # middle rounds, two round keys per iteration
2398 aesdec $rndkey1,$inout0
2399 aesdec $rndkey1,$inout1
2400 aesdec $rndkey1,$inout2
2401 aesdec $rndkey1,$inout3
2402 aesdec $rndkey1,$inout4
2403 aesdec $rndkey1,$inout5
2404 $movkey -64($key,%rax),$rndkey1
2405 add \$32,%rax # negative index counts up, zero ends the loop
2406
2407 aesdec $rndkey0,$inout0
2408 aesdec $rndkey0,$inout1
2409 aesdec $rndkey0,$inout2
2410 aesdec $rndkey0,$inout3
2411 aesdec $rndkey0,$inout4
2412 aesdec $rndkey0,$inout5
2413 $movkey -80($key,%rax),$rndkey0
2414 jnz .Lxts_dec_loop6
2415
2416 movdqa (%r8),$twmask # start calculating next tweak
2417 movdqa $twres,$twtmp
2418 paddd $twres,$twres
2419 aesdec $rndkey1,$inout0
2420 paddq @tweak[5],@tweak[5]
2421 psrad \$31,$twtmp
2422 aesdec $rndkey1,$inout1
2423 pand $twmask,$twtmp
2424 $movkey ($key_),@tweak[0] # load round[0]
2425 aesdec $rndkey1,$inout2
2426 aesdec $rndkey1,$inout3
2427 aesdec $rndkey1,$inout4
2428 pxor $twtmp,@tweak[5]
2429 movaps @tweak[0],@tweak[1] # copy round[0]
2430 aesdec $rndkey1,$inout5
2431 $movkey -64($key),$rndkey1
2432
2433 movdqa $twres,$twtmp
2434 aesdec $rndkey0,$inout0
2435 paddd $twres,$twres
2436 pxor @tweak[5],@tweak[0] # next tweak[0]^round[0]
2437 aesdec $rndkey0,$inout1
2438 psrad \$31,$twtmp
2439 paddq @tweak[5],@tweak[5]
2440 aesdec $rndkey0,$inout2
2441 aesdec $rndkey0,$inout3
2442 pand $twmask,$twtmp
2443 movaps @tweak[1],@tweak[2] # copy round[0]
2444 aesdec $rndkey0,$inout4
2445 pxor $twtmp,@tweak[5]
2446 movdqa $twres,$twtmp
2447 aesdec $rndkey0,$inout5
2448 $movkey -48($key),$rndkey0
2449
2450 paddd $twres,$twres
2451 aesdec $rndkey1,$inout0
2452 pxor @tweak[5],@tweak[1] # next tweak[1]^round[0]
2453 psrad \$31,$twtmp
2454 aesdec $rndkey1,$inout1
2455 paddq @tweak[5],@tweak[5]
2456 pand $twmask,$twtmp
2457 aesdec $rndkey1,$inout2
2458 aesdec $rndkey1,$inout3
2459 movdqa @tweak[3],`16*3`(%rsp) # old tweak[3]^round[last], saved before the register is reused
2460 pxor $twtmp,@tweak[5]
2461 aesdec $rndkey1,$inout4
2462 movaps @tweak[2],@tweak[3] # copy round[0]
2463 movdqa $twres,$twtmp
2464 aesdec $rndkey1,$inout5
2465 $movkey -32($key),$rndkey1
2466
2467 paddd $twres,$twres
2468 aesdec $rndkey0,$inout0
2469 pxor @tweak[5],@tweak[2] # next tweak[2]^round[0]
2470 psrad \$31,$twtmp
2471 aesdec $rndkey0,$inout1
2472 paddq @tweak[5],@tweak[5]
2473 pand $twmask,$twtmp
2474 aesdec $rndkey0,$inout2
2475 aesdec $rndkey0,$inout3
2476 aesdec $rndkey0,$inout4
2477 pxor $twtmp,@tweak[5]
2478 movaps @tweak[3],@tweak[4] # copy round[0]
2479 aesdec $rndkey0,$inout5
2480
2481 movdqa $twres,$rndkey0 # $rndkey0 doubles as carry temp until reloaded below
2482 paddd $twres,$twres
2483 aesdec $rndkey1,$inout0
2484 pxor @tweak[5],@tweak[3] # next tweak[3]^round[0]
2485 psrad \$31,$rndkey0
2486 aesdec $rndkey1,$inout1
2487 paddq @tweak[5],@tweak[5]
2488 pand $twmask,$rndkey0
2489 aesdec $rndkey1,$inout2
2490 aesdec $rndkey1,$inout3
2491 pxor $rndkey0,@tweak[5]
2492 $movkey ($key_),$rndkey0 # round[0] again, for the next pass or the short tail
2493 aesdec $rndkey1,$inout4
2494 aesdec $rndkey1,$inout5
2495 $movkey 16($key_),$rndkey1
2496
2497 pxor @tweak[5],@tweak[4] # next tweak[4]^round[0]; final round below uses the stack slots (tweak^round[last])
2498 aesdeclast `16*0`(%rsp),$inout0
2499 psrad \$31,$twres
2500 paddq @tweak[5],@tweak[5]
2501 aesdeclast `16*1`(%rsp),$inout1
2502 aesdeclast `16*2`(%rsp),$inout2
2503 pand $twmask,$twres
2504 mov %r10,%rax # restore $rounds
2505 aesdeclast `16*3`(%rsp),$inout3
2506 aesdeclast `16*4`(%rsp),$inout4
2507 aesdeclast `16*5`(%rsp),$inout5
2508 pxor $twres,@tweak[5]
2509
2510 lea `16*6`($out),$out # $out+=6*16
2511 movups $inout0,`-16*6`($out) # store 6 output blocks
2512 movups $inout1,`-16*5`($out)
2513 movups $inout2,`-16*4`($out)
2514 movups $inout3,`-16*3`($out)
2515 movups $inout4,`-16*2`($out)
2516 movups $inout5,`-16*1`($out)
2517 sub \$16*6,$len
2518 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
2519
2520 mov \$16+96,$rounds # undo the twist: 16+96 minus twisted value ...
2521 sub $rnds_,$rounds
2522 mov $key_,$key # restore $key
2523 shr \$4,$rounds # restore original value
2524
2525.Lxts_dec_short:
2526 # at the point @tweak[0..5] are populated with tweak values
2527 mov $rounds,$rnds_ # backup $rounds
2528 pxor $rndkey0,@tweak[0] # strip round[0] from the tweaks that get used
2529 pxor $rndkey0,@tweak[1]
2530 add \$16*6,$len # restore real remaining $len
2531 jz .Lxts_dec_done # done if ($len==0)
2532
2533 pxor $rndkey0,@tweak[2]
2534 cmp \$0x20,$len
2535 jb .Lxts_dec_one # $len is 1*16
2536 pxor $rndkey0,@tweak[3]
2537 je .Lxts_dec_two # $len is 2*16
2538
2539 pxor $rndkey0,@tweak[4]
2540 cmp \$0x40,$len
2541 jb .Lxts_dec_three # $len is 3*16
2542 je .Lxts_dec_four # $len is 4*16
2543
2544 movdqu ($inp),$inout0 # $len is 5*16
2545 movdqu 16*1($inp),$inout1
2546 movdqu 16*2($inp),$inout2
2547 pxor @tweak[0],$inout0
2548 movdqu 16*3($inp),$inout3
2549 pxor @tweak[1],$inout1
2550 movdqu 16*4($inp),$inout4
2551 lea 16*5($inp),$inp # $inp+=5*16
2552 pxor @tweak[2],$inout2
2553 pxor @tweak[3],$inout3
2554 pxor @tweak[4],$inout4
2555
2556 call _aesni_decrypt6 # sixth lane is not loaded here and its result is not stored
2557
2558 xorps @tweak[0],$inout0
2559 xorps @tweak[1],$inout1
2560 xorps @tweak[2],$inout2
2561 movdqu $inout0,($out) # store 5 output blocks
2562 xorps @tweak[3],$inout3
2563 movdqu $inout1,16*1($out)
2564 xorps @tweak[4],$inout4
2565 movdqu $inout2,16*2($out)
2566 pxor $twtmp,$twtmp
2567 movdqu $inout3,16*3($out)
2568 pcmpgtd @tweak[5],$twtmp # all-ones in each dword where the tweak dword is negative
2569 movdqu $inout4,16*4($out)
2570 lea 16*5($out),$out # $out+=5*16
2571 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2572 and \$15,$len_
2573 jz .Lxts_dec_ret # no partial block left
2574
2575 movdqa @tweak[5],@tweak[0] # tweak applied after the steal loop
2576 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2577 pand $twmask,@tweak[1] # isolate carry and residue
2578 pxor @tweak[5],@tweak[1] # following tweak, applied first at .Lxts_dec_done2
2579 jmp .Lxts_dec_done2
2580
2581.align 16
2582.Lxts_dec_one:
2583 movups ($inp),$inout0
2584 lea 16*1($inp),$inp # $inp+=1*16
2585 xorps @tweak[0],$inout0
2586___
2587 &aesni_generate1("dec",$key,$rounds);
2588$code.=<<___;
2589 xorps @tweak[0],$inout0
2590 movdqa @tweak[1],@tweak[0] # next two tweaks move down for the stealing tail
2591 movups $inout0,($out) # store one output block
2592 movdqa @tweak[2],@tweak[1]
2593 lea 16*1($out),$out # $out+=1*16
2594 jmp .Lxts_dec_done
2595
2596.align 16
2597.Lxts_dec_two:
2598 movups ($inp),$inout0
2599 movups 16($inp),$inout1
2600 lea 32($inp),$inp # $inp+=2*16
2601 xorps @tweak[0],$inout0
2602 xorps @tweak[1],$inout1
2603
2604 call _aesni_decrypt2
2605
2606 xorps @tweak[0],$inout0
2607 movdqa @tweak[2],@tweak[0] # next two tweaks move down for the stealing tail
2608 xorps @tweak[1],$inout1
2609 movdqa @tweak[3],@tweak[1]
2610 movups $inout0,($out) # store 2 output blocks
2611 movups $inout1,16*1($out)
2612 lea 16*2($out),$out # $out+=2*16
2613 jmp .Lxts_dec_done
2614
2615.align 16
2616.Lxts_dec_three:
2617 movups ($inp),$inout0
2618 movups 16*1($inp),$inout1
2619 movups 16*2($inp),$inout2
2620 lea 16*3($inp),$inp # $inp+=3*16
2621 xorps @tweak[0],$inout0
2622 xorps @tweak[1],$inout1
2623 xorps @tweak[2],$inout2
2624
2625 call _aesni_decrypt3
2626
2627 xorps @tweak[0],$inout0
2628 movdqa @tweak[3],@tweak[0] # next two tweaks move down for the stealing tail
2629 xorps @tweak[1],$inout1
2630 movdqa @tweak[4],@tweak[1]
2631 xorps @tweak[2],$inout2
2632 movups $inout0,($out) # store 3 output blocks
2633 movups $inout1,16*1($out)
2634 movups $inout2,16*2($out)
2635 lea 16*3($out),$out # $out+=3*16
2636 jmp .Lxts_dec_done
2637
2638.align 16
2639.Lxts_dec_four:
2640 movups ($inp),$inout0
2641 movups 16*1($inp),$inout1
2642 movups 16*2($inp),$inout2
2643 xorps @tweak[0],$inout0
2644 movups 16*3($inp),$inout3
2645 lea 16*4($inp),$inp # $inp+=4*16
2646 xorps @tweak[1],$inout1
2647 xorps @tweak[2],$inout2
2648 xorps @tweak[3],$inout3
2649
2650 call _aesni_decrypt4
2651
2652 pxor @tweak[0],$inout0
2653 movdqa @tweak[4],@tweak[0] # next two tweaks move down for the stealing tail
2654 pxor @tweak[1],$inout1
2655 movdqa @tweak[5],@tweak[1]
2656 pxor @tweak[2],$inout2
2657 movdqu $inout0,($out) # store 4 output blocks
2658 pxor @tweak[3],$inout3
2659 movdqu $inout1,16*1($out)
2660 movdqu $inout2,16*2($out)
2661 movdqu $inout3,16*3($out)
2662 lea 16*4($out),$out # $out+=4*16
2663 jmp .Lxts_dec_done
2664
2665.align 16
2666.Lxts_dec_done:
2667 and \$15,$len_ # see if $len%16 is 0
2668 jz .Lxts_dec_ret
2669.Lxts_dec_done2: # stealing tail: $len_ is already reduced to $len%16
2670 mov $len_,$len
2671 mov $key_,$key # restore $key
2672 mov $rnds_,$rounds # restore $rounds
2673
2674 movups ($inp),$inout0 # last full input block, processed with @tweak[1]
2675 xorps @tweak[1],$inout0
2676___
2677 &aesni_generate1("dec",$key,$rounds);
2678$code.=<<___;
2679 xorps @tweak[1],$inout0
2680 movups $inout0,($out)
2681
2682.Lxts_dec_steal: # per byte: swap input tail byte with the block just written
2683 movzb 16($inp),%eax # borrow $rounds ...
2684 movzb ($out),%ecx # ... and $key
2685 lea 1($inp),$inp
2686 mov %al,($out) # input tail byte goes into the block at $out
2687 mov %cl,16($out) # displaced byte becomes the partial output tail
2688 lea 1($out),$out
2689 sub \$1,$len
2690 jnz .Lxts_dec_steal
2691
2692 sub $len_,$out # rewind $out
2693 mov $key_,$key # restore $key
2694 mov $rnds_,$rounds # restore $rounds
2695
2696 movups ($out),$inout0 # reassembled block, processed with @tweak[0]
2697 xorps @tweak[0],$inout0
2698___
2699 &aesni_generate1("dec",$key,$rounds);
2700$code.=<<___;
2701 xorps @tweak[0],$inout0
2702 movups $inout0,($out)
2703
2704.Lxts_dec_ret:
2705 xorps %xmm0,%xmm0 # clear register bank
2706 pxor %xmm1,%xmm1
2707 pxor %xmm2,%xmm2
2708 pxor %xmm3,%xmm3
2709 pxor %xmm4,%xmm4
2710 pxor %xmm5,%xmm5
2711___
2712$code.=<<___ if (!$win64); # non-Win64: xmm6-xmm15 are simply zeroed
2713 pxor %xmm6,%xmm6
2714 pxor %xmm7,%xmm7
2715 movaps %xmm0,0x00(%rsp) # clear stack
2716 pxor %xmm8,%xmm8
2717 movaps %xmm0,0x10(%rsp)
2718 pxor %xmm9,%xmm9
2719 movaps %xmm0,0x20(%rsp)
2720 pxor %xmm10,%xmm10
2721 movaps %xmm0,0x30(%rsp)
2722 pxor %xmm11,%xmm11
2723 movaps %xmm0,0x40(%rsp)
2724 pxor %xmm12,%xmm12
2725 movaps %xmm0,0x50(%rsp)
2726 pxor %xmm13,%xmm13
2727 movaps %xmm0,0x60(%rsp)
2728 pxor %xmm14,%xmm14
2729 pxor %xmm15,%xmm15
2730___
2731$code.=<<___ if ($win64); # Win64: reload saved xmm6-xmm15, then wipe their save slots
2732 movaps -0xa8(%r11),%xmm6
2733 movaps %xmm0,-0xa8(%r11) # clear stack
2734 movaps -0x98(%r11),%xmm7
2735 movaps %xmm0,-0x98(%r11)
2736 movaps -0x88(%r11),%xmm8
2737 movaps %xmm0,-0x88(%r11)
2738 movaps -0x78(%r11),%xmm9
2739 movaps %xmm0,-0x78(%r11)
2740 movaps -0x68(%r11),%xmm10
2741 movaps %xmm0,-0x68(%r11)
2742 movaps -0x58(%r11),%xmm11
2743 movaps %xmm0,-0x58(%r11)
2744 movaps -0x48(%r11),%xmm12
2745 movaps %xmm0,-0x48(%r11)
2746 movaps -0x38(%r11),%xmm13
2747 movaps %xmm0,-0x38(%r11)
2748 movaps -0x28(%r11),%xmm14
2749 movaps %xmm0,-0x28(%r11)
2750 movaps -0x18(%r11),%xmm15
2751 movaps %xmm0,-0x18(%r11)
2752 movaps %xmm0,0x00(%rsp) # wipe the tweak scratch slots as well
2753 movaps %xmm0,0x10(%rsp)
2754 movaps %xmm0,0x20(%rsp)
2755 movaps %xmm0,0x30(%rsp)
2756 movaps %xmm0,0x40(%rsp)
2757 movaps %xmm0,0x50(%rsp)
2758 movaps %xmm0,0x60(%rsp)
2759___
2760$code.=<<___;
2761 mov -8(%r11),%rbp # reload the %rbp pushed in the prologue
2762.cfi_restore %rbp
2763 lea (%r11),%rsp # back to the entry stack pointer
2764.cfi_def_cfa_register %rsp
2765.Lxts_dec_epilogue:
2766 ret
2767.cfi_endproc
2768.size aesni_xts_decrypt,.-aesni_xts_decrypt
2769___
2770}
2771
2772
2773######################################################################
2774# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2775# const AES_KEY *key, unsigned int start_block_num,
2776# unsigned char offset_i[16], const unsigned char L_[][16],
2777# unsigned char checksum[16]);
2778#
2779{
2780my @offset=map("%xmm$_",(10..15)); # six offset registers, %xmm10..%xmm15
2781my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); # running checksum; round[0]^round[last]
2782my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
2783my ($L_p,$checksum_p) = ("%rbx","%rbp"); # 7th and 8th arguments, loaded from the caller's stack
2784my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); # ntz(block)*16 offsets into the L_ table
2785my $seventh_arg = $win64 ? 56 : 8; # offset of the 7th argument from %rsp at entry
2786my $blocks = $len; # the third argument counts blocks, not bytes
2787
2788$code.=<<___;
# aesni_ocb_encrypt: OCB-mode bulk encryption of whole blocks.
# Arguments, per the prototype comment above: inp, out, blocks, key,
# start_block_num, offset_i, L_ table and checksum.
# If the first block number is even, one block is handled alone so that the
# main loop always starts on an odd block number. After that six blocks are
# processed per __ocb_encrypt6 call, and the remaining one to five blocks go
# through __ocb_encrypt1, __ocb_encrypt4 or __ocb_encrypt6.
# The updated offset and checksum are written back through $offset_p and
# $checksum_p before the registers are wiped.
2789.globl aesni_ocb_encrypt
2790.type aesni_ocb_encrypt,\@function,6
2791.align 32
2792aesni_ocb_encrypt:
2793.cfi_startproc
2794 lea (%rsp),%rax # entry %rsp, used below to reach the stack arguments
2795 push %rbx
2796.cfi_push %rbx
2797 push %rbp
2798.cfi_push %rbp
2799 push %r12
2800.cfi_push %r12
2801 push %r13
2802.cfi_push %r13
2803 push %r14
2804.cfi_push %r14
2805___
2806$code.=<<___ if ($win64); # Win64 only: preserve xmm6-xmm15, restored in the epilogue
2807 lea -0xa0(%rsp),%rsp
2808 movaps %xmm6,0x00(%rsp) # offload everything
2809 movaps %xmm7,0x10(%rsp)
2810 movaps %xmm8,0x20(%rsp)
2811 movaps %xmm9,0x30(%rsp)
2812 movaps %xmm10,0x40(%rsp)
2813 movaps %xmm11,0x50(%rsp)
2814 movaps %xmm12,0x60(%rsp)
2815 movaps %xmm13,0x70(%rsp)
2816 movaps %xmm14,0x80(%rsp)
2817 movaps %xmm15,0x90(%rsp)
2818.Locb_enc_body:
2819___
2820$code.=<<___;
2821 mov $seventh_arg(%rax),$L_p # 7th argument
2822 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
2823
2824 mov 240($key),$rnds_ # key->rounds
2825 mov $key,$key_ # backup $key
2826 shl \$4,$rnds_ # rounds*16: byte offset into the key schedule
2827 $movkey ($key),$rndkey0l # round[0]
2828 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2829
2830 movdqu ($offset_p),@offset[5] # load last offset_i
2831 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2832 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2833
2834 mov \$16+32,$rounds # with the sub below: 16+32-16*rounds
2835 lea 32($key_,$rnds_),$key # end of key schedule
2836 $movkey 16($key_),$rndkey1 # round[1]
2837 sub %r10,%rax # twisted $rounds
2838 mov %rax,%r10 # backup twisted $rounds
2839
2840 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2841 movdqu ($checksum_p),$checksum # load checksum
2842
2843 test \$1,$block_num # is first block number odd?
2844 jnz .Locb_enc_odd
2845
2846 bsf $block_num,$i1 # ntz(block) of the even first block
2847 add \$1,$block_num
2848 shl \$4,$i1 # ntz(block) -> table offset
2849 movdqu ($L_p,$i1),$inout5 # borrow
2850 movdqu ($inp),$inout0
2851 lea 16($inp),$inp
2852
2853 call __ocb_encrypt1
2854
2855 movdqa $inout5,@offset[5] # helper leaves the new offset^round[last] here
2856 movups $inout0,($out)
2857 lea 16($out),$out
2858 sub \$1,$blocks
2859 jz .Locb_enc_done
2860
2861.Locb_enc_odd:
2862 lea 1($block_num),$i1 # even-numbered blocks
2863 lea 3($block_num),$i3
2864 lea 5($block_num),$i5
2865 lea 6($block_num),$block_num
2866 bsf $i1,$i1 # ntz(block)
2867 bsf $i3,$i3
2868 bsf $i5,$i5
2869 shl \$4,$i1 # ntz(block) -> table offset
2870 shl \$4,$i3
2871 shl \$4,$i5
2872
2873 sub \$6,$blocks
2874 jc .Locb_enc_short # fewer than six blocks left
2875 jmp .Locb_enc_grandloop
2876
2877.align 32
2878.Locb_enc_grandloop: # six blocks per pass
2879 movdqu `16*0`($inp),$inout0 # load input
2880 movdqu `16*1`($inp),$inout1
2881 movdqu `16*2`($inp),$inout2
2882 movdqu `16*3`($inp),$inout3
2883 movdqu `16*4`($inp),$inout4
2884 movdqu `16*5`($inp),$inout5
2885 lea `16*6`($inp),$inp
2886
2887 call __ocb_encrypt6
2888
2889 movups $inout0,`16*0`($out) # store output
2890 movups $inout1,`16*1`($out)
2891 movups $inout2,`16*2`($out)
2892 movups $inout3,`16*3`($out)
2893 movups $inout4,`16*4`($out)
2894 movups $inout5,`16*5`($out)
2895 lea `16*6`($out),$out
2896 sub \$6,$blocks
2897 jnc .Locb_enc_grandloop
2898
2899.Locb_enc_short:
2900 add \$6,$blocks # restore real remaining block count
2901 jz .Locb_enc_done
2902
2903 movdqu `16*0`($inp),$inout0
2904 cmp \$2,$blocks
2905 jb .Locb_enc_one # one block left
2906 movdqu `16*1`($inp),$inout1
2907 je .Locb_enc_two # two blocks left
2908
2909 movdqu `16*2`($inp),$inout2
2910 cmp \$4,$blocks
2911 jb .Locb_enc_three # three blocks left
2912 movdqu `16*3`($inp),$inout3
2913 je .Locb_enc_four # four blocks left
2914
2915 movdqu `16*4`($inp),$inout4 # five blocks left
2916 pxor $inout5,$inout5 # zero dummy sixth block leaves the checksum unchanged
2917
2918 call __ocb_encrypt6
2919
2920 movdqa @offset[4],@offset[5] # offset of the fifth block is the last real one
2921 movups $inout0,`16*0`($out)
2922 movups $inout1,`16*1`($out)
2923 movups $inout2,`16*2`($out)
2924 movups $inout3,`16*3`($out)
2925 movups $inout4,`16*4`($out)
2926
2927 jmp .Locb_enc_done
2928
2929.align 16
2930.Locb_enc_one:
2931 movdqa @offset[0],$inout5 # borrow
2932
2933 call __ocb_encrypt1
2934
2935 movdqa $inout5,@offset[5]
2936 movups $inout0,`16*0`($out)
2937 jmp .Locb_enc_done
2938
2939.align 16
2940.Locb_enc_two:
2941 pxor $inout2,$inout2 # zero the unused lanes of the four-block helper
2942 pxor $inout3,$inout3
2943
2944 call __ocb_encrypt4
2945
2946 movdqa @offset[1],@offset[5] # offset of the last real block
2947 movups $inout0,`16*0`($out)
2948 movups $inout1,`16*1`($out)
2949
2950 jmp .Locb_enc_done
2951
2952.align 16
2953.Locb_enc_three:
2954 pxor $inout3,$inout3 # zero the unused lane of the four-block helper
2955
2956 call __ocb_encrypt4
2957
2958 movdqa @offset[2],@offset[5] # offset of the last real block
2959 movups $inout0,`16*0`($out)
2960 movups $inout1,`16*1`($out)
2961 movups $inout2,`16*2`($out)
2962
2963 jmp .Locb_enc_done
2964
2965.align 16
2966.Locb_enc_four:
2967 call __ocb_encrypt4
2968
2969 movdqa @offset[3],@offset[5] # offset of the last real block
2970 movups $inout0,`16*0`($out)
2971 movups $inout1,`16*1`($out)
2972 movups $inout2,`16*2`($out)
2973 movups $inout3,`16*3`($out)
2974
2975.Locb_enc_done:
# NOTE(review): the pxor below relies on $rndkey0 still holding round[last]
# as left behind by the __ocb_encrypt helpers. When this function is entered
# with zero blocks no helper runs and no visible code loads $rndkey0, so
# presumably callers never pass 0 blocks - confirm against the C caller.
2976 pxor $rndkey0,@offset[5] # "remove" round[last]
2977 movdqu $checksum,($checksum_p) # store checksum
2978 movdqu @offset[5],($offset_p) # store last offset_i
2979
2980 xorps %xmm0,%xmm0 # clear register bank
2981 pxor %xmm1,%xmm1
2982 pxor %xmm2,%xmm2
2983 pxor %xmm3,%xmm3
2984 pxor %xmm4,%xmm4
2985 pxor %xmm5,%xmm5
2986___
2987$code.=<<___ if (!$win64); # non-Win64: xmm6-xmm15 are simply zeroed
2988 pxor %xmm6,%xmm6
2989 pxor %xmm7,%xmm7
2990 pxor %xmm8,%xmm8
2991 pxor %xmm9,%xmm9
2992 pxor %xmm10,%xmm10
2993 pxor %xmm11,%xmm11
2994 pxor %xmm12,%xmm12
2995 pxor %xmm13,%xmm13
2996 pxor %xmm14,%xmm14
2997 pxor %xmm15,%xmm15
2998 lea 0x28(%rsp),%rax # skip the five pushed registers (5*8 bytes)
2999.cfi_def_cfa %rax,8
3000___
3001$code.=<<___ if ($win64); # Win64: reload saved xmm6-xmm15, then wipe their save slots
3002 movaps 0x00(%rsp),%xmm6
3003 movaps %xmm0,0x00(%rsp) # clear stack
3004 movaps 0x10(%rsp),%xmm7
3005 movaps %xmm0,0x10(%rsp)
3006 movaps 0x20(%rsp),%xmm8
3007 movaps %xmm0,0x20(%rsp)
3008 movaps 0x30(%rsp),%xmm9
3009 movaps %xmm0,0x30(%rsp)
3010 movaps 0x40(%rsp),%xmm10
3011 movaps %xmm0,0x40(%rsp)
3012 movaps 0x50(%rsp),%xmm11
3013 movaps %xmm0,0x50(%rsp)
3014 movaps 0x60(%rsp),%xmm12
3015 movaps %xmm0,0x60(%rsp)
3016 movaps 0x70(%rsp),%xmm13
3017 movaps %xmm0,0x70(%rsp)
3018 movaps 0x80(%rsp),%xmm14
3019 movaps %xmm0,0x80(%rsp)
3020 movaps 0x90(%rsp),%xmm15
3021 movaps %xmm0,0x90(%rsp)
3022 lea 0xa0+0x28(%rsp),%rax # skip the xmm save area and the five pushed registers
3023.Locb_enc_pop:
3024___
3025$code.=<<___;
3026 mov -40(%rax),%r14 # reload the registers pushed in the prologue
3027.cfi_restore %r14
3028 mov -32(%rax),%r13
3029.cfi_restore %r13
3030 mov -24(%rax),%r12
3031.cfi_restore %r12
3032 mov -16(%rax),%rbp
3033.cfi_restore %rbp
3034 mov -8(%rax),%rbx
3035.cfi_restore %rbx
3036 lea (%rax),%rsp # back to the entry stack pointer
3037.cfi_def_cfa_register %rsp
3038.Locb_enc_epilogue:
3039 ret
3040.cfi_endproc
3041.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
3042
# __ocb_encrypt6: encrypt six blocks held in $inout0..$inout5.
# On entry @offset[5] holds the previous offset xored with round[last],
# @offset[0] holds L_0, and $i1, $i3, $i5 hold L_ table offsets for the
# three even-numbered blocks of this group.
# On return $inout0..$inout5 hold the output blocks, @offset[1..5] hold the
# offsets of blocks two to six xored with round[last], @offset[0] is reloaded
# with L_0, $block_num is advanced by 6, and $i1, $i3, $i5 are set up for the
# next group. Every input block is also xored into $checksum.
3043.type __ocb_encrypt6,\@abi-omnipotent
3044.align 32
3045__ocb_encrypt6:
3046.cfi_startproc
3047 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3048 movdqu ($L_p,$i1),@offset[1] # L value for the 2nd block
3049 movdqa @offset[0],@offset[2] # L_0 for the 3rd block
3050 movdqu ($L_p,$i3),@offset[3] # L value for the 4th block
3051 movdqa @offset[0],@offset[4] # L_0 for the 5th block
3052 pxor @offset[5],@offset[0] # 1st offset: previous offset ^ L_0
3053 movdqu ($L_p,$i5),@offset[5] # L value for the 6th block
3054 pxor @offset[0],@offset[1] # each offset chains from the one before it
3055 pxor $inout0,$checksum # accumulate checksum
3056 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3057 pxor @offset[1],@offset[2]
3058 pxor $inout1,$checksum
3059 pxor @offset[1],$inout1
3060 pxor @offset[2],@offset[3]
3061 pxor $inout2,$checksum
3062 pxor @offset[2],$inout2
3063 pxor @offset[3],@offset[4]
3064 pxor $inout3,$checksum
3065 pxor @offset[3],$inout3
3066 pxor @offset[4],@offset[5]
3067 pxor $inout4,$checksum
3068 pxor @offset[4],$inout4
3069 pxor $inout5,$checksum
3070 pxor @offset[5],$inout5
3071 $movkey 32($key_),$rndkey0
3072
3073 lea 1($block_num),$i1 # even-numbered blocks
3074 lea 3($block_num),$i3
3075 lea 5($block_num),$i5
3076 add \$6,$block_num
3077 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3078 bsf $i1,$i1 # ntz(block)
3079 bsf $i3,$i3
3080 bsf $i5,$i5
3081
3082 aesenc $rndkey1,$inout0
3083 aesenc $rndkey1,$inout1
3084 aesenc $rndkey1,$inout2
3085 aesenc $rndkey1,$inout3
3086 pxor $rndkey0l,@offset[1] # swap round[0] for round[last] in each offset
3087 pxor $rndkey0l,@offset[2]
3088 aesenc $rndkey1,$inout4
3089 pxor $rndkey0l,@offset[3]
3090 pxor $rndkey0l,@offset[4]
3091 aesenc $rndkey1,$inout5
3092 $movkey 48($key_),$rndkey1
3093 pxor $rndkey0l,@offset[5]
3094
3095 aesenc $rndkey0,$inout0
3096 aesenc $rndkey0,$inout1
3097 aesenc $rndkey0,$inout2
3098 aesenc $rndkey0,$inout3
3099 aesenc $rndkey0,$inout4
3100 aesenc $rndkey0,$inout5
3101 $movkey 64($key_),$rndkey0
3102 shl \$4,$i1 # ntz(block) -> table offset
3103 shl \$4,$i3
3104 jmp .Locb_enc_loop6
3105
3106.align 32
3107.Locb_enc_loop6: # middle rounds, two round keys per iteration
3108 aesenc $rndkey1,$inout0
3109 aesenc $rndkey1,$inout1
3110 aesenc $rndkey1,$inout2
3111 aesenc $rndkey1,$inout3
3112 aesenc $rndkey1,$inout4
3113 aesenc $rndkey1,$inout5
3114 $movkey ($key,%rax),$rndkey1
3115 add \$32,%rax # negative index counts up, zero ends the loop
3116
3117 aesenc $rndkey0,$inout0
3118 aesenc $rndkey0,$inout1
3119 aesenc $rndkey0,$inout2
3120 aesenc $rndkey0,$inout3
3121 aesenc $rndkey0,$inout4
3122 aesenc $rndkey0,$inout5
3123 $movkey -16($key,%rax),$rndkey0
3124 jnz .Locb_enc_loop6
3125
3126 aesenc $rndkey1,$inout0
3127 aesenc $rndkey1,$inout1
3128 aesenc $rndkey1,$inout2
3129 aesenc $rndkey1,$inout3
3130 aesenc $rndkey1,$inout4
3131 aesenc $rndkey1,$inout5
3132 $movkey 16($key_),$rndkey1 # round[1] again, ready for the next call
3133 shl \$4,$i5 # ntz(block) -> table offset, deferred until after the loop
3134
3135 aesenclast @offset[0],$inout0
3136 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3137 mov %r10,%rax # restore twisted rounds
3138 aesenclast @offset[1],$inout1
3139 aesenclast @offset[2],$inout2
3140 aesenclast @offset[3],$inout3
3141 aesenclast @offset[4],$inout4
3142 aesenclast @offset[5],$inout5
3143 ret
3144.cfi_endproc
3145.size __ocb_encrypt6,.-__ocb_encrypt6
3146
# __ocb_encrypt4: encrypt four blocks held in $inout0..$inout3.
# Entry state matches the six-block helper: @offset[5] holds the previous
# offset xored with round[last], @offset[0] holds L_0, and $i1 and $i3 hold
# L_ table offsets for the two even-numbered blocks.
# On return @offset[0..3] hold the four block offsets xored with round[last].
# Unlike the six-block helper this one does not advance $block_num, does not
# recompute $i1 or $i3, and does not reload L_0 into @offset[0].
3147.type __ocb_encrypt4,\@abi-omnipotent
3148.align 32
3149__ocb_encrypt4:
3150.cfi_startproc
3151 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3152 movdqu ($L_p,$i1),@offset[1] # L value for the 2nd block
3153 movdqa @offset[0],@offset[2] # L_0 for the 3rd block
3154 movdqu ($L_p,$i3),@offset[3] # L value for the 4th block
3155 pxor @offset[5],@offset[0] # 1st offset: previous offset ^ L_0
3156 pxor @offset[0],@offset[1] # each offset chains from the one before it
3157 pxor $inout0,$checksum # accumulate checksum
3158 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3159 pxor @offset[1],@offset[2]
3160 pxor $inout1,$checksum
3161 pxor @offset[1],$inout1
3162 pxor @offset[2],@offset[3]
3163 pxor $inout2,$checksum
3164 pxor @offset[2],$inout2
3165 pxor $inout3,$checksum
3166 pxor @offset[3],$inout3
3167 $movkey 32($key_),$rndkey0
3168
3169 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3170 pxor $rndkey0l,@offset[1]
3171 pxor $rndkey0l,@offset[2]
3172 pxor $rndkey0l,@offset[3]
3173
3174 aesenc $rndkey1,$inout0
3175 aesenc $rndkey1,$inout1
3176 aesenc $rndkey1,$inout2
3177 aesenc $rndkey1,$inout3
3178 $movkey 48($key_),$rndkey1
3179
3180 aesenc $rndkey0,$inout0
3181 aesenc $rndkey0,$inout1
3182 aesenc $rndkey0,$inout2
3183 aesenc $rndkey0,$inout3
3184 $movkey 64($key_),$rndkey0
3185 jmp .Locb_enc_loop4
3186
3187.align 32
3188.Locb_enc_loop4: # middle rounds, two round keys per iteration
3189 aesenc $rndkey1,$inout0
3190 aesenc $rndkey1,$inout1
3191 aesenc $rndkey1,$inout2
3192 aesenc $rndkey1,$inout3
3193 $movkey ($key,%rax),$rndkey1
3194 add \$32,%rax # negative index counts up, zero ends the loop
3195
3196 aesenc $rndkey0,$inout0
3197 aesenc $rndkey0,$inout1
3198 aesenc $rndkey0,$inout2
3199 aesenc $rndkey0,$inout3
3200 $movkey -16($key,%rax),$rndkey0
3201 jnz .Locb_enc_loop4
3202
3203 aesenc $rndkey1,$inout0
3204 aesenc $rndkey1,$inout1
3205 aesenc $rndkey1,$inout2
3206 aesenc $rndkey1,$inout3
3207 $movkey 16($key_),$rndkey1 # round[1] again, ready for the next call
3208 mov %r10,%rax # restore twisted rounds
3209
3210 aesenclast @offset[0],$inout0
3211 aesenclast @offset[1],$inout1
3212 aesenclast @offset[2],$inout2
3213 aesenclast @offset[3],$inout3
3214 ret
3215.cfi_endproc
3216.size __ocb_encrypt4,.-__ocb_encrypt4
3217
# __ocb_encrypt1: encrypt the single block held in $inout0.
# On entry $inout5 holds the L_ table value for this block (the callers
# mark it as borrowed) and @offset[5] holds the previous offset xored with
# round[last].
# On return $inout0 holds the output block and $inout5 holds the new offset
# xored with round[last]; callers copy it back into @offset[5].
3218.type __ocb_encrypt1,\@abi-omnipotent
3219.align 32
3220__ocb_encrypt1:
3221.cfi_startproc
3222 pxor @offset[5],$inout5 # offset_i
3223 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3224 pxor $inout0,$checksum # accumulate checksum
3225 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3226 $movkey 32($key_),$rndkey0
3227
3228 aesenc $rndkey1,$inout0
3229 $movkey 48($key_),$rndkey1
3230 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3231
3232 aesenc $rndkey0,$inout0
3233 $movkey 64($key_),$rndkey0
3234 jmp .Locb_enc_loop1
3235
3236.align 32
3237.Locb_enc_loop1: # middle rounds, two round keys per iteration
3238 aesenc $rndkey1,$inout0
3239 $movkey ($key,%rax),$rndkey1
3240 add \$32,%rax # negative index counts up, zero ends the loop
3241
3242 aesenc $rndkey0,$inout0
3243 $movkey -16($key,%rax),$rndkey0
3244 jnz .Locb_enc_loop1
3245
3246 aesenc $rndkey1,$inout0
3247 $movkey 16($key_),$rndkey1 # redundant in tail
3248 mov %r10,%rax # restore twisted rounds
3249
3250 aesenclast $inout5,$inout0
3251 ret
3252.cfi_endproc
3253.size __ocb_encrypt1,.-__ocb_encrypt1
3254
# aesni_ocb_decrypt: bulk OCB decryption entry point.
# Declared with six register arguments; two more are fetched from the
# caller's stack: the pointer to the table of L values (7th) and the pointer
# to the running checksum (8th). The running offset and checksum are loaded
# on entry and written back at .Locb_dec_done.
# Blocks are processed six at a time in .Locb_dec_grandloop, with one to
# five leftover blocks handled by the dedicated tails that follow it.
3255.globl aesni_ocb_decrypt
3256.type aesni_ocb_decrypt,\@function,6
3257.align 32
3258aesni_ocb_decrypt:
3259.cfi_startproc
# rax keeps the entry stack pointer; it is used further down to address the
# 7th and 8th arguments once the callee-saved registers have been pushed.
3260 lea (%rsp),%rax
3261 push %rbx
3262.cfi_push %rbx
3263 push %rbp
3264.cfi_push %rbp
3265 push %r12
3266.cfi_push %r12
3267 push %r13
3268.cfi_push %r13
3269 push %r14
3270.cfi_push %r14
3271___
# Win64 only: xmm6-xmm15 are spilled to a 0xa0-byte stack area here and are
# restored (with the area overwritten) in the epilogue.
3272$code.=<<___ if ($win64);
3273 lea -0xa0(%rsp),%rsp
3274 movaps %xmm6,0x00(%rsp) # offload everything
3275 movaps %xmm7,0x10(%rsp)
3276 movaps %xmm8,0x20(%rsp)
3277 movaps %xmm9,0x30(%rsp)
3278 movaps %xmm10,0x40(%rsp)
3279 movaps %xmm11,0x50(%rsp)
3280 movaps %xmm12,0x60(%rsp)
3281 movaps %xmm13,0x70(%rsp)
3282 movaps %xmm14,0x80(%rsp)
3283 movaps %xmm15,0x90(%rsp)
3284.Locb_dec_body:
3285___
3286$code.=<<___;
3287 mov $seventh_arg(%rax),$L_p # 7th argument
3288 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
3289
# The round count is scaled by 16 so it can index 16-byte round keys.
3290 mov 240($key),$rnds_
3291 mov $key,$key_
3292 shl \$4,$rnds_
3293 $movkey ($key),$rndkey0l # round[0]
3294 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3295
# Both the whitening key and the stored offset are pre-mixed with the last
# round key, so that the final aesdeclast can apply offset and round key at
# once in the helper routines.
3296 movdqu ($offset_p),@offset[5] # load last offset_i
3297 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3298 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3299
# key is repointed just past round[last], and rax becomes 48 minus the
# scaled round count: an index relative to that end which the helper loops
# advance by 32 until it reaches zero. r10 keeps a copy for restoring rax.
3300 mov \$16+32,$rounds
3301 lea 32($key_,$rnds_),$key
3302 $movkey 16($key_),$rndkey1 # round[1]
3303 sub %r10,%rax # twisted $rounds
3304 mov %rax,%r10 # backup twisted $rounds
3305
3306 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3307 movdqu ($checksum_p),$checksum # load checksum
3308
3309 test \$1,$block_num # is first block number odd?
3310 jnz .Locb_dec_odd
3311
# Even starting block number: peel off one block first, using the L-table
# entry selected by the number of trailing zero bits of block_num, so that
# the code from .Locb_dec_odd on always starts at an odd block number.
3312 bsf $block_num,$i1
3313 add \$1,$block_num
3314 shl \$4,$i1
3315 movdqu ($L_p,$i1),$inout5 # borrow
3316 movdqu ($inp),$inout0
3317 lea 16($inp),$inp
3318
3319 call __ocb_decrypt1
3320
# The checksum is accumulated over the decrypted output, after the call.
3321 movdqa $inout5,@offset[5]
3322 movups $inout0,($out)
3323 xorps $inout0,$checksum # accumulate checksum
3324 lea 16($out),$out
3325 sub \$1,$blocks
3326 jz .Locb_dec_done
3327
# Pre-compute the L-table byte offsets (16 * ntz) for the three
# even-numbered blocks of the coming group of six and advance block_num.
3328.Locb_dec_odd:
3329 lea 1($block_num),$i1 # even-numbered blocks
3330 lea 3($block_num),$i3
3331 lea 5($block_num),$i5
3332 lea 6($block_num),$block_num
3333 bsf $i1,$i1 # ntz(block)
3334 bsf $i3,$i3
3335 bsf $i5,$i5
3336 shl \$4,$i1 # ntz(block) -> table offset
3337 shl \$4,$i3
3338 shl \$4,$i5
3339
# blocks is biased by -6 while inside the grand loop; a borrow means fewer
# than six blocks remain.
3340 sub \$6,$blocks
3341 jc .Locb_dec_short
3342 jmp .Locb_dec_grandloop
3343
3344.align 32
# Main loop: six blocks per iteration.
3345.Locb_dec_grandloop:
3346 movdqu `16*0`($inp),$inout0 # load input
3347 movdqu `16*1`($inp),$inout1
3348 movdqu `16*2`($inp),$inout2
3349 movdqu `16*3`($inp),$inout3
3350 movdqu `16*4`($inp),$inout4
3351 movdqu `16*5`($inp),$inout5
3352 lea `16*6`($inp),$inp
3353
3354 call __ocb_decrypt6
3355
3356 movups $inout0,`16*0`($out) # store output
3357 pxor $inout0,$checksum # accumulate checksum
3358 movups $inout1,`16*1`($out)
3359 pxor $inout1,$checksum
3360 movups $inout2,`16*2`($out)
3361 pxor $inout2,$checksum
3362 movups $inout3,`16*3`($out)
3363 pxor $inout3,$checksum
3364 movups $inout4,`16*4`($out)
3365 pxor $inout4,$checksum
3366 movups $inout5,`16*5`($out)
3367 pxor $inout5,$checksum
3368 lea `16*6`($out),$out
3369 sub \$6,$blocks
3370 jnc .Locb_dec_grandloop
3371
# Undo the -6 bias to obtain the true number of leftover blocks, then
# dispatch on it: the flags of each cmp serve two branches (jb and je),
# with the load in between assumed not to alter flags.
3372.Locb_dec_short:
3373 add \$6,$blocks
3374 jz .Locb_dec_done
3375
3376 movdqu `16*0`($inp),$inout0
3377 cmp \$2,$blocks
3378 jb .Locb_dec_one
3379 movdqu `16*1`($inp),$inout1
3380 je .Locb_dec_two
3381
3382 movdqu `16*2`($inp),$inout2
3383 cmp \$4,$blocks
3384 jb .Locb_dec_three
3385 movdqu `16*3`($inp),$inout3
3386 je .Locb_dec_four
3387
# Five blocks left: the sixth lane is zeroed and its result is discarded;
# the offset of the fifth block, offset[4], becomes the saved offset.
3388 movdqu `16*4`($inp),$inout4
3389 pxor $inout5,$inout5
3390
3391 call __ocb_decrypt6
3392
3393 movdqa @offset[4],@offset[5]
3394 movups $inout0,`16*0`($out) # store output
3395 pxor $inout0,$checksum # accumulate checksum
3396 movups $inout1,`16*1`($out)
3397 pxor $inout1,$checksum
3398 movups $inout2,`16*2`($out)
3399 pxor $inout2,$checksum
3400 movups $inout3,`16*3`($out)
3401 pxor $inout3,$checksum
3402 movups $inout4,`16*4`($out)
3403 pxor $inout4,$checksum
3404
3405 jmp .Locb_dec_done
3406
3407.align 16
# One block left: L_0 (held in offset[0]) is handed to the helper in inout5.
3408.Locb_dec_one:
3409 movdqa @offset[0],$inout5 # borrow
3410
3411 call __ocb_decrypt1
3412
3413 movdqa $inout5,@offset[5]
3414 movups $inout0,`16*0`($out) # store output
3415 xorps $inout0,$checksum # accumulate checksum
3416 jmp .Locb_dec_done
3417
3418.align 16
# Two blocks left: run the four-lane helper with lanes 2 and 3 zeroed and
# keep the offset of the second block.
3419.Locb_dec_two:
3420 pxor $inout2,$inout2
3421 pxor $inout3,$inout3
3422
3423 call __ocb_decrypt4
3424
3425 movdqa @offset[1],@offset[5]
3426 movups $inout0,`16*0`($out) # store output
3427 xorps $inout0,$checksum # accumulate checksum
3428 movups $inout1,`16*1`($out)
3429 xorps $inout1,$checksum
3430
3431 jmp .Locb_dec_done
3432
3433.align 16
# Three blocks left: lane 3 is zeroed; keep the offset of the third block.
3434.Locb_dec_three:
3435 pxor $inout3,$inout3
3436
3437 call __ocb_decrypt4
3438
3439 movdqa @offset[2],@offset[5]
3440 movups $inout0,`16*0`($out) # store output
3441 xorps $inout0,$checksum # accumulate checksum
3442 movups $inout1,`16*1`($out)
3443 xorps $inout1,$checksum
3444 movups $inout2,`16*2`($out)
3445 xorps $inout2,$checksum
3446
3447 jmp .Locb_dec_done
3448
3449.align 16
# Four blocks left: all four lanes are live; keep the offset of the fourth.
3450.Locb_dec_four:
3451 call __ocb_decrypt4
3452
3453 movdqa @offset[3],@offset[5]
3454 movups $inout0,`16*0`($out) # store output
3455 pxor $inout0,$checksum # accumulate checksum
3456 movups $inout1,`16*1`($out)
3457 pxor $inout1,$checksum
3458 movups $inout2,`16*2`($out)
3459 pxor $inout2,$checksum
3460 movups $inout3,`16*3`($out)
3461 pxor $inout3,$checksum
3462
# Common exit: strip the round-key mix from the offset, write checksum and
# offset back through their pointers, then scrub the xmm registers.
3463.Locb_dec_done:
3464 pxor $rndkey0,@offset[5] # "remove" round[last]
3465 movdqu $checksum,($checksum_p) # store checksum
3466 movdqu @offset[5],($offset_p) # store last offset_i
3467
3468 xorps %xmm0,%xmm0 # clear register bank
3469 pxor %xmm1,%xmm1
3470 pxor %xmm2,%xmm2
3471 pxor %xmm3,%xmm3
3472 pxor %xmm4,%xmm4
3473 pxor %xmm5,%xmm5
3474___
# Non-Win64: xmm6-xmm15 were not saved, so they are simply zeroed, and rax
# is set to the entry stack pointer (five 8-byte pushes above rsp).
3475$code.=<<___ if (!$win64);
3476 pxor %xmm6,%xmm6
3477 pxor %xmm7,%xmm7
3478 pxor %xmm8,%xmm8
3479 pxor %xmm9,%xmm9
3480 pxor %xmm10,%xmm10
3481 pxor %xmm11,%xmm11
3482 pxor %xmm12,%xmm12
3483 pxor %xmm13,%xmm13
3484 pxor %xmm14,%xmm14
3485 pxor %xmm15,%xmm15
3486 lea 0x28(%rsp),%rax
3487.cfi_def_cfa %rax,8
3488___
# Win64: reload xmm6-xmm15 from the save area and overwrite each slot with
# the already-zeroed xmm0; rax is then set past the save area and pushes.
3489$code.=<<___ if ($win64);
3490 movaps 0x00(%rsp),%xmm6
3491 movaps %xmm0,0x00(%rsp) # clear stack
3492 movaps 0x10(%rsp),%xmm7
3493 movaps %xmm0,0x10(%rsp)
3494 movaps 0x20(%rsp),%xmm8
3495 movaps %xmm0,0x20(%rsp)
3496 movaps 0x30(%rsp),%xmm9
3497 movaps %xmm0,0x30(%rsp)
3498 movaps 0x40(%rsp),%xmm10
3499 movaps %xmm0,0x40(%rsp)
3500 movaps 0x50(%rsp),%xmm11
3501 movaps %xmm0,0x50(%rsp)
3502 movaps 0x60(%rsp),%xmm12
3503 movaps %xmm0,0x60(%rsp)
3504 movaps 0x70(%rsp),%xmm13
3505 movaps %xmm0,0x70(%rsp)
3506 movaps 0x80(%rsp),%xmm14
3507 movaps %xmm0,0x80(%rsp)
3508 movaps 0x90(%rsp),%xmm15
3509 movaps %xmm0,0x90(%rsp)
3510 lea 0xa0+0x28(%rsp),%rax
3511.Locb_dec_pop:
3512___
3513$code.=<<___;
# Restore the callee-saved registers relative to rax (the entry stack
# pointer) and then reset rsp from it.
3514 mov -40(%rax),%r14
3515.cfi_restore %r14
3516 mov -32(%rax),%r13
3517.cfi_restore %r13
3518 mov -24(%rax),%r12
3519.cfi_restore %r12
3520 mov -16(%rax),%rbp
3521.cfi_restore %rbp
3522 mov -8(%rax),%rbx
3523.cfi_restore %rbx
3524 lea (%rax),%rsp
3525.cfi_def_cfa_register %rsp
3526.Locb_dec_epilogue:
3527 ret
3528.cfi_endproc
3529.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
3530
# __ocb_decrypt6: decrypt the six blocks held in inout0..inout5.
# Six chained offsets are derived from the running offset in offset[5], L_0
# in offset[0] and the L-table entries at byte offsets i1, i3 and i5. Each
# block is whitened with its offset on the way in and gets the same offset
# (mixed with the last round key) applied by aesdeclast on the way out.
# On return offset[0] holds L_0 again, offset[1..5] hold the per-block
# offsets, rax is restored from r10, and block_num, i1, i3 and i5 have been
# advanced for the next group of six. The checksum is left to the caller.
3531.type __ocb_decrypt6,\@abi-omnipotent
3532.align 32
3533__ocb_decrypt6:
3534.cfi_startproc
# Offset chain: each offset is the previous one XORed with either L_0 (odd
# positions, via the copies of offset[0]) or a table entry (even positions).
3535 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3536 movdqu ($L_p,$i1),@offset[1]
3537 movdqa @offset[0],@offset[2]
3538 movdqu ($L_p,$i3),@offset[3]
3539 movdqa @offset[0],@offset[4]
3540 pxor @offset[5],@offset[0]
3541 movdqu ($L_p,$i5),@offset[5]
3542 pxor @offset[0],@offset[1]
3543 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3544 pxor @offset[1],@offset[2]
3545 pxor @offset[1],$inout1
3546 pxor @offset[2],@offset[3]
3547 pxor @offset[2],$inout2
3548 pxor @offset[3],@offset[4]
3549 pxor @offset[3],$inout3
3550 pxor @offset[4],@offset[5]
3551 pxor @offset[4],$inout4
3552 pxor @offset[5],$inout5
3553 $movkey 32($key_),$rndkey0
3554
# Scalar bookkeeping for the next group of six is interleaved with the AES
# rounds from here on.
3555 lea 1($block_num),$i1 # even-numbered blocks
3556 lea 3($block_num),$i3
3557 lea 5($block_num),$i5
3558 add \$6,$block_num
3559 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3560 bsf $i1,$i1 # ntz(block)
3561 bsf $i3,$i3
3562 bsf $i5,$i5
3563
# The remaining offsets get the same conversion as offset[0] above.
3564 aesdec $rndkey1,$inout0
3565 aesdec $rndkey1,$inout1
3566 aesdec $rndkey1,$inout2
3567 aesdec $rndkey1,$inout3
3568 pxor $rndkey0l,@offset[1]
3569 pxor $rndkey0l,@offset[2]
3570 aesdec $rndkey1,$inout4
3571 pxor $rndkey0l,@offset[3]
3572 pxor $rndkey0l,@offset[4]
3573 aesdec $rndkey1,$inout5
3574 $movkey 48($key_),$rndkey1
3575 pxor $rndkey0l,@offset[5]
3576
3577 aesdec $rndkey0,$inout0
3578 aesdec $rndkey0,$inout1
3579 aesdec $rndkey0,$inout2
3580 aesdec $rndkey0,$inout3
3581 aesdec $rndkey0,$inout4
3582 aesdec $rndkey0,$inout5
3583 $movkey 64($key_),$rndkey0
3584 shl \$4,$i1 # ntz(block) -> table offset
3585 shl \$4,$i3
3586 jmp .Locb_dec_loop6
3587
3588.align 32
# Two rounds per iteration; rax steps by 32 up to zero and the jnz acts on
# the flags left by the add (the instructions in between are assumed not
# to alter flags).
3589.Locb_dec_loop6:
3590 aesdec $rndkey1,$inout0
3591 aesdec $rndkey1,$inout1
3592 aesdec $rndkey1,$inout2
3593 aesdec $rndkey1,$inout3
3594 aesdec $rndkey1,$inout4
3595 aesdec $rndkey1,$inout5
3596 $movkey ($key,%rax),$rndkey1
3597 add \$32,%rax
3598
3599 aesdec $rndkey0,$inout0
3600 aesdec $rndkey0,$inout1
3601 aesdec $rndkey0,$inout2
3602 aesdec $rndkey0,$inout3
3603 aesdec $rndkey0,$inout4
3604 aesdec $rndkey0,$inout5
3605 $movkey -16($key,%rax),$rndkey0
3606 jnz .Locb_dec_loop6
3607
3608 aesdec $rndkey1,$inout0
3609 aesdec $rndkey1,$inout1
3610 aesdec $rndkey1,$inout2
3611 aesdec $rndkey1,$inout3
3612 aesdec $rndkey1,$inout4
3613 aesdec $rndkey1,$inout5
3614 $movkey 16($key_),$rndkey1
3615 shl \$4,$i5
3616
# Final round: each offset register acts as the last-round key. offset[0]
# is reloaded with L_0 as soon as its aesdeclast has been issued.
3617 aesdeclast @offset[0],$inout0
3618 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3619 mov %r10,%rax # restore twisted rounds
3620 aesdeclast @offset[1],$inout1
3621 aesdeclast @offset[2],$inout2
3622 aesdeclast @offset[3],$inout3
3623 aesdeclast @offset[4],$inout4
3624 aesdeclast @offset[5],$inout5
3625 ret
3626.cfi_endproc
3627.size __ocb_decrypt6,.-__ocb_decrypt6
3628
# __ocb_decrypt4: decrypt the four blocks held in inout0..inout3.
# Four chained offsets are derived from the running offset in offset[5],
# L_0 in offset[0] and the L-table entries at byte offsets i1 and i3.
# Unlike the six-block routine, this one does not advance block_num or the
# table indexes and does not reload L_0; on return offset[0..3] hold the
# per-block offsets (mixed with the last round key) and the caller copies
# the one it needs into offset[5]. rax is restored from r10.
3629.type __ocb_decrypt4,\@abi-omnipotent
3630.align 32
3631__ocb_decrypt4:
3632.cfi_startproc
3633 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3634 movdqu ($L_p,$i1),@offset[1]
3635 movdqa @offset[0],@offset[2]
3636 movdqu ($L_p,$i3),@offset[3]
3637 pxor @offset[5],@offset[0]
3638 pxor @offset[0],@offset[1]
3639 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3640 pxor @offset[1],@offset[2]
3641 pxor @offset[1],$inout1
3642 pxor @offset[2],@offset[3]
3643 pxor @offset[2],$inout2
3644 pxor @offset[3],$inout3
3645 $movkey 32($key_),$rndkey0
3646
# Convert all four offsets so they can serve as the final-round keys.
3647 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3648 pxor $rndkey0l,@offset[1]
3649 pxor $rndkey0l,@offset[2]
3650 pxor $rndkey0l,@offset[3]
3651
3652 aesdec $rndkey1,$inout0
3653 aesdec $rndkey1,$inout1
3654 aesdec $rndkey1,$inout2
3655 aesdec $rndkey1,$inout3
3656 $movkey 48($key_),$rndkey1
3657
3658 aesdec $rndkey0,$inout0
3659 aesdec $rndkey0,$inout1
3660 aesdec $rndkey0,$inout2
3661 aesdec $rndkey0,$inout3
3662 $movkey 64($key_),$rndkey0
3663 jmp .Locb_dec_loop4
3664
3665.align 32
# Two rounds per iteration; rax steps by 32 up to zero and the jnz acts on
# the flags left by the add (the instructions in between are assumed not
# to alter flags).
3666.Locb_dec_loop4:
3667 aesdec $rndkey1,$inout0
3668 aesdec $rndkey1,$inout1
3669 aesdec $rndkey1,$inout2
3670 aesdec $rndkey1,$inout3
3671 $movkey ($key,%rax),$rndkey1
3672 add \$32,%rax
3673
3674 aesdec $rndkey0,$inout0
3675 aesdec $rndkey0,$inout1
3676 aesdec $rndkey0,$inout2
3677 aesdec $rndkey0,$inout3
3678 $movkey -16($key,%rax),$rndkey0
3679 jnz .Locb_dec_loop4
3680
3681 aesdec $rndkey1,$inout0
3682 aesdec $rndkey1,$inout1
3683 aesdec $rndkey1,$inout2
3684 aesdec $rndkey1,$inout3
3685 $movkey 16($key_),$rndkey1
3686 mov %r10,%rax # restore twisted rounds
3687
# Final round: each offset register acts as the last-round key.
3688 aesdeclast @offset[0],$inout0
3689 aesdeclast @offset[1],$inout1
3690 aesdeclast @offset[2],$inout2
3691 aesdeclast @offset[3],$inout3
3692 ret
3693.cfi_endproc
3694.size __ocb_decrypt4,.-__ocb_decrypt4
3695
# __ocb_decrypt1: decrypt the single block held in inout0.
# On entry inout5 carries the value to be folded into the running offset
# kept in offset[5]. On exit inout0 is the processed block, inout5 holds the
# value labelled "offset_i ^ round[last]" below (the caller copies it back
# into offset[5]) and rax is restored from r10.
# Unlike __ocb_encrypt1 this routine does not touch the checksum; the caller
# accumulates it from the output.
3696.type __ocb_decrypt1,\@abi-omnipotent
3697.align 32
3698__ocb_decrypt1:
3699.cfi_startproc
3700 pxor @offset[5],$inout5 # offset_i
3701 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3702 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3703 $movkey 32($key_),$rndkey0
3704
3705 aesdec $rndkey1,$inout0
3706 $movkey 48($key_),$rndkey1
3707 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3708
3709 aesdec $rndkey0,$inout0
3710 $movkey 64($key_),$rndkey0
3711 jmp .Locb_dec_loop1
3712
3713.align 32
# Two rounds per iteration; rax steps by 32 up to zero and the jnz acts on
# the flags left by the add (the key load in between is assumed not to
# alter flags).
3714.Locb_dec_loop1:
3715 aesdec $rndkey1,$inout0
3716 $movkey ($key,%rax),$rndkey1
3717 add \$32,%rax
3718
3719 aesdec $rndkey0,$inout0
3720 $movkey -16($key,%rax),$rndkey0
3721 jnz .Locb_dec_loop1
3722
3723 aesdec $rndkey1,$inout0
3724 $movkey 16($key_),$rndkey1 # redundant in tail
3725 mov %r10,%rax # restore twisted rounds
3726
# Final round: inout5 doubles as the last-round key.
3727 aesdeclast $inout5,$inout0
3728 ret
3729.cfi_endproc
3730.size __ocb_decrypt1,.-__ocb_decrypt1
3731___
3732} }}
3733
3734
3735########################################################################
3736# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3737# size_t length, const AES_KEY *key,
3738# unsigned char *ivp,const int enc);
#
# One entry point for both directions: a non-zero 6th argument selects the
# serial CBC encrypt loop, zero selects CBC decrypt. Decrypt has a
# frame-less single-block path, an 8-blocks-at-a-time main loop, an
# alternative 6-block loop (chosen by a CPU-capability test) and tails for
# one to seven remaining blocks. The chaining value is read from and
# written back to ivp.
3739{
# frame_size: 16 bytes of aligned scratch at (%rsp) plus, on Win64, 0xa0
# bytes for saving xmm6-xmm15. iv and in0..in4 are aliases for xmm10..xmm15.
3740my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
3741my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3742
3743$code.=<<___;
3744.globl ${PREFIX}_cbc_encrypt
3745.type ${PREFIX}_cbc_encrypt,\@function,6
3746.align 16
3747${PREFIX}_cbc_encrypt:
3748.cfi_startproc
3749 test $len,$len # check length
3750 jz .Lcbc_ret
3751
3752 mov 240($key),$rnds_ # key->rounds
3753 mov $key,$key_ # backup $key
3754 test %r9d,%r9d # 6th argument
3755 jz .Lcbc_decrypt
3756#--------------------------- CBC ENCRYPT ------------------------------#
# inout0 carries the chaining value from block to block. len is biased by
# -16 while inside the loop; an input shorter than one block goes straight
# to the padding tail.
3757 movups ($ivp),$inout0 # load iv as initial state
3758 mov $rnds_,$rounds
3759 cmp \$16,$len
3760 jb .Lcbc_enc_tail
3761 sub \$16,$len
3762 jmp .Lcbc_enc_loop
3763.align 16
3764.Lcbc_enc_loop:
3765 movups ($inp),$inout1 # load input
3766 lea 16($inp),$inp
3767 #xorps $inout1,$inout0
3768___
# Single-block encryption emitted by a generator defined elsewhere in this
# file. The input block is passed as an extra argument and the explicit
# xorps above is commented out, so the XOR with the chaining value is
# presumably folded into the generated code - TODO confirm.
3769 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3770$code.=<<___;
3771 mov $rnds_,$rounds # restore $rounds
3772 mov $key_,$key # restore $key
3773 movups $inout0,0($out) # store output
3774 lea 16($out),$out
# Keep looping while the biased length does not borrow; after undoing the
# bias a non-zero remainder means a final partial block.
3775 sub \$16,$len
3776 jnc .Lcbc_enc_loop
3777 add \$16,$len
3778 jnz .Lcbc_enc_tail
3779 pxor $rndkey0,$rndkey0 # clear register bank
3780 pxor $rndkey1,$rndkey1
3781 movups $inout0,($ivp)
3782 pxor $inout0,$inout0
3783 pxor $inout1,$inout1
3784 jmp .Lcbc_ret
3785
# Partial final block: copy the remaining len bytes to the output buffer,
# zero-fill it up to 16 bytes, then re-enter the loop with input and output
# both pointing at that padded block.
# Each .long below encodes the bytes F3 A4 / F3 AA (rep movsb / rep stosb)
# followed by 66 90, a two-byte nop.
3786.Lcbc_enc_tail:
3787 mov $len,%rcx # zaps $key
3788 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3789 .long 0x9066A4F3 # rep movsb
3790 mov \$16,%ecx # zero tail
3791 sub $len,%rcx
3792 xor %eax,%eax
3793 .long 0x9066AAF3 # rep stosb
3794 lea -16(%rdi),%rdi # rewind $out by 1 block
3795 mov $rnds_,$rounds # restore $rounds
3796 mov %rdi,%rsi # $inp and $out are the same
3797 mov $key_,$key # restore $key
# len is cleared so that the loop's sub/jnc and add/jnz sequence processes
# exactly one more block and then falls through to the exit code.
3798 xor $len,$len # len=16
3799 jmp .Lcbc_enc_loop # one more spin
3800
3801#--------------------------- CBC DECRYPT ------------------------------#
3802.align 16
3803.Lcbc_decrypt:
3804 cmp \$16,$len
3805 jne .Lcbc_decrypt_bulk
3806
3807 # handle single block without allocating stack frame,
3808 # useful in ciphertext stealing mode
3809 movdqu ($inp),$inout0 # load input
3810 movdqu ($ivp),$inout1 # load iv
3811 movdqa $inout0,$inout2 # future iv
3812___
# Generated single-block decryption of inout0.
3813 &aesni_generate1("dec",$key,$rnds_);
3814$code.=<<___;
3815 pxor $rndkey0,$rndkey0 # clear register bank
3816 pxor $rndkey1,$rndkey1
3817 movdqu $inout2,($ivp) # store iv
3818 xorps $inout1,$inout0 # ^=iv
3819 pxor $inout1,$inout1
3820 movups $inout0,($out) # store output
3821 pxor $inout0,$inout0
3822 jmp .Lcbc_ret
3823.align 16
# Bulk decrypt: r11 remembers the entry stack pointer, rbp is saved, and a
# 16-byte aligned frame of frame_size bytes is carved out.
3824.Lcbc_decrypt_bulk:
3825 lea (%rsp),%r11 # frame pointer
3826.cfi_def_cfa_register %r11
3827 push %rbp
3828.cfi_push %rbp
3829 sub \$$frame_size,%rsp
3830 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
3831___
# Win64 only: save xmm6-xmm15 above the 16-byte scratch slot at (%rsp).
3832$code.=<<___ if ($win64);
3833 movaps %xmm6,0x10(%rsp)
3834 movaps %xmm7,0x20(%rsp)
3835 movaps %xmm8,0x30(%rsp)
3836 movaps %xmm9,0x40(%rsp)
3837 movaps %xmm10,0x50(%rsp)
3838 movaps %xmm11,0x60(%rsp)
3839 movaps %xmm12,0x70(%rsp)
3840 movaps %xmm13,0x80(%rsp)
3841 movaps %xmm14,0x90(%rsp)
3842 movaps %xmm15,0xa0(%rsp)
3843.Lcbc_decrypt_body:
3844___
3845
# From here on both the key backup and the look-ahead input pointer live in
# %rbp (saved by the push above); the two names alias one register.
3846my $inp_=$key_="%rbp"; # reassign $key_
3847
3848$code.=<<___;
3849 mov $key,$key_ # [re-]backup $key [after reassignment]
3850 movups ($ivp),$iv
3851 mov $rnds_,$rounds
3852 cmp \$0x50,$len
3853 jbe .Lcbc_dec_tail
3854
# More than five blocks: preload six blocks, keeping copies of the first
# five ciphertexts in in0..in4 since they are needed for the CBC XOR.
3855 $movkey ($key),$rndkey0
3856 movdqu 0x00($inp),$inout0 # load input
3857 movdqu 0x10($inp),$inout1
3858 movdqa $inout0,$in0
3859 movdqu 0x20($inp),$inout2
3860 movdqa $inout1,$in1
3861 movdqu 0x30($inp),$inout3
3862 movdqa $inout2,$in2
3863 movdqu 0x40($inp),$inout4
3864 movdqa $inout3,$in3
3865 movdqu 0x50($inp),$inout5
3866 movdqa $inout4,$in4
3867 mov OPENSSL_ia32cap_P+4(%rip),%r9d
3868 cmp \$0x70,$len
3869 jbe .Lcbc_dec_six_or_seven
3870
3871 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3872 sub \$0x50,$len # $len is biased by -5*16
3873 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
3874 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3875 sub \$0x20,$len # $len is biased by -7*16
3876 lea 0x70($key),$key # size optimization
3877 jmp .Lcbc_dec_loop8_enter
3878.align 16
# 8-block loop. The eighth result of the previous iteration is stored at
# the top of the next one. key is biased by +0x70, hence the -0x70 in every
# round-key displacement.
3879.Lcbc_dec_loop8:
3880 movups $inout7,($out)
3881 lea 0x10($out),$out
3882.Lcbc_dec_loop8_enter:
3883 movdqu 0x60($inp),$inout6
3884 pxor $rndkey0,$inout0
3885 movdqu 0x70($inp),$inout7
3886 pxor $rndkey0,$inout1
3887 $movkey 0x10-0x70($key),$rndkey1
3888 pxor $rndkey0,$inout2
# Branch-free look-ahead pointer: starts as -1, the adc further down adds
# the borrow of this cmp, and the and/add turn it into inp+128 when there
# was no borrow or plain inp otherwise. The instructions between cmp and
# adc are assumed not to alter flags. Presumably this keeps the preloads
# for the next iteration inside the input buffer - TODO confirm.
3889 mov \$-1,$inp_
3890 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3891 pxor $rndkey0,$inout3
3892 pxor $rndkey0,$inout4
3893 pxor $rndkey0,$inout5
3894 pxor $rndkey0,$inout6
3895
3896 aesdec $rndkey1,$inout0
3897 pxor $rndkey0,$inout7
3898 $movkey 0x20-0x70($key),$rndkey0
3899 aesdec $rndkey1,$inout1
3900 aesdec $rndkey1,$inout2
3901 aesdec $rndkey1,$inout3
3902 aesdec $rndkey1,$inout4
3903 aesdec $rndkey1,$inout5
3904 aesdec $rndkey1,$inout6
3905 adc \$0,$inp_
3906 and \$128,$inp_
3907 aesdec $rndkey1,$inout7
3908 add $inp,$inp_
3909 $movkey 0x30-0x70($key),$rndkey1
3910___
# Emit the remaining middle rounds for all eight blocks, alternating
# between the two round-key registers. The cmp emitted before round i==7
# feeds the jb emitted after that round and the je emitted after i==9, so
# the unrolled sequence is left early according to the round count; the
# aesdec and key loads in between are assumed not to alter flags.
# The nop lines are presumably padding for code alignment - TODO confirm.
3911for($i=1;$i<12;$i++) {
3912my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3913$code.=<<___ if ($i==7);
3914 cmp \$11,$rounds
3915___
3916$code.=<<___;
3917 aesdec $rndkeyx,$inout0
3918 aesdec $rndkeyx,$inout1
3919 aesdec $rndkeyx,$inout2
3920 aesdec $rndkeyx,$inout3
3921 aesdec $rndkeyx,$inout4
3922 aesdec $rndkeyx,$inout5
3923 aesdec $rndkeyx,$inout6
3924 aesdec $rndkeyx,$inout7
3925 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3926___
3927$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3928 nop
3929___
3930$code.=<<___ if ($i==7);
3931 jb .Lcbc_dec_done
3932___
3933$code.=<<___ if ($i==9);
3934 je .Lcbc_dec_done
3935___
3936$code.=<<___ if ($i==11);
3937 jmp .Lcbc_dec_done
3938___
3939}
3940$code.=<<___;
3941.align 16
# Last two rounds of the 8-block group. The previous ciphertext of each
# block (iv, in0..in4, plus the blocks reloaded from 0x50 and 0x60 of inp)
# is XORed with rndkey0 and handed to aesdeclast as its round key, which
# folds the CBC XOR into the final round. Loads of the next group (through
# inp_) and the stores of this group are interleaved with that.
3942.Lcbc_dec_done:
3943 aesdec $rndkey1,$inout0
3944 aesdec $rndkey1,$inout1
3945 pxor $rndkey0,$iv
3946 pxor $rndkey0,$in0
3947 aesdec $rndkey1,$inout2
3948 aesdec $rndkey1,$inout3
3949 pxor $rndkey0,$in1
3950 pxor $rndkey0,$in2
3951 aesdec $rndkey1,$inout4
3952 aesdec $rndkey1,$inout5
3953 pxor $rndkey0,$in3
3954 pxor $rndkey0,$in4
3955 aesdec $rndkey1,$inout6
3956 aesdec $rndkey1,$inout7
3957 movdqu 0x50($inp),$rndkey1
3958
3959 aesdeclast $iv,$inout0
3960 movdqu 0x60($inp),$iv # borrow $iv
3961 pxor $rndkey0,$rndkey1
3962 aesdeclast $in0,$inout1
3963 pxor $rndkey0,$iv
3964 movdqu 0x70($inp),$rndkey0 # next IV
3965 aesdeclast $in1,$inout2
3966 lea 0x80($inp),$inp
3967 movdqu 0x00($inp_),$in0
3968 aesdeclast $in2,$inout3
3969 aesdeclast $in3,$inout4
3970 movdqu 0x10($inp_),$in1
3971 movdqu 0x20($inp_),$in2
3972 aesdeclast $in4,$inout5
3973 aesdeclast $rndkey1,$inout6
3974 movdqu 0x30($inp_),$in3
3975 movdqu 0x40($inp_),$in4
3976 aesdeclast $iv,$inout7
3977 movdqa $rndkey0,$iv # return $iv
3978 movdqu 0x50($inp_),$rndkey1
3979 $movkey -0x70($key),$rndkey0
3980
# Store seven results; the eighth stays in inout7 and is written either at
# the top of the next iteration or by the exit code below.
3981 movups $inout0,($out) # store output
3982 movdqa $in0,$inout0
3983 movups $inout1,0x10($out)
3984 movdqa $in1,$inout1
3985 movups $inout2,0x20($out)
3986 movdqa $in2,$inout2
3987 movups $inout3,0x30($out)
3988 movdqa $in3,$inout3
3989 movups $inout4,0x40($out)
3990 movdqa $in4,$inout4
3991 movups $inout5,0x50($out)
3992 movdqa $rndkey1,$inout5
3993 movups $inout6,0x60($out)
3994 lea 0x70($out),$out
3995
3996 sub \$0x80,$len
3997 ja .Lcbc_dec_loop8
3998
# Loop exit: undo the key and length biases, then either finish with the
# pending eighth block or continue with a 1-5 or 6-7 block tail.
3999 movaps $inout7,$inout0
4000 lea -0x70($key),$key
4001 add \$0x70,$len
4002 jle .Lcbc_dec_clear_tail_collected
4003 movups $inout7,($out)
4004 lea 0x10($out),$out
4005 cmp \$0x50,$len
4006 jbe .Lcbc_dec_tail
4007
4008 movaps $in0,$inout0
4009.Lcbc_dec_six_or_seven:
4010 cmp \$0x60,$len
4011 ja .Lcbc_dec_seven
4012
# Six blocks: the sixth ciphertext is parked in inout6 and becomes the next
# iv. The last result is left in inout0 for .Lcbc_dec_tail_collected.
4013 movaps $inout5,$inout6
4014 call _aesni_decrypt6
4015 pxor $iv,$inout0 # ^= IV
4016 movaps $inout6,$iv
4017 pxor $in0,$inout1
4018 movdqu $inout0,($out)
4019 pxor $in1,$inout2
4020 movdqu $inout1,0x10($out)
4021 pxor $inout1,$inout1 # clear register bank
4022 pxor $in2,$inout3
4023 movdqu $inout2,0x20($out)
4024 pxor $inout2,$inout2
4025 pxor $in3,$inout4
4026 movdqu $inout3,0x30($out)
4027 pxor $inout3,$inout3
4028 pxor $in4,$inout5
4029 movdqu $inout4,0x40($out)
4030 pxor $inout4,$inout4
4031 lea 0x50($out),$out
4032 movdqa $inout5,$inout0
4033 pxor $inout5,$inout5
4034 jmp .Lcbc_dec_tail_collected
4035
4036.align 16
# Seven blocks: run the 8-lane helper with the eighth lane zeroed. The
# sixth and seventh ciphertexts are reloaded from the input afterwards to
# serve as CBC XOR value and next iv respectively.
4037.Lcbc_dec_seven:
4038 movups 0x60($inp),$inout6
4039 xorps $inout7,$inout7
4040 call _aesni_decrypt8
4041 movups 0x50($inp),$inout7
4042 pxor $iv,$inout0 # ^= IV
4043 movups 0x60($inp),$iv
4044 pxor $in0,$inout1
4045 movdqu $inout0,($out)
4046 pxor $in1,$inout2
4047 movdqu $inout1,0x10($out)
4048 pxor $inout1,$inout1 # clear register bank
4049 pxor $in2,$inout3
4050 movdqu $inout2,0x20($out)
4051 pxor $inout2,$inout2
4052 pxor $in3,$inout4
4053 movdqu $inout3,0x30($out)
4054 pxor $inout3,$inout3
4055 pxor $in4,$inout5
4056 movdqu $inout4,0x40($out)
4057 pxor $inout4,$inout4
4058 pxor $inout7,$inout6
4059 movdqu $inout5,0x50($out)
4060 pxor $inout5,$inout5
4061 lea 0x60($out),$out
4062 movdqa $inout6,$inout0
4063 pxor $inout6,$inout6
4064 pxor $inout7,$inout7
4065 jmp .Lcbc_dec_tail_collected
4066
4067.align 16
# Alternative 6-block loop, entered when the capability test above matched.
# The sixth result of the previous iteration is stored at the top of the
# next one.
4068.Lcbc_dec_loop6:
4069 movups $inout5,($out)
4070 lea 0x10($out),$out
4071 movdqu 0x00($inp),$inout0 # load input
4072 movdqu 0x10($inp),$inout1
4073 movdqa $inout0,$in0
4074 movdqu 0x20($inp),$inout2
4075 movdqa $inout1,$in1
4076 movdqu 0x30($inp),$inout3
4077 movdqa $inout2,$in2
4078 movdqu 0x40($inp),$inout4
4079 movdqa $inout3,$in3
4080 movdqu 0x50($inp),$inout5
4081 movdqa $inout4,$in4
4082.Lcbc_dec_loop6_enter:
4083 lea 0x60($inp),$inp
4084 movdqa $inout5,$inout6
4085
4086 call _aesni_decrypt6
4087
# key and rounds are reloaded from their backups after the call.
4088 pxor $iv,$inout0 # ^= IV
4089 movdqa $inout6,$iv
4090 pxor $in0,$inout1
4091 movdqu $inout0,($out)
4092 pxor $in1,$inout2
4093 movdqu $inout1,0x10($out)
4094 pxor $in2,$inout3
4095 movdqu $inout2,0x20($out)
4096 pxor $in3,$inout4
4097 mov $key_,$key
4098 movdqu $inout3,0x30($out)
4099 pxor $in4,$inout5
4100 mov $rnds_,$rounds
4101 movdqu $inout4,0x40($out)
4102 lea 0x50($out),$out
4103 sub \$0x60,$len
4104 ja .Lcbc_dec_loop6
4105
4106 movdqa $inout5,$inout0
4107 add \$0x50,$len
4108 jle .Lcbc_dec_clear_tail_collected
4109 movups $inout5,($out)
4110 lea 0x10($out),$out
4111
# Tail of at most five blocks: len is stepped down by 16 per loaded block
# and the first step that reaches zero or borrows selects the handler.
4112.Lcbc_dec_tail:
4113 movups ($inp),$inout0
4114 sub \$0x10,$len
4115 jbe .Lcbc_dec_one # $len is 1*16 or less
4116
4117 movups 0x10($inp),$inout1
4118 movaps $inout0,$in0
4119 sub \$0x10,$len
4120 jbe .Lcbc_dec_two # $len is 2*16 or less
4121
4122 movups 0x20($inp),$inout2
4123 movaps $inout1,$in1
4124 sub \$0x10,$len
4125 jbe .Lcbc_dec_three # $len is 3*16 or less
4126
4127 movups 0x30($inp),$inout3
4128 movaps $inout2,$in2
4129 sub \$0x10,$len
4130 jbe .Lcbc_dec_four # $len is 4*16 or less
4131
# Five blocks: use the 6-lane helper with the sixth lane zeroed.
4132 movups 0x40($inp),$inout4 # $len is 5*16 or less
4133 movaps $inout3,$in3
4134 movaps $inout4,$in4
4135 xorps $inout5,$inout5
4136 call _aesni_decrypt6
4137 pxor $iv,$inout0
4138 movaps $in4,$iv
4139 pxor $in0,$inout1
4140 movdqu $inout0,($out)
4141 pxor $in1,$inout2
4142 movdqu $inout1,0x10($out)
4143 pxor $inout1,$inout1 # clear register bank
4144 pxor $in2,$inout3
4145 movdqu $inout2,0x20($out)
4146 pxor $inout2,$inout2
4147 pxor $in3,$inout4
4148 movdqu $inout3,0x30($out)
4149 pxor $inout3,$inout3
4150 lea 0x40($out),$out
4151 movdqa $inout4,$inout0
4152 pxor $inout4,$inout4
4153 pxor $inout5,$inout5
4154 sub \$0x10,$len
4155 jmp .Lcbc_dec_tail_collected
4156
4157.align 16
4158.Lcbc_dec_one:
4159 movaps $inout0,$in0
4160___
# Generated single-block decryption of inout0.
4161 &aesni_generate1("dec",$key,$rounds);
4162$code.=<<___;
4163 xorps $iv,$inout0
4164 movaps $in0,$iv
4165 jmp .Lcbc_dec_tail_collected
4166.align 16
# The two-, three- and four-block handlers follow one pattern: keep the
# last ciphertext as the next iv, decrypt, XOR with the previous
# ciphertexts, store all but the last result and leave that one in inout0.
4167.Lcbc_dec_two:
4168 movaps $inout1,$in1
4169 call _aesni_decrypt2
4170 pxor $iv,$inout0
4171 movaps $in1,$iv
4172 pxor $in0,$inout1
4173 movdqu $inout0,($out)
4174 movdqa $inout1,$inout0
4175 pxor $inout1,$inout1 # clear register bank
4176 lea 0x10($out),$out
4177 jmp .Lcbc_dec_tail_collected
4178.align 16
4179.Lcbc_dec_three:
4180 movaps $inout2,$in2
4181 call _aesni_decrypt3
4182 pxor $iv,$inout0
4183 movaps $in2,$iv
4184 pxor $in0,$inout1
4185 movdqu $inout0,($out)
4186 pxor $in1,$inout2
4187 movdqu $inout1,0x10($out)
4188 pxor $inout1,$inout1 # clear register bank
4189 movdqa $inout2,$inout0
4190 pxor $inout2,$inout2
4191 lea 0x20($out),$out
4192 jmp .Lcbc_dec_tail_collected
4193.align 16
4194.Lcbc_dec_four:
4195 movaps $inout3,$in3
4196 call _aesni_decrypt4
4197 pxor $iv,$inout0
4198 movaps $in3,$iv
4199 pxor $in0,$inout1
4200 movdqu $inout0,($out)
4201 pxor $in1,$inout2
4202 movdqu $inout1,0x10($out)
4203 pxor $inout1,$inout1 # clear register bank
4204 pxor $in2,$inout3
4205 movdqu $inout2,0x20($out)
4206 pxor $inout2,$inout2
4207 movdqa $inout3,$inout0
4208 pxor $inout3,$inout3
4209 lea 0x30($out),$out
4210 jmp .Lcbc_dec_tail_collected
4211
4212.align 16
# Entry used by the loop exits, which have not scrubbed the data registers
# yet; the tail handlers above clear theirs inline and skip this part.
4213.Lcbc_dec_clear_tail_collected:
4214 pxor $inout1,$inout1 # clear register bank
4215 pxor $inout2,$inout2
4216 pxor $inout3,$inout3
4217___
# On Win64 these four registers are restored from the stack frame in the
# epilogue instead of being cleared here.
4218$code.=<<___ if (!$win64);
4219 pxor $inout4,$inout4 # %xmm6..9
4220 pxor $inout5,$inout5
4221 pxor $inout6,$inout6
4222 pxor $inout7,$inout7
4223___
4224$code.=<<___;
# Common finish: write back the chaining value, then store the last block
# held in inout0, either whole or through the partial-tail path when the
# low four bits of len are non-zero.
4225.Lcbc_dec_tail_collected:
4226 movups $iv,($ivp)
4227 and \$15,$len
4228 jnz .Lcbc_dec_tail_partial
4229 movups $inout0,($out)
4230 pxor $inout0,$inout0
4231 jmp .Lcbc_dec_ret
4232.align 16
# Partial tail: the block is spilled to the scratch slot at (%rsp), part of
# it is copied to the output with rep movsb, and the slot is then
# overwritten with the zeroed inout0.
# NOTE(review): the byte count used is 16 - (len & 15); confirm that this is
# the intended number of bytes for a length that is not a multiple of 16.
4233.Lcbc_dec_tail_partial:
4234 movaps $inout0,(%rsp)
4235 pxor $inout0,$inout0
4236 mov \$16,%rcx
4237 mov $out,%rdi
4238 sub $len,%rcx
4239 lea (%rsp),%rsi
4240 .long 0x9066A4F3 # rep movsb
4241 movdqa $inout0,(%rsp)
4242
4243.Lcbc_dec_ret:
4244 xorps $rndkey0,$rndkey0 # %xmm0
4245 pxor $rndkey1,$rndkey1
4246___
# Win64 only: restore xmm6-xmm15 and overwrite each save slot with the
# zeroed xmm0.
4247$code.=<<___ if ($win64);
4248 movaps 0x10(%rsp),%xmm6
4249 movaps %xmm0,0x10(%rsp) # clear stack
4250 movaps 0x20(%rsp),%xmm7
4251 movaps %xmm0,0x20(%rsp)
4252 movaps 0x30(%rsp),%xmm8
4253 movaps %xmm0,0x30(%rsp)
4254 movaps 0x40(%rsp),%xmm9
4255 movaps %xmm0,0x40(%rsp)
4256 movaps 0x50(%rsp),%xmm10
4257 movaps %xmm0,0x50(%rsp)
4258 movaps 0x60(%rsp),%xmm11
4259 movaps %xmm0,0x60(%rsp)
4260 movaps 0x70(%rsp),%xmm12
4261 movaps %xmm0,0x70(%rsp)
4262 movaps 0x80(%rsp),%xmm13
4263 movaps %xmm0,0x80(%rsp)
4264 movaps 0x90(%rsp),%xmm14
4265 movaps %xmm0,0x90(%rsp)
4266 movaps 0xa0(%rsp),%xmm15
4267 movaps %xmm0,0xa0(%rsp)
4268___
4269$code.=<<___;
# rbp was pushed right below the entry stack pointer kept in r11.
4270 mov -8(%r11),%rbp
4271.cfi_restore %rbp
4272 lea (%r11),%rsp
4273.cfi_def_cfa_register %rsp
4274.Lcbc_ret:
4275 ret
4276.cfi_endproc
4277.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4278___
4279}
4280
4281# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4282# int bits, AES_KEY *key)
4283#
4284# input: $inp user-supplied key
4285# $bits $inp length in bits
4286# $key pointer to key schedule
4287# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4288# *$key key schedule
4289#
# The decryption schedule is built in place: the encryption key setup is
# run first, then the round keys are reversed end-for-end, with aesimc
# applied to every round key except the two outermost ones.
4290{ my ($inp,$bits,$key) = @_4args;
# Use the 32-bit form of the register holding bits.
4291 $bits =~ s/%r/%e/;
4292
4293$code.=<<___;
4294.globl ${PREFIX}_set_decrypt_key
4295.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
4296.align 16
4297${PREFIX}_set_decrypt_key:
4298.cfi_startproc
4299 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4300.cfi_adjust_cfa_offset 8
4301 call __aesni_set_encrypt_key
# bits is scaled by 16 to become a byte offset into the schedule; a
# non-zero eax from the call is passed straight through as the error code.
4302 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
4303 test %eax,%eax
4304 jnz .Ldec_key_ret
4305 lea 16($key,$bits),$inp # points at the end of key schedule
4306
# First and last round keys change places unmodified.
4307 $movkey ($key),%xmm0 # just swap
4308 $movkey ($inp),%xmm1
4309 $movkey %xmm0,($inp)
4310 $movkey %xmm1,($key)
4311 lea 16($key),$key
4312 lea -16($inp),$inp
4313
# Walk inwards from both ends, transforming and exchanging one pair of
# round keys per iteration, for as long as the upper pointer is above the
# lower one.
4314.Ldec_key_inverse:
4315 $movkey ($key),%xmm0 # swap and inverse
4316 $movkey ($inp),%xmm1
4317 aesimc %xmm0,%xmm0
4318 aesimc %xmm1,%xmm1
4319 lea 16($key),$key
4320 lea -16($inp),$inp
4321 $movkey %xmm0,16($inp)
4322 $movkey %xmm1,-16($key)
4323 cmp $key,$inp
4324 ja .Ldec_key_inverse
4325
# The remaining middle round key is transformed in place; both scratch
# registers are cleared before returning.
4326 $movkey ($key),%xmm0 # inverse middle
4327 aesimc %xmm0,%xmm0
4328 pxor %xmm1,%xmm1
4329 $movkey %xmm0,($inp)
4330 pxor %xmm0,%xmm0
4331.Ldec_key_ret:
4332 add \$8,%rsp
4333.cfi_adjust_cfa_offset -8
4334 ret
4335.cfi_endproc
4336.LSEH_end_set_decrypt_key:
4337.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4338___
4339
4340
4341# This is based on submission from Intel by
4342# Huang Ying
4343# Vinodh Gopal
4344# Kahraman Akdemir
4345#
4346# Aggressively optimized with respect to aeskeygenassist's critical path
4347# and is contained in %xmm0-5 to meet the Win64 ABI requirement.
4348#
4349# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4350# int bits, AES_KEY * const key);
4351#
4352# input: $inp user-supplied key
4353# $bits $inp length in bits
4354# $key pointer to key schedule
4355# output: %eax 0 denoting success, -1 or -2 - failure (see C)
4356# $bits rounds-1 (used in aesni_set_decrypt_key)
4357# *$key key schedule
4358# $key pointer to key schedule (used in
4359# aesni_set_decrypt_key)
4360#
4361# The subroutine is frame-less, which means that only volatile registers
4362# are used. Note that it's declared "abi-omnipotent", which means that
4363# the number of volatile registers is smaller on Windows.
4364#
4365$code.=<<___;
4366.globl ${PREFIX}_set_encrypt_key
4367.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
4368.align 16
4369${PREFIX}_set_encrypt_key:
4370__aesni_set_encrypt_key:
4371.cfi_startproc
4372 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4373.cfi_adjust_cfa_offset 8
4374 mov \$-1,%rax
4375 test $inp,$inp
4376 jz .Lenc_key_ret
4377 test $key,$key
4378 jz .Lenc_key_ret
4379
4380 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
4381 movups ($inp),%xmm0 # pull first 128 bits of *userKey
4382 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
4383 and OPENSSL_ia32cap_P+4(%rip),%r10d
4384 lea 16($key),%rax # %rax is used as modifiable copy of $key
4385 cmp \$256,$bits
4386 je .L14rounds
4387 cmp \$192,$bits
4388 je .L12rounds
4389 cmp \$128,$bits
4390 jne .Lbad_keybits
4391
4392.L10rounds:
4393 mov \$9,$bits # 10 rounds for 128-bit key
4394 cmp \$`1<<28`,%r10d # AVX, bit no XOP
4395 je .L10rounds_alt
4396
4397 $movkey %xmm0,($key) # round 0
4398 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4399 call .Lkey_expansion_128_cold
4400 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4401 call .Lkey_expansion_128
4402 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4403 call .Lkey_expansion_128
4404 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4405 call .Lkey_expansion_128
4406 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4407 call .Lkey_expansion_128
4408 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4409 call .Lkey_expansion_128
4410 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4411 call .Lkey_expansion_128
4412 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4413 call .Lkey_expansion_128
4414 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4415 call .Lkey_expansion_128
4416 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4417 call .Lkey_expansion_128
4418 $movkey %xmm0,(%rax)
4419 mov $bits,80(%rax) # 240(%rdx)
4420 xor %eax,%eax
4421 jmp .Lenc_key_ret
4422
4423.align 16
4424.L10rounds_alt:
4425 movdqa .Lkey_rotate(%rip),%xmm5
4426 mov \$8,%r10d
4427 movdqa .Lkey_rcon1(%rip),%xmm4
4428 movdqa %xmm0,%xmm2
4429 movdqu %xmm0,($key)
4430 jmp .Loop_key128
4431
4432.align 16
4433.Loop_key128:
4434 pshufb %xmm5,%xmm0
4435 aesenclast %xmm4,%xmm0
4436 pslld \$1,%xmm4
4437 lea 16(%rax),%rax
4438
4439 movdqa %xmm2,%xmm3
4440 pslldq \$4,%xmm2
4441 pxor %xmm2,%xmm3
4442 pslldq \$4,%xmm2
4443 pxor %xmm2,%xmm3
4444 pslldq \$4,%xmm2
4445 pxor %xmm3,%xmm2
4446
4447 pxor %xmm2,%xmm0
4448 movdqu %xmm0,-16(%rax)
4449 movdqa %xmm0,%xmm2
4450
4451 dec %r10d
4452 jnz .Loop_key128
4453
4454 movdqa .Lkey_rcon1b(%rip),%xmm4
4455
4456 pshufb %xmm5,%xmm0
4457 aesenclast %xmm4,%xmm0
4458 pslld \$1,%xmm4
4459
4460 movdqa %xmm2,%xmm3
4461 pslldq \$4,%xmm2
4462 pxor %xmm2,%xmm3
4463 pslldq \$4,%xmm2
4464 pxor %xmm2,%xmm3
4465 pslldq \$4,%xmm2
4466 pxor %xmm3,%xmm2
4467
4468 pxor %xmm2,%xmm0
4469 movdqu %xmm0,(%rax)
4470
4471 movdqa %xmm0,%xmm2
4472 pshufb %xmm5,%xmm0
4473 aesenclast %xmm4,%xmm0
4474
4475 movdqa %xmm2,%xmm3
4476 pslldq \$4,%xmm2
4477 pxor %xmm2,%xmm3
4478 pslldq \$4,%xmm2
4479 pxor %xmm2,%xmm3
4480 pslldq \$4,%xmm2
4481 pxor %xmm3,%xmm2
4482
4483 pxor %xmm2,%xmm0
4484 movdqu %xmm0,16(%rax)
4485
4486 mov $bits,96(%rax) # 240($key)
4487 xor %eax,%eax
4488 jmp .Lenc_key_ret
4489
4490.align 16
4491.L12rounds:
4492 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4493 mov \$11,$bits # 12 rounds for 192
4494 cmp \$`1<<28`,%r10d # AVX, but no XOP
4495 je .L12rounds_alt
4496
4497 $movkey %xmm0,($key) # round 0
4498 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4499 call .Lkey_expansion_192a_cold
4500 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4501 call .Lkey_expansion_192b
4502 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4503 call .Lkey_expansion_192a
4504 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4505 call .Lkey_expansion_192b
4506 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4507 call .Lkey_expansion_192a
4508 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4509 call .Lkey_expansion_192b
4510 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4511 call .Lkey_expansion_192a
4512 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4513 call .Lkey_expansion_192b
4514 $movkey %xmm0,(%rax)
4515 mov $bits,48(%rax) # 240(%rdx)
4516 xor %rax, %rax
4517 jmp .Lenc_key_ret
4518
4519.align 16
4520.L12rounds_alt:
4521 movdqa .Lkey_rotate192(%rip),%xmm5
4522 movdqa .Lkey_rcon1(%rip),%xmm4
4523 mov \$8,%r10d
4524 movdqu %xmm0,($key)
4525 jmp .Loop_key192
4526
4527.align 16
4528.Loop_key192:
4529 movq %xmm2,0(%rax)
4530 movdqa %xmm2,%xmm1
4531 pshufb %xmm5,%xmm2
4532 aesenclast %xmm4,%xmm2
4533 pslld \$1, %xmm4
4534 lea 24(%rax),%rax
4535
4536 movdqa %xmm0,%xmm3
4537 pslldq \$4,%xmm0
4538 pxor %xmm0,%xmm3
4539 pslldq \$4,%xmm0
4540 pxor %xmm0,%xmm3
4541 pslldq \$4,%xmm0
4542 pxor %xmm3,%xmm0
4543
4544 pshufd \$0xff,%xmm0,%xmm3
4545 pxor %xmm1,%xmm3
4546 pslldq \$4,%xmm1
4547 pxor %xmm1,%xmm3
4548
4549 pxor %xmm2,%xmm0
4550 pxor %xmm3,%xmm2
4551 movdqu %xmm0,-16(%rax)
4552
4553 dec %r10d
4554 jnz .Loop_key192
4555
4556 mov $bits,32(%rax) # 240($key)
4557 xor %eax,%eax
4558 jmp .Lenc_key_ret
4559
4560.align 16
4561.L14rounds:
4562 movups 16($inp),%xmm2 # remaining half of *userKey
4563 mov \$13,$bits # 14 rounds for 256
4564 lea 16(%rax),%rax
4565 cmp \$`1<<28`,%r10d # AVX, but no XOP
4566 je .L14rounds_alt
4567
4568 $movkey %xmm0,($key) # round 0
4569 $movkey %xmm2,16($key) # round 1
4570 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4571 call .Lkey_expansion_256a_cold
4572 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4573 call .Lkey_expansion_256b
4574 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4575 call .Lkey_expansion_256a
4576 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4577 call .Lkey_expansion_256b
4578 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4579 call .Lkey_expansion_256a
4580 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4581 call .Lkey_expansion_256b
4582 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4583 call .Lkey_expansion_256a
4584 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4585 call .Lkey_expansion_256b
4586 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4587 call .Lkey_expansion_256a
4588 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4589 call .Lkey_expansion_256b
4590 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4591 call .Lkey_expansion_256a
4592 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4593 call .Lkey_expansion_256b
4594 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4595 call .Lkey_expansion_256a
4596 $movkey %xmm0,(%rax)
4597 mov $bits,16(%rax) # 240(%rdx)
4598 xor %rax,%rax
4599 jmp .Lenc_key_ret
4600
4601.align 16
4602.L14rounds_alt:
4603 movdqa .Lkey_rotate(%rip),%xmm5
4604 movdqa .Lkey_rcon1(%rip),%xmm4
4605 mov \$7,%r10d
4606 movdqu %xmm0,0($key)
4607 movdqa %xmm2,%xmm1
4608 movdqu %xmm2,16($key)
4609 jmp .Loop_key256
4610
4611.align 16
4612.Loop_key256:
4613 pshufb %xmm5,%xmm2
4614 aesenclast %xmm4,%xmm2
4615
4616 movdqa %xmm0,%xmm3
4617 pslldq \$4,%xmm0
4618 pxor %xmm0,%xmm3
4619 pslldq \$4,%xmm0
4620 pxor %xmm0,%xmm3
4621 pslldq \$4,%xmm0
4622 pxor %xmm3,%xmm0
4623 pslld \$1,%xmm4
4624
4625 pxor %xmm2,%xmm0
4626 movdqu %xmm0,(%rax)
4627
4628 dec %r10d
4629 jz .Ldone_key256
4630
4631 pshufd \$0xff,%xmm0,%xmm2
4632 pxor %xmm3,%xmm3
4633 aesenclast %xmm3,%xmm2
4634
4635 movdqa %xmm1,%xmm3
4636 pslldq \$4,%xmm1
4637 pxor %xmm1,%xmm3
4638 pslldq \$4,%xmm1
4639 pxor %xmm1,%xmm3
4640 pslldq \$4,%xmm1
4641 pxor %xmm3,%xmm1
4642
4643 pxor %xmm1,%xmm2
4644 movdqu %xmm2,16(%rax)
4645 lea 32(%rax),%rax
4646 movdqa %xmm2,%xmm1
4647
4648 jmp .Loop_key256
4649
4650.Ldone_key256:
4651 mov $bits,16(%rax) # 240($key)
4652 xor %eax,%eax
4653 jmp .Lenc_key_ret
4654
4655.align 16
4656.Lbad_keybits:
4657 mov \$-2,%rax
4658.Lenc_key_ret:
4659 pxor %xmm0,%xmm0
4660 pxor %xmm1,%xmm1
4661 pxor %xmm2,%xmm2
4662 pxor %xmm3,%xmm3
4663 pxor %xmm4,%xmm4
4664 pxor %xmm5,%xmm5
4665 add \$8,%rsp
4666.cfi_adjust_cfa_offset -8
4667 ret
4668.LSEH_end_set_encrypt_key:
4669
4670
4671.align 16
4672.Lkey_expansion_128:
4673 $movkey %xmm0,(%rax)
4674 lea 16(%rax),%rax
4675.Lkey_expansion_128_cold:
4676 shufps \$0b00010000,%xmm0,%xmm4
4677 xorps %xmm4, %xmm0
4678 shufps \$0b10001100,%xmm0,%xmm4
4679 xorps %xmm4, %xmm0
4680 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4681 xorps %xmm1,%xmm0
4682 ret
4683
4684.align 16
4685.Lkey_expansion_192a:
4686 $movkey %xmm0,(%rax)
4687 lea 16(%rax),%rax
4688.Lkey_expansion_192a_cold:
4689 movaps %xmm2, %xmm5
4690.Lkey_expansion_192b_warm:
4691 shufps \$0b00010000,%xmm0,%xmm4
4692 movdqa %xmm2,%xmm3
4693 xorps %xmm4,%xmm0
4694 shufps \$0b10001100,%xmm0,%xmm4
4695 pslldq \$4,%xmm3
4696 xorps %xmm4,%xmm0
4697 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4698 pxor %xmm3,%xmm2
4699 pxor %xmm1,%xmm0
4700 pshufd \$0b11111111,%xmm0,%xmm3
4701 pxor %xmm3,%xmm2
4702 ret
4703
4704.align 16
4705.Lkey_expansion_192b:
4706 movaps %xmm0,%xmm3
4707 shufps \$0b01000100,%xmm0,%xmm5
4708 $movkey %xmm5,(%rax)
4709 shufps \$0b01001110,%xmm2,%xmm3
4710 $movkey %xmm3,16(%rax)
4711 lea 32(%rax),%rax
4712 jmp .Lkey_expansion_192b_warm
4713
4714.align 16
4715.Lkey_expansion_256a:
4716 $movkey %xmm2,(%rax)
4717 lea 16(%rax),%rax
4718.Lkey_expansion_256a_cold:
4719 shufps \$0b00010000,%xmm0,%xmm4
4720 xorps %xmm4,%xmm0
4721 shufps \$0b10001100,%xmm0,%xmm4
4722 xorps %xmm4,%xmm0
4723 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4724 xorps %xmm1,%xmm0
4725 ret
4726
4727.align 16
4728.Lkey_expansion_256b:
4729 $movkey %xmm0,(%rax)
4730 lea 16(%rax),%rax
4731
4732 shufps \$0b00010000,%xmm2,%xmm4
4733 xorps %xmm4,%xmm2
4734 shufps \$0b10001100,%xmm2,%xmm4
4735 xorps %xmm4,%xmm2
4736 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4737 xorps %xmm1,%xmm2
4738 ret
4739.cfi_endproc
4740.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4741.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4742___
4743}
4744
4745
# Constant pool appended to $code, aligned to 64 bytes at both ends.
# .Lkey_rotate, .Lkey_rotate192, .Lkey_rcon1 and .Lkey_rcon1b are loaded by
# the *_alt key-schedule paths of ${PREFIX}_set_encrypt_key (pshufb masks in
# %xmm5, round constants in %xmm4).  The remaining labels (.Lbswap_mask,
# .Lincrement32, .Lincrement64, .Lxts_magic, .Lincrement1) are not referenced
# in this part of the file -- presumably they serve the CTR/CCM/XTS code
# elsewhere; confirm before changing them.
# The .asciz line embeds an identification string in the output.
4746$code.=<<___;
4747.align 64
4748.Lbswap_mask:
4749 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4750.Lincrement32:
4751 .long 6,6,6,0
4752.Lincrement64:
4753 .long 1,0,0,0
4754.Lxts_magic:
4755 .long 0x87,0,1,0
4756.Lincrement1:
4757 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4758.Lkey_rotate:
4759 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4760.Lkey_rotate192:
4761 .long 0x04070605,0x04070605,0x04070605,0x04070605
4762.Lkey_rcon1:
4763 .long 1,1,1,1
4764.Lkey_rcon1b:
4765 .long 0x1b,0x1b,0x1b,0x1b
4766
4767.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4768.align 64
4769___
4770
# Win64-only section: emits the exception handlers referenced from the
# .xdata records below, followed by the .pdata/.xdata tables themselves.
# Nothing in this block is generated unless $win64 is set.
# The handlers are written against the signature quoted next; $rec, $frame,
# $context and $disp name the registers that carry those four arguments.
4771# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4772# CONTEXT *context,DISPATCHER_CONTEXT *disp)
4773if ($win64) {
4774$rec="%rcx";
4775$frame="%rdx";
4776$context="%r8";
4777$disp="%r9";
4778
4779$code.=<<___;
4780.extern __imp_RtlVirtualUnwind
4781___
# Handlers emitted only for the "aesni" prefix: ecb_ccm64_se_handler,
# ctr_xts_se_handler and ocb_se_handler.  Each one saves the non-volatile
# registers, compares context->Rip against the body/epilogue labels passed in
# HandlerData[], and, when Rip lies inside that window, copies the saved %xmm
# registers from the faulting frame back into the CONTEXT (rep movsq to
# &context.Xmm6) before jumping to .Lcommon_seh_tail.
#  - ecb_ccm64 copies 8 quadwords from context->Rsp and advances %rax by 0x58;
#  - ctr_xts locates the frame through context->R11, copies 20 quadwords and
#    restores context->Rbp;
#  - ocb additionally checks HandlerData[2] (the "pop" label) to skip the
#    %xmm copy, then restores Rbx, Rbp and R12-R14 from below %rax.
4782$code.=<<___ if ($PREFIX eq "aesni");
4783.type ecb_ccm64_se_handler,\@abi-omnipotent
4784.align 16
4785ecb_ccm64_se_handler:
4786 push %rsi
4787 push %rdi
4788 push %rbx
4789 push %rbp
4790 push %r12
4791 push %r13
4792 push %r14
4793 push %r15
4794 pushfq
4795 sub \$64,%rsp
4796
4797 mov 120($context),%rax # pull context->Rax
4798 mov 248($context),%rbx # pull context->Rip
4799
4800 mov 8($disp),%rsi # disp->ImageBase
4801 mov 56($disp),%r11 # disp->HandlerData
4802
4803 mov 0(%r11),%r10d # HandlerData[0]
4804 lea (%rsi,%r10),%r10 # prologue label
4805 cmp %r10,%rbx # context->Rip<prologue label
4806 jb .Lcommon_seh_tail
4807
4808 mov 152($context),%rax # pull context->Rsp
4809
4810 mov 4(%r11),%r10d # HandlerData[1]
4811 lea (%rsi,%r10),%r10 # epilogue label
4812 cmp %r10,%rbx # context->Rip>=epilogue label
4813 jae .Lcommon_seh_tail
4814
4815 lea 0(%rax),%rsi # %xmm save area
4816 lea 512($context),%rdi # &context.Xmm6
4817 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4818 .long 0xa548f3fc # cld; rep movsq
4819 lea 0x58(%rax),%rax # adjust stack pointer
4820
4821 jmp .Lcommon_seh_tail
4822.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4823
4824.type ctr_xts_se_handler,\@abi-omnipotent
4825.align 16
4826ctr_xts_se_handler:
4827 push %rsi
4828 push %rdi
4829 push %rbx
4830 push %rbp
4831 push %r12
4832 push %r13
4833 push %r14
4834 push %r15
4835 pushfq
4836 sub \$64,%rsp
4837
4838 mov 120($context),%rax # pull context->Rax
4839 mov 248($context),%rbx # pull context->Rip
4840
4841 mov 8($disp),%rsi # disp->ImageBase
4842 mov 56($disp),%r11 # disp->HandlerData
4843
4844 mov 0(%r11),%r10d # HandlerData[0]
4845 lea (%rsi,%r10),%r10 # prologue label
4846 cmp %r10,%rbx # context->Rip<prologue label
4847 jb .Lcommon_seh_tail
4848
4849 mov 152($context),%rax # pull context->Rsp
4850
4851 mov 4(%r11),%r10d # HandlerData[1]
4852 lea (%rsi,%r10),%r10 # epilogue label
4853 cmp %r10,%rbx # context->Rip>=epilogue label
4854 jae .Lcommon_seh_tail
4855
4856 mov 208($context),%rax # pull context->R11
4857
4858 lea -0xa8(%rax),%rsi # %xmm save area
4859 lea 512($context),%rdi # & context.Xmm6
4860 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4861 .long 0xa548f3fc # cld; rep movsq
4862
4863 mov -8(%rax),%rbp # restore saved %rbp
4864 mov %rbp,160($context) # restore context->Rbp
4865 jmp .Lcommon_seh_tail
4866.size ctr_xts_se_handler,.-ctr_xts_se_handler
4867
4868.type ocb_se_handler,\@abi-omnipotent
4869.align 16
4870ocb_se_handler:
4871 push %rsi
4872 push %rdi
4873 push %rbx
4874 push %rbp
4875 push %r12
4876 push %r13
4877 push %r14
4878 push %r15
4879 pushfq
4880 sub \$64,%rsp
4881
4882 mov 120($context),%rax # pull context->Rax
4883 mov 248($context),%rbx # pull context->Rip
4884
4885 mov 8($disp),%rsi # disp->ImageBase
4886 mov 56($disp),%r11 # disp->HandlerData
4887
4888 mov 0(%r11),%r10d # HandlerData[0]
4889 lea (%rsi,%r10),%r10 # prologue label
4890 cmp %r10,%rbx # context->Rip<prologue label
4891 jb .Lcommon_seh_tail
4892
4893 mov 4(%r11),%r10d # HandlerData[1]
4894 lea (%rsi,%r10),%r10 # epilogue label
4895 cmp %r10,%rbx # context->Rip>=epilogue label
4896 jae .Lcommon_seh_tail
4897
4898 mov 8(%r11),%r10d # HandlerData[2]
4899 lea (%rsi,%r10),%r10
4900 cmp %r10,%rbx # context->Rip>=pop label
4901 jae .Locb_no_xmm
4902
4903 mov 152($context),%rax # pull context->Rsp
4904
4905 lea (%rax),%rsi # %xmm save area
4906 lea 512($context),%rdi # & context.Xmm6
4907 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4908 .long 0xa548f3fc # cld; rep movsq
4909 lea 0xa0+0x28(%rax),%rax
4910
4911.Locb_no_xmm:
4912 mov -8(%rax),%rbx
4913 mov -16(%rax),%rbp
4914 mov -24(%rax),%r12
4915 mov -32(%rax),%r13
4916 mov -40(%rax),%r14
4917
4918 mov %rbx,144($context) # restore context->Rbx
4919 mov %rbp,160($context) # restore context->Rbp
4920 mov %r12,216($context) # restore context->R12
4921 mov %r13,224($context) # restore context->R13
4922 mov %r14,232($context) # restore context->R14
4923
4924 jmp .Lcommon_seh_tail
4925.size ocb_se_handler,.-ocb_se_handler
4926___
# cbc_se_handler is emitted for every $PREFIX.  Unlike the handlers above it
# compares context->Rip against fixed labels (.Lcbc_decrypt_bulk,
# .Lcbc_decrypt_body, .Lcbc_ret) instead of HandlerData[], and falls straight
# through into .Lcommon_seh_tail.
# .Lcommon_seh_tail is the tail shared by all handlers: it stores %rax as
# context->Rsp, reloads Rsi/Rdi from 8(%rax)/16(%rax), copies the CONTEXT
# (154 quadwords) to disp->ContextRecord, calls RtlVirtualUnwind through its
# import slot and returns 1 (ExceptionContinueSearch).
# The heredoc ends by opening the .pdata section that the next heredocs fill.
4927$code.=<<___;
4928.type cbc_se_handler,\@abi-omnipotent
4929.align 16
4930cbc_se_handler:
4931 push %rsi
4932 push %rdi
4933 push %rbx
4934 push %rbp
4935 push %r12
4936 push %r13
4937 push %r14
4938 push %r15
4939 pushfq
4940 sub \$64,%rsp
4941
4942 mov 152($context),%rax # pull context->Rsp
4943 mov 248($context),%rbx # pull context->Rip
4944
4945 lea .Lcbc_decrypt_bulk(%rip),%r10
4946 cmp %r10,%rbx # context->Rip<"prologue" label
4947 jb .Lcommon_seh_tail
4948
4949 mov 120($context),%rax # pull context->Rax
4950
4951 lea .Lcbc_decrypt_body(%rip),%r10
4952 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
4953 jb .Lcommon_seh_tail
4954
4955 mov 152($context),%rax # pull context->Rsp
4956
4957 lea .Lcbc_ret(%rip),%r10
4958 cmp %r10,%rbx # context->Rip>="epilogue" label
4959 jae .Lcommon_seh_tail
4960
4961 lea 16(%rax),%rsi # %xmm save area
4962 lea 512($context),%rdi # &context.Xmm6
4963 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4964 .long 0xa548f3fc # cld; rep movsq
4965
4966 mov 208($context),%rax # pull context->R11
4967
4968 mov -8(%rax),%rbp # restore saved %rbp
4969 mov %rbp,160($context) # restore context->Rbp
4970
4971.Lcommon_seh_tail:
4972 mov 8(%rax),%rdi
4973 mov 16(%rax),%rsi
4974 mov %rax,152($context) # restore context->Rsp
4975 mov %rsi,168($context) # restore context->Rsi
4976 mov %rdi,176($context) # restore context->Rdi
4977
4978 mov 40($disp),%rdi # disp->ContextRecord
4979 mov $context,%rsi # context
4980 mov \$154,%ecx # sizeof(CONTEXT)
4981 .long 0xa548f3fc # cld; rep movsq
4982
4983 mov $disp,%rsi
4984 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4985 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4986 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4987 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4988 mov 40(%rsi),%r10 # disp->ContextRecord
4989 lea 56(%rsi),%r11 # &disp->HandlerData
4990 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4991 mov %r10,32(%rsp) # arg5
4992 mov %r11,40(%rsp) # arg6
4993 mov %r12,48(%rsp) # arg7
4994 mov %rcx,56(%rsp) # arg8, (NULL)
4995 call *__imp_RtlVirtualUnwind(%rip)
4996
4997 mov \$1,%eax # ExceptionContinueSearch
4998 add \$64,%rsp
4999 popfq
5000 pop %r15
5001 pop %r14
5002 pop %r13
5003 pop %r12
5004 pop %rbp
5005 pop %rbx
5006 pop %rdi
5007 pop %rsi
5008 ret
5009.size cbc_se_handler,.-cbc_se_handler
5010
5011.section .pdata
5012.align 4
5013___
# .pdata entries for the entry points that exist only in the "aesni" build.
# Each entry is three RVAs: function begin label, function end label and the
# matching .LSEH_info_* record emitted into .xdata further down.
5014$code.=<<___ if ($PREFIX eq "aesni");
5015 .rva .LSEH_begin_aesni_ecb_encrypt
5016 .rva .LSEH_end_aesni_ecb_encrypt
5017 .rva .LSEH_info_ecb
5018
5019 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
5020 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
5021 .rva .LSEH_info_ccm64_enc
5022
5023 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
5024 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
5025 .rva .LSEH_info_ccm64_dec
5026
5027 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
5028 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
5029 .rva .LSEH_info_ctr32
5030
5031 .rva .LSEH_begin_aesni_xts_encrypt
5032 .rva .LSEH_end_aesni_xts_encrypt
5033 .rva .LSEH_info_xts_enc
5034
5035 .rva .LSEH_begin_aesni_xts_decrypt
5036 .rva .LSEH_end_aesni_xts_decrypt
5037 .rva .LSEH_info_xts_dec
5038
5039 .rva .LSEH_begin_aesni_ocb_encrypt
5040 .rva .LSEH_end_aesni_ocb_encrypt
5041 .rva .LSEH_info_ocb_enc
5042
5043 .rva .LSEH_begin_aesni_ocb_decrypt
5044 .rva .LSEH_end_aesni_ocb_decrypt
5045 .rva .LSEH_info_ocb_dec
5046___
# .pdata entries emitted for every $PREFIX: CBC plus the two key-setup
# routines, which both point at the same .LSEH_info_key record.  The heredoc
# then switches the output to the .xdata section.
5047$code.=<<___;
5048 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
5049 .rva .LSEH_end_${PREFIX}_cbc_encrypt
5050 .rva .LSEH_info_cbc
5051
5052 .rva ${PREFIX}_set_decrypt_key
5053 .rva .LSEH_end_set_decrypt_key
5054 .rva .LSEH_info_key
5055
5056 .rva ${PREFIX}_set_encrypt_key
5057 .rva .LSEH_end_set_encrypt_key
5058 .rva .LSEH_info_key
5059.section .xdata
5060.align 8
5061___
# .xdata records for the "aesni"-only functions.  Each names its handler and
# is followed by the label RVAs that the handler reads back as HandlerData[]:
# body and epilogue labels, plus a third "pop" label (and a zero pad word)
# for the two OCB records.
5062$code.=<<___ if ($PREFIX eq "aesni");
5063.LSEH_info_ecb:
5064 .byte 9,0,0,0
5065 .rva ecb_ccm64_se_handler
5066 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
5067.LSEH_info_ccm64_enc:
5068 .byte 9,0,0,0
5069 .rva ecb_ccm64_se_handler
5070 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
5071.LSEH_info_ccm64_dec:
5072 .byte 9,0,0,0
5073 .rva ecb_ccm64_se_handler
5074 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
5075.LSEH_info_ctr32:
5076 .byte 9,0,0,0
5077 .rva ctr_xts_se_handler
5078 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
5079.LSEH_info_xts_enc:
5080 .byte 9,0,0,0
5081 .rva ctr_xts_se_handler
5082 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
5083.LSEH_info_xts_dec:
5084 .byte 9,0,0,0
5085 .rva ctr_xts_se_handler
5086 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
5087.LSEH_info_ocb_enc:
5088 .byte 9,0,0,0
5089 .rva ocb_se_handler
5090 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
5091 .rva .Locb_enc_pop
5092 .long 0
5093.LSEH_info_ocb_dec:
5094 .byte 9,0,0,0
5095 .rva ocb_se_handler
5096 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
5097 .rva .Locb_dec_pop
5098 .long 0
5099___
# .xdata records emitted for every $PREFIX.  .LSEH_info_cbc names
# cbc_se_handler and carries no HandlerData.  .LSEH_info_key names no handler;
# per its inline comment its bytes describe the "sub rsp,8" performed at the
# top of the key-setup routines.
5100$code.=<<___;
5101.LSEH_info_cbc:
5102 .byte 9,0,0,0
5103 .rva cbc_se_handler
5104.LSEH_info_key:
5105 .byte 0x01,0x04,0x01,0x00
5106 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
5107___
5108}
5109
# rex(\@opcode,$dst,$src) -- append a REX prefix byte to @opcode if needed.
#
#   \@opcode  reference to the byte list being assembled; modified in place
#   $dst      number of the register placed in the ModR/M "reg" field
#   $src      number of the register placed in the ModR/M "r/m" field
#
# A register number of 8 or above needs an extension bit: 0x04 (REX.R) for
# $dst, 0x01 (REX.B) for $src, both on top of the fixed 0x40 pattern.  When
# neither register needs extending nothing is appended.
#
# The array is reached through a lexical reference rather than by aliasing a
# package typeglob, so no global *opcode is touched and the sub stays valid
# under "use strict".  Callers ignore the return value.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04 if($dst>=8);
    $rex|=0x01 if($src>=8);
    push @$opcode,$rex|0x40 if($rex);
    return;
}
5119
# aesni($line) -- hand-encode one AES-NI instruction as a ".byte" directive
# (presumably so the generated file does not rely on the assembler knowing
# these mnemonics -- confirm against the build requirements).
#
# Recognised forms (AT&T operand order, source first):
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aesimc|aesenc|aesenclast|aesdec|aesdeclast  %xmmS,%xmmD
#   aesenc|aesenclast|aesdec|aesdeclast         disp(%rsp),%xmmD
#
# Returns ".byte\t<comma-separated bytes>" for a recognised instruction and
# $line unchanged for anything else.  The caller substitutes the return value
# into $code, so returning undef for an unknown mnemonic would silently erase
# the instruction; such lines are passed through untouched instead and left
# to the assembler.
sub aesni {
    my $line=shift;
    my @opcode=(0x66);          # every encoding below starts with 0x66

    # third opcode byte, keyed by mnemonic
    my %reg_form = (
        "aesimc" => 0xdb,
        "aesenc" => 0xdc,       "aesenclast" => 0xdd,
        "aesdec" => 0xde,       "aesdeclast" => 0xdf
    );
    my %mem_form = (
        "aesenc" => 0xdc,       "aesenclast" => 0xdd,
        "aesdec" => 0xde,       "aesdeclast" => 0xdf
    );

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # copy the captures at once; $1.. are clobbered by any later match
        my ($imm,$src,$dst)=($2,$3,$4);
        rex(\@opcode,$dst,$src);
        push @opcode,0x0f,0x3a,0xdf;
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);       # ModR/M
        push @opcode,$imm=~/^0/?oct($imm):$imm;         # imm8, 0x../0.. via oct()
        return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic,$src,$dst)=($1,$2,$3);
        return $line if (!defined($reg_form{$mnemonic}));
        rex(\@opcode,$dst,$src);
        push @opcode,0x0f,0x38,$reg_form{$mnemonic};
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);       # ModR/M
        return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic,$off,$dst)=($1,$2,$3);
        return $line if (!defined($mem_form{$mnemonic}));
        push @opcode,0x44 if ($dst>=8);                 # REX.R for %xmm8..15
        push @opcode,0x0f,0x38,$mem_form{$mnemonic};
        push @opcode,0x44|(($dst&7)<<3),0x24;           # ModR/M, SIB: disp8(%rsp)
        push @opcode,($off=~/^0/?oct($off):$off)&0xff;  # disp8
        return ".byte\t".join(',',@opcode);
    }
    return $line;
}
5159
# movbe($disp) -- return the ".byte" directive encoding
# "movbe %eax,$disp(%rsp)".
#
# The fixed bytes are the 0f 38 f1 opcode, a ModR/M byte selecting %eax with
# an 8-bit displacement, and the SIB byte for %rsp; $disp is appended as that
# displacement.  The CPU sign-extends an 8-bit displacement, so a value above
# 127 would silently address a different stack slot.  Such a request is
# rejected here, at generation time, rather than producing wrong code.
sub movbe {
    my $disp=shift;
    # read the value the way the assembler will (leading 0 / 0x via oct())
    my $value=$disp=~/^0/?oct($disp):$disp;

    die "movbe: displacement $disp does not fit in a signed 8-bit field"
        if ($value>127);
    return ".byte 0x0f,0x38,0xf1,0x44,0x24,".$disp;
}
5163
# Post-process the accumulated assembly text and write it out.
# The order matters: backtick expressions are evaluated first, so the later
# patterns see plain numbers in place of the arithmetic.
#
# 1. Replace every `...` span with the result of eval()ing its contents.
5164$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# 2. Pass each "aes... %xmmN" instruction through aesni().  The trailing .*$
#    is part of the match, so anything after the last %xmm operand on that
#    line (such as a comment) is dropped from the output.
5165$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
# Disabled alternative that rewrote movbe as bswap+mov; kept for debugging.
5166#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
# 3. Replace "movbe %eax,N(%rsp)" (decimal N) with the bytes from movbe().
5167$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
5168
5169print $code;
5170
# Closing STDOUT explicitly surfaces write errors that would otherwise be lost.
5171close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette