VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/aes/asm/bsaes-x86_64.pl@101211

Last change on this file: r101211, checked in by vboxsync, 17 months ago

openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527

File size: 74.2 KB
1#! /usr/bin/env perl
2# Copyright 2011-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10###################################################################
11### AES-128 [originally in CTR mode] ###
12### bitsliced implementation for Intel Core 2 processors ###
13### requires support of SSE extensions up to SSSE3 ###
14### Author: Emilia Käsper and Peter Schwabe ###
15### Date: 2009-03-19 ###
16### Public domain ###
17### ###
18### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
19### further information. ###
20###################################################################
21#
22# September 2011.
23#
24# Started as a transliteration to "perlasm", the original code has
25# since undergone the following changes:
26#
27# - code was made position-independent;
28# - rounds were folded into a loop, resulting in a >5x size reduction
29# from 12.5KB to 2.2KB;
30# - the above was possible thanks to a mixcolumns() modification that
31# allows its output to be fed back to aesenc[last]; this was
32# achieved at the cost of two additional inter-register moves;
33# - some instruction reordering and interleaving;
34# - this module doesn't implement a key setup subroutine; instead it
35# relies on conversion of the "conventional" key schedule as returned
36# by AES_set_encrypt_key (see discussion below);
37# - first and last round keys are treated differently, which made it
38# possible to skip one shiftrows(), reduce the bit-sliced key schedule
39# and speed up conversion by 22%;
40# - support for 192- and 256-bit keys was added;
41#
42# Resulting performance in CPU cycles spent to encrypt one byte out
43# of a 4096-byte buffer with a 128-bit key is:
44#
45# Emilia's this(*) difference
46#
47# Core 2 9.30 8.69 +7%
48# Nehalem(**) 7.63 6.88 +11%
49# Atom 17.1 16.4 +4%
50# Silvermont - 12.9
51# Goldmont - 8.85
52#
53# (*) Comparison is not completely fair, because "this" is ECB,
54# i.e. no extra processing such as counter value calculation
55# and XOR-ing of the input, as in Emilia's CTR implementation,
56# is performed. However, the CTR calculations account for no
57# more than 1% of total time, so the comparison is *rather* fair.
58#
59# (**) Results were collected on Westmere, which is considered to
60# be equivalent to Nehalem for this code.
61#
62# As for the key schedule conversion subroutine: the OpenSSL interface
63# relies on per-invocation on-the-fly conversion. This naturally
64# has an impact on performance, especially for short inputs. Conversion
65# time in CPU cycles, and its ratio to the CPU cycles spent in the 8x
66# block function, is:
67#
68# conversion conversion/8x block
69# Core 2 240 0.22
70# Nehalem 180 0.20
71# Atom 430 0.20
72#
73# The ratio values mean that 128-byte blocks will be processed
74# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
75# etc. Also keep in mind that input sizes not divisible by 128 are
76# *effectively* slower, especially the shortest ones, e.g. consecutive
77# 144-byte blocks are processed 44% slower than one would expect,
78# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
79# it's still faster than ["hyper-threading-safe" code path in]
80# aes-x86_64.pl on all lengths above 64 bytes...
81#
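#
# A rough cost model implied by the figures above (our own sketch, not
# part of the module): one key schedule conversion is paid per call,
# and every started 128-byte chunk costs one pass through the 8x block
# function. On Core 2 that is 240 cycles of conversion against roughly
# 8.69*128 ~= 1110 cycles per 8x block, which is where the 0.22 ratio
# comes from, and it also suggests why sizes not divisible by 128 are
# effectively slower: the tail still takes a full 8x pass.
#
#	use POSIX qw(ceil);
#	sub estimated_cycles {		# hypothetical, Core 2 figures
#		my ($bytes) = @_;
#		my ($conv, $block8) = (240, 8.69*128);
#		return $conv + ceil($bytes/128)*$block8;
#	}
#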
82# October 2011.
83#
84# Add decryption procedure. Performance in CPU cycles spent to decrypt
85# one byte out of a 4096-byte buffer with a 128-bit key is:
86#
87# Core 2 9.98
88# Nehalem 7.80
89# Atom 17.9
90# Silvermont 14.0
91# Goldmont 10.2
92#
93# November 2011.
94#
95# Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than 80
96# bytes is suboptimal, but XTS is meant to be used with larger blocks...
97#
98# <[email protected]>
99
100# $output is the last argument if it looks like a file (it has an extension)
101# $flavour is the first argument if it doesn't look like a file
102$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
103$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
104
105$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106
107$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
108( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
109( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
110die "can't locate x86_64-xlate.pl";
111
112open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
113 or die "can't call $xlate: $!";
114*STDOUT=*OUT;
115
116my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
117my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
118my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
119
120{
121my ($key,$rounds,$const)=("%rax","%r10d","%r11");
122
123sub Sbox {
124# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
125# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
126my @b=@_[0..7];
127my @t=@_[8..11];
128my @s=@_[12..15];
129 &InBasisChange (@b);
130 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
131 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
132}
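
# For cross-checking, a scalar reference model of the S-box this
# computes per byte (our own sketch, not used by the module): inversion
# in GF(2^8) followed by the affine map. The basis changes above let
# Inv_GF256 do the inversion in tower-field GF(((2^2)^2)^2) arithmetic,
# and note that the 0x63 affine constant never shows up here - it
# appears to be folded into the bit-sliced key schedule instead (see
# the "pnot" fix-ups in _bsaes_key_convert below).
sub _sbox_byte_ref {			# hypothetical, operates on one byte
my ($x) = @_;
my $mul = sub {				# GF(2^8) multiply, polynomial 0x11b
	my ($a,$b,$p) = (@_,0);
	for (1..8) { $p ^= $a if $b & 1; $a = ($a<<1) ^ (($a & 0x80) ? 0x11b : 0); $b >>= 1; }
	$p & 0xff;
};
my $inv = 1;
$inv = $mul->($inv,$x) for 1..254;	# x^254 == x^-1, and 0 maps to 0
$inv = 0 unless $x;
my $s = $inv;				# affine map: xor of four rotations
$s ^= ((($inv << $_) | ($inv >> (8-$_))) & 0xff) for 1..4;
return $s ^ 0x63;			# e.g. _sbox_byte_ref(0) == 0x63
}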
133
134sub InBasisChange {
135# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
136# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
137my @b=@_[0..7];
138$code.=<<___;
139 pxor @b[6], @b[5]
140 pxor @b[1], @b[2]
141 pxor @b[0], @b[3]
142 pxor @b[2], @b[6]
143 pxor @b[0], @b[5]
144
145 pxor @b[3], @b[6]
146 pxor @b[7], @b[3]
147 pxor @b[5], @b[7]
148 pxor @b[4], @b[3]
149 pxor @b[5], @b[4]
150 pxor @b[1], @b[3]
151
152 pxor @b[7], @b[2]
153 pxor @b[5], @b[1]
154___
155}
156
157sub OutBasisChange {
158# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
159# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
160my @b=@_[0..7];
161$code.=<<___;
162 pxor @b[6], @b[0]
163 pxor @b[4], @b[1]
164 pxor @b[0], @b[2]
165 pxor @b[6], @b[4]
166 pxor @b[1], @b[6]
167
168 pxor @b[5], @b[1]
169 pxor @b[3], @b[5]
170 pxor @b[7], @b[3]
171 pxor @b[5], @b[7]
172 pxor @b[5], @b[2]
173
174 pxor @b[7], @b[4]
175___
176}
177
178sub InvSbox {
179# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
180# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
181my @b=@_[0..7];
182my @t=@_[8..11];
183my @s=@_[12..15];
184 &InvInBasisChange (@b);
185 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
186 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
187}
188
189sub InvInBasisChange { # OutBasisChange in reverse
190my @b=@_[5,1,2,6,3,7,0,4];
191$code.=<<___;
192 pxor @b[7], @b[4]
193
194 pxor @b[5], @b[7]
195 pxor @b[5], @b[2]
196 pxor @b[7], @b[3]
197 pxor @b[3], @b[5]
198 pxor @b[5], @b[1]
199
200 pxor @b[1], @b[6]
201 pxor @b[0], @b[2]
202 pxor @b[6], @b[4]
203 pxor @b[6], @b[0]
204 pxor @b[4], @b[1]
205___
206}
207
208sub InvOutBasisChange { # InBasisChange in reverse
209my @b=@_[2,5,7,3,6,1,0,4];
210$code.=<<___;
211 pxor @b[5], @b[1]
212 pxor @b[7], @b[2]
213
214 pxor @b[1], @b[3]
215 pxor @b[5], @b[4]
216 pxor @b[5], @b[7]
217 pxor @b[4], @b[3]
218 pxor @b[0], @b[5]
219 pxor @b[7], @b[3]
220 pxor @b[2], @b[6]
221 pxor @b[1], @b[2]
222 pxor @b[3], @b[6]
223
224 pxor @b[0], @b[3]
225 pxor @b[6], @b[5]
226___
227}
228
229sub Mul_GF4 {
230#;*************************************************************
231#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
232#;*************************************************************
233my ($x0,$x1,$y0,$y1,$t0)=@_;
234$code.=<<___;
235 movdqa $y0, $t0
236 pxor $y1, $t0
237 pand $x0, $t0
238 pxor $x1, $x0
239 pand $y0, $x1
240 pand $y1, $x0
241 pxor $x1, $x0
242 pxor $t0, $x1
243___
244}
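
# A scalar model of the sequence above (our own sketch): with each
# factor held as two bits, it computes
#	x1' = (x1&y0) ^ (x0&(y0^y1))
#	x0' = ((x0^x1)&y1) ^ (x1&y0)
# which is consistent with GF(2^2) multiplication in a normal-basis
# representation, where 0b11 acts as the multiplicative identity:
sub _mul_gf4_ref {			# hypothetical 2-bit version
my ($x,$y) = @_;
my ($x0,$x1,$y0,$y1) = ($x & 1, ($x>>1) & 1, $y & 1, ($y>>1) & 1);
my $t0  = $x0 & ($y0 ^ $y1);
my $nx0 = (($x0 ^ $x1) & $y1) ^ ($x1 & $y0);
my $nx1 = ($x1 & $y0) ^ $t0;
return ($nx1 << 1) | $nx0;
}
# Tabulating _mul_gf4_ref over all 16 input pairs confirms a field:
# _mul_gf4_ref(3,$a) == $a for every $a, and each non-zero element has
# an inverse.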
245
246sub Mul_GF4_N { # not used, see next subroutine
247# multiply and scale by N
248my ($x0,$x1,$y0,$y1,$t0)=@_;
249$code.=<<___;
250 movdqa $y0, $t0
251 pxor $y1, $t0
252 pand $x0, $t0
253 pxor $x1, $x0
254 pand $y0, $x1
255 pand $y1, $x0
256 pxor $x0, $x1
257 pxor $t0, $x0
258___
259}
260
261sub Mul_GF4_N_GF4 {
262# interleaved Mul_GF4_N and Mul_GF4
263my ($x0,$x1,$y0,$y1,$t0,
264 $x2,$x3,$y2,$y3,$t1)=@_;
265$code.=<<___;
266 movdqa $y0, $t0
267 movdqa $y2, $t1
268 pxor $y1, $t0
269 pxor $y3, $t1
270 pand $x0, $t0
271 pand $x2, $t1
272 pxor $x1, $x0
273 pxor $x3, $x2
274 pand $y0, $x1
275 pand $y2, $x3
276 pand $y1, $x0
277 pand $y3, $x2
278 pxor $x0, $x1
279 pxor $x3, $x2
280 pxor $t0, $x0
281 pxor $t1, $x3
282___
283}
284sub Mul_GF16_2 {
285my @x=@_[0..7];
286my @y=@_[8..11];
287my @t=@_[12..15];
288$code.=<<___;
289 movdqa @x[0], @t[0]
290 movdqa @x[1], @t[1]
291___
292 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
293$code.=<<___;
294 pxor @x[2], @t[0]
295 pxor @x[3], @t[1]
296 pxor @y[2], @y[0]
297 pxor @y[3], @y[1]
298___
299 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
300 @x[2], @x[3], @y[2], @y[3], @t[2]);
301$code.=<<___;
302 pxor @t[0], @x[0]
303 pxor @t[0], @x[2]
304 pxor @t[1], @x[1]
305 pxor @t[1], @x[3]
306
307 movdqa @x[4], @t[0]
308 movdqa @x[5], @t[1]
309 pxor @x[6], @t[0]
310 pxor @x[7], @t[1]
311___
312 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
313 @x[6], @x[7], @y[2], @y[3], @t[2]);
314$code.=<<___;
315 pxor @y[2], @y[0]
316 pxor @y[3], @y[1]
317___
318 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
319$code.=<<___;
320 pxor @t[0], @x[4]
321 pxor @t[0], @x[6]
322 pxor @t[1], @x[5]
323 pxor @t[1], @x[7]
324___
325}
326sub Inv_GF256 {
327#;********************************************************************
328#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
329#;********************************************************************
330my @x=@_[0..7];
331my @t=@_[8..11];
332my @s=@_[12..15];
333# direct optimizations from hardware
334$code.=<<___;
335 movdqa @x[4], @t[3]
336 movdqa @x[5], @t[2]
337 movdqa @x[1], @t[1]
338 movdqa @x[7], @s[1]
339 movdqa @x[0], @s[0]
340
341 pxor @x[6], @t[3]
342 pxor @x[7], @t[2]
343 pxor @x[3], @t[1]
344 movdqa @t[3], @s[2]
345 pxor @x[6], @s[1]
346 movdqa @t[2], @t[0]
347 pxor @x[2], @s[0]
348 movdqa @t[3], @s[3]
349
350 por @t[1], @t[2]
351 por @s[0], @t[3]
352 pxor @t[0], @s[3]
353 pand @s[0], @s[2]
354 pxor @t[1], @s[0]
355 pand @t[1], @t[0]
356 pand @s[0], @s[3]
357 movdqa @x[3], @s[0]
358 pxor @x[2], @s[0]
359 pand @s[0], @s[1]
360 pxor @s[1], @t[3]
361 pxor @s[1], @t[2]
362 movdqa @x[4], @s[1]
363 movdqa @x[1], @s[0]
364 pxor @x[5], @s[1]
365 pxor @x[0], @s[0]
366 movdqa @s[1], @t[1]
367 pand @s[0], @s[1]
368 por @s[0], @t[1]
369 pxor @s[1], @t[0]
370 pxor @s[3], @t[3]
371 pxor @s[2], @t[2]
372 pxor @s[3], @t[1]
373 movdqa @x[7], @s[0]
374 pxor @s[2], @t[0]
375 movdqa @x[6], @s[1]
376 pxor @s[2], @t[1]
377 movdqa @x[5], @s[2]
378 pand @x[3], @s[0]
379 movdqa @x[4], @s[3]
380 pand @x[2], @s[1]
381 pand @x[1], @s[2]
382 por @x[0], @s[3]
383 pxor @s[0], @t[3]
384 pxor @s[1], @t[2]
385 pxor @s[2], @t[1]
386 pxor @s[3], @t[0]
387
388 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
389
390 # new smaller inversion
391
392 movdqa @t[3], @s[0]
393 pand @t[1], @t[3]
394 pxor @t[2], @s[0]
395
396 movdqa @t[0], @s[2]
397 movdqa @s[0], @s[3]
398 pxor @t[3], @s[2]
399 pand @s[2], @s[3]
400
401 movdqa @t[1], @s[1]
402 pxor @t[2], @s[3]
403 pxor @t[0], @s[1]
404
405 pxor @t[2], @t[3]
406
407 pand @t[3], @s[1]
408
409 movdqa @s[2], @t[2]
410 pxor @t[0], @s[1]
411
412 pxor @s[1], @t[2]
413 pxor @s[1], @t[1]
414
415 pand @t[0], @t[2]
416
417 pxor @t[2], @s[2]
418 pxor @t[2], @t[1]
419
420 pand @s[3], @s[2]
421
422 pxor @s[0], @s[2]
423___
424# output in s3, s2, s1, t1
425
426# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
427
428# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
429 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
430
431### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
432}
433
434# AES linear components
435
436sub ShiftRows {
437my @x=@_[0..7];
438my $mask=pop;
439$code.=<<___;
440 pxor 0x00($key),@x[0]
441 pxor 0x10($key),@x[1]
442 pxor 0x20($key),@x[2]
443 pxor 0x30($key),@x[3]
444 pshufb $mask,@x[0]
445 pshufb $mask,@x[1]
446 pxor 0x40($key),@x[4]
447 pxor 0x50($key),@x[5]
448 pshufb $mask,@x[2]
449 pshufb $mask,@x[3]
450 pxor 0x60($key),@x[6]
451 pxor 0x70($key),@x[7]
452 pshufb $mask,@x[4]
453 pshufb $mask,@x[5]
454 pshufb $mask,@x[6]
455 pshufb $mask,@x[7]
456 lea 0x80($key),$key
457___
458}
459
460sub MixColumns {
461# modified to emit output in order suitable for feeding back to aesenc[last]
462my @x=@_[0..7];
463my @t=@_[8..15];
464my $inv=$_[16]; # optional
465$code.=<<___;
466 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
467 pshufd \$0x93, @x[1], @t[1]
468 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
469 pshufd \$0x93, @x[2], @t[2]
470 pxor @t[1], @x[1]
471 pshufd \$0x93, @x[3], @t[3]
472 pxor @t[2], @x[2]
473 pshufd \$0x93, @x[4], @t[4]
474 pxor @t[3], @x[3]
475 pshufd \$0x93, @x[5], @t[5]
476 pxor @t[4], @x[4]
477 pshufd \$0x93, @x[6], @t[6]
478 pxor @t[5], @x[5]
479 pshufd \$0x93, @x[7], @t[7]
480 pxor @t[6], @x[6]
481 pxor @t[7], @x[7]
482
483 pxor @x[0], @t[1]
484 pxor @x[7], @t[0]
485 pxor @x[7], @t[1]
486 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64
487 pxor @x[1], @t[2]
488 pshufd \$0x4E, @x[1], @x[1]
489 pxor @x[4], @t[5]
490 pxor @t[0], @x[0]
491 pxor @x[5], @t[6]
492 pxor @t[1], @x[1]
493 pxor @x[3], @t[4]
494 pshufd \$0x4E, @x[4], @t[0]
495 pxor @x[6], @t[7]
496 pshufd \$0x4E, @x[5], @t[1]
497 pxor @x[2], @t[3]
498 pshufd \$0x4E, @x[3], @x[4]
499 pxor @x[7], @t[3]
500 pshufd \$0x4E, @x[7], @x[5]
501 pxor @x[7], @t[4]
502 pshufd \$0x4E, @x[6], @x[3]
503 pxor @t[4], @t[0]
504 pshufd \$0x4E, @x[2], @x[6]
505 pxor @t[5], @t[1]
506___
507$code.=<<___ if (!$inv);
508 pxor @t[3], @x[4]
509 pxor @t[7], @x[5]
510 pxor @t[6], @x[3]
511 movdqa @t[0], @x[2]
512 pxor @t[2], @x[6]
513 movdqa @t[1], @x[7]
514___
515$code.=<<___ if ($inv);
516 pxor @x[4], @t[3]
517 pxor @t[7], @x[5]
518 pxor @x[3], @t[6]
519 movdqa @t[0], @x[3]
520 pxor @t[2], @x[6]
521 movdqa @t[6], @x[2]
522 movdqa @t[1], @x[7]
523 movdqa @x[6], @x[4]
524 movdqa @t[3], @x[6]
525___
526}
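
# For reference, a scalar MixColumns of one 4-byte column (our own
# sketch, not used by the module): r[i] = a[i] ^ t ^ xtime(a[i]^a[i+1])
# with t = a[0]^a[1]^a[2]^a[3], i.e. the usual 02/03/01/01 circulant.
# The bit-sliced code above realizes the same linear map with the
# <<<32 / <<<64 pshufd rotations applied to every bit plane at once.
sub _mixcolumn_ref {			# hypothetical scalar version
my @a = @_;				# 4 column bytes
my $xtime = sub { (($_[0] << 1) ^ (($_[0] & 0x80) ? 0x1b : 0)) & 0xff };
my $t = $a[0] ^ $a[1] ^ $a[2] ^ $a[3];
return map { $a[$_] ^ $t ^ $xtime->($a[$_] ^ $a[($_+1) % 4]) } 0..3;
}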
527
528sub InvMixColumns_orig {
529my @x=@_[0..7];
530my @t=@_[8..15];
531
532$code.=<<___;
533 # multiplication by 0x0e
534 pshufd \$0x93, @x[7], @t[7]
535 movdqa @x[2], @t[2]
536 pxor @x[5], @x[7] # 7 5
537 pxor @x[5], @x[2] # 2 5
538 pshufd \$0x93, @x[0], @t[0]
539 movdqa @x[5], @t[5]
540 pxor @x[0], @x[5] # 5 0 [1]
541 pxor @x[1], @x[0] # 0 1
542 pshufd \$0x93, @x[1], @t[1]
543 pxor @x[2], @x[1] # 1 25
544 pxor @x[6], @x[0] # 01 6 [2]
545 pxor @x[3], @x[1] # 125 3 [4]
546 pshufd \$0x93, @x[3], @t[3]
547 pxor @x[0], @x[2] # 25 016 [3]
548 pxor @x[7], @x[3] # 3 75
549 pxor @x[6], @x[7] # 75 6 [0]
550 pshufd \$0x93, @x[6], @t[6]
551 movdqa @x[4], @t[4]
552 pxor @x[4], @x[6] # 6 4
553 pxor @x[3], @x[4] # 4 375 [6]
554 pxor @x[7], @x[3] # 375 756=36
555 pxor @t[5], @x[6] # 64 5 [7]
556 pxor @t[2], @x[3] # 36 2
557 pxor @t[4], @x[3] # 362 4 [5]
558 pshufd \$0x93, @t[5], @t[5]
559___
560 my @y = @x[7,5,0,2,1,3,4,6];
561$code.=<<___;
562 # multiplication by 0x0b
563 pxor @y[0], @y[1]
564 pxor @t[0], @y[0]
565 pxor @t[1], @y[1]
566 pshufd \$0x93, @t[2], @t[2]
567 pxor @t[5], @y[0]
568 pxor @t[6], @y[1]
569 pxor @t[7], @y[0]
570 pshufd \$0x93, @t[4], @t[4]
571 pxor @t[6], @t[7] # clobber t[7]
572 pxor @y[0], @y[1]
573
574 pxor @t[0], @y[3]
575 pshufd \$0x93, @t[0], @t[0]
576 pxor @t[1], @y[2]
577 pxor @t[1], @y[4]
578 pxor @t[2], @y[2]
579 pshufd \$0x93, @t[1], @t[1]
580 pxor @t[2], @y[3]
581 pxor @t[2], @y[5]
582 pxor @t[7], @y[2]
583 pshufd \$0x93, @t[2], @t[2]
584 pxor @t[3], @y[3]
585 pxor @t[3], @y[6]
586 pxor @t[3], @y[4]
587 pshufd \$0x93, @t[3], @t[3]
588 pxor @t[4], @y[7]
589 pxor @t[4], @y[5]
590 pxor @t[7], @y[7]
591 pxor @t[5], @y[3]
592 pxor @t[4], @y[4]
593 pxor @t[5], @t[7] # clobber t[7] even more
594
595 pxor @t[7], @y[5]
596 pshufd \$0x93, @t[4], @t[4]
597 pxor @t[7], @y[6]
598 pxor @t[7], @y[4]
599
600 pxor @t[5], @t[7]
601 pshufd \$0x93, @t[5], @t[5]
602 pxor @t[6], @t[7] # restore t[7]
603
604 # multiplication by 0x0d
605 pxor @y[7], @y[4]
606 pxor @t[4], @y[7]
607 pshufd \$0x93, @t[6], @t[6]
608 pxor @t[0], @y[2]
609 pxor @t[5], @y[7]
610 pxor @t[2], @y[2]
611 pshufd \$0x93, @t[7], @t[7]
612
613 pxor @y[1], @y[3]
614 pxor @t[1], @y[1]
615 pxor @t[0], @y[0]
616 pxor @t[0], @y[3]
617 pxor @t[5], @y[1]
618 pxor @t[5], @y[0]
619 pxor @t[7], @y[1]
620 pshufd \$0x93, @t[0], @t[0]
621 pxor @t[6], @y[0]
622 pxor @y[1], @y[3]
623 pxor @t[1], @y[4]
624 pshufd \$0x93, @t[1], @t[1]
625
626 pxor @t[7], @y[7]
627 pxor @t[2], @y[4]
628 pxor @t[2], @y[5]
629 pshufd \$0x93, @t[2], @t[2]
630 pxor @t[6], @y[2]
631 pxor @t[3], @t[6] # clobber t[6]
632 pxor @y[7], @y[4]
633 pxor @t[6], @y[3]
634
635 pxor @t[6], @y[6]
636 pxor @t[5], @y[5]
637 pxor @t[4], @y[6]
638 pshufd \$0x93, @t[4], @t[4]
639 pxor @t[6], @y[5]
640 pxor @t[7], @y[6]
641 pxor @t[3], @t[6] # restore t[6]
642
643 pshufd \$0x93, @t[5], @t[5]
644 pshufd \$0x93, @t[6], @t[6]
645 pshufd \$0x93, @t[7], @t[7]
646 pshufd \$0x93, @t[3], @t[3]
647
648 # multiplication by 0x09
649 pxor @y[1], @y[4]
650 pxor @y[1], @t[1] # t[1]=y[1]
651 pxor @t[5], @t[0] # clobber t[0]
652 pxor @t[5], @t[1]
653 pxor @t[0], @y[3]
654 pxor @y[0], @t[0] # t[0]=y[0]
655 pxor @t[6], @t[1]
656 pxor @t[7], @t[6] # clobber t[6]
657 pxor @t[1], @y[4]
658 pxor @t[4], @y[7]
659 pxor @y[4], @t[4] # t[4]=y[4]
660 pxor @t[3], @y[6]
661 pxor @y[3], @t[3] # t[3]=y[3]
662 pxor @t[2], @y[5]
663 pxor @y[2], @t[2] # t[2]=y[2]
664 pxor @t[7], @t[3]
665 pxor @y[5], @t[5] # t[5]=y[5]
666 pxor @t[6], @t[2]
667 pxor @t[6], @t[5]
668 pxor @y[6], @t[6] # t[6]=y[6]
669 pxor @y[7], @t[7] # t[7]=y[7]
670
671 movdqa @t[0],@XMM[0]
672 movdqa @t[1],@XMM[1]
673 movdqa @t[2],@XMM[2]
674 movdqa @t[3],@XMM[3]
675 movdqa @t[4],@XMM[4]
676 movdqa @t[5],@XMM[5]
677 movdqa @t[6],@XMM[6]
678 movdqa @t[7],@XMM[7]
679___
680}
681
682sub InvMixColumns {
683my @x=@_[0..7];
684my @t=@_[8..15];
685
686# Thanks to Jussi Kivilinna for providing pointer to
687#
688# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
689# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
690# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
691# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
692
693$code.=<<___;
694 # multiplication by 0x05-0x00-0x04-0x00
695 pshufd \$0x4E, @x[0], @t[0]
696 pshufd \$0x4E, @x[6], @t[6]
697 pxor @x[0], @t[0]
698 pshufd \$0x4E, @x[7], @t[7]
699 pxor @x[6], @t[6]
700 pshufd \$0x4E, @x[1], @t[1]
701 pxor @x[7], @t[7]
702 pshufd \$0x4E, @x[2], @t[2]
703 pxor @x[1], @t[1]
704 pshufd \$0x4E, @x[3], @t[3]
705 pxor @x[2], @t[2]
706 pxor @t[6], @x[0]
707 pxor @t[6], @x[1]
708 pshufd \$0x4E, @x[4], @t[4]
709 pxor @x[3], @t[3]
710 pxor @t[0], @x[2]
711 pxor @t[1], @x[3]
712 pshufd \$0x4E, @x[5], @t[5]
713 pxor @x[4], @t[4]
714 pxor @t[7], @x[1]
715 pxor @t[2], @x[4]
716 pxor @x[5], @t[5]
717
718 pxor @t[7], @x[2]
719 pxor @t[6], @x[3]
720 pxor @t[6], @x[4]
721 pxor @t[3], @x[5]
722 pxor @t[4], @x[6]
723 pxor @t[7], @x[4]
724 pxor @t[7], @x[5]
725 pxor @t[5], @x[7]
726___
727 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
728}
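
# The factorization above can be checked mechanically (our own sketch):
# all three matrices are circulants, so comparing the first row of the
# product with (0e 0b 0d 09) suffices.
sub _circulant_mul_row_ref {		# hypothetical check helper
my ($A, $B) = @_;			# first rows of two 4x4 circulants
my $gf = sub {				# GF(2^8) multiply, polynomial 0x11b
	my ($a,$b,$p) = (@_,0);
	for (1..8) { $p ^= $a if $b & 1; $a = ($a<<1) ^ (($a & 0x80) ? 0x11b : 0); $b >>= 1; }
	$p & 0xff;
};
return map { my $j = $_; my $c = 0;	# row k of a circulant is its
	$c ^= $gf->($A->[$_], $B->[($j - $_) % 4]) for 0..3;	# first row
	$c } 0..3;			# rotated right by k
}
# _circulant_mul_row_ref([0x02,0x03,0x01,0x01], [0x05,0x00,0x04,0x00])
# returns (0x0e, 0x0b, 0x0d, 0x09), as claimed.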
729
730sub aesenc { # not used
731my @b=@_[0..7];
732my @t=@_[8..15];
733$code.=<<___;
734 movdqa 0x30($const),@t[0] # .LSR
735___
736 &ShiftRows (@b,@t[0]);
737 &Sbox (@b,@t);
738 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
739}
740
741sub aesenclast { # not used
742my @b=@_[0..7];
743my @t=@_[8..15];
744$code.=<<___;
745 movdqa 0x40($const),@t[0] # .LSRM0
746___
747 &ShiftRows (@b,@t[0]);
748 &Sbox (@b,@t);
749$code.=<<___;
750 pxor 0x00($key),@b[0]
751 pxor 0x10($key),@b[1]
752 pxor 0x20($key),@b[4]
753 pxor 0x30($key),@b[6]
754 pxor 0x40($key),@b[3]
755 pxor 0x50($key),@b[7]
756 pxor 0x60($key),@b[2]
757 pxor 0x70($key),@b[5]
758___
759}
760
761sub swapmove {
762my ($a,$b,$n,$mask,$t)=@_;
763$code.=<<___;
764 movdqa $b,$t
765 psrlq \$$n,$b
766 pxor $a,$b
767 pand $mask,$b
768 pxor $b,$a
769 psllq \$$n,$b
770 pxor $t,$b
771___
772}
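
# swapmove is the classic bit-permutation building block: it exchanges
# the bits selected by $mask in $a with the corresponding bits of
# ($b >> $n). A scalar model of the seven-instruction sequence above
# (our own sketch):
sub _swapmove_ref {			# hypothetical 64-bit version
my ($a, $b, $n, $mask) = @_;
my $t = (($b >> $n) ^ $a) & $mask;
return ($a ^ $t, $b ^ ($t << $n));
}
# swapmove2x below is simply two such exchanges interleaved to hide
# instruction latency.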
773sub swapmove2x {
774my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
775$code.=<<___;
776 movdqa $b0,$t0
777 psrlq \$$n,$b0
778 movdqa $b1,$t1
779 psrlq \$$n,$b1
780 pxor $a0,$b0
781 pxor $a1,$b1
782 pand $mask,$b0
783 pand $mask,$b1
784 pxor $b0,$a0
785 psllq \$$n,$b0
786 pxor $b1,$a1
787 psllq \$$n,$b1
788 pxor $t0,$b0
789 pxor $t1,$b1
790___
791}
792
793sub bitslice {
794my @x=reverse(@_[0..7]);
795my ($t0,$t1,$t2,$t3)=@_[8..11];
796$code.=<<___;
797 movdqa 0x00($const),$t0 # .LBS0
798 movdqa 0x10($const),$t1 # .LBS1
799___
800 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
801 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
802$code.=<<___;
803 movdqa 0x20($const),$t0 # .LBS2
804___
805 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
806 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
807
808 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
809 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
810}
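
# The three swapmove layers with (shift, mask) pairs (1, 0x55..),
# (2, 0x33..) and (4, 0x0f..) amount to an 8x8 bit-matrix transpose,
# which is what turns eight parallel AES states into eight bit planes
# (up to the bit/byte numbering fixed by reverse() above and the .LM0
# shuffles). A plain scalar transpose in the same style (our own
# sketch):
sub _transpose8_ref {			# hypothetical reference on 8 bytes
my @x = @_;
my $sm = sub {				# swapmove: hi bits of $i <-> lo of $j
	my ($i,$j,$n,$m) = @_;
	my $t = (($x[$i] >> $n) ^ $x[$j]) & $m;
	$x[$j] ^= $t;
	$x[$i] = ($x[$i] ^ ($t << $n)) & 0xff;
};
$sm->(2*$_, 2*$_+1, 1, 0x55) for 0..3;			# 1-bit blocks
$sm->($_ + ($_ & 2), $_ + ($_ & 2) + 2, 2, 0x33) for 0..3; # 2-bit blocks
$sm->($_, $_ + 4, 4, 0x0f) for 0..3;			# 4-bit blocks
return @x;				# out[$i] bit $j == in[$j] bit $i
}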
811
812$code.=<<___;
813.text
814
815.extern asm_AES_encrypt
816.extern asm_AES_decrypt
817
818.type _bsaes_encrypt8,\@abi-omnipotent
819.align 64
820_bsaes_encrypt8:
821.cfi_startproc
822 lea .LBS0(%rip), $const # constants table
823
824 movdqa ($key), @XMM[9] # round 0 key
825 lea 0x10($key), $key
826 movdqa 0x50($const), @XMM[8] # .LM0SR
827 pxor @XMM[9], @XMM[0] # xor with round0 key
828 pxor @XMM[9], @XMM[1]
829 pxor @XMM[9], @XMM[2]
830 pxor @XMM[9], @XMM[3]
831 pshufb @XMM[8], @XMM[0]
832 pshufb @XMM[8], @XMM[1]
833 pxor @XMM[9], @XMM[4]
834 pxor @XMM[9], @XMM[5]
835 pshufb @XMM[8], @XMM[2]
836 pshufb @XMM[8], @XMM[3]
837 pxor @XMM[9], @XMM[6]
838 pxor @XMM[9], @XMM[7]
839 pshufb @XMM[8], @XMM[4]
840 pshufb @XMM[8], @XMM[5]
841 pshufb @XMM[8], @XMM[6]
842 pshufb @XMM[8], @XMM[7]
843_bsaes_encrypt8_bitslice:
844___
845 &bitslice (@XMM[0..7, 8..11]);
846$code.=<<___;
847 dec $rounds
848 jmp .Lenc_sbox
849.align 16
850.Lenc_loop:
851___
852 &ShiftRows (@XMM[0..7, 8]);
853$code.=".Lenc_sbox:\n";
854 &Sbox (@XMM[0..7, 8..15]);
855$code.=<<___;
856 dec $rounds
857 jl .Lenc_done
858___
859 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
860$code.=<<___;
861 movdqa 0x30($const), @XMM[8] # .LSR
862 jnz .Lenc_loop
863 movdqa 0x40($const), @XMM[8] # .LSRM0
864 jmp .Lenc_loop
865.align 16
866.Lenc_done:
867___
868 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
869 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
870$code.=<<___;
871 movdqa ($key), @XMM[8] # last round key
872 pxor @XMM[8], @XMM[4]
873 pxor @XMM[8], @XMM[6]
874 pxor @XMM[8], @XMM[3]
875 pxor @XMM[8], @XMM[7]
876 pxor @XMM[8], @XMM[2]
877 pxor @XMM[8], @XMM[5]
878 pxor @XMM[8], @XMM[0]
879 pxor @XMM[8], @XMM[1]
880 ret
881.cfi_endproc
882.size _bsaes_encrypt8,.-_bsaes_encrypt8
883
884.type _bsaes_decrypt8,\@abi-omnipotent
885.align 64
886_bsaes_decrypt8:
887.cfi_startproc
888 lea .LBS0(%rip), $const # constants table
889
890 movdqa ($key), @XMM[9] # round 0 key
891 lea 0x10($key), $key
892 movdqa -0x30($const), @XMM[8] # .LM0ISR
893 pxor @XMM[9], @XMM[0] # xor with round0 key
894 pxor @XMM[9], @XMM[1]
895 pxor @XMM[9], @XMM[2]
896 pxor @XMM[9], @XMM[3]
897 pshufb @XMM[8], @XMM[0]
898 pshufb @XMM[8], @XMM[1]
899 pxor @XMM[9], @XMM[4]
900 pxor @XMM[9], @XMM[5]
901 pshufb @XMM[8], @XMM[2]
902 pshufb @XMM[8], @XMM[3]
903 pxor @XMM[9], @XMM[6]
904 pxor @XMM[9], @XMM[7]
905 pshufb @XMM[8], @XMM[4]
906 pshufb @XMM[8], @XMM[5]
907 pshufb @XMM[8], @XMM[6]
908 pshufb @XMM[8], @XMM[7]
909___
910 &bitslice (@XMM[0..7, 8..11]);
911$code.=<<___;
912 dec $rounds
913 jmp .Ldec_sbox
914.align 16
915.Ldec_loop:
916___
917 &ShiftRows (@XMM[0..7, 8]);
918$code.=".Ldec_sbox:\n";
919 &InvSbox (@XMM[0..7, 8..15]);
920$code.=<<___;
921 dec $rounds
922 jl .Ldec_done
923___
924 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
925$code.=<<___;
926 movdqa -0x10($const), @XMM[8] # .LISR
927 jnz .Ldec_loop
928 movdqa -0x20($const), @XMM[8] # .LISRM0
929 jmp .Ldec_loop
930.align 16
931.Ldec_done:
932___
933 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
934$code.=<<___;
935 movdqa ($key), @XMM[8] # last round key
936 pxor @XMM[8], @XMM[6]
937 pxor @XMM[8], @XMM[4]
938 pxor @XMM[8], @XMM[2]
939 pxor @XMM[8], @XMM[7]
940 pxor @XMM[8], @XMM[3]
941 pxor @XMM[8], @XMM[5]
942 pxor @XMM[8], @XMM[0]
943 pxor @XMM[8], @XMM[1]
944 ret
945.cfi_endproc
946.size _bsaes_decrypt8,.-_bsaes_decrypt8
947___
948}
949{
950my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
951
952sub bitslice_key {
953my @x=reverse(@_[0..7]);
954my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
955
956 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
957$code.=<<___;
958 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
959 movdqa @x[0], @x[2]
960 movdqa @x[1], @x[3]
961___
962 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
963
964 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
965$code.=<<___;
966 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
967 movdqa @x[0], @x[4]
968 movdqa @x[2], @x[6]
969 movdqa @x[1], @x[5]
970 movdqa @x[3], @x[7]
971___
972 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
973 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
974}
975
976$code.=<<___;
977.type _bsaes_key_convert,\@abi-omnipotent
978.align 16
979_bsaes_key_convert:
980.cfi_startproc
981 lea .Lmasks(%rip), $const
982 movdqu ($inp), %xmm7 # load round 0 key
983 lea 0x10($inp), $inp
984 movdqa 0x00($const), %xmm0 # 0x01...
985 movdqa 0x10($const), %xmm1 # 0x02...
986 movdqa 0x20($const), %xmm2 # 0x04...
987 movdqa 0x30($const), %xmm3 # 0x08...
988 movdqa 0x40($const), %xmm4 # .LM0
989 pcmpeqd %xmm5, %xmm5 # .LNOT
990
991 movdqu ($inp), %xmm6 # load round 1 key
992 movdqa %xmm7, ($out) # save round 0 key
993 lea 0x10($out), $out
994 dec $rounds
995 jmp .Lkey_loop
996.align 16
997.Lkey_loop:
998 pshufb %xmm4, %xmm6 # .LM0
999
1000 movdqa %xmm0, %xmm8
1001 movdqa %xmm1, %xmm9
1002
1003 pand %xmm6, %xmm8
1004 pand %xmm6, %xmm9
1005 movdqa %xmm2, %xmm10
1006 pcmpeqb %xmm0, %xmm8
1007 psllq \$4, %xmm0 # 0x10...
1008 movdqa %xmm3, %xmm11
1009 pcmpeqb %xmm1, %xmm9
1010 psllq \$4, %xmm1 # 0x20...
1011
1012 pand %xmm6, %xmm10
1013 pand %xmm6, %xmm11
1014 movdqa %xmm0, %xmm12
1015 pcmpeqb %xmm2, %xmm10
1016 psllq \$4, %xmm2 # 0x40...
1017 movdqa %xmm1, %xmm13
1018 pcmpeqb %xmm3, %xmm11
1019 psllq \$4, %xmm3 # 0x80...
1020
1021 movdqa %xmm2, %xmm14
1022 movdqa %xmm3, %xmm15
1023 pxor %xmm5, %xmm8 # "pnot"
1024 pxor %xmm5, %xmm9
1025
1026 pand %xmm6, %xmm12
1027 pand %xmm6, %xmm13
1028 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1029 pcmpeqb %xmm0, %xmm12
1030 psrlq \$4, %xmm0 # 0x01...
1031 movdqa %xmm9, 0x10($out)
1032 pcmpeqb %xmm1, %xmm13
1033 psrlq \$4, %xmm1 # 0x02...
1034 lea 0x10($inp), $inp
1035
1036 pand %xmm6, %xmm14
1037 pand %xmm6, %xmm15
1038 movdqa %xmm10, 0x20($out)
1039 pcmpeqb %xmm2, %xmm14
1040 psrlq \$4, %xmm2 # 0x04...
1041 movdqa %xmm11, 0x30($out)
1042 pcmpeqb %xmm3, %xmm15
1043 psrlq \$4, %xmm3 # 0x08...
1044 movdqu ($inp), %xmm6 # load next round key
1045
1046 pxor %xmm5, %xmm13 # "pnot"
1047 pxor %xmm5, %xmm14
1048 movdqa %xmm12, 0x40($out)
1049 movdqa %xmm13, 0x50($out)
1050 movdqa %xmm14, 0x60($out)
1051 movdqa %xmm15, 0x70($out)
1052 lea 0x80($out),$out
1053 dec $rounds
1054 jnz .Lkey_loop
1055
1056 movdqa 0x50($const), %xmm7 # .L63
1057 #movdqa %xmm6, ($out) # don't save last round key
1058 ret
1059.cfi_endproc
1060.size _bsaes_key_convert,.-_bsaes_key_convert
1061___
1062}
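
# What _bsaes_key_convert produces, as a scalar sketch of our own: each
# 16-byte round key expands to eight 16-byte bit-plane masks - hence
# the "128 bytes per inner round key" arithmetic in the callers. The
# pcmpeqb against the walking 0x01/0x02/... masks broadcasts every key
# bit to a full byte, and the "pnot" on planes 0, 1, 5 and 6 appears to
# fold the S-box affine constant 0x63 (bits 0,1,5,6) into the key
# material. The .LM0 byte shuffle is ignored here.
sub _expand_round_key_ref {		# hypothetical reference
my @k = @_;				# 16 round-key bytes
my @planes;
for my $bit (0..7) {
	my @m = map { (($_ >> $bit) & 1) ? 0xff : 0x00 } @k;
	@m = map { $_ ^ 0xff } @m if (0x63 >> $bit) & 1;	# fold 0x63
	push @planes, \@m;
}
return @planes;				# 8 x 16 == 128 bytes
}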
1063
1064if (0 && !$win64) { # the following four functions are an unsupported
1065 # interface, used for benchmarking...
1066$code.=<<___;
1067.globl bsaes_enc_key_convert
1068.type bsaes_enc_key_convert,\@function,2
1069.align 16
1070bsaes_enc_key_convert:
1071 mov 240($inp),%r10d # pass rounds
1072 mov $inp,%rcx # pass key
1073 mov $out,%rax # pass key schedule
1074 call _bsaes_key_convert
1075 pxor %xmm6,%xmm7 # fix up last round key
1076 movdqa %xmm7,(%rax) # save last round key
1077 ret
1078.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1079
1080.globl bsaes_encrypt_128
1081.type bsaes_encrypt_128,\@function,4
1082.align 16
1083bsaes_encrypt_128:
1084.Lenc128_loop:
1085 movdqu 0x00($inp), @XMM[0] # load input
1086 movdqu 0x10($inp), @XMM[1]
1087 movdqu 0x20($inp), @XMM[2]
1088 movdqu 0x30($inp), @XMM[3]
1089 movdqu 0x40($inp), @XMM[4]
1090 movdqu 0x50($inp), @XMM[5]
1091 movdqu 0x60($inp), @XMM[6]
1092 movdqu 0x70($inp), @XMM[7]
1093 mov $key, %rax # pass the $key
1094 lea 0x80($inp), $inp
1095 mov \$10,%r10d
1096
1097 call _bsaes_encrypt8
1098
1099 movdqu @XMM[0], 0x00($out) # write output
1100 movdqu @XMM[1], 0x10($out)
1101 movdqu @XMM[4], 0x20($out)
1102 movdqu @XMM[6], 0x30($out)
1103 movdqu @XMM[3], 0x40($out)
1104 movdqu @XMM[7], 0x50($out)
1105 movdqu @XMM[2], 0x60($out)
1106 movdqu @XMM[5], 0x70($out)
1107 lea 0x80($out), $out
1108 sub \$0x80,$len
1109 ja .Lenc128_loop
1110 ret
1111.size bsaes_encrypt_128,.-bsaes_encrypt_128
1112
1113.globl bsaes_dec_key_convert
1114.type bsaes_dec_key_convert,\@function,2
1115.align 16
1116bsaes_dec_key_convert:
1117 mov 240($inp),%r10d # pass rounds
1118 mov $inp,%rcx # pass key
1119 mov $out,%rax # pass key schedule
1120 call _bsaes_key_convert
1121 pxor ($out),%xmm7 # fix up round 0 key
1122 movdqa %xmm6,(%rax) # save last round key
1123 movdqa %xmm7,($out)
1124 ret
1125.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1126
1127.globl bsaes_decrypt_128
1128.type bsaes_decrypt_128,\@function,4
1129.align 16
1130bsaes_decrypt_128:
1131.Ldec128_loop:
1132 movdqu 0x00($inp), @XMM[0] # load input
1133 movdqu 0x10($inp), @XMM[1]
1134 movdqu 0x20($inp), @XMM[2]
1135 movdqu 0x30($inp), @XMM[3]
1136 movdqu 0x40($inp), @XMM[4]
1137 movdqu 0x50($inp), @XMM[5]
1138 movdqu 0x60($inp), @XMM[6]
1139 movdqu 0x70($inp), @XMM[7]
1140 mov $key, %rax # pass the $key
1141 lea 0x80($inp), $inp
1142 mov \$10,%r10d
1143
1144 call _bsaes_decrypt8
1145
1146 movdqu @XMM[0], 0x00($out) # write output
1147 movdqu @XMM[1], 0x10($out)
1148 movdqu @XMM[6], 0x20($out)
1149 movdqu @XMM[4], 0x30($out)
1150 movdqu @XMM[2], 0x40($out)
1151 movdqu @XMM[7], 0x50($out)
1152 movdqu @XMM[3], 0x60($out)
1153 movdqu @XMM[5], 0x70($out)
1154 lea 0x80($out), $out
1155 sub \$0x80,$len
1156 ja .Ldec128_loop
1157 ret
1158.size bsaes_decrypt_128,.-bsaes_decrypt_128
1159___
1160}
1161{
1162######################################################################
1163#
1164# OpenSSL interface
1165#
1166my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1167 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1168my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1169
1170if ($ecb) {
1171$code.=<<___;
1172.globl bsaes_ecb_encrypt_blocks
1173.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1174.align 16
1175bsaes_ecb_encrypt_blocks:
1176.cfi_startproc
1177 mov %rsp, %rax
1178.Lecb_enc_prologue:
1179 push %rbp
1180.cfi_push %rbp
1181 push %rbx
1182.cfi_push %rbx
1183 push %r12
1184.cfi_push %r12
1185 push %r13
1186.cfi_push %r13
1187 push %r14
1188.cfi_push %r14
1189 push %r15
1190.cfi_push %r15
1191 lea -0x48(%rsp),%rsp
1192.cfi_adjust_cfa_offset 0x48
1193___
1194$code.=<<___ if ($win64);
1195 lea -0xa0(%rsp), %rsp
1196 movaps %xmm6, 0x40(%rsp)
1197 movaps %xmm7, 0x50(%rsp)
1198 movaps %xmm8, 0x60(%rsp)
1199 movaps %xmm9, 0x70(%rsp)
1200 movaps %xmm10, 0x80(%rsp)
1201 movaps %xmm11, 0x90(%rsp)
1202 movaps %xmm12, 0xa0(%rsp)
1203 movaps %xmm13, 0xb0(%rsp)
1204 movaps %xmm14, 0xc0(%rsp)
1205 movaps %xmm15, 0xd0(%rsp)
1206.Lecb_enc_body:
1207___
1208$code.=<<___;
1209 mov %rsp,%rbp # backup %rsp
1210.cfi_def_cfa_register %rbp
1211 mov 240($arg4),%eax # rounds
1212 mov $arg1,$inp # backup arguments
1213 mov $arg2,$out
1214 mov $arg3,$len
1215 mov $arg4,$key
1216 cmp \$8,$arg3
1217 jb .Lecb_enc_short
1218
1219 mov %eax,%ebx # backup rounds
1220 shl \$7,%rax # 128 bytes per inner round key
1221 sub \$`128-32`,%rax # size of bit-sliced key schedule
1222 sub %rax,%rsp
1223 mov %rsp,%rax # pass key schedule
1224 mov $key,%rcx # pass key
1225 mov %ebx,%r10d # pass rounds
1226 call _bsaes_key_convert
1227 pxor %xmm6,%xmm7 # fix up last round key
1228 movdqa %xmm7,(%rax) # save last round key
1229
1230 sub \$8,$len
1231.Lecb_enc_loop:
1232 movdqu 0x00($inp), @XMM[0] # load input
1233 movdqu 0x10($inp), @XMM[1]
1234 movdqu 0x20($inp), @XMM[2]
1235 movdqu 0x30($inp), @XMM[3]
1236 movdqu 0x40($inp), @XMM[4]
1237 movdqu 0x50($inp), @XMM[5]
1238 mov %rsp, %rax # pass key schedule
1239 movdqu 0x60($inp), @XMM[6]
1240 mov %ebx,%r10d # pass rounds
1241 movdqu 0x70($inp), @XMM[7]
1242 lea 0x80($inp), $inp
1243
1244 call _bsaes_encrypt8
1245
1246 movdqu @XMM[0], 0x00($out) # write output
1247 movdqu @XMM[1], 0x10($out)
1248 movdqu @XMM[4], 0x20($out)
1249 movdqu @XMM[6], 0x30($out)
1250 movdqu @XMM[3], 0x40($out)
1251 movdqu @XMM[7], 0x50($out)
1252 movdqu @XMM[2], 0x60($out)
1253 movdqu @XMM[5], 0x70($out)
1254 lea 0x80($out), $out
1255 sub \$8,$len
1256 jnc .Lecb_enc_loop
1257
1258 add \$8,$len
1259 jz .Lecb_enc_done
1260
1261 movdqu 0x00($inp), @XMM[0] # load input
1262 mov %rsp, %rax # pass key schedule
1263 mov %ebx,%r10d # pass rounds
1264 cmp \$2,$len
1265 jb .Lecb_enc_one
1266 movdqu 0x10($inp), @XMM[1]
1267 je .Lecb_enc_two
1268 movdqu 0x20($inp), @XMM[2]
1269 cmp \$4,$len
1270 jb .Lecb_enc_three
1271 movdqu 0x30($inp), @XMM[3]
1272 je .Lecb_enc_four
1273 movdqu 0x40($inp), @XMM[4]
1274 cmp \$6,$len
1275 jb .Lecb_enc_five
1276 movdqu 0x50($inp), @XMM[5]
1277 je .Lecb_enc_six
1278 movdqu 0x60($inp), @XMM[6]
1279 call _bsaes_encrypt8
1280 movdqu @XMM[0], 0x00($out) # write output
1281 movdqu @XMM[1], 0x10($out)
1282 movdqu @XMM[4], 0x20($out)
1283 movdqu @XMM[6], 0x30($out)
1284 movdqu @XMM[3], 0x40($out)
1285 movdqu @XMM[7], 0x50($out)
1286 movdqu @XMM[2], 0x60($out)
1287 jmp .Lecb_enc_done
1288.align 16
1289.Lecb_enc_six:
1290 call _bsaes_encrypt8
1291 movdqu @XMM[0], 0x00($out) # write output
1292 movdqu @XMM[1], 0x10($out)
1293 movdqu @XMM[4], 0x20($out)
1294 movdqu @XMM[6], 0x30($out)
1295 movdqu @XMM[3], 0x40($out)
1296 movdqu @XMM[7], 0x50($out)
1297 jmp .Lecb_enc_done
1298.align 16
1299.Lecb_enc_five:
1300 call _bsaes_encrypt8
1301 movdqu @XMM[0], 0x00($out) # write output
1302 movdqu @XMM[1], 0x10($out)
1303 movdqu @XMM[4], 0x20($out)
1304 movdqu @XMM[6], 0x30($out)
1305 movdqu @XMM[3], 0x40($out)
1306 jmp .Lecb_enc_done
1307.align 16
1308.Lecb_enc_four:
1309 call _bsaes_encrypt8
1310 movdqu @XMM[0], 0x00($out) # write output
1311 movdqu @XMM[1], 0x10($out)
1312 movdqu @XMM[4], 0x20($out)
1313 movdqu @XMM[6], 0x30($out)
1314 jmp .Lecb_enc_done
1315.align 16
1316.Lecb_enc_three:
1317 call _bsaes_encrypt8
1318 movdqu @XMM[0], 0x00($out) # write output
1319 movdqu @XMM[1], 0x10($out)
1320 movdqu @XMM[4], 0x20($out)
1321 jmp .Lecb_enc_done
1322.align 16
1323.Lecb_enc_two:
1324 call _bsaes_encrypt8
1325 movdqu @XMM[0], 0x00($out) # write output
1326 movdqu @XMM[1], 0x10($out)
1327 jmp .Lecb_enc_done
1328.align 16
1329.Lecb_enc_one:
1330 call _bsaes_encrypt8
1331 movdqu @XMM[0], 0x00($out) # write output
1332 jmp .Lecb_enc_done
1333.align 16
1334.Lecb_enc_short:
1335 lea ($inp), $arg1
1336 lea ($out), $arg2
1337 lea ($key), $arg3
1338 call asm_AES_encrypt
1339 lea 16($inp), $inp
1340 lea 16($out), $out
1341 dec $len
1342 jnz .Lecb_enc_short
1343
1344.Lecb_enc_done:
1345 lea (%rsp),%rax
1346 pxor %xmm0, %xmm0
1347.Lecb_enc_bzero: # wipe key schedule [if any]
1348 movdqa %xmm0, 0x00(%rax)
1349 movdqa %xmm0, 0x10(%rax)
1350 lea 0x20(%rax), %rax
1351 cmp %rax, %rbp
1352 jb .Lecb_enc_bzero
1353
1354 lea 0x78(%rbp),%rax
1355.cfi_def_cfa %rax,8
1356___
1357$code.=<<___ if ($win64);
1358 movaps 0x40(%rbp), %xmm6
1359 movaps 0x50(%rbp), %xmm7
1360 movaps 0x60(%rbp), %xmm8
1361 movaps 0x70(%rbp), %xmm9
1362 movaps 0x80(%rbp), %xmm10
1363 movaps 0x90(%rbp), %xmm11
1364 movaps 0xa0(%rbp), %xmm12
1365 movaps 0xb0(%rbp), %xmm13
1366 movaps 0xc0(%rbp), %xmm14
1367 movaps 0xd0(%rbp), %xmm15
1368 lea 0xa0(%rax), %rax
1369.Lecb_enc_tail:
1370___
1371$code.=<<___;
1372 mov -48(%rax), %r15
1373.cfi_restore %r15
1374 mov -40(%rax), %r14
1375.cfi_restore %r14
1376 mov -32(%rax), %r13
1377.cfi_restore %r13
1378 mov -24(%rax), %r12
1379.cfi_restore %r12
1380 mov -16(%rax), %rbx
1381.cfi_restore %rbx
1382 mov -8(%rax), %rbp
1383.cfi_restore %rbp
1384 lea (%rax), %rsp # restore %rsp
1385.cfi_def_cfa_register %rsp
1386.Lecb_enc_epilogue:
1387 ret
1388.cfi_endproc
1389.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1390
1391.globl bsaes_ecb_decrypt_blocks
1392.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1393.align 16
1394bsaes_ecb_decrypt_blocks:
1395.cfi_startproc
1396 mov %rsp, %rax
1397.Lecb_dec_prologue:
1398 push %rbp
1399.cfi_push %rbp
1400 push %rbx
1401.cfi_push %rbx
1402 push %r12
1403.cfi_push %r12
1404 push %r13
1405.cfi_push %r13
1406 push %r14
1407.cfi_push %r14
1408 push %r15
1409.cfi_push %r15
1410 lea -0x48(%rsp),%rsp
1411.cfi_adjust_cfa_offset 0x48
1412___
1413$code.=<<___ if ($win64);
1414 lea -0xa0(%rsp), %rsp
1415 movaps %xmm6, 0x40(%rsp)
1416 movaps %xmm7, 0x50(%rsp)
1417 movaps %xmm8, 0x60(%rsp)
1418 movaps %xmm9, 0x70(%rsp)
1419 movaps %xmm10, 0x80(%rsp)
1420 movaps %xmm11, 0x90(%rsp)
1421 movaps %xmm12, 0xa0(%rsp)
1422 movaps %xmm13, 0xb0(%rsp)
1423 movaps %xmm14, 0xc0(%rsp)
1424 movaps %xmm15, 0xd0(%rsp)
1425.Lecb_dec_body:
1426___
1427$code.=<<___;
1428 mov %rsp,%rbp # backup %rsp
1429.cfi_def_cfa_register %rbp
1430 mov 240($arg4),%eax # rounds
1431 mov $arg1,$inp # backup arguments
1432 mov $arg2,$out
1433 mov $arg3,$len
1434 mov $arg4,$key
1435 cmp \$8,$arg3
1436 jb .Lecb_dec_short
1437
1438 mov %eax,%ebx # backup rounds
1439 shl \$7,%rax # 128 bytes per inner round key
1440 sub \$`128-32`,%rax # size of bit-sliced key schedule
1441 sub %rax,%rsp
1442 mov %rsp,%rax # pass key schedule
1443 mov $key,%rcx # pass key
1444 mov %ebx,%r10d # pass rounds
1445 call _bsaes_key_convert
1446 pxor (%rsp),%xmm7 # fix up round 0 key
1447 movdqa %xmm6,(%rax) # save last round key
1448 movdqa %xmm7,(%rsp)
1449
1450 sub \$8,$len
1451.Lecb_dec_loop:
1452 movdqu 0x00($inp), @XMM[0] # load input
1453 movdqu 0x10($inp), @XMM[1]
1454 movdqu 0x20($inp), @XMM[2]
1455 movdqu 0x30($inp), @XMM[3]
1456 movdqu 0x40($inp), @XMM[4]
1457 movdqu 0x50($inp), @XMM[5]
1458 mov %rsp, %rax # pass key schedule
1459 movdqu 0x60($inp), @XMM[6]
1460 mov %ebx,%r10d # pass rounds
1461 movdqu 0x70($inp), @XMM[7]
1462 lea 0x80($inp), $inp
1463
1464 call _bsaes_decrypt8
1465
1466 movdqu @XMM[0], 0x00($out) # write output
1467 movdqu @XMM[1], 0x10($out)
1468 movdqu @XMM[6], 0x20($out)
1469 movdqu @XMM[4], 0x30($out)
1470 movdqu @XMM[2], 0x40($out)
1471 movdqu @XMM[7], 0x50($out)
1472 movdqu @XMM[3], 0x60($out)
1473 movdqu @XMM[5], 0x70($out)
1474 lea 0x80($out), $out
1475 sub \$8,$len
1476 jnc .Lecb_dec_loop
1477
1478 add \$8,$len
1479 jz .Lecb_dec_done
1480
1481 movdqu 0x00($inp), @XMM[0] # load input
1482 mov %rsp, %rax # pass key schedule
1483 mov %ebx,%r10d # pass rounds
1484 cmp \$2,$len
1485 jb .Lecb_dec_one
1486 movdqu 0x10($inp), @XMM[1]
1487 je .Lecb_dec_two
1488 movdqu 0x20($inp), @XMM[2]
1489 cmp \$4,$len
1490 jb .Lecb_dec_three
1491 movdqu 0x30($inp), @XMM[3]
1492 je .Lecb_dec_four
1493 movdqu 0x40($inp), @XMM[4]
1494 cmp \$6,$len
1495 jb .Lecb_dec_five
1496 movdqu 0x50($inp), @XMM[5]
1497 je .Lecb_dec_six
1498 movdqu 0x60($inp), @XMM[6]
1499 call _bsaes_decrypt8
1500 movdqu @XMM[0], 0x00($out) # write output
1501 movdqu @XMM[1], 0x10($out)
1502 movdqu @XMM[6], 0x20($out)
1503 movdqu @XMM[4], 0x30($out)
1504 movdqu @XMM[2], 0x40($out)
1505 movdqu @XMM[7], 0x50($out)
1506 movdqu @XMM[3], 0x60($out)
1507 jmp .Lecb_dec_done
1508.align 16
1509.Lecb_dec_six:
1510 call _bsaes_decrypt8
1511 movdqu @XMM[0], 0x00($out) # write output
1512 movdqu @XMM[1], 0x10($out)
1513 movdqu @XMM[6], 0x20($out)
1514 movdqu @XMM[4], 0x30($out)
1515 movdqu @XMM[2], 0x40($out)
1516 movdqu @XMM[7], 0x50($out)
1517 jmp .Lecb_dec_done
1518.align 16
1519.Lecb_dec_five:
1520 call _bsaes_decrypt8
1521 movdqu @XMM[0], 0x00($out) # write output
1522 movdqu @XMM[1], 0x10($out)
1523 movdqu @XMM[6], 0x20($out)
1524 movdqu @XMM[4], 0x30($out)
1525 movdqu @XMM[2], 0x40($out)
1526 jmp .Lecb_dec_done
1527.align 16
1528.Lecb_dec_four:
1529 call _bsaes_decrypt8
1530 movdqu @XMM[0], 0x00($out) # write output
1531 movdqu @XMM[1], 0x10($out)
1532 movdqu @XMM[6], 0x20($out)
1533 movdqu @XMM[4], 0x30($out)
1534 jmp .Lecb_dec_done
1535.align 16
1536.Lecb_dec_three:
1537 call _bsaes_decrypt8
1538 movdqu @XMM[0], 0x00($out) # write output
1539 movdqu @XMM[1], 0x10($out)
1540 movdqu @XMM[6], 0x20($out)
1541 jmp .Lecb_dec_done
1542.align 16
1543.Lecb_dec_two:
1544 call _bsaes_decrypt8
1545 movdqu @XMM[0], 0x00($out) # write output
1546 movdqu @XMM[1], 0x10($out)
1547 jmp .Lecb_dec_done
1548.align 16
1549.Lecb_dec_one:
1550 call _bsaes_decrypt8
1551 movdqu @XMM[0], 0x00($out) # write output
1552 jmp .Lecb_dec_done
1553.align 16
1554.Lecb_dec_short:
1555 lea ($inp), $arg1
1556 lea ($out), $arg2
1557 lea ($key), $arg3
1558 call asm_AES_decrypt
1559 lea 16($inp), $inp
1560 lea 16($out), $out
1561 dec $len
1562 jnz .Lecb_dec_short
1563
1564.Lecb_dec_done:
1565 lea (%rsp),%rax
1566 pxor %xmm0, %xmm0
1567.Lecb_dec_bzero: # wipe key schedule [if any]
1568 movdqa %xmm0, 0x00(%rax)
1569 movdqa %xmm0, 0x10(%rax)
1570 lea 0x20(%rax), %rax
1571 cmp %rax, %rbp
1572 jb .Lecb_dec_bzero
1573
1574 lea 0x78(%rbp),%rax
1575.cfi_def_cfa %rax,8
1576___
1577$code.=<<___ if ($win64);
1578 movaps 0x40(%rbp), %xmm6
1579 movaps 0x50(%rbp), %xmm7
1580 movaps 0x60(%rbp), %xmm8
1581 movaps 0x70(%rbp), %xmm9
1582 movaps 0x80(%rbp), %xmm10
1583 movaps 0x90(%rbp), %xmm11
1584 movaps 0xa0(%rbp), %xmm12
1585 movaps 0xb0(%rbp), %xmm13
1586 movaps 0xc0(%rbp), %xmm14
1587 movaps 0xd0(%rbp), %xmm15
1588 lea 0xa0(%rax), %rax
1589.Lecb_dec_tail:
1590___
1591$code.=<<___;
1592 mov -48(%rax), %r15
1593.cfi_restore %r15
1594 mov -40(%rax), %r14
1595.cfi_restore %r14
1596 mov -32(%rax), %r13
1597.cfi_restore %r13
1598 mov -24(%rax), %r12
1599.cfi_restore %r12
1600 mov -16(%rax), %rbx
1601.cfi_restore %rbx
1602 mov -8(%rax), %rbp
1603.cfi_restore %rbp
1604 lea (%rax), %rsp # restore %rsp
1605.cfi_def_cfa_register %rsp
1606.Lecb_dec_epilogue:
1607 ret
1608.cfi_endproc
1609.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1610___
1611}
1612$code.=<<___;
1613.extern asm_AES_cbc_encrypt
1614.globl ossl_bsaes_cbc_encrypt
1615.type ossl_bsaes_cbc_encrypt,\@abi-omnipotent
1616.align 16
1617ossl_bsaes_cbc_encrypt:
1618.cfi_startproc
1619 endbranch
1620___
1621$code.=<<___ if ($win64);
1622 mov 48(%rsp),$arg6 # pull direction flag
1623___
1624$code.=<<___;
1625 cmp \$0,$arg6
1626 jne asm_AES_cbc_encrypt
1627 cmp \$128,$arg3
1628 jb asm_AES_cbc_encrypt
1629
1630 mov %rsp, %rax
1631.Lcbc_dec_prologue:
1632 push %rbp
1633.cfi_push %rbp
1634 push %rbx
1635.cfi_push %rbx
1636 push %r12
1637.cfi_push %r12
1638 push %r13
1639.cfi_push %r13
1640 push %r14
1641.cfi_push %r14
1642 push %r15
1643.cfi_push %r15
1644 lea -0x48(%rsp), %rsp
1645.cfi_adjust_cfa_offset 0x48
1646___
1647$code.=<<___ if ($win64);
1648 mov 0xa0(%rsp),$arg5 # pull ivp
1649 lea -0xa0(%rsp), %rsp
1650 movaps %xmm6, 0x40(%rsp)
1651 movaps %xmm7, 0x50(%rsp)
1652 movaps %xmm8, 0x60(%rsp)
1653 movaps %xmm9, 0x70(%rsp)
1654 movaps %xmm10, 0x80(%rsp)
1655 movaps %xmm11, 0x90(%rsp)
1656 movaps %xmm12, 0xa0(%rsp)
1657 movaps %xmm13, 0xb0(%rsp)
1658 movaps %xmm14, 0xc0(%rsp)
1659 movaps %xmm15, 0xd0(%rsp)
1660.Lcbc_dec_body:
1661___
1662$code.=<<___;
1663 mov %rsp, %rbp # backup %rsp
1664.cfi_def_cfa_register %rbp
1665 mov 240($arg4), %eax # rounds
1666 mov $arg1, $inp # backup arguments
1667 mov $arg2, $out
1668 mov $arg3, $len
1669 mov $arg4, $key
1670 mov $arg5, %rbx
1671 shr \$4, $len # bytes to blocks
1672
1673 mov %eax, %edx # rounds
1674 shl \$7, %rax # 128 bytes per inner round key
1675 sub \$`128-32`, %rax # size of bit-sliced key schedule
1676 sub %rax, %rsp
1677
1678 mov %rsp, %rax # pass key schedule
1679 mov $key, %rcx # pass key
1680 mov %edx, %r10d # pass rounds
1681 call _bsaes_key_convert
1682 pxor (%rsp),%xmm7 # fix up round 0 key
1683 movdqa %xmm6,(%rax) # save last round key
1684 movdqa %xmm7,(%rsp)
1685
1686 movdqu (%rbx), @XMM[15] # load IV
1687 sub \$8,$len
1688.Lcbc_dec_loop:
1689 movdqu 0x00($inp), @XMM[0] # load input
1690 movdqu 0x10($inp), @XMM[1]
1691 movdqu 0x20($inp), @XMM[2]
1692 movdqu 0x30($inp), @XMM[3]
1693 movdqu 0x40($inp), @XMM[4]
1694 movdqu 0x50($inp), @XMM[5]
1695 mov %rsp, %rax # pass key schedule
1696 movdqu 0x60($inp), @XMM[6]
1697 mov %edx,%r10d # pass rounds
1698 movdqu 0x70($inp), @XMM[7]
1699 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1700
1701 call _bsaes_decrypt8
1702
1703 pxor 0x20(%rbp), @XMM[0] # ^= IV
1704 movdqu 0x00($inp), @XMM[8] # re-load input
1705 movdqu 0x10($inp), @XMM[9]
1706 pxor @XMM[8], @XMM[1]
1707 movdqu 0x20($inp), @XMM[10]
1708 pxor @XMM[9], @XMM[6]
1709 movdqu 0x30($inp), @XMM[11]
1710 pxor @XMM[10], @XMM[4]
1711 movdqu 0x40($inp), @XMM[12]
1712 pxor @XMM[11], @XMM[2]
1713 movdqu 0x50($inp), @XMM[13]
1714 pxor @XMM[12], @XMM[7]
1715 movdqu 0x60($inp), @XMM[14]
1716 pxor @XMM[13], @XMM[3]
1717 movdqu 0x70($inp), @XMM[15] # IV
1718 pxor @XMM[14], @XMM[5]
1719 movdqu @XMM[0], 0x00($out) # write output
1720 lea 0x80($inp), $inp
1721 movdqu @XMM[1], 0x10($out)
1722 movdqu @XMM[6], 0x20($out)
1723 movdqu @XMM[4], 0x30($out)
1724 movdqu @XMM[2], 0x40($out)
1725 movdqu @XMM[7], 0x50($out)
1726 movdqu @XMM[3], 0x60($out)
1727 movdqu @XMM[5], 0x70($out)
1728 lea 0x80($out), $out
1729 sub \$8,$len
1730 jnc .Lcbc_dec_loop
1731
1732 add \$8,$len
1733 jz .Lcbc_dec_done
1734
1735 movdqu 0x00($inp), @XMM[0] # load input
1736 mov %rsp, %rax # pass key schedule
1737 mov %edx, %r10d # pass rounds
1738 cmp \$2,$len
1739 jb .Lcbc_dec_one
1740 movdqu 0x10($inp), @XMM[1]
1741 je .Lcbc_dec_two
1742 movdqu 0x20($inp), @XMM[2]
1743 cmp \$4,$len
1744 jb .Lcbc_dec_three
1745 movdqu 0x30($inp), @XMM[3]
1746 je .Lcbc_dec_four
1747 movdqu 0x40($inp), @XMM[4]
1748 cmp \$6,$len
1749 jb .Lcbc_dec_five
1750 movdqu 0x50($inp), @XMM[5]
1751 je .Lcbc_dec_six
1752 movdqu 0x60($inp), @XMM[6]
1753 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1754 call _bsaes_decrypt8
1755 pxor 0x20(%rbp), @XMM[0] # ^= IV
1756 movdqu 0x00($inp), @XMM[8] # re-load input
1757 movdqu 0x10($inp), @XMM[9]
1758 pxor @XMM[8], @XMM[1]
1759 movdqu 0x20($inp), @XMM[10]
1760 pxor @XMM[9], @XMM[6]
1761 movdqu 0x30($inp), @XMM[11]
1762 pxor @XMM[10], @XMM[4]
1763 movdqu 0x40($inp), @XMM[12]
1764 pxor @XMM[11], @XMM[2]
1765 movdqu 0x50($inp), @XMM[13]
1766 pxor @XMM[12], @XMM[7]
1767 movdqu 0x60($inp), @XMM[15] # IV
1768 pxor @XMM[13], @XMM[3]
1769 movdqu @XMM[0], 0x00($out) # write output
1770 movdqu @XMM[1], 0x10($out)
1771 movdqu @XMM[6], 0x20($out)
1772 movdqu @XMM[4], 0x30($out)
1773 movdqu @XMM[2], 0x40($out)
1774 movdqu @XMM[7], 0x50($out)
1775 movdqu @XMM[3], 0x60($out)
1776 jmp .Lcbc_dec_done
1777.align 16
1778.Lcbc_dec_six:
1779 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1780 call _bsaes_decrypt8
1781 pxor 0x20(%rbp), @XMM[0] # ^= IV
1782 movdqu 0x00($inp), @XMM[8] # re-load input
1783 movdqu 0x10($inp), @XMM[9]
1784 pxor @XMM[8], @XMM[1]
1785 movdqu 0x20($inp), @XMM[10]
1786 pxor @XMM[9], @XMM[6]
1787 movdqu 0x30($inp), @XMM[11]
1788 pxor @XMM[10], @XMM[4]
1789 movdqu 0x40($inp), @XMM[12]
1790 pxor @XMM[11], @XMM[2]
1791 movdqu 0x50($inp), @XMM[15] # IV
1792 pxor @XMM[12], @XMM[7]
1793 movdqu @XMM[0], 0x00($out) # write output
1794 movdqu @XMM[1], 0x10($out)
1795 movdqu @XMM[6], 0x20($out)
1796 movdqu @XMM[4], 0x30($out)
1797 movdqu @XMM[2], 0x40($out)
1798 movdqu @XMM[7], 0x50($out)
1799 jmp .Lcbc_dec_done
1800.align 16
1801.Lcbc_dec_five:
1802 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1803 call _bsaes_decrypt8
1804 pxor 0x20(%rbp), @XMM[0] # ^= IV
1805 movdqu 0x00($inp), @XMM[8] # re-load input
1806 movdqu 0x10($inp), @XMM[9]
1807 pxor @XMM[8], @XMM[1]
1808 movdqu 0x20($inp), @XMM[10]
1809 pxor @XMM[9], @XMM[6]
1810 movdqu 0x30($inp), @XMM[11]
1811 pxor @XMM[10], @XMM[4]
1812 movdqu 0x40($inp), @XMM[15] # IV
1813 pxor @XMM[11], @XMM[2]
1814 movdqu @XMM[0], 0x00($out) # write output
1815 movdqu @XMM[1], 0x10($out)
1816 movdqu @XMM[6], 0x20($out)
1817 movdqu @XMM[4], 0x30($out)
1818 movdqu @XMM[2], 0x40($out)
1819 jmp .Lcbc_dec_done
1820.align 16
1821.Lcbc_dec_four:
1822 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1823 call _bsaes_decrypt8
1824 pxor 0x20(%rbp), @XMM[0] # ^= IV
1825 movdqu 0x00($inp), @XMM[8] # re-load input
1826 movdqu 0x10($inp), @XMM[9]
1827 pxor @XMM[8], @XMM[1]
1828 movdqu 0x20($inp), @XMM[10]
1829 pxor @XMM[9], @XMM[6]
1830 movdqu 0x30($inp), @XMM[15] # IV
1831 pxor @XMM[10], @XMM[4]
1832 movdqu @XMM[0], 0x00($out) # write output
1833 movdqu @XMM[1], 0x10($out)
1834 movdqu @XMM[6], 0x20($out)
1835 movdqu @XMM[4], 0x30($out)
1836 jmp .Lcbc_dec_done
1837.align 16
1838.Lcbc_dec_three:
1839 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1840 call _bsaes_decrypt8
1841 pxor 0x20(%rbp), @XMM[0] # ^= IV
1842 movdqu 0x00($inp), @XMM[8] # re-load input
1843 movdqu 0x10($inp), @XMM[9]
1844 pxor @XMM[8], @XMM[1]
1845 movdqu 0x20($inp), @XMM[15] # IV
1846 pxor @XMM[9], @XMM[6]
1847 movdqu @XMM[0], 0x00($out) # write output
1848 movdqu @XMM[1], 0x10($out)
1849 movdqu @XMM[6], 0x20($out)
1850 jmp .Lcbc_dec_done
1851.align 16
1852.Lcbc_dec_two:
1853 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1854 call _bsaes_decrypt8
1855 pxor 0x20(%rbp), @XMM[0] # ^= IV
1856 movdqu 0x00($inp), @XMM[8] # re-load input
1857 movdqu 0x10($inp), @XMM[15] # IV
1858 pxor @XMM[8], @XMM[1]
1859 movdqu @XMM[0], 0x00($out) # write output
1860 movdqu @XMM[1], 0x10($out)
1861 jmp .Lcbc_dec_done
1862.align 16
1863.Lcbc_dec_one:
1864 lea ($inp), $arg1
1865 lea 0x20(%rbp), $arg2 # buffer output
1866 lea ($key), $arg3
1867 call asm_AES_decrypt # doesn't touch %xmm
1868 pxor 0x20(%rbp), @XMM[15] # ^= IV
1869 movdqu @XMM[15], ($out) # write output
1870 movdqa @XMM[0], @XMM[15] # IV
1871
1872.Lcbc_dec_done:
1873 movdqu @XMM[15], (%rbx) # return IV
1874 lea (%rsp), %rax
1875 pxor %xmm0, %xmm0
1876.Lcbc_dec_bzero: # wipe key schedule [if any]
1877 movdqa %xmm0, 0x00(%rax)
1878 movdqa %xmm0, 0x10(%rax)
1879 lea 0x20(%rax), %rax
1880 cmp %rax, %rbp
1881 ja .Lcbc_dec_bzero
1882
1883 lea 0x78(%rbp),%rax
1884.cfi_def_cfa %rax,8
1885___
1886$code.=<<___ if ($win64);
1887 movaps 0x40(%rbp), %xmm6
1888 movaps 0x50(%rbp), %xmm7
1889 movaps 0x60(%rbp), %xmm8
1890 movaps 0x70(%rbp), %xmm9
1891 movaps 0x80(%rbp), %xmm10
1892 movaps 0x90(%rbp), %xmm11
1893 movaps 0xa0(%rbp), %xmm12
1894 movaps 0xb0(%rbp), %xmm13
1895 movaps 0xc0(%rbp), %xmm14
1896 movaps 0xd0(%rbp), %xmm15
1897 lea 0xa0(%rax), %rax
1898.Lcbc_dec_tail:
1899___
1900$code.=<<___;
1901 mov -48(%rax), %r15
1902.cfi_restore %r15
1903 mov -40(%rax), %r14
1904.cfi_restore %r14
1905 mov -32(%rax), %r13
1906.cfi_restore %r13
1907 mov -24(%rax), %r12
1908.cfi_restore %r12
1909 mov -16(%rax), %rbx
1910.cfi_restore %rbx
1911 mov -8(%rax), %rbp
1912.cfi_restore %rbp
1913 lea (%rax), %rsp # restore %rsp
1914.cfi_def_cfa_register %rsp
1915.Lcbc_dec_epilogue:
1916 ret
1917.cfi_endproc
1918.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
1919
1920.globl ossl_bsaes_ctr32_encrypt_blocks
1921.type ossl_bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1922.align 16
1923ossl_bsaes_ctr32_encrypt_blocks:
1924.cfi_startproc
1925 endbranch
1926 mov %rsp, %rax
1927.Lctr_enc_prologue:
1928 push %rbp
1929.cfi_push %rbp
1930 push %rbx
1931.cfi_push %rbx
1932 push %r12
1933.cfi_push %r12
1934 push %r13
1935.cfi_push %r13
1936 push %r14
1937.cfi_push %r14
1938 push %r15
1939.cfi_push %r15
1940 lea -0x48(%rsp), %rsp
1941.cfi_adjust_cfa_offset 0x48
1942___
1943$code.=<<___ if ($win64);
1944 mov 0xa0(%rsp),$arg5 # pull ivp
1945 lea -0xa0(%rsp), %rsp
1946 movaps %xmm6, 0x40(%rsp)
1947 movaps %xmm7, 0x50(%rsp)
1948 movaps %xmm8, 0x60(%rsp)
1949 movaps %xmm9, 0x70(%rsp)
1950 movaps %xmm10, 0x80(%rsp)
1951 movaps %xmm11, 0x90(%rsp)
1952 movaps %xmm12, 0xa0(%rsp)
1953 movaps %xmm13, 0xb0(%rsp)
1954 movaps %xmm14, 0xc0(%rsp)
1955 movaps %xmm15, 0xd0(%rsp)
1956.Lctr_enc_body:
1957___
1958$code.=<<___;
1959 mov %rsp, %rbp # backup %rsp
1960.cfi_def_cfa_register %rbp
1961 movdqu ($arg5), %xmm0 # load counter
1962 mov 240($arg4), %eax # rounds
1963 mov $arg1, $inp # backup arguments
1964 mov $arg2, $out
1965 mov $arg3, $len
1966 mov $arg4, $key
1967 movdqa %xmm0, 0x20(%rbp) # copy counter
1968 cmp \$8, $arg3
1969 jb .Lctr_enc_short
1970
1971 mov %eax, %ebx # rounds
1972 shl \$7, %rax # 128 bytes per inner round key
1973 sub \$`128-32`, %rax # size of bit-sliced key schedule
1974 sub %rax, %rsp
1975
1976 mov %rsp, %rax # pass key schedule
1977 mov $key, %rcx # pass key
1978 mov %ebx, %r10d # pass rounds
1979 call _bsaes_key_convert
1980 pxor %xmm6,%xmm7 # fix up last round key
1981 movdqa %xmm7,(%rax) # save last round key
1982
1983 movdqa (%rsp), @XMM[9] # load round0 key
1984 lea .LADD1(%rip), %r11
1985 movdqa 0x20(%rbp), @XMM[0] # counter copy
1986 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1987 pshufb @XMM[8], @XMM[9] # byte swap upper part
1988 pshufb @XMM[8], @XMM[0]
1989 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1990 jmp .Lctr_enc_loop
1991.align 16
1992.Lctr_enc_loop:
1993 movdqa @XMM[0], 0x20(%rbp) # save counter
1994 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1995 movdqa @XMM[0], @XMM[2]
1996 paddd 0x00(%r11), @XMM[1] # .LADD1
1997 movdqa @XMM[0], @XMM[3]
1998 paddd 0x10(%r11), @XMM[2] # .LADD2
1999 movdqa @XMM[0], @XMM[4]
2000 paddd 0x20(%r11), @XMM[3] # .LADD3
2001 movdqa @XMM[0], @XMM[5]
2002 paddd 0x30(%r11), @XMM[4] # .LADD4
2003 movdqa @XMM[0], @XMM[6]
2004 paddd 0x40(%r11), @XMM[5] # .LADD5
2005 movdqa @XMM[0], @XMM[7]
2006 paddd 0x50(%r11), @XMM[6] # .LADD6
2007 paddd 0x60(%r11), @XMM[7] # .LADD7
2008
2009 # Borrow the prologue from _bsaes_encrypt8, taking the opportunity
2010 # to flip byte order in the 32-bit counter
2011 movdqa (%rsp), @XMM[9] # round 0 key
2012 lea 0x10(%rsp), %rax # pass key schedule
2013 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
2014 pxor @XMM[9], @XMM[0] # xor with round0 key
2015 pxor @XMM[9], @XMM[1]
2016 pxor @XMM[9], @XMM[2]
2017 pxor @XMM[9], @XMM[3]
2018 pshufb @XMM[8], @XMM[0]
2019 pshufb @XMM[8], @XMM[1]
2020 pxor @XMM[9], @XMM[4]
2021 pxor @XMM[9], @XMM[5]
2022 pshufb @XMM[8], @XMM[2]
2023 pshufb @XMM[8], @XMM[3]
2024 pxor @XMM[9], @XMM[6]
2025 pxor @XMM[9], @XMM[7]
2026 pshufb @XMM[8], @XMM[4]
2027 pshufb @XMM[8], @XMM[5]
2028 pshufb @XMM[8], @XMM[6]
2029 pshufb @XMM[8], @XMM[7]
2030 lea .LBS0(%rip), %r11 # constants table
2031 mov %ebx,%r10d # pass rounds
2032
2033 call _bsaes_encrypt8_bitslice
2034
2035 sub \$8,$len
2036 jc .Lctr_enc_loop_done
2037
2038 movdqu 0x00($inp), @XMM[8] # load input
2039 movdqu 0x10($inp), @XMM[9]
2040 movdqu 0x20($inp), @XMM[10]
2041 movdqu 0x30($inp), @XMM[11]
2042 movdqu 0x40($inp), @XMM[12]
2043 movdqu 0x50($inp), @XMM[13]
2044 movdqu 0x60($inp), @XMM[14]
2045 movdqu 0x70($inp), @XMM[15]
2046 lea 0x80($inp),$inp
2047 pxor @XMM[0], @XMM[8]
2048 movdqa 0x20(%rbp), @XMM[0] # load counter
2049 pxor @XMM[9], @XMM[1]
2050 movdqu @XMM[8], 0x00($out) # write output
2051 pxor @XMM[10], @XMM[4]
2052 movdqu @XMM[1], 0x10($out)
2053 pxor @XMM[11], @XMM[6]
2054 movdqu @XMM[4], 0x20($out)
2055 pxor @XMM[12], @XMM[3]
2056 movdqu @XMM[6], 0x30($out)
2057 pxor @XMM[13], @XMM[7]
2058 movdqu @XMM[3], 0x40($out)
2059 pxor @XMM[14], @XMM[2]
2060 movdqu @XMM[7], 0x50($out)
2061 pxor @XMM[15], @XMM[5]
2062 movdqu @XMM[2], 0x60($out)
2063 lea .LADD1(%rip), %r11
2064 movdqu @XMM[5], 0x70($out)
2065 lea 0x80($out), $out
2066 paddd 0x70(%r11), @XMM[0] # .LADD8
2067 jnz .Lctr_enc_loop
2068
2069 jmp .Lctr_enc_done
2070.align 16
2071.Lctr_enc_loop_done:
2072 add \$8, $len
2073 movdqu 0x00($inp), @XMM[8] # load input
2074 pxor @XMM[8], @XMM[0]
2075 movdqu @XMM[0], 0x00($out) # write output
2076 cmp \$2,$len
2077 jb .Lctr_enc_done
2078 movdqu 0x10($inp), @XMM[9]
2079 pxor @XMM[9], @XMM[1]
2080 movdqu @XMM[1], 0x10($out)
2081 je .Lctr_enc_done
2082 movdqu 0x20($inp), @XMM[10]
2083 pxor @XMM[10], @XMM[4]
2084 movdqu @XMM[4], 0x20($out)
2085 cmp \$4,$len
2086 jb .Lctr_enc_done
2087 movdqu 0x30($inp), @XMM[11]
2088 pxor @XMM[11], @XMM[6]
2089 movdqu @XMM[6], 0x30($out)
2090 je .Lctr_enc_done
2091 movdqu 0x40($inp), @XMM[12]
2092 pxor @XMM[12], @XMM[3]
2093 movdqu @XMM[3], 0x40($out)
2094 cmp \$6,$len
2095 jb .Lctr_enc_done
2096 movdqu 0x50($inp), @XMM[13]
2097 pxor @XMM[13], @XMM[7]
2098 movdqu @XMM[7], 0x50($out)
2099 je .Lctr_enc_done
2100 movdqu 0x60($inp), @XMM[14]
2101 pxor @XMM[14], @XMM[2]
2102 movdqu @XMM[2], 0x60($out)
2103 jmp .Lctr_enc_done
2104
2105.align 16
2106.Lctr_enc_short:
2107 lea 0x20(%rbp), $arg1
2108 lea 0x30(%rbp), $arg2
2109 lea ($key), $arg3
2110 call asm_AES_encrypt
2111 movdqu ($inp), @XMM[1]
2112 lea 16($inp), $inp
2113 mov 0x2c(%rbp), %eax # load 32-bit counter
2114 bswap %eax
2115 pxor 0x30(%rbp), @XMM[1]
2116 inc %eax # increment
2117 movdqu @XMM[1], ($out)
2118 bswap %eax
2119 lea 16($out), $out
2120 mov %eax, 0x2c(%rsp) # save 32-bit counter
2121 dec $len
2122 jnz .Lctr_enc_short
2123
2124.Lctr_enc_done:
2125 lea (%rsp), %rax
2126 pxor %xmm0, %xmm0
2127.Lctr_enc_bzero: # wipe key schedule [if any]
2128 movdqa %xmm0, 0x00(%rax)
2129 movdqa %xmm0, 0x10(%rax)
2130 lea 0x20(%rax), %rax
2131 cmp %rax, %rbp
2132 ja .Lctr_enc_bzero
2133
2134 lea 0x78(%rbp),%rax
2135.cfi_def_cfa %rax,8
2136___
2137$code.=<<___ if ($win64);
2138 movaps 0x40(%rbp), %xmm6
2139 movaps 0x50(%rbp), %xmm7
2140 movaps 0x60(%rbp), %xmm8
2141 movaps 0x70(%rbp), %xmm9
2142 movaps 0x80(%rbp), %xmm10
2143 movaps 0x90(%rbp), %xmm11
2144 movaps 0xa0(%rbp), %xmm12
2145 movaps 0xb0(%rbp), %xmm13
2146 movaps 0xc0(%rbp), %xmm14
2147 movaps 0xd0(%rbp), %xmm15
2148 lea 0xa0(%rax), %rax
2149.Lctr_enc_tail:
2150___
2151$code.=<<___;
2152 mov -48(%rax), %r15
2153.cfi_restore %r15
2154 mov -40(%rax), %r14
2155.cfi_restore %r14
2156 mov -32(%rax), %r13
2157.cfi_restore %r13
2158 mov -24(%rax), %r12
2159.cfi_restore %r12
2160 mov -16(%rax), %rbx
2161.cfi_restore %rbx
2162 mov -8(%rax), %rbp
2163.cfi_restore %rbp
2164 lea (%rax), %rsp # restore %rsp
2165.cfi_def_cfa_register %rsp
2166.Lctr_enc_epilogue:
2167 ret
2168.cfi_endproc
2169.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
2170___
2171######################################################################
2172	# void ossl_bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2173# const AES_KEY *key1, const AES_KEY *key2,
2174# const unsigned char iv[16]);
2175#
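# Both AES_KEY arguments are conventional schedules as produced by the AES
# key-setup routines; key2 encrypts the iv to form the initial tweak and
# key1 processes the data.  A minimal C-level usage sketch for the encrypt
# direction (illustrative only, not part of this file):
#
#	AES_KEY k1, k2;
#	AES_set_encrypt_key(data_key, 128, &k1);	/* data key  */
#	AES_set_encrypt_key(tweak_key, 128, &k2);	/* tweak key */
#	ossl_bsaes_xts_encrypt(in, out, len, &k1, &k2, iv);
#
# For ossl_bsaes_xts_decrypt, key1 would be the corresponding decryption
# schedule (cf. AES_set_decrypt_key), while key2 remains an encryption
# key, since the tweak is always generated with asm_AES_encrypt.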
2176my ($twmask,$twres,$twtmp)=@XMM[13..15];
2177$arg6=~s/d$//;
2178
2179$code.=<<___;
2180.globl ossl_bsaes_xts_encrypt
2181.type ossl_bsaes_xts_encrypt,\@abi-omnipotent
2182.align 16
2183ossl_bsaes_xts_encrypt:
2184.cfi_startproc
2185 mov %rsp, %rax
2186.Lxts_enc_prologue:
2187 push %rbp
2188.cfi_push %rbp
2189 push %rbx
2190.cfi_push %rbx
2191 push %r12
2192.cfi_push %r12
2193 push %r13
2194.cfi_push %r13
2195 push %r14
2196.cfi_push %r14
2197 push %r15
2198.cfi_push %r15
2199 lea -0x48(%rsp), %rsp
2200.cfi_adjust_cfa_offset 0x48
2201___
2202$code.=<<___ if ($win64);
2203 mov 0xa0(%rsp),$arg5 # pull key2
2204 mov 0xa8(%rsp),$arg6 # pull ivp
2205 lea -0xa0(%rsp), %rsp
2206 movaps %xmm6, 0x40(%rsp)
2207 movaps %xmm7, 0x50(%rsp)
2208 movaps %xmm8, 0x60(%rsp)
2209 movaps %xmm9, 0x70(%rsp)
2210 movaps %xmm10, 0x80(%rsp)
2211 movaps %xmm11, 0x90(%rsp)
2212 movaps %xmm12, 0xa0(%rsp)
2213 movaps %xmm13, 0xb0(%rsp)
2214 movaps %xmm14, 0xc0(%rsp)
2215 movaps %xmm15, 0xd0(%rsp)
2216.Lxts_enc_body:
2217___
2218$code.=<<___;
2219 mov %rsp, %rbp # backup %rsp
2220.cfi_def_cfa_register %rbp
2221 mov $arg1, $inp # backup arguments
2222 mov $arg2, $out
2223 mov $arg3, $len
2224 mov $arg4, $key
2225
2226 lea ($arg6), $arg1
2227 lea 0x20(%rbp), $arg2
2228 lea ($arg5), $arg3
2229 call asm_AES_encrypt # generate initial tweak
2230
2231 mov 240($key), %eax # rounds
2232 mov $len, %rbx # backup $len
2233
2234 mov %eax, %edx # rounds
2235 shl \$7, %rax # 128 bytes per inner round key
2236 sub \$`128-32`, %rax # size of bit-sliced key schedule
2237 sub %rax, %rsp
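# e.g. 10 rounds: 10*128 - 96 = 1184 bytes of stack for the bit-sliced
# key schedule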
2238
2239 mov %rsp, %rax # pass key schedule
2240 mov $key, %rcx # pass key
2241 mov %edx, %r10d # pass rounds
2242 call _bsaes_key_convert
2243 pxor %xmm6, %xmm7 # fix up last round key
2244 movdqa %xmm7, (%rax) # save last round key
2245
2246 and \$-16, $len
2247 sub \$0x80, %rsp # place for tweak[8]
2248 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2249
2250 pxor $twtmp, $twtmp
2251 movdqa .Lxts_magic(%rip), $twmask
2252 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2253
2254 sub \$0x80, $len
2255 jc .Lxts_enc_short
2256 jmp .Lxts_enc_loop
2257
2258.align 16
2259.Lxts_enc_loop:
2260___
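# Each generated iteration saves the current tweak and then advances it:
# tweak = tweak * x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.
# pcmpgtd (against zero) replicates the sign of every dword, pshufd 0x13
# routes the bit-63 and bit-127 sign masks into the lanes where the pand
# with .Lxts_magic turns them into the constants 1 (the carry into the
# high qword) and 0x87 (the folded-back reduction), paddq shifts both
# qwords left by one, and the final pxor applies the two carries.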
2261 for ($i=0;$i<7;$i++) {
2262 $code.=<<___;
2263 pshufd \$0x13, $twtmp, $twres
2264 pxor $twtmp, $twtmp
2265 movdqa @XMM[7], @XMM[$i]
2266	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
2267 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2268 pand $twmask, $twres # isolate carry and residue
2269 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2270 pxor $twres, @XMM[7]
2271___
2272 $code.=<<___ if ($i>=1);
2273 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2274___
2275 $code.=<<___ if ($i>=2);
2276	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
2277___
2278 }
2279$code.=<<___;
2280 movdqu 0x60($inp), @XMM[8+6]
2281 pxor @XMM[8+5], @XMM[5]
2282 movdqu 0x70($inp), @XMM[8+7]
2283 lea 0x80($inp), $inp
2284 movdqa @XMM[7], 0x70(%rsp)
2285 pxor @XMM[8+6], @XMM[6]
2286 lea 0x80(%rsp), %rax # pass key schedule
2287 pxor @XMM[8+7], @XMM[7]
2288 mov %edx, %r10d # pass rounds
2289
2290 call _bsaes_encrypt8
2291
2292 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2293 pxor 0x10(%rsp), @XMM[1]
2294 movdqu @XMM[0], 0x00($out) # write output
2295 pxor 0x20(%rsp), @XMM[4]
2296 movdqu @XMM[1], 0x10($out)
2297 pxor 0x30(%rsp), @XMM[6]
2298 movdqu @XMM[4], 0x20($out)
2299 pxor 0x40(%rsp), @XMM[3]
2300 movdqu @XMM[6], 0x30($out)
2301 pxor 0x50(%rsp), @XMM[7]
2302 movdqu @XMM[3], 0x40($out)
2303 pxor 0x60(%rsp), @XMM[2]
2304 movdqu @XMM[7], 0x50($out)
2305 pxor 0x70(%rsp), @XMM[5]
2306 movdqu @XMM[2], 0x60($out)
2307 movdqu @XMM[5], 0x70($out)
2308 lea 0x80($out), $out
2309
2310 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2311 pxor $twtmp, $twtmp
2312 movdqa .Lxts_magic(%rip), $twmask
2313 pcmpgtd @XMM[7], $twtmp
2314 pshufd \$0x13, $twtmp, $twres
2315 pxor $twtmp, $twtmp
2316 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2317 pand $twmask, $twres # isolate carry and residue
2318 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2319 pxor $twres, @XMM[7]
2320
2321 sub \$0x80,$len
2322 jnc .Lxts_enc_loop
2323
2324.Lxts_enc_short:
2325 add \$0x80, $len
2326 jz .Lxts_enc_done
2327___
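# Generate the short-input dispatch: for $i = 1..6 a cmp/je pair branches
# to the .Lxts_enc_$i stub that finishes exactly $i blocks; exactly seven
# blocks fall through to the code after the loop, and .Lxts_enc_1 uses
# asm_AES_encrypt instead of the eight-way bit-sliced routine.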
2328 for ($i=0;$i<7;$i++) {
2329 $code.=<<___;
2330 pshufd \$0x13, $twtmp, $twres
2331 pxor $twtmp, $twtmp
2332 movdqa @XMM[7], @XMM[$i]
2333	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
2334 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2335 pand $twmask, $twres # isolate carry and residue
2336 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2337 pxor $twres, @XMM[7]
2338___
2339 $code.=<<___ if ($i>=1);
2340 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2341 cmp \$`0x10*$i`,$len
2342 je .Lxts_enc_$i
2343___
2344 $code.=<<___ if ($i>=2);
2345	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
2346___
2347 }
2348$code.=<<___;
2349 movdqu 0x60($inp), @XMM[8+6]
2350 pxor @XMM[8+5], @XMM[5]
2351 movdqa @XMM[7], 0x70(%rsp)
2352 lea 0x70($inp), $inp
2353 pxor @XMM[8+6], @XMM[6]
2354 lea 0x80(%rsp), %rax # pass key schedule
2355 mov %edx, %r10d # pass rounds
2356
2357 call _bsaes_encrypt8
2358
2359 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2360 pxor 0x10(%rsp), @XMM[1]
2361 movdqu @XMM[0], 0x00($out) # write output
2362 pxor 0x20(%rsp), @XMM[4]
2363 movdqu @XMM[1], 0x10($out)
2364 pxor 0x30(%rsp), @XMM[6]
2365 movdqu @XMM[4], 0x20($out)
2366 pxor 0x40(%rsp), @XMM[3]
2367 movdqu @XMM[6], 0x30($out)
2368 pxor 0x50(%rsp), @XMM[7]
2369 movdqu @XMM[3], 0x40($out)
2370 pxor 0x60(%rsp), @XMM[2]
2371 movdqu @XMM[7], 0x50($out)
2372 movdqu @XMM[2], 0x60($out)
2373 lea 0x70($out), $out
2374
2375 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2376 jmp .Lxts_enc_done
2377.align 16
2378.Lxts_enc_6:
2379 pxor @XMM[8+4], @XMM[4]
2380 lea 0x60($inp), $inp
2381 pxor @XMM[8+5], @XMM[5]
2382 lea 0x80(%rsp), %rax # pass key schedule
2383 mov %edx, %r10d # pass rounds
2384
2385 call _bsaes_encrypt8
2386
2387 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2388 pxor 0x10(%rsp), @XMM[1]
2389 movdqu @XMM[0], 0x00($out) # write output
2390 pxor 0x20(%rsp), @XMM[4]
2391 movdqu @XMM[1], 0x10($out)
2392 pxor 0x30(%rsp), @XMM[6]
2393 movdqu @XMM[4], 0x20($out)
2394 pxor 0x40(%rsp), @XMM[3]
2395 movdqu @XMM[6], 0x30($out)
2396 pxor 0x50(%rsp), @XMM[7]
2397 movdqu @XMM[3], 0x40($out)
2398 movdqu @XMM[7], 0x50($out)
2399 lea 0x60($out), $out
2400
2401 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2402 jmp .Lxts_enc_done
2403.align 16
2404.Lxts_enc_5:
2405 pxor @XMM[8+3], @XMM[3]
2406 lea 0x50($inp), $inp
2407 pxor @XMM[8+4], @XMM[4]
2408 lea 0x80(%rsp), %rax # pass key schedule
2409 mov %edx, %r10d # pass rounds
2410
2411 call _bsaes_encrypt8
2412
2413 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2414 pxor 0x10(%rsp), @XMM[1]
2415 movdqu @XMM[0], 0x00($out) # write output
2416 pxor 0x20(%rsp), @XMM[4]
2417 movdqu @XMM[1], 0x10($out)
2418 pxor 0x30(%rsp), @XMM[6]
2419 movdqu @XMM[4], 0x20($out)
2420 pxor 0x40(%rsp), @XMM[3]
2421 movdqu @XMM[6], 0x30($out)
2422 movdqu @XMM[3], 0x40($out)
2423 lea 0x50($out), $out
2424
2425 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2426 jmp .Lxts_enc_done
2427.align 16
2428.Lxts_enc_4:
2429 pxor @XMM[8+2], @XMM[2]
2430 lea 0x40($inp), $inp
2431 pxor @XMM[8+3], @XMM[3]
2432 lea 0x80(%rsp), %rax # pass key schedule
2433 mov %edx, %r10d # pass rounds
2434
2435 call _bsaes_encrypt8
2436
2437 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2438 pxor 0x10(%rsp), @XMM[1]
2439 movdqu @XMM[0], 0x00($out) # write output
2440 pxor 0x20(%rsp), @XMM[4]
2441 movdqu @XMM[1], 0x10($out)
2442 pxor 0x30(%rsp), @XMM[6]
2443 movdqu @XMM[4], 0x20($out)
2444 movdqu @XMM[6], 0x30($out)
2445 lea 0x40($out), $out
2446
2447 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2448 jmp .Lxts_enc_done
2449.align 16
2450.Lxts_enc_3:
2451 pxor @XMM[8+1], @XMM[1]
2452 lea 0x30($inp), $inp
2453 pxor @XMM[8+2], @XMM[2]
2454 lea 0x80(%rsp), %rax # pass key schedule
2455 mov %edx, %r10d # pass rounds
2456
2457 call _bsaes_encrypt8
2458
2459 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2460 pxor 0x10(%rsp), @XMM[1]
2461 movdqu @XMM[0], 0x00($out) # write output
2462 pxor 0x20(%rsp), @XMM[4]
2463 movdqu @XMM[1], 0x10($out)
2464 movdqu @XMM[4], 0x20($out)
2465 lea 0x30($out), $out
2466
2467 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2468 jmp .Lxts_enc_done
2469.align 16
2470.Lxts_enc_2:
2471 pxor @XMM[8+0], @XMM[0]
2472 lea 0x20($inp), $inp
2473 pxor @XMM[8+1], @XMM[1]
2474 lea 0x80(%rsp), %rax # pass key schedule
2475 mov %edx, %r10d # pass rounds
2476
2477 call _bsaes_encrypt8
2478
2479 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2480 pxor 0x10(%rsp), @XMM[1]
2481 movdqu @XMM[0], 0x00($out) # write output
2482 movdqu @XMM[1], 0x10($out)
2483 lea 0x20($out), $out
2484
2485 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2486 jmp .Lxts_enc_done
2487.align 16
2488.Lxts_enc_1:
2489 pxor @XMM[0], @XMM[8]
2490 lea 0x10($inp), $inp
2491 movdqa @XMM[8], 0x20(%rbp)
2492 lea 0x20(%rbp), $arg1
2493 lea 0x20(%rbp), $arg2
2494 lea ($key), $arg3
2495 call asm_AES_encrypt # doesn't touch %xmm
2496 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2497 #pxor @XMM[8], @XMM[0]
2498 #lea 0x80(%rsp), %rax # pass key schedule
2499 #mov %edx, %r10d # pass rounds
2500 #call _bsaes_encrypt8
2501 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2502 movdqu @XMM[0], 0x00($out) # write output
2503 lea 0x10($out), $out
2504
2505 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2506
2507.Lxts_enc_done:
2508 and \$15, %ebx
2509 jz .Lxts_enc_ret
2510 mov $out, %rdx
2511
2512.Lxts_enc_steal:
2513 movzb ($inp), %eax
2514 movzb -16(%rdx), %ecx
2515 lea 1($inp), $inp
2516 mov %al, -16(%rdx)
2517 mov %cl, 0(%rdx)
2518 lea 1(%rdx), %rdx
2519 sub \$1,%ebx
2520 jnz .Lxts_enc_steal
2521
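# The loop above spliced the trailing partial plaintext into the head of
# the last full ciphertext block, emitting the displaced ciphertext bytes
# as the short final block; the patched block is now re-encrypted with the
# next tweak (kept in XMM7).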
2522 movdqu -16($out), @XMM[0]
2523 lea 0x20(%rbp), $arg1
2524 pxor @XMM[7], @XMM[0]
2525 lea 0x20(%rbp), $arg2
2526 movdqa @XMM[0], 0x20(%rbp)
2527 lea ($key), $arg3
2528 call asm_AES_encrypt # doesn't touch %xmm
2529 pxor 0x20(%rbp), @XMM[7]
2530 movdqu @XMM[7], -16($out)
2531
2532.Lxts_enc_ret:
2533 lea (%rsp), %rax
2534 pxor %xmm0, %xmm0
2535.Lxts_enc_bzero: # wipe key schedule [if any]
2536 movdqa %xmm0, 0x00(%rax)
2537 movdqa %xmm0, 0x10(%rax)
2538 lea 0x20(%rax), %rax
2539 cmp %rax, %rbp
2540 ja .Lxts_enc_bzero
2541
2542 lea 0x78(%rbp),%rax
2543.cfi_def_cfa %rax,8
2544___
2545$code.=<<___ if ($win64);
2546 movaps 0x40(%rbp), %xmm6
2547 movaps 0x50(%rbp), %xmm7
2548 movaps 0x60(%rbp), %xmm8
2549 movaps 0x70(%rbp), %xmm9
2550 movaps 0x80(%rbp), %xmm10
2551 movaps 0x90(%rbp), %xmm11
2552 movaps 0xa0(%rbp), %xmm12
2553 movaps 0xb0(%rbp), %xmm13
2554 movaps 0xc0(%rbp), %xmm14
2555 movaps 0xd0(%rbp), %xmm15
2556 lea 0xa0(%rax), %rax
2557.Lxts_enc_tail:
2558___
2559$code.=<<___;
2560 mov -48(%rax), %r15
2561.cfi_restore %r15
2562 mov -40(%rax), %r14
2563.cfi_restore %r14
2564 mov -32(%rax), %r13
2565.cfi_restore %r13
2566 mov -24(%rax), %r12
2567.cfi_restore %r12
2568 mov -16(%rax), %rbx
2569.cfi_restore %rbx
2570 mov -8(%rax), %rbp
2571.cfi_restore %rbp
2572 lea (%rax), %rsp # restore %rsp
2573.cfi_def_cfa_register %rsp
2574.Lxts_enc_epilogue:
2575 ret
2576.cfi_endproc
2577.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt
2578
2579.globl ossl_bsaes_xts_decrypt
2580.type ossl_bsaes_xts_decrypt,\@abi-omnipotent
2581.align 16
2582ossl_bsaes_xts_decrypt:
2583.cfi_startproc
2584 mov %rsp, %rax
2585.Lxts_dec_prologue:
2586 push %rbp
2587.cfi_push %rbp
2588 push %rbx
2589.cfi_push %rbx
2590 push %r12
2591.cfi_push %r12
2592 push %r13
2593.cfi_push %r13
2594 push %r14
2595.cfi_push %r14
2596 push %r15
2597.cfi_push %r15
2598 lea -0x48(%rsp), %rsp
2599.cfi_adjust_cfa_offset 0x48
2600___
2601$code.=<<___ if ($win64);
2602 mov 0xa0(%rsp),$arg5 # pull key2
2603 mov 0xa8(%rsp),$arg6 # pull ivp
2604 lea -0xa0(%rsp), %rsp
2605 movaps %xmm6, 0x40(%rsp)
2606 movaps %xmm7, 0x50(%rsp)
2607 movaps %xmm8, 0x60(%rsp)
2608 movaps %xmm9, 0x70(%rsp)
2609 movaps %xmm10, 0x80(%rsp)
2610 movaps %xmm11, 0x90(%rsp)
2611 movaps %xmm12, 0xa0(%rsp)
2612 movaps %xmm13, 0xb0(%rsp)
2613 movaps %xmm14, 0xc0(%rsp)
2614 movaps %xmm15, 0xd0(%rsp)
2615.Lxts_dec_body:
2616___
2617$code.=<<___;
2618	mov	%rsp, %rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
2619 mov $arg1, $inp # backup arguments
2620 mov $arg2, $out
2621 mov $arg3, $len
2622 mov $arg4, $key
2623
2624 lea ($arg6), $arg1
2625 lea 0x20(%rbp), $arg2
2626 lea ($arg5), $arg3
2627 call asm_AES_encrypt # generate initial tweak
2628
2629 mov 240($key), %eax # rounds
2630 mov $len, %rbx # backup $len
2631
2632 mov %eax, %edx # rounds
2633 shl \$7, %rax # 128 bytes per inner round key
2634 sub \$`128-32`, %rax # size of bit-sliced key schedule
2635 sub %rax, %rsp
2636
2637 mov %rsp, %rax # pass key schedule
2638 mov $key, %rcx # pass key
2639 mov %edx, %r10d # pass rounds
2640 call _bsaes_key_convert
2641 pxor (%rsp), %xmm7 # fix up round 0 key
2642 movdqa %xmm6, (%rax) # save last round key
2643 movdqa %xmm7, (%rsp)
2644
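# If the total length is not a multiple of 16, hold back one full block:
# ciphertext stealing below must decrypt the last complete ciphertext
# block together with the trailing partial one.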
2645 xor %eax, %eax # if ($len%16) len-=16;
2646 and \$-16, $len
2647 test \$15, %ebx
2648 setnz %al
2649 shl \$4, %rax
2650 sub %rax, $len
2651
2652 sub \$0x80, %rsp # place for tweak[8]
2653 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2654
2655 pxor $twtmp, $twtmp
2656 movdqa .Lxts_magic(%rip), $twmask
2657 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2658
2659 sub \$0x80, $len
2660 jc .Lxts_dec_short
2661 jmp .Lxts_dec_loop
2662
2663.align 16
2664.Lxts_dec_loop:
2665___
2666 for ($i=0;$i<7;$i++) {
2667 $code.=<<___;
2668 pshufd \$0x13, $twtmp, $twres
2669 pxor $twtmp, $twtmp
2670 movdqa @XMM[7], @XMM[$i]
2671	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
2672 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2673 pand $twmask, $twres # isolate carry and residue
2674 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2675 pxor $twres, @XMM[7]
2676___
2677 $code.=<<___ if ($i>=1);
2678 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2679___
2680 $code.=<<___ if ($i>=2);
2681	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
2682___
2683 }
2684$code.=<<___;
2685 movdqu 0x60($inp), @XMM[8+6]
2686 pxor @XMM[8+5], @XMM[5]
2687 movdqu 0x70($inp), @XMM[8+7]
2688 lea 0x80($inp), $inp
2689 movdqa @XMM[7], 0x70(%rsp)
2690 pxor @XMM[8+6], @XMM[6]
2691 lea 0x80(%rsp), %rax # pass key schedule
2692 pxor @XMM[8+7], @XMM[7]
2693 mov %edx, %r10d # pass rounds
2694
2695 call _bsaes_decrypt8
2696
2697 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2698 pxor 0x10(%rsp), @XMM[1]
2699 movdqu @XMM[0], 0x00($out) # write output
2700 pxor 0x20(%rsp), @XMM[6]
2701 movdqu @XMM[1], 0x10($out)
2702 pxor 0x30(%rsp), @XMM[4]
2703 movdqu @XMM[6], 0x20($out)
2704 pxor 0x40(%rsp), @XMM[2]
2705 movdqu @XMM[4], 0x30($out)
2706 pxor 0x50(%rsp), @XMM[7]
2707 movdqu @XMM[2], 0x40($out)
2708 pxor 0x60(%rsp), @XMM[3]
2709 movdqu @XMM[7], 0x50($out)
2710 pxor 0x70(%rsp), @XMM[5]
2711 movdqu @XMM[3], 0x60($out)
2712 movdqu @XMM[5], 0x70($out)
2713 lea 0x80($out), $out
2714
2715 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2716 pxor $twtmp, $twtmp
2717 movdqa .Lxts_magic(%rip), $twmask
2718 pcmpgtd @XMM[7], $twtmp
2719 pshufd \$0x13, $twtmp, $twres
2720 pxor $twtmp, $twtmp
2721 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2722 pand $twmask, $twres # isolate carry and residue
2723 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2724 pxor $twres, @XMM[7]
2725
2726 sub \$0x80,$len
2727 jnc .Lxts_dec_loop
2728
2729.Lxts_dec_short:
2730 add \$0x80, $len
2731 jz .Lxts_dec_done
2732___
2733 for ($i=0;$i<7;$i++) {
2734 $code.=<<___;
2735 pshufd \$0x13, $twtmp, $twres
2736 pxor $twtmp, $twtmp
2737 movdqa @XMM[7], @XMM[$i]
2738	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
2739 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2740 pand $twmask, $twres # isolate carry and residue
2741 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2742 pxor $twres, @XMM[7]
2743___
2744 $code.=<<___ if ($i>=1);
2745 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2746 cmp \$`0x10*$i`,$len
2747 je .Lxts_dec_$i
2748___
2749 $code.=<<___ if ($i>=2);
2750	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
2751___
2752 }
2753$code.=<<___;
2754 movdqu 0x60($inp), @XMM[8+6]
2755 pxor @XMM[8+5], @XMM[5]
2756 movdqa @XMM[7], 0x70(%rsp)
2757 lea 0x70($inp), $inp
2758 pxor @XMM[8+6], @XMM[6]
2759 lea 0x80(%rsp), %rax # pass key schedule
2760 mov %edx, %r10d # pass rounds
2761
2762 call _bsaes_decrypt8
2763
2764 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2765 pxor 0x10(%rsp), @XMM[1]
2766 movdqu @XMM[0], 0x00($out) # write output
2767 pxor 0x20(%rsp), @XMM[6]
2768 movdqu @XMM[1], 0x10($out)
2769 pxor 0x30(%rsp), @XMM[4]
2770 movdqu @XMM[6], 0x20($out)
2771 pxor 0x40(%rsp), @XMM[2]
2772 movdqu @XMM[4], 0x30($out)
2773 pxor 0x50(%rsp), @XMM[7]
2774 movdqu @XMM[2], 0x40($out)
2775 pxor 0x60(%rsp), @XMM[3]
2776 movdqu @XMM[7], 0x50($out)
2777 movdqu @XMM[3], 0x60($out)
2778 lea 0x70($out), $out
2779
2780 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2781 jmp .Lxts_dec_done
2782.align 16
2783.Lxts_dec_6:
2784 pxor @XMM[8+4], @XMM[4]
2785 lea 0x60($inp), $inp
2786 pxor @XMM[8+5], @XMM[5]
2787 lea 0x80(%rsp), %rax # pass key schedule
2788 mov %edx, %r10d # pass rounds
2789
2790 call _bsaes_decrypt8
2791
2792 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2793 pxor 0x10(%rsp), @XMM[1]
2794 movdqu @XMM[0], 0x00($out) # write output
2795 pxor 0x20(%rsp), @XMM[6]
2796 movdqu @XMM[1], 0x10($out)
2797 pxor 0x30(%rsp), @XMM[4]
2798 movdqu @XMM[6], 0x20($out)
2799 pxor 0x40(%rsp), @XMM[2]
2800 movdqu @XMM[4], 0x30($out)
2801 pxor 0x50(%rsp), @XMM[7]
2802 movdqu @XMM[2], 0x40($out)
2803 movdqu @XMM[7], 0x50($out)
2804 lea 0x60($out), $out
2805
2806 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2807 jmp .Lxts_dec_done
2808.align 16
2809.Lxts_dec_5:
2810 pxor @XMM[8+3], @XMM[3]
2811 lea 0x50($inp), $inp
2812 pxor @XMM[8+4], @XMM[4]
2813 lea 0x80(%rsp), %rax # pass key schedule
2814 mov %edx, %r10d # pass rounds
2815
2816 call _bsaes_decrypt8
2817
2818 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2819 pxor 0x10(%rsp), @XMM[1]
2820 movdqu @XMM[0], 0x00($out) # write output
2821 pxor 0x20(%rsp), @XMM[6]
2822 movdqu @XMM[1], 0x10($out)
2823 pxor 0x30(%rsp), @XMM[4]
2824 movdqu @XMM[6], 0x20($out)
2825 pxor 0x40(%rsp), @XMM[2]
2826 movdqu @XMM[4], 0x30($out)
2827 movdqu @XMM[2], 0x40($out)
2828 lea 0x50($out), $out
2829
2830 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2831 jmp .Lxts_dec_done
2832.align 16
2833.Lxts_dec_4:
2834 pxor @XMM[8+2], @XMM[2]
2835 lea 0x40($inp), $inp
2836 pxor @XMM[8+3], @XMM[3]
2837 lea 0x80(%rsp), %rax # pass key schedule
2838 mov %edx, %r10d # pass rounds
2839
2840 call _bsaes_decrypt8
2841
2842 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2843 pxor 0x10(%rsp), @XMM[1]
2844 movdqu @XMM[0], 0x00($out) # write output
2845 pxor 0x20(%rsp), @XMM[6]
2846 movdqu @XMM[1], 0x10($out)
2847 pxor 0x30(%rsp), @XMM[4]
2848 movdqu @XMM[6], 0x20($out)
2849 movdqu @XMM[4], 0x30($out)
2850 lea 0x40($out), $out
2851
2852 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2853 jmp .Lxts_dec_done
2854.align 16
2855.Lxts_dec_3:
2856 pxor @XMM[8+1], @XMM[1]
2857 lea 0x30($inp), $inp
2858 pxor @XMM[8+2], @XMM[2]
2859 lea 0x80(%rsp), %rax # pass key schedule
2860 mov %edx, %r10d # pass rounds
2861
2862 call _bsaes_decrypt8
2863
2864 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2865 pxor 0x10(%rsp), @XMM[1]
2866 movdqu @XMM[0], 0x00($out) # write output
2867 pxor 0x20(%rsp), @XMM[6]
2868 movdqu @XMM[1], 0x10($out)
2869 movdqu @XMM[6], 0x20($out)
2870 lea 0x30($out), $out
2871
2872 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2873 jmp .Lxts_dec_done
2874.align 16
2875.Lxts_dec_2:
2876 pxor @XMM[8+0], @XMM[0]
2877 lea 0x20($inp), $inp
2878 pxor @XMM[8+1], @XMM[1]
2879 lea 0x80(%rsp), %rax # pass key schedule
2880 mov %edx, %r10d # pass rounds
2881
2882 call _bsaes_decrypt8
2883
2884 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2885 pxor 0x10(%rsp), @XMM[1]
2886 movdqu @XMM[0], 0x00($out) # write output
2887 movdqu @XMM[1], 0x10($out)
2888 lea 0x20($out), $out
2889
2890 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2891 jmp .Lxts_dec_done
2892.align 16
2893.Lxts_dec_1:
2894 pxor @XMM[0], @XMM[8]
2895 lea 0x10($inp), $inp
2896 movdqa @XMM[8], 0x20(%rbp)
2897 lea 0x20(%rbp), $arg1
2898 lea 0x20(%rbp), $arg2
2899 lea ($key), $arg3
2900 call asm_AES_decrypt # doesn't touch %xmm
2901 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2902 #pxor @XMM[8], @XMM[0]
2903 #lea 0x80(%rsp), %rax # pass key schedule
2904 #mov %edx, %r10d # pass rounds
2905 #call _bsaes_decrypt8
2906 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2907 movdqu @XMM[0], 0x00($out) # write output
2908 lea 0x10($out), $out
2909
2910 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2911
2912.Lxts_dec_done:
2913 and \$15, %ebx
2914 jz .Lxts_dec_ret
2915
2916 pxor $twtmp, $twtmp
2917 movdqa .Lxts_magic(%rip), $twmask
2918 pcmpgtd @XMM[7], $twtmp
2919 pshufd \$0x13, $twtmp, $twres
2920 movdqa @XMM[7], @XMM[6]
2921 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2922 pand $twmask, $twres # isolate carry and residue
2923 movdqu ($inp), @XMM[0]
2924 pxor $twres, @XMM[7]
2925
2926 lea 0x20(%rbp), $arg1
2927 pxor @XMM[7], @XMM[0]
2928 lea 0x20(%rbp), $arg2
2929 movdqa @XMM[0], 0x20(%rbp)
2930 lea ($key), $arg3
2931 call asm_AES_decrypt # doesn't touch %xmm
2932 pxor 0x20(%rbp), @XMM[7]
2933 mov $out, %rdx
2934 movdqu @XMM[7], ($out)
2935
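# XTS decryption with stealing consumes the last two tweaks in swapped
# order: the final full ciphertext block was just decrypted under the
# later tweak (XMM7), and after the byte swap below the recombined block
# is decrypted under the earlier tweak saved in XMM6.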
2936.Lxts_dec_steal:
2937 movzb 16($inp), %eax
2938 movzb (%rdx), %ecx
2939 lea 1($inp), $inp
2940 mov %al, (%rdx)
2941 mov %cl, 16(%rdx)
2942 lea 1(%rdx), %rdx
2943 sub \$1,%ebx
2944 jnz .Lxts_dec_steal
2945
2946 movdqu ($out), @XMM[0]
2947 lea 0x20(%rbp), $arg1
2948 pxor @XMM[6], @XMM[0]
2949 lea 0x20(%rbp), $arg2
2950 movdqa @XMM[0], 0x20(%rbp)
2951 lea ($key), $arg3
2952 call asm_AES_decrypt # doesn't touch %xmm
2953 pxor 0x20(%rbp), @XMM[6]
2954 movdqu @XMM[6], ($out)
2955
2956.Lxts_dec_ret:
2957 lea (%rsp), %rax
2958 pxor %xmm0, %xmm0
2959.Lxts_dec_bzero: # wipe key schedule [if any]
2960 movdqa %xmm0, 0x00(%rax)
2961 movdqa %xmm0, 0x10(%rax)
2962 lea 0x20(%rax), %rax
2963 cmp %rax, %rbp
2964 ja .Lxts_dec_bzero
2965
2966 lea 0x78(%rbp),%rax
2967.cfi_def_cfa %rax,8
2968___
2969$code.=<<___ if ($win64);
2970 movaps 0x40(%rbp), %xmm6
2971 movaps 0x50(%rbp), %xmm7
2972 movaps 0x60(%rbp), %xmm8
2973 movaps 0x70(%rbp), %xmm9
2974 movaps 0x80(%rbp), %xmm10
2975 movaps 0x90(%rbp), %xmm11
2976 movaps 0xa0(%rbp), %xmm12
2977 movaps 0xb0(%rbp), %xmm13
2978 movaps 0xc0(%rbp), %xmm14
2979 movaps 0xd0(%rbp), %xmm15
2980 lea 0xa0(%rax), %rax
2981.Lxts_dec_tail:
2982___
2983$code.=<<___;
2984 mov -48(%rax), %r15
2985.cfi_restore %r15
2986 mov -40(%rax), %r14
2987.cfi_restore %r14
2988 mov -32(%rax), %r13
2989.cfi_restore %r13
2990 mov -24(%rax), %r12
2991.cfi_restore %r12
2992 mov -16(%rax), %rbx
2993.cfi_restore %rbx
2994 mov -8(%rax), %rbp
2995.cfi_restore %rbp
2996 lea (%rax), %rsp # restore %rsp
2997.cfi_def_cfa_register %rsp
2998.Lxts_dec_epilogue:
2999 ret
3000.cfi_endproc
3001.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
3002___
3003}
3004$code.=<<___;
3005.type _bsaes_const,\@object
3006.align 64
3007_bsaes_const:
3008.LM0ISR: # InvShiftRows constants
3009 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
3010.LISRM0:
3011 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
3012.LISR:
3013 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
3014.LBS0: # bit-slice constants
3015 .quad 0x5555555555555555, 0x5555555555555555
3016.LBS1:
3017 .quad 0x3333333333333333, 0x3333333333333333
3018.LBS2:
3019 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
3020.LSR: # shiftrows constants
3021 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
3022.LSRM0:
3023 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
3024.LM0SR:
3025 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
3026.LSWPUP: # byte-swap upper dword
3027 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
3028.LSWPUPM0SR:
3029 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
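# .LSWPUPM0SR is equivalent to shuffling with .LSWPUP and then .LM0SR: a
# single pshufb both byte-swaps the big-endian counter word (upper dword)
# and applies the bit-slice input permutation, which is what lets the CTR
# code borrow the _bsaes_encrypt8 prologue.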
3030.LADD1: # counter increment constants
3031 .quad 0x0000000000000000, 0x0000000100000000
3032.LADD2:
3033 .quad 0x0000000000000000, 0x0000000200000000
3034.LADD3:
3035 .quad 0x0000000000000000, 0x0000000300000000
3036.LADD4:
3037 .quad 0x0000000000000000, 0x0000000400000000
3038.LADD5:
3039 .quad 0x0000000000000000, 0x0000000500000000
3040.LADD6:
3041 .quad 0x0000000000000000, 0x0000000600000000
3042.LADD7:
3043 .quad 0x0000000000000000, 0x0000000700000000
3044.LADD8:
3045 .quad 0x0000000000000000, 0x0000000800000000
3046.Lxts_magic:
3047 .long 0x87,0,1,0
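# 0x87 encodes x^7 + x^2 + x + 1, the low terms of the GF(2^128) reduction
# polynomial; the lone 1 in the third dword propagates the bit-63 carry
# into bit 64 during the tweak update.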
3048.Lmasks:
3049 .quad 0x0101010101010101, 0x0101010101010101
3050 .quad 0x0202020202020202, 0x0202020202020202
3051 .quad 0x0404040404040404, 0x0404040404040404
3052 .quad 0x0808080808080808, 0x0808080808080808
3053.LM0:
3054 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
3055.L63:
3056 .quad 0x6363636363636363, 0x6363636363636363
3057.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
3058.align 64
3059.size _bsaes_const,.-_bsaes_const
3060___
3061
3062# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3063# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3064if ($win64) {
3065$rec="%rcx";
3066$frame="%rdx";
3067$context="%r8";
3068$disp="%r9";
3069
3070$code.=<<___;
3071.extern __imp_RtlVirtualUnwind
3072.type se_handler,\@abi-omnipotent
3073.align 16
3074se_handler:
3075 push %rsi
3076 push %rdi
3077 push %rbx
3078 push %rbp
3079 push %r12
3080 push %r13
3081 push %r14
3082 push %r15
3083 pushfq
3084 sub \$64,%rsp
3085
3086 mov 120($context),%rax # pull context->Rax
3087 mov 248($context),%rbx # pull context->Rip
3088
3089 mov 8($disp),%rsi # disp->ImageBase
3090 mov 56($disp),%r11 # disp->HandlerData
3091
3092 mov 0(%r11),%r10d # HandlerData[0]
3093 lea (%rsi,%r10),%r10 # prologue label
3094 cmp %r10,%rbx # context->Rip<=prologue label
3095 jbe .Lin_prologue
3096
3097 mov 4(%r11),%r10d # HandlerData[1]
3098 lea (%rsi,%r10),%r10 # epilogue label
3099 cmp %r10,%rbx # context->Rip>=epilogue label
3100 jae .Lin_prologue
3101
3102 mov 8(%r11),%r10d # HandlerData[2]
3103	lea	(%rsi,%r10),%r10	# tail label
3104 cmp %r10,%rbx # context->Rip>=tail label
3105 jae .Lin_tail
3106
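# Falling through: Rip lies between the prologue and tail labels, so the
# full frame is live and both the non-volatile GPRs and %xmm6-%xmm15 must
# be recovered from the stack frame.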
3107 mov 160($context),%rax # pull context->Rbp
3108
3109 lea 0x40(%rax),%rsi # %xmm save area
3110 lea 512($context),%rdi # &context.Xmm6
3111 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
3112 .long 0xa548f3fc # cld; rep movsq
3113 lea 0xa0+0x78(%rax),%rax # adjust stack pointer
3114
3115.Lin_tail:
3116 mov -48(%rax),%rbp
3117 mov -40(%rax),%rbx
3118 mov -32(%rax),%r12
3119 mov -24(%rax),%r13
3120 mov -16(%rax),%r14
3121 mov -8(%rax),%r15
3122 mov %rbx,144($context) # restore context->Rbx
3123 mov %rbp,160($context) # restore context->Rbp
3124 mov %r12,216($context) # restore context->R12
3125 mov %r13,224($context) # restore context->R13
3126 mov %r14,232($context) # restore context->R14
3127 mov %r15,240($context) # restore context->R15
3128
3129.Lin_prologue:
3130 mov %rax,152($context) # restore context->Rsp
3131
3132 mov 40($disp),%rdi # disp->ContextRecord
3133 mov $context,%rsi # context
3134 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3135 .long 0xa548f3fc # cld; rep movsq
3136
3137 mov $disp,%rsi
3138 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3139 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3140 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3141 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3142 mov 40(%rsi),%r10 # disp->ContextRecord
3143 lea 56(%rsi),%r11 # &disp->HandlerData
3144 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3145 mov %r10,32(%rsp) # arg5
3146 mov %r11,40(%rsp) # arg6
3147 mov %r12,48(%rsp) # arg7
3148 mov %rcx,56(%rsp) # arg8, (NULL)
3149 call *__imp_RtlVirtualUnwind(%rip)
3150
3151 mov \$1,%eax # ExceptionContinueSearch
3152 add \$64,%rsp
3153 popfq
3154 pop %r15
3155 pop %r14
3156 pop %r13
3157 pop %r12
3158 pop %rbp
3159 pop %rbx
3160 pop %rdi
3161 pop %rsi
3162 ret
3163.size se_handler,.-se_handler
3164
3165.section .pdata
3166.align 4
3167___
3168$code.=<<___ if ($ecb);
3169 .rva .Lecb_enc_prologue
3170 .rva .Lecb_enc_epilogue
3171 .rva .Lecb_enc_info
3172
3173 .rva .Lecb_dec_prologue
3174 .rva .Lecb_dec_epilogue
3175 .rva .Lecb_dec_info
3176___
3177$code.=<<___;
3178 .rva .Lcbc_dec_prologue
3179 .rva .Lcbc_dec_epilogue
3180 .rva .Lcbc_dec_info
3181
3182 .rva .Lctr_enc_prologue
3183 .rva .Lctr_enc_epilogue
3184 .rva .Lctr_enc_info
3185
3186 .rva .Lxts_enc_prologue
3187 .rva .Lxts_enc_epilogue
3188 .rva .Lxts_enc_info
3189
3190 .rva .Lxts_dec_prologue
3191 .rva .Lxts_dec_epilogue
3192 .rva .Lxts_dec_info
3193
3194.section .xdata
3195.align 8
3196___
3197$code.=<<___ if ($ecb);
3198.Lecb_enc_info:
3199 .byte 9,0,0,0
3200 .rva se_handler
3201 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3202 .rva .Lecb_enc_tail
3203 .long 0
3204.Lecb_dec_info:
3205 .byte 9,0,0,0
3206 .rva se_handler
3207 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3208 .rva .Lecb_dec_tail
3209 .long 0
3210___
3211$code.=<<___;
3212.Lcbc_dec_info:
3213 .byte 9,0,0,0
3214 .rva se_handler
3215 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3216 .rva .Lcbc_dec_tail
3217 .long 0
3218.Lctr_enc_info:
3219 .byte 9,0,0,0
3220 .rva se_handler
3221 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3222 .rva .Lctr_enc_tail
3223 .long 0
3224.Lxts_enc_info:
3225 .byte 9,0,0,0
3226 .rva se_handler
3227 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3228 .rva .Lxts_enc_tail
3229 .long 0
3230.Lxts_dec_info:
3231 .byte 9,0,0,0
3232 .rva se_handler
3233 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3234 .rva .Lxts_dec_tail
3235 .long 0
3236___
3237}
3238
3239$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3240
3241print $code;
3242
3243close STDOUT or die "error closing STDOUT: $!";