VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1f/crypto/aes/asm/aesni-x86.pl@ 83531

Last change on this file since 83531 was 83531, checked in by vboxsync, 5 years ago

setting svn:sync-process=export for openssl-1.1.1f, all files except tests

File size: 99.8 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20# details].
21#
22# Performance.
23#
24# To start with see corresponding paragraph in aesni-x86_64.pl...
25# Instead of filling table similar to one found there I've chosen to
26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27# The simplified table below represents 32-bit performance relative
28# to 64-bit one in every given point. Ratios vary for different
29# encryption modes, therefore interval values.
30#
31# 16-byte 64-byte 256-byte 1-KB 8-KB
32# 53-67% 67-84% 91-94% 95-98% 97-99.5%
33#
34# Lower ratios for smaller block sizes are perfectly understandable,
35# because function call overhead is higher in 32-bit mode. Largest
36# 8-KB block performance is virtually same: 32-bit code is less than
37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39# January 2011
40#
41# See aesni-x86_64.pl for details. Unlike x86_64 version this module
42# interleaves at most 6 aes[enc|dec] instructions, because there are
43# not enough registers for 8x interleave [which should be optimal for
44# Sandy Bridge]. Actually, performance results for 6x interleave
45# factor presented in aesni-x86_64.pl (except for CTR) are for this
46# module.
47
48# April 2011
49#
50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
52
53# November 2015
54#
55# Add aesni_ocb_[en|de]crypt.
56
57######################################################################
58# Current large-block performance in cycles per byte processed with
59# 128-bit key (less is better).
60#
61# CBC en-/decrypt CTR XTS ECB OCB
62# Westmere 3.77/1.37 1.37 1.52 1.27
63# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
64# Haswell 4.44/0.80 0.97 1.03 0.72 0.76
65# Skylake 2.68/0.65 0.65 0.66 0.64 0.66
66# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
67# Goldmont 3.84/1.39 1.39 1.63 1.31 1.70
68# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
69
# Symbol prefix of the generated public entry points.  If $PREFIX is set
# to "AES", the script generates drop-in replacement for
# crypto/aes/asm/aes-586.pl:-)
$PREFIX="aesni";
$inline=1;              # inline _aesni_[en|de]crypt

# Make the perlasm framework reachable relative to this script's own
# directory and load it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file and STDOUT is
# aliased to it.  Use 3-argument open so the name is never interpreted as
# a mode, and fail loudly rather than silently dropping the generated code.
$output = pop;
open(OUT,">",$output)
    or die "can't open output file '".(defined($output)?$output:"")."': $!";
*STDOUT=*OUT;

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Round keys are always fetched with movups, whatever $PREFIX is; the
# indirection through $movekey is kept because the generators call it.
$movekey=\&movups;

# General-purpose register assignment.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";         # backup copy for $rounds
$key_="ebp";            # backup copy for $key

# XMM register assignment; $in1/$in0/$ivec are aliases that share
# registers with $inout3/$inout4/$inout5.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";         $in1="xmm5";
$inout4="xmm6";         $in0="xmm6";
$inout5="xmm7";         $ivec="xmm7";
107
108# AESNI extension
# Emit "aeskeygenassist xmm<dst>,xmm<src>,<imm>" as raw opcode bytes
# (66 0F 3A DF /r ib) via data_byte().  Nothing is emitted unless both
# operands name one of xmm0..xmm7.
sub aeskeygenassist
{ my ($dst,$src,$imm)=@_;
  my ($reg,$rm)=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    return unless defined($reg);
    # ModR/M: mod=11 (register direct), reg=destination, r/m=source
    &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($reg<<3)|$rm,$imm);
}
# Shared encoder for the two-operand AES-NI instructions: emits
# 66 0F 38 <opcodelet> /r via data_byte().  Nothing is emitted unless both
# operands name one of xmm0..xmm7.
sub aescommon
{ my ($opcodelet,$dst,$src)=@_;
  my ($reg,$rm)=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    return unless defined($reg);
    # ModR/M: mod=11 (register direct), reg=destination, r/m=source
    &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($reg<<3)|$rm);
}
# One thin wrapper per instruction; each only supplies the final opcode
# byte and forwards its (dst,src) operands to aescommon().
sub aesimc      { &aescommon(0xdb,@_); }
sub aesenc      { &aescommon(0xdc,@_); }
sub aesenclast  { &aescommon(0xdd,@_); }
sub aesdec      { &aescommon(0xde,@_); }
sub aesdeclast  { &aescommon(0xdf,@_); }
124
125
# Inline version of internal aesni_[en|de]crypt1.
#
#   $p    "enc" or "dec"; selects the aes${p}/aes${p}last instructions
#   $blk  register holding the block, defaults to $inout0
#   $iv   optional register; when given it is first XORed with round
#         key 0 and then XORed into $blk in place of the plain whitening
#
# The emitted code advances $key and counts $rounds down to zero.
{ my $serial;   # bumped on every expansion so that loop labels stay unique
sub aesni_inline_generate1
{ my ($p,$blk,$iv)=@_;
  $blk=$inout0 unless defined($blk);
  $serial++;
  my $loop="${p}1_loop_$serial";

    &$movekey   ($rndkey0,&QWP(0,$key));
    &$movekey   ($rndkey1,&QWP(16,$key));
    &xorps      ($iv,$rndkey0)                  if defined($iv);
    &lea        ($key,&DWP(32,$key));
    &xorps      ($blk,defined($iv)?$iv:$rndkey0);
    &set_label($loop);
        &{"aes${p}"}    ($blk,$rndkey1);
        &dec            ($rounds);
        &$movekey       ($rndkey1,&QWP(0,$key));
        &lea            ($key,&DWP(16,$key));
    &jnz        (&label($loop));
    &{"aes${p}last"}    ($blk,$rndkey1);
}}
146
# Emit the private subroutine _aesni_${p}rypt1 as a fully unrolled loop.
#
#   $p    "enc" or "dec"
#   $blk  register holding the block, defaults to $inout0
#
# $rounds is compared against 11: below jumps to the "${p}128" entry,
# equal jumps to "${p}192", above falls through the four extra rounds.
sub aesni_generate1     # fully unrolled loop
{ my ($p,$blk)=@_;
  $blk=$inout0 unless defined($blk);
  my $aes="aes${p}";
  # one round with the key held in $rk, then reload $rk from $off($key)
  my $round=sub
  { my ($rk,$off)=@_;
    &{$aes}     ($blk,$rk);
    &$movekey   ($rk,&QWP($off,$key));
  };

    &function_begin_B("_aesni_${p}rypt1");
        &movups         ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(0x10,$key));
        &xorps          ($blk,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0x20,$key));
        &lea            ($key,&DWP(0x30,$key));
        &cmp            ($rounds,11);
        &jb             (&label("${p}128"));
        &lea            ($key,&DWP(0x20,$key));
        &je             (&label("${p}192"));
        &lea            ($key,&DWP(0x20,$key));
        $round->($rndkey1,-0x40);
        $round->($rndkey0,-0x30);
    &set_label("${p}192");
        $round->($rndkey1,-0x20);
        $round->($rndkey0,-0x10);
    &set_label("${p}128");
        foreach my $off (0,0x20,0x40,0x60)
        {   $round->($rndkey1,$off);
            $round->($rndkey0,$off+0x10);
        }
        &{$aes}         ($blk,$rndkey1);
        &{"${aes}last"} ($blk,$rndkey0);
        &ret();
    &function_end_B("_aesni_${p}rypt1");
}
192
193
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block encryption.  The block travels through $inout0; the round
# key registers and $inout0 are zeroed before returning.
&aesni_generate1("enc") unless ($inline);
&function_begin_B("${PREFIX}_encrypt");
        &mov    ("eax",&wparam(0));             # inp
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));
        &mov    ($rounds,&DWP(240,$key));
        &mov    ("eax",&wparam(1));             # out
        $inline ? &aesni_inline_generate1("enc")
                : &call("_aesni_encrypt1");
        foreach my $rk ($rndkey0,$rndkey1)      # clear register bank
        {   &pxor       ($rk,$rk);      }
        &movups (&QWP(0,"eax"),$inout0);
        &pxor   ($inout0,$inout0);
        &ret    ();
&function_end_B("${PREFIX}_encrypt");
212
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block decryption.  The block travels through $inout0; the round
# key registers and $inout0 are zeroed before returning.
&aesni_generate1("dec") unless ($inline);
&function_begin_B("${PREFIX}_decrypt");
        &mov    ("eax",&wparam(0));             # inp
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));
        &mov    ($rounds,&DWP(240,$key));
        &mov    ("eax",&wparam(1));             # out
        $inline ? &aesni_inline_generate1("dec")
                : &call("_aesni_decrypt1");
        foreach my $rk ($rndkey0,$rndkey1)      # clear register bank
        {   &pxor       ($rk,$rk);      }
        &movups (&QWP(0,"eax"),$inout0);
        &pxor   ($inout0,$inout0);
        &ret    ();
&function_end_B("${PREFIX}_decrypt");
231
232# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
233# factor. Why were 3x subroutines originally used in loops? Even though
234# aes[enc|dec] latency was originally 6, it could be scheduled only
235# every *2nd* cycle. Thus 3x interleave was the one providing optimal
236# utilization, i.e. when subroutine's throughput is virtually the same as
237# that of non-interleaved subroutine [for number of input blocks up to 3].
238# This is why it originally made no sense to implement a 2x subroutine.
239# But times change and it became appropriate to spend extra 192 bytes
240# on a 2x subroutine on account of Atom Silvermont. For processors that
241# can schedule aes[enc|dec] every cycle, the optimal interleave factor
242# equals the corresponding instruction's latency. 8x is optimal for
243# * Bridge, but it's unfeasible to accommodate such an implementation
244# in XMM registers addressable in 32-bit mode and therefore a maximum
245# of 6x is used instead...
246
# Emit the private subroutine _aesni_${p}rypt2 ($p is "enc" or "dec"),
# which processes $inout0 and $inout1 with interleaved rounds.  $rounds is
# turned into a negative byte offset that the loop counts up to zero in
# steps of 32, while $key is moved 32+16*rounds bytes forward.
sub aesni_generate2
{ my $p=shift;
  my ($aes,$aeslast)=("aes${p}","aes${p}last");
  my $loop="${p}2_loop";
  # apply instruction $op with round key $rk to both blocks, in order
  my $both=sub
  { my ($op,$rk)=@_;
    foreach my $blk ($inout0,$inout1) { &{$op}($blk,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt2");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &add            ($rounds,16);

    &set_label($loop);
        $both->($aes,$rndkey1);
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        $both->($aes,$rndkey0);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label($loop));
        $both->($aes,$rndkey1);
        $both->($aeslast,$rndkey0);
        &ret();
    &function_end_B("_aesni_${p}rypt2");
}
277
# Emit the private subroutine _aesni_${p}rypt3 ($p is "enc" or "dec"),
# which processes $inout0..$inout2 with interleaved rounds.  $rounds is
# turned into a negative byte offset that the loop counts up to zero in
# steps of 32, while $key is moved 32+16*rounds bytes forward.
sub aesni_generate3
{ my $p=shift;
  my ($aes,$aeslast)=("aes${p}","aes${p}last");
  my $loop="${p}3_loop";
  # apply instruction $op with round key $rk to all three blocks, in order
  my $all=sub
  { my ($op,$rk)=@_;
    foreach my $blk ($inout0,$inout1,$inout2) { &{$op}($blk,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt3");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &add            ($rounds,16);

    &set_label($loop);
        $all->($aes,$rndkey1);
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        $all->($aes,$rndkey0);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label($loop));
        $all->($aes,$rndkey1);
        $all->($aeslast,$rndkey0);
        &ret();
    &function_end_B("_aesni_${p}rypt3");
}
313
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits the private subroutine _aesni_${p}rypt4 ($p is "enc" or "dec"),
# which processes $inout0..$inout3 with interleaved rounds.
sub aesni_generate4
{ my $p=shift;
  my ($aes,$aeslast)=("aes${p}","aes${p}last");
  my $loop="${p}4_loop";
  # apply instruction $op with round key $rk to all four blocks, in order
  my $all=sub
  { my ($op,$rk)=@_;
    foreach my $blk ($inout0,$inout1,$inout2,$inout3) { &{$op}($blk,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt4");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(16,$key));
        &shl            ($rounds,4);
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &pxor           ($inout3,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        # raw bytes 0F 1F 40 00; presumably a 4-byte NOP used as padding
        # in front of the loop -- TODO confirm
        &data_byte      (0x0f,0x1f,0x40,0x00);
        &add            ($rounds,16);

    &set_label($loop);
        $all->($aes,$rndkey1);
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        $all->($aes,$rndkey0);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label($loop));

        $all->($aes,$rndkey1);
        $all->($aeslast,$rndkey0);
        &ret();
    &function_end_B("_aesni_${p}rypt4");
}
360
# Emit the private subroutine _aesni_${p}rypt6 ($p is "enc" or "dec"),
# which processes $inout0..$inout5 with interleaved rounds.  The first
# round is woven into the whitening prologue, which is why execution
# joins the loop half-way at the "_inner" label.  "_aesni_${p}rypt6_enter"
# is declared static so that code outside this subroutine can call
# straight into the loop after doing that prologue itself.
sub aesni_generate6
{ my $p=shift;
  my ($aes,$aeslast)=("aes${p}","aes${p}last");
  my $loop ="${p}6_loop";
  my $inner="_aesni_${p}rypt6_inner";
  my $enter="_aesni_${p}rypt6_enter";
  my @lower=($inout0,$inout1,$inout2);
  my @upper=($inout3,$inout4,$inout5);
  # apply instruction $op with round key $rk to the listed blocks, in order
  my $each=sub
  { my ($op,$rk,@regs)=@_;
    foreach my $blk (@regs) { &{$op}($blk,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt6");
    &static_label($enter);
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);     # pxor does better here
        &pxor           ($inout2,$rndkey0);
        &{$aes}         ($inout0,$rndkey1);
        &pxor           ($inout3,$rndkey0);
        &pxor           ($inout4,$rndkey0);
        &{$aes}         ($inout1,$rndkey1);
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &{$aes}         ($inout2,$rndkey1);
        &pxor           ($inout5,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0,$key,$rounds));
        &add            ($rounds,16);
        &jmp            (&label($inner));

    &set_label($loop,16);
        $each->($aes,$rndkey1,@lower);
    &set_label($inner);
        $each->($aes,$rndkey1,@upper);
    &set_label($enter);
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        $each->($aes,$rndkey0,@lower,@upper);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label($loop));

        $each->($aes,$rndkey1,@lower,@upper);
        $each->($aeslast,$rndkey0,@lower,@upper);
        &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the 2x, 3x, 4x and 6x subroutines in that order.  The decrypt
# flavour is always generated; the encrypt flavour only when $PREFIX is
# "aesni".
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
                      \&aesni_generate4,\&aesni_generate6)
{   $generate->("enc") if ($PREFIX eq "aesni");
    $generate->("dec");
}
427
428
429if ($PREFIX eq "aesni") {
430######################################################################
431# void aesni_ecb_encrypt (const void *in, void *out,
432# size_t length, const AES_KEY *key,
433# int enc);
# Emits aesni_ecb_encrypt(in, out, length, key, enc).
#
# length is rounded down to a multiple of 16 and the function returns at
# once if nothing is left.  A zero enc argument selects the decrypt path,
# anything else the encrypt path.  Both paths have the same shape and are
# produced by one helper: 6 blocks per iteration while at least 0x60
# bytes remain, then a tail of 1..5 blocks dispatched to the 1x/2x/3x/4x/6x
# subroutines.  All eight XMM registers are cleared on exit.
{
# Emit everything between the direction test and "ecb_ret" for one
# direction.  $p is "enc" or "dec" and is spliced into every label and
# subroutine name.  The final 4-block case is left without a trailing
# jump; the caller decides what follows it.
my $ecb_path=sub
{ my $p=shift;
  my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
  # store the first $n blocks to consecutive 16-byte slots at $out
  my $store=sub
  { my $n=shift;
    foreach my $i (0..$n-1)
    {   &movups (&QWP(0x10*$i,$out),$blk[$i]);  }
  };

        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &cmp    ($len,0x60);
        &jb     (&label("ecb_${p}_tail"));

        foreach my $i (0..5)
        {   &movdqu     ($blk[$i],&QWP(0x10*$i,$inp));  }
        &lea    ($inp,&DWP(0x60,$inp));
        &sub    ($len,0x60);
        &jmp    (&label("ecb_${p}_loop6_enter"));

&set_label("ecb_${p}_loop6",16);
        # write the previous six results while loading the next six inputs
        foreach my $i (0..4)
        {   &movups     (&QWP(0x10*$i,$out),$blk[$i]);
            &movdqu     ($blk[$i],&QWP(0x10*$i,$inp));
        }
        &movups (&QWP(0x50,$out),$inout5);
        &lea    ($out,&DWP(0x60,$out));
        &movdqu ($inout5,&QWP(0x50,$inp));
        &lea    ($inp,&DWP(0x60,$inp));
&set_label("ecb_${p}_loop6_enter");

        &call   ("_aesni_${p}rypt6");

        &mov    ($key,$key_);           # restore $key
        &mov    ($rounds,$rounds_);     # restore $rounds
        &sub    ($len,0x60);
        &jnc    (&label("ecb_${p}_loop6"));

        $store->(6);
        &lea    ($out,&DWP(0x60,$out));
        &add    ($len,0x60);
        &jz     (&label("ecb_ret"));

&set_label("ecb_${p}_tail");
        # each cmp serves two branches: jb for "below", je for "equal"
        &movups ($inout0,&QWP(0,$inp));
        &cmp    ($len,0x20);
        &jb     (&label("ecb_${p}_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &je     (&label("ecb_${p}_two"));
        &movups ($inout2,&QWP(0x20,$inp));
        &cmp    ($len,0x40);
        &jb     (&label("ecb_${p}_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &je     (&label("ecb_${p}_four"));
        &movups ($inout4,&QWP(0x40,$inp));
        &xorps  ($inout5,$inout5);      # five blocks: sixth lane is zero
        &call   ("_aesni_${p}rypt6");
        $store->(5);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_${p}_one",16);
        if ($inline)
        {   &aesni_inline_generate1($p);        }
        else
        {   &call       ("_aesni_${p}rypt1");   }
        $store->(1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_${p}_two",16);
        &call   ("_aesni_${p}rypt2");
        $store->(2);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_${p}_three",16);
        &call   ("_aesni_${p}rypt3");
        $store->(3);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_${p}_four",16);
        &call   ("_aesni_${p}rypt4");
        $store->(4);
};

&function_begin("aesni_ecb_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds_,&wparam(4));  # enc flag, tested below
        &and    ($len,-16);
        &jz     (&label("ecb_ret"));
        &mov    ($rounds,&DWP(240,$key));
        &test   ($rounds_,$rounds_);
        &jz     (&label("ecb_decrypt"));

        $ecb_path->("enc");
        &jmp    (&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
        $ecb_path->("dec");             # falls through into ecb_ret

&set_label("ecb_ret");
        foreach my $i (0..7)            # clear register bank
        {   &pxor       ("xmm$i","xmm$i");      }
&function_end("aesni_ecb_encrypt");
}
654
655
656######################################################################
657# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
658# size_t blocks, const AES_KEY *key,
659# const char *ivec,char *cmac);
660#
661# Handles only complete blocks, operates on 64-bit counter and
662# does not update *ivec! Nor does it finalize CMAC value
663# (see engine/eng_aesni.c for details)
664#
665{ my $cmac=$inout1;
# Emits aesni_ccm64_encrypt_blocks (in, out, blocks, key, ivec, cmac).
#
# Every pass of the outer loop runs two AES encryptions in lockstep with
# the same round keys: the counter block in $inout0 and the running MAC
# in $cmac, into which the input block has been XORed.  The encrypted
# counter is then XORed with the input block to form the output block.
#
# Aligned stack frame: 0 = pshufb byte-swap mask, 16 = counter increment
# (1,0,0,0 as dwords), 48 = caller's %esp.
666&function_begin("aesni_ccm64_encrypt_blocks");
667 &mov ($inp,&wparam(0));
668 &mov ($out,&wparam(1));
669 &mov ($len,&wparam(2));
670 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily carry the ivec and cmac pointers
671 &mov ($rounds_,&wparam(4));
672 &mov ($rounds,&wparam(5));
# $key_ temporarily carries the caller's %esp until it is parked at 48(%esp)
673 &mov ($key_,"esp");
674 &sub ("esp",60);
675 &and ("esp",-16); # align stack
676 &mov (&DWP(48,"esp"),$key_);
677
678 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
679 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
680 &mov ($rounds,&DWP(240,$key));
681
682 # compose byte-swap control mask for pshufb on stack
683 &mov (&DWP(0,"esp"),0x0c0d0e0f);
684 &mov (&DWP(4,"esp"),0x08090a0b);
685 &mov (&DWP(8,"esp"),0x04050607);
686 &mov (&DWP(12,"esp"),0x00010203);
687
688 # compose counter increment vector on stack
689 &mov ($rounds_,1);
690 &xor ($key_,$key_);
691 &mov (&DWP(16,"esp"),$rounds_);
692 &mov (&DWP(20,"esp"),$key_);
693 &mov (&DWP(24,"esp"),$key_);
694 &mov (&DWP(28,"esp"),$key_);
695
# From here on: $key_ = start of the key schedule, $key = 32+16*rounds
# bytes past it, $rounds_ = 16-16*rounds, i.e. the negative byte offset
# from which the inner loop counts $rounds up to zero.
696 &shl ($rounds,4);
697 &mov ($rounds_,16);
698 &lea ($key_,&DWP(0,$key));
699 &movdqa ($inout3,&QWP(0,"esp"));
700 &movdqa ($inout0,$ivec);
701 &lea ($key,&DWP(32,$key,$rounds));
702 &sub ($rounds_,$rounds);
# $inout0 keeps the ivec as loaded for the first block; $ivec itself is
# kept byte-swapped so that paddq can increment it
703 &pshufb ($ivec,$inout3);
704
705&set_label("ccm64_enc_outer");
706 &$movekey ($rndkey0,&QWP(0,$key_));
707 &mov ($rounds,$rounds_);
708 &movups ($in0,&QWP(0,$inp));
709
710 &xorps ($inout0,$rndkey0);
711 &$movekey ($rndkey1,&QWP(16,$key_));
# fold round key 0 and the input block into $cmac with a single xor
712 &xorps ($rndkey0,$in0);
713 &xorps ($cmac,$rndkey0); # cmac^=inp
714 &$movekey ($rndkey0,&QWP(32,$key_));
715
716&set_label("ccm64_enc2_loop");
717 &aesenc ($inout0,$rndkey1);
718 &aesenc ($cmac,$rndkey1);
719 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
720 &add ($rounds,32);
721 &aesenc ($inout0,$rndkey0);
722 &aesenc ($cmac,$rndkey0);
723 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
724 &jnz (&label("ccm64_enc2_loop"));
725 &aesenc ($inout0,$rndkey1);
726 &aesenc ($cmac,$rndkey1);
727 &paddq ($ivec,&QWP(16,"esp"));
# the jnz at the end of the outer loop consumes the flags set by this
# dec; only SSE, AES and lea instructions are emitted in between
728 &dec ($len);
729 &aesenclast ($inout0,$rndkey0);
730 &aesenclast ($cmac,$rndkey0);
731
732 &lea ($inp,&DWP(16,$inp));
733 &xorps ($in0,$inout0); # inp^=E(ivec)
734 &movdqa ($inout0,$ivec);
735 &movups (&QWP(0,$out),$in0); # save output
# swap the incremented counter back into the byte order of the caller's ivec
736 &pshufb ($inout0,$inout3);
737 &lea ($out,&DWP(16,$out));
738 &jnz (&label("ccm64_enc_outer"));
739
# restore the caller's %esp, then write the MAC back through the cmac pointer
740 &mov ("esp",&DWP(48,"esp"));
741 &mov ($out,&wparam(5));
742 &movups (&QWP(0,$out),$cmac);
743
744 &pxor ("xmm0","xmm0"); # clear register bank
745 &pxor ("xmm1","xmm1");
746 &pxor ("xmm2","xmm2");
747 &pxor ("xmm3","xmm3");
748 &pxor ("xmm4","xmm4");
749 &pxor ("xmm5","xmm5");
750 &pxor ("xmm6","xmm6");
751 &pxor ("xmm7","xmm7");
752&function_end("aesni_ccm64_encrypt_blocks");
753
# Emits aesni_ccm64_decrypt_blocks (in, out, blocks, key, ivec, cmac).
#
# The first counter block is encrypted on its own before the loop.  Each
# pass of the outer loop then XORs the pending encrypted counter with the
# pending input block, stores the result and, unless that was the last
# block, encrypts the next counter block and the MAC (with the block just
# produced XORed in) in lockstep.  After the last block the MAC update is
# done on its own at "ccm64_dec_break".
#
# Aligned stack frame: 0 = pshufb byte-swap mask, 16 = counter increment
# (1,0,0,0 as dwords), 48 = caller's %esp.
754&function_begin("aesni_ccm64_decrypt_blocks");
755 &mov ($inp,&wparam(0));
756 &mov ($out,&wparam(1));
757 &mov ($len,&wparam(2));
758 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily carry the ivec and cmac pointers
759 &mov ($rounds_,&wparam(4));
760 &mov ($rounds,&wparam(5));
# $key_ temporarily carries the caller's %esp until it is parked at 48(%esp)
761 &mov ($key_,"esp");
762 &sub ("esp",60);
763 &and ("esp",-16); # align stack
764 &mov (&DWP(48,"esp"),$key_);
765
766 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
767 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
768 &mov ($rounds,&DWP(240,$key));
769
770 # compose byte-swap control mask for pshufb on stack
771 &mov (&DWP(0,"esp"),0x0c0d0e0f);
772 &mov (&DWP(4,"esp"),0x08090a0b);
773 &mov (&DWP(8,"esp"),0x04050607);
774 &mov (&DWP(12,"esp"),0x00010203);
775
776 # compose counter increment vector on stack
777 &mov ($rounds_,1);
778 &xor ($key_,$key_);
779 &mov (&DWP(16,"esp"),$rounds_);
780 &mov (&DWP(20,"esp"),$key_);
781 &mov (&DWP(24,"esp"),$key_);
782 &mov (&DWP(28,"esp"),$key_);
783
784 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
785 &movdqa ($inout0,$ivec);
786
# keep copies: the single-block encryption below is handed $key and $rounds
787 &mov ($key_,$key);
788 &mov ($rounds_,$rounds);
789
790 &pshufb ($ivec,$inout3);
# encrypt the first counter block ($inout0) ahead of the loop
791 if ($inline)
792 { &aesni_inline_generate1("enc"); }
793 else
794 { &call ("_aesni_encrypt1"); }
# From here on: $key_ = start of the key schedule, $key = 32+16*rounds
# bytes past it, $rounds_ = 16-16*rounds, i.e. the negative byte offset
# from which the inner loop counts $rounds up to zero.
795 &shl ($rounds_,4);
796 &mov ($rounds,16);
797 &movups ($in0,&QWP(0,$inp)); # load inp
798 &paddq ($ivec,&QWP(16,"esp"));
# NOTE(review): this lea uses QWP where the other lea operands in this
# file use DWP; presumably equivalent for an address-only operand --
# confirm against x86asm.pl
799 &lea ($inp,&QWP(16,$inp));
800 &sub ($rounds,$rounds_);
801 &lea ($key,&DWP(32,$key_,$rounds_));
802 &mov ($rounds_,$rounds);
803 &jmp (&label("ccm64_dec_outer"));
804
805&set_label("ccm64_dec_outer",16);
806 &xorps ($in0,$inout0); # inp ^= E(ivec)
807 &movdqa ($inout0,$ivec);
808 &movups (&QWP(0,$out),$in0); # save output
809 &lea ($out,&DWP(16,$out));
810 &pshufb ($inout0,$inout3);
811
812 &sub ($len,1);
813 &jz (&label("ccm64_dec_break"));
814
815 &$movekey ($rndkey0,&QWP(0,$key_));
816 &mov ($rounds,$rounds_);
817 &$movekey ($rndkey1,&QWP(16,$key_));
# round key 0 reaches $cmac through $in0, which still holds the block
# that was just stored
818 &xorps ($in0,$rndkey0);
819 &xorps ($inout0,$rndkey0);
820 &xorps ($cmac,$in0); # cmac^=out
821 &$movekey ($rndkey0,&QWP(32,$key_));
822
823&set_label("ccm64_dec2_loop");
824 &aesenc ($inout0,$rndkey1);
825 &aesenc ($cmac,$rndkey1);
826 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
827 &add ($rounds,32);
828 &aesenc ($inout0,$rndkey0);
829 &aesenc ($cmac,$rndkey0);
830 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
831 &jnz (&label("ccm64_dec2_loop"));
832 &movups ($in0,&QWP(0,$inp)); # load inp
833 &paddq ($ivec,&QWP(16,"esp"));
834 &aesenc ($inout0,$rndkey1);
835 &aesenc ($cmac,$rndkey1);
836 &aesenclast ($inout0,$rndkey0);
837 &aesenclast ($cmac,$rndkey0);
# NOTE(review): QWP on a lea operand again, see the note further up
838 &lea ($inp,&QWP(16,$inp));
839 &jmp (&label("ccm64_dec_outer"));
840
841&set_label("ccm64_dec_break",16);
# reload the untwisted round count and schedule start for a single-block
# encryption of the MAC
842 &mov ($rounds,&DWP(240,$key_));
843 &mov ($key,$key_);
# The inline form XORs round key 0 into $in0 (the last block stored) and
# then $in0 into $cmac before running the rounds on $cmac.
# NOTE(review): the non-inline branch passes $cmac as a second argument to
# &call, while aesni_generate1("enc") is invoked without a register and so
# works on $inout0; it also has no counterpart for the $in0 xor.  This
# branch is not emitted while $inline is 1 -- confirm before relying on it.
844 if ($inline)
845 { &aesni_inline_generate1("enc",$cmac,$in0); }
846 else
847 { &call ("_aesni_encrypt1",$cmac); }
848
# restore the caller's %esp, then write the MAC back through the cmac pointer
849 &mov ("esp",&DWP(48,"esp"));
850 &mov ($out,&wparam(5));
851 &movups (&QWP(0,$out),$cmac);
852
853 &pxor ("xmm0","xmm0"); # clear register bank
854 &pxor ("xmm1","xmm1");
855 &pxor ("xmm2","xmm2");
856 &pxor ("xmm3","xmm3");
857 &pxor ("xmm4","xmm4");
858 &pxor ("xmm5","xmm5");
859 &pxor ("xmm6","xmm6");
860 &pxor ("xmm7","xmm7");
861&function_end("aesni_ccm64_decrypt_blocks");
862}
863
864
865######################################################################
866# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
867# size_t blocks, const AES_KEY *key,
868# const char *ivec);
869#
870# Handles only complete blocks, operates on 32-bit counter and
871# does not update *ivec! (see crypto/modes/ctr128.c for details)
872#
873# stack layout:
874# 0 pshufb mask
875# 16 vector addend: 0,6,6,6
876# 32 counter-less ivec
877# 48 1st triplet of counter vector
878# 64 2nd triplet of counter vector
879# 80 saved %esp
880
881&function_begin("aesni_ctr32_encrypt_blocks");
882 &mov ($inp,&wparam(0));
883 &mov ($out,&wparam(1));
884 &mov ($len,&wparam(2));
885 &mov ($key,&wparam(3));
886 &mov ($rounds_,&wparam(4));
887 &mov ($key_,"esp");
888 &sub ("esp",88);
889 &and ("esp",-16); # align stack
890 &mov (&DWP(80,"esp"),$key_);
891
892 &cmp ($len,1);
893 &je (&label("ctr32_one_shortcut"));
894
895 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
896
897 # compose byte-swap control mask for pshufb on stack
898 &mov (&DWP(0,"esp"),0x0c0d0e0f);
899 &mov (&DWP(4,"esp"),0x08090a0b);
900 &mov (&DWP(8,"esp"),0x04050607);
901 &mov (&DWP(12,"esp"),0x00010203);
902
903 # compose counter increment vector on stack
904 &mov ($rounds,6);
905 &xor ($key_,$key_);
906 &mov (&DWP(16,"esp"),$rounds);
907 &mov (&DWP(20,"esp"),$rounds);
908 &mov (&DWP(24,"esp"),$rounds);
909 &mov (&DWP(28,"esp"),$key_);
910
911 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
912 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
913
914 &mov ($rounds,&DWP(240,$key)); # key->rounds
915
916 # compose 2 vectors of 3x32-bit counters
917 &bswap ($rounds_);
918 &pxor ($rndkey0,$rndkey0);
919 &pxor ($rndkey1,$rndkey1);
920 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
921 &pinsrd ($rndkey0,$rounds_,0);
922 &lea ($key_,&DWP(3,$rounds_));
923 &pinsrd ($rndkey1,$key_,0);
924 &inc ($rounds_);
925 &pinsrd ($rndkey0,$rounds_,1);
926 &inc ($key_);
927 &pinsrd ($rndkey1,$key_,1);
928 &inc ($rounds_);
929 &pinsrd ($rndkey0,$rounds_,2);
930 &inc ($key_);
931 &pinsrd ($rndkey1,$key_,2);
932 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
933 &pshufb ($rndkey0,$inout0); # byte swap
934 &movdqu ($inout4,&QWP(0,$key)); # key[0]
935 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
936 &pshufb ($rndkey1,$inout0); # byte swap
937
938 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
939 &pshufd ($inout1,$rndkey0,2<<6);
940 &cmp ($len,6);
941 &jb (&label("ctr32_tail"));
942 &pxor ($inout5,$inout4); # counter-less ivec^key[0]
943 &shl ($rounds,4);
944 &mov ($rounds_,16);
945 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
946 &mov ($key_,$key); # backup $key
947 &sub ($rounds_,$rounds); # backup twisted $rounds
948 &lea ($key,&DWP(32,$key,$rounds));
949 &sub ($len,6);
950 &jmp (&label("ctr32_loop6"));
951
952&set_label("ctr32_loop6",16);
953 # inlining _aesni_encrypt6's prologue gives ~6% improvement...
954 &pshufd ($inout2,$rndkey0,1<<6);
955 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
956 &pshufd ($inout3,$rndkey1,3<<6);
957 &pxor ($inout0,$rndkey0); # merge counter-less ivec
958 &pshufd ($inout4,$rndkey1,2<<6);
959 &pxor ($inout1,$rndkey0);
960 &pshufd ($inout5,$rndkey1,1<<6);
961 &$movekey ($rndkey1,&QWP(16,$key_));
962 &pxor ($inout2,$rndkey0);
963 &pxor ($inout3,$rndkey0);
964 &aesenc ($inout0,$rndkey1);
965 &pxor ($inout4,$rndkey0);
966 &pxor ($inout5,$rndkey0);
967 &aesenc ($inout1,$rndkey1);
968 &$movekey ($rndkey0,&QWP(32,$key_));
969 &mov ($rounds,$rounds_);
970 &aesenc ($inout2,$rndkey1);
971 &aesenc ($inout3,$rndkey1);
972 &aesenc ($inout4,$rndkey1);
973 &aesenc ($inout5,$rndkey1);
974
975 &call (&label("_aesni_encrypt6_enter"));
976
977 &movups ($rndkey1,&QWP(0,$inp));
978 &movups ($rndkey0,&QWP(0x10,$inp));
979 &xorps ($inout0,$rndkey1);
980 &movups ($rndkey1,&QWP(0x20,$inp));
981 &xorps ($inout1,$rndkey0);
982 &movups (&QWP(0,$out),$inout0);
983 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
984 &xorps ($inout2,$rndkey1);
985 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
986 &movups (&QWP(0x10,$out),$inout1);
987 &movups (&QWP(0x20,$out),$inout2);
988
989 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
990 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
991 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
992
993 &movups ($inout1,&QWP(0x30,$inp));
994 &movups ($inout2,&QWP(0x40,$inp));
995 &xorps ($inout3,$inout1);
996 &movups ($inout1,&QWP(0x50,$inp));
997 &lea ($inp,&DWP(0x60,$inp));
998 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
999 &pshufb ($rndkey0,$inout0); # byte swap
1000 &xorps ($inout4,$inout2);
1001 &movups (&QWP(0x30,$out),$inout3);
1002 &xorps ($inout5,$inout1);
1003 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
1004 &pshufb ($rndkey1,$inout0); # byte swap
1005 &movups (&QWP(0x40,$out),$inout4);
1006 &pshufd ($inout0,$rndkey0,3<<6);
1007 &movups (&QWP(0x50,$out),$inout5);
1008 &lea ($out,&DWP(0x60,$out));
1009
1010 &pshufd ($inout1,$rndkey0,2<<6);
1011 &sub ($len,6);
1012 &jnc (&label("ctr32_loop6"));
1013
1014 &add ($len,6);
1015 &jz (&label("ctr32_ret"));
1016 &movdqu ($inout5,&QWP(0,$key_));
1017 &mov ($key,$key_);
1018 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
1019 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1020
1021&set_label("ctr32_tail");
1022 &por ($inout0,$inout5);
1023 &cmp ($len,2);
1024 &jb (&label("ctr32_one"));
1025
1026 &pshufd ($inout2,$rndkey0,1<<6);
1027 &por ($inout1,$inout5);
1028 &je (&label("ctr32_two"));
1029
1030 &pshufd ($inout3,$rndkey1,3<<6);
1031 &por ($inout2,$inout5);
1032 &cmp ($len,4);
1033 &jb (&label("ctr32_three"));
1034
1035 &pshufd ($inout4,$rndkey1,2<<6);
1036 &por ($inout3,$inout5);
1037 &je (&label("ctr32_four"));
1038
1039 &por ($inout4,$inout5);
1040 &call ("_aesni_encrypt6");
1041 &movups ($rndkey1,&QWP(0,$inp));
1042 &movups ($rndkey0,&QWP(0x10,$inp));
1043 &xorps ($inout0,$rndkey1);
1044 &movups ($rndkey1,&QWP(0x20,$inp));
1045 &xorps ($inout1,$rndkey0);
1046 &movups ($rndkey0,&QWP(0x30,$inp));
1047 &xorps ($inout2,$rndkey1);
1048 &movups ($rndkey1,&QWP(0x40,$inp));
1049 &xorps ($inout3,$rndkey0);
1050 &movups (&QWP(0,$out),$inout0);
1051 &xorps ($inout4,$rndkey1);
1052 &movups (&QWP(0x10,$out),$inout1);
1053 &movups (&QWP(0x20,$out),$inout2);
1054 &movups (&QWP(0x30,$out),$inout3);
1055 &movups (&QWP(0x40,$out),$inout4);
1056 &jmp (&label("ctr32_ret"));
1057
1058&set_label("ctr32_one_shortcut",16);
1059 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
1060 &mov ($rounds,&DWP(240,$key));
1061
1062&set_label("ctr32_one");
1063 if ($inline)
1064 { &aesni_inline_generate1("enc"); }
1065 else
1066 { &call ("_aesni_encrypt1"); }
1067 &movups ($in0,&QWP(0,$inp));
1068 &xorps ($in0,$inout0);
1069 &movups (&QWP(0,$out),$in0);
1070 &jmp (&label("ctr32_ret"));
1071
1072&set_label("ctr32_two",16);
1073 &call ("_aesni_encrypt2");
1074 &movups ($inout3,&QWP(0,$inp));
1075 &movups ($inout4,&QWP(0x10,$inp));
1076 &xorps ($inout0,$inout3);
1077 &xorps ($inout1,$inout4);
1078 &movups (&QWP(0,$out),$inout0);
1079 &movups (&QWP(0x10,$out),$inout1);
1080 &jmp (&label("ctr32_ret"));
1081
1082&set_label("ctr32_three",16);
1083 &call ("_aesni_encrypt3");
1084 &movups ($inout3,&QWP(0,$inp));
1085 &movups ($inout4,&QWP(0x10,$inp));
1086 &xorps ($inout0,$inout3);
1087 &movups ($inout5,&QWP(0x20,$inp));
1088 &xorps ($inout1,$inout4);
1089 &movups (&QWP(0,$out),$inout0);
1090 &xorps ($inout2,$inout5);
1091 &movups (&QWP(0x10,$out),$inout1);
1092 &movups (&QWP(0x20,$out),$inout2);
1093 &jmp (&label("ctr32_ret"));
1094
1095&set_label("ctr32_four",16);
1096 &call ("_aesni_encrypt4");
1097 &movups ($inout4,&QWP(0,$inp));
1098 &movups ($inout5,&QWP(0x10,$inp));
1099 &movups ($rndkey1,&QWP(0x20,$inp));
1100 &xorps ($inout0,$inout4);
1101 &movups ($rndkey0,&QWP(0x30,$inp));
1102 &xorps ($inout1,$inout5);
1103 &movups (&QWP(0,$out),$inout0);
1104 &xorps ($inout2,$rndkey1);
1105 &movups (&QWP(0x10,$out),$inout1);
1106 &xorps ($inout3,$rndkey0);
1107 &movups (&QWP(0x20,$out),$inout2);
1108 &movups (&QWP(0x30,$out),$inout3);
1109
1110&set_label("ctr32_ret");
1111 &pxor ("xmm0","xmm0"); # clear register bank
1112 &pxor ("xmm1","xmm1");
1113 &pxor ("xmm2","xmm2");
1114 &pxor ("xmm3","xmm3");
1115 &pxor ("xmm4","xmm4");
1116 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
1117 &pxor ("xmm5","xmm5");
1118 &movdqa (&QWP(48,"esp"),"xmm0");
1119 &pxor ("xmm6","xmm6");
1120 &movdqa (&QWP(64,"esp"),"xmm0");
1121 &pxor ("xmm7","xmm7");
1122 &mov ("esp",&DWP(80,"esp"));
1123&function_end("aesni_ctr32_encrypt_blocks");
1124
1125
1126######################################################################
1127# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1128# const AES_KEY *key1, const AES_KEY *key2,
1129# const unsigned char iv[16]);
1130#
1131{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1132
# aesni_xts_encrypt: XTS encryption entry point.  Flow, as generated below:
#   1. The 16-byte tweak at wparam(5) is run once through the single-block
#      encrypt routine with key2 (wparam(4)); the result is the first tweak.
#   2. A 16-byte-aligned scratch frame is carved out of the stack:
#        16*0 .. 16*5   per-block tweaks (6x loop and the 4/5-block tails)
#        16*6           constant dwords {0x87,0,1,0} used by the tweak update
#        16*7+0         $len, later overwritten with $len%16
#        16*7+4         caller's %esp
#   3. Whole blocks are processed six at a time in xts_enc_loop6, then a
#      1..5 block tail is dispatched from xts_enc_short.
#   4. If $len is not a multiple of 16, xts_enc_steal swaps the trailing
#      bytes with the last output block, which is then encrypted once more.
#   5. xts_enc_ret zeroes xmm0-xmm7 and stack slots 16*0..16*5.
# Register aliases (declared just above this function): $tweak=$rndkey1,
# $twtmp=$rndkey0, $twres=$inout0, $twmask=$inout1.  They share registers
# with block data and round keys, which is why the tweak and the mask are
# reloaded from the stack after every 6x call.
1133&function_begin("aesni_xts_encrypt");
1134 &mov ($key,&wparam(4)); # key2
1135 &mov ($inp,&wparam(5)); # clear-text tweak
1136
1137 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1138 &movups ($inout0,&QWP(0,$inp)); # load clear-text tweak
1139 if ($inline)
1140 { &aesni_inline_generate1("enc"); }
1141 else
1142 { &call ("_aesni_encrypt1"); }
1143
1144 &mov ($inp,&wparam(0));
1145 &mov ($out,&wparam(1));
1146 &mov ($len,&wparam(2));
1147 &mov ($key,&wparam(3)); # key1
1148
1149 &mov ($key_,"esp"); # remember caller's %esp
1150 &sub ("esp",16*7+8);
1151 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1152 &and ("esp",-16); # align stack
1153
1154 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1155 &mov (&DWP(16*6+4,"esp"),0);
1156 &mov (&DWP(16*6+8,"esp"),1);
1157 &mov (&DWP(16*6+12,"esp"),0);
1158 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1159 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1160
1161 &movdqa ($tweak,$inout0); # first tweak = $inout0 left by the key2 pass above
1162 &pxor ($twtmp,$twtmp);
1163 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1164 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1165
1166 &and ($len,-16); # whole blocks only
1167 &mov ($key_,$key); # backup $key
1168 &mov ($rounds_,$rounds); # backup $rounds
1169 &sub ($len,16*6);
1170 &jc (&label("xts_enc_short")); # fewer than 6 whole blocks
1171
1172 &shl ($rounds,4);
1173 &mov ($rounds_,16);
1174 &sub ($rounds_,$rounds); # $rounds_ = 16 - 16*rounds
1175 &lea ($key,&DWP(32,$key,$rounds)); # $key = key1 + 32 + 16*rounds
1176 &jmp (&label("xts_enc_loop6"));
1177
1178&set_label("xts_enc_loop6",16);
	# Generation-time unrolling: the tweak update below is emitted four
	# times, parking tweaks 0..3 in stack slots 0..3 ($i is 4 afterwards).
	# Each update doubles both 64-bit halves of $tweak (paddq) and uses the
	# sign masks of the previous value, shuffled and ANDed with the
	# {0x87,0,1,0} constant, to carry between halves and fold the top bit
	# back in.
1179 for ($i=0;$i<4;$i++) {
1180 &pshufd ($twres,$twtmp,0x13);
1181 &pxor ($twtmp,$twtmp);
1182 &movdqa (&QWP(16*$i,"esp"),$tweak);
1183 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1184 &pand ($twres,$twmask); # isolate carry and residue
1185 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1186 &pxor ($tweak,$twres);
1187 }
	# 5th tweak goes to slot 4 ($i becomes 5); the 6th is built in $inout5.
1188 &pshufd ($inout5,$twtmp,0x13);
1189 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1190 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1191 &$movekey ($rndkey0,&QWP(0,$key_));
1192 &pand ($inout5,$twmask); # isolate carry and residue
1193 &movups ($inout0,&QWP(0,$inp)); # load input
1194 &pxor ($inout5,$tweak);
1195
1196 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1197 &mov ($rounds,$rounds_); # restore $rounds
1198 &movdqu ($inout1,&QWP(16*1,$inp));
1199 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1200 &movdqu ($inout2,&QWP(16*2,$inp));
1201 &pxor ($inout1,$rndkey0);
1202 &movdqu ($inout3,&QWP(16*3,$inp));
1203 &pxor ($inout2,$rndkey0);
1204 &movdqu ($inout4,&QWP(16*4,$inp));
1205 &pxor ($inout3,$rndkey0);
1206 &movdqu ($rndkey1,&QWP(16*5,$inp)); # 6th input block, parked in $rndkey1
1207 &pxor ($inout4,$rndkey0);
1208 &lea ($inp,&DWP(16*6,$inp));
1209 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1210 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1211 &pxor ($inout5,$rndkey1); # 6th tweak ^= 6th input block
1212
1213 &$movekey ($rndkey1,&QWP(16,$key_));
1214 &pxor ($inout1,&QWP(16*1,"esp"));
1215 &pxor ($inout2,&QWP(16*2,"esp"));
1216 &aesenc ($inout0,$rndkey1);
1217 &pxor ($inout3,&QWP(16*3,"esp"));
1218 &pxor ($inout4,&QWP(16*4,"esp"));
1219 &aesenc ($inout1,$rndkey1);
1220 &pxor ($inout5,$rndkey0); # 6th block ^= rndkey[0]
1221 &$movekey ($rndkey0,&QWP(32,$key_));
1222 &aesenc ($inout2,$rndkey1);
1223 &aesenc ($inout3,$rndkey1);
1224 &aesenc ($inout4,$rndkey1);
1225 &aesenc ($inout5,$rndkey1);
1226 &call (&label("_aesni_encrypt6_enter"));
1227
1228 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1229 &pxor ($twtmp,$twtmp);
1230 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1231 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1232 &xorps ($inout1,&QWP(16*1,"esp"));
1233 &movups (&QWP(16*0,$out),$inout0); # write output
1234 &xorps ($inout2,&QWP(16*2,"esp"));
1235 &movups (&QWP(16*1,$out),$inout1);
1236 &xorps ($inout3,&QWP(16*3,"esp"));
1237 &movups (&QWP(16*2,$out),$inout2);
1238 &xorps ($inout4,&QWP(16*4,"esp"));
1239 &movups (&QWP(16*3,$out),$inout3);
1240 &xorps ($inout5,$tweak);
1241 &movups (&QWP(16*4,$out),$inout4);
1242 &pshufd ($twres,$twtmp,0x13);
1243 &movups (&QWP(16*5,$out),$inout5);
1244 &lea ($out,&DWP(16*6,$out));
1245 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1246
	# Advance to the tweak for the next block before looping or leaving,
	# so $tweak is already "one ahead" on exit from the 6x loop.
1247 &pxor ($twtmp,$twtmp);
1248 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1249 &pand ($twres,$twmask); # isolate carry and residue
1250 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1251 &pxor ($tweak,$twres);
1252
1253 &sub ($len,16*6);
1254 &jnc (&label("xts_enc_loop6"));
1255
1256 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1257 &mov ($key,$key_); # restore $key
1258 &mov ($rounds_,$rounds);
1259
1260&set_label("xts_enc_short");
1261 &add ($len,16*6); # undo the bias: bytes left in whole blocks (0..0x50)
1262 &jz (&label("xts_enc_done6x"));
1263
	# Tail dispatch on $len = 0x10/0x20/0x30/0x40/0x50.  Each cmp is
	# consumed by two branches (jb, then je further down); the SSE
	# instructions in between leave EFLAGS untouched.
1264 &movdqa ($inout3,$tweak); # put aside previous tweak
1265 &cmp ($len,0x20);
1266 &jb (&label("xts_enc_one"));
1267
1268 &pshufd ($twres,$twtmp,0x13);
1269 &pxor ($twtmp,$twtmp);
1270 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1271 &pand ($twres,$twmask); # isolate carry and residue
1272 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1273 &pxor ($tweak,$twres);
1274 &je (&label("xts_enc_two")); # flags from cmp ($len,0x20)
1275
1276 &pshufd ($twres,$twtmp,0x13);
1277 &pxor ($twtmp,$twtmp);
1278 &movdqa ($inout4,$tweak); # put aside previous tweak
1279 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1280 &pand ($twres,$twmask); # isolate carry and residue
1281 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1282 &pxor ($tweak,$twres);
1283 &cmp ($len,0x40);
1284 &jb (&label("xts_enc_three"));
1285
1286 &pshufd ($twres,$twtmp,0x13);
1287 &pxor ($twtmp,$twtmp);
1288 &movdqa ($inout5,$tweak); # put aside previous tweak
1289 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1290 &pand ($twres,$twmask); # isolate carry and residue
1291 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1292 &pxor ($tweak,$twres);
1293 &movdqa (&QWP(16*0,"esp"),$inout3);
1294 &movdqa (&QWP(16*1,"esp"),$inout4);
1295 &je (&label("xts_enc_four")); # flags from cmp ($len,0x40)
1296
	# Five-block tail: tweaks 0..4 live in stack slots 0..4.
1297 &movdqa (&QWP(16*2,"esp"),$inout5);
1298 &pshufd ($inout5,$twtmp,0x13);
1299 &movdqa (&QWP(16*3,"esp"),$tweak);
1300 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1301 &pand ($inout5,$twmask); # isolate carry and residue
1302 &pxor ($inout5,$tweak);
1303
1304 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1305 &movdqu ($inout1,&QWP(16*1,$inp));
1306 &movdqu ($inout2,&QWP(16*2,$inp));
1307 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1308 &movdqu ($inout3,&QWP(16*3,$inp));
1309 &pxor ($inout1,&QWP(16*1,"esp"));
1310 &movdqu ($inout4,&QWP(16*4,$inp));
1311 &pxor ($inout2,&QWP(16*2,"esp"));
1312 &lea ($inp,&DWP(16*5,$inp));
1313 &pxor ($inout3,&QWP(16*3,"esp"));
1314 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1315 &pxor ($inout4,$inout5);
1316
	# The 6-way helper is used for 5 blocks; the sixth lane ($inout5)
	# still holds the 5th tweak and its result is never stored.
1317 &call ("_aesni_encrypt6");
1318
1319 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1320 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1321 &xorps ($inout1,&QWP(16*1,"esp"));
1322 &xorps ($inout2,&QWP(16*2,"esp"));
1323 &movups (&QWP(16*0,$out),$inout0); # write output
1324 &xorps ($inout3,&QWP(16*3,"esp"));
1325 &movups (&QWP(16*1,$out),$inout1);
1326 &xorps ($inout4,$tweak);
1327 &movups (&QWP(16*2,$out),$inout2);
1328 &movups (&QWP(16*3,$out),$inout3);
1329 &movups (&QWP(16*4,$out),$inout4);
1330 &lea ($out,&DWP(16*5,$out));
1331 &jmp (&label("xts_enc_done"));
1332
1333&set_label("xts_enc_one",16);
1334 &movups ($inout0,&QWP(16*0,$inp)); # load input
1335 &lea ($inp,&DWP(16*1,$inp));
1336 &xorps ($inout0,$inout3); # input^=tweak
1337 if ($inline)
1338 { &aesni_inline_generate1("enc"); }
1339 else
1340 { &call ("_aesni_encrypt1"); }
1341 &xorps ($inout0,$inout3); # output^=tweak
1342 &movups (&QWP(16*0,$out),$inout0); # write output
1343 &lea ($out,&DWP(16*1,$out));
1344
1345 &movdqa ($tweak,$inout3); # last tweak
1346 &jmp (&label("xts_enc_done"));
1347
1348&set_label("xts_enc_two",16);
1349 &movaps ($inout4,$tweak); # put aside last tweak
1350
1351 &movups ($inout0,&QWP(16*0,$inp)); # load input
1352 &movups ($inout1,&QWP(16*1,$inp));
1353 &lea ($inp,&DWP(16*2,$inp));
1354 &xorps ($inout0,$inout3); # input^=tweak
1355 &xorps ($inout1,$inout4);
1356
1357 &call ("_aesni_encrypt2");
1358
1359 &xorps ($inout0,$inout3); # output^=tweak
1360 &xorps ($inout1,$inout4);
1361 &movups (&QWP(16*0,$out),$inout0); # write output
1362 &movups (&QWP(16*1,$out),$inout1);
1363 &lea ($out,&DWP(16*2,$out));
1364
1365 &movdqa ($tweak,$inout4); # last tweak
1366 &jmp (&label("xts_enc_done"));
1367
1368&set_label("xts_enc_three",16);
1369 &movaps ($inout5,$tweak); # put aside last tweak
1370 &movups ($inout0,&QWP(16*0,$inp)); # load input
1371 &movups ($inout1,&QWP(16*1,$inp));
1372 &movups ($inout2,&QWP(16*2,$inp));
1373 &lea ($inp,&DWP(16*3,$inp));
1374 &xorps ($inout0,$inout3); # input^=tweak
1375 &xorps ($inout1,$inout4);
1376 &xorps ($inout2,$inout5);
1377
1378 &call ("_aesni_encrypt3");
1379
1380 &xorps ($inout0,$inout3); # output^=tweak
1381 &xorps ($inout1,$inout4);
1382 &xorps ($inout2,$inout5);
1383 &movups (&QWP(16*0,$out),$inout0); # write output
1384 &movups (&QWP(16*1,$out),$inout1);
1385 &movups (&QWP(16*2,$out),$inout2);
1386 &lea ($out,&DWP(16*3,$out));
1387
1388 &movdqa ($tweak,$inout5); # last tweak
1389 &jmp (&label("xts_enc_done"));
1390
1391&set_label("xts_enc_four",16);
	# Tweaks 0 and 1 were spilled to stack slots 0 and 1 before the branch
	# here; tweak 2 is in $inout5 and tweak 3 is the current $tweak.
1392 &movaps ($inout4,$tweak); # put aside last tweak
1393
1394 &movups ($inout0,&QWP(16*0,$inp)); # load input
1395 &movups ($inout1,&QWP(16*1,$inp));
1396 &movups ($inout2,&QWP(16*2,$inp));
1397 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1398 &movups ($inout3,&QWP(16*3,$inp));
1399 &lea ($inp,&DWP(16*4,$inp));
1400 &xorps ($inout1,&QWP(16*1,"esp"));
1401 &xorps ($inout2,$inout5);
1402 &xorps ($inout3,$inout4);
1403
1404 &call ("_aesni_encrypt4");
1405
1406 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1407 &xorps ($inout1,&QWP(16*1,"esp"));
1408 &xorps ($inout2,$inout5);
1409 &movups (&QWP(16*0,$out),$inout0); # write output
1410 &xorps ($inout3,$inout4);
1411 &movups (&QWP(16*1,$out),$inout1);
1412 &movups (&QWP(16*2,$out),$inout2);
1413 &movups (&QWP(16*3,$out),$inout3);
1414 &lea ($out,&DWP(16*4,$out));
1415
1416 &movdqa ($tweak,$inout4); # last tweak
1417 &jmp (&label("xts_enc_done"));
1418
1419&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1420 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1421 &and ($len,15);
1422 &jz (&label("xts_enc_ret"));
1423 &movdqa ($inout3,$tweak); # tweak for the stolen block is already current
1424 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1425 &jmp (&label("xts_enc_steal"));
1426
1427&set_label("xts_enc_done",16);
	# Reached from the 1..5 block tails, where $tweak is the tweak of the
	# last block written; the next tweak is derived into $inout3 only if
	# trailing bytes remain.
1428 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1429 &pxor ($twtmp,$twtmp);
1430 &and ($len,15);
1431 &jz (&label("xts_enc_ret"));
1432
1433 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1434 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1435 &pshufd ($inout3,$twtmp,0x13);
1436 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1437 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1438 &pxor ($inout3,$tweak);
1439
1440&set_label("xts_enc_steal");
	# Byte-wise swap, $len%16 iterations: the next input byte replaces a
	# byte of the last output block (at $out-16), and the byte it displaces
	# is emitted at $out.  $rounds and $key serve as byte scratch here.
1441 &movz ($rounds,&BP(0,$inp));
1442 &movz ($key,&BP(-16,$out));
1443 &lea ($inp,&DWP(1,$inp));
1444 &mov (&BP(-16,$out),&LB($rounds));
1445 &mov (&BP(0,$out),&LB($key));
1446 &lea ($out,&DWP(1,$out));
1447 &sub ($len,1);
1448 &jnz (&label("xts_enc_steal"));
1449
1450 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1451 &mov ($key,$key_); # restore $key
1452 &mov ($rounds,$rounds_); # restore $rounds
1453
	# Encrypt the patched block at $out-16 in place with the tweak in $inout3.
1454 &movups ($inout0,&QWP(-16,$out)); # load input
1455 &xorps ($inout0,$inout3); # input^=tweak
1456 if ($inline)
1457 { &aesni_inline_generate1("enc"); }
1458 else
1459 { &call ("_aesni_encrypt1"); }
1460 &xorps ($inout0,$inout3); # output^=tweak
1461 &movups (&QWP(-16,$out),$inout0); # write output
1462
1463&set_label("xts_enc_ret");
1464 &pxor ("xmm0","xmm0"); # clear register bank
1465 &pxor ("xmm1","xmm1");
1466 &pxor ("xmm2","xmm2");
1467 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1468 &pxor ("xmm3","xmm3");
1469 &movdqa (&QWP(16*1,"esp"),"xmm0");
1470 &pxor ("xmm4","xmm4");
1471 &movdqa (&QWP(16*2,"esp"),"xmm0");
1472 &pxor ("xmm5","xmm5");
1473 &movdqa (&QWP(16*3,"esp"),"xmm0");
1474 &pxor ("xmm6","xmm6");
1475 &movdqa (&QWP(16*4,"esp"),"xmm0");
1476 &pxor ("xmm7","xmm7");
1477 &movdqa (&QWP(16*5,"esp"),"xmm0");
1478 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1479&function_end("aesni_xts_encrypt");
1480
# aesni_xts_decrypt: XTS decryption entry point.  It mirrors
# aesni_xts_encrypt with these differences visible below:
#   - The initial tweak is still produced with the *encrypt* single-block
#     routine under key2; only the data path uses aesdec/_aesni_decrypt*.
#   - When $len is not a multiple of 16, $len is reduced by 16 up front, so
#     the last whole block is held back and handled together with the
#     trailing bytes.
#   - For that final pair the two tweaks are applied in swapped order: the
#     held-back block is decrypted with the later tweak ($inout3), then the
#     block rebuilt by xts_dec_steal is decrypted with the earlier one
#     ($inout4).
# Stack frame (16-byte aligned):
#        16*0 .. 16*5   per-block tweaks (6x loop and the 4/5-block tails)
#        16*6           constant dwords {0x87,0,1,0} used by the tweak update
#        16*7+0         $len, later overwritten with $len%16
#        16*7+4         caller's %esp
# Register aliases (declared before aesni_xts_encrypt): $tweak=$rndkey1,
# $twtmp=$rndkey0, $twres=$inout0, $twmask=$inout1.
1481&function_begin("aesni_xts_decrypt");
1482 &mov ($key,&wparam(4)); # key2
1483 &mov ($inp,&wparam(5)); # clear-text tweak
1484
1485 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1486 &movups ($inout0,&QWP(0,$inp)); # load clear-text tweak
1487 if ($inline)
1488 { &aesni_inline_generate1("enc"); }
1489 else
1490 { &call ("_aesni_encrypt1"); }
1491
1492 &mov ($inp,&wparam(0));
1493 &mov ($out,&wparam(1));
1494 &mov ($len,&wparam(2));
1495 &mov ($key,&wparam(3)); # key1
1496
1497 &mov ($key_,"esp"); # remember caller's %esp
1498 &sub ("esp",16*7+8);
1499 &and ("esp",-16); # align stack
1500
1501 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1502 &test ($len,15);
1503 &setnz (&LB($rounds_));
1504 &shl ($rounds_,4); # 0 or 16
1505 &sub ($len,$rounds_);
1506
1507 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1508 &mov (&DWP(16*6+4,"esp"),0);
1509 &mov (&DWP(16*6+8,"esp"),1);
1510 &mov (&DWP(16*6+12,"esp"),0);
1511 &mov (&DWP(16*7+0,"esp"),$len); # save $len (already reduced by 16 if len%16)
1512 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1513
1514 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1515 &mov ($key_,$key); # backup $key
1516 &mov ($rounds_,$rounds); # backup $rounds
1517
1518 &movdqa ($tweak,$inout0); # first tweak = $inout0 left by the key2 pass above
1519 &pxor ($twtmp,$twtmp);
1520 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1521 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1522
1523 &and ($len,-16); # whole blocks only
1524 &sub ($len,16*6);
1525 &jc (&label("xts_dec_short")); # fewer than 6 whole blocks
1526
1527 &shl ($rounds,4);
1528 &mov ($rounds_,16);
1529 &sub ($rounds_,$rounds); # $rounds_ = 16 - 16*rounds
1530 &lea ($key,&DWP(32,$key,$rounds)); # $key = key1 + 32 + 16*rounds
1531 &jmp (&label("xts_dec_loop6"));
1532
1533&set_label("xts_dec_loop6",16);
	# Generation-time unrolling: the tweak update below is emitted four
	# times, parking tweaks 0..3 in stack slots 0..3 ($i is 4 afterwards).
1534 for ($i=0;$i<4;$i++) {
1535 &pshufd ($twres,$twtmp,0x13);
1536 &pxor ($twtmp,$twtmp);
1537 &movdqa (&QWP(16*$i,"esp"),$tweak);
1538 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1539 &pand ($twres,$twmask); # isolate carry and residue
1540 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1541 &pxor ($tweak,$twres);
1542 }
	# 5th tweak goes to slot 4 ($i becomes 5); the 6th is built in $inout5.
1543 &pshufd ($inout5,$twtmp,0x13);
1544 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1545 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1546 &$movekey ($rndkey0,&QWP(0,$key_));
1547 &pand ($inout5,$twmask); # isolate carry and residue
1548 &movups ($inout0,&QWP(0,$inp)); # load input
1549 &pxor ($inout5,$tweak);
1550
1551 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1552 &mov ($rounds,$rounds_);
1553 &movdqu ($inout1,&QWP(16*1,$inp));
1554 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1555 &movdqu ($inout2,&QWP(16*2,$inp));
1556 &pxor ($inout1,$rndkey0);
1557 &movdqu ($inout3,&QWP(16*3,$inp));
1558 &pxor ($inout2,$rndkey0);
1559 &movdqu ($inout4,&QWP(16*4,$inp));
1560 &pxor ($inout3,$rndkey0);
1561 &movdqu ($rndkey1,&QWP(16*5,$inp)); # 6th input block, parked in $rndkey1
1562 &pxor ($inout4,$rndkey0);
1563 &lea ($inp,&DWP(16*6,$inp));
1564 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1565 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1566 &pxor ($inout5,$rndkey1); # 6th tweak ^= 6th input block
1567
1568 &$movekey ($rndkey1,&QWP(16,$key_));
1569 &pxor ($inout1,&QWP(16*1,"esp"));
1570 &pxor ($inout2,&QWP(16*2,"esp"));
1571 &aesdec ($inout0,$rndkey1);
1572 &pxor ($inout3,&QWP(16*3,"esp"));
1573 &pxor ($inout4,&QWP(16*4,"esp"));
1574 &aesdec ($inout1,$rndkey1);
1575 &pxor ($inout5,$rndkey0); # 6th block ^= rndkey[0]
1576 &$movekey ($rndkey0,&QWP(32,$key_));
1577 &aesdec ($inout2,$rndkey1);
1578 &aesdec ($inout3,$rndkey1);
1579 &aesdec ($inout4,$rndkey1);
1580 &aesdec ($inout5,$rndkey1);
1581 &call (&label("_aesni_decrypt6_enter"));
1582
1583 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1584 &pxor ($twtmp,$twtmp);
1585 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1586 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1587 &xorps ($inout1,&QWP(16*1,"esp"));
1588 &movups (&QWP(16*0,$out),$inout0); # write output
1589 &xorps ($inout2,&QWP(16*2,"esp"));
1590 &movups (&QWP(16*1,$out),$inout1);
1591 &xorps ($inout3,&QWP(16*3,"esp"));
1592 &movups (&QWP(16*2,$out),$inout2);
1593 &xorps ($inout4,&QWP(16*4,"esp"));
1594 &movups (&QWP(16*3,$out),$inout3);
1595 &xorps ($inout5,$tweak);
1596 &movups (&QWP(16*4,$out),$inout4);
1597 &pshufd ($twres,$twtmp,0x13);
1598 &movups (&QWP(16*5,$out),$inout5);
1599 &lea ($out,&DWP(16*6,$out));
1600 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1601
	# Advance to the tweak for the next block before looping or leaving,
	# so $tweak, $twtmp and $twmask are all current on exit from the loop.
1602 &pxor ($twtmp,$twtmp);
1603 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1604 &pand ($twres,$twmask); # isolate carry and residue
1605 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1606 &pxor ($tweak,$twres);
1607
1608 &sub ($len,16*6);
1609 &jnc (&label("xts_dec_loop6"));
1610
1611 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1612 &mov ($key,$key_); # restore $key
1613 &mov ($rounds_,$rounds);
1614
1615&set_label("xts_dec_short");
1616 &add ($len,16*6); # undo the bias: bytes left in whole blocks (0..0x50)
1617 &jz (&label("xts_dec_done6x"));
1618
	# Tail dispatch on $len = 0x10/0x20/0x30/0x40/0x50.  Each cmp is
	# consumed by two branches (jb, then je further down); the SSE
	# instructions in between leave EFLAGS untouched.
1619 &movdqa ($inout3,$tweak); # put aside previous tweak
1620 &cmp ($len,0x20);
1621 &jb (&label("xts_dec_one"));
1622
1623 &pshufd ($twres,$twtmp,0x13);
1624 &pxor ($twtmp,$twtmp);
1625 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1626 &pand ($twres,$twmask); # isolate carry and residue
1627 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1628 &pxor ($tweak,$twres);
1629 &je (&label("xts_dec_two")); # flags from cmp ($len,0x20)
1630
1631 &pshufd ($twres,$twtmp,0x13);
1632 &pxor ($twtmp,$twtmp);
1633 &movdqa ($inout4,$tweak); # put aside previous tweak
1634 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1635 &pand ($twres,$twmask); # isolate carry and residue
1636 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1637 &pxor ($tweak,$twres);
1638 &cmp ($len,0x40);
1639 &jb (&label("xts_dec_three"));
1640
1641 &pshufd ($twres,$twtmp,0x13);
1642 &pxor ($twtmp,$twtmp);
1643 &movdqa ($inout5,$tweak); # put aside previous tweak
1644 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1645 &pand ($twres,$twmask); # isolate carry and residue
1646 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1647 &pxor ($tweak,$twres);
1648 &movdqa (&QWP(16*0,"esp"),$inout3);
1649 &movdqa (&QWP(16*1,"esp"),$inout4);
1650 &je (&label("xts_dec_four")); # flags from cmp ($len,0x40)
1651
	# Five-block tail: tweaks 0..4 live in stack slots 0..4.
1652 &movdqa (&QWP(16*2,"esp"),$inout5);
1653 &pshufd ($inout5,$twtmp,0x13);
1654 &movdqa (&QWP(16*3,"esp"),$tweak);
1655 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1656 &pand ($inout5,$twmask); # isolate carry and residue
1657 &pxor ($inout5,$tweak);
1658
1659 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1660 &movdqu ($inout1,&QWP(16*1,$inp));
1661 &movdqu ($inout2,&QWP(16*2,$inp));
1662 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1663 &movdqu ($inout3,&QWP(16*3,$inp));
1664 &pxor ($inout1,&QWP(16*1,"esp"));
1665 &movdqu ($inout4,&QWP(16*4,$inp));
1666 &pxor ($inout2,&QWP(16*2,"esp"));
1667 &lea ($inp,&DWP(16*5,$inp));
1668 &pxor ($inout3,&QWP(16*3,"esp"));
1669 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1670 &pxor ($inout4,$inout5);
1671
	# The 6-way helper is used for 5 blocks; the sixth lane ($inout5)
	# still holds the 5th tweak and its result is never stored.
1672 &call ("_aesni_decrypt6");
1673
1674 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1675 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1676 &xorps ($inout1,&QWP(16*1,"esp"));
1677 &xorps ($inout2,&QWP(16*2,"esp"));
1678 &movups (&QWP(16*0,$out),$inout0); # write output
1679 &xorps ($inout3,&QWP(16*3,"esp"));
1680 &movups (&QWP(16*1,$out),$inout1);
1681 &xorps ($inout4,$tweak);
1682 &movups (&QWP(16*2,$out),$inout2);
1683 &movups (&QWP(16*3,$out),$inout3);
1684 &movups (&QWP(16*4,$out),$inout4);
1685 &lea ($out,&DWP(16*5,$out));
1686 &jmp (&label("xts_dec_done"));
1687
1688&set_label("xts_dec_one",16);
1689 &movups ($inout0,&QWP(16*0,$inp)); # load input
1690 &lea ($inp,&DWP(16*1,$inp));
1691 &xorps ($inout0,$inout3); # input^=tweak
1692 if ($inline)
1693 { &aesni_inline_generate1("dec"); }
1694 else
1695 { &call ("_aesni_decrypt1"); }
1696 &xorps ($inout0,$inout3); # output^=tweak
1697 &movups (&QWP(16*0,$out),$inout0); # write output
1698 &lea ($out,&DWP(16*1,$out));
1699
1700 &movdqa ($tweak,$inout3); # last tweak
1701 &jmp (&label("xts_dec_done"));
1702
1703&set_label("xts_dec_two",16);
1704 &movaps ($inout4,$tweak); # put aside last tweak
1705
1706 &movups ($inout0,&QWP(16*0,$inp)); # load input
1707 &movups ($inout1,&QWP(16*1,$inp));
1708 &lea ($inp,&DWP(16*2,$inp));
1709 &xorps ($inout0,$inout3); # input^=tweak
1710 &xorps ($inout1,$inout4);
1711
1712 &call ("_aesni_decrypt2");
1713
1714 &xorps ($inout0,$inout3); # output^=tweak
1715 &xorps ($inout1,$inout4);
1716 &movups (&QWP(16*0,$out),$inout0); # write output
1717 &movups (&QWP(16*1,$out),$inout1);
1718 &lea ($out,&DWP(16*2,$out));
1719
1720 &movdqa ($tweak,$inout4); # last tweak
1721 &jmp (&label("xts_dec_done"));
1722
1723&set_label("xts_dec_three",16);
1724 &movaps ($inout5,$tweak); # put aside last tweak
1725 &movups ($inout0,&QWP(16*0,$inp)); # load input
1726 &movups ($inout1,&QWP(16*1,$inp));
1727 &movups ($inout2,&QWP(16*2,$inp));
1728 &lea ($inp,&DWP(16*3,$inp));
1729 &xorps ($inout0,$inout3); # input^=tweak
1730 &xorps ($inout1,$inout4);
1731 &xorps ($inout2,$inout5);
1732
1733 &call ("_aesni_decrypt3");
1734
1735 &xorps ($inout0,$inout3); # output^=tweak
1736 &xorps ($inout1,$inout4);
1737 &xorps ($inout2,$inout5);
1738 &movups (&QWP(16*0,$out),$inout0); # write output
1739 &movups (&QWP(16*1,$out),$inout1);
1740 &movups (&QWP(16*2,$out),$inout2);
1741 &lea ($out,&DWP(16*3,$out));
1742
1743 &movdqa ($tweak,$inout5); # last tweak
1744 &jmp (&label("xts_dec_done"));
1745
1746&set_label("xts_dec_four",16);
	# Tweaks 0 and 1 were spilled to stack slots 0 and 1 before the branch
	# here; tweak 2 is in $inout5 and tweak 3 is the current $tweak.
1747 &movaps ($inout4,$tweak); # put aside last tweak
1748
1749 &movups ($inout0,&QWP(16*0,$inp)); # load input
1750 &movups ($inout1,&QWP(16*1,$inp));
1751 &movups ($inout2,&QWP(16*2,$inp));
1752 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1753 &movups ($inout3,&QWP(16*3,$inp));
1754 &lea ($inp,&DWP(16*4,$inp));
1755 &xorps ($inout1,&QWP(16*1,"esp"));
1756 &xorps ($inout2,$inout5);
1757 &xorps ($inout3,$inout4);
1758
1759 &call ("_aesni_decrypt4");
1760
1761 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1762 &xorps ($inout1,&QWP(16*1,"esp"));
1763 &xorps ($inout2,$inout5);
1764 &movups (&QWP(16*0,$out),$inout0); # write output
1765 &xorps ($inout3,$inout4);
1766 &movups (&QWP(16*1,$out),$inout1);
1767 &movups (&QWP(16*2,$out),$inout2);
1768 &movups (&QWP(16*3,$out),$inout3);
1769 &lea ($out,&DWP(16*4,$out));
1770
1771 &movdqa ($tweak,$inout4); # last tweak
1772 &jmp (&label("xts_dec_done"));
1773
1774&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
	# xts_dec_only_one_more also consumes $twtmp and $twmask exactly as the
	# preceding code left them; nothing is recomputed on this path.
1775 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1776 &and ($len,15);
1777 &jz (&label("xts_dec_ret"));
1778 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1779 &jmp (&label("xts_dec_only_one_more"));
1780
1781&set_label("xts_dec_done",16);
	# Reached from the 1..5 block tails, where $tweak is the tweak of the
	# last block written; step it forward once (refreshing $twtmp and
	# $twmask) so the state matches what xts_dec_only_one_more expects.
1782 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1783 &pxor ($twtmp,$twtmp);
1784 &and ($len,15);
1785 &jz (&label("xts_dec_ret"));
1786
1787 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1788 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1789 &pshufd ($twres,$twtmp,0x13);
1790 &pxor ($twtmp,$twtmp);
1791 &movdqa ($twmask,&QWP(16*6,"esp"));
1792 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1793 &pand ($twres,$twmask); # isolate carry and residue
1794 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1795 &pxor ($tweak,$twres);
1796
1797&set_label("xts_dec_only_one_more");
	# $inout4 keeps the current tweak for the final (stolen) block, while
	# $inout3 receives the following tweak, used right away on the
	# held-back whole block at $inp.
1798 &pshufd ($inout3,$twtmp,0x13);
1799 &movdqa ($inout4,$tweak); # put aside previous tweak
1800 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1801 &pand ($inout3,$twmask); # isolate carry and residue
1802 &pxor ($inout3,$tweak);
1803
1804 &mov ($key,$key_); # restore $key
1805 &mov ($rounds,$rounds_); # restore $rounds
1806
1807 &movups ($inout0,&QWP(0,$inp)); # load input
1808 &xorps ($inout0,$inout3); # input^=tweak
1809 if ($inline)
1810 { &aesni_inline_generate1("dec"); }
1811 else
1812 { &call ("_aesni_decrypt1"); }
1813 &xorps ($inout0,$inout3); # output^=tweak
1814 &movups (&QWP(0,$out),$inout0); # write output
1815
1816&set_label("xts_dec_steal");
	# Byte-wise swap, $len%16 iterations: a trailing input byte (at $inp+16,
	# since $inp was not advanced past the block just decrypted) replaces a
	# byte of the block at $out, and the displaced byte is emitted at
	# $out+16.  $rounds and $key serve as byte scratch here.
1817 &movz ($rounds,&BP(16,$inp));
1818 &movz ($key,&BP(0,$out));
1819 &lea ($inp,&DWP(1,$inp));
1820 &mov (&BP(0,$out),&LB($rounds));
1821 &mov (&BP(16,$out),&LB($key));
1822 &lea ($out,&DWP(1,$out));
1823 &sub ($len,1);
1824 &jnz (&label("xts_dec_steal"));
1825
1826 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1827 &mov ($key,$key_); # restore $key
1828 &mov ($rounds,$rounds_); # restore $rounds
1829
	# Decrypt the patched block at $out in place with the earlier tweak.
1830 &movups ($inout0,&QWP(0,$out)); # load input
1831 &xorps ($inout0,$inout4); # input^=tweak
1832 if ($inline)
1833 { &aesni_inline_generate1("dec"); }
1834 else
1835 { &call ("_aesni_decrypt1"); }
1836 &xorps ($inout0,$inout4); # output^=tweak
1837 &movups (&QWP(0,$out),$inout0); # write output
1838
1839&set_label("xts_dec_ret");
1840 &pxor ("xmm0","xmm0"); # clear register bank
1841 &pxor ("xmm1","xmm1");
1842 &pxor ("xmm2","xmm2");
1843 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1844 &pxor ("xmm3","xmm3");
1845 &movdqa (&QWP(16*1,"esp"),"xmm0");
1846 &pxor ("xmm4","xmm4");
1847 &movdqa (&QWP(16*2,"esp"),"xmm0");
1848 &pxor ("xmm5","xmm5");
1849 &movdqa (&QWP(16*3,"esp"),"xmm0");
1850 &pxor ("xmm6","xmm6");
1851 &movdqa (&QWP(16*4,"esp"),"xmm0");
1852 &pxor ("xmm7","xmm7");
1853 &movdqa (&QWP(16*5,"esp"),"xmm0");
1854 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1855&function_end("aesni_xts_decrypt");
1856}
1857
1858
1859######################################################################
1860# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1861# const AES_KEY *key, unsigned int start_block_num,
1862# unsigned char offset_i[16], const unsigned char L_[][16],
1863# unsigned char checksum[16]);
1864#
1865{
1866# offsets within stack frame
1867my $checksum = 16*6;
1868my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1869
1870# reassigned registers
1871my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1872# $l_, $block, $inp, $key are permanently allocated in registers;
1873# remaining non-volatile ones are offloaded to stack, where they
1874# stay invariant once written.
1875
# aesni_ocb_encrypt, entry and frame setup.  Loads the arguments, carves an
# aligned scratch frame off the stack and, when start_block_num is even,
# encrypts one block up front so that "odd" is always reached with an odd
# block number.
1876&function_begin("aesni_ocb_encrypt");
1877 &mov ($rounds,&wparam(5)); # &offset_i
1878 &mov ($rounds_,&wparam(7)); # &checksum
1879
1880 &mov ($inp,&wparam(0));
1881 &mov ($out,&wparam(1));
1882 &mov ($len,&wparam(2)); # number of blocks
1883 &mov ($key,&wparam(3));
1884 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
1885 &mov ($block,&wparam(4)); # start_block_num
1886 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
1887 &mov ($l_,&wparam(6)); # L_ ($l_ is $rounds_; checksum already loaded)
1888
1889 &mov ($rounds,"esp"); # original %esp, saved at $esp_off below
1890 &sub ("esp",$esp_off+4); # alloca
1891 &and ("esp",-16); # align stack
1892
1893 &sub ($out,$inp); # $out becomes the out-inp distance
1894 &shl ($len,4); # blocks -> bytes
1895 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
1896 &mov (&DWP($out_off,"esp"),$out);
1897 &mov (&DWP($end_off,"esp"),$len);
1898 &mov (&DWP($esp_off,"esp"),$rounds);
1899
1900 &mov ($rounds,&DWP(240,$key));
1901
1902 &test ($block,1);
1903 &jnz (&label("odd")); # odd block number: skip the single-block step
1904
1905 &bsf ($i3,$block); # index of lowest set bit of the block number
1906 &add ($block,1);
1907 &shl ($i3,4); # scale to 16-byte table entries
1908 &movdqu ($inout5,&QWP(0,$l_,$i3));
1909 &mov ($i3,$key); # put aside key
1910
1911 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1912 &lea ($inp,&DWP(16,$inp));
1913
1914 &pxor ($inout5,$rndkey0); # ^ last offset_i
1915 &pxor ($rndkey1,$inout0); # checksum
1916 &pxor ($inout0,$inout5); # ^ offset_i
1917
1918 &movdqa ($inout4,$rndkey1); # park checksum; $rndkey1 is presumably clobbered by the cipher code
1919 if ($inline)
1920 { &aesni_inline_generate1("enc"); }
1921 else
1922 { &call ("_aesni_encrypt1"); }
1923
1924 &xorps ($inout0,$inout5); # ^ offset_i
1925 &movdqa ($rndkey0,$inout5); # pass last offset_i
1926 &movdqa ($rndkey1,$inout4); # pass the checksum
1927
1928 &movups (&QWP(-16,$out,$inp),$inout0); # store output
1929
1930 &mov ($rounds,&DWP(240,$i3));
1931 &mov ($key,$i3); # restore key
1932 &mov ($len,&DWP($end_off,"esp")); # $len was reused as $i3; reload end marker
1933
# From here on $block is odd.  Precompute the "twisted" round counter
# (16 - 16*rounds) and point $key past the key schedule so the round keys
# can be addressed as -48/-32/-16($key,$rounds) in the code below.
1934&set_label("odd");
1935 &shl ($rounds,4);
1936 &mov ($out,16);
1937 &sub ($out,$rounds); # twisted rounds
1938 &mov (&DWP($key_off,"esp"),$key); # keep original key pointer for the tails
1939 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
1940 &mov (&DWP($rounds_off,"esp"),$out);
1941
1942 &cmp ($inp,$len);
1943 &ja (&label("short")); # fewer than six blocks left
1944 &jmp (&label("grandloop"));
1945
# Main loop: encrypt six blocks per iteration.
#
# $block is odd on entry, so the blocks numbered $block, $block+2 and
# $block+4 take L_[0], while the even-numbered ones in between take the
# L_ entry selected by bsf of their block number.  The six offsets are
# chained off the previous offset_i and parked in frame slots 16*0..16*5.
# Plaintext is folded into the checksum before it is whitened.
#
# The loop has to repeat while at least six blocks remain, i.e. while
# $inp <= end-16*6, hence jbe at the bottom.  With jb, an input that left
# exactly six blocks after an iteration dropped into "short", which
# handles at most five, so the final block was never written
# (CVE-2022-2097).
1946&set_label("grandloop",32);
1947 &lea ($i1,&DWP(1,$block));
1948 &lea ($i3,&DWP(3,$block));
1949 &lea ($i5,&DWP(5,$block));
1950 &add ($block,6);
1951 &bsf ($i1,$i1);
1952 &bsf ($i3,$i3);
1953 &bsf ($i5,$i5);
1954 &shl ($i1,4); # scale indices to 16-byte table entries
1955 &shl ($i3,4);
1956 &shl ($i5,4);
1957 &movdqu ($inout0,&QWP(0,$l_));
1958 &movdqu ($inout1,&QWP(0,$l_,$i1));
1959 &mov ($rounds,&DWP($rounds_off,"esp")); # $i1 shared the register; reload twisted rounds
1960 &movdqa ($inout2,$inout0);
1961 &movdqu ($inout3,&QWP(0,$l_,$i3));
1962 &movdqa ($inout4,$inout0);
1963 &movdqu ($inout5,&QWP(0,$l_,$i5));
1964
1965 &pxor ($inout0,$rndkey0); # ^ last offset_i
1966 &pxor ($inout1,$inout0);
1967 &movdqa (&QWP(16*0,"esp"),$inout0);
1968 &pxor ($inout2,$inout1);
1969 &movdqa (&QWP(16*1,"esp"),$inout1);
1970 &pxor ($inout3,$inout2);
1971 &movdqa (&QWP(16*2,"esp"),$inout2);
1972 &pxor ($inout4,$inout3);
1973 &movdqa (&QWP(16*3,"esp"),$inout3);
1974 &pxor ($inout5,$inout4);
1975 &movdqa (&QWP(16*4,"esp"),$inout4);
1976 &movdqa (&QWP(16*5,"esp"),$inout5);
1977
1978 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
1979 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1980 &movdqu ($inout1,&QWP(16*1,$inp));
1981 &movdqu ($inout2,&QWP(16*2,$inp));
1982 &movdqu ($inout3,&QWP(16*3,$inp));
1983 &movdqu ($inout4,&QWP(16*4,$inp));
1984 &movdqu ($inout5,&QWP(16*5,$inp));
1985 &lea ($inp,&DWP(16*6,$inp));
1986
1987 &pxor ($rndkey1,$inout0); # checksum
1988 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
1989 &pxor ($rndkey1,$inout1);
1990 &pxor ($inout1,$rndkey0);
1991 &pxor ($rndkey1,$inout2);
1992 &pxor ($inout2,$rndkey0);
1993 &pxor ($rndkey1,$inout3);
1994 &pxor ($inout3,$rndkey0);
1995 &pxor ($rndkey1,$inout4);
1996 &pxor ($inout4,$rndkey0);
1997 &pxor ($rndkey1,$inout5);
1998 &pxor ($inout5,$rndkey0);
1999 &movdqa (&QWP($checksum,"esp"),$rndkey1); # $rndkey1 is needed for round keys next
2000
2001 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2002 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2003 &pxor ($inout1,&QWP(16*1,"esp"));
2004 &pxor ($inout2,&QWP(16*2,"esp"));
2005 &pxor ($inout3,&QWP(16*3,"esp"));
2006 &pxor ($inout4,&QWP(16*4,"esp"));
2007 &pxor ($inout5,&QWP(16*5,"esp"));
2008
2009 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2010 &aesenc ($inout0,$rndkey1);
2011 &aesenc ($inout1,$rndkey1);
2012 &aesenc ($inout2,$rndkey1);
2013 &aesenc ($inout3,$rndkey1);
2014 &aesenc ($inout4,$rndkey1);
2015 &aesenc ($inout5,$rndkey1);
2016
2017 &mov ($out,&DWP($out_off,"esp")); # $i5 shared the register with $out
2018 &mov ($len,&DWP($end_off,"esp")); # $i3 shared the register with $len
2019 &call ("_aesni_encrypt6_enter"); # defined elsewhere; expected to finish the remaining rounds
2020
2021 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2022 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2023 &pxor ($inout1,&QWP(16*1,"esp"));
2024 &pxor ($inout2,&QWP(16*2,"esp"));
2025 &pxor ($inout3,&QWP(16*3,"esp"));
2026 &pxor ($inout4,&QWP(16*4,"esp"));
2027 &pxor ($inout5,$rndkey0);
2028 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2029
2030 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2031 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2032 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2033 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2034 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2035 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2036 &cmp ($inp,$len); # done yet?
2037 &jbe (&label("grandloop")); # loop while >= 6 blocks remain
2038
# Tail dispatch.  Turns $len back into the number of bytes still to be
# processed and branches to the one/two/three/four handlers; anything
# larger than 16*4 falls through to the code below, which processes five
# blocks through the six-lane cipher path with an all-zero sixth lane.
# $inp is not advanced here, so output goes to 16*k($out,$inp).
2039&set_label("short");
2040 &add ($len,16*6);
2041 &sub ($len,$inp); # remaining bytes
2042 &jz (&label("done"));
2043
2044 &cmp ($len,16*2);
2045 &jb (&label("one"));
2046 &je (&label("two"));
2047
2048 &cmp ($len,16*4);
2049 &jb (&label("three"));
2050 &je (&label("four"));
2051
2052 &lea ($i1,&DWP(1,$block));
2053 &lea ($i3,&DWP(3,$block));
2054 &bsf ($i1,$i1);
2055 &bsf ($i3,$i3);
2056 &shl ($i1,4);
2057 &shl ($i3,4);
2058 &movdqu ($inout0,&QWP(0,$l_));
2059 &movdqu ($inout1,&QWP(0,$l_,$i1));
2060 &mov ($rounds,&DWP($rounds_off,"esp")); # $i1 shared the register; reload twisted rounds
2061 &movdqa ($inout2,$inout0);
2062 &movdqu ($inout3,&QWP(0,$l_,$i3));
2063 &movdqa ($inout4,$inout0);
2064
2065 &pxor ($inout0,$rndkey0); # ^ last offset_i
2066 &pxor ($inout1,$inout0);
2067 &movdqa (&QWP(16*0,"esp"),$inout0);
2068 &pxor ($inout2,$inout1);
2069 &movdqa (&QWP(16*1,"esp"),$inout1);
2070 &pxor ($inout3,$inout2);
2071 &movdqa (&QWP(16*2,"esp"),$inout2);
2072 &pxor ($inout4,$inout3);
2073 &movdqa (&QWP(16*3,"esp"),$inout3);
2074 &pxor ($inout5,$inout4); # result unused: $inout5 is zeroed below
2075 &movdqa (&QWP(16*4,"esp"),$inout4);
2076
2077 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2078 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2079 &movdqu ($inout1,&QWP(16*1,$inp));
2080 &movdqu ($inout2,&QWP(16*2,$inp));
2081 &movdqu ($inout3,&QWP(16*3,$inp));
2082 &movdqu ($inout4,&QWP(16*4,$inp));
2083 &pxor ($inout5,$inout5); # sixth lane carries no data
2084
2085 &pxor ($rndkey1,$inout0); # checksum
2086 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2087 &pxor ($rndkey1,$inout1);
2088 &pxor ($inout1,$rndkey0);
2089 &pxor ($rndkey1,$inout2);
2090 &pxor ($inout2,$rndkey0);
2091 &pxor ($rndkey1,$inout3);
2092 &pxor ($inout3,$rndkey0);
2093 &pxor ($rndkey1,$inout4);
2094 &pxor ($inout4,$rndkey0);
2095 &movdqa (&QWP($checksum,"esp"),$rndkey1);
2096
2097 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2098 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2099 &pxor ($inout1,&QWP(16*1,"esp"));
2100 &pxor ($inout2,&QWP(16*2,"esp"));
2101 &pxor ($inout3,&QWP(16*3,"esp"));
2102 &pxor ($inout4,&QWP(16*4,"esp"));
2103
2104 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2105 &aesenc ($inout0,$rndkey1);
2106 &aesenc ($inout1,$rndkey1);
2107 &aesenc ($inout2,$rndkey1);
2108 &aesenc ($inout3,$rndkey1);
2109 &aesenc ($inout4,$rndkey1);
2110 &aesenc ($inout5,$rndkey1);
2111
2112 &mov ($out,&DWP($out_off,"esp"));
2113 &call ("_aesni_encrypt6_enter"); # defined elsewhere; expected to finish the remaining rounds
2114
2115 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2116 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2117 &pxor ($inout1,&QWP(16*1,"esp"));
2118 &pxor ($inout2,&QWP(16*2,"esp"));
2119 &pxor ($inout3,&QWP(16*3,"esp"));
2120 &pxor ($inout4,$rndkey0);
2121 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2122
2123 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
2124 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2125 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2126 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2127 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2128
2129 &jmp (&label("done"));
2130
# Tail: a single remaining block.  $block is odd here, so the offset is
# advanced with L_[0].  The original key pointer and round count are
# reloaded because the single-block cipher code takes them untwisted.
2131&set_label("one",16);
2132 &movdqu ($inout5,&QWP(0,$l_));
2133 &mov ($key,&DWP($key_off,"esp")); # restore key
2134
2135 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2136 &mov ($rounds,&DWP(240,$key));
2137
2138 &pxor ($inout5,$rndkey0); # ^ last offset_i
2139 &pxor ($rndkey1,$inout0); # checksum
2140 &pxor ($inout0,$inout5); # ^ offset_i
2141
2142 &movdqa ($inout4,$rndkey1); # park checksum across the cipher code
2143 &mov ($out,&DWP($out_off,"esp"));
2144 if ($inline)
2145 { &aesni_inline_generate1("enc"); }
2146 else
2147 { &call ("_aesni_encrypt1"); }
2148
2149 &xorps ($inout0,$inout5); # ^ offset_i
2150 &movdqa ($rndkey0,$inout5); # pass last offset_i
2151 &movdqa ($rndkey1,$inout4); # pass the checksum
2152 &movups (&QWP(0,$out,$inp),$inout0); # store output
2153
2154 &jmp (&label("done"));
2155
# Tail: two remaining blocks.  Offsets are built from L_[0] for the first
# (odd-numbered) block and from the bsf-selected L_ entry for the second.
# The checksum is parked in $inout3, which the two-block cipher routine is
# not given as a data lane here.
#
# Every perlasm call is terminated with ';'.  The movdqa that parks the
# checksum used to lack it, which made the following "&mov(...)" parse as
# the right-hand operand of a bitwise '&' instead of a separate statement.
2156&set_label("two",16);
2157 &lea ($i1,&DWP(1,$block));
2158 &mov ($key,&DWP($key_off,"esp")); # restore key
2159 &bsf ($i1,$i1);
2160 &shl ($i1,4);
2161 &movdqu ($inout4,&QWP(0,$l_));
2162 &movdqu ($inout5,&QWP(0,$l_,$i1));
2163
2164 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2165 &movdqu ($inout1,&QWP(16*1,$inp));
2166 &mov ($rounds,&DWP(240,$key));
2167
2168 &pxor ($inout4,$rndkey0); # ^ last offset_i
2169 &pxor ($inout5,$inout4);
2170
2171 &pxor ($rndkey1,$inout0); # checksum
2172 &pxor ($inout0,$inout4); # ^ offset_i
2173 &pxor ($rndkey1,$inout1);
2174 &pxor ($inout1,$inout5);
2175
2176 &movdqa ($inout3,$rndkey1); # park checksum across the call
2177 &mov ($out,&DWP($out_off,"esp"));
2178 &call ("_aesni_encrypt2");
2179
2180 &xorps ($inout0,$inout4); # ^ offset_i
2181 &xorps ($inout1,$inout5);
2182 &movdqa ($rndkey0,$inout5); # pass last offset_i
2183 &movdqa ($rndkey1,$inout3); # pass the checksum
2184 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2185 &movups (&QWP(16*1,$out,$inp),$inout1);
2186
2187 &jmp (&label("done"));
2188
# Tail: three remaining blocks.  The first and third (odd-numbered) blocks
# use L_[0], the second uses the bsf-selected L_ entry.  The three offsets
# stay in $inout3..$inout5; the checksum is parked in the frame.
2189&set_label("three",16);
2190 &lea ($i1,&DWP(1,$block));
2191 &mov ($key,&DWP($key_off,"esp")); # restore key
2192 &bsf ($i1,$i1);
2193 &shl ($i1,4);
2194 &movdqu ($inout3,&QWP(0,$l_));
2195 &movdqu ($inout4,&QWP(0,$l_,$i1));
2196 &movdqa ($inout5,$inout3);
2197
2198 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2199 &movdqu ($inout1,&QWP(16*1,$inp));
2200 &movdqu ($inout2,&QWP(16*2,$inp));
2201 &mov ($rounds,&DWP(240,$key));
2202
2203 &pxor ($inout3,$rndkey0); # ^ last offset_i
2204 &pxor ($inout4,$inout3);
2205 &pxor ($inout5,$inout4);
2206
2207 &pxor ($rndkey1,$inout0); # checksum
2208 &pxor ($inout0,$inout3); # ^ offset_i
2209 &pxor ($rndkey1,$inout1);
2210 &pxor ($inout1,$inout4);
2211 &pxor ($rndkey1,$inout2);
2212 &pxor ($inout2,$inout5);
2213
2214 &movdqa (&QWP($checksum,"esp"),$rndkey1); # park checksum across the call
2215 &mov ($out,&DWP($out_off,"esp"));
2216 &call ("_aesni_encrypt3");
2217
2218 &xorps ($inout0,$inout3); # ^ offset_i
2219 &xorps ($inout1,$inout4);
2220 &xorps ($inout2,$inout5);
2221 &movdqa ($rndkey0,$inout5); # pass last offset_i
2222 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2223 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2224 &movups (&QWP(16*1,$out,$inp),$inout1);
2225 &movups (&QWP(16*2,$out,$inp),$inout2);
2226
2227 &jmp (&label("done"));
2228
# Tail: four remaining blocks.  The first two offsets are spilled to frame
# slots 16*0 and 16*1 because $inout2/$inout3 are needed for data; the last
# two offsets stay in $inout4/$inout5.  Falls through into "done".
#
# Every perlasm call is terminated with ';'.  The movdqa that parks the
# checksum used to lack it, which made the following "&mov(...)" parse as
# the right-hand operand of a bitwise '&' instead of a separate statement.
2229&set_label("four",16);
2230 &lea ($i1,&DWP(1,$block));
2231 &lea ($i3,&DWP(3,$block));
2232 &bsf ($i1,$i1);
2233 &bsf ($i3,$i3);
2234 &mov ($key,&DWP($key_off,"esp")); # restore key
2235 &shl ($i1,4);
2236 &shl ($i3,4);
2237 &movdqu ($inout2,&QWP(0,$l_));
2238 &movdqu ($inout3,&QWP(0,$l_,$i1));
2239 &movdqa ($inout4,$inout2);
2240 &movdqu ($inout5,&QWP(0,$l_,$i3));
2241
2242 &pxor ($inout2,$rndkey0); # ^ last offset_i
2243 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2244 &pxor ($inout3,$inout2);
2245 &movdqu ($inout1,&QWP(16*1,$inp));
2246 &pxor ($inout4,$inout3);
2247 &movdqa (&QWP(16*0,"esp"),$inout2);
2248 &pxor ($inout5,$inout4);
2249 &movdqa (&QWP(16*1,"esp"),$inout3);
2250 &movdqu ($inout2,&QWP(16*2,$inp));
2251 &movdqu ($inout3,&QWP(16*3,$inp));
2252 &mov ($rounds,&DWP(240,$key));
2253
2254 &pxor ($rndkey1,$inout0); # checksum
2255 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2256 &pxor ($rndkey1,$inout1);
2257 &pxor ($inout1,&QWP(16*1,"esp"));
2258 &pxor ($rndkey1,$inout2);
2259 &pxor ($inout2,$inout4);
2260 &pxor ($rndkey1,$inout3);
2261 &pxor ($inout3,$inout5);
2262
2263 &movdqa (&QWP($checksum,"esp"),$rndkey1); # park checksum across the call
2264 &mov ($out,&DWP($out_off,"esp"));
2265 &call ("_aesni_encrypt4");
2266
2267 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2268 &xorps ($inout1,&QWP(16*1,"esp"));
2269 &xorps ($inout2,$inout4);
2270 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2271 &xorps ($inout3,$inout5);
2272 &movups (&QWP(16*1,$out,$inp),$inout1);
2273 &movdqa ($rndkey0,$inout5); # pass last offset_i
2274 &movups (&QWP(16*2,$out,$inp),$inout2);
2275 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2276 &movups (&QWP(16*3,$out,$inp),$inout3);
2277
# Common exit.  Zeroes the data registers and frame slots 16*0..16*6
# (offsets and checksum), restores the original %esp, then writes the
# final offset_i ($rndkey0) and checksum ($rndkey1) back through the
# caller's pointers and clears those two registers as well.
2278&set_label("done");
2279 &mov ($key,&DWP($esp_off,"esp")); # original %esp saved at entry
2280 &pxor ($inout0,$inout0); # clear register bank
2281 &pxor ($inout1,$inout1);
2282 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2283 &pxor ($inout2,$inout2);
2284 &movdqa (&QWP(16*1,"esp"),$inout0);
2285 &pxor ($inout3,$inout3);
2286 &movdqa (&QWP(16*2,"esp"),$inout0);
2287 &pxor ($inout4,$inout4);
2288 &movdqa (&QWP(16*3,"esp"),$inout0);
2289 &pxor ($inout5,$inout5);
2290 &movdqa (&QWP(16*4,"esp"),$inout0);
2291 &movdqa (&QWP(16*5,"esp"),$inout0);
2292 &movdqa (&QWP(16*6,"esp"),$inout0);
2293
2294 &lea ("esp",&DWP(0,$key)); # restore %esp
2295 &mov ($rounds,&wparam(5)); # &offset_i
2296 &mov ($rounds_,&wparam(7)); # &checksum
2297 &movdqu (&QWP(0,$rounds),$rndkey0); # return updated offset_i
2298 &pxor ($rndkey0,$rndkey0);
2299 &movdqu (&QWP(0,$rounds_),$rndkey1); # return updated checksum
2300 &pxor ($rndkey1,$rndkey1);
2301&function_end("aesni_ocb_encrypt");
2302
# aesni_ocb_decrypt, entry and frame setup.  Mirrors the encrypt entry:
# loads the arguments, builds the aligned frame and, when start_block_num
# is even, decrypts one block up front so that "odd" is always reached
# with an odd block number.  Unlike encryption, the checksum is updated
# with the cipher output (after the offset has been removed).
2303&function_begin("aesni_ocb_decrypt");
2304 &mov ($rounds,&wparam(5)); # &offset_i
2305 &mov ($rounds_,&wparam(7)); # &checksum
2306
2307 &mov ($inp,&wparam(0));
2308 &mov ($out,&wparam(1));
2309 &mov ($len,&wparam(2)); # number of blocks
2310 &mov ($key,&wparam(3));
2311 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
2312 &mov ($block,&wparam(4)); # start_block_num
2313 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
2314 &mov ($l_,&wparam(6)); # L_ ($l_ is $rounds_; checksum already loaded)
2315
2316 &mov ($rounds,"esp"); # original %esp, saved at $esp_off below
2317 &sub ("esp",$esp_off+4); # alloca
2318 &and ("esp",-16); # align stack
2319
2320 &sub ($out,$inp); # $out becomes the out-inp distance
2321 &shl ($len,4); # blocks -> bytes
2322 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
2323 &mov (&DWP($out_off,"esp"),$out);
2324 &mov (&DWP($end_off,"esp"),$len);
2325 &mov (&DWP($esp_off,"esp"),$rounds);
2326
2327 &mov ($rounds,&DWP(240,$key));
2328
2329 &test ($block,1);
2330 &jnz (&label("odd")); # odd block number: skip the single-block step
2331
2332 &bsf ($i3,$block); # index of lowest set bit of the block number
2333 &add ($block,1);
2334 &shl ($i3,4); # scale to 16-byte table entries
2335 &movdqu ($inout5,&QWP(0,$l_,$i3));
2336 &mov ($i3,$key); # put aside key
2337
2338 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2339 &lea ($inp,&DWP(16,$inp));
2340
2341 &pxor ($inout5,$rndkey0); # ^ last offset_i
2342 &pxor ($inout0,$inout5); # ^ offset_i
2343
2344 &movdqa ($inout4,$rndkey1); # park checksum; $rndkey1 is presumably clobbered by the cipher code
2345 if ($inline)
2346 { &aesni_inline_generate1("dec"); }
2347 else
2348 { &call ("_aesni_decrypt1"); }
2349
2350 &xorps ($inout0,$inout5); # ^ offset_i
2351 &movaps ($rndkey1,$inout4); # pass the checksum
2352 &movdqa ($rndkey0,$inout5); # pass last offset_i
2353 &xorps ($rndkey1,$inout0); # checksum
2354 &movups (&QWP(-16,$out,$inp),$inout0); # store output
2355
2356 &mov ($rounds,&DWP(240,$i3));
2357 &mov ($key,$i3); # restore key
2358 &mov ($len,&DWP($end_off,"esp")); # $len was reused as $i3; reload end marker
2359
# From here on $block is odd.  Precompute the "twisted" round counter
# (16 - 16*rounds) and point $key past the key schedule so the round keys
# can be addressed as -48/-32/-16($key,$rounds) in the code below.
2360&set_label("odd");
2361 &shl ($rounds,4);
2362 &mov ($out,16);
2363 &sub ($out,$rounds); # twisted rounds
2364 &mov (&DWP($key_off,"esp"),$key); # keep original key pointer for the tails
2365 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
2366 &mov (&DWP($rounds_off,"esp"),$out);
2367
2368 &cmp ($inp,$len);
2369 &ja (&label("short")); # fewer than six blocks left
2370 &jmp (&label("grandloop"));
2371
# Main loop: decrypt six blocks per iteration.
#
# $block is odd on entry, so the blocks numbered $block, $block+2 and
# $block+4 take L_[0], while the even-numbered ones in between take the
# L_ entry selected by bsf of their block number.  The six offsets are
# chained off the previous offset_i and parked in frame slots 16*0..16*5.
# The checksum is parked in the frame during the cipher rounds and is
# updated with the recovered output afterwards.
#
# The loop has to repeat while at least six blocks remain, i.e. while
# $inp <= end-16*6, hence jbe at the bottom.  With jb, an input that left
# exactly six blocks after an iteration dropped into "short", which
# handles at most five, so the final block was never written
# (CVE-2022-2097).
2372&set_label("grandloop",32);
2373 &lea ($i1,&DWP(1,$block));
2374 &lea ($i3,&DWP(3,$block));
2375 &lea ($i5,&DWP(5,$block));
2376 &add ($block,6);
2377 &bsf ($i1,$i1);
2378 &bsf ($i3,$i3);
2379 &bsf ($i5,$i5);
2380 &shl ($i1,4); # scale indices to 16-byte table entries
2381 &shl ($i3,4);
2382 &shl ($i5,4);
2383 &movdqu ($inout0,&QWP(0,$l_));
2384 &movdqu ($inout1,&QWP(0,$l_,$i1));
2385 &mov ($rounds,&DWP($rounds_off,"esp")); # $i1 shared the register; reload twisted rounds
2386 &movdqa ($inout2,$inout0);
2387 &movdqu ($inout3,&QWP(0,$l_,$i3));
2388 &movdqa ($inout4,$inout0);
2389 &movdqu ($inout5,&QWP(0,$l_,$i5));
2390
2391 &pxor ($inout0,$rndkey0); # ^ last offset_i
2392 &pxor ($inout1,$inout0);
2393 &movdqa (&QWP(16*0,"esp"),$inout0);
2394 &pxor ($inout2,$inout1);
2395 &movdqa (&QWP(16*1,"esp"),$inout1);
2396 &pxor ($inout3,$inout2);
2397 &movdqa (&QWP(16*2,"esp"),$inout2);
2398 &pxor ($inout4,$inout3);
2399 &movdqa (&QWP(16*3,"esp"),$inout3);
2400 &pxor ($inout5,$inout4);
2401 &movdqa (&QWP(16*4,"esp"),$inout4);
2402 &movdqa (&QWP(16*5,"esp"),$inout5);
2403
2404 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2405 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2406 &movdqu ($inout1,&QWP(16*1,$inp));
2407 &movdqu ($inout2,&QWP(16*2,$inp));
2408 &movdqu ($inout3,&QWP(16*3,$inp));
2409 &movdqu ($inout4,&QWP(16*4,$inp));
2410 &movdqu ($inout5,&QWP(16*5,$inp));
2411 &lea ($inp,&DWP(16*6,$inp));
2412
2413 &movdqa (&QWP($checksum,"esp"),$rndkey1); # $rndkey1 is needed for round keys next
2414 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2415 &pxor ($inout1,$rndkey0);
2416 &pxor ($inout2,$rndkey0);
2417 &pxor ($inout3,$rndkey0);
2418 &pxor ($inout4,$rndkey0);
2419 &pxor ($inout5,$rndkey0);
2420
2421 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2422 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2423 &pxor ($inout1,&QWP(16*1,"esp"));
2424 &pxor ($inout2,&QWP(16*2,"esp"));
2425 &pxor ($inout3,&QWP(16*3,"esp"));
2426 &pxor ($inout4,&QWP(16*4,"esp"));
2427 &pxor ($inout5,&QWP(16*5,"esp"));
2428
2429 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2430 &aesdec ($inout0,$rndkey1);
2431 &aesdec ($inout1,$rndkey1);
2432 &aesdec ($inout2,$rndkey1);
2433 &aesdec ($inout3,$rndkey1);
2434 &aesdec ($inout4,$rndkey1);
2435 &aesdec ($inout5,$rndkey1);
2436
2437 &mov ($out,&DWP($out_off,"esp")); # $i5 shared the register with $out
2438 &mov ($len,&DWP($end_off,"esp")); # $i3 shared the register with $len
2439 &call ("_aesni_decrypt6_enter"); # defined elsewhere; expected to finish the remaining rounds
2440
2441 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2442 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2443 &movdqa ($rndkey1,&QWP($checksum,"esp"));
2444 &pxor ($inout1,&QWP(16*1,"esp"));
2445 &pxor ($inout2,&QWP(16*2,"esp"));
2446 &pxor ($inout3,&QWP(16*3,"esp"));
2447 &pxor ($inout4,&QWP(16*4,"esp"));
2448 &pxor ($inout5,$rndkey0);
2449
2450 &pxor ($rndkey1,$inout0); # checksum
2451 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2452 &pxor ($rndkey1,$inout1);
2453 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2454 &pxor ($rndkey1,$inout2);
2455 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2456 &pxor ($rndkey1,$inout3);
2457 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2458 &pxor ($rndkey1,$inout4);
2459 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2460 &pxor ($rndkey1,$inout5);
2461 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2462 &cmp ($inp,$len); # done yet?
2463 &jbe (&label("grandloop")); # loop while >= 6 blocks remain
2464
# Tail dispatch.  Turns $len back into the number of bytes still to be
# processed and branches to the one/two/three/four handlers; anything
# larger than 16*4 falls through to the code below, which processes five
# blocks through the six-lane cipher path with an all-zero sixth lane.
# $inp is not advanced here, so output goes to 16*k($out,$inp).
2465&set_label("short");
2466 &add ($len,16*6);
2467 &sub ($len,$inp); # remaining bytes
2468 &jz (&label("done"));
2469
2470 &cmp ($len,16*2);
2471 &jb (&label("one"));
2472 &je (&label("two"));
2473
2474 &cmp ($len,16*4);
2475 &jb (&label("three"));
2476 &je (&label("four"));
2477
2478 &lea ($i1,&DWP(1,$block));
2479 &lea ($i3,&DWP(3,$block));
2480 &bsf ($i1,$i1);
2481 &bsf ($i3,$i3);
2482 &shl ($i1,4);
2483 &shl ($i3,4);
2484 &movdqu ($inout0,&QWP(0,$l_));
2485 &movdqu ($inout1,&QWP(0,$l_,$i1));
2486 &mov ($rounds,&DWP($rounds_off,"esp")); # $i1 shared the register; reload twisted rounds
2487 &movdqa ($inout2,$inout0);
2488 &movdqu ($inout3,&QWP(0,$l_,$i3));
2489 &movdqa ($inout4,$inout0);
2490
2491 &pxor ($inout0,$rndkey0); # ^ last offset_i
2492 &pxor ($inout1,$inout0);
2493 &movdqa (&QWP(16*0,"esp"),$inout0);
2494 &pxor ($inout2,$inout1);
2495 &movdqa (&QWP(16*1,"esp"),$inout1);
2496 &pxor ($inout3,$inout2);
2497 &movdqa (&QWP(16*2,"esp"),$inout2);
2498 &pxor ($inout4,$inout3);
2499 &movdqa (&QWP(16*3,"esp"),$inout3);
2500 &pxor ($inout5,$inout4); # result unused: $inout5 is zeroed below
2501 &movdqa (&QWP(16*4,"esp"),$inout4);
2502
2503 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2504 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2505 &movdqu ($inout1,&QWP(16*1,$inp));
2506 &movdqu ($inout2,&QWP(16*2,$inp));
2507 &movdqu ($inout3,&QWP(16*3,$inp));
2508 &movdqu ($inout4,&QWP(16*4,$inp));
2509 &pxor ($inout5,$inout5); # sixth lane carries no data
2510
2511 &movdqa (&QWP($checksum,"esp"),$rndkey1); # park checksum during the cipher rounds
2512 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2513 &pxor ($inout1,$rndkey0);
2514 &pxor ($inout2,$rndkey0);
2515 &pxor ($inout3,$rndkey0);
2516 &pxor ($inout4,$rndkey0);
2517
2518 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2519 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2520 &pxor ($inout1,&QWP(16*1,"esp"));
2521 &pxor ($inout2,&QWP(16*2,"esp"));
2522 &pxor ($inout3,&QWP(16*3,"esp"));
2523 &pxor ($inout4,&QWP(16*4,"esp"));
2524
2525 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2526 &aesdec ($inout0,$rndkey1);
2527 &aesdec ($inout1,$rndkey1);
2528 &aesdec ($inout2,$rndkey1);
2529 &aesdec ($inout3,$rndkey1);
2530 &aesdec ($inout4,$rndkey1);
2531 &aesdec ($inout5,$rndkey1);
2532
2533 &mov ($out,&DWP($out_off,"esp"));
2534 &call ("_aesni_decrypt6_enter"); # defined elsewhere; expected to finish the remaining rounds
2535
2536 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2537 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2538 &movdqa ($rndkey1,&QWP($checksum,"esp"));
2539 &pxor ($inout1,&QWP(16*1,"esp"));
2540 &pxor ($inout2,&QWP(16*2,"esp"));
2541 &pxor ($inout3,&QWP(16*3,"esp"));
2542 &pxor ($inout4,$rndkey0);
2543
2544 &pxor ($rndkey1,$inout0); # checksum
2545 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
2546 &pxor ($rndkey1,$inout1);
2547 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2548 &pxor ($rndkey1,$inout2);
2549 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2550 &pxor ($rndkey1,$inout3);
2551 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2552 &pxor ($rndkey1,$inout4);
2553 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2554
2555 &jmp (&label("done"));
2556
# Tail: a single remaining block.  $block is odd here, so the offset is
# advanced with L_[0].  The original key pointer and round count are
# reloaded because the single-block cipher code takes them untwisted.
# The checksum is updated with the decrypted output.
2557&set_label("one",16);
2558 &movdqu ($inout5,&QWP(0,$l_));
2559 &mov ($key,&DWP($key_off,"esp")); # restore key
2560
2561 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2562 &mov ($rounds,&DWP(240,$key));
2563
2564 &pxor ($inout5,$rndkey0); # ^ last offset_i
2565 &pxor ($inout0,$inout5); # ^ offset_i
2566
2567 &movdqa ($inout4,$rndkey1); # park checksum across the cipher code
2568 &mov ($out,&DWP($out_off,"esp"));
2569 if ($inline)
2570 { &aesni_inline_generate1("dec"); }
2571 else
2572 { &call ("_aesni_decrypt1"); }
2573
2574 &xorps ($inout0,$inout5); # ^ offset_i
2575 &movaps ($rndkey1,$inout4); # pass the checksum
2576 &movdqa ($rndkey0,$inout5); # pass last offset_i
2577 &xorps ($rndkey1,$inout0); # checksum
2578 &movups (&QWP(0,$out,$inp),$inout0); # store output
2579
2580 &jmp (&label("done"));
2581
# Tail: two remaining blocks.  Offsets are built from L_[0] for the first
# (odd-numbered) block and from the bsf-selected L_ entry for the second.
# The checksum is parked in $inout3 across the call and is updated with
# both decrypted blocks afterwards.
2582&set_label("two",16);
2583 &lea ($i1,&DWP(1,$block));
2584 &mov ($key,&DWP($key_off,"esp")); # restore key
2585 &bsf ($i1,$i1);
2586 &shl ($i1,4);
2587 &movdqu ($inout4,&QWP(0,$l_));
2588 &movdqu ($inout5,&QWP(0,$l_,$i1));
2589
2590 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2591 &movdqu ($inout1,&QWP(16*1,$inp));
2592 &mov ($rounds,&DWP(240,$key));
2593
2594 &movdqa ($inout3,$rndkey1); # park checksum across the call
2595 &pxor ($inout4,$rndkey0); # ^ last offset_i
2596 &pxor ($inout5,$inout4);
2597
2598 &pxor ($inout0,$inout4); # ^ offset_i
2599 &pxor ($inout1,$inout5);
2600
2601 &mov ($out,&DWP($out_off,"esp"));
2602 &call ("_aesni_decrypt2");
2603
2604 &xorps ($inout0,$inout4); # ^ offset_i
2605 &xorps ($inout1,$inout5);
2606 &movdqa ($rndkey0,$inout5); # pass last offset_i
2607 &xorps ($inout3,$inout0); # checksum
2608 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2609 &xorps ($inout3,$inout1);
2610 &movups (&QWP(16*1,$out,$inp),$inout1);
2611 &movaps ($rndkey1,$inout3); # pass the checksum
2612
2613 &jmp (&label("done"));
2614
# Tail: three remaining blocks.  The first and third (odd-numbered) blocks
# use L_[0], the second uses the bsf-selected L_ entry.  The three offsets
# stay in $inout3..$inout5; the checksum is parked in the frame across the
# call and updated with the decrypted blocks afterwards.
2615&set_label("three",16);
2616 &lea ($i1,&DWP(1,$block));
2617 &mov ($key,&DWP($key_off,"esp")); # restore key
2618 &bsf ($i1,$i1);
2619 &shl ($i1,4);
2620 &movdqu ($inout3,&QWP(0,$l_));
2621 &movdqu ($inout4,&QWP(0,$l_,$i1));
2622 &movdqa ($inout5,$inout3);
2623
2624 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2625 &movdqu ($inout1,&QWP(16*1,$inp));
2626 &movdqu ($inout2,&QWP(16*2,$inp));
2627 &mov ($rounds,&DWP(240,$key));
2628
2629 &movdqa (&QWP($checksum,"esp"),$rndkey1); # park checksum across the call
2630 &pxor ($inout3,$rndkey0); # ^ last offset_i
2631 &pxor ($inout4,$inout3);
2632 &pxor ($inout5,$inout4);
2633
2634 &pxor ($inout0,$inout3); # ^ offset_i
2635 &pxor ($inout1,$inout4);
2636 &pxor ($inout2,$inout5);
2637
2638 &mov ($out,&DWP($out_off,"esp"));
2639 &call ("_aesni_decrypt3");
2640
2641 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2642 &xorps ($inout0,$inout3); # ^ offset_i
2643 &xorps ($inout1,$inout4);
2644 &xorps ($inout2,$inout5);
2645 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2646 &pxor ($rndkey1,$inout0); # checksum
2647 &movdqa ($rndkey0,$inout5); # pass last offset_i
2648 &movups (&QWP(16*1,$out,$inp),$inout1);
2649 &pxor ($rndkey1,$inout1);
2650 &movups (&QWP(16*2,$out,$inp),$inout2);
2651 &pxor ($rndkey1,$inout2);
2652
2653 &jmp (&label("done"));
2654
# Tail: four remaining blocks.  The first two offsets are spilled to frame
# slots 16*0 and 16*1 because $inout2/$inout3 are needed for data; the last
# two offsets stay in $inout4/$inout5.  The checksum is parked in the frame
# across the call and updated with the decrypted blocks afterwards.
# Falls through into "done".
2655&set_label("four",16);
2656 &lea ($i1,&DWP(1,$block));
2657 &lea ($i3,&DWP(3,$block));
2658 &bsf ($i1,$i1);
2659 &bsf ($i3,$i3);
2660 &mov ($key,&DWP($key_off,"esp")); # restore key
2661 &shl ($i1,4);
2662 &shl ($i3,4);
2663 &movdqu ($inout2,&QWP(0,$l_));
2664 &movdqu ($inout3,&QWP(0,$l_,$i1));
2665 &movdqa ($inout4,$inout2);
2666 &movdqu ($inout5,&QWP(0,$l_,$i3));
2667
2668 &pxor ($inout2,$rndkey0); # ^ last offset_i
2669 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2670 &pxor ($inout3,$inout2);
2671 &movdqu ($inout1,&QWP(16*1,$inp));
2672 &pxor ($inout4,$inout3);
2673 &movdqa (&QWP(16*0,"esp"),$inout2);
2674 &pxor ($inout5,$inout4);
2675 &movdqa (&QWP(16*1,"esp"),$inout3);
2676 &movdqu ($inout2,&QWP(16*2,$inp));
2677 &movdqu ($inout3,&QWP(16*3,$inp));
2678 &mov ($rounds,&DWP(240,$key));
2679
2680 &movdqa (&QWP($checksum,"esp"),$rndkey1); # park checksum across the call
2681 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2682 &pxor ($inout1,&QWP(16*1,"esp"));
2683 &pxor ($inout2,$inout4);
2684 &pxor ($inout3,$inout5);
2685
2686 &mov ($out,&DWP($out_off,"esp"));
2687 &call ("_aesni_decrypt4");
2688
2689 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2690 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2691 &xorps ($inout1,&QWP(16*1,"esp"));
2692 &xorps ($inout2,$inout4);
2693 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2694 &pxor ($rndkey1,$inout0); # checksum
2695 &xorps ($inout3,$inout5);
2696 &movups (&QWP(16*1,$out,$inp),$inout1);
2697 &pxor ($rndkey1,$inout1);
2698 &movdqa ($rndkey0,$inout5); # pass last offset_i
2699 &movups (&QWP(16*2,$out,$inp),$inout2);
2700 &pxor ($rndkey1,$inout2);
2701 &movups (&QWP(16*3,$out,$inp),$inout3);
2702 &pxor ($rndkey1,$inout3);
2703
# Common exit.  Zeroes the data registers and frame slots 16*0..16*6
# (offsets and checksum), restores the original %esp, then writes the
# final offset_i ($rndkey0) and checksum ($rndkey1) back through the
# caller's pointers and clears those two registers as well.
2704&set_label("done");
2705 &mov ($key,&DWP($esp_off,"esp")); # original %esp saved at entry
2706 &pxor ($inout0,$inout0); # clear register bank
2707 &pxor ($inout1,$inout1);
2708 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2709 &pxor ($inout2,$inout2);
2710 &movdqa (&QWP(16*1,"esp"),$inout0);
2711 &pxor ($inout3,$inout3);
2712 &movdqa (&QWP(16*2,"esp"),$inout0);
2713 &pxor ($inout4,$inout4);
2714 &movdqa (&QWP(16*3,"esp"),$inout0);
2715 &pxor ($inout5,$inout5);
2716 &movdqa (&QWP(16*4,"esp"),$inout0);
2717 &movdqa (&QWP(16*5,"esp"),$inout0);
2718 &movdqa (&QWP(16*6,"esp"),$inout0);
2719
2720 &lea ("esp",&DWP(0,$key)); # restore %esp
2721 &mov ($rounds,&wparam(5)); # &offset_i
2722 &mov ($rounds_,&wparam(7)); # &checksum
2723 &movdqu (&QWP(0,$rounds),$rndkey0); # return updated offset_i
2724 &pxor ($rndkey0,$rndkey0);
2725 &movdqu (&QWP(0,$rounds_),$rndkey1); # return updated checksum
2726 &pxor ($rndkey1,$rndkey1);
2727&function_end("aesni_ocb_decrypt");
2728}
2729}
2730
2731
2732######################################################################
2733# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2734# size_t length, const AES_KEY *key,
2735# unsigned char *ivp,const int enc);
2736&function_begin("${PREFIX}_cbc_encrypt");
2737 &mov ($inp,&wparam(0));
2738 &mov ($rounds_,"esp");
2739 &mov ($out,&wparam(1));
2740 &sub ($rounds_,24);
2741 &mov ($len,&wparam(2));
2742 &and ($rounds_,-16);
2743 &mov ($key,&wparam(3));
2744 &mov ($key_,&wparam(4));
2745 &test ($len,$len);
2746 &jz (&label("cbc_abort"));
2747
2748 &cmp (&wparam(5),0);
2749 &xchg ($rounds_,"esp"); # alloca
2750 &movups ($ivec,&QWP(0,$key_)); # load IV
2751 &mov ($rounds,&DWP(240,$key));
2752 &mov ($key_,$key); # backup $key
2753 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
2754 &mov ($rounds_,$rounds); # backup $rounds
2755 &je (&label("cbc_decrypt"));
2756
2757 &movaps ($inout0,$ivec);
2758 &cmp ($len,16);
2759 &jb (&label("cbc_enc_tail"));
2760 &sub ($len,16);
2761 &jmp (&label("cbc_enc_loop"));
2762
2763&set_label("cbc_enc_loop",16);
2764 &movups ($ivec,&QWP(0,$inp)); # input actually
2765 &lea ($inp,&DWP(16,$inp));
2766 if ($inline)
2767 { &aesni_inline_generate1("enc",$inout0,$ivec); }
2768 else
2769 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
2770 &mov ($rounds,$rounds_); # restore $rounds
2771 &mov ($key,$key_); # restore $key
2772 &movups (&QWP(0,$out),$inout0); # store output
2773 &lea ($out,&DWP(16,$out));
2774 &sub ($len,16);
2775 &jnc (&label("cbc_enc_loop"));
2776 &add ($len,16);
2777 &jnz (&label("cbc_enc_tail"));
2778 &movaps ($ivec,$inout0);
2779 &pxor ($inout0,$inout0);
2780 &jmp (&label("cbc_ret"));
2781
2782&set_label("cbc_enc_tail");
2783 &mov ("ecx",$len); # zaps $rounds
2784 &data_word(0xA4F3F689); # rep movsb
2785 &mov ("ecx",16); # zero tail
2786 &sub ("ecx",$len);
2787 &xor ("eax","eax"); # zaps $len
2788 &data_word(0xAAF3F689); # rep stosb
2789 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
2790 &mov ($rounds,$rounds_); # restore $rounds
2791 &mov ($inp,$out); # $inp and $out are the same
2792 &mov ($key,$key_); # restore $key
2793 &jmp (&label("cbc_enc_loop"));
2794######################################################################
2795&set_label("cbc_decrypt",16);
2796 &cmp ($len,0x50);
2797 &jbe (&label("cbc_dec_tail"));
2798 &movaps (&QWP(0,"esp"),$ivec); # save IV
2799 &sub ($len,0x50);
2800 &jmp (&label("cbc_dec_loop6_enter"));
2801
2802&set_label("cbc_dec_loop6",16);
2803 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
2804 &movups (&QWP(0,$out),$inout5);
2805 &lea ($out,&DWP(0x10,$out));
2806&set_label("cbc_dec_loop6_enter");
2807 &movdqu ($inout0,&QWP(0,$inp));
2808 &movdqu ($inout1,&QWP(0x10,$inp));
2809 &movdqu ($inout2,&QWP(0x20,$inp));
2810 &movdqu ($inout3,&QWP(0x30,$inp));
2811 &movdqu ($inout4,&QWP(0x40,$inp));
2812 &movdqu ($inout5,&QWP(0x50,$inp));
2813
2814 &call ("_aesni_decrypt6");
2815
2816 &movups ($rndkey1,&QWP(0,$inp));
2817 &movups ($rndkey0,&QWP(0x10,$inp));
2818 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
2819 &xorps ($inout1,$rndkey1);
2820 &movups ($rndkey1,&QWP(0x20,$inp));
2821 &xorps ($inout2,$rndkey0);
2822 &movups ($rndkey0,&QWP(0x30,$inp));
2823 &xorps ($inout3,$rndkey1);
2824 &movups ($rndkey1,&QWP(0x40,$inp));
2825 &xorps ($inout4,$rndkey0);
2826 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
2827 &xorps ($inout5,$rndkey1);
2828 &movups (&QWP(0,$out),$inout0);
2829 &movups (&QWP(0x10,$out),$inout1);
2830 &lea ($inp,&DWP(0x60,$inp));
2831 &movups (&QWP(0x20,$out),$inout2);
2832 &mov ($rounds,$rounds_); # restore $rounds
2833 &movups (&QWP(0x30,$out),$inout3);
2834 &mov ($key,$key_); # restore $key
2835 &movups (&QWP(0x40,$out),$inout4);
2836 &lea ($out,&DWP(0x50,$out));
2837 &sub ($len,0x60);
2838 &ja (&label("cbc_dec_loop6"));
2839
2840 &movaps ($inout0,$inout5);
2841 &movaps ($ivec,$rndkey0);
2842 &add ($len,0x50);
2843 &jle (&label("cbc_dec_clear_tail_collected"));
2844 &movups (&QWP(0,$out),$inout0);
2845 &lea ($out,&DWP(0x10,$out));
2846&set_label("cbc_dec_tail");
2847 &movups ($inout0,&QWP(0,$inp));
2848 &movaps ($in0,$inout0);
2849 &cmp ($len,0x10);
2850 &jbe (&label("cbc_dec_one"));
2851
2852 &movups ($inout1,&QWP(0x10,$inp));
2853 &movaps ($in1,$inout1);
2854 &cmp ($len,0x20);
2855 &jbe (&label("cbc_dec_two"));
2856
2857 &movups ($inout2,&QWP(0x20,$inp));
2858 &cmp ($len,0x30);
2859 &jbe (&label("cbc_dec_three"));
2860
2861 &movups ($inout3,&QWP(0x30,$inp));
2862 &cmp ($len,0x40);
2863 &jbe (&label("cbc_dec_four"));
2864
2865 &movups ($inout4,&QWP(0x40,$inp));
2866 &movaps (&QWP(0,"esp"),$ivec); # save IV
2867 &movups ($inout0,&QWP(0,$inp));
2868 &xorps ($inout5,$inout5);
2869 &call ("_aesni_decrypt6");
2870 &movups ($rndkey1,&QWP(0,$inp));
2871 &movups ($rndkey0,&QWP(0x10,$inp));
2872 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
2873 &xorps ($inout1,$rndkey1);
2874 &movups ($rndkey1,&QWP(0x20,$inp));
2875 &xorps ($inout2,$rndkey0);
2876 &movups ($rndkey0,&QWP(0x30,$inp));
2877 &xorps ($inout3,$rndkey1);
2878 &movups ($ivec,&QWP(0x40,$inp)); # IV
2879 &xorps ($inout4,$rndkey0);
2880 &movups (&QWP(0,$out),$inout0);
2881 &movups (&QWP(0x10,$out),$inout1);
2882 &pxor ($inout1,$inout1);
2883 &movups (&QWP(0x20,$out),$inout2);
2884 &pxor ($inout2,$inout2);
2885 &movups (&QWP(0x30,$out),$inout3);
2886 &pxor ($inout3,$inout3);
2887 &lea ($out,&DWP(0x40,$out));
2888 &movaps ($inout0,$inout4);
2889 &pxor ($inout4,$inout4);
2890 &sub ($len,0x50);
2891 &jmp (&label("cbc_dec_tail_collected"));
2892
2893&set_label("cbc_dec_one",16);
2894 if ($inline)
2895 { &aesni_inline_generate1("dec"); }
2896 else
2897 { &call ("_aesni_decrypt1"); }
2898 &xorps ($inout0,$ivec);
2899 &movaps ($ivec,$in0);
2900 &sub ($len,0x10);
2901 &jmp (&label("cbc_dec_tail_collected"));
2902
2903&set_label("cbc_dec_two",16);
2904 &call ("_aesni_decrypt2");
2905 &xorps ($inout0,$ivec);
2906 &xorps ($inout1,$in0);
2907 &movups (&QWP(0,$out),$inout0);
2908 &movaps ($inout0,$inout1);
2909 &pxor ($inout1,$inout1);
2910 &lea ($out,&DWP(0x10,$out));
2911 &movaps ($ivec,$in1);
2912 &sub ($len,0x20);
2913 &jmp (&label("cbc_dec_tail_collected"));
2914
2915&set_label("cbc_dec_three",16);
2916 &call ("_aesni_decrypt3");
2917 &xorps ($inout0,$ivec);
2918 &xorps ($inout1,$in0);
2919 &xorps ($inout2,$in1);
2920 &movups (&QWP(0,$out),$inout0);
2921 &movaps ($inout0,$inout2);
2922 &pxor ($inout2,$inout2);
2923 &movups (&QWP(0x10,$out),$inout1);
2924 &pxor ($inout1,$inout1);
2925 &lea ($out,&DWP(0x20,$out));
2926 &movups ($ivec,&QWP(0x20,$inp));
2927 &sub ($len,0x30);
2928 &jmp (&label("cbc_dec_tail_collected"));
2929
2930&set_label("cbc_dec_four",16);
2931 &call ("_aesni_decrypt4");
2932 &movups ($rndkey1,&QWP(0x10,$inp));
2933 &movups ($rndkey0,&QWP(0x20,$inp));
2934 &xorps ($inout0,$ivec);
2935 &movups ($ivec,&QWP(0x30,$inp));
2936 &xorps ($inout1,$in0);
2937 &movups (&QWP(0,$out),$inout0);
2938 &xorps ($inout2,$rndkey1);
2939 &movups (&QWP(0x10,$out),$inout1);
2940 &pxor ($inout1,$inout1);
2941 &xorps ($inout3,$rndkey0);
2942 &movups (&QWP(0x20,$out),$inout2);
2943 &pxor ($inout2,$inout2);
2944 &lea ($out,&DWP(0x30,$out));
2945 &movaps ($inout0,$inout3);
2946 &pxor ($inout3,$inout3);
2947 &sub ($len,0x40);
2948 &jmp (&label("cbc_dec_tail_collected"));
2949
2950&set_label("cbc_dec_clear_tail_collected",16);
2951 &pxor ($inout1,$inout1);
2952 &pxor ($inout2,$inout2);
2953 &pxor ($inout3,$inout3);
2954 &pxor ($inout4,$inout4);
2955&set_label("cbc_dec_tail_collected");
2956 &and ($len,15);
2957 &jnz (&label("cbc_dec_tail_partial"));
2958 &movups (&QWP(0,$out),$inout0);
2959 &pxor ($rndkey0,$rndkey0);
2960 &jmp (&label("cbc_ret"));
2961
2962&set_label("cbc_dec_tail_partial",16);
2963 &movaps (&QWP(0,"esp"),$inout0);
2964 &pxor ($rndkey0,$rndkey0);
2965 &mov ("ecx",16);
2966 &mov ($inp,"esp");
2967 &sub ("ecx",$len);
2968 &data_word(0xA4F3F689); # rep movsb
2969 &movdqa (&QWP(0,"esp"),$inout0);
2970
2971&set_label("cbc_ret");
2972 &mov ("esp",&DWP(16,"esp")); # pull original %esp
2973 &mov ($key_,&wparam(4));
2974 &pxor ($inout0,$inout0);
2975 &pxor ($rndkey1,$rndkey1);
2976 &movups (&QWP(0,$key_),$ivec); # output IV
2977 &pxor ($ivec,$ivec);
2978&set_label("cbc_abort");
2979&function_end("${PREFIX}_cbc_encrypt");
2980
2981
2982######################################################################
2983# Mechanical port from aesni-x86_64.pl.
2984#
2985# _aesni_set_encrypt_key is private interface,
2986# input:
2987# "eax" const unsigned char *userKey
2988# $rounds int bits
2989# $key AES_KEY *key
2990# output:
2991# "eax" return code
2992# $rounds rounds-1 (9, 11 or 13)
2993
# _aesni_set_encrypt_key: emits the private AES key-expansion routine.
# In:  "eax" = userKey, $rounds = key length in bits, $key = AES_KEY *.
# Out: "eax" = 0 on success, -1 if either pointer is NULL, -2 if the bit
#      count is not 128/192/256.  On success $rounds holds 9/11/13 and the
#      same value is stored at byte offset 240 from the start of *key.
# ebp and ebx are saved/restored; xmm0-xmm5 are zeroed on the success path.
# Each key size has two code paths: one built on aeskeygenassist and an
# "_alt" one built on pshufb/aesenclast, selected by the capability bits.
2994&function_begin_B("_aesni_set_encrypt_key");
2995 &push ("ebp");
2996 &push ("ebx");
2997 &test ("eax","eax"); # userKey == NULL?
2998 &jz (&label("bad_pointer"));
2999 &test ($key,$key); # key == NULL?
3000 &jz (&label("bad_pointer"));
3001
# The call pushes the address of "pic"; blindpop retrieves it into ebx
# (presumably a plain pop - confirm in the perlasm framework) and the lea
# rebases ebx onto key_const for position-independent constant access.
3002 &call (&label("pic"));
3003&set_label("pic");
3004 &blindpop("ebx");
3005 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
3006
3007 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const")); # presumably ebp = address of OPENSSL_ia32cap_P (helper defined elsewhere)
3008 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
3009 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
3010 &mov ("ebp",&DWP(4,"ebp")); # dword at offset 4 of the capability data
3011 &lea ($key,&DWP(16,$key)); # bias $key so round 0 is stored at -16($key)
3012 &and ("ebp",1<<28|1<<11); # AVX and XOP bits
# Dispatch on the requested key length in bits.
3013 &cmp ($rounds,256);
3014 &je (&label("14rounds"));
3015 &cmp ($rounds,192);
3016 &je (&label("12rounds"));
3017 &cmp ($rounds,128);
3018 &jne (&label("bad_keybits"));
3019
# ---- 128-bit key, aeskeygenassist path ----
3020&set_label("10rounds",16);
3021 &cmp ("ebp",1<<28); # only bit 28 set (bit 11 clear) -> alternative path
3022 &je (&label("10rounds_alt"));
3023
3024 &mov ($rounds,9);
3025 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
3026 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
3027 &call (&label("key_128_cold"));
3028 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
3029 &call (&label("key_128"));
3030 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
3031 &call (&label("key_128"));
3032 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
3033 &call (&label("key_128"));
3034 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
3035 &call (&label("key_128"));
3036 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
3037 &call (&label("key_128"));
3038 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
3039 &call (&label("key_128"));
3040 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
3041 &call (&label("key_128"));
3042 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
3043 &call (&label("key_128"));
3044 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
3045 &call (&label("key_128"));
3046 &$movekey (&QWP(0,$key),"xmm0"); # last round key
3047 &mov (&DWP(80,$key),$rounds); # $key has advanced 160 bytes: this is offset 240
3048
3049 &jmp (&label("good_key"));
3050
# Local subroutine.  key_128 first stores the current round key and advances
# $key; key_128_cold skips the store (round 0 was already written by the
# caller).  Both then fold xmm0 with the broadcast top dword of the
# aeskeygenassist result in xmm1 to form the next round key in xmm0.
3051&set_label("key_128",16);
3052 &$movekey (&QWP(0,$key),"xmm0");
3053 &lea ($key,&DWP(16,$key));
3054&set_label("key_128_cold");
3055 &shufps ("xmm4","xmm0",0b00010000);
3056 &xorps ("xmm0","xmm4");
3057 &shufps ("xmm4","xmm0",0b10001100);
3058 &xorps ("xmm0","xmm4");
3059 &shufps ("xmm1","xmm1",0b11111111); # critical path
3060 &xorps ("xmm0","xmm1");
3061 &ret();
3062
# ---- 128-bit key, pshufb/aesenclast path ----
3063&set_label("10rounds_alt",16);
3064 &movdqa ("xmm5",&QWP(0x00,"ebx")); # shuffle mask 0x0c0f0e0d x4 from key_const
3065 &mov ($rounds,8); # the loop below produces 8 round keys
3066 &movdqa ("xmm4",&QWP(0x20,"ebx")); # round constant 1 in every dword
3067 &movdqa ("xmm2","xmm0");
3068 &movdqu (&QWP(-16,$key),"xmm0"); # round 0
3069
3070&set_label("loop_key128");
3071 &pshufb ("xmm0","xmm5"); # every dword = source bytes 13,14,15,12
3072 &aesenclast ("xmm0","xmm4"); # xmm4 acts as the round key, i.e. the round constant
3073 &pslld ("xmm4",1); # double the round constant for the next pass
3074 &lea ($key,&DWP(16,$key));
3075
# xmm2 = x ^ x<<32 ^ x<<64 ^ x<<96 (running xor of the previous key's dwords)
3076 &movdqa ("xmm3","xmm2");
3077 &pslldq ("xmm2",4);
3078 &pxor ("xmm3","xmm2");
3079 &pslldq ("xmm2",4);
3080 &pxor ("xmm3","xmm2");
3081 &pslldq ("xmm2",4);
3082 &pxor ("xmm2","xmm3");
3083
3084 &pxor ("xmm0","xmm2");
3085 &movdqu (&QWP(-16,$key),"xmm0"); # store the new round key
3086 &movdqa ("xmm2","xmm0");
3087
3088 &dec ($rounds);
3089 &jnz (&label("loop_key128"));
3090
3091 &movdqa ("xmm4",&QWP(0x30,"ebx")); # round constant 0x1b in every dword
3092
# Same step as the loop body, unrolled for the next round key.
3093 &pshufb ("xmm0","xmm5");
3094 &aesenclast ("xmm0","xmm4");
3095 &pslld ("xmm4",1); # 0x1b -> 0x36
3096
3097 &movdqa ("xmm3","xmm2");
3098 &pslldq ("xmm2",4);
3099 &pxor ("xmm3","xmm2");
3100 &pslldq ("xmm2",4);
3101 &pxor ("xmm3","xmm2");
3102 &pslldq ("xmm2",4);
3103 &pxor ("xmm2","xmm3");
3104
3105 &pxor ("xmm0","xmm2");
3106 &movdqu (&QWP(0,$key),"xmm0");
3107
# And once more for the final round key.
3108 &movdqa ("xmm2","xmm0");
3109 &pshufb ("xmm0","xmm5");
3110 &aesenclast ("xmm0","xmm4");
3111
3112 &movdqa ("xmm3","xmm2");
3113 &pslldq ("xmm2",4);
3114 &pxor ("xmm3","xmm2");
3115 &pslldq ("xmm2",4);
3116 &pxor ("xmm3","xmm2");
3117 &pslldq ("xmm2",4);
3118 &pxor ("xmm2","xmm3");
3119
3120 &pxor ("xmm0","xmm2");
3121 &movdqu (&QWP(16,$key),"xmm0");
3122
3123 &mov ($rounds,9);
3124 &mov (&DWP(96,$key),$rounds); # $key has advanced 144 bytes: this is offset 240
3125
3126 &jmp (&label("good_key"));
3127
# ---- 192-bit key, aeskeygenassist path ----
3128&set_label("12rounds",16);
3129 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
3130 &cmp ("ebp",1<<28); # only bit 28 set (bit 11 clear) -> alternative path
3131 &je (&label("12rounds_alt"));
3132
3133 &mov ($rounds,11);
3134 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
3135 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
3136 &call (&label("key_192a_cold"));
3137 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
3138 &call (&label("key_192b"));
3139 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
3140 &call (&label("key_192a"));
3141 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
3142 &call (&label("key_192b"));
3143 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
3144 &call (&label("key_192a"));
3145 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
3146 &call (&label("key_192b"));
3147 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
3148 &call (&label("key_192a"));
3149 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
3150 &call (&label("key_192b"));
3151 &$movekey (&QWP(0,$key),"xmm0"); # last round key
3152 &mov (&DWP(48,$key),$rounds); # $key has advanced 192 bytes: this is offset 240
3153
3154 &jmp (&label("good_key"));
3155
# Local subroutines for the 192-bit schedule.  key_192a stores xmm0 and
# advances $key by 16; key_192a_cold skips that store.  Both save xmm2 in
# xmm5 and fall into key_192b_warm, which updates xmm0 and xmm2 from the
# aeskeygenassist result in xmm1.
3156&set_label("key_192a",16);
3157 &$movekey (&QWP(0,$key),"xmm0");
3158 &lea ($key,&DWP(16,$key));
3159&set_label("key_192a_cold",16);
3160 &movaps ("xmm5","xmm2");
3161&set_label("key_192b_warm");
3162 &shufps ("xmm4","xmm0",0b00010000);
3163 &movdqa ("xmm3","xmm2");
3164 &xorps ("xmm0","xmm4");
3165 &shufps ("xmm4","xmm0",0b10001100);
3166 &pslldq ("xmm3",4);
3167 &xorps ("xmm0","xmm4");
3168 &pshufd ("xmm1","xmm1",0b01010101); # critical path
3169 &pxor ("xmm2","xmm3");
3170 &pxor ("xmm0","xmm1");
3171 &pshufd ("xmm3","xmm0",0b11111111);
3172 &pxor ("xmm2","xmm3");
3173 &ret();
3174
# key_192b writes two 16-byte round keys: low halves of xmm5 and xmm0, then
# the high half of xmm0 joined with the low half of xmm2; $key advances 32.
3175&set_label("key_192b",16);
3176 &movaps ("xmm3","xmm0");
3177 &shufps ("xmm5","xmm0",0b01000100);
3178 &$movekey (&QWP(0,$key),"xmm5");
3179 &shufps ("xmm3","xmm2",0b01001110);
3180 &$movekey (&QWP(16,$key),"xmm3");
3181 &lea ($key,&DWP(32,$key));
3182 &jmp (&label("key_192b_warm"));
3183
# ---- 192-bit key, pshufb/aesenclast path ----
3184&set_label("12rounds_alt",16);
3185 &movdqa ("xmm5",&QWP(0x10,"ebx")); # shuffle mask 0x04070605 x4 from key_const
3186 &movdqa ("xmm4",&QWP(0x20,"ebx")); # round constant 1 in every dword
3187 &mov ($rounds,8); # 8 passes, each writing 24 bytes of schedule
3188 &movdqu (&QWP(-16,$key),"xmm0"); # first 16 bytes of the schedule
3189
3190&set_label("loop_key192");
3191 &movq (&QWP(0,$key),"xmm2"); # store the pending 8 bytes
3192 &movdqa ("xmm1","xmm2");
3193 &pshufb ("xmm2","xmm5"); # every dword = source bytes 5,6,7,4
3194 &aesenclast ("xmm2","xmm4"); # xmm4 acts as the round key, i.e. the round constant
3195 &pslld ("xmm4",1); # double the round constant for the next pass
3196 &lea ($key,&DWP(24,$key));
3197
# xmm0 = running xor of its own dwords
3198 &movdqa ("xmm3","xmm0");
3199 &pslldq ("xmm0",4);
3200 &pxor ("xmm3","xmm0");
3201 &pslldq ("xmm0",4);
3202 &pxor ("xmm3","xmm0");
3203 &pslldq ("xmm0",4);
3204 &pxor ("xmm0","xmm3");
3205
3206 &pshufd ("xmm3","xmm0",0xff); # broadcast top dword of xmm0
3207 &pxor ("xmm3","xmm1");
3208 &pslldq ("xmm1",4);
3209 &pxor ("xmm3","xmm1");
3210
3211 &pxor ("xmm0","xmm2");
3212 &pxor ("xmm2","xmm3");
3213 &movdqu (&QWP(-16,$key),"xmm0"); # store the next 16 bytes
3214
3215 &dec ($rounds);
3216 &jnz (&label("loop_key192"));
3217
3218 &mov ($rounds,11);
3219 &mov (&DWP(32,$key),$rounds); # $key has advanced 192 bytes: this is offset 240
3220
3221 &jmp (&label("good_key"));
3222
# ---- 256-bit key, aeskeygenassist path ----
3223&set_label("14rounds",16);
3224 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
3225 &lea ($key,&DWP(16,$key)); # $key is now 32 bytes past the start of *key
3226 &cmp ("ebp",1<<28); # only bit 28 set (bit 11 clear) -> alternative path
3227 &je (&label("14rounds_alt"));
3228
3229 &mov ($rounds,13);
3230 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
3231 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
3232 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
3233 &call (&label("key_256a_cold"));
3234 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
3235 &call (&label("key_256b"));
3236 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
3237 &call (&label("key_256a"));
3238 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
3239 &call (&label("key_256b"));
3240 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
3241 &call (&label("key_256a"));
3242 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
3243 &call (&label("key_256b"));
3244 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
3245 &call (&label("key_256a"));
3246 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
3247 &call (&label("key_256b"));
3248 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
3249 &call (&label("key_256a"));
3250 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
3251 &call (&label("key_256b"));
3252 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
3253 &call (&label("key_256a"));
3254 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
3255 &call (&label("key_256b"));
3256 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
3257 &call (&label("key_256a"));
3258 &$movekey (&QWP(0,$key),"xmm0"); # last round key
3259 &mov (&DWP(16,$key),$rounds); # $key has advanced 224 bytes: this is offset 240
3260 &xor ("eax","eax"); # good_key below clears eax as well
3261
3262 &jmp (&label("good_key"));
3263
# Local subroutines for the 256-bit schedule.  key_256a stores xmm2 and
# advances $key (key_256a_cold skips the store), then derives the next even
# round key in xmm0 using dword 3 of the aeskeygenassist result.
3264&set_label("key_256a",16);
3265 &$movekey (&QWP(0,$key),"xmm2");
3266 &lea ($key,&DWP(16,$key));
3267&set_label("key_256a_cold");
3268 &shufps ("xmm4","xmm0",0b00010000);
3269 &xorps ("xmm0","xmm4");
3270 &shufps ("xmm4","xmm0",0b10001100);
3271 &xorps ("xmm0","xmm4");
3272 &shufps ("xmm1","xmm1",0b11111111); # critical path
3273 &xorps ("xmm0","xmm1");
3274 &ret();
3275
# key_256b stores xmm0, advances $key and derives the next odd round key in
# xmm2 using dword 2 of the aeskeygenassist result.
3276&set_label("key_256b",16);
3277 &$movekey (&QWP(0,$key),"xmm0");
3278 &lea ($key,&DWP(16,$key));
3279
3280 &shufps ("xmm4","xmm2",0b00010000);
3281 &xorps ("xmm2","xmm4");
3282 &shufps ("xmm4","xmm2",0b10001100);
3283 &xorps ("xmm2","xmm4");
3284 &shufps ("xmm1","xmm1",0b10101010); # critical path
3285 &xorps ("xmm2","xmm1");
3286 &ret();
3287
# ---- 256-bit key, pshufb/aesenclast path ----
3288&set_label("14rounds_alt",16);
3289 &movdqa ("xmm5",&QWP(0x00,"ebx")); # shuffle mask 0x0c0f0e0d x4 from key_const
3290 &movdqa ("xmm4",&QWP(0x20,"ebx")); # round constant 1 in every dword
3291 &mov ($rounds,7); # 7 passes; the last one exits halfway through
3292 &movdqu (&QWP(-32,$key),"xmm0"); # round 0
3293 &movdqa ("xmm1","xmm2");
3294 &movdqu (&QWP(-16,$key),"xmm2"); # round 1
3295
3296&set_label("loop_key256");
# First half: next even round key, with byte shuffle and round constant.
3297 &pshufb ("xmm2","xmm5");
3298 &aesenclast ("xmm2","xmm4");
3299
3300 &movdqa ("xmm3","xmm0");
3301 &pslldq ("xmm0",4);
3302 &pxor ("xmm3","xmm0");
3303 &pslldq ("xmm0",4);
3304 &pxor ("xmm3","xmm0");
3305 &pslldq ("xmm0",4);
3306 &pxor ("xmm0","xmm3");
3307 &pslld ("xmm4",1); # double the round constant for the next pass
3308
3309 &pxor ("xmm0","xmm2");
3310 &movdqu (&QWP(0,$key),"xmm0");
3311
3312 &dec ($rounds);
3313 &jz (&label("done_key256")); # the final pass needs only the even key
3314
# Second half: next odd round key; aesenclast is fed an all-zero round key,
# so no round constant is mixed in here.
3315 &pshufd ("xmm2","xmm0",0xff); # broadcast top dword of the key just stored
3316 &pxor ("xmm3","xmm3");
3317 &aesenclast ("xmm2","xmm3");
3318
3319 &movdqa ("xmm3","xmm1");
3320 &pslldq ("xmm1",4);
3321 &pxor ("xmm3","xmm1");
3322 &pslldq ("xmm1",4);
3323 &pxor ("xmm3","xmm1");
3324 &pslldq ("xmm1",4);
3325 &pxor ("xmm1","xmm3");
3326
3327 &pxor ("xmm2","xmm1");
3328 &movdqu (&QWP(16,$key),"xmm2");
3329 &lea ($key,&DWP(32,$key));
3330 &movdqa ("xmm1","xmm2");
3331 &jmp (&label("loop_key256"));
3332
3333&set_label("done_key256");
3334 &mov ($rounds,13);
3335 &mov (&DWP(16,$key),$rounds); # $key has advanced 224 bytes: this is offset 240
3336
# Common success exit: wipe the xmm registers that held key material,
# return 0 and restore the saved registers.
3337&set_label("good_key");
3338 &pxor ("xmm0","xmm0");
3339 &pxor ("xmm1","xmm1");
3340 &pxor ("xmm2","xmm2");
3341 &pxor ("xmm3","xmm3");
3342 &pxor ("xmm4","xmm4");
3343 &pxor ("xmm5","xmm5");
3344 &xor ("eax","eax");
3345 &pop ("ebx");
3346 &pop ("ebp");
3347 &ret ();
3348
# Error exit: NULL userKey or key pointer.
3349&set_label("bad_pointer",4);
3350 &mov ("eax",-1);
3351 &pop ("ebx");
3352 &pop ("ebp");
3353 &ret ();
# Error exit: unsupported bit count.  xmm0 already holds the first 16 bytes
# of the user key at this point, so it is wiped before returning.
3354&set_label("bad_keybits",4);
3355 &pxor ("xmm0","xmm0");
3356 &mov ("eax",-2);
3357 &pop ("ebx");
3358 &pop ("ebp");
3359 &ret ();
3360&function_end_B("_aesni_set_encrypt_key");
3361
3362# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
3363# AES_KEY *key)
# Public entry point: loads the three stack arguments into the registers
# the private routine expects and returns its result in "eax" unchanged.
3364&function_begin_B("${PREFIX}_set_encrypt_key");
3365 &mov ("eax",&wparam(0)); # userKey
3366 &mov ($rounds,&wparam(1)); # bits
3367 &mov ($key,&wparam(2)); # key
3368 &call ("_aesni_set_encrypt_key");
3369 &ret ();
3370&function_end_B("${PREFIX}_set_encrypt_key");
3371
3372# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
3373# AES_KEY *key)
# Public entry point: builds the encryption schedule via
# _aesni_set_encrypt_key, then converts it in place for decryption by
# reversing the order of the round keys and applying aesimc to every round
# key except the first and the last.  Returns the private routine's error
# code unchanged on failure, 0 on success.
3374&function_begin_B("${PREFIX}_set_decrypt_key");
3375 &mov ("eax",&wparam(0)); # userKey
3376 &mov ($rounds,&wparam(1)); # bits
3377 &mov ($key,&wparam(2)); # key
3378 &call ("_aesni_set_encrypt_key");
3379 &mov ($key,&wparam(2)); # reload: the callee advanced $key
3380 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
3381 &test ("eax","eax");
3382 &jnz (&label("dec_key_ret")); # propagate the error code
3383 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
3384
3385 &$movekey ("xmm0",&QWP(0,$key)); # just swap
3386 &$movekey ("xmm1",&QWP(0,"eax"));
3387 &$movekey (&QWP(0,"eax"),"xmm0");
3388 &$movekey (&QWP(0,$key),"xmm1");
3389 &lea ($key,&DWP(16,$key));
3390 &lea ("eax",&DWP(-16,"eax"));
3391
# $key walks up from the front and "eax" walks down from the back; each pass
# transforms one pair and writes each key to the other's slot.
3392&set_label("dec_key_inverse");
3393 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
3394 &$movekey ("xmm1",&QWP(0,"eax"));
3395 &aesimc ("xmm0","xmm0");
3396 &aesimc ("xmm1","xmm1");
3397 &lea ($key,&DWP(16,$key));
3398 &lea ("eax",&DWP(-16,"eax"));
3399 &$movekey (&QWP(16,"eax"),"xmm0"); # into the slot "eax" pointed at before the step
3400 &$movekey (&QWP(-16,$key),"xmm1"); # into the slot $key pointed at before the step
3401 &cmp ("eax",$key);
3402 &ja (&label("dec_key_inverse")); # until the two pointers meet
3403
3404 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
3405 &aesimc ("xmm0","xmm0");
3406 &$movekey (&QWP(0,$key),"xmm0");
3407
3408 &pxor ("xmm0","xmm0"); # wipe round-key material from the registers
3409 &pxor ("xmm1","xmm1");
3410 &xor ("eax","eax"); # return success
3411&set_label("dec_key_ret");
3412 &ret ();
3413&function_end_B("${PREFIX}_set_decrypt_key");
3414
# Constant table addressed through "ebx" by the "_alt" key-schedule paths of
# _aesni_set_encrypt_key; each data_word line below is one 16-byte entry.
3415&set_label("key_const",64);
3416&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); # +0x00: pshufb mask, 128/256-bit paths
3417&data_word(0x04070605,0x04070605,0x04070605,0x04070605); # +0x10: pshufb mask, 192-bit path
3418&data_word(1,1,1,1); # +0x20: initial round constant
3419&data_word(0x1b,0x1b,0x1b,0x1b); # +0x30: round constant for the last two 128-bit rounds
3420&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); # identification string
3421
3422&asm_finish(); # presumably finalizes/emits the generated assembly (perlasm helper)
3423
3424close STDOUT or die "error closing STDOUT: $!"; # surface buffered write errors
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette