VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/aes/asm/aesni-mb-x86_64.pl@ 101211

Last change on this file since 101211 was 101211, checked in by vboxsync, 17 months ago

openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527

File size: 38.2 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# Multi-buffer AES-NI procedures process several independent buffers
18# in parallel by interleaving independent instructions.
19#
20# Cycles per byte for interleave factor 4:
21#
22# asymptotic measured
23# ---------------------------
24# Westmere 5.00/4=1.25 5.13/4=1.28
25# Atom 15.0/4=3.75 ?15.7/4=3.93
26# Sandy Bridge 5.06/4=1.27 5.18/4=1.29
27# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
28# Haswell 4.44/4=1.11 4.44/4=1.11
29# Bulldozer 5.75/4=1.44 5.76/4=1.44
30#
31# Cycles per byte for interleave factor 8 (not implemented for
32# pre-AVX processors, where higher interleave factor incidentally
33# doesn't result in improvement):
34#
35# asymptotic measured
36# ---------------------------
37# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
38# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
39# Haswell 5.00/8=0.63 5.00/8=0.63
40# Bulldozer 5.75/8=0.72 5.77/8=0.72
41#
42# (*) Sandy/Ivy Bridge are known to handle high interleave factors
43# suboptimally;
44
45# $output is the last argument if it looks like a file (it has an extension)
46# $flavour is the first argument if it doesn't look like a file
47$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
48$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
49
# Win64 mode is selected by a nasm/masm/mingw64 flavour or by an output file
# name ending in ".asm"; it gates the XMM save/restore sequences emitted below.
50$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
51
# Locate x86_64-xlate.pl: first next to this script ($dir is the directory
# part of $0), then under ../../perlasm/; give up if neither file exists.
52$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
53( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
54( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
55die "can't locate x86_64-xlate.pl";
56
# Add both candidate directories to @INC so the support library can be found.
57push(@INC,"${dir}","${dir}../../perlasm");
58require "x86_64-support.pl";
59
# Pointer width for the target flavour; pointer_size() is presumably provided
# by x86_64-support.pl (its definition is not visible in this file).
60$ptr_size=&pointer_size($flavour);
61
# $avx ends up as 0, 1 or 2 depending on the detected assembler version; any
# non-zero value enables the AVX code paths guarded by "if ($avx)" below.
62$avx=0;
63
# GNU assembler, probed through $ENV{CC}: +1 for version >= 2.19 and another
# +1 for version >= 2.22.
64if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
65 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
66 $avx = ($1>=2.19) + ($1>=2.22);
67}
68
# NASM, tried only for Win64 builds when nothing was detected yet and either
# the flavour or $ENV{ASM} mentions nasm: +1 for >= 2.09, +1 for >= 2.10.
69if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
70 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
71 $avx = ($1>=2.09) + ($1>=2.10);
72}
73
# MASM (ml64), same preconditions keyed on masm/ml64: +1 for major version
# >= 10, +1 for major version >= 11.
74if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
75 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
76 $avx = ($1>=10) + ($1>=11);
77}
78
# clang/LLVM-based compilers, tried last: +1 for version >= 3.0, +1 more for
# anything strictly newer than 3.0. $2 is the version; $1 is the vendor prefix.
79if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
80 $avx = ($2>=3.0) + ($2>3.0);
81}
82
# Start x86_64-xlate.pl under the current interpreter ($^X), passing $flavour
# and $output, and alias STDOUT to that pipe so everything printed afterwards
# is fed to the translator.
83open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
84 or die "can't call $xlate: $!";
85*STDOUT=*OUT;
86
87# void aesni_multi_cbc_encrypt (
88# struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
89# const AES_KEY *key,
90# int num); /* 1 or 2 */
91#
92$inp="%rdi"; # 1st arg
93$key="%rsi"; # 2nd arg
94$num="%edx"; # 3rd arg
95
# Byte size of one descriptor in the inp[] array: two pointers, an 8-byte slot
# holding the block count, and the 16-byte IV (the offsets used by the lane
# setup loops below follow this layout).
96$inp_elm_size=2*$ptr_size+8+16;
97
# Per-lane input pointers live in %r8-%r11, output pointers in %r12-%r15.
98@inptr=map("%r$_",(8..11));
99@outptr=map("%r$_",(12..15));
100
# XMM allocation for the 4-lane code: two alternating round-key registers,
# four cipher states (@out), four input blocks (@inp), then the per-lane
# counters, a scratch mask and the round-0 key ($zero).
101($rndkey0,$rndkey1)=("%xmm0","%xmm1");
102@out=map("%xmm$_",(2..5));
103@inp=map("%xmm$_",(6..9));
104($counters,$mask,$zero)=map("%xmm$_",(10..12));
105
# General-purpose scratch: round count, the constant 1 (also borrowed for the
# block count during setup), the sink pointer and the running byte offset.
106($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
107
# Section header and entry label of aesni_multi_cbc_encrypt (declared to
# the translator as a 3-argument function).
108$code.=<<___;
109.text
110
111.extern OPENSSL_ia32cap_P
112
113.globl aesni_multi_cbc_encrypt
114.type aesni_multi_cbc_encrypt,\@function,3
115.align 32
116aesni_multi_cbc_encrypt:
117.cfi_startproc
118___
# Only emitted when the assembler supports AVX: if $num is at least 2 and the
# AVX bit (bit 28 of the dword at OPENSSL_ia32cap_P+4) is set, jump to the
# 8-lane AVX implementation; otherwise continue with the 4-lane code.
119$code.=<<___ if ($avx);
120 cmp \$2,$num
121 jb .Lenc_non_avx
122 mov OPENSSL_ia32cap_P+4(%rip),%ecx
123 test \$`1<<28`,%ecx # AVX bit
124 jnz _avx_cbc_enc_shortcut
125 jmp .Lenc_non_avx
126.align 16
127.Lenc_non_avx:
128___
# Keep the incoming %rsp in %rax (it is the CFA register from here on) and
# push the six general-purpose registers this routine uses as lane pointers
# and scratch.
129$code.=<<___;
130 mov %rsp,%rax
131.cfi_def_cfa_register %rax
132 push %rbx
133.cfi_push %rbx
134 push %rbp
135.cfi_push %rbp
136 push %r12
137.cfi_push %r12
138 push %r13
139.cfi_push %r13
140 push %r14
141.cfi_push %r14
142 push %r15
143.cfi_push %r15
144___
# Win64 only: reserve 0xa8 bytes below the pushed registers and spill
# %xmm6-%xmm15 there (%xmm13-%xmm15 are addressed relative to %rax).
145$code.=<<___ if ($win64);
146 lea -0xa8(%rsp),%rsp
147 movaps %xmm6,(%rsp)
148 movaps %xmm7,0x10(%rsp)
149 movaps %xmm8,0x20(%rsp)
150 movaps %xmm9,0x30(%rsp)
151 movaps %xmm10,0x40(%rsp)
152 movaps %xmm11,0x50(%rsp)
153 movaps %xmm12,0x60(%rsp)
154 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
155 movaps %xmm14,-0x58(%rax)
156 movaps %xmm15,-0x48(%rax)
157___
# Carve out a 64-byte-aligned scratch frame, remember the original %rsp at
# 16(%rsp), load the round-0 key into $zero, bias $key by 0x78 and $inp by two
# descriptors, then open the outer loop: $num is saved at 24(%rsp) and cleared
# so the lane setup that follows can accumulate the maximum block count in it.
158$code.=<<___;
159 # stack layout
160 #
161 # +0 output sink
162 # +16 input sink [original %rsp and $num]
163 # +32 counters
164
165 sub \$48,%rsp
166 and \$-64,%rsp
167 mov %rax,16(%rsp) # original %rsp
168.cfi_cfa_expression %rsp+16,deref,+8
169
170.Lenc4x_body:
171 movdqu ($key),$zero # 0-round key
172 lea 0x78($key),$key # size optimization
173 lea $inp_elm_size*2($inp),$inp
174
175.Lenc4x_loop_grande:
176 mov $num,24(%rsp) # original $num
177 xor $num,$num
178___
# Emit the setup for each of the four lanes: read the lane's block count into
# $one, its input and output pointers and its IV (into @out[$i]) from
# descriptor $i, keep the largest block count seen so far in $num, store the
# count into the counters area at 32+4*$i(%rsp), and redirect the input
# pointer to %rsp when the count is not positive.
# pointer_register() is defined outside this file; presumably it returns the
# register name sized for the flavour's pointer width -- TODO confirm.
179for($i=0;$i<4;$i++) {
180 $inptr_reg=&pointer_register($flavour,@inptr[$i]);
181 $outptr_reg=&pointer_register($flavour,@outptr[$i]);
182 $code.=<<___;
183 # borrow $one for number of blocks
184 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
185 mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
186 cmp $num,$one
187 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
188 cmovg $one,$num # find maximum
189 test $one,$one
190 # load IV
191 movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i]
192 mov $one,`32+4*$i`(%rsp) # initialize counters
193 cmovle %rsp,@inptr[$i] # cancel input
194___
195}
# Leave immediately when no lane has work ($num, the maximum block count, is
# zero). Otherwise load the first two round keys and the round count, xor the
# round-0 key and the first input block of every lane into the IV-initialised
# states, load the counters and enter the per-block loop. The loop head
# advances $offset by 16, points $sink at the stack sink minus $offset,
# prefetches the next input of all four lanes and runs the first AES round.
#
# Fix: the fourth input prefetch used @inptr[2] a second time, so lane 3 was
# never prefetched; it now uses @inptr[3], matching the decrypt loop and the
# output prefetches, which cover lanes 0..3.
196$code.=<<___;
197 test $num,$num
198 jz .Lenc4x_done
199
200 movups 0x10-0x78($key),$rndkey1
201 pxor $zero,@out[0]
202 movups 0x20-0x78($key),$rndkey0
203 pxor $zero,@out[1]
204 mov 0xf0-0x78($key),$rounds
205 pxor $zero,@out[2]
206 movdqu (@inptr[0]),@inp[0] # load inputs
207 pxor $zero,@out[3]
208 movdqu (@inptr[1]),@inp[1]
209 pxor @inp[0],@out[0]
210 movdqu (@inptr[2]),@inp[2]
211 pxor @inp[1],@out[1]
212 movdqu (@inptr[3]),@inp[3]
213 pxor @inp[2],@out[2]
214 pxor @inp[3],@out[3]
215 movdqa 32(%rsp),$counters # load counters
216 xor $offset,$offset
217 jmp .Loop_enc4x
218
219.align 32
220.Loop_enc4x:
221 add \$16,$offset
222 lea 16(%rsp),$sink # sink pointer
223 mov \$1,$one # constant of 1
224 sub $offset,$sink
225
226 aesenc $rndkey1,@out[0]
227 prefetcht0 31(@inptr[0],$offset) # prefetch input
228 prefetcht0 31(@inptr[1],$offset)
229 aesenc $rndkey1,@out[1]
230 prefetcht0 31(@inptr[2],$offset)
231 prefetcht0 31(@inptr[3],$offset)
232 aesenc $rndkey1,@out[2]
233 aesenc $rndkey1,@out[3]
234 movups 0x30-0x78($key),$rndkey1
235___
# Emit four more AES rounds, alternating between the two round-key registers
# (odd $i uses $rndkey1, even $i uses $rndkey0) and reloading the register
# just used with the key at 0x40+16*$i. Each round also handles lane $i:
# its counter at 32+4*$i(%rsp) is compared against the constant 1 in $one,
# and the conditional moves replace the lane's input and/or output pointer
# with $sink according to that comparison.
236for($i=0;$i<4;$i++) {
237my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
238$code.=<<___;
239 cmp `32+4*$i`(%rsp),$one
240 aesenc $rndkey,@out[0]
241 aesenc $rndkey,@out[1]
242 aesenc $rndkey,@out[2]
243 cmovge $sink,@inptr[$i] # cancel input
244 cmovg $sink,@outptr[$i] # sink output
245 aesenc $rndkey,@out[3]
246 movups `0x40+16*$i-0x78`($key),$rndkey
247___
248}
# Remainder of the per-block loop and the outer-loop bookkeeping:
#  - further AES rounds, interleaved with output prefetches and the counter
#    update (a pcmpgtd-against-zero mask is added to the counters and the
#    result is stored back at 32(%rsp)); $zero is temporarily cleared for the
#    compare and then reloaded with the round-0 key;
#  - "cmp \$11,$rounds" selects how many extra rounds run before the tail:
#    none when $rounds is below 11 (jb), two when it equals 11 (je), four
#    otherwise; $rounds was loaded from offset 0xf0 of the key schedule;
#  - .Lenc4x_tail finishes with aesenclast, stores each lane's result at
#    -16(outptr,$offset), and xors the next input block (pre-xored with the
#    round-0 key) into the state so it chains into the next iteration;
#  - after $num blocks, the original %rsp and the saved $num are reloaded,
#    $inp advances by four descriptors and the outer loop repeats while the
#    decremented $num is non-zero.
# The commented-out instructions (marked "FIX ME"/"won't fix") would have
# written the final states back as output IVs; they are not emitted as code.
249$code.=<<___;
250 movdqa $counters,$mask
251 aesenc $rndkey0,@out[0]
252 prefetcht0 15(@outptr[0],$offset) # prefetch output
253 prefetcht0 15(@outptr[1],$offset)
254 aesenc $rndkey0,@out[1]
255 prefetcht0 15(@outptr[2],$offset)
256 prefetcht0 15(@outptr[3],$offset)
257 aesenc $rndkey0,@out[2]
258 aesenc $rndkey0,@out[3]
259 movups 0x80-0x78($key),$rndkey0
260 pxor $zero,$zero
261
262 aesenc $rndkey1,@out[0]
263 pcmpgtd $zero,$mask
264 movdqu -0x78($key),$zero # reload 0-round key
265 aesenc $rndkey1,@out[1]
266 paddd $mask,$counters # decrement counters
267 movdqa $counters,32(%rsp) # update counters
268 aesenc $rndkey1,@out[2]
269 aesenc $rndkey1,@out[3]
270 movups 0x90-0x78($key),$rndkey1
271
272 cmp \$11,$rounds
273
274 aesenc $rndkey0,@out[0]
275 aesenc $rndkey0,@out[1]
276 aesenc $rndkey0,@out[2]
277 aesenc $rndkey0,@out[3]
278 movups 0xa0-0x78($key),$rndkey0
279
280 jb .Lenc4x_tail
281
282 aesenc $rndkey1,@out[0]
283 aesenc $rndkey1,@out[1]
284 aesenc $rndkey1,@out[2]
285 aesenc $rndkey1,@out[3]
286 movups 0xb0-0x78($key),$rndkey1
287
288 aesenc $rndkey0,@out[0]
289 aesenc $rndkey0,@out[1]
290 aesenc $rndkey0,@out[2]
291 aesenc $rndkey0,@out[3]
292 movups 0xc0-0x78($key),$rndkey0
293
294 je .Lenc4x_tail
295
296 aesenc $rndkey1,@out[0]
297 aesenc $rndkey1,@out[1]
298 aesenc $rndkey1,@out[2]
299 aesenc $rndkey1,@out[3]
300 movups 0xd0-0x78($key),$rndkey1
301
302 aesenc $rndkey0,@out[0]
303 aesenc $rndkey0,@out[1]
304 aesenc $rndkey0,@out[2]
305 aesenc $rndkey0,@out[3]
306 movups 0xe0-0x78($key),$rndkey0
307 jmp .Lenc4x_tail
308
309.align 32
310.Lenc4x_tail:
311 aesenc $rndkey1,@out[0]
312 aesenc $rndkey1,@out[1]
313 aesenc $rndkey1,@out[2]
314 aesenc $rndkey1,@out[3]
315 movdqu (@inptr[0],$offset),@inp[0]
316 movdqu 0x10-0x78($key),$rndkey1
317
318 aesenclast $rndkey0,@out[0]
319 movdqu (@inptr[1],$offset),@inp[1]
320 pxor $zero,@inp[0]
321 aesenclast $rndkey0,@out[1]
322 movdqu (@inptr[2],$offset),@inp[2]
323 pxor $zero,@inp[1]
324 aesenclast $rndkey0,@out[2]
325 movdqu (@inptr[3],$offset),@inp[3]
326 pxor $zero,@inp[2]
327 aesenclast $rndkey0,@out[3]
328 movdqu 0x20-0x78($key),$rndkey0
329 pxor $zero,@inp[3]
330
331 movups @out[0],-16(@outptr[0],$offset)
332 pxor @inp[0],@out[0]
333 movups @out[1],-16(@outptr[1],$offset)
334 pxor @inp[1],@out[1]
335 movups @out[2],-16(@outptr[2],$offset)
336 pxor @inp[2],@out[2]
337 movups @out[3],-16(@outptr[3],$offset)
338 pxor @inp[3],@out[3]
339
340 dec $num
341 jnz .Loop_enc4x
342
343 mov 16(%rsp),%rax # original %rsp
344.cfi_def_cfa %rax,8
345 mov 24(%rsp),$num
346
347 #pxor @inp[0],@out[0]
348 #pxor @inp[1],@out[1]
349 # output iv FIX ME!
350 #movdqu @out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp)
351 #pxor @inp[2],@out[2]
352 #movdqu @out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp)
353 #pxor @inp[3],@out[3]
354 #movdqu @out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp) # won't fix, let caller
355 #movdqu @out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp) # figure this out...
356
357 lea `$inp_elm_size*4`($inp),$inp
358 dec $num
359 jnz .Lenc4x_loop_grande
360
361.Lenc4x_done:
362___
# Win64 only: reload %xmm6-%xmm12 from the save area, addressed relative to
# the original %rsp held in %rax. The %xmm13-%xmm15 reloads are left
# commented out (the prologue notes those registers are not used here).
363$code.=<<___ if ($win64);
364 movaps -0xd8(%rax),%xmm6
365 movaps -0xc8(%rax),%xmm7
366 movaps -0xb8(%rax),%xmm8
367 movaps -0xa8(%rax),%xmm9
368 movaps -0x98(%rax),%xmm10
369 movaps -0x88(%rax),%xmm11
370 movaps -0x78(%rax),%xmm12
371 #movaps -0x68(%rax),%xmm13
372 #movaps -0x58(%rax),%xmm14
373 #movaps -0x48(%rax),%xmm15
374___
# Encrypt epilogue: reload the six pushed registers from below the original
# %rsp, restore %rsp itself and return. The same heredoc then opens
# aesni_multi_cbc_decrypt (again declared as a 3-argument function).
375$code.=<<___;
376 mov -48(%rax),%r15
377.cfi_restore %r15
378 mov -40(%rax),%r14
379.cfi_restore %r14
380 mov -32(%rax),%r13
381.cfi_restore %r13
382 mov -24(%rax),%r12
383.cfi_restore %r12
384 mov -16(%rax),%rbp
385.cfi_restore %rbp
386 mov -8(%rax),%rbx
387.cfi_restore %rbx
388 lea (%rax),%rsp
389.cfi_def_cfa_register %rsp
390.Lenc4x_epilogue:
391 ret
392.cfi_endproc
393.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
394
395.globl aesni_multi_cbc_decrypt
396.type aesni_multi_cbc_decrypt,\@function,3
397.align 32
398aesni_multi_cbc_decrypt:
399.cfi_startproc
400___
# Only emitted when the assembler supports AVX: same dispatch as on the
# encrypt side -- with $num >= 2 and the AVX bit set, jump to the 8-lane AVX
# decrypt implementation, otherwise continue with the 4-lane code.
401$code.=<<___ if ($avx);
402 cmp \$2,$num
403 jb .Ldec_non_avx
404 mov OPENSSL_ia32cap_P+4(%rip),%ecx
405 test \$`1<<28`,%ecx # AVX bit
406 jnz _avx_cbc_dec_shortcut
407 jmp .Ldec_non_avx
408.align 16
409.Ldec_non_avx:
410___
# Decrypt prologue: keep the incoming %rsp in %rax (the CFA register from here
# on) and push the six general-purpose registers used as lane pointers and
# scratch.
411$code.=<<___;
412 mov %rsp,%rax
413.cfi_def_cfa_register %rax
414 push %rbx
415.cfi_push %rbx
416 push %rbp
417.cfi_push %rbp
418 push %r12
419.cfi_push %r12
420 push %r13
421.cfi_push %r13
422 push %r14
423.cfi_push %r14
424 push %r15
425.cfi_push %r15
426___
# Win64 only: reserve 0xa8 bytes below the pushed registers and spill
# %xmm6-%xmm15 there (%xmm13-%xmm15 are addressed relative to %rax).
427$code.=<<___ if ($win64);
428 lea -0xa8(%rsp),%rsp
429 movaps %xmm6,(%rsp)
430 movaps %xmm7,0x10(%rsp)
431 movaps %xmm8,0x20(%rsp)
432 movaps %xmm9,0x30(%rsp)
433 movaps %xmm10,0x40(%rsp)
434 movaps %xmm11,0x50(%rsp)
435 movaps %xmm12,0x60(%rsp)
436 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
437 movaps %xmm14,-0x58(%rax)
438 movaps %xmm15,-0x48(%rax)
439___
# Carve out a 64-byte-aligned scratch frame, remember the original %rsp at
# 16(%rsp), load the round-0 key into $zero, bias $key by 0x78 and $inp by two
# descriptors, then open the outer loop: $num is saved at 24(%rsp) and cleared
# so the lane setup that follows can accumulate the maximum block count in it.
440$code.=<<___;
441 # stack layout
442 #
443 # +0 output sink
444 # +16 input sink [original %rsp and $num]
445 # +32 counters
446
447 sub \$48,%rsp
448 and \$-64,%rsp
449 mov %rax,16(%rsp) # original %rsp
450.cfi_cfa_expression %rsp+16,deref,+8
451
452.Ldec4x_body:
453 movdqu ($key),$zero # 0-round key
454 lea 0x78($key),$key # size optimization
455 lea $inp_elm_size*2($inp),$inp
456
457.Ldec4x_loop_grande:
458 mov $num,24(%rsp) # original $num
459 xor $num,$num
460___
# Emit the setup for each of the four decrypt lanes. It mirrors the encrypt
# setup except that the IV is loaded into @inp[$i] rather than @out[$i]: read
# the block count into $one, the input/output pointers and the IV from
# descriptor $i, keep the largest block count in $num, store the count at
# 32+4*$i(%rsp), and redirect the input pointer to %rsp when the count is not
# positive.
# pointer_register() is defined outside this file; presumably it returns the
# register name sized for the flavour's pointer width -- TODO confirm.
461for($i=0;$i<4;$i++) {
462 $inptr_reg=&pointer_register($flavour,@inptr[$i]);
463 $outptr_reg=&pointer_register($flavour,@outptr[$i]);
464 $code.=<<___;
465 # borrow $one for number of blocks
466 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
467 mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
468 cmp $num,$one
469 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
470 cmovg $one,$num # find maximum
471 test $one,$one
472 # load IV
473 movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i]
474 mov $one,`32+4*$i`(%rsp) # initialize counters
475 cmovle %rsp,@inptr[$i] # cancel input
476___
477}
# Leave immediately when no lane has work ($num is zero). Otherwise load the
# first two round keys and the round count, load the first input block of
# every lane into @out and xor it with the round-0 key, load the counters and
# enter the per-block loop. The loop head advances $offset by 16, points
# $sink at the stack sink minus $offset, prefetches the next input of all
# four lanes and runs the first aesdec round.
478$code.=<<___;
479 test $num,$num
480 jz .Ldec4x_done
481
482 movups 0x10-0x78($key),$rndkey1
483 movups 0x20-0x78($key),$rndkey0
484 mov 0xf0-0x78($key),$rounds
485 movdqu (@inptr[0]),@out[0] # load inputs
486 movdqu (@inptr[1]),@out[1]
487 pxor $zero,@out[0]
488 movdqu (@inptr[2]),@out[2]
489 pxor $zero,@out[1]
490 movdqu (@inptr[3]),@out[3]
491 pxor $zero,@out[2]
492 pxor $zero,@out[3]
493 movdqa 32(%rsp),$counters # load counters
494 xor $offset,$offset
495 jmp .Loop_dec4x
496
497.align 32
498.Loop_dec4x:
499 add \$16,$offset
500 lea 16(%rsp),$sink # sink pointer
501 mov \$1,$one # constant of 1
502 sub $offset,$sink
503
504 aesdec $rndkey1,@out[0]
505 prefetcht0 31(@inptr[0],$offset) # prefetch input
506 prefetcht0 31(@inptr[1],$offset)
507 aesdec $rndkey1,@out[1]
508 prefetcht0 31(@inptr[2],$offset)
509 prefetcht0 31(@inptr[3],$offset)
510 aesdec $rndkey1,@out[2]
511 aesdec $rndkey1,@out[3]
512 movups 0x30-0x78($key),$rndkey1
513___
# Emit four more aesdec rounds, alternating between the two round-key
# registers (odd $i uses $rndkey1, even $i uses $rndkey0) and reloading the
# register just used with the key at 0x40+16*$i. Each round also handles lane
# $i: its counter at 32+4*$i(%rsp) is compared against the constant 1 in $one,
# and the conditional moves replace the lane's input and/or output pointer
# with $sink according to that comparison.
514for($i=0;$i<4;$i++) {
515my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
516$code.=<<___;
517 cmp `32+4*$i`(%rsp),$one
518 aesdec $rndkey,@out[0]
519 aesdec $rndkey,@out[1]
520 aesdec $rndkey,@out[2]
521 cmovge $sink,@inptr[$i] # cancel input
522 cmovg $sink,@outptr[$i] # sink output
523 aesdec $rndkey,@out[3]
524 movups `0x40+16*$i-0x78`($key),$rndkey
525___
526}
# Remainder of the per-block decrypt loop and the outer-loop bookkeeping:
#  - further aesdec rounds, interleaved with output prefetches and the counter
#    update (a pcmpgtd-against-zero mask is added to the counters and the
#    result is stored back at 32(%rsp)); $zero is temporarily cleared for the
#    compare and then reloaded with the round-0 key;
#  - "cmp \$11,$rounds" selects how many extra rounds run before the tail:
#    none when $rounds is below 11 (jb), two when it equals 11 (je), four
#    otherwise;
#  - .Ldec4x_tail xors the last round key ($rndkey0) into the chaining values
#    held in @inp and passes the result to aesdeclast as the key operand, then
#    loads the next chaining value from -16(inptr,$offset) ("load next IV"),
#    stores each lane's result at -16(outptr,$offset) and loads the next input
#    block, xored with the round-0 key, into @out;
#  - after $num blocks, the original %rsp and the saved $num are reloaded,
#    $inp advances by four descriptors and the outer loop repeats while the
#    decremented $num is non-zero.
527$code.=<<___;
528 movdqa $counters,$mask
529 aesdec $rndkey0,@out[0]
530 prefetcht0 15(@outptr[0],$offset) # prefetch output
531 prefetcht0 15(@outptr[1],$offset)
532 aesdec $rndkey0,@out[1]
533 prefetcht0 15(@outptr[2],$offset)
534 prefetcht0 15(@outptr[3],$offset)
535 aesdec $rndkey0,@out[2]
536 aesdec $rndkey0,@out[3]
537 movups 0x80-0x78($key),$rndkey0
538 pxor $zero,$zero
539
540 aesdec $rndkey1,@out[0]
541 pcmpgtd $zero,$mask
542 movdqu -0x78($key),$zero # reload 0-round key
543 aesdec $rndkey1,@out[1]
544 paddd $mask,$counters # decrement counters
545 movdqa $counters,32(%rsp) # update counters
546 aesdec $rndkey1,@out[2]
547 aesdec $rndkey1,@out[3]
548 movups 0x90-0x78($key),$rndkey1
549
550 cmp \$11,$rounds
551
552 aesdec $rndkey0,@out[0]
553 aesdec $rndkey0,@out[1]
554 aesdec $rndkey0,@out[2]
555 aesdec $rndkey0,@out[3]
556 movups 0xa0-0x78($key),$rndkey0
557
558 jb .Ldec4x_tail
559
560 aesdec $rndkey1,@out[0]
561 aesdec $rndkey1,@out[1]
562 aesdec $rndkey1,@out[2]
563 aesdec $rndkey1,@out[3]
564 movups 0xb0-0x78($key),$rndkey1
565
566 aesdec $rndkey0,@out[0]
567 aesdec $rndkey0,@out[1]
568 aesdec $rndkey0,@out[2]
569 aesdec $rndkey0,@out[3]
570 movups 0xc0-0x78($key),$rndkey0
571
572 je .Ldec4x_tail
573
574 aesdec $rndkey1,@out[0]
575 aesdec $rndkey1,@out[1]
576 aesdec $rndkey1,@out[2]
577 aesdec $rndkey1,@out[3]
578 movups 0xd0-0x78($key),$rndkey1
579
580 aesdec $rndkey0,@out[0]
581 aesdec $rndkey0,@out[1]
582 aesdec $rndkey0,@out[2]
583 aesdec $rndkey0,@out[3]
584 movups 0xe0-0x78($key),$rndkey0
585 jmp .Ldec4x_tail
586
587.align 32
588.Ldec4x_tail:
589 aesdec $rndkey1,@out[0]
590 aesdec $rndkey1,@out[1]
591 aesdec $rndkey1,@out[2]
592 pxor $rndkey0,@inp[0]
593 pxor $rndkey0,@inp[1]
594 aesdec $rndkey1,@out[3]
595 movdqu 0x10-0x78($key),$rndkey1
596 pxor $rndkey0,@inp[2]
597 pxor $rndkey0,@inp[3]
598 movdqu 0x20-0x78($key),$rndkey0
599
600 aesdeclast @inp[0],@out[0]
601 aesdeclast @inp[1],@out[1]
602 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
603 movdqu -16(@inptr[1],$offset),@inp[1]
604 aesdeclast @inp[2],@out[2]
605 aesdeclast @inp[3],@out[3]
606 movdqu -16(@inptr[2],$offset),@inp[2]
607 movdqu -16(@inptr[3],$offset),@inp[3]
608
609 movups @out[0],-16(@outptr[0],$offset)
610 movdqu (@inptr[0],$offset),@out[0]
611 movups @out[1],-16(@outptr[1],$offset)
612 movdqu (@inptr[1],$offset),@out[1]
613 pxor $zero,@out[0]
614 movups @out[2],-16(@outptr[2],$offset)
615 movdqu (@inptr[2],$offset),@out[2]
616 pxor $zero,@out[1]
617 movups @out[3],-16(@outptr[3],$offset)
618 movdqu (@inptr[3],$offset),@out[3]
619 pxor $zero,@out[2]
620 pxor $zero,@out[3]
621
622 dec $num
623 jnz .Loop_dec4x
624
625 mov 16(%rsp),%rax # original %rsp
626.cfi_def_cfa %rax,8
627 mov 24(%rsp),$num
628
629 lea `$inp_elm_size*4`($inp),$inp
630 dec $num
631 jnz .Ldec4x_loop_grande
632
633.Ldec4x_done:
634___
# Win64 only: reload %xmm6-%xmm12 from the save area, addressed relative to
# the original %rsp held in %rax. The %xmm13-%xmm15 reloads are left
# commented out (the prologue notes those registers are not used here).
635$code.=<<___ if ($win64);
636 movaps -0xd8(%rax),%xmm6
637 movaps -0xc8(%rax),%xmm7
638 movaps -0xb8(%rax),%xmm8
639 movaps -0xa8(%rax),%xmm9
640 movaps -0x98(%rax),%xmm10
641 movaps -0x88(%rax),%xmm11
642 movaps -0x78(%rax),%xmm12
643 #movaps -0x68(%rax),%xmm13
644 #movaps -0x58(%rax),%xmm14
645 #movaps -0x48(%rax),%xmm15
646___
# Decrypt epilogue: reload the six pushed registers from below the original
# %rsp, restore %rsp itself, return, and close aesni_multi_cbc_decrypt.
647$code.=<<___;
648 mov -48(%rax),%r15
649.cfi_restore %r15
650 mov -40(%rax),%r14
651.cfi_restore %r14
652 mov -32(%rax),%r13
653.cfi_restore %r13
654 mov -24(%rax),%r12
655.cfi_restore %r12
656 mov -16(%rax),%rbp
657.cfi_restore %rbp
658 mov -8(%rax),%rbx
659.cfi_restore %rbx
660 lea (%rax),%rsp
661.cfi_def_cfa_register %rsp
662.Ldec4x_epilogue:
663 ret
664.cfi_endproc
665.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
666___
667
668 if ($avx) {{{
669my @ptr=map("%r$_",(8..15));
670my $offload=$sink;
671
672my @out=map("%xmm$_",(2..9));
673my @inp=map("%xmm$_",(10..13));
674my ($counters,$zero)=("%xmm14","%xmm15");
675
676$code.=<<___;
677.type aesni_multi_cbc_encrypt_avx,\@function,3
678.align 32
679aesni_multi_cbc_encrypt_avx:
680.cfi_startproc
681_avx_cbc_enc_shortcut:
682 mov %rsp,%rax
683.cfi_def_cfa_register %rax
684 push %rbx
685.cfi_push %rbx
686 push %rbp
687.cfi_push %rbp
688 push %r12
689.cfi_push %r12
690 push %r13
691.cfi_push %r13
692 push %r14
693.cfi_push %r14
694 push %r15
695.cfi_push %r15
696___
697$code.=<<___ if ($win64);
698 lea -0xa8(%rsp),%rsp
699 movaps %xmm6,(%rsp)
700 movaps %xmm7,0x10(%rsp)
701 movaps %xmm8,0x20(%rsp)
702 movaps %xmm9,0x30(%rsp)
703 movaps %xmm10,0x40(%rsp)
704 movaps %xmm11,0x50(%rsp)
705 movaps %xmm12,-0x78(%rax)
706 movaps %xmm13,-0x68(%rax)
707 movaps %xmm14,-0x58(%rax)
708 movaps %xmm15,-0x48(%rax)
709___
710$code.=<<___;
711 # stack layout
712 #
713 # +0 output sink
714 # +16 input sink [original %rsp and $num]
715 # +32 counters
716 # +64 distances between inputs and outputs
717 # +128 off-load area for @inp[0..3]
718
719 sub \$192,%rsp
720 and \$-128,%rsp
721 mov %rax,16(%rsp) # original %rsp
722.cfi_cfa_expression %rsp+16,deref,+8
723
724.Lenc8x_body:
725 vzeroupper
726 vmovdqu ($key),$zero # 0-round key
727 lea 0x78($key),$key # size optimization
728 lea `$inp_elm_size*4`($inp),$inp
729 shr \$1,$num
730
731.Lenc8x_loop_grande:
732 #mov $num,24(%rsp) # original $num
733 xor $num,$num
734___
735for($i=0;$i<8;$i++) {
736 my $temp = $i ? $offload : $offset;
737 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
738 $temp_reg=&pointer_register($flavour,$temp);
739 $code.=<<___;
740 # borrow $one for number of blocks
741 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
742 # input pointer
743 mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
744 cmp $num,$one
745 # output pointer
746 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
747 cmovg $one,$num # find maximum
748 test $one,$one
749 # load IV
750 vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
751 mov $one,`32+4*$i`(%rsp) # initialize counters
752 cmovle %rsp,@ptr[$i] # cancel input
753 sub @ptr[$i],$temp # distance between input and output
754 mov $temp,`64+8*$i`(%rsp) # initialize distances
755___
756}
757$code.=<<___;
758 test $num,$num
759 jz .Lenc8x_done
760
761 vmovups 0x10-0x78($key),$rndkey1
762 vmovups 0x20-0x78($key),$rndkey0
763 mov 0xf0-0x78($key),$rounds
764
765 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
766 lea 128(%rsp),$offload # offload area
767 vpxor (@ptr[1]),$zero,@inp[1]
768 vpxor (@ptr[2]),$zero,@inp[2]
769 vpxor (@ptr[3]),$zero,@inp[3]
770 vpxor @inp[0],@out[0],@out[0]
771 vpxor (@ptr[4]),$zero,@inp[0]
772 vpxor @inp[1],@out[1],@out[1]
773 vpxor (@ptr[5]),$zero,@inp[1]
774 vpxor @inp[2],@out[2],@out[2]
775 vpxor (@ptr[6]),$zero,@inp[2]
776 vpxor @inp[3],@out[3],@out[3]
777 vpxor (@ptr[7]),$zero,@inp[3]
778 vpxor @inp[0],@out[4],@out[4]
779 mov \$1,$one # constant of 1
780 vpxor @inp[1],@out[5],@out[5]
781 vpxor @inp[2],@out[6],@out[6]
782 vpxor @inp[3],@out[7],@out[7]
783 jmp .Loop_enc8x
784
785.align 32
786.Loop_enc8x:
787___
788for($i=0;$i<8;$i++) {
789my $rndkey=($i&1)?$rndkey0:$rndkey1;
790$code.=<<___;
791 vaesenc $rndkey,@out[0],@out[0]
792 cmp 32+4*$i(%rsp),$one
793___
794$code.=<<___ if ($i);
795 mov 64+8*$i(%rsp),$offset
796___
797$code.=<<___;
798 vaesenc $rndkey,@out[1],@out[1]
799 prefetcht0 31(@ptr[$i]) # prefetch input
800 vaesenc $rndkey,@out[2],@out[2]
801___
802$code.=<<___ if ($i>1);
803 prefetcht0 15(@ptr[$i-2]) # prefetch output
804___
805$code.=<<___;
806 vaesenc $rndkey,@out[3],@out[3]
807 lea (@ptr[$i],$offset),$offset
808 cmovge %rsp,@ptr[$i] # cancel input
809 vaesenc $rndkey,@out[4],@out[4]
810 cmovg %rsp,$offset # sink output
811 vaesenc $rndkey,@out[5],@out[5]
812 sub @ptr[$i],$offset
813 vaesenc $rndkey,@out[6],@out[6]
814 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
815 mov $offset,64+8*$i(%rsp)
816 vaesenc $rndkey,@out[7],@out[7]
817 vmovups `16*(3+$i)-0x78`($key),$rndkey
818 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
819___
820$code.=<<___ if ($i<4)
821 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
822___
823}
824$code.=<<___;
825 vmovdqu 32(%rsp),$counters
826 prefetcht0 15(@ptr[$i-2]) # prefetch output
827 prefetcht0 15(@ptr[$i-1])
828 cmp \$11,$rounds
829 jb .Lenc8x_tail
830
831 vaesenc $rndkey1,@out[0],@out[0]
832 vaesenc $rndkey1,@out[1],@out[1]
833 vaesenc $rndkey1,@out[2],@out[2]
834 vaesenc $rndkey1,@out[3],@out[3]
835 vaesenc $rndkey1,@out[4],@out[4]
836 vaesenc $rndkey1,@out[5],@out[5]
837 vaesenc $rndkey1,@out[6],@out[6]
838 vaesenc $rndkey1,@out[7],@out[7]
839 vmovups 0xb0-0x78($key),$rndkey1
840
841 vaesenc $rndkey0,@out[0],@out[0]
842 vaesenc $rndkey0,@out[1],@out[1]
843 vaesenc $rndkey0,@out[2],@out[2]
844 vaesenc $rndkey0,@out[3],@out[3]
845 vaesenc $rndkey0,@out[4],@out[4]
846 vaesenc $rndkey0,@out[5],@out[5]
847 vaesenc $rndkey0,@out[6],@out[6]
848 vaesenc $rndkey0,@out[7],@out[7]
849 vmovups 0xc0-0x78($key),$rndkey0
850 je .Lenc8x_tail
851
852 vaesenc $rndkey1,@out[0],@out[0]
853 vaesenc $rndkey1,@out[1],@out[1]
854 vaesenc $rndkey1,@out[2],@out[2]
855 vaesenc $rndkey1,@out[3],@out[3]
856 vaesenc $rndkey1,@out[4],@out[4]
857 vaesenc $rndkey1,@out[5],@out[5]
858 vaesenc $rndkey1,@out[6],@out[6]
859 vaesenc $rndkey1,@out[7],@out[7]
860 vmovups 0xd0-0x78($key),$rndkey1
861
862 vaesenc $rndkey0,@out[0],@out[0]
863 vaesenc $rndkey0,@out[1],@out[1]
864 vaesenc $rndkey0,@out[2],@out[2]
865 vaesenc $rndkey0,@out[3],@out[3]
866 vaesenc $rndkey0,@out[4],@out[4]
867 vaesenc $rndkey0,@out[5],@out[5]
868 vaesenc $rndkey0,@out[6],@out[6]
869 vaesenc $rndkey0,@out[7],@out[7]
870 vmovups 0xe0-0x78($key),$rndkey0
871
872.Lenc8x_tail:
873 vaesenc $rndkey1,@out[0],@out[0]
874 vpxor $zero,$zero,$zero
875 vaesenc $rndkey1,@out[1],@out[1]
876 vaesenc $rndkey1,@out[2],@out[2]
877 vpcmpgtd $zero,$counters,$zero
878 vaesenc $rndkey1,@out[3],@out[3]
879 vaesenc $rndkey1,@out[4],@out[4]
880 vpaddd $counters,$zero,$zero # decrement counters
881 vmovdqu 48(%rsp),$counters
882 vaesenc $rndkey1,@out[5],@out[5]
883 mov 64(%rsp),$offset # pre-load 1st offset
884 vaesenc $rndkey1,@out[6],@out[6]
885 vaesenc $rndkey1,@out[7],@out[7]
886 vmovups 0x10-0x78($key),$rndkey1
887
888 vaesenclast $rndkey0,@out[0],@out[0]
889 vmovdqa $zero,32(%rsp) # update counters
890 vpxor $zero,$zero,$zero
891 vaesenclast $rndkey0,@out[1],@out[1]
892 vaesenclast $rndkey0,@out[2],@out[2]
893 vpcmpgtd $zero,$counters,$zero
894 vaesenclast $rndkey0,@out[3],@out[3]
895 vaesenclast $rndkey0,@out[4],@out[4]
896 vpaddd $zero,$counters,$counters # decrement counters
897 vmovdqu -0x78($key),$zero # 0-round
898 vaesenclast $rndkey0,@out[5],@out[5]
899 vaesenclast $rndkey0,@out[6],@out[6]
900 vmovdqa $counters,48(%rsp) # update counters
901 vaesenclast $rndkey0,@out[7],@out[7]
902 vmovups 0x20-0x78($key),$rndkey0
903
904 vmovups @out[0],-16(@ptr[0]) # write output
905 sub $offset,@ptr[0] # switch to input
906 vpxor 0x00($offload),@out[0],@out[0]
907 vmovups @out[1],-16(@ptr[1])
908 sub `64+1*8`(%rsp),@ptr[1]
909 vpxor 0x10($offload),@out[1],@out[1]
910 vmovups @out[2],-16(@ptr[2])
911 sub `64+2*8`(%rsp),@ptr[2]
912 vpxor 0x20($offload),@out[2],@out[2]
913 vmovups @out[3],-16(@ptr[3])
914 sub `64+3*8`(%rsp),@ptr[3]
915 vpxor 0x30($offload),@out[3],@out[3]
916 vmovups @out[4],-16(@ptr[4])
917 sub `64+4*8`(%rsp),@ptr[4]
918 vpxor @inp[0],@out[4],@out[4]
919 vmovups @out[5],-16(@ptr[5])
920 sub `64+5*8`(%rsp),@ptr[5]
921 vpxor @inp[1],@out[5],@out[5]
922 vmovups @out[6],-16(@ptr[6])
923 sub `64+6*8`(%rsp),@ptr[6]
924 vpxor @inp[2],@out[6],@out[6]
925 vmovups @out[7],-16(@ptr[7])
926 sub `64+7*8`(%rsp),@ptr[7]
927 vpxor @inp[3],@out[7],@out[7]
928
929 dec $num
930 jnz .Loop_enc8x
931
932 mov 16(%rsp),%rax # original %rsp
933.cfi_def_cfa %rax,8
934 #mov 24(%rsp),$num
935 #lea `$inp_elm_size*8`($inp),$inp
936 #dec $num
937 #jnz .Lenc8x_loop_grande
938
939.Lenc8x_done:
940 vzeroupper
941___
942$code.=<<___ if ($win64);
943 movaps -0xd8(%rax),%xmm6
944 movaps -0xc8(%rax),%xmm7
945 movaps -0xb8(%rax),%xmm8
946 movaps -0xa8(%rax),%xmm9
947 movaps -0x98(%rax),%xmm10
948 movaps -0x88(%rax),%xmm11
949 movaps -0x78(%rax),%xmm12
950 movaps -0x68(%rax),%xmm13
951 movaps -0x58(%rax),%xmm14
952 movaps -0x48(%rax),%xmm15
953___
954$code.=<<___;
955 mov -48(%rax),%r15
956.cfi_restore %r15
957 mov -40(%rax),%r14
958.cfi_restore %r14
959 mov -32(%rax),%r13
960.cfi_restore %r13
961 mov -24(%rax),%r12
962.cfi_restore %r12
963 mov -16(%rax),%rbp
964.cfi_restore %rbp
965 mov -8(%rax),%rbx
966.cfi_restore %rbx
967 lea (%rax),%rsp
968.cfi_def_cfa_register %rsp
969.Lenc8x_epilogue:
970 ret
971.cfi_endproc
972.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
973
974.type aesni_multi_cbc_decrypt_avx,\@function,3
975.align 32
976aesni_multi_cbc_decrypt_avx:
977.cfi_startproc
978_avx_cbc_dec_shortcut:
979 mov %rsp,%rax
980.cfi_def_cfa_register %rax
981 push %rbx
982.cfi_push %rbx
983 push %rbp
984.cfi_push %rbp
985 push %r12
986.cfi_push %r12
987 push %r13
988.cfi_push %r13
989 push %r14
990.cfi_push %r14
991 push %r15
992.cfi_push %r15
993___
994$code.=<<___ if ($win64);
995 lea -0xa8(%rsp),%rsp
996 movaps %xmm6,(%rsp)
997 movaps %xmm7,0x10(%rsp)
998 movaps %xmm8,0x20(%rsp)
999 movaps %xmm9,0x30(%rsp)
1000 movaps %xmm10,0x40(%rsp)
1001 movaps %xmm11,0x50(%rsp)
1002 movaps %xmm12,-0x78(%rax)
1003 movaps %xmm13,-0x68(%rax)
1004 movaps %xmm14,-0x58(%rax)
1005 movaps %xmm15,-0x48(%rax)
1006___
1007$code.=<<___;
1008 # stack layout
1009 #
1010 # +0 output sink
1011 # +16 input sink [original %rsp and $num]
1012 # +32 counters
1013 # +64 distances between inputs and outputs
1014 # +128 off-load area for @inp[0..3]
1015 # +192 IV/input offload
1016
1017 sub \$256,%rsp
1018 and \$-256,%rsp
1019 sub \$192,%rsp
1020 mov %rax,16(%rsp) # original %rsp
1021.cfi_cfa_expression %rsp+16,deref,+8
1022
1023.Ldec8x_body:
1024 vzeroupper
1025 vmovdqu ($key),$zero # 0-round key
1026 lea 0x78($key),$key # size optimization
1027 lea `$inp_elm_size*4`($inp),$inp
1028 shr \$1,$num
1029
1030.Ldec8x_loop_grande:
1031 #mov $num,24(%rsp) # original $num
1032 xor $num,$num
1033___
1034for($i=0;$i<8;$i++) {
1035 my $temp = $i ? $offload : $offset;
1036 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1037 $temp_reg=&pointer_register($flavour,$temp);
1038 $code.=<<___;
1039 # borrow $one for number of blocks
1040 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
1041 # input pointer
1042 mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
1043 cmp $num,$one
1044 # output pointer
1045 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
1046 cmovg $one,$num # find maximum
1047 test $one,$one
1048 # load IV
1049 vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
1050 mov $one,`32+4*$i`(%rsp) # initialize counters
1051 cmovle %rsp,@ptr[$i] # cancel input
1052 sub @ptr[$i],$temp # distance between input and output
1053 mov $temp,`64+8*$i`(%rsp) # initialize distances
1054 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
1055___
1056}
1057$code.=<<___;
1058 test $num,$num
1059 jz .Ldec8x_done
1060
1061 vmovups 0x10-0x78($key),$rndkey1
1062 vmovups 0x20-0x78($key),$rndkey0
1063 mov 0xf0-0x78($key),$rounds
1064 lea 192+128(%rsp),$offload # offload area
1065
1066 vmovdqu (@ptr[0]),@out[0] # load inputs
1067 vmovdqu (@ptr[1]),@out[1]
1068 vmovdqu (@ptr[2]),@out[2]
1069 vmovdqu (@ptr[3]),@out[3]
1070 vmovdqu (@ptr[4]),@out[4]
1071 vmovdqu (@ptr[5]),@out[5]
1072 vmovdqu (@ptr[6]),@out[6]
1073 vmovdqu (@ptr[7]),@out[7]
1074 vmovdqu @out[0],0x00($offload) # offload inputs
1075 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
1076 vmovdqu @out[1],0x10($offload)
1077 vpxor $zero,@out[1],@out[1]
1078 vmovdqu @out[2],0x20($offload)
1079 vpxor $zero,@out[2],@out[2]
1080 vmovdqu @out[3],0x30($offload)
1081 vpxor $zero,@out[3],@out[3]
1082 vmovdqu @out[4],0x40($offload)
1083 vpxor $zero,@out[4],@out[4]
1084 vmovdqu @out[5],0x50($offload)
1085 vpxor $zero,@out[5],@out[5]
1086 vmovdqu @out[6],0x60($offload)
1087 vpxor $zero,@out[6],@out[6]
1088 vmovdqu @out[7],0x70($offload)
1089 vpxor $zero,@out[7],@out[7]
1090 xor \$0x80,$offload
1091 mov \$1,$one # constant of 1
1092 jmp .Loop_dec8x
1093
1094.align 32
1095.Loop_dec8x:
1096___
1097for($i=0;$i<8;$i++) {
1098my $rndkey=($i&1)?$rndkey0:$rndkey1;
1099$code.=<<___;
1100 vaesdec $rndkey,@out[0],@out[0]
1101 cmp 32+4*$i(%rsp),$one
1102___
1103$code.=<<___ if ($i);
1104 mov 64+8*$i(%rsp),$offset
1105___
1106$code.=<<___;
1107 vaesdec $rndkey,@out[1],@out[1]
1108 prefetcht0 31(@ptr[$i]) # prefetch input
1109 vaesdec $rndkey,@out[2],@out[2]
1110___
1111$code.=<<___ if ($i>1);
1112 prefetcht0 15(@ptr[$i-2]) # prefetch output
1113___
1114$code.=<<___;
1115 vaesdec $rndkey,@out[3],@out[3]
1116 lea (@ptr[$i],$offset),$offset
1117 cmovge %rsp,@ptr[$i] # cancel input
1118 vaesdec $rndkey,@out[4],@out[4]
1119 cmovg %rsp,$offset # sink output
1120 vaesdec $rndkey,@out[5],@out[5]
1121 sub @ptr[$i],$offset
1122 vaesdec $rndkey,@out[6],@out[6]
1123 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
1124 mov $offset,64+8*$i(%rsp)
1125 vaesdec $rndkey,@out[7],@out[7]
1126 vmovups `16*(3+$i)-0x78`($key),$rndkey
1127 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
1128___
1129$code.=<<___ if ($i<4);
1130 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
1131___
1132}
1133$code.=<<___;
1134 vmovdqu 32(%rsp),$counters
1135 prefetcht0 15(@ptr[$i-2]) # prefetch output
1136 prefetcht0 15(@ptr[$i-1])
1137 cmp \$11,$rounds
1138 jb .Ldec8x_tail
1139
1140 vaesdec $rndkey1,@out[0],@out[0]
1141 vaesdec $rndkey1,@out[1],@out[1]
1142 vaesdec $rndkey1,@out[2],@out[2]
1143 vaesdec $rndkey1,@out[3],@out[3]
1144 vaesdec $rndkey1,@out[4],@out[4]
1145 vaesdec $rndkey1,@out[5],@out[5]
1146 vaesdec $rndkey1,@out[6],@out[6]
1147 vaesdec $rndkey1,@out[7],@out[7]
1148 vmovups 0xb0-0x78($key),$rndkey1
1149
1150 vaesdec $rndkey0,@out[0],@out[0]
1151 vaesdec $rndkey0,@out[1],@out[1]
1152 vaesdec $rndkey0,@out[2],@out[2]
1153 vaesdec $rndkey0,@out[3],@out[3]
1154 vaesdec $rndkey0,@out[4],@out[4]
1155 vaesdec $rndkey0,@out[5],@out[5]
1156 vaesdec $rndkey0,@out[6],@out[6]
1157 vaesdec $rndkey0,@out[7],@out[7]
1158 vmovups 0xc0-0x78($key),$rndkey0
1159 je .Ldec8x_tail
1160
1161 vaesdec $rndkey1,@out[0],@out[0]
1162 vaesdec $rndkey1,@out[1],@out[1]
1163 vaesdec $rndkey1,@out[2],@out[2]
1164 vaesdec $rndkey1,@out[3],@out[3]
1165 vaesdec $rndkey1,@out[4],@out[4]
1166 vaesdec $rndkey1,@out[5],@out[5]
1167 vaesdec $rndkey1,@out[6],@out[6]
1168 vaesdec $rndkey1,@out[7],@out[7]
1169 vmovups 0xd0-0x78($key),$rndkey1
1170
1171 vaesdec $rndkey0,@out[0],@out[0]
1172 vaesdec $rndkey0,@out[1],@out[1]
1173 vaesdec $rndkey0,@out[2],@out[2]
1174 vaesdec $rndkey0,@out[3],@out[3]
1175 vaesdec $rndkey0,@out[4],@out[4]
1176 vaesdec $rndkey0,@out[5],@out[5]
1177 vaesdec $rndkey0,@out[6],@out[6]
1178 vaesdec $rndkey0,@out[7],@out[7]
1179 vmovups 0xe0-0x78($key),$rndkey0
1180
1181.Ldec8x_tail:
1182 vaesdec $rndkey1,@out[0],@out[0]
1183 vpxor $zero,$zero,$zero
1184 vaesdec $rndkey1,@out[1],@out[1]
1185 vaesdec $rndkey1,@out[2],@out[2]
1186 vpcmpgtd $zero,$counters,$zero
1187 vaesdec $rndkey1,@out[3],@out[3]
1188 vaesdec $rndkey1,@out[4],@out[4]
1189 vpaddd $counters,$zero,$zero # decrement counters
1190 vmovdqu 48(%rsp),$counters
1191 vaesdec $rndkey1,@out[5],@out[5]
1192 mov 64(%rsp),$offset # pre-load 1st offset
1193 vaesdec $rndkey1,@out[6],@out[6]
1194 vaesdec $rndkey1,@out[7],@out[7]
1195 vmovups 0x10-0x78($key),$rndkey1
1196
1197 vaesdeclast $rndkey0,@out[0],@out[0]
1198 vmovdqa $zero,32(%rsp) # update counters
1199 vpxor $zero,$zero,$zero
1200 vaesdeclast $rndkey0,@out[1],@out[1]
1201 vpxor 0x00($offload),@out[0],@out[0] # xor with IV
1202 vaesdeclast $rndkey0,@out[2],@out[2]
1203 vpxor 0x10($offload),@out[1],@out[1]
1204 vpcmpgtd $zero,$counters,$zero
1205 vaesdeclast $rndkey0,@out[3],@out[3]
1206 vpxor 0x20($offload),@out[2],@out[2]
1207 vaesdeclast $rndkey0,@out[4],@out[4]
1208 vpxor 0x30($offload),@out[3],@out[3]
1209 vpaddd $zero,$counters,$counters # decrement counters
1210 vmovdqu -0x78($key),$zero # 0-round
1211 vaesdeclast $rndkey0,@out[5],@out[5]
1212 vpxor 0x40($offload),@out[4],@out[4]
1213 vaesdeclast $rndkey0,@out[6],@out[6]
1214 vpxor 0x50($offload),@out[5],@out[5]
1215 vmovdqa $counters,48(%rsp) # update counters
1216 vaesdeclast $rndkey0,@out[7],@out[7]
1217 vpxor 0x60($offload),@out[6],@out[6]
1218 vmovups 0x20-0x78($key),$rndkey0
1219
1220 vmovups @out[0],-16(@ptr[0]) # write output
1221 sub $offset,@ptr[0] # switch to input
1222 vmovdqu 128+0(%rsp),@out[0]
1223 vpxor 0x70($offload),@out[7],@out[7]
1224 vmovups @out[1],-16(@ptr[1])
1225 sub `64+1*8`(%rsp),@ptr[1]
1226 vmovdqu @out[0],0x00($offload)
1227 vpxor $zero,@out[0],@out[0]
1228 vmovdqu 128+16(%rsp),@out[1]
1229 vmovups @out[2],-16(@ptr[2])
1230 sub `64+2*8`(%rsp),@ptr[2]
1231 vmovdqu @out[1],0x10($offload)
1232 vpxor $zero,@out[1],@out[1]
1233 vmovdqu 128+32(%rsp),@out[2]
1234 vmovups @out[3],-16(@ptr[3])
1235 sub `64+3*8`(%rsp),@ptr[3]
1236 vmovdqu @out[2],0x20($offload)
1237 vpxor $zero,@out[2],@out[2]
1238 vmovdqu 128+48(%rsp),@out[3]
1239 vmovups @out[4],-16(@ptr[4])
1240 sub `64+4*8`(%rsp),@ptr[4]
1241 vmovdqu @out[3],0x30($offload)
1242 vpxor $zero,@out[3],@out[3]
1243 vmovdqu @inp[0],0x40($offload)
1244 vpxor @inp[0],$zero,@out[4]
1245 vmovups @out[5],-16(@ptr[5])
1246 sub `64+5*8`(%rsp),@ptr[5]
1247 vmovdqu @inp[1],0x50($offload)
1248 vpxor @inp[1],$zero,@out[5]
1249 vmovups @out[6],-16(@ptr[6])
1250 sub `64+6*8`(%rsp),@ptr[6]
1251 vmovdqu @inp[2],0x60($offload)
1252 vpxor @inp[2],$zero,@out[6]
1253 vmovups @out[7],-16(@ptr[7])
1254 sub `64+7*8`(%rsp),@ptr[7]
1255 vmovdqu @inp[3],0x70($offload)
1256 vpxor @inp[3],$zero,@out[7]
1257
1258 xor \$128,$offload
1259 dec $num
1260 jnz .Loop_dec8x
1261
1262 mov 16(%rsp),%rax # original %rsp
1263.cfi_def_cfa %rax,8
1264 #mov 24(%rsp),$num
1265 #lea `$inp_elm_size*8`($inp),$inp
1266 #dec $num
1267 #jnz .Ldec8x_loop_grande
1268
1269.Ldec8x_done:
1270 vzeroupper
1271___
1272$code.=<<___ if ($win64);
1273 movaps -0xd8(%rax),%xmm6
1274 movaps -0xc8(%rax),%xmm7
1275 movaps -0xb8(%rax),%xmm8
1276 movaps -0xa8(%rax),%xmm9
1277 movaps -0x98(%rax),%xmm10
1278 movaps -0x88(%rax),%xmm11
1279 movaps -0x78(%rax),%xmm12
1280 movaps -0x68(%rax),%xmm13
1281 movaps -0x58(%rax),%xmm14
1282 movaps -0x48(%rax),%xmm15
1283___
1284$code.=<<___;
1285 mov -48(%rax),%r15
1286.cfi_restore %r15
1287 mov -40(%rax),%r14
1288.cfi_restore %r14
1289 mov -32(%rax),%r13
1290.cfi_restore %r13
1291 mov -24(%rax),%r12
1292.cfi_restore %r12
1293 mov -16(%rax),%rbp
1294.cfi_restore %rbp
1295 mov -8(%rax),%rbx
1296.cfi_restore %rbx
1297 lea (%rax),%rsp
1298.cfi_def_cfa_register %rsp
1299.Ldec8x_epilogue:
1300 ret
1301.cfi_endproc
1302.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1303___
1304 }}}
1305
# Win64 structured-exception support.  Everything in this block is appended
# to $code only when $win64 is set.  It emits one shared handler routine,
# se_handler, followed by the .pdata/.xdata tables that associate that
# handler with each aesni_multi_cbc_* entry point.
1306if ($win64) {
1307# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1308# CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Register names for the four handler arguments listed above.  Only $context
# and $disp are interpolated into the assembly that follows; $rec and $frame
# are assigned but not referenced in this block.
1309$rec="%rcx";
1310$frame="%rdx";
1311$context="%r8";
1312$disp="%r9";
1313
# se_handler, as written in the heredoc below:
#  - context->Rip is compared against HandlerData[0] and HandlerData[1]
#    (image-relative offsets of a "body" and an "epilogue" label, see the
#    .xdata records further down).
#  - Rip below the body label: context->Rax is used as the frame base and
#    control goes straight to .Lin_prologue.
#  - Rip at or past the epilogue label: context->Rsp is used as the frame
#    base and control goes straight to .Lin_prologue.
#  - Otherwise the saved stack pointer is read from 16(context->Rsp), the six
#    general registers stored just below it (rbx, rbp, r12-r15) are written
#    back into CONTEXT, and 20 quadwords starting at -56-10*16 from that
#    pointer are copied to &context.Xmm6.
#  - .Lin_prologue: Rsp/Rsi/Rdi are stored into CONTEXT, the whole CONTEXT
#    (154 quadwords) is copied to disp->ContextRecord, RtlVirtualUnwind is
#    called, and the routine returns 1 (ExceptionContinueSearch).
# NOTE(review): using context->Rax as the frame base in the first case
# presumably relies on the function prologues copying %rsp into %rax before
# the body label -- those prologues are outside this section; confirm there.
1314$code.=<<___;
1315.extern __imp_RtlVirtualUnwind
1316.type se_handler,\@abi-omnipotent
1317.align 16
1318se_handler:
1319 push %rsi
1320 push %rdi
1321 push %rbx
1322 push %rbp
1323 push %r12
1324 push %r13
1325 push %r14
1326 push %r15
1327 pushfq
1328 sub \$64,%rsp
1329
1330 mov 120($context),%rax # pull context->Rax
1331 mov 248($context),%rbx # pull context->Rip
1332
1333 mov 8($disp),%rsi # disp->ImageBase
1334 mov 56($disp),%r11 # disp->HandlerData
1335
1336 mov 0(%r11),%r10d # HandlerData[0]
1337 lea (%rsi,%r10),%r10 # prologue label
1338 cmp %r10,%rbx # context->Rip<.Lprologue
1339 jb .Lin_prologue
1340
1341 mov 152($context),%rax # pull context->Rsp
1342
1343 mov 4(%r11),%r10d # HandlerData[1]
1344 lea (%rsi,%r10),%r10 # epilogue label
1345 cmp %r10,%rbx # context->Rip>=.Lepilogue
1346 jae .Lin_prologue
1347
1348 mov 16(%rax),%rax # pull saved stack pointer
1349
1350 mov -8(%rax),%rbx
1351 mov -16(%rax),%rbp
1352 mov -24(%rax),%r12
1353 mov -32(%rax),%r13
1354 mov -40(%rax),%r14
1355 mov -48(%rax),%r15
1356 mov %rbx,144($context) # restore context->Rbx
1357 mov %rbp,160($context) # restore context->Rbp
1358 mov %r12,216($context) # restore context->R12
1359 mov %r13,224($context) # restore context->R13
1360 mov %r14,232($context) # restore context->R14
1361 mov %r15,240($context) # restore context->R15
1362
1363 lea -56-10*16(%rax),%rsi
1364 lea 512($context),%rdi # &context.Xmm6
1365 mov \$20,%ecx
1366 .long 0xa548f3fc # cld; rep movsq
1367
1368.Lin_prologue:
1369 mov 8(%rax),%rdi
1370 mov 16(%rax),%rsi
1371 mov %rax,152($context) # restore context->Rsp
1372 mov %rsi,168($context) # restore context->Rsi
1373 mov %rdi,176($context) # restore context->Rdi
1374
1375 mov 40($disp),%rdi # disp->ContextRecord
1376 mov $context,%rsi # context
1377 mov \$154,%ecx # sizeof(CONTEXT)
1378 .long 0xa548f3fc # cld; rep movsq
1379
1380 mov $disp,%rsi
1381 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1382 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1383 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1384 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1385 mov 40(%rsi),%r10 # disp->ContextRecord
1386 lea 56(%rsi),%r11 # &disp->HandlerData
1387 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1388 mov %r10,32(%rsp) # arg5
1389 mov %r11,40(%rsp) # arg6
1390 mov %r12,48(%rsp) # arg7
1391 mov %rcx,56(%rsp) # arg8, (NULL)
1392 call *__imp_RtlVirtualUnwind(%rip)
1393
1394 mov \$1,%eax # ExceptionContinueSearch
1395 add \$64,%rsp
1396 popfq
1397 pop %r15
1398 pop %r14
1399 pop %r13
1400 pop %r12
1401 pop %rbp
1402 pop %rbx
1403 pop %rdi
1404 pop %rsi
1405 ret
1406.size se_handler,.-se_handler
1407
1408.section .pdata
1409.align 4
1410 .rva .LSEH_begin_aesni_multi_cbc_encrypt
1411 .rva .LSEH_end_aesni_multi_cbc_encrypt
1412 .rva .LSEH_info_aesni_multi_cbc_encrypt
1413 .rva .LSEH_begin_aesni_multi_cbc_decrypt
1414 .rva .LSEH_end_aesni_multi_cbc_decrypt
1415 .rva .LSEH_info_aesni_multi_cbc_decrypt
1416___
# .pdata triplets (begin, end, info) for the two *_avx entry points; appended
# only when $avx is set, so they stay in step with the matching .xdata
# records emitted under the same condition below.
1417$code.=<<___ if ($avx);
1418 .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
1419 .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
1420 .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
1421 .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
1422 .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
1423 .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
1424___
# .xdata: one record per function.  Each record names se_handler and carries
# two RVAs as HandlerData[]: the function's body label and its epilogue
# label, which are the two bounds se_handler compares context->Rip against.
1425$code.=<<___;
1426.section .xdata
1427.align 8
1428.LSEH_info_aesni_multi_cbc_encrypt:
1429 .byte 9,0,0,0
1430 .rva se_handler
1431 .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
1432.LSEH_info_aesni_multi_cbc_decrypt:
1433 .byte 9,0,0,0
1434 .rva se_handler
1435 .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
1436___
# Matching .xdata records for the *_avx entry points, again only when $avx.
1437$code.=<<___ if ($avx);
1438.LSEH_info_aesni_multi_cbc_encrypt_avx:
1439 .byte 9,0,0,0
1440 .rva se_handler
1441 .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
1442.LSEH_info_aesni_multi_cbc_decrypt_avx:
1443 .byte 9,0,0,0
1444 .rva se_handler
1445 .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
1446___
1447}
1448####################################################################
1449
# Append a REX prefix byte to an instruction encoding under construction,
# but only when at least one of the two register numbers is 8 or higher.
#
#   $bytes - reference to the array of opcode bytes; extended in place
#   $dst   - register number; 8 or above sets bit 0x04 of the prefix
#   $src   - register number; 8 or above sets bit 0x01 of the prefix
#
# When both numbers are below 8 the array is left untouched.
sub rex {
    my ($bytes, $dst, $src) = @_;

    # Collect the extension bits first; 0x40 is added only if any is set.
    my $bits = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    push @{$bytes}, 0x40 | $bits if ($bits);
}
1459
# Translate one AES-NI instruction, given as AT&T-syntax text, into an
# equivalent ".byte" directive so the output does not depend on the assembler
# knowing the mnemonic.
#
#   $line - instruction text; the caller passes the part of a line from the
#           "aes" mnemonic up to its last %xmmN operand.
#
# Three operand shapes are recognised:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
#
# Returns the ".byte\t..." string for a recognised instruction, undef when
# the shape matches but the mnemonic is not in the opcode table (the caller's
# s///e then substitutes an empty string), or $line unchanged when no shape
# matches.
#
# For the (%rsp) form, displacements up to 127 use the one-byte displacement
# encoding; larger ones use the four-byte form.  They were previously masked
# to eight bits, which encoded anything above 127 as a negative offset.
sub aesni {
    my $line = shift;
    my @opcode = (0x66);    # prefix byte that starts every encoding built here

    if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures before any further pattern match can reset them.
        my ($imm, $src, $dst) = ($2, $3, $4);

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # A leading 0 means hex or octal notation; oct() parses both.
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        return undef if (!defined($opcodelet{$mnemonic}));
        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        return undef if (!defined($opcodelet{$mnemonic}));

        # The pattern allows the displacement to be omitted; treat that as 0.
        my $disp = length($off) ? ($off =~ /^0/ ? oct($off) : $off) : 0;

        push @opcode, 0x44 if ($dst >= 8);    # REX prefix for %xmm8..%xmm15
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        if ($disp <= 0x7f) {
            # mod=01: %rsp base via SIB byte 0x24, one-byte displacement.
            push @opcode, 0x44 | (($dst & 7) << 3), 0x24;
            push @opcode, $disp & 0xff;
        }
        else {
            # mod=10: same SIB byte, four-byte little-endian displacement.
            push @opcode, 0x84 | (($dst & 7) << 3), 0x24;
            push @opcode, map { ($disp >> (8 * $_)) & 0xff } 0 .. 3;
        }
        return ".byte\t" . join(',', @opcode);
    }
    return $line;
}
1499
# Post-process the accumulated assembly text in $code, then emit it.
#
# Pass 1: every `...` span is replaced by the result of evaluating its
# contents as a Perl expression; the generator uses this for constant
# arithmetic such as `64+1*8`.
1500$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Pass 2: on each line, the text from a word-boundary "aes" up to the last
# %xmmN operand is handed to aesni() and replaced by its result; anything
# after that operand on the same line is dropped.  Because of the \b, VEX
# mnemonics such as vaesdec do not match and are left for the assembler.
# This must run after pass 1 so aesni() sees already-evaluated operands.
1501$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
1502
# Write the generated assembly to STDOUT; the close is checked so that a
# failure to flush the output aborts with a message instead of passing
# silently.
1503print $code;
1504close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette