VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1g/crypto/aes/asm/aesni-mb-x86_64.pl@ 83916

Last change on this file since 83916 was 83916, checked in by vboxsync, 5 years ago

openssl-1.1.1g: Applied and adjusted our OpenSSL changes to 1.1.1g. bugref:9719

File size: 36.8 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# Multi-buffer AES-NI procedures process several independent buffers
18# in parallel by interleaving independent instructions.
19#
20# Cycles per byte for interleave factor 4:
21#
22#               asymptotic    measured
23#               ---------------------------
24# Westmere      5.00/4=1.25   5.13/4=1.28
25# Atom          15.0/4=3.75   ?15.7/4=3.93
26# Sandy Bridge  5.06/4=1.27   5.18/4=1.29
27# Ivy Bridge    5.06/4=1.27   5.14/4=1.29
28# Haswell       4.44/4=1.11   4.44/4=1.11
29# Bulldozer     5.75/4=1.44   5.76/4=1.44
30#
31# Cycles per byte for interleave factor 8 (not implemented for
32# pre-AVX processors, where higher interleave factor incidentally
33# doesn't result in improvement):
34#
35#               asymptotic    measured
36#               ---------------------------
37# Sandy Bridge  5.06/8=0.64   7.10/8=0.89(*)
38# Ivy Bridge    5.06/8=0.64   7.14/8=0.89(*)
39# Haswell       5.00/8=0.63   5.00/8=0.63
40# Bulldozer     5.75/8=0.72   5.77/8=0.72
41#
42# (*) Sandy/Ivy Bridge are known to handle high interleave factors
43# suboptimally.
44
# Command line: [flavour] output.  A first argument that contains a dot is
# taken to be the output file name, in which case $flavour is left undefined.
45$flavour = shift;
46$output = shift;
47if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
48
# $win64 is set for nasm/masm/mingw64 flavours or a *.asm output file; it
# gates the %xmm6-%xmm15 save/restore sequences emitted further down.
49$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
50
# $dir is the directory part of this script's own path ($0).  The
# x86_64-xlate.pl translator is looked for there first, then under
# ../../perlasm/ relative to it; generation stops if neither file exists.
51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
53( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
54die "can't locate x86_64-xlate.pl";
55
# Probe the toolchain and set $avx to 0, 1 or 2: each probe adds up two
# boolean version comparisons.  The sections visible in this part of the
# file only test $avx for truth, to decide whether the 8x AVX code paths
# are emitted.
# NOTE(review): the captured versions are compared as decimal numbers, so
# e.g. "2.9" compares greater than "2.19" -- confirm this is acceptable.
56$avx=0;
57
# GNU assembler, reached through the compiler driver named by $ENV{CC}.
58if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
59 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
60 $avx = ($1>=2.19) + ($1>=2.22);
61}
62
# NASM, only consulted for Win64 builds when nothing was detected above.
63if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65 $avx = ($1>=2.09) + ($1>=2.10);
66}
67
# Microsoft ml64, only consulted for Win64 builds; major version only.
68if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
70 $avx = ($1>=10) + ($1>=11);
71}
72
# clang / LLVM-based compiler named by $ENV{CC}; the version is capture $2.
73if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
74 $avx = ($2>=3.0) + ($2>3.0);
75}
76
# Pipe everything written to STDOUT through the x86_64-xlate.pl translator,
# which is run with the current perl binary ($^X) and handed the flavour
# and the output file name.  Abort straight away if the translator process
# cannot be started, rather than carrying on and producing no output.
77open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
78*STDOUT=*OUT;
79
80# void aesni_multi_cbc_encrypt (
81# struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
82# const AES_KEY *key,
83# int num); /* 1 or 2 */
84#
# Register assignment for the 4x (non-AVX) code paths below.
85$inp="%rdi"; # 1st arg
86$key="%rsi"; # 2nd arg
87$num="%edx"; # 3rd arg
88
89@inptr=map("%r$_",(8..11)); # %r8-%r11: per-lane input pointers
90@outptr=map("%r$_",(12..15)); # %r12-%r15: per-lane output pointers
91
92($rndkey0,$rndkey1)=("%xmm0","%xmm1"); # two alternating round-key registers
93@out=map("%xmm$_",(2..5)); # %xmm2-%xmm5: per-lane cipher state
94@inp=map("%xmm$_",(6..9)); # %xmm6-%xmm9: per-lane input blocks (IVs on the decrypt path)
95($counters,$mask,$zero)=map("%xmm$_",(10..12)); # block counters, scratch mask, 0-round key
96
97($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx"); # value read from key+0xf0, constant 1, sink pointer, running byte offset
98
99$code.=<<___;
100.text
101
102.extern OPENSSL_ia32cap_P
103
104.globl aesni_multi_cbc_encrypt
105.type aesni_multi_cbc_encrypt,\@function,3
106.align 32
107aesni_multi_cbc_encrypt:
108.cfi_startproc
109___
110$code.=<<___ if ($avx);
111 cmp \$2,$num
112 jb .Lenc_non_avx
113 mov OPENSSL_ia32cap_P+4(%rip),%ecx
114 test \$`1<<28`,%ecx # AVX bit
115 jnz _avx_cbc_enc_shortcut
116 jmp .Lenc_non_avx
117.align 16
118.Lenc_non_avx:
119___
120$code.=<<___;
121 mov %rsp,%rax
122.cfi_def_cfa_register %rax
123 push %rbx
124.cfi_push %rbx
125 push %rbp
126.cfi_push %rbp
127 push %r12
128.cfi_push %r12
129 push %r13
130.cfi_push %r13
131 push %r14
132.cfi_push %r14
133 push %r15
134.cfi_push %r15
135___
136$code.=<<___ if ($win64);
137 lea -0xa8(%rsp),%rsp
138 movaps %xmm6,(%rsp)
139 movaps %xmm7,0x10(%rsp)
140 movaps %xmm8,0x20(%rsp)
141 movaps %xmm9,0x30(%rsp)
142 movaps %xmm10,0x40(%rsp)
143 movaps %xmm11,0x50(%rsp)
144 movaps %xmm12,0x60(%rsp)
145 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
146 movaps %xmm14,-0x58(%rax)
147 movaps %xmm15,-0x48(%rax)
148___
149$code.=<<___;
150 # stack layout
151 #
152 # +0 output sink
153 # +16 input sink [original %rsp and $num]
154 # +32 counters
155
156 sub \$48,%rsp
157 and \$-64,%rsp
158 mov %rax,16(%rsp) # original %rsp
159.cfi_cfa_expression %rsp+16,deref,+8
160
161.Lenc4x_body:
162 movdqu ($key),$zero # 0-round key
163 lea 0x78($key),$key # size optimization
164 lea 40*2($inp),$inp
165
166.Lenc4x_loop_grande:
167 mov $num,24(%rsp) # original $num
168 xor $num,$num
169___
# Emit the set-up code for each of the four descriptors handled per pass.
# A descriptor is 40 bytes: input pointer at +0, output pointer at +8,
# block count at +16 and IV at +24.  The -40*2 terms compensate for the
# 40*2 bias added to $inp just before .Lenc4x_loop_grande.  $num ends up
# holding the largest block count; a lane whose count is not positive gets
# its input pointer replaced with %rsp.
170for($i=0;$i<4;$i++) {
171 $code.=<<___;
172 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
173 mov `40*$i+0-40*2`($inp),@inptr[$i]
174 cmp $num,$one
175 mov `40*$i+8-40*2`($inp),@outptr[$i]
176 cmovg $one,$num # find maximum
177 test $one,$one
178 movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
179 mov $one,`32+4*$i`(%rsp) # initialize counters
180 cmovle %rsp,@inptr[$i] # cancel input
181___
182}
# Entry to the 4x encrypt loop: leave early when no lane has work, fold the
# 0-round key and the first input block of every lane into the IVs, then
# start .Loop_enc4x with the first AES round interleaved with input
# prefetches.  The fourth prefetch targets @inptr[3]; it used to repeat
# @inptr[2], which left lane 3 without an input prefetch (the decrypt loop
# already prefetches all four lanes).
183$code.=<<___;
184 test $num,$num
185 jz .Lenc4x_done
186
187 movups 0x10-0x78($key),$rndkey1
188 pxor $zero,@out[0]
189 movups 0x20-0x78($key),$rndkey0
190 pxor $zero,@out[1]
191 mov 0xf0-0x78($key),$rounds
192 pxor $zero,@out[2]
193 movdqu (@inptr[0]),@inp[0] # load inputs
194 pxor $zero,@out[3]
195 movdqu (@inptr[1]),@inp[1]
196 pxor @inp[0],@out[0]
197 movdqu (@inptr[2]),@inp[2]
198 pxor @inp[1],@out[1]
199 movdqu (@inptr[3]),@inp[3]
200 pxor @inp[2],@out[2]
201 pxor @inp[3],@out[3]
202 movdqa 32(%rsp),$counters # load counters
203 xor $offset,$offset
204 jmp .Loop_enc4x
205
206.align 32
207.Loop_enc4x:
208 add \$16,$offset
209 lea 16(%rsp),$sink # sink pointer
210 mov \$1,$one # constant of 1
211 sub $offset,$sink
212
213 aesenc $rndkey1,@out[0]
214 prefetcht0 31(@inptr[0],$offset) # prefetch input
215 prefetcht0 31(@inptr[1],$offset)
216 aesenc $rndkey1,@out[1]
217 prefetcht0 31(@inptr[2],$offset)
218 prefetcht0 31(@inptr[3],$offset)
219 aesenc $rndkey1,@out[2]
220 aesenc $rndkey1,@out[3]
221 movups 0x30-0x78($key),$rndkey1
222___
# Emit four more AES rounds, one per iteration, alternating between
# $rndkey0 (even $i) and $rndkey1 (odd $i).  Round $i is interleaved with
# the bookkeeping for lane $i: its counter at 32+4*$i(%rsp) is compared
# against the constant 1 held in $one, and the lane's input and output
# pointers are conditionally replaced with $sink.  After its round, the key
# register is reloaded from offset 0x40+16*$i of the key schedule (the
# -0x78 compensates for the bias applied to $key).
223for($i=0;$i<4;$i++) {
224my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
225$code.=<<___;
226 cmp `32+4*$i`(%rsp),$one
227 aesenc $rndkey,@out[0]
228 aesenc $rndkey,@out[1]
229 aesenc $rndkey,@out[2]
230 cmovge $sink,@inptr[$i] # cancel input
231 cmovg $sink,@outptr[$i] # sink output
232 aesenc $rndkey,@out[3]
233 movups `0x40+16*$i-0x78`($key),$rndkey
234___
235}
236$code.=<<___;
237 movdqa $counters,$mask
238 aesenc $rndkey0,@out[0]
239 prefetcht0 15(@outptr[0],$offset) # prefetch output
240 prefetcht0 15(@outptr[1],$offset)
241 aesenc $rndkey0,@out[1]
242 prefetcht0 15(@outptr[2],$offset)
243 prefetcht0 15(@outptr[3],$offset)
244 aesenc $rndkey0,@out[2]
245 aesenc $rndkey0,@out[3]
246 movups 0x80-0x78($key),$rndkey0
247 pxor $zero,$zero
248
249 aesenc $rndkey1,@out[0]
250 pcmpgtd $zero,$mask
251 movdqu -0x78($key),$zero # reload 0-round key
252 aesenc $rndkey1,@out[1]
253 paddd $mask,$counters # decrement counters
254 movdqa $counters,32(%rsp) # update counters
255 aesenc $rndkey1,@out[2]
256 aesenc $rndkey1,@out[3]
257 movups 0x90-0x78($key),$rndkey1
258
259 cmp \$11,$rounds
260
261 aesenc $rndkey0,@out[0]
262 aesenc $rndkey0,@out[1]
263 aesenc $rndkey0,@out[2]
264 aesenc $rndkey0,@out[3]
265 movups 0xa0-0x78($key),$rndkey0
266
267 jb .Lenc4x_tail
268
269 aesenc $rndkey1,@out[0]
270 aesenc $rndkey1,@out[1]
271 aesenc $rndkey1,@out[2]
272 aesenc $rndkey1,@out[3]
273 movups 0xb0-0x78($key),$rndkey1
274
275 aesenc $rndkey0,@out[0]
276 aesenc $rndkey0,@out[1]
277 aesenc $rndkey0,@out[2]
278 aesenc $rndkey0,@out[3]
279 movups 0xc0-0x78($key),$rndkey0
280
281 je .Lenc4x_tail
282
283 aesenc $rndkey1,@out[0]
284 aesenc $rndkey1,@out[1]
285 aesenc $rndkey1,@out[2]
286 aesenc $rndkey1,@out[3]
287 movups 0xd0-0x78($key),$rndkey1
288
289 aesenc $rndkey0,@out[0]
290 aesenc $rndkey0,@out[1]
291 aesenc $rndkey0,@out[2]
292 aesenc $rndkey0,@out[3]
293 movups 0xe0-0x78($key),$rndkey0
294 jmp .Lenc4x_tail
295
296.align 32
297.Lenc4x_tail:
298 aesenc $rndkey1,@out[0]
299 aesenc $rndkey1,@out[1]
300 aesenc $rndkey1,@out[2]
301 aesenc $rndkey1,@out[3]
302 movdqu (@inptr[0],$offset),@inp[0]
303 movdqu 0x10-0x78($key),$rndkey1
304
305 aesenclast $rndkey0,@out[0]
306 movdqu (@inptr[1],$offset),@inp[1]
307 pxor $zero,@inp[0]
308 aesenclast $rndkey0,@out[1]
309 movdqu (@inptr[2],$offset),@inp[2]
310 pxor $zero,@inp[1]
311 aesenclast $rndkey0,@out[2]
312 movdqu (@inptr[3],$offset),@inp[3]
313 pxor $zero,@inp[2]
314 aesenclast $rndkey0,@out[3]
315 movdqu 0x20-0x78($key),$rndkey0
316 pxor $zero,@inp[3]
317
318 movups @out[0],-16(@outptr[0],$offset)
319 pxor @inp[0],@out[0]
320 movups @out[1],-16(@outptr[1],$offset)
321 pxor @inp[1],@out[1]
322 movups @out[2],-16(@outptr[2],$offset)
323 pxor @inp[2],@out[2]
324 movups @out[3],-16(@outptr[3],$offset)
325 pxor @inp[3],@out[3]
326
327 dec $num
328 jnz .Loop_enc4x
329
330 mov 16(%rsp),%rax # original %rsp
331.cfi_def_cfa %rax,8
332 mov 24(%rsp),$num
333
334 #pxor @inp[0],@out[0]
335 #pxor @inp[1],@out[1]
336 #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
337 #pxor @inp[2],@out[2]
338 #movdqu @out[1],`40*1+24-40*2`($inp)
339 #pxor @inp[3],@out[3]
340 #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
341 #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
342
343 lea `40*4`($inp),$inp
344 dec $num
345 jnz .Lenc4x_loop_grande
346
347.Lenc4x_done:
348___
349$code.=<<___ if ($win64);
350 movaps -0xd8(%rax),%xmm6
351 movaps -0xc8(%rax),%xmm7
352 movaps -0xb8(%rax),%xmm8
353 movaps -0xa8(%rax),%xmm9
354 movaps -0x98(%rax),%xmm10
355 movaps -0x88(%rax),%xmm11
356 movaps -0x78(%rax),%xmm12
357 #movaps -0x68(%rax),%xmm13
358 #movaps -0x58(%rax),%xmm14
359 #movaps -0x48(%rax),%xmm15
360___
361$code.=<<___;
362 mov -48(%rax),%r15
363.cfi_restore %r15
364 mov -40(%rax),%r14
365.cfi_restore %r14
366 mov -32(%rax),%r13
367.cfi_restore %r13
368 mov -24(%rax),%r12
369.cfi_restore %r12
370 mov -16(%rax),%rbp
371.cfi_restore %rbp
372 mov -8(%rax),%rbx
373.cfi_restore %rbx
374 lea (%rax),%rsp
375.cfi_def_cfa_register %rsp
376.Lenc4x_epilogue:
377 ret
378.cfi_endproc
379.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
380
381.globl aesni_multi_cbc_decrypt
382.type aesni_multi_cbc_decrypt,\@function,3
383.align 32
384aesni_multi_cbc_decrypt:
385.cfi_startproc
386___
387$code.=<<___ if ($avx);
388 cmp \$2,$num
389 jb .Ldec_non_avx
390 mov OPENSSL_ia32cap_P+4(%rip),%ecx
391 test \$`1<<28`,%ecx # AVX bit
392 jnz _avx_cbc_dec_shortcut
393 jmp .Ldec_non_avx
394.align 16
395.Ldec_non_avx:
396___
397$code.=<<___;
398 mov %rsp,%rax
399.cfi_def_cfa_register %rax
400 push %rbx
401.cfi_push %rbx
402 push %rbp
403.cfi_push %rbp
404 push %r12
405.cfi_push %r12
406 push %r13
407.cfi_push %r13
408 push %r14
409.cfi_push %r14
410 push %r15
411.cfi_push %r15
412___
413$code.=<<___ if ($win64);
414 lea -0xa8(%rsp),%rsp
415 movaps %xmm6,(%rsp)
416 movaps %xmm7,0x10(%rsp)
417 movaps %xmm8,0x20(%rsp)
418 movaps %xmm9,0x30(%rsp)
419 movaps %xmm10,0x40(%rsp)
420 movaps %xmm11,0x50(%rsp)
421 movaps %xmm12,0x60(%rsp)
422 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
423 movaps %xmm14,-0x58(%rax)
424 movaps %xmm15,-0x48(%rax)
425___
426$code.=<<___;
427 # stack layout
428 #
429 # +0 output sink
430 # +16 input sink [original %rsp and $num]
431 # +32 counters
432
433 sub \$48,%rsp
434 and \$-64,%rsp
435 mov %rax,16(%rsp) # original %rsp
436.cfi_cfa_expression %rsp+16,deref,+8
437
438.Ldec4x_body:
439 movdqu ($key),$zero # 0-round key
440 lea 0x78($key),$key # size optimization
441 lea 40*2($inp),$inp
442
443.Ldec4x_loop_grande:
444 mov $num,24(%rsp) # original $num
445 xor $num,$num
446___
# Emit the set-up code for each of the four descriptors handled per pass
# (40 bytes each: input pointer at +0, output pointer at +8, block count at
# +16, IV at +24; the -40*2 terms compensate for the bias added to $inp
# just before .Ldec4x_loop_grande).  Unlike the encrypt path, the IV is
# loaded into @inp[$i].  $num ends up holding the largest block count; a
# lane whose count is not positive gets its input pointer replaced with
# %rsp.
447for($i=0;$i<4;$i++) {
448 $code.=<<___;
449 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
450 mov `40*$i+0-40*2`($inp),@inptr[$i]
451 cmp $num,$one
452 mov `40*$i+8-40*2`($inp),@outptr[$i]
453 cmovg $one,$num # find maximum
454 test $one,$one
455 movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
456 mov $one,`32+4*$i`(%rsp) # initialize counters
457 cmovle %rsp,@inptr[$i] # cancel input
458___
459}
460$code.=<<___;
461 test $num,$num
462 jz .Ldec4x_done
463
464 movups 0x10-0x78($key),$rndkey1
465 movups 0x20-0x78($key),$rndkey0
466 mov 0xf0-0x78($key),$rounds
467 movdqu (@inptr[0]),@out[0] # load inputs
468 movdqu (@inptr[1]),@out[1]
469 pxor $zero,@out[0]
470 movdqu (@inptr[2]),@out[2]
471 pxor $zero,@out[1]
472 movdqu (@inptr[3]),@out[3]
473 pxor $zero,@out[2]
474 pxor $zero,@out[3]
475 movdqa 32(%rsp),$counters # load counters
476 xor $offset,$offset
477 jmp .Loop_dec4x
478
479.align 32
480.Loop_dec4x:
481 add \$16,$offset
482 lea 16(%rsp),$sink # sink pointer
483 mov \$1,$one # constant of 1
484 sub $offset,$sink
485
486 aesdec $rndkey1,@out[0]
487 prefetcht0 31(@inptr[0],$offset) # prefetch input
488 prefetcht0 31(@inptr[1],$offset)
489 aesdec $rndkey1,@out[1]
490 prefetcht0 31(@inptr[2],$offset)
491 prefetcht0 31(@inptr[3],$offset)
492 aesdec $rndkey1,@out[2]
493 aesdec $rndkey1,@out[3]
494 movups 0x30-0x78($key),$rndkey1
495___
# Emit four more AES decryption rounds, one per iteration, alternating
# between $rndkey0 (even $i) and $rndkey1 (odd $i).  Round $i is
# interleaved with the bookkeeping for lane $i: its counter at
# 32+4*$i(%rsp) is compared against the constant 1 held in $one, and the
# lane's input and output pointers are conditionally replaced with $sink.
# After its round, the key register is reloaded from offset 0x40+16*$i of
# the key schedule (the -0x78 compensates for the bias applied to $key).
496for($i=0;$i<4;$i++) {
497my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
498$code.=<<___;
499 cmp `32+4*$i`(%rsp),$one
500 aesdec $rndkey,@out[0]
501 aesdec $rndkey,@out[1]
502 aesdec $rndkey,@out[2]
503 cmovge $sink,@inptr[$i] # cancel input
504 cmovg $sink,@outptr[$i] # sink output
505 aesdec $rndkey,@out[3]
506 movups `0x40+16*$i-0x78`($key),$rndkey
507___
508}
509$code.=<<___;
510 movdqa $counters,$mask
511 aesdec $rndkey0,@out[0]
512 prefetcht0 15(@outptr[0],$offset) # prefetch output
513 prefetcht0 15(@outptr[1],$offset)
514 aesdec $rndkey0,@out[1]
515 prefetcht0 15(@outptr[2],$offset)
516 prefetcht0 15(@outptr[3],$offset)
517 aesdec $rndkey0,@out[2]
518 aesdec $rndkey0,@out[3]
519 movups 0x80-0x78($key),$rndkey0
520 pxor $zero,$zero
521
522 aesdec $rndkey1,@out[0]
523 pcmpgtd $zero,$mask
524 movdqu -0x78($key),$zero # reload 0-round key
525 aesdec $rndkey1,@out[1]
526 paddd $mask,$counters # decrement counters
527 movdqa $counters,32(%rsp) # update counters
528 aesdec $rndkey1,@out[2]
529 aesdec $rndkey1,@out[3]
530 movups 0x90-0x78($key),$rndkey1
531
532 cmp \$11,$rounds
533
534 aesdec $rndkey0,@out[0]
535 aesdec $rndkey0,@out[1]
536 aesdec $rndkey0,@out[2]
537 aesdec $rndkey0,@out[3]
538 movups 0xa0-0x78($key),$rndkey0
539
540 jb .Ldec4x_tail
541
542 aesdec $rndkey1,@out[0]
543 aesdec $rndkey1,@out[1]
544 aesdec $rndkey1,@out[2]
545 aesdec $rndkey1,@out[3]
546 movups 0xb0-0x78($key),$rndkey1
547
548 aesdec $rndkey0,@out[0]
549 aesdec $rndkey0,@out[1]
550 aesdec $rndkey0,@out[2]
551 aesdec $rndkey0,@out[3]
552 movups 0xc0-0x78($key),$rndkey0
553
554 je .Ldec4x_tail
555
556 aesdec $rndkey1,@out[0]
557 aesdec $rndkey1,@out[1]
558 aesdec $rndkey1,@out[2]
559 aesdec $rndkey1,@out[3]
560 movups 0xd0-0x78($key),$rndkey1
561
562 aesdec $rndkey0,@out[0]
563 aesdec $rndkey0,@out[1]
564 aesdec $rndkey0,@out[2]
565 aesdec $rndkey0,@out[3]
566 movups 0xe0-0x78($key),$rndkey0
567 jmp .Ldec4x_tail
568
569.align 32
570.Ldec4x_tail:
571 aesdec $rndkey1,@out[0]
572 aesdec $rndkey1,@out[1]
573 aesdec $rndkey1,@out[2]
574 pxor $rndkey0,@inp[0]
575 pxor $rndkey0,@inp[1]
576 aesdec $rndkey1,@out[3]
577 movdqu 0x10-0x78($key),$rndkey1
578 pxor $rndkey0,@inp[2]
579 pxor $rndkey0,@inp[3]
580 movdqu 0x20-0x78($key),$rndkey0
581
582 aesdeclast @inp[0],@out[0]
583 aesdeclast @inp[1],@out[1]
584 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
585 movdqu -16(@inptr[1],$offset),@inp[1]
586 aesdeclast @inp[2],@out[2]
587 aesdeclast @inp[3],@out[3]
588 movdqu -16(@inptr[2],$offset),@inp[2]
589 movdqu -16(@inptr[3],$offset),@inp[3]
590
591 movups @out[0],-16(@outptr[0],$offset)
592 movdqu (@inptr[0],$offset),@out[0]
593 movups @out[1],-16(@outptr[1],$offset)
594 movdqu (@inptr[1],$offset),@out[1]
595 pxor $zero,@out[0]
596 movups @out[2],-16(@outptr[2],$offset)
597 movdqu (@inptr[2],$offset),@out[2]
598 pxor $zero,@out[1]
599 movups @out[3],-16(@outptr[3],$offset)
600 movdqu (@inptr[3],$offset),@out[3]
601 pxor $zero,@out[2]
602 pxor $zero,@out[3]
603
604 dec $num
605 jnz .Loop_dec4x
606
607 mov 16(%rsp),%rax # original %rsp
608.cfi_def_cfa %rax,8
609 mov 24(%rsp),$num
610
611 lea `40*4`($inp),$inp
612 dec $num
613 jnz .Ldec4x_loop_grande
614
615.Ldec4x_done:
616___
617$code.=<<___ if ($win64);
618 movaps -0xd8(%rax),%xmm6
619 movaps -0xc8(%rax),%xmm7
620 movaps -0xb8(%rax),%xmm8
621 movaps -0xa8(%rax),%xmm9
622 movaps -0x98(%rax),%xmm10
623 movaps -0x88(%rax),%xmm11
624 movaps -0x78(%rax),%xmm12
625 #movaps -0x68(%rax),%xmm13
626 #movaps -0x58(%rax),%xmm14
627 #movaps -0x48(%rax),%xmm15
628___
629$code.=<<___;
630 mov -48(%rax),%r15
631.cfi_restore %r15
632 mov -40(%rax),%r14
633.cfi_restore %r14
634 mov -32(%rax),%r13
635.cfi_restore %r13
636 mov -24(%rax),%r12
637.cfi_restore %r12
638 mov -16(%rax),%rbp
639.cfi_restore %rbp
640 mov -8(%rax),%rbx
641.cfi_restore %rbx
642 lea (%rax),%rsp
643.cfi_def_cfa_register %rsp
644.Ldec4x_epilogue:
645 ret
646.cfi_endproc
647.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
648___
649
650 if ($avx) {{{
# Register assignment for the 8x AVX code paths; these lexicals shadow the
# 4x definitions of @out, @inp, $counters and $zero inside this block.
651my @ptr=map("%r$_",(8..15)); # %r8-%r15: one pointer per lane
652my $offload=$sink; # reuse the sink register (%rbp) as scratch / off-load area pointer
653
654my @out=map("%xmm$_",(2..9)); # %xmm2-%xmm9: per-lane cipher state
655my @inp=map("%xmm$_",(10..13)); # %xmm10-%xmm13: four input registers, indexed by lane%4
656my ($counters,$zero)=("%xmm14","%xmm15"); # block counters, 0-round key
657
658$code.=<<___;
659.type aesni_multi_cbc_encrypt_avx,\@function,3
660.align 32
661aesni_multi_cbc_encrypt_avx:
662.cfi_startproc
663_avx_cbc_enc_shortcut:
664 mov %rsp,%rax
665.cfi_def_cfa_register %rax
666 push %rbx
667.cfi_push %rbx
668 push %rbp
669.cfi_push %rbp
670 push %r12
671.cfi_push %r12
672 push %r13
673.cfi_push %r13
674 push %r14
675.cfi_push %r14
676 push %r15
677.cfi_push %r15
678___
679$code.=<<___ if ($win64);
680 lea -0xa8(%rsp),%rsp
681 movaps %xmm6,(%rsp)
682 movaps %xmm7,0x10(%rsp)
683 movaps %xmm8,0x20(%rsp)
684 movaps %xmm9,0x30(%rsp)
685 movaps %xmm10,0x40(%rsp)
686 movaps %xmm11,0x50(%rsp)
687 movaps %xmm12,-0x78(%rax)
688 movaps %xmm13,-0x68(%rax)
689 movaps %xmm14,-0x58(%rax)
690 movaps %xmm15,-0x48(%rax)
691___
692$code.=<<___;
693 # stack layout
694 #
695 # +0 output sink
696 # +16 input sink [original %rsp and $num]
697 # +32 counters
698 # +64 distances between inputs and outputs
699 # +128 off-load area for @inp[0..3]
700
701 sub \$192,%rsp
702 and \$-128,%rsp
703 mov %rax,16(%rsp) # original %rsp
704.cfi_cfa_expression %rsp+16,deref,+8
705
706.Lenc8x_body:
707 vzeroupper
708 vmovdqu ($key),$zero # 0-round key
709 lea 0x78($key),$key # size optimization
710 lea 40*4($inp),$inp
711 shr \$1,$num
712
713.Lenc8x_loop_grande:
714 #mov $num,24(%rsp) # original $num
715 xor $num,$num
716___
# Emit the set-up code for each of the eight 40-byte descriptors (input
# pointer at +0, output pointer at +8, block count at +16, IV at +24; the
# -40*4 terms compensate for the bias added to $inp in .Lenc8x_body).
# Instead of keeping an output pointer per lane, the distance
# (output - input) is stored at 64+8*$i(%rsp).  Lane 0 uses $offset as its
# scratch register, so its distance is also left in $offset; the other
# lanes use $offload.
717for($i=0;$i<8;$i++) {
718 my $temp = $i ? $offload : $offset;
719 $code.=<<___;
720 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
721 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
722 cmp $num,$one
723 mov `40*$i+8-40*4`($inp),$temp # output pointer
724 cmovg $one,$num # find maximum
725 test $one,$one
726 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
727 mov $one,`32+4*$i`(%rsp) # initialize counters
728 cmovle %rsp,@ptr[$i] # cancel input
729 sub @ptr[$i],$temp # distance between input and output
730 mov $temp,`64+8*$i`(%rsp) # initialize distances
731___
732}
733$code.=<<___;
734 test $num,$num
735 jz .Lenc8x_done
736
737 vmovups 0x10-0x78($key),$rndkey1
738 vmovups 0x20-0x78($key),$rndkey0
739 mov 0xf0-0x78($key),$rounds
740
741 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
742 lea 128(%rsp),$offload # offload area
743 vpxor (@ptr[1]),$zero,@inp[1]
744 vpxor (@ptr[2]),$zero,@inp[2]
745 vpxor (@ptr[3]),$zero,@inp[3]
746 vpxor @inp[0],@out[0],@out[0]
747 vpxor (@ptr[4]),$zero,@inp[0]
748 vpxor @inp[1],@out[1],@out[1]
749 vpxor (@ptr[5]),$zero,@inp[1]
750 vpxor @inp[2],@out[2],@out[2]
751 vpxor (@ptr[6]),$zero,@inp[2]
752 vpxor @inp[3],@out[3],@out[3]
753 vpxor (@ptr[7]),$zero,@inp[3]
754 vpxor @inp[0],@out[4],@out[4]
755 mov \$1,$one # constant of 1
756 vpxor @inp[1],@out[5],@out[5]
757 vpxor @inp[2],@out[6],@out[6]
758 vpxor @inp[3],@out[7],@out[7]
759 jmp .Loop_enc8x
760
761.align 32
762.Loop_enc8x:
763___
# Emit eight AES rounds, one per iteration, alternating between $rndkey1
# (even $i) and $rndkey0 (odd $i).  Round $i is interleaved with the
# bookkeeping for lane $i: the counter at 32+4*$i(%rsp) is compared against
# $one, the stored input/output distance is reloaded (lane 0's is already
# in $offset), the pointers are conditionally redirected to %rsp, the next
# input block is loaded and xored with the 0-round key, and @ptr[$i] is
# switched to the output position.  Only four @inp registers exist, so the
# blocks of lanes 0-3 are off-loaded to the area addressed by $offload.
# $i is left at 8 on exit; the heredoc that follows this loop relies on
# that when it indexes @ptr[$i-2] and @ptr[$i-1].
764for($i=0;$i<8;$i++) {
765my $rndkey=($i&1)?$rndkey0:$rndkey1;
766$code.=<<___;
767 vaesenc $rndkey,@out[0],@out[0]
768 cmp 32+4*$i(%rsp),$one
769___
770$code.=<<___ if ($i);
771 mov 64+8*$i(%rsp),$offset
772___
773$code.=<<___;
774 vaesenc $rndkey,@out[1],@out[1]
775 prefetcht0 31(@ptr[$i]) # prefetch input
776 vaesenc $rndkey,@out[2],@out[2]
777___
778$code.=<<___ if ($i>1);
779 prefetcht0 15(@ptr[$i-2]) # prefetch output
780___
781$code.=<<___;
782 vaesenc $rndkey,@out[3],@out[3]
783 lea (@ptr[$i],$offset),$offset
784 cmovge %rsp,@ptr[$i] # cancel input
785 vaesenc $rndkey,@out[4],@out[4]
786 cmovg %rsp,$offset # sink output
787 vaesenc $rndkey,@out[5],@out[5]
788 sub @ptr[$i],$offset
789 vaesenc $rndkey,@out[6],@out[6]
790 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
791 mov $offset,64+8*$i(%rsp)
792 vaesenc $rndkey,@out[7],@out[7]
793 vmovups `16*(3+$i)-0x78`($key),$rndkey
794 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
795___
796$code.=<<___ if ($i<4);
797 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
798___
799}
800$code.=<<___;
801 vmovdqu 32(%rsp),$counters
802 prefetcht0 15(@ptr[$i-2]) # prefetch output
803 prefetcht0 15(@ptr[$i-1])
804 cmp \$11,$rounds
805 jb .Lenc8x_tail
806
807 vaesenc $rndkey1,@out[0],@out[0]
808 vaesenc $rndkey1,@out[1],@out[1]
809 vaesenc $rndkey1,@out[2],@out[2]
810 vaesenc $rndkey1,@out[3],@out[3]
811 vaesenc $rndkey1,@out[4],@out[4]
812 vaesenc $rndkey1,@out[5],@out[5]
813 vaesenc $rndkey1,@out[6],@out[6]
814 vaesenc $rndkey1,@out[7],@out[7]
815 vmovups 0xb0-0x78($key),$rndkey1
816
817 vaesenc $rndkey0,@out[0],@out[0]
818 vaesenc $rndkey0,@out[1],@out[1]
819 vaesenc $rndkey0,@out[2],@out[2]
820 vaesenc $rndkey0,@out[3],@out[3]
821 vaesenc $rndkey0,@out[4],@out[4]
822 vaesenc $rndkey0,@out[5],@out[5]
823 vaesenc $rndkey0,@out[6],@out[6]
824 vaesenc $rndkey0,@out[7],@out[7]
825 vmovups 0xc0-0x78($key),$rndkey0
826 je .Lenc8x_tail
827
828 vaesenc $rndkey1,@out[0],@out[0]
829 vaesenc $rndkey1,@out[1],@out[1]
830 vaesenc $rndkey1,@out[2],@out[2]
831 vaesenc $rndkey1,@out[3],@out[3]
832 vaesenc $rndkey1,@out[4],@out[4]
833 vaesenc $rndkey1,@out[5],@out[5]
834 vaesenc $rndkey1,@out[6],@out[6]
835 vaesenc $rndkey1,@out[7],@out[7]
836 vmovups 0xd0-0x78($key),$rndkey1
837
838 vaesenc $rndkey0,@out[0],@out[0]
839 vaesenc $rndkey0,@out[1],@out[1]
840 vaesenc $rndkey0,@out[2],@out[2]
841 vaesenc $rndkey0,@out[3],@out[3]
842 vaesenc $rndkey0,@out[4],@out[4]
843 vaesenc $rndkey0,@out[5],@out[5]
844 vaesenc $rndkey0,@out[6],@out[6]
845 vaesenc $rndkey0,@out[7],@out[7]
846 vmovups 0xe0-0x78($key),$rndkey0
847
848.Lenc8x_tail:
849 vaesenc $rndkey1,@out[0],@out[0]
850 vpxor $zero,$zero,$zero
851 vaesenc $rndkey1,@out[1],@out[1]
852 vaesenc $rndkey1,@out[2],@out[2]
853 vpcmpgtd $zero,$counters,$zero
854 vaesenc $rndkey1,@out[3],@out[3]
855 vaesenc $rndkey1,@out[4],@out[4]
856 vpaddd $counters,$zero,$zero # decrement counters
857 vmovdqu 48(%rsp),$counters
858 vaesenc $rndkey1,@out[5],@out[5]
859 mov 64(%rsp),$offset # pre-load 1st offset
860 vaesenc $rndkey1,@out[6],@out[6]
861 vaesenc $rndkey1,@out[7],@out[7]
862 vmovups 0x10-0x78($key),$rndkey1
863
864 vaesenclast $rndkey0,@out[0],@out[0]
865 vmovdqa $zero,32(%rsp) # update counters
866 vpxor $zero,$zero,$zero
867 vaesenclast $rndkey0,@out[1],@out[1]
868 vaesenclast $rndkey0,@out[2],@out[2]
869 vpcmpgtd $zero,$counters,$zero
870 vaesenclast $rndkey0,@out[3],@out[3]
871 vaesenclast $rndkey0,@out[4],@out[4]
872 vpaddd $zero,$counters,$counters # decrement counters
873 vmovdqu -0x78($key),$zero # 0-round
874 vaesenclast $rndkey0,@out[5],@out[5]
875 vaesenclast $rndkey0,@out[6],@out[6]
876 vmovdqa $counters,48(%rsp) # update counters
877 vaesenclast $rndkey0,@out[7],@out[7]
878 vmovups 0x20-0x78($key),$rndkey0
879
880 vmovups @out[0],-16(@ptr[0]) # write output
881 sub $offset,@ptr[0] # switch to input
882 vpxor 0x00($offload),@out[0],@out[0]
883 vmovups @out[1],-16(@ptr[1])
884 sub `64+1*8`(%rsp),@ptr[1]
885 vpxor 0x10($offload),@out[1],@out[1]
886 vmovups @out[2],-16(@ptr[2])
887 sub `64+2*8`(%rsp),@ptr[2]
888 vpxor 0x20($offload),@out[2],@out[2]
889 vmovups @out[3],-16(@ptr[3])
890 sub `64+3*8`(%rsp),@ptr[3]
891 vpxor 0x30($offload),@out[3],@out[3]
892 vmovups @out[4],-16(@ptr[4])
893 sub `64+4*8`(%rsp),@ptr[4]
894 vpxor @inp[0],@out[4],@out[4]
895 vmovups @out[5],-16(@ptr[5])
896 sub `64+5*8`(%rsp),@ptr[5]
897 vpxor @inp[1],@out[5],@out[5]
898 vmovups @out[6],-16(@ptr[6])
899 sub `64+6*8`(%rsp),@ptr[6]
900 vpxor @inp[2],@out[6],@out[6]
901 vmovups @out[7],-16(@ptr[7])
902 sub `64+7*8`(%rsp),@ptr[7]
903 vpxor @inp[3],@out[7],@out[7]
904
905 dec $num
906 jnz .Loop_enc8x
907
908 mov 16(%rsp),%rax # original %rsp
909.cfi_def_cfa %rax,8
910 #mov 24(%rsp),$num
911 #lea `40*8`($inp),$inp
912 #dec $num
913 #jnz .Lenc8x_loop_grande
914
915.Lenc8x_done:
916 vzeroupper
917___
918$code.=<<___ if ($win64);
919 movaps -0xd8(%rax),%xmm6
920 movaps -0xc8(%rax),%xmm7
921 movaps -0xb8(%rax),%xmm8
922 movaps -0xa8(%rax),%xmm9
923 movaps -0x98(%rax),%xmm10
924 movaps -0x88(%rax),%xmm11
925 movaps -0x78(%rax),%xmm12
926 movaps -0x68(%rax),%xmm13
927 movaps -0x58(%rax),%xmm14
928 movaps -0x48(%rax),%xmm15
929___
930$code.=<<___;
931 mov -48(%rax),%r15
932.cfi_restore %r15
933 mov -40(%rax),%r14
934.cfi_restore %r14
935 mov -32(%rax),%r13
936.cfi_restore %r13
937 mov -24(%rax),%r12
938.cfi_restore %r12
939 mov -16(%rax),%rbp
940.cfi_restore %rbp
941 mov -8(%rax),%rbx
942.cfi_restore %rbx
943 lea (%rax),%rsp
944.cfi_def_cfa_register %rsp
945.Lenc8x_epilogue:
946 ret
947.cfi_endproc
948.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
949
950.type aesni_multi_cbc_decrypt_avx,\@function,3
951.align 32
952aesni_multi_cbc_decrypt_avx:
953.cfi_startproc
954_avx_cbc_dec_shortcut:
955 mov %rsp,%rax
956.cfi_def_cfa_register %rax
957 push %rbx
958.cfi_push %rbx
959 push %rbp
960.cfi_push %rbp
961 push %r12
962.cfi_push %r12
963 push %r13
964.cfi_push %r13
965 push %r14
966.cfi_push %r14
967 push %r15
968.cfi_push %r15
969___
970$code.=<<___ if ($win64);
971 lea -0xa8(%rsp),%rsp
972 movaps %xmm6,(%rsp)
973 movaps %xmm7,0x10(%rsp)
974 movaps %xmm8,0x20(%rsp)
975 movaps %xmm9,0x30(%rsp)
976 movaps %xmm10,0x40(%rsp)
977 movaps %xmm11,0x50(%rsp)
978 movaps %xmm12,-0x78(%rax)
979 movaps %xmm13,-0x68(%rax)
980 movaps %xmm14,-0x58(%rax)
981 movaps %xmm15,-0x48(%rax)
982___
983$code.=<<___;
984 # stack layout
985 #
986 # +0 output sink
987 # +16 input sink [original %rsp and $num]
988 # +32 counters
989 # +64 distances between inputs and outputs
990 # +128 off-load area for @inp[0..3]
991 # +192 IV/input offload
992
993 sub \$256,%rsp
994 and \$-256,%rsp
995 sub \$192,%rsp
996 mov %rax,16(%rsp) # original %rsp
997.cfi_cfa_expression %rsp+16,deref,+8
998
999.Ldec8x_body:
1000 vzeroupper
1001 vmovdqu ($key),$zero # 0-round key
1002 lea 0x78($key),$key # size optimization
1003 lea 40*4($inp),$inp
1004 shr \$1,$num
1005
1006.Ldec8x_loop_grande:
1007 #mov $num,24(%rsp) # original $num
1008 xor $num,$num
1009___
# Emit the set-up code for each of the eight 40-byte descriptors (input
# pointer at +0, output pointer at +8, block count at +16, IV at +24; the
# -40*4 terms compensate for the bias added to $inp in .Ldec8x_body).
# The distance (output - input) of each lane is stored at 64+8*$i(%rsp);
# lane 0 uses $offset as scratch, the other lanes use $offload.  In
# addition to the encrypt-side set-up, each IV is saved at
# 192+16*$i(%rsp).
1010for($i=0;$i<8;$i++) {
1011 my $temp = $i ? $offload : $offset;
1012 $code.=<<___;
1013 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
1014 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
1015 cmp $num,$one
1016 mov `40*$i+8-40*4`($inp),$temp # output pointer
1017 cmovg $one,$num # find maximum
1018 test $one,$one
1019 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
1020 mov $one,`32+4*$i`(%rsp) # initialize counters
1021 cmovle %rsp,@ptr[$i] # cancel input
1022 sub @ptr[$i],$temp # distance between input and output
1023 mov $temp,`64+8*$i`(%rsp) # initialize distances
1024 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
1025___
1026}
1027$code.=<<___;
1028 test $num,$num
1029 jz .Ldec8x_done
1030
1031 vmovups 0x10-0x78($key),$rndkey1
1032 vmovups 0x20-0x78($key),$rndkey0
1033 mov 0xf0-0x78($key),$rounds
1034 lea 192+128(%rsp),$offload # offload area
1035
1036 vmovdqu (@ptr[0]),@out[0] # load inputs
1037 vmovdqu (@ptr[1]),@out[1]
1038 vmovdqu (@ptr[2]),@out[2]
1039 vmovdqu (@ptr[3]),@out[3]
1040 vmovdqu (@ptr[4]),@out[4]
1041 vmovdqu (@ptr[5]),@out[5]
1042 vmovdqu (@ptr[6]),@out[6]
1043 vmovdqu (@ptr[7]),@out[7]
1044 vmovdqu @out[0],0x00($offload) # offload inputs
1045 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
1046 vmovdqu @out[1],0x10($offload)
1047 vpxor $zero,@out[1],@out[1]
1048 vmovdqu @out[2],0x20($offload)
1049 vpxor $zero,@out[2],@out[2]
1050 vmovdqu @out[3],0x30($offload)
1051 vpxor $zero,@out[3],@out[3]
1052 vmovdqu @out[4],0x40($offload)
1053 vpxor $zero,@out[4],@out[4]
1054 vmovdqu @out[5],0x50($offload)
1055 vpxor $zero,@out[5],@out[5]
1056 vmovdqu @out[6],0x60($offload)
1057 vpxor $zero,@out[6],@out[6]
1058 vmovdqu @out[7],0x70($offload)
1059 vpxor $zero,@out[7],@out[7]
1060 xor \$0x80,$offload
1061 mov \$1,$one # constant of 1
1062 jmp .Loop_dec8x
1063
1064.align 32
1065.Loop_dec8x:
1066___
# Emit eight AES decryption rounds, one per iteration, alternating between
# $rndkey1 (even $i) and $rndkey0 (odd $i).  Round $i is interleaved with
# the bookkeeping for lane $i: the counter at 32+4*$i(%rsp) is compared
# against $one, the stored input/output distance is reloaded (lane 0's is
# already in $offset), the pointers are conditionally redirected to %rsp,
# the next input block is loaded, and @ptr[$i] is switched to the output
# position.  Only four @inp registers exist, so the blocks of lanes 0-3 are
# off-loaded to 128+16*$i(%rsp).
# $i is left at 8 on exit; the heredoc that follows this loop relies on
# that when it indexes @ptr[$i-2] and @ptr[$i-1].
1067for($i=0;$i<8;$i++) {
1068my $rndkey=($i&1)?$rndkey0:$rndkey1;
1069$code.=<<___;
1070 vaesdec $rndkey,@out[0],@out[0]
1071 cmp 32+4*$i(%rsp),$one
1072___
1073$code.=<<___ if ($i);
1074 mov 64+8*$i(%rsp),$offset
1075___
1076$code.=<<___;
1077 vaesdec $rndkey,@out[1],@out[1]
1078 prefetcht0 31(@ptr[$i]) # prefetch input
1079 vaesdec $rndkey,@out[2],@out[2]
1080___
1081$code.=<<___ if ($i>1);
1082 prefetcht0 15(@ptr[$i-2]) # prefetch output
1083___
1084$code.=<<___;
1085 vaesdec $rndkey,@out[3],@out[3]
1086 lea (@ptr[$i],$offset),$offset
1087 cmovge %rsp,@ptr[$i] # cancel input
1088 vaesdec $rndkey,@out[4],@out[4]
1089 cmovg %rsp,$offset # sink output
1090 vaesdec $rndkey,@out[5],@out[5]
1091 sub @ptr[$i],$offset
1092 vaesdec $rndkey,@out[6],@out[6]
1093 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
1094 mov $offset,64+8*$i(%rsp)
1095 vaesdec $rndkey,@out[7],@out[7]
1096 vmovups `16*(3+$i)-0x78`($key),$rndkey
1097 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
1098___
1099$code.=<<___ if ($i<4);
1100 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
1101___
1102}
1103$code.=<<___;
1104 vmovdqu 32(%rsp),$counters
1105 prefetcht0 15(@ptr[$i-2]) # prefetch output
1106 prefetcht0 15(@ptr[$i-1])
1107 cmp \$11,$rounds
1108 jb .Ldec8x_tail
1109
1110 vaesdec $rndkey1,@out[0],@out[0]
1111 vaesdec $rndkey1,@out[1],@out[1]
1112 vaesdec $rndkey1,@out[2],@out[2]
1113 vaesdec $rndkey1,@out[3],@out[3]
1114 vaesdec $rndkey1,@out[4],@out[4]
1115 vaesdec $rndkey1,@out[5],@out[5]
1116 vaesdec $rndkey1,@out[6],@out[6]
1117 vaesdec $rndkey1,@out[7],@out[7]
1118 vmovups 0xb0-0x78($key),$rndkey1
1119
1120 vaesdec $rndkey0,@out[0],@out[0]
1121 vaesdec $rndkey0,@out[1],@out[1]
1122 vaesdec $rndkey0,@out[2],@out[2]
1123 vaesdec $rndkey0,@out[3],@out[3]
1124 vaesdec $rndkey0,@out[4],@out[4]
1125 vaesdec $rndkey0,@out[5],@out[5]
1126 vaesdec $rndkey0,@out[6],@out[6]
1127 vaesdec $rndkey0,@out[7],@out[7]
1128 vmovups 0xc0-0x78($key),$rndkey0
1129 je .Ldec8x_tail
1130
1131 vaesdec $rndkey1,@out[0],@out[0]
1132 vaesdec $rndkey1,@out[1],@out[1]
1133 vaesdec $rndkey1,@out[2],@out[2]
1134 vaesdec $rndkey1,@out[3],@out[3]
1135 vaesdec $rndkey1,@out[4],@out[4]
1136 vaesdec $rndkey1,@out[5],@out[5]
1137 vaesdec $rndkey1,@out[6],@out[6]
1138 vaesdec $rndkey1,@out[7],@out[7]
1139 vmovups 0xd0-0x78($key),$rndkey1
1140
1141 vaesdec $rndkey0,@out[0],@out[0]
1142 vaesdec $rndkey0,@out[1],@out[1]
1143 vaesdec $rndkey0,@out[2],@out[2]
1144 vaesdec $rndkey0,@out[3],@out[3]
1145 vaesdec $rndkey0,@out[4],@out[4]
1146 vaesdec $rndkey0,@out[5],@out[5]
1147 vaesdec $rndkey0,@out[6],@out[6]
1148 vaesdec $rndkey0,@out[7],@out[7]
1149 vmovups 0xe0-0x78($key),$rndkey0
1150
1151.Ldec8x_tail:
1152 vaesdec $rndkey1,@out[0],@out[0]
1153 vpxor $zero,$zero,$zero
1154 vaesdec $rndkey1,@out[1],@out[1]
1155 vaesdec $rndkey1,@out[2],@out[2]
1156 vpcmpgtd $zero,$counters,$zero
1157 vaesdec $rndkey1,@out[3],@out[3]
1158 vaesdec $rndkey1,@out[4],@out[4]
1159 vpaddd $counters,$zero,$zero # decrement counters
1160 vmovdqu 48(%rsp),$counters
1161 vaesdec $rndkey1,@out[5],@out[5]
1162 mov 64(%rsp),$offset # pre-load 1st offset
1163 vaesdec $rndkey1,@out[6],@out[6]
1164 vaesdec $rndkey1,@out[7],@out[7]
1165 vmovups 0x10-0x78($key),$rndkey1
1166
1167 vaesdeclast $rndkey0,@out[0],@out[0]
1168 vmovdqa $zero,32(%rsp) # update counters
1169 vpxor $zero,$zero,$zero
1170 vaesdeclast $rndkey0,@out[1],@out[1]
1171 vpxor 0x00($offload),@out[0],@out[0] # xor with IV
1172 vaesdeclast $rndkey0,@out[2],@out[2]
1173 vpxor 0x10($offload),@out[1],@out[1]
1174 vpcmpgtd $zero,$counters,$zero
1175 vaesdeclast $rndkey0,@out[3],@out[3]
1176 vpxor 0x20($offload),@out[2],@out[2]
1177 vaesdeclast $rndkey0,@out[4],@out[4]
1178 vpxor 0x30($offload),@out[3],@out[3]
1179 vpaddd $zero,$counters,$counters # decrement counters
1180 vmovdqu -0x78($key),$zero # 0-round
1181 vaesdeclast $rndkey0,@out[5],@out[5]
1182 vpxor 0x40($offload),@out[4],@out[4]
1183 vaesdeclast $rndkey0,@out[6],@out[6]
1184 vpxor 0x50($offload),@out[5],@out[5]
1185 vmovdqa $counters,48(%rsp) # update counters
1186 vaesdeclast $rndkey0,@out[7],@out[7]
1187 vpxor 0x60($offload),@out[6],@out[6]
1188 vmovups 0x20-0x78($key),$rndkey0
1189
1190 vmovups @out[0],-16(@ptr[0]) # write output
1191 sub $offset,@ptr[0] # switch to input
1192 vmovdqu 128+0(%rsp),@out[0]
1193 vpxor 0x70($offload),@out[7],@out[7]
1194 vmovups @out[1],-16(@ptr[1])
1195 sub `64+1*8`(%rsp),@ptr[1]
1196 vmovdqu @out[0],0x00($offload)
1197 vpxor $zero,@out[0],@out[0]
1198 vmovdqu 128+16(%rsp),@out[1]
1199 vmovups @out[2],-16(@ptr[2])
1200 sub `64+2*8`(%rsp),@ptr[2]
1201 vmovdqu @out[1],0x10($offload)
1202 vpxor $zero,@out[1],@out[1]
1203 vmovdqu 128+32(%rsp),@out[2]
1204 vmovups @out[3],-16(@ptr[3])
1205 sub `64+3*8`(%rsp),@ptr[3]
1206 vmovdqu @out[2],0x20($offload)
1207 vpxor $zero,@out[2],@out[2]
1208 vmovdqu 128+48(%rsp),@out[3]
1209 vmovups @out[4],-16(@ptr[4])
1210 sub `64+4*8`(%rsp),@ptr[4]
1211 vmovdqu @out[3],0x30($offload)
1212 vpxor $zero,@out[3],@out[3]
1213 vmovdqu @inp[0],0x40($offload)
1214 vpxor @inp[0],$zero,@out[4]
1215 vmovups @out[5],-16(@ptr[5])
1216 sub `64+5*8`(%rsp),@ptr[5]
1217 vmovdqu @inp[1],0x50($offload)
1218 vpxor @inp[1],$zero,@out[5]
1219 vmovups @out[6],-16(@ptr[6])
1220 sub `64+6*8`(%rsp),@ptr[6]
1221 vmovdqu @inp[2],0x60($offload)
1222 vpxor @inp[2],$zero,@out[6]
1223 vmovups @out[7],-16(@ptr[7])
1224 sub `64+7*8`(%rsp),@ptr[7]
1225 vmovdqu @inp[3],0x70($offload)
1226 vpxor @inp[3],$zero,@out[7]
1227
1228 xor \$128,$offload
1229 dec $num
1230 jnz .Loop_dec8x
1231
1232 mov 16(%rsp),%rax # original %rsp
1233.cfi_def_cfa %rax,8
1234 #mov 24(%rsp),$num
1235 #lea `40*8`($inp),$inp
1236 #dec $num
1237 #jnz .Ldec8x_loop_grande
1238
1239.Ldec8x_done:
1240 vzeroupper
1241___
1242$code.=<<___ if ($win64);
1243 movaps -0xd8(%rax),%xmm6
1244 movaps -0xc8(%rax),%xmm7
1245 movaps -0xb8(%rax),%xmm8
1246 movaps -0xa8(%rax),%xmm9
1247 movaps -0x98(%rax),%xmm10
1248 movaps -0x88(%rax),%xmm11
1249 movaps -0x78(%rax),%xmm12
1250 movaps -0x68(%rax),%xmm13
1251 movaps -0x58(%rax),%xmm14
1252 movaps -0x48(%rax),%xmm15
1253___
1254$code.=<<___;
1255 mov -48(%rax),%r15
1256.cfi_restore %r15
1257 mov -40(%rax),%r14
1258.cfi_restore %r14
1259 mov -32(%rax),%r13
1260.cfi_restore %r13
1261 mov -24(%rax),%r12
1262.cfi_restore %r12
1263 mov -16(%rax),%rbp
1264.cfi_restore %rbp
1265 mov -8(%rax),%rbx
1266.cfi_restore %rbx
1267 lea (%rax),%rsp
1268.cfi_def_cfa_register %rsp
1269.Ldec8x_epilogue:
1270 ret
1271.cfi_endproc
1272.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1273___
1274 }}}
1275
1276if ($win64) {
# Win64-only section: appends a shared exception handler (se_handler) plus
# the .pdata/.xdata records that attach it to each generated function.
# The four variables below name the registers carrying the handler's
# arguments, in the order given by the prototype that follows.
1277# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1278# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1279$rec="%rcx";
1280$frame="%rdx";
1281$context="%r8";
1282$disp="%r9";
1283
# se_handler body.  It reads context->Rip and compares it against the two
# RVAs stored in HandlerData[] (rebased on disp->ImageBase):
#   * Rip below HandlerData[0]      -> skip recovery, keeping context->Rax
#                                      as the frame pointer;
#   * Rip at/after HandlerData[1]   -> skip recovery, using context->Rsp;
#   * otherwise                     -> load the saved stack pointer from
#                                      16(context->Rsp), write rbx/rbp/r12-r15
#                                      back into the CONTEXT and copy 20
#                                      quadwords to context.Xmm6.
# The common tail (.Lin_prologue) restores context->Rsp/Rsi/Rdi, copies the
# CONTEXT (154 quadwords) into disp->ContextRecord, calls RtlVirtualUnwind
# and returns 1 (ExceptionContinueSearch).
# Everything between the heredoc markers is emitted verbatim as assembly,
# apart from the interpolated $context/$disp register names.
1284$code.=<<___;
1285.extern __imp_RtlVirtualUnwind
1286.type se_handler,\@abi-omnipotent
1287.align 16
1288se_handler:
1289 push %rsi
1290 push %rdi
1291 push %rbx
1292 push %rbp
1293 push %r12
1294 push %r13
1295 push %r14
1296 push %r15
1297 pushfq
1298 sub \$64,%rsp
1299
1300 mov 120($context),%rax # pull context->Rax
1301 mov 248($context),%rbx # pull context->Rip
1302
1303 mov 8($disp),%rsi # disp->ImageBase
1304 mov 56($disp),%r11 # disp->HandlerData
1305
1306 mov 0(%r11),%r10d # HandlerData[0]
1307 lea (%rsi,%r10),%r10 # prologue label
1308 cmp %r10,%rbx # context->Rip<.Lprologue
1309 jb .Lin_prologue
1310
1311 mov 152($context),%rax # pull context->Rsp
1312
1313 mov 4(%r11),%r10d # HandlerData[1]
1314 lea (%rsi,%r10),%r10 # epilogue label
1315 cmp %r10,%rbx # context->Rip>=.Lepilogue
1316 jae .Lin_prologue
1317
1318 mov 16(%rax),%rax # pull saved stack pointer
1319
1320 mov -8(%rax),%rbx
1321 mov -16(%rax),%rbp
1322 mov -24(%rax),%r12
1323 mov -32(%rax),%r13
1324 mov -40(%rax),%r14
1325 mov -48(%rax),%r15
1326 mov %rbx,144($context) # restore context->Rbx
1327 mov %rbp,160($context) # restore context->Rbp
1328 mov %r12,216($context) # restore context->R12
1329 mov %r13,224($context) # restore context->R13
1330 mov %r14,232($context) # restore context->R14
1331 mov %r15,240($context) # restore context->R15
1332
1333 lea -56-10*16(%rax),%rsi
1334 lea 512($context),%rdi # &context.Xmm6
1335 mov \$20,%ecx
1336 .long 0xa548f3fc # cld; rep movsq
1337
1338.Lin_prologue:
1339 mov 8(%rax),%rdi
1340 mov 16(%rax),%rsi
1341 mov %rax,152($context) # restore context->Rsp
1342 mov %rsi,168($context) # restore context->Rsi
1343 mov %rdi,176($context) # restore context->Rdi
1344
1345 mov 40($disp),%rdi # disp->ContextRecord
1346 mov $context,%rsi # context
1347 mov \$154,%ecx # sizeof(CONTEXT)
1348 .long 0xa548f3fc # cld; rep movsq
1349
1350 mov $disp,%rsi
1351 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1352 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1353 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1354 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1355 mov 40(%rsi),%r10 # disp->ContextRecord
1356 lea 56(%rsi),%r11 # &disp->HandlerData
1357 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1358 mov %r10,32(%rsp) # arg5
1359 mov %r11,40(%rsp) # arg6
1360 mov %r12,48(%rsp) # arg7
1361 mov %rcx,56(%rsp) # arg8, (NULL)
1362 call *__imp_RtlVirtualUnwind(%rip)
1363
1364 mov \$1,%eax # ExceptionContinueSearch
1365 add \$64,%rsp
1366 popfq
1367 pop %r15
1368 pop %r14
1369 pop %r13
1370 pop %r12
1371 pop %rbp
1372 pop %rbx
1373 pop %rdi
1374 pop %rsi
1375 ret
1376.size se_handler,.-se_handler
1377
1378.section .pdata
1379.align 4
1380 .rva .LSEH_begin_aesni_multi_cbc_encrypt
1381 .rva .LSEH_end_aesni_multi_cbc_encrypt
1382 .rva .LSEH_info_aesni_multi_cbc_encrypt
1383 .rva .LSEH_begin_aesni_multi_cbc_decrypt
1384 .rva .LSEH_end_aesni_multi_cbc_decrypt
1385 .rva .LSEH_info_aesni_multi_cbc_decrypt
1386___
# .pdata entries (begin/end/info RVA triples) for the 8x AVX entry points;
# appended only when $avx is true.
1387$code.=<<___ if ($avx);
1388 .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
1389 .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
1390 .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
1391 .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
1392 .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
1393 .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
1394___
# .xdata info records for the 4x functions.  Each record names se_handler
# and supplies the two HandlerData[] RVAs (body label, epilogue label) that
# the handler compares context->Rip against.
1395$code.=<<___;
1396.section .xdata
1397.align 8
1398.LSEH_info_aesni_multi_cbc_encrypt:
1399 .byte 9,0,0,0
1400 .rva se_handler
1401 .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
1402.LSEH_info_aesni_multi_cbc_decrypt:
1403 .byte 9,0,0,0
1404 .rva se_handler
1405 .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
1406___
# Matching .xdata info records for the 8x AVX functions; appended only when
# $avx is true, mirroring the conditional .pdata entries above.
1407$code.=<<___ if ($avx);
1408.LSEH_info_aesni_multi_cbc_encrypt_avx:
1409 .byte 9,0,0,0
1410 .rva se_handler
1411 .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
1412.LSEH_info_aesni_multi_cbc_decrypt_avx:
1413 .byte 9,0,0,0
1414 .rva se_handler
1415 .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
1416___
1417}
1418####################################################################
1419
# rex(\@opcode, $dst, $src)
#
# Appends a REX prefix byte to the caller's opcode array when either
# register number is 8 or above: bit 0x04 is set for $dst >= 8, bit 0x01
# for $src >= 8, and the combined value is OR-ed with 0x40 before being
# pushed.  When both register numbers are below 8 the array is untouched.
#
# Returns the result of the push (new element count) when a prefix was
# added, otherwise 0.
sub rex {
    my ($bytes, $dst, $src) = @_;
    my $ext = 0;

    if ($dst >= 8) { $ext |= 0x04; }
    if ($src >= 8) { $ext |= 0x01; }

    # No extended register involved: leave the opcode bytes as they are.
    return $ext unless $ext;

    return push @{$bytes}, $ext | 0x40;
}
1429
# aesni($line)
#
# Hand-assembles a single AES-NI instruction, given in AT&T syntax, into a
# ".byte" directive so the output can be fed to assemblers that do not know
# the mnemonics.  Three operand shapes are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} DISP(%rsp),%xmmD
#
# Returns the ".byte\t..." string for a recognised instruction.  Any other
# input -- including a line that has a recognised operand shape but an
# unknown mnemonic -- is returned unchanged.  (Returning undef there, as was
# done previously, made the caller's s///e substitution replace the
# instruction with an empty string, silently dropping it from the output.)
sub aesni {
    my $line   = shift;
    my @opcode = (0x66);    # every encoding emitted here starts with 0x66

    # Immediate + register-to-register form.
    if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($imm, $src, $dst) = ($2, $3, $4);
        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # A leading 0 means hex/octal notation, which oct() decodes.
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }

    # Register-to-register form.
    if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );
        # Unknown mnemonic: pass the text through untouched.
        return $line if (!defined($opcodelet{$mnemonic}));
        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }

    # Memory-source form, restricted to DISP(%rsp) addressing.
    if ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );
        # Unknown mnemonic: pass the text through untouched.
        return $line if (!defined($opcodelet{$mnemonic}));
        push @opcode, 0x44 if ($dst >= 8);                      # REX for xmm8-15
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0x44 | (($dst & 7) << 3), 0x24;           # ModR/M + SIB
        # An omitted displacement is encoded as 0.  The value is masked to
        # one byte, so displacements that do not fit are truncated, not
        # rejected.
        my $disp = $off eq '' ? 0 : ($off =~ /^0/ ? oct($off) : $off);
        push @opcode, $disp & 0xff;
        return ".byte\t" . join(',', @opcode);
    }

    return $line;
}
1469
# Post-process the accumulated assembly text and write it out.
#
# Pass 1: every backtick-quoted span in $code is evaluated as a Perl
# expression and replaced by its value (used for computed offsets such as
# `16*(3+$i)-0x78`).
1470$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Pass 2: each line containing a word-initial "aes...%xmmN" is replaced by
# the result of aesni().  Only the text up to the last %xmmN on the line is
# captured and passed in; anything after it is discarded.  The \b anchor
# keeps mnemonics such as "vaesdec" from matching, because there is no word
# boundary between "v" and "aes".
1471$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
1472
# Emit the result; the close is checked so a failed write is reported.
1473print $code;
1474close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette