aesv8-armx.pl@ 69881

Last change on this file since 69881 was 69881, checked in by vboxsync, 7 years ago
Update OpenSSL to 1.1.0g. bugref:8070: src/libs maintenance
Property svn:eol-style set to `LF` Property svn:executable set to ``*
File size: 21.6 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the OpenSSL license (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9	#
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	# ====================================================================
16	#
17	# This module implements support for ARMv8 AES instructions. The
18	# module is endian-agnostic in sense that it supports both big- and
19	# little-endian cases. As does it support both 32- and 64-bit modes
20	# of operation. Latter is achieved by limiting amount of utilized
21	# registers to 16, which implies additional NEON load and integer
22	# instructions. This has no effect on mighty Apple A7, where results
23	# are literally equal to the theoretical estimates based on AES
24	# instruction latencies and issue rates. On Cortex-A53, an in-order
25	# execution core, this costs up to 10-15%, which is partially
26	# compensated by implementing dedicated code path for 128-bit
27	# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28	# seems to be limited by sheer amount of NEON instructions...
29	#
30	# Performance in cycles per byte processed with 128-bit key:
31	#
32	# CBC enc CBC dec CTR
33	# Apple A7 2.39 1.20 1.20
34	# Cortex-A53 1.32 1.29 1.46
35	# Cortex-A57(*) 1.95 0.85 0.93
36	# Denver 1.96 0.86 0.80
37	# Mongoose 1.33 1.20 1.20
38	#
39	# (*) original 3.64/1.34/1.32 results were for r0p0 revision
40	# and are still same even for updated module;
41
# First two command-line arguments; both are passed on to arm-xlate.pl.
$flavour = shift;
$output = shift;

# Locate the arm-xlate.pl translator: first in the directory this script
# was started from, then under ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything printed on STDOUT through the translator, run with the
# same perl binary ($^X).  Fail loudly if the pipe cannot be started.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;	# alias the STDOUT glob to the pipe handle

# Prefix of every symbol emitted by this module.
$prefix="aes_v8";
54
55	$code=<<___;
56	#include "arm_arch.h"
57
58	#if __ARM_MAX_ARCH__>=7
59	.text
60	___
61	$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
62	$code.=<<___ if ($flavour !~ /64/);
63	.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
64	.fpu neon
65	.code 32
66	#undef __thumb2__
67	___
68
69	# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
70	# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
71	# maintain both 32- and 64-bit codes within single module and
72	# transliterate common code to either flavour with regex voodoo.
73	#
74	{{{
75	my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
76	my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
77	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
78
79
80	$code.=<<___;
81	.align 5
82	.Lrcon:
83	.long 0x01,0x01,0x01,0x01
84	.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
85	.long 0x1b,0x1b,0x1b,0x1b
86
87	.globl ${prefix}_set_encrypt_key
88	.type ${prefix}_set_encrypt_key,%function
89	.align 5
90	${prefix}_set_encrypt_key:
91	.Lenc_key:
92	___
93	$code.=<<___ if ($flavour =~ /64/);
94	stp x29,x30,[sp,#-16]!
95	add x29,sp,#0
96	___
97	$code.=<<___;
98	mov $ptr,#-1
99	cmp $inp,#0
100	b.eq .Lenc_key_abort
101	cmp $out,#0
102	b.eq .Lenc_key_abort
103	mov $ptr,#-2
104	cmp $bits,#128
105	b.lt .Lenc_key_abort
106	cmp $bits,#256
107	b.gt .Lenc_key_abort
108	tst $bits,#0x3f
109	b.ne .Lenc_key_abort
110
111	adr $ptr,.Lrcon
112	cmp $bits,#192
113
114	veor $zero,$zero,$zero
115	vld1.8 {$in0},[$inp],#16
116	mov $bits,#8 // reuse $bits
117	vld1.32 {$rcon,$mask},[$ptr],#32
118
119	b.lt .Loop128
120	b.eq .L192
121	b .L256
122
123	.align 4
124	.Loop128:
125	vtbl.8 $key,{$in0},$mask
126	vext.8 $tmp,$zero,$in0,#12
127	vst1.32 {$in0},[$out],#16
128	aese $key,$zero
129	subs $bits,$bits,#1
130
131	veor $in0,$in0,$tmp
132	vext.8 $tmp,$zero,$tmp,#12
133	veor $in0,$in0,$tmp
134	vext.8 $tmp,$zero,$tmp,#12
135	veor $key,$key,$rcon
136	veor $in0,$in0,$tmp
137	vshl.u8 $rcon,$rcon,#1
138	veor $in0,$in0,$key
139	b.ne .Loop128
140
141	vld1.32 {$rcon},[$ptr]
142
143	vtbl.8 $key,{$in0},$mask
144	vext.8 $tmp,$zero,$in0,#12
145	vst1.32 {$in0},[$out],#16
146	aese $key,$zero
147
148	veor $in0,$in0,$tmp
149	vext.8 $tmp,$zero,$tmp,#12
150	veor $in0,$in0,$tmp
151	vext.8 $tmp,$zero,$tmp,#12
152	veor $key,$key,$rcon
153	veor $in0,$in0,$tmp
154	vshl.u8 $rcon,$rcon,#1
155	veor $in0,$in0,$key
156
157	vtbl.8 $key,{$in0},$mask
158	vext.8 $tmp,$zero,$in0,#12
159	vst1.32 {$in0},[$out],#16
160	aese $key,$zero
161
162	veor $in0,$in0,$tmp
163	vext.8 $tmp,$zero,$tmp,#12
164	veor $in0,$in0,$tmp
165	vext.8 $tmp,$zero,$tmp,#12
166	veor $key,$key,$rcon
167	veor $in0,$in0,$tmp
168	veor $in0,$in0,$key
169	vst1.32 {$in0},[$out]
170	add $out,$out,#0x50
171
172	mov $rounds,#10
173	b .Ldone
174
175	.align 4
176	.L192:
177	vld1.8 {$in1},[$inp],#8
178	vmov.i8 $key,#8 // borrow $key
179	vst1.32 {$in0},[$out],#16
180	vsub.i8 $mask,$mask,$key // adjust the mask
181
182	.Loop192:
183	vtbl.8 $key,{$in1},$mask
184	vext.8 $tmp,$zero,$in0,#12
185	vst1.32 {$in1},[$out],#8
186	aese $key,$zero
187	subs $bits,$bits,#1
188
189	veor $in0,$in0,$tmp
190	vext.8 $tmp,$zero,$tmp,#12
191	veor $in0,$in0,$tmp
192	vext.8 $tmp,$zero,$tmp,#12
193	veor $in0,$in0,$tmp
194
195	vdup.32 $tmp,${in0}[3]
196	veor $tmp,$tmp,$in1
197	veor $key,$key,$rcon
198	vext.8 $in1,$zero,$in1,#12
199	vshl.u8 $rcon,$rcon,#1
200	veor $in1,$in1,$tmp
201	veor $in0,$in0,$key
202	veor $in1,$in1,$key
203	vst1.32 {$in0},[$out],#16
204	b.ne .Loop192
205
206	mov $rounds,#12
207	add $out,$out,#0x20
208	b .Ldone
209
210	.align 4
211	.L256:
212	vld1.8 {$in1},[$inp]
213	mov $bits,#7
214	mov $rounds,#14
215	vst1.32 {$in0},[$out],#16
216
217	.Loop256:
218	vtbl.8 $key,{$in1},$mask
219	vext.8 $tmp,$zero,$in0,#12
220	vst1.32 {$in1},[$out],#16
221	aese $key,$zero
222	subs $bits,$bits,#1
223
224	veor $in0,$in0,$tmp
225	vext.8 $tmp,$zero,$tmp,#12
226	veor $in0,$in0,$tmp
227	vext.8 $tmp,$zero,$tmp,#12
228	veor $key,$key,$rcon
229	veor $in0,$in0,$tmp
230	vshl.u8 $rcon,$rcon,#1
231	veor $in0,$in0,$key
232	vst1.32 {$in0},[$out],#16
233	b.eq .Ldone
234
235	vdup.32 $key,${in0}[3] // just splat
236	vext.8 $tmp,$zero,$in1,#12
237	aese $key,$zero
238
239	veor $in1,$in1,$tmp
240	vext.8 $tmp,$zero,$tmp,#12
241	veor $in1,$in1,$tmp
242	vext.8 $tmp,$zero,$tmp,#12
243	veor $in1,$in1,$tmp
244
245	veor $in1,$in1,$key
246	b .Loop256
247
248	.Ldone:
249	str $rounds,[$out]
250	mov $ptr,#0
251
252	.Lenc_key_abort:
253	mov x0,$ptr // return value
254	`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
255	ret
256	.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
257
258	.globl ${prefix}_set_decrypt_key
259	.type ${prefix}_set_decrypt_key,%function
260	.align 5
261	${prefix}_set_decrypt_key:
262	___
263	$code.=<<___ if ($flavour =~ /64/);
264	stp x29,x30,[sp,#-16]!
265	add x29,sp,#0
266	___
267	$code.=<<___ if ($flavour !~ /64/);
268	stmdb sp!,{r4,lr}
269	___
270	$code.=<<___;
271	bl .Lenc_key
272
273	cmp x0,#0
274	b.ne .Ldec_key_abort
275
276	sub $out,$out,#240 // restore original $out
277	mov x4,#-16
278	add $inp,$out,x12,lsl#4 // end of key schedule
279
280	vld1.32 {v0.16b},[$out]
281	vld1.32 {v1.16b},[$inp]
282	vst1.32 {v0.16b},[$inp],x4
283	vst1.32 {v1.16b},[$out],#16
284
285	.Loop_imc:
286	vld1.32 {v0.16b},[$out]
287	vld1.32 {v1.16b},[$inp]
288	aesimc v0.16b,v0.16b
289	aesimc v1.16b,v1.16b
290	vst1.32 {v0.16b},[$inp],x4
291	vst1.32 {v1.16b},[$out],#16
292	cmp $inp,$out
293	b.hi .Loop_imc
294
295	vld1.32 {v0.16b},[$out]
296	aesimc v0.16b,v0.16b
297	vst1.32 {v0.16b},[$inp]
298
299	eor x0,x0,x0 // return value
300	.Ldec_key_abort:
301	___
302	$code.=<<___ if ($flavour !~ /64/);
303	ldmia sp!,{r4,pc}
304	___
305	$code.=<<___ if ($flavour =~ /64/);
306	ldp x29,x30,[sp],#16
307	ret
308	___
309	$code.=<<___;
310	.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
311	___
312	}}}
313	{{{
# Append one single-block AES routine to the global $code buffer.
#
# Argument: $dir - "en" emits ${prefix}_encrypt built from aese/aesmc;
#                  any other value (the caller uses "de") emits
#                  ${prefix}_${dir}crypt built from aesd/aesimc.
#
# The emitted routine reads the round count from offset 240 of the key
# pointer (x2), loads one 16-byte block from x0, runs the rounds two at a
# time, XORs in the final round key and stores the result to x1.
#
# No prototype is declared: the sub takes one argument, and an empty "()"
# prototype would reject any call not made through the "&name(...)" form.
sub gen_block {
    my $dir = shift;
    my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
    my ($inp,$out,$key)=map("x$_",(0..2));
    my $rounds="w3";
    my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));

    # Instruction lines keep leading whitespace: the per-flavour
    # translators match patterns such as /^(\s+)ret/ and /^(\s+)v/.
    $code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
	ldr $rounds,[$key,#240]
	vld1.32 {$rndkey0},[$key],#16
	vld1.8 {$inout},[$inp]
	sub $rounds,$rounds,#2
	vld1.32 {$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e $inout,$rndkey0
	aes$mc $inout,$inout
	vld1.32 {$rndkey0},[$key],#16
	subs $rounds,$rounds,#2
	aes$e $inout,$rndkey1
	aes$mc $inout,$inout
	vld1.32 {$rndkey1},[$key],#16
	b.gt .Loop_${dir}c

	aes$e $inout,$rndkey0
	aes$mc $inout,$inout
	vld1.32 {$rndkey0},[$key]
	aes$e $inout,$rndkey1
	veor $inout,$inout,$rndkey0

	vst1.8 {$inout},[$out]
	ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
353	&gen_block("en");
354	&gen_block("de");
355	}}}
356	{{{
357	my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
358	my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
359	my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
360
361	my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
362	my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
363
364	### q8-q15 preloaded key schedule
365
366	$code.=<<___;
367	.globl ${prefix}_cbc_encrypt
368	.type ${prefix}_cbc_encrypt,%function
369	.align 5
370	${prefix}_cbc_encrypt:
371	___
372	$code.=<<___ if ($flavour =~ /64/);
373	stp x29,x30,[sp,#-16]!
374	add x29,sp,#0
375	___
376	$code.=<<___ if ($flavour !~ /64/);
377	mov ip,sp
378	stmdb sp!,{r4-r8,lr}
379	vstmdb sp!,{d8-d15} @ ABI specification says so
380	ldmia ip,{r4-r5} @ load remaining args
381	___
382	$code.=<<___;
383	subs $len,$len,#16
384	mov $step,#16
385	b.lo .Lcbc_abort
386	cclr $step,eq
387
388	cmp $enc,#0 // en- or decrypting?
389	ldr $rounds,[$key,#240]
390	and $len,$len,#-16
391	vld1.8 {$ivec},[$ivp]
392	vld1.8 {$dat},[$inp],$step
393
394	vld1.32 {q8-q9},[$key] // load key schedule...
395	sub $rounds,$rounds,#6
396	add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
397	sub $rounds,$rounds,#2
398	vld1.32 {q10-q11},[$key_],#32
399	vld1.32 {q12-q13},[$key_],#32
400	vld1.32 {q14-q15},[$key_],#32
401	vld1.32 {$rndlast},[$key_]
402
403	add $key_,$key,#32
404	mov $cnt,$rounds
405	b.eq .Lcbc_dec
406
407	cmp $rounds,#2
408	veor $dat,$dat,$ivec
409	veor $rndzero_n_last,q8,$rndlast
410	b.eq .Lcbc_enc128
411
412	vld1.32 {$in0-$in1},[$key_]
413	add $key_,$key,#16
414	add $key4,$key,#16*4
415	add $key5,$key,#16*5
416	aese $dat,q8
417	aesmc $dat,$dat
418	add $key6,$key,#16*6
419	add $key7,$key,#16*7
420	b .Lenter_cbc_enc
421
422	.align 4
423	.Loop_cbc_enc:
424	aese $dat,q8
425	aesmc $dat,$dat
426	vst1.8 {$ivec},[$out],#16
427	.Lenter_cbc_enc:
428	aese $dat,q9
429	aesmc $dat,$dat
430	aese $dat,$in0
431	aesmc $dat,$dat
432	vld1.32 {q8},[$key4]
433	cmp $rounds,#4
434	aese $dat,$in1
435	aesmc $dat,$dat
436	vld1.32 {q9},[$key5]
437	b.eq .Lcbc_enc192
438
439	aese $dat,q8
440	aesmc $dat,$dat
441	vld1.32 {q8},[$key6]
442	aese $dat,q9
443	aesmc $dat,$dat
444	vld1.32 {q9},[$key7]
445	nop
446
447	.Lcbc_enc192:
448	aese $dat,q8
449	aesmc $dat,$dat
450	subs $len,$len,#16
451	aese $dat,q9
452	aesmc $dat,$dat
453	cclr $step,eq
454	aese $dat,q10
455	aesmc $dat,$dat
456	aese $dat,q11
457	aesmc $dat,$dat
458	vld1.8 {q8},[$inp],$step
459	aese $dat,q12
460	aesmc $dat,$dat
461	veor q8,q8,$rndzero_n_last
462	aese $dat,q13
463	aesmc $dat,$dat
464	vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
465	aese $dat,q14
466	aesmc $dat,$dat
467	aese $dat,q15
468	veor $ivec,$dat,$rndlast
469	b.hs .Loop_cbc_enc
470
471	vst1.8 {$ivec},[$out],#16
472	b .Lcbc_done
473
474	.align 5
475	.Lcbc_enc128:
476	vld1.32 {$in0-$in1},[$key_]
477	aese $dat,q8
478	aesmc $dat,$dat
479	b .Lenter_cbc_enc128
480	.Loop_cbc_enc128:
481	aese $dat,q8
482	aesmc $dat,$dat
483	vst1.8 {$ivec},[$out],#16
484	.Lenter_cbc_enc128:
485	aese $dat,q9
486	aesmc $dat,$dat
487	subs $len,$len,#16
488	aese $dat,$in0
489	aesmc $dat,$dat
490	cclr $step,eq
491	aese $dat,$in1
492	aesmc $dat,$dat
493	aese $dat,q10
494	aesmc $dat,$dat
495	aese $dat,q11
496	aesmc $dat,$dat
497	vld1.8 {q8},[$inp],$step
498	aese $dat,q12
499	aesmc $dat,$dat
500	aese $dat,q13
501	aesmc $dat,$dat
502	aese $dat,q14
503	aesmc $dat,$dat
504	veor q8,q8,$rndzero_n_last
505	aese $dat,q15
506	veor $ivec,$dat,$rndlast
507	b.hs .Loop_cbc_enc128
508
509	vst1.8 {$ivec},[$out],#16
510	b .Lcbc_done
511	___
512	{
513	my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
514	$code.=<<___;
515	.align 5
516	.Lcbc_dec:
517	vld1.8 {$dat2},[$inp],#16
518	subs $len,$len,#32 // bias
519	add $cnt,$rounds,#2
520	vorr $in1,$dat,$dat
521	vorr $dat1,$dat,$dat
522	vorr $in2,$dat2,$dat2
523	b.lo .Lcbc_dec_tail
524
525	vorr $dat1,$dat2,$dat2
526	vld1.8 {$dat2},[$inp],#16
527	vorr $in0,$dat,$dat
528	vorr $in1,$dat1,$dat1
529	vorr $in2,$dat2,$dat2
530
531	.Loop3x_cbc_dec:
532	aesd $dat0,q8
533	aesimc $dat0,$dat0
534	aesd $dat1,q8
535	aesimc $dat1,$dat1
536	aesd $dat2,q8
537	aesimc $dat2,$dat2
538	vld1.32 {q8},[$key_],#16
539	subs $cnt,$cnt,#2
540	aesd $dat0,q9
541	aesimc $dat0,$dat0
542	aesd $dat1,q9
543	aesimc $dat1,$dat1
544	aesd $dat2,q9
545	aesimc $dat2,$dat2
546	vld1.32 {q9},[$key_],#16
547	b.gt .Loop3x_cbc_dec
548
549	aesd $dat0,q8
550	aesimc $dat0,$dat0
551	aesd $dat1,q8
552	aesimc $dat1,$dat1
553	aesd $dat2,q8
554	aesimc $dat2,$dat2
555	veor $tmp0,$ivec,$rndlast
556	subs $len,$len,#0x30
557	veor $tmp1,$in0,$rndlast
558	mov.lo x6,$len // x6, $cnt, is zero at this point
559	aesd $dat0,q9
560	aesimc $dat0,$dat0
561	aesd $dat1,q9
562	aesimc $dat1,$dat1
563	aesd $dat2,q9
564	aesimc $dat2,$dat2
565	veor $tmp2,$in1,$rndlast
566	add $inp,$inp,x6 // $inp is adjusted in such way that
567	// at exit from the loop $dat1-$dat2
568	// are loaded with last "words"
569	vorr $ivec,$in2,$in2
570	mov $key_,$key
571	aesd $dat0,q12
572	aesimc $dat0,$dat0
573	aesd $dat1,q12
574	aesimc $dat1,$dat1
575	aesd $dat2,q12
576	aesimc $dat2,$dat2
577	vld1.8 {$in0},[$inp],#16
578	aesd $dat0,q13
579	aesimc $dat0,$dat0
580	aesd $dat1,q13
581	aesimc $dat1,$dat1
582	aesd $dat2,q13
583	aesimc $dat2,$dat2
584	vld1.8 {$in1},[$inp],#16
585	aesd $dat0,q14
586	aesimc $dat0,$dat0
587	aesd $dat1,q14
588	aesimc $dat1,$dat1
589	aesd $dat2,q14
590	aesimc $dat2,$dat2
591	vld1.8 {$in2},[$inp],#16
592	aesd $dat0,q15
593	aesd $dat1,q15
594	aesd $dat2,q15
595	vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
596	add $cnt,$rounds,#2
597	veor $tmp0,$tmp0,$dat0
598	veor $tmp1,$tmp1,$dat1
599	veor $dat2,$dat2,$tmp2
600	vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
601	vst1.8 {$tmp0},[$out],#16
602	vorr $dat0,$in0,$in0
603	vst1.8 {$tmp1},[$out],#16
604	vorr $dat1,$in1,$in1
605	vst1.8 {$dat2},[$out],#16
606	vorr $dat2,$in2,$in2
607	b.hs .Loop3x_cbc_dec
608
609	cmn $len,#0x30
610	b.eq .Lcbc_done
611	nop
612
613	.Lcbc_dec_tail:
614	aesd $dat1,q8
615	aesimc $dat1,$dat1
616	aesd $dat2,q8
617	aesimc $dat2,$dat2
618	vld1.32 {q8},[$key_],#16
619	subs $cnt,$cnt,#2
620	aesd $dat1,q9
621	aesimc $dat1,$dat1
622	aesd $dat2,q9
623	aesimc $dat2,$dat2
624	vld1.32 {q9},[$key_],#16
625	b.gt .Lcbc_dec_tail
626
627	aesd $dat1,q8
628	aesimc $dat1,$dat1
629	aesd $dat2,q8
630	aesimc $dat2,$dat2
631	aesd $dat1,q9
632	aesimc $dat1,$dat1
633	aesd $dat2,q9
634	aesimc $dat2,$dat2
635	aesd $dat1,q12
636	aesimc $dat1,$dat1
637	aesd $dat2,q12
638	aesimc $dat2,$dat2
639	cmn $len,#0x20
640	aesd $dat1,q13
641	aesimc $dat1,$dat1
642	aesd $dat2,q13
643	aesimc $dat2,$dat2
644	veor $tmp1,$ivec,$rndlast
645	aesd $dat1,q14
646	aesimc $dat1,$dat1
647	aesd $dat2,q14
648	aesimc $dat2,$dat2
649	veor $tmp2,$in1,$rndlast
650	aesd $dat1,q15
651	aesd $dat2,q15
652	b.eq .Lcbc_dec_one
653	veor $tmp1,$tmp1,$dat1
654	veor $tmp2,$tmp2,$dat2
655	vorr $ivec,$in2,$in2
656	vst1.8 {$tmp1},[$out],#16
657	vst1.8 {$tmp2},[$out],#16
658	b .Lcbc_done
659
660	.Lcbc_dec_one:
661	veor $tmp1,$tmp1,$dat2
662	vorr $ivec,$in2,$in2
663	vst1.8 {$tmp1},[$out],#16
664
665	.Lcbc_done:
666	vst1.8 {$ivec},[$ivp]
667	.Lcbc_abort:
668	___
669	}
670	$code.=<<___ if ($flavour !~ /64/);
671	vldmia sp!,{d8-d15}
672	ldmia sp!,{r4-r8,pc}
673	___
674	$code.=<<___ if ($flavour =~ /64/);
675	ldr x29,[sp],#16
676	ret
677	___
678	$code.=<<___;
679	.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
680	___
681	}}}
682	{{{
683	my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
684	my ($rounds,$cnt,$key_)=("w5","w6","x7");
685	my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
686	my $step="x12"; # aliases with $tctr2
687
688	my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
689	my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
690
691	my ($dat,$tmp)=($dat0,$tmp0);
692
693	### q8-q15 preloaded key schedule
694
695	$code.=<<___;
696	.globl ${prefix}_ctr32_encrypt_blocks
697	.type ${prefix}_ctr32_encrypt_blocks,%function
698	.align 5
699	${prefix}_ctr32_encrypt_blocks:
700	___
701	$code.=<<___ if ($flavour =~ /64/);
702	stp x29,x30,[sp,#-16]!
703	add x29,sp,#0
704	___
705	$code.=<<___ if ($flavour !~ /64/);
706	mov ip,sp
707	stmdb sp!,{r4-r10,lr}
708	vstmdb sp!,{d8-d15} @ ABI specification says so
709	ldr r4, [ip] @ load remaining arg
710	___
711	$code.=<<___;
712	ldr $rounds,[$key,#240]
713
714	ldr $ctr, [$ivp, #12]
715	vld1.32 {$dat0},[$ivp]
716
717	vld1.32 {q8-q9},[$key] // load key schedule...
718	sub $rounds,$rounds,#4
719	mov $step,#16
720	cmp $len,#2
721	add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
722	sub $rounds,$rounds,#2
723	vld1.32 {q12-q13},[$key_],#32
724	vld1.32 {q14-q15},[$key_],#32
725	vld1.32 {$rndlast},[$key_]
726	add $key_,$key,#32
727	mov $cnt,$rounds
728	cclr $step,lo
729	#ifndef __ARMEB__
730	rev $ctr, $ctr
731	#endif
732	vorr $dat1,$dat0,$dat0
733	add $tctr1, $ctr, #1
734	vorr $dat2,$dat0,$dat0
735	add $ctr, $ctr, #2
736	vorr $ivec,$dat0,$dat0
737	rev $tctr1, $tctr1
738	vmov.32 ${dat1}[3],$tctr1
739	b.ls .Lctr32_tail
740	rev $tctr2, $ctr
741	sub $len,$len,#3 // bias
742	vmov.32 ${dat2}[3],$tctr2
743	b .Loop3x_ctr32
744
745	.align 4
746	.Loop3x_ctr32:
747	aese $dat0,q8
748	aesmc $dat0,$dat0
749	aese $dat1,q8
750	aesmc $dat1,$dat1
751	aese $dat2,q8
752	aesmc $dat2,$dat2
753	vld1.32 {q8},[$key_],#16
754	subs $cnt,$cnt,#2
755	aese $dat0,q9
756	aesmc $dat0,$dat0
757	aese $dat1,q9
758	aesmc $dat1,$dat1
759	aese $dat2,q9
760	aesmc $dat2,$dat2
761	vld1.32 {q9},[$key_],#16
762	b.gt .Loop3x_ctr32
763
764	aese $dat0,q8
765	aesmc $tmp0,$dat0
766	aese $dat1,q8
767	aesmc $tmp1,$dat1
768	vld1.8 {$in0},[$inp],#16
769	vorr $dat0,$ivec,$ivec
770	aese $dat2,q8
771	aesmc $dat2,$dat2
772	vld1.8 {$in1},[$inp],#16
773	vorr $dat1,$ivec,$ivec
774	aese $tmp0,q9
775	aesmc $tmp0,$tmp0
776	aese $tmp1,q9
777	aesmc $tmp1,$tmp1
778	vld1.8 {$in2},[$inp],#16
779	mov $key_,$key
780	aese $dat2,q9
781	aesmc $tmp2,$dat2
782	vorr $dat2,$ivec,$ivec
783	add $tctr0,$ctr,#1
784	aese $tmp0,q12
785	aesmc $tmp0,$tmp0
786	aese $tmp1,q12
787	aesmc $tmp1,$tmp1
788	veor $in0,$in0,$rndlast
789	add $tctr1,$ctr,#2
790	aese $tmp2,q12
791	aesmc $tmp2,$tmp2
792	veor $in1,$in1,$rndlast
793	add $ctr,$ctr,#3
794	aese $tmp0,q13
795	aesmc $tmp0,$tmp0
796	aese $tmp1,q13
797	aesmc $tmp1,$tmp1
798	veor $in2,$in2,$rndlast
799	rev $tctr0,$tctr0
800	aese $tmp2,q13
801	aesmc $tmp2,$tmp2
802	vmov.32 ${dat0}[3], $tctr0
803	rev $tctr1,$tctr1
804	aese $tmp0,q14
805	aesmc $tmp0,$tmp0
806	aese $tmp1,q14
807	aesmc $tmp1,$tmp1
808	vmov.32 ${dat1}[3], $tctr1
809	rev $tctr2,$ctr
810	aese $tmp2,q14
811	aesmc $tmp2,$tmp2
812	vmov.32 ${dat2}[3], $tctr2
813	subs $len,$len,#3
814	aese $tmp0,q15
815	aese $tmp1,q15
816	aese $tmp2,q15
817
818	veor $in0,$in0,$tmp0
819	vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
820	vst1.8 {$in0},[$out],#16
821	veor $in1,$in1,$tmp1
822	mov $cnt,$rounds
823	vst1.8 {$in1},[$out],#16
824	veor $in2,$in2,$tmp2
825	vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
826	vst1.8 {$in2},[$out],#16
827	b.hs .Loop3x_ctr32
828
829	adds $len,$len,#3
830	b.eq .Lctr32_done
831	cmp $len,#1
832	mov $step,#16
833	cclr $step,eq
834
835	.Lctr32_tail:
836	aese $dat0,q8
837	aesmc $dat0,$dat0
838	aese $dat1,q8
839	aesmc $dat1,$dat1
840	vld1.32 {q8},[$key_],#16
841	subs $cnt,$cnt,#2
842	aese $dat0,q9
843	aesmc $dat0,$dat0
844	aese $dat1,q9
845	aesmc $dat1,$dat1
846	vld1.32 {q9},[$key_],#16
847	b.gt .Lctr32_tail
848
849	aese $dat0,q8
850	aesmc $dat0,$dat0
851	aese $dat1,q8
852	aesmc $dat1,$dat1
853	aese $dat0,q9
854	aesmc $dat0,$dat0
855	aese $dat1,q9
856	aesmc $dat1,$dat1
857	vld1.8 {$in0},[$inp],$step
858	aese $dat0,q12
859	aesmc $dat0,$dat0
860	aese $dat1,q12
861	aesmc $dat1,$dat1
862	vld1.8 {$in1},[$inp]
863	aese $dat0,q13
864	aesmc $dat0,$dat0
865	aese $dat1,q13
866	aesmc $dat1,$dat1
867	veor $in0,$in0,$rndlast
868	aese $dat0,q14
869	aesmc $dat0,$dat0
870	aese $dat1,q14
871	aesmc $dat1,$dat1
872	veor $in1,$in1,$rndlast
873	aese $dat0,q15
874	aese $dat1,q15
875
876	cmp $len,#1
877	veor $in0,$in0,$dat0
878	veor $in1,$in1,$dat1
879	vst1.8 {$in0},[$out],#16
880	b.eq .Lctr32_done
881	vst1.8 {$in1},[$out]
882
883	.Lctr32_done:
884	___
885	$code.=<<___ if ($flavour !~ /64/);
886	vldmia sp!,{d8-d15}
887	ldmia sp!,{r4-r10,pc}
888	___
889	$code.=<<___ if ($flavour =~ /64/);
890	ldr x29,[sp],#16
891	ret
892	___
893	$code.=<<___;
894	.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
895	___
896	}}}
897	$code.=<<___;
898	#endif
899	___
900	########################################
901	if ($flavour =~ /64/) { ######## 64-bit code
# Base encodings of the AES instructions for the 64-bit flavour; register
# numbers are OR-ed in by unaes() below.
my %opcode = (
    "aesd" => 0x4e285800, "aese" => 0x4e284800,
    "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );

# Turn "<mnemonic> <reg>,<reg>" into a raw ".inst" word: first register
# number in the low bits, second shifted left by 5.  Returns a false value
# when $arg does not look like two q/v registers.
# The only call site in the loop below is commented out.
local *unaes = sub {
    my ($mnemonic,$arg)=@_;

    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
    sprintf ".inst\t0x%08x\t//%s %s",
            $opcode{$mnemonic}|$1|($2<<5),
            $mnemonic,$arg;
};

# Translate the mixed-syntax template in $code to 64-bit syntax, one line
# at a time, and print the result.
foreach(split("\n",$code)) {
    # evaluate `...` snippets embedded in the template
    s/\`([^\`]*)\`/eval($1)/geo;

    s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
    s/@\s/\/\//o; # old->new style commentary

    # The alternatives are chained with "or": the first substitution that
    # succeeds on a line ends the chain.
    #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
    s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
    s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
    s/vmov\.i8/movi/o or # fix up legacy mnemonics
    s/vext\.8/ext/o or
    s/vrev32\.8/rev32/o or
    s/vtst\.8/cmtst/o or
    s/vshr/ushr/o or
    s/^(\s+)v/$1/o or # strip off v prefix
    s/\bbx\s+lr\b/ret/o;

    # fix up remaining legacy suffixes
    s/\.[ui]?8//o;
    m/\],#8/o and s/\.16b/\.8b/go; # 8-byte post-increment: use .8b
    s/\.[ui]?32//o and s/\.16b/\.4s/go;
    s/\.[ui]?64//o and s/\.16b/\.2d/go;
    s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; # lane reference: drop the count

    print $_,"\n";
}
941	} else { ######## 32-bit code
# Base encodings of the AES instructions for the 32-bit flavour; register
# fields are OR-ed in by unaes() below.
my %opcode = (
    "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
    "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );

# Turn "<mnemonic> qD,qM" into a ".byte" directive holding the encoded
# instruction, least significant byte first.  Returns an empty string when
# $arg does not look like two q/v registers.
local *unaes = sub {
    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
        # Low three bits and bit 3 of each register number go to
        # separate places in the instruction word.
        my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                                     |(($2&7)<<1) |(($2&8)<<2);
        # since ARMv7 instructions are always encoded little-endian.
        # correct solution is to use .inst directive, but older
        # assemblers don't implement it:-(
        return sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                       $word&0xff,($word>>8)&0xff,
                       ($word>>16)&0xff,($word>>24)&0xff,
                       $mnemonic,$arg;
    }
    return '';
};
961
# Rewrite the operands of "vtbl.8 qD,{qT},qM" as two vtbl.8 instructions
# on double registers: d(2*D) with d(2*M), then d(2*D+1) with d(2*M+1),
# both using table {qT}.  The two instructions are joined by "\n\t".
# Returns an empty string when $arg does not have that shape.
sub unvtbl {
    my $arg=shift;

    if ($arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o) {
        my ($dst,$tbl,$idx)=($1,$2,$3);
        return sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
                       "vtbl.8 d%d,{q%d},d%d",
                       2*$dst,  $tbl,2*$idx,
                       2*$dst+1,$tbl,2*$idx+1;
    }
    return '';
}
969
# Rewrite the operands of "vdup.32 qD,qM[i]" so the source lane is taken
# from a double register: d(2*M + i/2), lane i%2.
# Returns an empty string when $arg does not have that shape.
sub unvdup32 {
    my $arg=shift;

    if ($arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o) {
        my ($dst,$src,$lane)=($1,$2,$3);
        return sprintf "vdup.32 q%d,d%d[%d]",$dst,2*$src+($lane>>1),$lane&1;
    }
    return '';
}
976
# Rewrite the operands of "vmov.32 qD[i],<src>" so the destination lane is
# in a double register: d(2*D + i/2), lane i%2.  <src> (everything after
# the comma) is copied through unchanged.
# Returns an empty string when $arg does not have that shape.
sub unvmov32 {
    my $arg=shift;

    if ($arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o) {
        my ($dst,$lane,$src)=($1,$2,$3);
        return sprintf "vmov.32 d%d[%d],%s",2*$dst+($lane>>1),$lane&1,$src;
    }
    return '';
}
983
# Translate the mixed-syntax template in $code to 32-bit syntax, one line
# at a time, and print the result.
foreach(split("\n",$code)) {
    # evaluate `...` snippets embedded in the template
    s/\`([^\`]*)\`/eval($1)/geo;

    s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
    s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
    s/\/\/\s?/@ /o; # new->old style commentary

    # fix up remaining new-style suffixes: an 8-byte post-increment on a
    # q register becomes a d-register access with writeback; any other
    # "],#N" post-increment becomes plain writeback
    s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
    s/\],#[0-9]+/]!/o;

    # The alternatives are chained with "or": the first substitution that
    # succeeds on a line ends the chain.
    s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
    s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
    s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
    s/vdup\.32\s+(.*)/unvdup32($1)/geo or
    s/vmov\.32\s+(.*)/unvmov32($1)/geo or
    s/^(\s+)b\./$1b/o or
    s/^(\s+)mov\./$1mov/o or
    s/^(\s+)ret/$1bx\tlr/o;

    print $_,"\n";
}
1006	}
1007
# STDOUT is a pipe to the translator at this point.  Buffered write errors
# and a failing child only surface when the pipe is closed, so check it.
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/aes/asm/aesv8-armx.pl@ 69881

Download in other formats: