bsaes-armv7.pl@ 69881

Last change on this file since 69881 was 69881, checked in by vboxsync, 7 years ago
Update OpenSSL to 1.1.0g. bugref:8070: src/libs maintenance
Property svn:eol-style set to `LF` Property svn:executable set to ``*
File size: 62.3 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the OpenSSL license (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	#
16	# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
17	# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
18	# granted.
19	# ====================================================================
20
21	# Bit-sliced AES for ARM NEON
22	#
23	# February 2012.
24	#
25	# This implementation is direct adaptation of bsaes-x86_64 module for
26	# ARM NEON. Except that this module is endian-neutral [in sense that
27	# it can be compiled for either endianness] by courtesy of vld1.8's
28	# neutrality. Initial version doesn't implement interface to OpenSSL,
29	# only low-level primitives and unsupported entry points, just enough
30	# to collect performance results, which for Cortex-A8 core are:
31	#
32	# encrypt 19.5 cycles per byte processed with 128-bit key
33	# decrypt 22.1 cycles per byte processed with 128-bit key
34	# key conv. 440 cycles per 128-bit key/0.18 of 8x block
35	#
36	# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
37	# which is [much] worse than anticipated (for further details see
38	# http://www.openssl.org/~appro/Snapdragon-S4.html).
39	#
40	# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
41	# manages in 20.0 cycles].
42	#
43	# When comparing to x86_64 results keep in mind that NEON unit is
44	# [mostly] single-issue and thus can't [fully] benefit from
45	# instruction-level parallelism. And when comparing to aes-armv4
46	# results keep in mind key schedule conversion overhead (see
47	# bsaes-x86_64.pl for further details)...
48	#
49	# <appro@openssl.org>
50
51	# April-August 2013
52	#
53	# Add CBC, CTR and XTS subroutines, adapt for kernel use.
54	#
55	# <ard.biesheuvel@linaro.org>
56
# Command-line handling (perlasm convention):
#
#   perl bsaes-armv7.pl [flavour] [output-file]
#
# An argument that looks like a file name ("name.ext") is the output file;
# a first argument that does not is the flavour.  With a real flavour the
# generated text is piped through arm-xlate.pl, otherwise it is written
# straight to the output file.  When no output file is given STDOUT is left
# untouched, so "perl bsaes-armv7.pl > out.S" keeps working.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Look for the translator next to this script, then in ../../perlasm.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # A failed pipe open used to go unnoticed and produced no output at all.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined $output && $output ne "") {
    # Only redirect when an output name was actually supplied; report
    # failures instead of silently writing to the inherited STDOUT.
    open STDOUT,'>',$output
        or die "can't open $output: $!";
}
71
# Register-name strings that get interpolated into the assembly templates.
# $inp/$out/$len/$key are r0..r3; @XMM holds the NEON quad registers q0..q15.
# NOTE(review): the "XMM" name presumably carries over from the bsaes-x86_64
# module this file was adapted from - confirm.
72	my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
73	my @XMM=map("q$_",(0..15));
74
75	{
# Registers used by the _bsaes_encrypt8/_bsaes_decrypt8 templates in this
# scope: round-key pointer (r4), round counter (r5) and constant-table
# pointer (r6).  This $key shadows the file-level $key (r3).
76	my ($key,$rounds,$const)=("r4","r5","r6");
77
# Dlo(REG) - name of the low double-word half of a NEON quad register.
# "qN" maps to "d(2*N)" (e.g. "q3" -> "d6"); anything that does not contain
# a q-register name yields "".  Callers use the &Dlo(...) form from within
# back-ticked template fragments, so the sub carries no prototype.
sub Dlo {
    my ($qreg) = @_;
    return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2) : "";
}
# Dhi(REG) - name of the high double-word half of a NEON quad register.
# "qN" maps to "d(2*N+1)" (e.g. "q3" -> "d7"); anything that does not
# contain a q-register name yields "".  Callers use the &Dhi(...) form from
# within back-ticked template fragments, so the sub carries no prototype.
sub Dhi {
    my ($qreg) = @_;
    return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
80
sub Sbox {
    # Emit the forward S-box on bit-sliced state.
    #   args 0..7   : state registers, lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
    #   args 8..11  : first group of four scratch registers
    #   args 12..15 : second group of four scratch registers
    # Result order : lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
    my @bits    = @_[0..7];
    my @tmp     = @_[8..11];
    my @scratch = @_[12..15];

    InBasisChange(@bits);
    Inv_GF256(@bits[6,5,0,3,7,1,4,2], @tmp, @scratch);
    OutBasisChange(@bits[7,1,4,2,6,5,0,3]);
}
91
sub InBasisChange {
    # Emit the S-box input basis change: a fixed sequence of in-place XORs.
    # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
    # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
    my @r = @_[0..7];

    # Each entry is [dst, src] and emits "veor dst, dst, src";
    # undef emits the blank line that separates the groups.
    my @plan = (
        [2,1], [5,6], [3,0], [6,2], [5,0], undef,
        [6,3], [3,7], [7,5], [3,4], [4,5], undef,
        [2,7], [3,1], [1,5],
    );

    my $asm = "";
    for my $step (@plan) {
        $asm .= $step
            ? sprintf("veor %s, %s, %s\n", @r[ @$step[0,0,1] ])
            : "\n";
    }
    $code .= $asm;
}
114
sub OutBasisChange {
    # Emit the S-box output basis change: a fixed sequence of in-place XORs.
    # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
    # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
    my @r = @_[0..7];

    # [dst, src] pairs; undef marks a blank separator line.
    my @plan = (
        [0,6], [1,4], [4,6], [2,0], [6,1], undef,
        [1,5], [5,3], [3,7], [7,5], [2,5], undef,
        [4,7],
    );

    my $asm = "";
    for my $step (@plan) {
        $asm .= $step
            ? sprintf("veor %s, %s, %s\n", @r[ @$step[0,0,1] ])
            : "\n";
    }
    $code .= $asm;
}
135
sub InvSbox {
    # Emit the inverse S-box on bit-sliced state.
    #   args 0..7   : state registers, lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
    #   args 8..11  : first group of four scratch registers
    #   args 12..15 : second group of four scratch registers
    # Result order : lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
    my @bits    = @_[0..7];
    my @tmp     = @_[8..11];
    my @scratch = @_[12..15];

    InvInBasisChange(@bits);
    Inv_GF256(@bits[5,1,2,6,3,7,0,4], @tmp, @scratch);
    InvOutBasisChange(@bits[3,7,0,4,5,1,2,6]);
}
146
sub InvInBasisChange {
    # OutBasisChange in reverse (with twist): the eight arguments are first
    # re-ordered, then a fixed sequence of in-place XORs is emitted.
    my @r = @_[5,1,2,6,3,7,0,4];

    # [dst, src] pairs; undef marks a blank separator line.
    my @plan = (
        [1,7], [4,7], undef,
        [7,5], [1,3], [2,5], [3,7], undef,
        [6,1], [2,0], [5,3], [4,6], [0,6], [1,4],
    );

    my $asm = "";
    for my $step (@plan) {
        $asm .= $step
            ? sprintf("veor %s, %s, %s\n", @r[ @$step[0,0,1] ])
            : "\n";
    }
    $code .= $asm;
}
166
sub InvOutBasisChange {
    # InBasisChange in reverse: the eight arguments are first re-ordered,
    # then a fixed sequence of in-place XORs is emitted.
    my @r = @_[2,5,7,3,6,1,0,4];

    # [dst, src] pairs; undef marks a blank separator line.
    my @plan = (
        [1,5], [2,7], undef,
        [3,1], [4,5], [7,5], [3,4], [5,0], [3,7], [6,2], [2,1], [6,3], undef,
        [3,0], [5,6],
    );

    my $asm = "";
    for my $step (@plan) {
        $asm .= $step
            ? sprintf("veor %s, %s, %s\n", @r[ @$step[0,0,1] ])
            : "\n";
    }
    $code .= $asm;
}
187
sub Mul_GF4 {
    # Mul_GF4: multiply the pair (x0,x1) by the pair (y0,y1).
    #   Input  : x0-x1, y0-y1
    #   Output : x0-x1 (overwritten in place)
    #   Temp   : two scratch registers (5th and 6th argument)
    my ($xa, $xb, $ya, $yb, $tmp, $aux) = @_;

    $code .= join("\n",
        "veor $tmp, $ya, $yb",
        "vand $tmp, $tmp, $xa",
        "veor $xa, $xa, $xb",
        "vand $aux, $xb, $ya",
        "vand $xa, $xa, $yb",
        "veor $xb, $aux, $tmp",
        "veor $xa, $xa, $aux",
    ) . "\n";
}
203
sub Mul_GF4_N {    # not used, see Mul_GF4_N_GF4
    # Multiply the pair (x0,x1) by (y0,y1) and scale by N.
    #   Output : x0-x1 (overwritten in place)
    #   Temp   : one scratch register (5th argument)
    my ($xa, $xb, $ya, $yb, $tmp) = @_;

    $code .= join("\n",
        "veor $tmp, $ya, $yb",
        "vand $tmp, $tmp, $xa",
        "veor $xa, $xa, $xb",
        "vand $xb, $xb, $ya",
        "vand $xa, $xa, $yb",
        "veor $xb, $xb, $xa",
        "veor $xa, $xa, $tmp",
    ) . "\n";
}
217
sub Mul_GF4_N_GF4 {
    # Interleaved Mul_GF4_N (on the first five arguments) and Mul_GF4
    # (on the last five); instructions of the two streams alternate.
    my ($xa, $xb, $ya, $yb, $ta,
        $xc, $xd, $yc, $yd, $tb) = @_;

    # One [mnemonic, dst, src1, src2] tuple per emitted instruction.
    my @insns = (
        ["veor", $ta, $ya, $yb],
        ["veor", $tb, $yc, $yd],
        ["vand", $ta, $ta, $xa],
        ["vand", $tb, $tb, $xc],
        ["veor", $xa, $xa, $xb],
        ["veor", $xc, $xc, $xd],
        ["vand", $xb, $xb, $ya],
        ["vand", $xd, $xd, $yc],
        ["vand", $xa, $xa, $yb],
        ["vand", $xc, $xc, $yd],
        ["veor", $xb, $xb, $xa],
        ["veor", $xc, $xc, $xd],
        ["veor", $xa, $xa, $ta],
        ["veor", $xd, $xd, $tb],
    );

    my $asm = "";
    $asm .= sprintf("%s %s, %s, %s\n", @$_) for @insns;
    $code .= $asm;
}
sub Mul_GF16_2 {
    # Emit two multiplications that share one multiplier:
    #   args 0..7   : two 4-register operands (0..3 and 4..7), updated in place
    #   args 8..11  : the common 4-register multiplier (modified along the way)
    #   args 12..15 : four scratch registers
    # Built from Mul_GF4 and the interleaved Mul_GF4_N_GF4.
    my @v = @_[0..7];
    my @m = @_[8..11];
    my @w = @_[12..15];

    # --- first operand, registers 0..3 ---
    $code.=<<___;
veor $w[0], $v[0], $v[2]
veor $w[1], $v[1], $v[3]
___
    Mul_GF4($v[0], $v[1], $m[0], $m[1], @w[2,3]);
    $code.=<<___;
veor $m[0], $m[0], $m[2]
veor $m[1], $m[1], $m[3]
___
    Mul_GF4_N_GF4($w[0], $w[1], $m[0], $m[1], $w[3],
                  $v[2], $v[3], $m[2], $m[3], $w[2]);
    $code.=<<___;
veor $v[0], $v[0], $w[0]
veor $v[2], $v[2], $w[0]
veor $v[1], $v[1], $w[1]
veor $v[3], $v[3], $w[1]

veor $w[0], $v[4], $v[6]
veor $w[1], $v[5], $v[7]
___
    # --- second operand, registers 4..7 ---
    Mul_GF4_N_GF4($w[0], $w[1], $m[0], $m[1], $w[3],
                  $v[6], $v[7], $m[2], $m[3], $w[2]);
    $code.=<<___;
veor $m[0], $m[0], $m[2]
veor $m[1], $m[1], $m[3]
___
    Mul_GF4($v[4], $v[5], $m[0], $m[1], @w[2,3]);
    $code.=<<___;
veor $v[4], $v[4], $w[0]
veor $v[6], $v[6], $w[0]
veor $v[5], $v[5], $w[1]
veor $v[7], $v[7], $w[1]
___
}
sub Inv_GF256 {
    # Inv_GF256: emit the GF(2^8) inversion at the core of the S-box.
    #   Input  : x0-x7  (args 0..7)
    #   Output : x0-x7
    #   Temp   : t0-t3  (args 8..11), s0-s3 (args 12..15)
    my @v = @_[0..7];
    my @w = @_[8..11];     # t0-t3
    my @u = @_[12..15];    # s0-s3

    # direct optimizations from hardware
    $code.=<<___;
veor $w[3], $v[4], $v[6]
veor $w[2], $v[5], $v[7]
veor $w[1], $v[1], $v[3]
veor $u[1], $v[7], $v[6]
vmov $w[0], $w[2]
veor $u[0], $v[0], $v[2]

vorr $w[2], $w[2], $w[1]
veor $u[3], $w[3], $w[0]
vand $u[2], $w[3], $u[0]
vorr $w[3], $w[3], $u[0]
veor $u[0], $u[0], $w[1]
vand $w[0], $w[0], $w[1]
veor $w[1], $v[3], $v[2]
vand $u[3], $u[3], $u[0]
vand $u[1], $u[1], $w[1]
veor $w[1], $v[4], $v[5]
veor $u[0], $v[1], $v[0]
veor $w[3], $w[3], $u[1]
veor $w[2], $w[2], $u[1]
vand $u[1], $w[1], $u[0]
vorr $w[1], $w[1], $u[0]
veor $w[3], $w[3], $u[3]
veor $w[0], $w[0], $u[1]
veor $w[2], $w[2], $u[2]
veor $w[1], $w[1], $u[3]
veor $w[0], $w[0], $u[2]
vand $u[0], $v[7], $v[3]
veor $w[1], $w[1], $u[2]
vand $u[1], $v[6], $v[2]
vand $u[2], $v[5], $v[1]
vorr $u[3], $v[4], $v[0]
veor $w[3], $w[3], $u[0]
veor $w[1], $w[1], $u[2]
veor $w[0], $w[0], $u[3]
veor $w[2], $w[2], $u[1]

@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

@ new smaller inversion

vand $u[2], $w[3], $w[1]
vmov $u[0], $w[0]

veor $u[1], $w[2], $u[2]
veor $u[3], $w[0], $u[2]
veor $u[2], $w[0], $u[2] @ $u[2]=$u[3]

vbsl $u[1], $w[1], $w[0]
vbsl $u[3], $w[3], $w[2]
veor $w[3], $w[3], $w[2]

vbsl $u[0], $u[1], $u[2]
vbsl $w[0], $u[2], $u[1]

vand $u[2], $u[0], $u[3]
veor $w[1], $w[1], $w[0]

veor $u[2], $u[2], $w[3]
___
    # The inversion result is now in s3, s2, s1, t1; multiply it back into
    # x0-x7, using s0, t0, t2, t3 as scratch.
    Mul_GF16_2(@v, @u[3,2,1], $w[1], $u[0], @w[0,2,3]);

    ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
355
356	# AES linear components
357
sub ShiftRows {
    # Emit the round-key XOR plus byte permutation for eight registers.
    #   args 0..7  : state registers (results are written back to them)
    #   args 8..11 : four scratch registers the key material is loaded into
    #   last arg   : register holding the vtbl.8 permutation mask
    # Eight quad registers are loaded from [$key] with post-increment
    # (presumably one bit-sliced round key - confirm against
    # _bsaes_key_convert), XORed with the state and permuted through the mask.
    # The back-ticked &Dlo()/&Dhi() fragments are left in the text as-is.
    my @row  = @_[0..7];
    my @rk   = @_[8..11];
    my $perm = pop @_;

    $code.=<<___;
vldmia $key!, {$rk[0]-$rk[3]}
veor $rk[0], $rk[0], $row[0]
veor $rk[1], $rk[1], $row[1]
vtbl.8 `&Dlo($row[0])`, {$rk[0]}, `&Dlo($perm)`
vtbl.8 `&Dhi($row[0])`, {$rk[0]}, `&Dhi($perm)`
vldmia $key!, {$rk[0]}
veor $rk[2], $rk[2], $row[2]
vtbl.8 `&Dlo($row[1])`, {$rk[1]}, `&Dlo($perm)`
vtbl.8 `&Dhi($row[1])`, {$rk[1]}, `&Dhi($perm)`
vldmia $key!, {$rk[1]}
veor $rk[3], $rk[3], $row[3]
vtbl.8 `&Dlo($row[2])`, {$rk[2]}, `&Dlo($perm)`
vtbl.8 `&Dhi($row[2])`, {$rk[2]}, `&Dhi($perm)`
vldmia $key!, {$rk[2]}
vtbl.8 `&Dlo($row[3])`, {$rk[3]}, `&Dlo($perm)`
vtbl.8 `&Dhi($row[3])`, {$rk[3]}, `&Dhi($perm)`
vldmia $key!, {$rk[3]}
veor $rk[0], $rk[0], $row[4]
veor $rk[1], $rk[1], $row[5]
vtbl.8 `&Dlo($row[4])`, {$rk[0]}, `&Dlo($perm)`
vtbl.8 `&Dhi($row[4])`, {$rk[0]}, `&Dhi($perm)`
veor $rk[2], $rk[2], $row[6]
vtbl.8 `&Dlo($row[5])`, {$rk[1]}, `&Dlo($perm)`
vtbl.8 `&Dhi($row[5])`, {$rk[1]}, `&Dhi($perm)`
veor $rk[3], $rk[3], $row[7]
vtbl.8 `&Dlo($row[6])`, {$rk[2]}, `&Dlo($perm)`
vtbl.8 `&Dhi($row[6])`, {$rk[2]}, `&Dhi($perm)`
vtbl.8 `&Dlo($row[7])`, {$rk[3]}, `&Dlo($perm)`
vtbl.8 `&Dhi($row[7])`, {$rk[3]}, `&Dhi($perm)`
___
}
394
sub MixColumns {
    # Emit MixColumns on bit-sliced state; modified to emit output in order
    # suitable for feeding back to aesenc[last].
    #   args 0..7  : state registers
    #   args 8..15 : eight scratch registers
    #   arg  16    : optional flag; when true the tail used by InvMixColumns
    #                is emitted instead of the forward one (the two tails
    #                differ in which registers receive the results)
    my @x=@_[0..7];
    my @t=@_[8..15];
    my $inv=$_[16];	# optional; a scalar element, not a one-element slice

    # Common part, shared by both variants.
$code.=<<___;
vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
vext.8 @t[1], @x[1], @x[1], #12
veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
vext.8 @t[2], @x[2], @x[2], #12
veor @x[1], @x[1], @t[1]
vext.8 @t[3], @x[3], @x[3], #12
veor @x[2], @x[2], @t[2]
vext.8 @t[4], @x[4], @x[4], #12
veor @x[3], @x[3], @t[3]
vext.8 @t[5], @x[5], @x[5], #12
veor @x[4], @x[4], @t[4]
vext.8 @t[6], @x[6], @x[6], #12
veor @x[5], @x[5], @t[5]
vext.8 @t[7], @x[7], @x[7], #12
veor @x[6], @x[6], @t[6]

veor @t[1], @t[1], @x[0]
veor @x[7], @x[7], @t[7]
vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
veor @t[2], @t[2], @x[1]
veor @t[0], @t[0], @x[7]
veor @t[1], @t[1], @x[7]
vext.8 @x[1], @x[1], @x[1], #8
veor @t[5], @t[5], @x[4]
veor @x[0], @x[0], @t[0]
veor @t[6], @t[6], @x[5]
veor @x[1], @x[1], @t[1]
vext.8 @t[0], @x[4], @x[4], #8
veor @t[4], @t[4], @x[3]
vext.8 @t[1], @x[5], @x[5], #8
veor @t[7], @t[7], @x[6]
vext.8 @x[4], @x[3], @x[3], #8
veor @t[3], @t[3], @x[2]
vext.8 @x[5], @x[7], @x[7], #8
veor @t[4], @t[4], @x[7]
vext.8 @x[3], @x[6], @x[6], #8
veor @t[3], @t[3], @x[7]
vext.8 @x[6], @x[2], @x[2], #8
veor @x[7], @t[1], @t[5]
___
    # Forward tail.
$code.=<<___ if (!$inv);
veor @x[2], @t[0], @t[4]
veor @x[4], @x[4], @t[3]
veor @x[5], @x[5], @t[7]
veor @x[3], @x[3], @t[6]
@ vmov @x[2], @t[0]
veor @x[6], @x[6], @t[2]
@ vmov @x[7], @t[1]
___
    # Tail for the inverse transform (results land with 2<->3 and 4<->6
    # flipped relative to the forward tail).
$code.=<<___ if ($inv);
veor @t[3], @t[3], @x[4]
veor @x[5], @x[5], @t[7]
veor @x[2], @x[3], @t[6]
veor @x[3], @t[0], @t[4]
veor @x[4], @x[6], @t[2]
vmov @x[6], @t[3]
@ vmov @x[7], @t[1]
___
}
460
# InvMixColumns_orig - stand-alone InvMixColumns template that spells out the
# multiplications by 0x0e, 0x0b, 0x0d and 0x09 one after another.
#   args 0..7  : state registers (@x)
#   args 8..15 : eight scratch registers (@t)
# The results are written to the global registers @XMM[0..7] at the end, not
# back to @x.  Nothing in the visible part of this file calls this sub;
# _bsaes_decrypt8 uses InvMixColumns instead.
461	sub InvMixColumns_orig {
462	my @x=@_[0..7];
463	my @t=@_[8..15];
464
465	$code.=<<___;
466	@ multiplication by 0x0e
467	vext.8 @t[7], @x[7], @x[7], #12
468	vmov @t[2], @x[2]
469	veor @x[2], @x[2], @x[5] @ 2 5
470	veor @x[7], @x[7], @x[5] @ 7 5
471	vext.8 @t[0], @x[0], @x[0], #12
472	vmov @t[5], @x[5]
473	veor @x[5], @x[5], @x[0] @ 5 0 [1]
474	veor @x[0], @x[0], @x[1] @ 0 1
475	vext.8 @t[1], @x[1], @x[1], #12
476	veor @x[1], @x[1], @x[2] @ 1 25
477	veor @x[0], @x[0], @x[6] @ 01 6 [2]
478	vext.8 @t[3], @x[3], @x[3], #12
479	veor @x[1], @x[1], @x[3] @ 125 3 [4]
480	veor @x[2], @x[2], @x[0] @ 25 016 [3]
481	veor @x[3], @x[3], @x[7] @ 3 75
482	veor @x[7], @x[7], @x[6] @ 75 6 [0]
483	vext.8 @t[6], @x[6], @x[6], #12
484	vmov @t[4], @x[4]
485	veor @x[6], @x[6], @x[4] @ 6 4
486	veor @x[4], @x[4], @x[3] @ 4 375 [6]
487	veor @x[3], @x[3], @x[7] @ 375 756=36
488	veor @x[6], @x[6], @t[5] @ 64 5 [7]
489	veor @x[3], @x[3], @t[2] @ 36 2
490	vext.8 @t[5], @t[5], @t[5], #12
491	veor @x[3], @x[3], @t[4] @ 362 4 [5]
492	___
# @y is @x re-ordered for the remaining three multiplications.
493	my @y = @x[7,5,0,2,1,3,4,6];
494	$code.=<<___;
495	@ multiplication by 0x0b
496	veor @y[1], @y[1], @y[0]
497	veor @y[0], @y[0], @t[0]
498	vext.8 @t[2], @t[2], @t[2], #12
499	veor @y[1], @y[1], @t[1]
500	veor @y[0], @y[0], @t[5]
501	vext.8 @t[4], @t[4], @t[4], #12
502	veor @y[1], @y[1], @t[6]
503	veor @y[0], @y[0], @t[7]
504	veor @t[7], @t[7], @t[6] @ clobber t[7]
505
506	veor @y[3], @y[3], @t[0]
507	veor @y[1], @y[1], @y[0]
508	vext.8 @t[0], @t[0], @t[0], #12
509	veor @y[2], @y[2], @t[1]
510	veor @y[4], @y[4], @t[1]
511	vext.8 @t[1], @t[1], @t[1], #12
512	veor @y[2], @y[2], @t[2]
513	veor @y[3], @y[3], @t[2]
514	veor @y[5], @y[5], @t[2]
515	veor @y[2], @y[2], @t[7]
516	vext.8 @t[2], @t[2], @t[2], #12
517	veor @y[3], @y[3], @t[3]
518	veor @y[6], @y[6], @t[3]
519	veor @y[4], @y[4], @t[3]
520	veor @y[7], @y[7], @t[4]
521	vext.8 @t[3], @t[3], @t[3], #12
522	veor @y[5], @y[5], @t[4]
523	veor @y[7], @y[7], @t[7]
524	veor @t[7], @t[7], @t[5] @ clobber t[7] even more
525	veor @y[3], @y[3], @t[5]
526	veor @y[4], @y[4], @t[4]
527
528	veor @y[5], @y[5], @t[7]
529	vext.8 @t[4], @t[4], @t[4], #12
530	veor @y[6], @y[6], @t[7]
531	veor @y[4], @y[4], @t[7]
532
533	veor @t[7], @t[7], @t[5]
534	vext.8 @t[5], @t[5], @t[5], #12
535
536	@ multiplication by 0x0d
537	veor @y[4], @y[4], @y[7]
538	veor @t[7], @t[7], @t[6] @ restore t[7]
539	veor @y[7], @y[7], @t[4]
540	vext.8 @t[6], @t[6], @t[6], #12
541	veor @y[2], @y[2], @t[0]
542	veor @y[7], @y[7], @t[5]
543	vext.8 @t[7], @t[7], @t[7], #12
544	veor @y[2], @y[2], @t[2]
545
546	veor @y[3], @y[3], @y[1]
547	veor @y[1], @y[1], @t[1]
548	veor @y[0], @y[0], @t[0]
549	veor @y[3], @y[3], @t[0]
550	veor @y[1], @y[1], @t[5]
551	veor @y[0], @y[0], @t[5]
552	vext.8 @t[0], @t[0], @t[0], #12
553	veor @y[1], @y[1], @t[7]
554	veor @y[0], @y[0], @t[6]
555	veor @y[3], @y[3], @y[1]
556	veor @y[4], @y[4], @t[1]
557	vext.8 @t[1], @t[1], @t[1], #12
558
559	veor @y[7], @y[7], @t[7]
560	veor @y[4], @y[4], @t[2]
561	veor @y[5], @y[5], @t[2]
562	veor @y[2], @y[2], @t[6]
563	veor @t[6], @t[6], @t[3] @ clobber t[6]
564	vext.8 @t[2], @t[2], @t[2], #12
565	veor @y[4], @y[4], @y[7]
566	veor @y[3], @y[3], @t[6]
567
568	veor @y[6], @y[6], @t[6]
569	veor @y[5], @y[5], @t[5]
570	vext.8 @t[5], @t[5], @t[5], #12
571	veor @y[6], @y[6], @t[4]
572	vext.8 @t[4], @t[4], @t[4], #12
573	veor @y[5], @y[5], @t[6]
574	veor @y[6], @y[6], @t[7]
575	vext.8 @t[7], @t[7], @t[7], #12
576	veor @t[6], @t[6], @t[3] @ restore t[6]
577	vext.8 @t[3], @t[3], @t[3], #12
578
579	@ multiplication by 0x09
580	veor @y[4], @y[4], @y[1]
581	veor @t[1], @t[1], @y[1] @ t[1]=y[1]
582	veor @t[0], @t[0], @t[5] @ clobber t[0]
583	vext.8 @t[6], @t[6], @t[6], #12
584	veor @t[1], @t[1], @t[5]
585	veor @y[3], @y[3], @t[0]
586	veor @t[0], @t[0], @y[0] @ t[0]=y[0]
587	veor @t[1], @t[1], @t[6]
588	veor @t[6], @t[6], @t[7] @ clobber t[6]
589	veor @y[4], @y[4], @t[1]
590	veor @y[7], @y[7], @t[4]
591	veor @y[6], @y[6], @t[3]
592	veor @y[5], @y[5], @t[2]
593	veor @t[4], @t[4], @y[4] @ t[4]=y[4]
594	veor @t[3], @t[3], @y[3] @ t[3]=y[3]
595	veor @t[5], @t[5], @y[5] @ t[5]=y[5]
596	veor @t[2], @t[2], @y[2] @ t[2]=y[2]
597	veor @t[3], @t[3], @t[7]
598	veor @XMM[5], @t[5], @t[6]
599	veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
600	veor @XMM[2], @t[2], @t[6]
601	veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
602
603	vmov @XMM[0], @t[0]
604	vmov @XMM[1], @t[1]
605	@ vmov @XMM[2], @t[2]
606	vmov @XMM[3], @t[3]
607	vmov @XMM[4], @t[4]
608	@ vmov @XMM[5], @t[5]
609	@ vmov @XMM[6], @t[6]
610	@ vmov @XMM[7], @t[7]
611	___
612	}
613
sub InvMixColumns {
    # Emit InvMixColumns as a cheap pre-multiplication followed by the
    # forward MixColumns template (called with its "inverse" flag set).
    #   args 0..7  : state registers
    #   args 8..15 : eight scratch registers
    #
    # Thanks to Jussi Kivilinna for providing pointer to
    #
    # | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
    # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
    # | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
    # | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
    my @st  = @_[0..7];
    my @rot = @_[8..15];

    $code.=<<___;
@ multiplication by 0x05-0x00-0x04-0x00
vext.8 $rot[0], $st[0], $st[0], #8
vext.8 $rot[6], $st[6], $st[6], #8
vext.8 $rot[7], $st[7], $st[7], #8
veor $rot[0], $rot[0], $st[0]
vext.8 $rot[1], $st[1], $st[1], #8
veor $rot[6], $rot[6], $st[6]
vext.8 $rot[2], $st[2], $st[2], #8
veor $rot[7], $rot[7], $st[7]
vext.8 $rot[3], $st[3], $st[3], #8
veor $rot[1], $rot[1], $st[1]
vext.8 $rot[4], $st[4], $st[4], #8
veor $rot[2], $rot[2], $st[2]
vext.8 $rot[5], $st[5], $st[5], #8
veor $rot[3], $rot[3], $st[3]
veor $rot[4], $rot[4], $st[4]
veor $rot[5], $rot[5], $st[5]

veor $st[0], $st[0], $rot[6]
veor $st[1], $st[1], $rot[6]
veor $st[2], $st[2], $rot[0]
veor $st[4], $st[4], $rot[2]
veor $st[3], $st[3], $rot[1]
veor $st[1], $st[1], $rot[7]
veor $st[2], $st[2], $rot[7]
veor $st[4], $st[4], $rot[6]
veor $st[5], $st[5], $rot[3]
veor $st[3], $st[3], $rot[6]
veor $st[6], $st[6], $rot[4]
veor $st[4], $st[4], $rot[7]
veor $st[5], $st[5], $rot[7]
veor $st[7], $st[7], $rot[5]
___
    MixColumns(@st, @rot, 1);    # flipped 2<->3 and 4<->6
}
661
sub swapmove {
    # Emit a masked exchange between two registers: the bits of $b shifted
    # right by $n and the bits of $a are swapped wherever $mask is set.
    #   args: a, b, shift count n, mask register, scratch register
    my ($lo, $hi, $shift, $msk, $tmp) = @_;

    $code .= join("\n",
        "vshr.u64 $tmp, $hi, #$shift",
        "veor $tmp, $tmp, $lo",
        "vand $tmp, $tmp, $msk",
        "veor $lo, $lo, $tmp",
        "vshl.u64 $tmp, $tmp, #$shift",
        "veor $hi, $hi, $tmp",
    ) . "\n";
}
sub swapmove2x {
    # Two interleaved swapmove sequences sharing one shift count and mask.
    #   args: a0, b0, a1, b1, shift count n, mask register, scratch0, scratch1
    my ($lo0, $hi0, $lo1, $hi1, $shift, $msk, $tmp0, $tmp1) = @_;

    $code .= join("\n",
        "vshr.u64 $tmp0, $hi0, #$shift",
        "vshr.u64 $tmp1, $hi1, #$shift",
        "veor $tmp0, $tmp0, $lo0",
        "veor $tmp1, $tmp1, $lo1",
        "vand $tmp0, $tmp0, $msk",
        "vand $tmp1, $tmp1, $msk",
        "veor $lo0, $lo0, $tmp0",
        "vshl.u64 $tmp0, $tmp0, #$shift",
        "veor $lo1, $lo1, $tmp1",
        "vshl.u64 $tmp1, $tmp1, #$shift",
        "veor $hi0, $hi0, $tmp0",
        "veor $hi1, $hi1, $tmp1",
    ) . "\n";
}
690
sub bitslice {
    # Emit the bit-slicing transform of eight registers: three rounds of
    # paired swapmoves with shift counts 1, 2 and 4 and masks 0x55, 0x33, 0x0f.
    #   args 0..7  : data registers (processed in reversed order)
    #   args 8..11 : two mask registers followed by two scratch registers
    my @r = reverse(@_[0..7]);
    my ($mask_a, $mask_b, $tmp0, $tmp1) = @_[8..11];

    $code .= "vmov.i8 $mask_a,#0x55 \@ compose .LBS0\n"
           . "vmov.i8 $mask_b,#0x33 \@ compose .LBS1\n";
    swapmove2x(@r[0,1,2,3], 1, $mask_a, $tmp0, $tmp1);
    swapmove2x(@r[4,5,6,7], 1, $mask_a, $tmp0, $tmp1);

    # The first mask register is free again; reuse it for the 4-bit mask.
    $code .= "vmov.i8 $mask_a,#0x0f \@ compose .LBS2\n";
    swapmove2x(@r[0,2,1,3], 2, $mask_b, $tmp0, $tmp1);
    swapmove2x(@r[4,6,5,7], 2, $mask_b, $tmp0, $tmp1);

    swapmove2x(@r[0,4,1,5], 4, $mask_a, $tmp0, $tmp1);
    swapmove2x(@r[2,6,3,7], 4, $mask_a, $tmp0, $tmp1);
}
709
# File preamble plus the head of _bsaes_decrypt8.
# The preamble defines the VFP_ABI_* macros (saving d8-d15 outside the kernel,
# empty under __KERNEL__), maps adrl to adr for Thumb, and selects
# armv7-a/NEON and the instruction set.  _bsaes_decrypt8 then loads the
# round 0 key from [$key], points $const at .LM0ISR, XORs the key into the
# eight input registers q0-q7 and permutes each through the .LM0ISR table
# with vtbl.8.
710	$code.=<<___;
711	#ifndef __KERNEL__
712	# include "arm_arch.h"
713
714	# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
715	# define VFP_ABI_POP vldmia sp!,{d8-d15}
716	# define VFP_ABI_FRAME 0x40
717	#else
718	# define VFP_ABI_PUSH
719	# define VFP_ABI_POP
720	# define VFP_ABI_FRAME 0
721	# define BSAES_ASM_EXTENDED_KEY
722	# define XTS_CHAIN_TWEAK
723	# define __ARM_ARCH__ __LINUX_ARM_ARCH__
724	# define __ARM_MAX_ARCH__ 7
725	#endif
726
727	#ifdef __thumb__
728	# define adrl adr
729	#endif
730
731	#if __ARM_MAX_ARCH__>=7
732	.arch armv7-a
733	.fpu neon
734
735	.text
736	.syntax unified @ ARMv7-capable assembler is expected to handle this
737	#if defined(__thumb2__) && !defined(__APPLE__)
738	.thumb
739	#else
740	.code 32
741	# undef __thumb2__
742	#endif
743
744	.type _bsaes_decrypt8,%function
745	.align 4
746	_bsaes_decrypt8:
747	adr $const,_bsaes_decrypt8
748	vldmia $key!, {@XMM[9]} @ round 0 key
749	#ifdef __APPLE__
750	adr $const,.LM0ISR
751	#else
752	add $const,$const,#.LM0ISR-_bsaes_decrypt8
753	#endif
754
755	vldmia $const!, {@XMM[8]} @ .LM0ISR
756	veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
757	veor @XMM[11], @XMM[1], @XMM[9]
758	vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
759	vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
760	veor @XMM[12], @XMM[2], @XMM[9]
761	vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
762	vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
763	veor @XMM[13], @XMM[3], @XMM[9]
764	vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
765	vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
766	veor @XMM[14], @XMM[4], @XMM[9]
767	vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
768	vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
769	veor @XMM[15], @XMM[5], @XMM[9]
770	vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
771	vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
772	veor @XMM[10], @XMM[6], @XMM[9]
773	vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
774	vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
775	veor @XMM[11], @XMM[7], @XMM[9]
776	vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
777	vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
778	vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
779	vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
780	___
# Round loop of _bsaes_decrypt8.
# The state in q0-q7 is bit-sliced (q8-q11 as masks/scratch); the first pass
# jumps straight to .Ldec_sbox, later passes go through ShiftRows first.
781	&bitslice (@XMM[0..7, 8..11]);
782	$code.=<<___;
783	sub $rounds,$rounds,#1
784	b .Ldec_sbox
785	.align 4
786	.Ldec_loop:
787	___
# q12 carries the permutation mask for ShiftRows (it pops its last argument).
788	&ShiftRows (@XMM[0..7, 8..12]);
789	$code.=".Ldec_sbox:\n";
790	&InvSbox (@XMM[0..7, 8..15]);
# Leave the loop once the round counter underflows.
791	$code.=<<___;
792	subs $rounds,$rounds,#1
793	bcc .Ldec_done
794	___
# Register order matches the output order documented on InvSbox.
795	&InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
# Load the .LISR mask for the next pass; when the counter has just reached
# zero, step $const by 16 bytes and load .LISRM0 instead.
796	$code.=<<___;
797	vldmia $const, {@XMM[12]} @ .LISR
798	ite eq @ Thumb2 thing, sanity check in ARM
799	addeq $const,$const,#0x10
800	bne .Ldec_loop
801	vldmia $const, {@XMM[12]} @ .LISRM0
802	b .Ldec_loop
803	.align 4
804	.Ldec_done:
805	___
# Undo the bit-slicing, using the register order InvSbox left the state in.
806	&bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
# Tail of _bsaes_decrypt8 (XOR with the last round key, return), the
# _bsaes_const table of vtbl.8 permutation masks, and the head of
# _bsaes_encrypt8.  The encrypt head mirrors the decrypt head but uses the
# .LM0SR mask; it also defines the _bsaes_encrypt8_alt and
# _bsaes_encrypt8_bitslice labels as additional entry points.
807	$code.=<<___;
808	vldmia $key, {@XMM[8]} @ last round key
809	veor @XMM[6], @XMM[6], @XMM[8]
810	veor @XMM[4], @XMM[4], @XMM[8]
811	veor @XMM[2], @XMM[2], @XMM[8]
812	veor @XMM[7], @XMM[7], @XMM[8]
813	veor @XMM[3], @XMM[3], @XMM[8]
814	veor @XMM[5], @XMM[5], @XMM[8]
815	veor @XMM[0], @XMM[0], @XMM[8]
816	veor @XMM[1], @XMM[1], @XMM[8]
817	bx lr
818	.size _bsaes_decrypt8,.-_bsaes_decrypt8
819
820	.type _bsaes_const,%object
821	.align 6
822	_bsaes_const:
823	.LM0ISR: @ InvShiftRows constants
824	.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
825	.LISR:
826	.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
827	.LISRM0:
828	.quad 0x01040b0e0205080f, 0x0306090c00070a0d
829	.LM0SR: @ ShiftRows constants
830	.quad 0x0a0e02060f03070b, 0x0004080c05090d01
831	.LSR:
832	.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
833	.LSRM0:
834	.quad 0x0304090e00050a0f, 0x01060b0c0207080d
835	.LM0:
836	.quad 0x02060a0e03070b0f, 0x0004080c0105090d
837	.LREVM0SR:
838	.quad 0x090d01050c000408, 0x03070b0f060a0e02
839	.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
840	.align 6
841	.size _bsaes_const,.-_bsaes_const
842
843	.type _bsaes_encrypt8,%function
844	.align 4
845	_bsaes_encrypt8:
846	adr $const,_bsaes_encrypt8
847	vldmia $key!, {@XMM[9]} @ round 0 key
848	#ifdef __APPLE__
849	adr $const,.LM0SR
850	#else
851	sub $const,$const,#_bsaes_encrypt8-.LM0SR
852	#endif
853
854	vldmia $const!, {@XMM[8]} @ .LM0SR
855	_bsaes_encrypt8_alt:
856	veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
857	veor @XMM[11], @XMM[1], @XMM[9]
858	vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
859	vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
860	veor @XMM[12], @XMM[2], @XMM[9]
861	vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
862	vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
863	veor @XMM[13], @XMM[3], @XMM[9]
864	vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
865	vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
866	veor @XMM[14], @XMM[4], @XMM[9]
867	vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
868	vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
869	veor @XMM[15], @XMM[5], @XMM[9]
870	vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
871	vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
872	veor @XMM[10], @XMM[6], @XMM[9]
873	vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
874	vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
875	veor @XMM[11], @XMM[7], @XMM[9]
876	vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
877	vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
878	vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
879	vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
880	_bsaes_encrypt8_bitslice:
881	___
# Round loop and tail of _bsaes_encrypt8; same structure as the decrypt
# loop, using Sbox/MixColumns and the .LSR/.LSRM0 masks.
882	&bitslice (@XMM[0..7, 8..11]);
883	$code.=<<___;
884	sub $rounds,$rounds,#1
885	b .Lenc_sbox
886	.align 4
887	.Lenc_loop:
888	___
# q12 carries the permutation mask for ShiftRows (it pops its last argument).
889	&ShiftRows (@XMM[0..7, 8..12]);
890	$code.=".Lenc_sbox:\n";
891	&Sbox (@XMM[0..7, 8..15]);
# Leave the loop once the round counter underflows.
892	$code.=<<___;
893	subs $rounds,$rounds,#1
894	bcc .Lenc_done
895	___
# Register order matches the output order documented on Sbox.
896	&MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
# Load the .LSR mask for the next pass; when the counter has just reached
# zero, step $const by 16 bytes and load .LSRM0 instead.
897	$code.=<<___;
898	vldmia $const, {@XMM[12]} @ .LSR
899	ite eq @ Thumb2 thing, samity check in ARM
900	addeq $const,$const,#0x10
901	bne .Lenc_loop
902	vldmia $const, {@XMM[12]} @ .LSRM0
903	b .Lenc_loop
904	.align 4
905	.Lenc_done:
906	___
907	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
908	&bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
# XOR the last round key into all eight registers and return.
909	$code.=<<___;
910	vldmia $key, {@XMM[8]} @ last round key
911	veor @XMM[4], @XMM[4], @XMM[8]
912	veor @XMM[6], @XMM[6], @XMM[8]
913	veor @XMM[3], @XMM[3], @XMM[8]
914	veor @XMM[7], @XMM[7], @XMM[8]
915	veor @XMM[2], @XMM[2], @XMM[8]
916	veor @XMM[5], @XMM[5], @XMM[8]
917	veor @XMM[0], @XMM[0], @XMM[8]
918	veor @XMM[1], @XMM[1], @XMM[8]
919	bx lr
920	.size _bsaes_encrypt8,.-_bsaes_encrypt8
921	___
922	}
923	{
# Registers used by the _bsaes_key_convert template in this scope:
# destination pointer for the converted schedule (r12), source key pointer
# (r4), round counter (r5) and constant-table pointer (r6).
924	my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
925
sub bitslice_key {
    # Variant of bitslice in which part of the swapmove work is replaced by
    # plain register copies (the skipped calls are kept as assembly comments).
    #   args 0..7  : data registers (processed in reversed order)
    #   args 8..12 : masks for shift counts 1, 2 and 4, then two scratch regs
    # NOTE: the emitted "@ &swapmove..." comment lines interpolate $t0/$t1,
    # which are not declared in this sub; they are kept exactly as they were
    # so the generated text does not change.
    my @k = reverse(@_[0..7]);
    my ($mask1, $mask2, $mask4, $t2, $t3) = @_[8..12];

    swapmove(@k[0,1], 1, $mask1, $t2, $t3);
    $code.=<<___;
@ &swapmove(@k[2,3],1,$t0,$t2,$t3);
vmov $k[2], $k[0]
vmov $k[3], $k[1]
___
    #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

    swapmove2x(@k[0,2,1,3], 2, $mask2, $t2, $t3);
    $code.=<<___;
@ &swapmove2x(@k[4,6,5,7],2,$t1,$t2,$t3);
vmov $k[4], $k[0]
vmov $k[6], $k[2]
vmov $k[5], $k[1]
vmov $k[7], $k[3]
___
    swapmove2x(@k[0,4,1,5], 4, $mask4, $t2, $t3);
    swapmove2x(@k[2,6,3,7], 4, $mask4, $t2, $t3);
}
949
# _bsaes_key_convert: converts a key schedule read from [$inp] into the form
# stored at [$out]; $rounds holds the number of rounds on entry.
#  - The round 0 key is stored unchanged, apart from vrev32.8 under __ARMEL__.
#  - Each following round key is permuted through .LM0 and tested with
#    vtst.8 against the single-bit masks 0x01..0x80, giving eight mask
#    registers; registers 0, 1, 5 and 6 are inverted ("pnot") and all eight
#    are stored.
#  - The loop stops before the last round key, which is left in q15 and not
#    stored; q7 is set to 0x63 in every byte on return.
950	$code.=<<___;
951	.type _bsaes_key_convert,%function
952	.align 4
953	_bsaes_key_convert:
954	adr $const,_bsaes_key_convert
955	vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
956	#ifdef __APPLE__
957	adr $const,.LM0
958	#else
959	sub $const,$const,#_bsaes_key_convert-.LM0
960	#endif
961	vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
962
963	vmov.i8 @XMM[8], #0x01 @ bit masks
964	vmov.i8 @XMM[9], #0x02
965	vmov.i8 @XMM[10], #0x04
966	vmov.i8 @XMM[11], #0x08
967	vmov.i8 @XMM[12], #0x10
968	vmov.i8 @XMM[13], #0x20
969	vldmia $const, {@XMM[14]} @ .LM0
970
971	#ifdef __ARMEL__
972	vrev32.8 @XMM[7], @XMM[7]
973	vrev32.8 @XMM[15], @XMM[15]
974	#endif
975	sub $rounds,$rounds,#1
976	vstmia $out!, {@XMM[7]} @ save round 0 key
977	b .Lkey_loop
978
979	.align 4
980	.Lkey_loop:
981	vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
982	vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
983	vmov.i8 @XMM[6], #0x40
984	vmov.i8 @XMM[15], #0x80
985
986	vtst.8 @XMM[0], @XMM[7], @XMM[8]
987	vtst.8 @XMM[1], @XMM[7], @XMM[9]
988	vtst.8 @XMM[2], @XMM[7], @XMM[10]
989	vtst.8 @XMM[3], @XMM[7], @XMM[11]
990	vtst.8 @XMM[4], @XMM[7], @XMM[12]
991	vtst.8 @XMM[5], @XMM[7], @XMM[13]
992	vtst.8 @XMM[6], @XMM[7], @XMM[6]
993	vtst.8 @XMM[7], @XMM[7], @XMM[15]
994	vld1.8 {@XMM[15]}, [$inp]! @ load next round key
995	vmvn @XMM[0], @XMM[0] @ "pnot"
996	vmvn @XMM[1], @XMM[1]
997	vmvn @XMM[5], @XMM[5]
998	vmvn @XMM[6], @XMM[6]
999	#ifdef __ARMEL__
1000	vrev32.8 @XMM[15], @XMM[15]
1001	#endif
1002	subs $rounds,$rounds,#1
1003	vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
1004	bne .Lkey_loop
1005
1006	vmov.i8 @XMM[7],#0x63 @ compose .L63
1007	@ don't save last round key
1008	bx lr
1009	.size _bsaes_key_convert,.-_bsaes_key_convert
1010	___
1011	}
1012
# Disabled block: the condition is the constant 0, so none of the text below
# is appended to $code.  It holds four benchmarking entry points:
#  - bsaes_enc_key_convert / bsaes_dec_key_convert wrap _bsaes_key_convert
#    and fix up the last (encrypt) or first (decrypt) round key with the
#    value left in q7.
#  - bsaes_encrypt_128 / bsaes_decrypt_128 process 0x80 bytes (eight blocks)
#    per loop pass with a hard-coded round count of 10.
# $inp/$out/$len/$key here are the file-level names (r0-r3).
1013	if (0) { # following four functions are unsupported interface
1014	# used for benchmarking...
1015	$code.=<<___;
1016	.globl bsaes_enc_key_convert
1017	.type bsaes_enc_key_convert,%function
1018	.align 4
1019	bsaes_enc_key_convert:
1020	stmdb sp!,{r4-r6,lr}
1021	vstmdb sp!,{d8-d15} @ ABI specification says so
1022
1023	ldr r5,[$inp,#240] @ pass rounds
1024	mov r4,$inp @ pass key
1025	mov r12,$out @ pass key schedule
1026	bl _bsaes_key_convert
1027	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1028	vstmia r12, {@XMM[7]} @ save last round key
1029
1030	vldmia sp!,{d8-d15}
1031	ldmia sp!,{r4-r6,pc}
1032	.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1033
1034	.globl bsaes_encrypt_128
1035	.type bsaes_encrypt_128,%function
1036	.align 4
1037	bsaes_encrypt_128:
1038	stmdb sp!,{r4-r6,lr}
1039	vstmdb sp!,{d8-d15} @ ABI specification says so
1040	.Lenc128_loop:
1041	vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1042	vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1043	mov r4,$key @ pass the key
1044	vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1045	mov r5,#10 @ pass rounds
1046	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1047
1048	bl _bsaes_encrypt8
1049
1050	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1051	vst1.8 {@XMM[4]}, [$out]!
1052	vst1.8 {@XMM[6]}, [$out]!
1053	vst1.8 {@XMM[3]}, [$out]!
1054	vst1.8 {@XMM[7]}, [$out]!
1055	vst1.8 {@XMM[2]}, [$out]!
1056	subs $len,$len,#0x80
1057	vst1.8 {@XMM[5]}, [$out]!
1058	bhi .Lenc128_loop
1059
1060	vldmia sp!,{d8-d15}
1061	ldmia sp!,{r4-r6,pc}
1062	.size bsaes_encrypt_128,.-bsaes_encrypt_128
1063
1064	.globl bsaes_dec_key_convert
1065	.type bsaes_dec_key_convert,%function
1066	.align 4
1067	bsaes_dec_key_convert:
1068	stmdb sp!,{r4-r6,lr}
1069	vstmdb sp!,{d8-d15} @ ABI specification says so
1070
1071	ldr r5,[$inp,#240] @ pass rounds
1072	mov r4,$inp @ pass key
1073	mov r12,$out @ pass key schedule
1074	bl _bsaes_key_convert
1075	vldmia $out, {@XMM[6]}
1076	vstmia r12, {@XMM[15]} @ save last round key
1077	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1078	vstmia $out, {@XMM[7]}
1079
1080	vldmia sp!,{d8-d15}
1081	ldmia sp!,{r4-r6,pc}
1082	.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1083
1084	.globl bsaes_decrypt_128
1085	.type bsaes_decrypt_128,%function
1086	.align 4
1087	bsaes_decrypt_128:
1088	stmdb sp!,{r4-r6,lr}
1089	vstmdb sp!,{d8-d15} @ ABI specification says so
1090	.Ldec128_loop:
1091	vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1092	vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1093	mov r4,$key @ pass the key
1094	vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1095	mov r5,#10 @ pass rounds
1096	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1097
1098	bl _bsaes_decrypt8
1099
1100	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1101	vst1.8 {@XMM[6]}, [$out]!
1102	vst1.8 {@XMM[4]}, [$out]!
1103	vst1.8 {@XMM[2]}, [$out]!
1104	vst1.8 {@XMM[7]}, [$out]!
1105	vst1.8 {@XMM[3]}, [$out]!
1106	subs $len,$len,#0x80
1107	vst1.8 {@XMM[5]}, [$out]!
1108	bhi .Ldec128_loop
1109
1110	vldmia sp!,{d8-d15}
1111	ldmia sp!,{r4-r6,pc}
1112	.size bsaes_decrypt_128,.-bsaes_decrypt_128
1113	___
1114	}
1115	{
	# Emits bsaes_cbc_encrypt: bit-sliced AES-CBC, decrypt direction only
	# (the assembly never looks at an "enc" argument; see the comment in
	# the prologue).
	#
	# Register roles, as set up by the two "my" lines below:
	#   $inp=r0  $out=r1  $len=r2  $key=r3   -- the first four arguments
	#   $ivp=r8     IV pointer, loaded from the first stack argument
	#   $fp=r9      points at a 16-byte scratch slot used to carry the IV
	#   $rounds=r10 round count read from offset 240 of the key
	#   $keysched   is sp itself while the on-stack key schedule is live
	#
	# Flow of the emitted code:
	#  1. Unless __KERNEL__ is defined, inputs shorter than 128 bytes are
	#     handed off to AES_cbc_encrypt with a plain branch (tail call).
	#  2. Prologue saves r4-r10,lr and the VFP registers, converts $len
	#     from bytes to 16-byte blocks and reserves the IV scratch slot.
	#  3. Key schedule.  Without BSAES_ASM_EXTENDED_KEY it is built on the
	#     stack just below the scratch slot (rounds*128 - 96 bytes) by
	#     _bsaes_key_convert.  With BSAES_ASM_EXTENDED_KEY it lives at
	#     key+248, and the word at key+244 acts as a "converted" flag:
	#     if it reads 1 the conversion is skipped, otherwise the word is
	#     XORed with 1, stored back, and the conversion is run.
	#  4. Main loop: eight blocks per iteration through _bsaes_decrypt8.
	#     The current IV is parked at $fp, the ciphertext is re-read
	#     after decryption to serve as the CBC chaining values, and the
	#     last ciphertext block of the batch ends up in XMM[15] as the
	#     next IV.  Decrypted blocks are taken from the registers in the
	#     order 0,1,6,4,2,7,3,5.
	#  5. Tail: 1..7 leftover blocks.  Counts 2..7 still go through
	#     _bsaes_decrypt8 and only store the valid outputs; a single
	#     block goes through AES_decrypt with the scratch slot as its
	#     output buffer ($rounds is reused there to hold the real $out).
	#  6. Epilogue: the on-stack key schedule is overwritten with zeros
	#     (non-extended build only), sp is restored, the final IV in
	#     XMM[15] is written back through $ivp, and registers are popped.
	#
	# NOTE(review): in the emitted assembly the comment "sifze of
	# bit-slices key schedule" is a typo for "size of bit-sliced key
	# schedule" (compare the same instruction in
	# bsaes_ctr32_encrypt_blocks).  It is part of the heredoc text and is
	# left untouched here.
1116	my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
1117	my ($keysched)=("sp");
1118
1119	$code.=<<___;
1120	.extern AES_cbc_encrypt
1121	.extern AES_decrypt
1122
1123	.global bsaes_cbc_encrypt
1124	.type bsaes_cbc_encrypt,%function
1125	.align 5
1126	bsaes_cbc_encrypt:
1127	#ifndef __KERNEL__
1128	cmp $len, #128
1129	#ifndef __thumb__
1130	blo AES_cbc_encrypt
1131	#else
1132	bhs 1f
1133	b AES_cbc_encrypt
1134	1:
1135	#endif
1136	#endif
1137
1138	@ it is up to the caller to make sure we are called with enc == 0
1139
1140	mov ip, sp
1141	stmdb sp!, {r4-r10, lr}
1142	VFP_ABI_PUSH
1143	ldr $ivp, [ip] @ IV is 1st arg on the stack
1144	mov $len, $len, lsr#4 @ len in 16 byte blocks
1145	sub sp, #0x10 @ scratch space to carry over the IV
1146	mov $fp, sp @ save sp
1147
1148	ldr $rounds, [$key, #240] @ get # of rounds
1149	#ifndef BSAES_ASM_EXTENDED_KEY
1150	@ allocate the key schedule on the stack
1151	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1152	add r12, #`128-32` @ sifze of bit-slices key schedule
1153
1154	@ populate the key schedule
1155	mov r4, $key @ pass key
1156	mov r5, $rounds @ pass # of rounds
1157	mov sp, r12 @ sp is $keysched
1158	bl _bsaes_key_convert
1159	vldmia $keysched, {@XMM[6]}
1160	vstmia r12, {@XMM[15]} @ save last round key
1161	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1162	vstmia $keysched, {@XMM[7]}
1163	#else
1164	ldr r12, [$key, #244]
1165	eors r12, #1
1166	beq 0f
1167
1168	@ populate the key schedule
1169	str r12, [$key, #244]
1170	mov r4, $key @ pass key
1171	mov r5, $rounds @ pass # of rounds
1172	add r12, $key, #248 @ pass key schedule
1173	bl _bsaes_key_convert
1174	add r4, $key, #248
1175	vldmia r4, {@XMM[6]}
1176	vstmia r12, {@XMM[15]} @ save last round key
1177	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1178	vstmia r4, {@XMM[7]}
1179
1180	.align 2
1181	0:
1182	#endif
1183
1184	vld1.8 {@XMM[15]}, [$ivp] @ load IV
1185	b .Lcbc_dec_loop
1186
1187	.align 4
1188	.Lcbc_dec_loop:
1189	subs $len, $len, #0x8
1190	bmi .Lcbc_dec_loop_finish
1191
1192	vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1193	vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1194	#ifndef BSAES_ASM_EXTENDED_KEY
1195	mov r4, $keysched @ pass the key
1196	#else
1197	add r4, $key, #248
1198	#endif
1199	vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1200	mov r5, $rounds
1201	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
1202	sub $inp, $inp, #0x60
1203	vstmia $fp, {@XMM[15]} @ put aside IV
1204
1205	bl _bsaes_decrypt8
1206
1207	vldmia $fp, {@XMM[14]} @ reload IV
1208	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1209	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1210	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1211	veor @XMM[1], @XMM[1], @XMM[8]
1212	veor @XMM[6], @XMM[6], @XMM[9]
1213	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1214	veor @XMM[4], @XMM[4], @XMM[10]
1215	veor @XMM[2], @XMM[2], @XMM[11]
1216	vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
1217	veor @XMM[7], @XMM[7], @XMM[12]
1218	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1219	veor @XMM[3], @XMM[3], @XMM[13]
1220	vst1.8 {@XMM[6]}, [$out]!
1221	veor @XMM[5], @XMM[5], @XMM[14]
1222	vst1.8 {@XMM[4]}, [$out]!
1223	vst1.8 {@XMM[2]}, [$out]!
1224	vst1.8 {@XMM[7]}, [$out]!
1225	vst1.8 {@XMM[3]}, [$out]!
1226	vst1.8 {@XMM[5]}, [$out]!
1227
1228	b .Lcbc_dec_loop
1229
1230	.Lcbc_dec_loop_finish:
1231	adds $len, $len, #8
1232	beq .Lcbc_dec_done
1233
1234	vld1.8 {@XMM[0]}, [$inp]! @ load input
1235	cmp $len, #2
1236	blo .Lcbc_dec_one
1237	vld1.8 {@XMM[1]}, [$inp]!
1238	#ifndef BSAES_ASM_EXTENDED_KEY
1239	mov r4, $keysched @ pass the key
1240	#else
1241	add r4, $key, #248
1242	#endif
1243	mov r5, $rounds
1244	vstmia $fp, {@XMM[15]} @ put aside IV
1245	beq .Lcbc_dec_two
1246	vld1.8 {@XMM[2]}, [$inp]!
1247	cmp $len, #4
1248	blo .Lcbc_dec_three
1249	vld1.8 {@XMM[3]}, [$inp]!
1250	beq .Lcbc_dec_four
1251	vld1.8 {@XMM[4]}, [$inp]!
1252	cmp $len, #6
1253	blo .Lcbc_dec_five
1254	vld1.8 {@XMM[5]}, [$inp]!
1255	beq .Lcbc_dec_six
1256	vld1.8 {@XMM[6]}, [$inp]!
1257	sub $inp, $inp, #0x70
1258
1259	bl _bsaes_decrypt8
1260
1261	vldmia $fp, {@XMM[14]} @ reload IV
1262	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1263	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1264	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1265	veor @XMM[1], @XMM[1], @XMM[8]
1266	veor @XMM[6], @XMM[6], @XMM[9]
1267	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1268	veor @XMM[4], @XMM[4], @XMM[10]
1269	veor @XMM[2], @XMM[2], @XMM[11]
1270	vld1.8 {@XMM[15]}, [$inp]!
1271	veor @XMM[7], @XMM[7], @XMM[12]
1272	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1273	veor @XMM[3], @XMM[3], @XMM[13]
1274	vst1.8 {@XMM[6]}, [$out]!
1275	vst1.8 {@XMM[4]}, [$out]!
1276	vst1.8 {@XMM[2]}, [$out]!
1277	vst1.8 {@XMM[7]}, [$out]!
1278	vst1.8 {@XMM[3]}, [$out]!
1279	b .Lcbc_dec_done
1280	.align 4
1281	.Lcbc_dec_six:
1282	sub $inp, $inp, #0x60
1283	bl _bsaes_decrypt8
1284	vldmia $fp,{@XMM[14]} @ reload IV
1285	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1286	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1287	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1288	veor @XMM[1], @XMM[1], @XMM[8]
1289	veor @XMM[6], @XMM[6], @XMM[9]
1290	vld1.8 {@XMM[12]}, [$inp]!
1291	veor @XMM[4], @XMM[4], @XMM[10]
1292	veor @XMM[2], @XMM[2], @XMM[11]
1293	vld1.8 {@XMM[15]}, [$inp]!
1294	veor @XMM[7], @XMM[7], @XMM[12]
1295	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1296	vst1.8 {@XMM[6]}, [$out]!
1297	vst1.8 {@XMM[4]}, [$out]!
1298	vst1.8 {@XMM[2]}, [$out]!
1299	vst1.8 {@XMM[7]}, [$out]!
1300	b .Lcbc_dec_done
1301	.align 4
1302	.Lcbc_dec_five:
1303	sub $inp, $inp, #0x50
1304	bl _bsaes_decrypt8
1305	vldmia $fp, {@XMM[14]} @ reload IV
1306	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1307	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1308	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1309	veor @XMM[1], @XMM[1], @XMM[8]
1310	veor @XMM[6], @XMM[6], @XMM[9]
1311	vld1.8 {@XMM[15]}, [$inp]!
1312	veor @XMM[4], @XMM[4], @XMM[10]
1313	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1314	veor @XMM[2], @XMM[2], @XMM[11]
1315	vst1.8 {@XMM[6]}, [$out]!
1316	vst1.8 {@XMM[4]}, [$out]!
1317	vst1.8 {@XMM[2]}, [$out]!
1318	b .Lcbc_dec_done
1319	.align 4
1320	.Lcbc_dec_four:
1321	sub $inp, $inp, #0x40
1322	bl _bsaes_decrypt8
1323	vldmia $fp, {@XMM[14]} @ reload IV
1324	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1325	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1326	vld1.8 {@XMM[10]}, [$inp]!
1327	veor @XMM[1], @XMM[1], @XMM[8]
1328	veor @XMM[6], @XMM[6], @XMM[9]
1329	vld1.8 {@XMM[15]}, [$inp]!
1330	veor @XMM[4], @XMM[4], @XMM[10]
1331	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1332	vst1.8 {@XMM[6]}, [$out]!
1333	vst1.8 {@XMM[4]}, [$out]!
1334	b .Lcbc_dec_done
1335	.align 4
1336	.Lcbc_dec_three:
1337	sub $inp, $inp, #0x30
1338	bl _bsaes_decrypt8
1339	vldmia $fp, {@XMM[14]} @ reload IV
1340	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1341	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1342	vld1.8 {@XMM[15]}, [$inp]!
1343	veor @XMM[1], @XMM[1], @XMM[8]
1344	veor @XMM[6], @XMM[6], @XMM[9]
1345	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1346	vst1.8 {@XMM[6]}, [$out]!
1347	b .Lcbc_dec_done
1348	.align 4
1349	.Lcbc_dec_two:
1350	sub $inp, $inp, #0x20
1351	bl _bsaes_decrypt8
1352	vldmia $fp, {@XMM[14]} @ reload IV
1353	vld1.8 {@XMM[8]}, [$inp]! @ reload input
1354	veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1355	vld1.8 {@XMM[15]}, [$inp]! @ reload input
1356	veor @XMM[1], @XMM[1], @XMM[8]
1357	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1358	b .Lcbc_dec_done
1359	.align 4
1360	.Lcbc_dec_one:
1361	sub $inp, $inp, #0x10
1362	mov $rounds, $out @ save original out pointer
1363	mov $out, $fp @ use the iv scratch space as out buffer
1364	mov r2, $key
1365	vmov @XMM[4],@XMM[15] @ just in case ensure that IV
1366	vmov @XMM[5],@XMM[0] @ and input are preserved
1367	bl AES_decrypt
1368	vld1.8 {@XMM[0]}, [$fp] @ load result
1369	veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
1370	vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
1371	vst1.8 {@XMM[0]}, [$rounds] @ write output
1372
1373	.Lcbc_dec_done:
1374	#ifndef BSAES_ASM_EXTENDED_KEY
1375	vmov.i32 q0, #0
1376	vmov.i32 q1, #0
1377	.Lcbc_dec_bzero: @ wipe key schedule [if any]
1378	vstmia $keysched!, {q0-q1}
1379	cmp $keysched, $fp
1380	bne .Lcbc_dec_bzero
1381	#endif
1382
1383	mov sp, $fp
1384	add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
1385	vst1.8 {@XMM[15]}, [$ivp] @ return IV
1386	VFP_ABI_POP
1387	ldmia sp!, {r4-r10, pc}
1388	.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1389	___
1390	}
1391	{
	# Emits bsaes_ctr32_encrypt_blocks: bit-sliced AES in counter mode
	# with a 32-bit counter held in the last word of the counter block.
	#
	# Register roles, as set up by the "my" lines below:
	#   $inp=r0  $out=r1  $len=r2  $key=r3   -- the first four arguments;
	#             $len is counted in 16-byte blocks (it is compared with
	#             and decremented by 8 per batch, by 1 in the short loop)
	#   $ctr=r8     counter-block pointer from the first stack argument;
	#               after the counter is loaded the register is reused
	#               ("borrowed") as a pointer to the .LREVM0SR constant
	#   $fp=r9      16-byte scratch slot holding the next counter value
	#   $rounds=r10 round count read from offset 240 of the key
	#   $const=r6   constants pointer exchanged with the key-convert and
	#               _bsaes_encrypt8_alt helpers
	#   $keysched   is sp itself
	#
	# Flow of the emitted code:
	#  1. Fewer than 8 blocks: jump to .Lctr_enc_short, which keeps a copy
	#     of the counter block on the stack and calls AES_encrypt once
	#     per block, incrementing the last 32-bit word (byte-swapped on
	#     __ARMEL__) after each block.
	#  2. Otherwise build (or, with BSAES_ASM_EXTENDED_KEY, reuse the
	#     cached copy at key+248 of) the bit-sliced key schedule.  The
	#     counter and the round 0 key are both passed through vrev32.8
	#     and the adjusted round 0 key is stored at $keysched.
	#  3. Main loop: XMM[8]/XMM[9] hold the increments 1 and 2 in the top
	#     32-bit lane; from them counter+0 .. counter+7 are formed in
	#     XMM[0..7] and counter+8 is saved at $fp.  _bsaes_encrypt8_alt
	#     is entered with the round 0 key in XMM[9], .LREVM0SR in XMM[8]
	#     and r4 pointing at the next round key.  Key-stream blocks are
	#     taken from the registers in the order 0,1,4,6,3,7,2,5 and XORed
	#     with the input.
	#  4. When fewer than 8 blocks remain, .Lctr_enc_loop_done XORs and
	#     stores only that many blocks from the same register order.
	#  5. Epilogue: the on-stack key schedule is overwritten with zeros;
	#     in the BSAES_ASM_EXTENDED_KEY build the 32 bytes at sp
	#     (adjusted round 0 key plus counter scratch) are zeroed instead.
	#     The short path likewise zeroes its 32 bytes of stack on exit.
	#
	# The counter block supplied by the caller is only read; no updated
	# counter is written back through the pointer.
1392	my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
1393	my $const = "r6"; # shared with _bsaes_encrypt8_alt
1394	my $keysched = "sp";
1395
1396	$code.=<<___;
1397	.extern AES_encrypt
1398	.global bsaes_ctr32_encrypt_blocks
1399	.type bsaes_ctr32_encrypt_blocks,%function
1400	.align 5
1401	bsaes_ctr32_encrypt_blocks:
1402	cmp $len, #8 @ use plain AES for
1403	blo .Lctr_enc_short @ small sizes
1404
1405	mov ip, sp
1406	stmdb sp!, {r4-r10, lr}
1407	VFP_ABI_PUSH
1408	ldr $ctr, [ip] @ ctr is 1st arg on the stack
1409	sub sp, sp, #0x10 @ scratch space to carry over the ctr
1410	mov $fp, sp @ save sp
1411
1412	ldr $rounds, [$key, #240] @ get # of rounds
1413	#ifndef BSAES_ASM_EXTENDED_KEY
1414	@ allocate the key schedule on the stack
1415	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1416	add r12, #`128-32` @ size of bit-sliced key schedule
1417
1418	@ populate the key schedule
1419	mov r4, $key @ pass key
1420	mov r5, $rounds @ pass # of rounds
1421	mov sp, r12 @ sp is $keysched
1422	bl _bsaes_key_convert
1423	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1424	vstmia r12, {@XMM[7]} @ save last round key
1425
1426	vld1.8 {@XMM[0]}, [$ctr] @ load counter
1427	#ifdef __APPLE__
1428	mov $ctr, #:lower16:(.LREVM0SR-.LM0)
1429	add $ctr, $const, $ctr
1430	#else
1431	add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
1432	#endif
1433	vldmia $keysched, {@XMM[4]} @ load round0 key
1434	#else
1435	ldr r12, [$key, #244]
1436	eors r12, #1
1437	beq 0f
1438
1439	@ populate the key schedule
1440	str r12, [$key, #244]
1441	mov r4, $key @ pass key
1442	mov r5, $rounds @ pass # of rounds
1443	add r12, $key, #248 @ pass key schedule
1444	bl _bsaes_key_convert
1445	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1446	vstmia r12, {@XMM[7]} @ save last round key
1447
1448	.align 2
1449	0: add r12, $key, #248
1450	vld1.8 {@XMM[0]}, [$ctr] @ load counter
1451	adrl $ctr, .LREVM0SR @ borrow $ctr
1452	vldmia r12, {@XMM[4]} @ load round0 key
1453	sub sp, #0x10 @ place for adjusted round0 key
1454	#endif
1455
1456	vmov.i32 @XMM[8],#1 @ compose 1<<96
1457	veor @XMM[9],@XMM[9],@XMM[9]
1458	vrev32.8 @XMM[0],@XMM[0]
1459	vext.8 @XMM[8],@XMM[9],@XMM[8],#4
1460	vrev32.8 @XMM[4],@XMM[4]
1461	vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1462	vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
1463	b .Lctr_enc_loop
1464
1465	.align 4
1466	.Lctr_enc_loop:
1467	vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
1468	vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
1469	vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
1470	vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
1471	vadd.u32 @XMM[4], @XMM[1], @XMM[10]
1472	vadd.u32 @XMM[5], @XMM[2], @XMM[10]
1473	vadd.u32 @XMM[6], @XMM[3], @XMM[10]
1474	vadd.u32 @XMM[7], @XMM[4], @XMM[10]
1475	vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
1476
1477	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1478	@ to flip byte order in 32-bit counter
1479
1480	vldmia $keysched, {@XMM[9]} @ load round0 key
1481	#ifndef BSAES_ASM_EXTENDED_KEY
1482	add r4, $keysched, #0x10 @ pass next round key
1483	#else
1484	add r4, $key, #`248+16`
1485	#endif
1486	vldmia $ctr, {@XMM[8]} @ .LREVM0SR
1487	mov r5, $rounds @ pass rounds
1488	vstmia $fp, {@XMM[10]} @ save next counter
1489	#ifdef __APPLE__
1490	mov $const, #:lower16:(.LREVM0SR-.LSR)
1491	sub $const, $ctr, $const
1492	#else
1493	sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
1494	#endif
1495
1496	bl _bsaes_encrypt8_alt
1497
1498	subs $len, $len, #8
1499	blo .Lctr_enc_loop_done
1500
1501	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
1502	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1503	veor @XMM[0], @XMM[8]
1504	veor @XMM[1], @XMM[9]
1505	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1506	veor @XMM[4], @XMM[10]
1507	veor @XMM[6], @XMM[11]
1508	vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
1509	veor @XMM[3], @XMM[12]
1510	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1511	veor @XMM[7], @XMM[13]
1512	veor @XMM[2], @XMM[14]
1513	vst1.8 {@XMM[4]}, [$out]!
1514	veor @XMM[5], @XMM[15]
1515	vst1.8 {@XMM[6]}, [$out]!
1516	vmov.i32 @XMM[8], #1 @ compose 1<<96
1517	vst1.8 {@XMM[3]}, [$out]!
1518	veor @XMM[9], @XMM[9], @XMM[9]
1519	vst1.8 {@XMM[7]}, [$out]!
1520	vext.8 @XMM[8], @XMM[9], @XMM[8], #4
1521	vst1.8 {@XMM[2]}, [$out]!
1522	vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1523	vst1.8 {@XMM[5]}, [$out]!
1524	vldmia $fp, {@XMM[0]} @ load counter
1525
1526	bne .Lctr_enc_loop
1527	b .Lctr_enc_done
1528
1529	.align 4
1530	.Lctr_enc_loop_done:
1531	add $len, $len, #8
1532	vld1.8 {@XMM[8]}, [$inp]! @ load input
1533	veor @XMM[0], @XMM[8]
1534	vst1.8 {@XMM[0]}, [$out]! @ write output
1535	cmp $len, #2
1536	blo .Lctr_enc_done
1537	vld1.8 {@XMM[9]}, [$inp]!
1538	veor @XMM[1], @XMM[9]
1539	vst1.8 {@XMM[1]}, [$out]!
1540	beq .Lctr_enc_done
1541	vld1.8 {@XMM[10]}, [$inp]!
1542	veor @XMM[4], @XMM[10]
1543	vst1.8 {@XMM[4]}, [$out]!
1544	cmp $len, #4
1545	blo .Lctr_enc_done
1546	vld1.8 {@XMM[11]}, [$inp]!
1547	veor @XMM[6], @XMM[11]
1548	vst1.8 {@XMM[6]}, [$out]!
1549	beq .Lctr_enc_done
1550	vld1.8 {@XMM[12]}, [$inp]!
1551	veor @XMM[3], @XMM[12]
1552	vst1.8 {@XMM[3]}, [$out]!
1553	cmp $len, #6
1554	blo .Lctr_enc_done
1555	vld1.8 {@XMM[13]}, [$inp]!
1556	veor @XMM[7], @XMM[13]
1557	vst1.8 {@XMM[7]}, [$out]!
1558	beq .Lctr_enc_done
1559	vld1.8 {@XMM[14]}, [$inp]
1560	veor @XMM[2], @XMM[14]
1561	vst1.8 {@XMM[2]}, [$out]!
1562
1563	.Lctr_enc_done:
1564	vmov.i32 q0, #0
1565	vmov.i32 q1, #0
1566	#ifndef BSAES_ASM_EXTENDED_KEY
1567	.Lctr_enc_bzero: @ wipe key schedule [if any]
1568	vstmia $keysched!, {q0-q1}
1569	cmp $keysched, $fp
1570	bne .Lctr_enc_bzero
1571	#else
1572	vstmia $keysched, {q0-q1}
1573	#endif
1574
1575	mov sp, $fp
1576	add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
1577	VFP_ABI_POP
1578	ldmia sp!, {r4-r10, pc} @ return
1579
1580	.align 4
1581	.Lctr_enc_short:
1582	ldr ip, [sp] @ ctr pointer is passed on stack
1583	stmdb sp!, {r4-r8, lr}
1584
1585	mov r4, $inp @ copy arguments
1586	mov r5, $out
1587	mov r6, $len
1588	mov r7, $key
1589	ldr r8, [ip, #12] @ load counter LSW
1590	vld1.8 {@XMM[1]}, [ip] @ load whole counter value
1591	#ifdef __ARMEL__
1592	rev r8, r8
1593	#endif
1594	sub sp, sp, #0x10
1595	vst1.8 {@XMM[1]}, [sp] @ copy counter value
1596	sub sp, sp, #0x10
1597
1598	.Lctr_enc_short_loop:
1599	add r0, sp, #0x10 @ input counter value
1600	mov r1, sp @ output on the stack
1601	mov r2, r7 @ key
1602
1603	bl AES_encrypt
1604
1605	vld1.8 {@XMM[0]}, [r4]! @ load input
1606	vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
1607	add r8, r8, #1
1608	#ifdef __ARMEL__
1609	rev r0, r8
1610	str r0, [sp, #0x1c] @ next counter value
1611	#else
1612	str r8, [sp, #0x1c] @ next counter value
1613	#endif
1614	veor @XMM[0],@XMM[0],@XMM[1]
1615	vst1.8 {@XMM[0]}, [r5]! @ store output
1616	subs r6, r6, #1
1617	bne .Lctr_enc_short_loop
1618
1619	vmov.i32 q0, #0
1620	vmov.i32 q1, #0
1621	vstmia sp!, {q0-q1}
1622
1623	ldmia sp!, {r4-r8, pc}
1624	.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1625	___
1626	}
1627	{
1628	######################################################################
1629	# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1630	# const AES_KEY *key1, const AES_KEY *key2,
1631	# const unsigned char iv[16]);
1632	#
1633	my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
1634	my $const="r6"; # returned by _bsaes_key_convert
1635	my $twmask=@XMM[5];
1636	my @T=@XMM[6..7];
1637
1638	$code.=<<___;
1639	.globl bsaes_xts_encrypt
1640	.type bsaes_xts_encrypt,%function
1641	.align 4
1642	bsaes_xts_encrypt:
1643	mov ip, sp
1644	stmdb sp!, {r4-r10, lr} @ 0x20
1645	VFP_ABI_PUSH
1646	mov r6, sp @ future $fp
1647
1648	mov $inp, r0
1649	mov $out, r1
1650	mov $len, r2
1651	mov $key, r3
1652
1653	sub r0, sp, #0x10 @ 0x10
1654	bic r0, #0xf @ align at 16 bytes
1655	mov sp, r0
1656
1657	#ifdef XTS_CHAIN_TWEAK
1658	ldr r0, [ip] @ pointer to input tweak
1659	#else
1660	@ generate initial tweak
1661	ldr r0, [ip, #4] @ iv[]
1662	mov r1, sp
1663	ldr r2, [ip, #0] @ key2
1664	bl AES_encrypt
1665	mov r0,sp @ pointer to initial tweak
1666	#endif
1667
1668	ldr $rounds, [$key, #240] @ get # of rounds
1669	mov $fp, r6
1670	#ifndef BSAES_ASM_EXTENDED_KEY
1671	@ allocate the key schedule on the stack
1672	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1673	@ add r12, #`128-32` @ size of bit-sliced key schedule
1674	sub r12, #`32+16` @ place for tweak[9]
1675
1676	@ populate the key schedule
1677	mov r4, $key @ pass key
1678	mov r5, $rounds @ pass # of rounds
1679	mov sp, r12
1680	add r12, #0x90 @ pass key schedule
1681	bl _bsaes_key_convert
1682	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
1683	vstmia r12, {@XMM[7]} @ save last round key
1684	#else
1685	ldr r12, [$key, #244]
1686	eors r12, #1
1687	beq 0f
1688
1689	str r12, [$key, #244]
1690	mov r4, $key @ pass key
1691	mov r5, $rounds @ pass # of rounds
1692	add r12, $key, #248 @ pass key schedule
1693	bl _bsaes_key_convert
1694	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
1695	vstmia r12, {@XMM[7]}
1696
1697	.align 2
1698	0: sub sp, #0x90 @ place for tweak[9]
1699	#endif
1700
1701	vld1.8 {@XMM[8]}, [r0] @ initial tweak
1702	adr $magic, .Lxts_magic
1703
1704	subs $len, #0x80
1705	blo .Lxts_enc_short
1706	b .Lxts_enc_loop
1707
1708	.align 4
1709	.Lxts_enc_loop:
1710	vldmia $magic, {$twmask} @ load XTS magic
1711	vshr.s64 @T[0], @XMM[8], #63
1712	mov r0, sp
1713	vand @T[0], @T[0], $twmask
1714	___
1715	for($i=9;$i<16;$i++) {
1716	$code.=<<___;
1717	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1718	vst1.64 {@XMM[$i-1]}, [r0,:128]!
1719	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1720	vshr.s64 @T[1], @XMM[$i], #63
1721	veor @XMM[$i], @XMM[$i], @T[0]
1722	vand @T[1], @T[1], $twmask
1723	___
1724	@T=reverse(@T);
1725
1726	$code.=<<___ if ($i>=10);
1727	vld1.8 {@XMM[$i-10]}, [$inp]!
1728	___
1729	$code.=<<___ if ($i>=11);
1730	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1731	___
1732	}
1733	$code.=<<___;
1734	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
1735	vst1.64 {@XMM[15]}, [r0,:128]!
1736	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1737	veor @XMM[8], @XMM[8], @T[0]
1738	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1739
1740	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1741	veor @XMM[5], @XMM[5], @XMM[13]
1742	#ifndef BSAES_ASM_EXTENDED_KEY
1743	add r4, sp, #0x90 @ pass key schedule
1744	#else
1745	add r4, $key, #248 @ pass key schedule
1746	#endif
1747	veor @XMM[6], @XMM[6], @XMM[14]
1748	mov r5, $rounds @ pass rounds
1749	veor @XMM[7], @XMM[7], @XMM[15]
1750	mov r0, sp
1751
1752	bl _bsaes_encrypt8
1753
1754	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1755	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1756	veor @XMM[0], @XMM[0], @XMM[ 8]
1757	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1758	veor @XMM[1], @XMM[1], @XMM[ 9]
1759	veor @XMM[8], @XMM[4], @XMM[10]
1760	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1761	veor @XMM[9], @XMM[6], @XMM[11]
1762	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
1763	veor @XMM[10], @XMM[3], @XMM[12]
1764	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1765	veor @XMM[11], @XMM[7], @XMM[13]
1766	veor @XMM[12], @XMM[2], @XMM[14]
1767	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1768	veor @XMM[13], @XMM[5], @XMM[15]
1769	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
1770
1771	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1772
1773	subs $len, #0x80
1774	bpl .Lxts_enc_loop
1775
1776	.Lxts_enc_short:
1777	adds $len, #0x70
1778	bmi .Lxts_enc_done
1779
1780	vldmia $magic, {$twmask} @ load XTS magic
1781	vshr.s64 @T[0], @XMM[8], #63
1782	mov r0, sp
1783	vand @T[0], @T[0], $twmask
1784	___
1785	for($i=9;$i<16;$i++) {
1786	$code.=<<___;
1787	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1788	vst1.64 {@XMM[$i-1]}, [r0,:128]!
1789	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1790	vshr.s64 @T[1], @XMM[$i], #63
1791	veor @XMM[$i], @XMM[$i], @T[0]
1792	vand @T[1], @T[1], $twmask
1793	___
1794	@T=reverse(@T);
1795
1796	$code.=<<___ if ($i>=10);
1797	vld1.8 {@XMM[$i-10]}, [$inp]!
1798	subs $len, #0x10
1799	bmi .Lxts_enc_`$i-9`
1800	___
1801	$code.=<<___ if ($i>=11);
1802	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1803	___
1804	}
1805	$code.=<<___;
1806	sub $len, #0x10
1807	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
1808
1809	vld1.8 {@XMM[6]}, [$inp]!
1810	veor @XMM[5], @XMM[5], @XMM[13]
1811	#ifndef BSAES_ASM_EXTENDED_KEY
1812	add r4, sp, #0x90 @ pass key schedule
1813	#else
1814	add r4, $key, #248 @ pass key schedule
1815	#endif
1816	veor @XMM[6], @XMM[6], @XMM[14]
1817	mov r5, $rounds @ pass rounds
1818	mov r0, sp
1819
1820	bl _bsaes_encrypt8
1821
1822	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1823	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1824	veor @XMM[0], @XMM[0], @XMM[ 8]
1825	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1826	veor @XMM[1], @XMM[1], @XMM[ 9]
1827	veor @XMM[8], @XMM[4], @XMM[10]
1828	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1829	veor @XMM[9], @XMM[6], @XMM[11]
1830	vld1.64 {@XMM[14]}, [r0,:128]!
1831	veor @XMM[10], @XMM[3], @XMM[12]
1832	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1833	veor @XMM[11], @XMM[7], @XMM[13]
1834	veor @XMM[12], @XMM[2], @XMM[14]
1835	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1836	vst1.8 {@XMM[12]}, [$out]!
1837
1838	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1839	b .Lxts_enc_done
1840	.align 4
1841	.Lxts_enc_6:
1842	veor @XMM[4], @XMM[4], @XMM[12]
1843	#ifndef BSAES_ASM_EXTENDED_KEY
1844	add r4, sp, #0x90 @ pass key schedule
1845	#else
1846	add r4, $key, #248 @ pass key schedule
1847	#endif
1848	veor @XMM[5], @XMM[5], @XMM[13]
1849	mov r5, $rounds @ pass rounds
1850	mov r0, sp
1851
1852	bl _bsaes_encrypt8
1853
1854	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1855	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1856	veor @XMM[0], @XMM[0], @XMM[ 8]
1857	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1858	veor @XMM[1], @XMM[1], @XMM[ 9]
1859	veor @XMM[8], @XMM[4], @XMM[10]
1860	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1861	veor @XMM[9], @XMM[6], @XMM[11]
1862	veor @XMM[10], @XMM[3], @XMM[12]
1863	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1864	veor @XMM[11], @XMM[7], @XMM[13]
1865	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1866
1867	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1868	b .Lxts_enc_done
1869
1870	@ put this in range for both ARM and Thumb mode adr instructions
1871	.align 5
1872	.Lxts_magic:
1873	.quad 1, 0x87
1874
1875	.align 5
1876	.Lxts_enc_5:
1877	veor @XMM[3], @XMM[3], @XMM[11]
1878	#ifndef BSAES_ASM_EXTENDED_KEY
1879	add r4, sp, #0x90 @ pass key schedule
1880	#else
1881	add r4, $key, #248 @ pass key schedule
1882	#endif
1883	veor @XMM[4], @XMM[4], @XMM[12]
1884	mov r5, $rounds @ pass rounds
1885	mov r0, sp
1886
1887	bl _bsaes_encrypt8
1888
1889	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1890	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1891	veor @XMM[0], @XMM[0], @XMM[ 8]
1892	vld1.64 {@XMM[12]}, [r0,:128]!
1893	veor @XMM[1], @XMM[1], @XMM[ 9]
1894	veor @XMM[8], @XMM[4], @XMM[10]
1895	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1896	veor @XMM[9], @XMM[6], @XMM[11]
1897	veor @XMM[10], @XMM[3], @XMM[12]
1898	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1899	vst1.8 {@XMM[10]}, [$out]!
1900
1901	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1902	b .Lxts_enc_done
1903	.align 4
1904	.Lxts_enc_4:
1905	veor @XMM[2], @XMM[2], @XMM[10]
1906	#ifndef BSAES_ASM_EXTENDED_KEY
1907	add r4, sp, #0x90 @ pass key schedule
1908	#else
1909	add r4, $key, #248 @ pass key schedule
1910	#endif
1911	veor @XMM[3], @XMM[3], @XMM[11]
1912	mov r5, $rounds @ pass rounds
1913	mov r0, sp
1914
1915	bl _bsaes_encrypt8
1916
1917	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1918	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1919	veor @XMM[0], @XMM[0], @XMM[ 8]
1920	veor @XMM[1], @XMM[1], @XMM[ 9]
1921	veor @XMM[8], @XMM[4], @XMM[10]
1922	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1923	veor @XMM[9], @XMM[6], @XMM[11]
1924	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1925
1926	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1927	b .Lxts_enc_done
1928	.align 4
1929	.Lxts_enc_3:
1930	veor @XMM[1], @XMM[1], @XMM[9]
1931	#ifndef BSAES_ASM_EXTENDED_KEY
1932	add r4, sp, #0x90 @ pass key schedule
1933	#else
1934	add r4, $key, #248 @ pass key schedule
1935	#endif
1936	veor @XMM[2], @XMM[2], @XMM[10]
1937	mov r5, $rounds @ pass rounds
1938	mov r0, sp
1939
1940	bl _bsaes_encrypt8
1941
1942	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
1943	vld1.64 {@XMM[10]}, [r0,:128]!
1944	veor @XMM[0], @XMM[0], @XMM[ 8]
1945	veor @XMM[1], @XMM[1], @XMM[ 9]
1946	veor @XMM[8], @XMM[4], @XMM[10]
1947	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1948	vst1.8 {@XMM[8]}, [$out]!
1949
1950	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1951	b .Lxts_enc_done
1952	.align 4
1953	.Lxts_enc_2:
1954	veor @XMM[0], @XMM[0], @XMM[8]
1955	#ifndef BSAES_ASM_EXTENDED_KEY
1956	add r4, sp, #0x90 @ pass key schedule
1957	#else
1958	add r4, $key, #248 @ pass key schedule
1959	#endif
1960	veor @XMM[1], @XMM[1], @XMM[9]
1961	mov r5, $rounds @ pass rounds
1962	mov r0, sp
1963
1964	bl _bsaes_encrypt8
1965
1966	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
1967	veor @XMM[0], @XMM[0], @XMM[ 8]
1968	veor @XMM[1], @XMM[1], @XMM[ 9]
1969	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1970
1971	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1972	b .Lxts_enc_done
1973	.align 4
1974	.Lxts_enc_1:
1975	mov r0, sp
1976	veor @XMM[0], @XMM[0], @XMM[8]
1977	mov r1, sp
1978	vst1.8 {@XMM[0]}, [sp,:128]
1979	mov r2, $key
1980	mov r4, $fp @ preserve fp
1981
1982	bl AES_encrypt
1983
1984	vld1.8 {@XMM[0]}, [sp,:128]
1985	veor @XMM[0], @XMM[0], @XMM[8]
1986	vst1.8 {@XMM[0]}, [$out]!
1987	mov $fp, r4
1988
1989	vmov @XMM[8], @XMM[9] @ next round tweak
1990
1991	.Lxts_enc_done:
1992	#ifndef XTS_CHAIN_TWEAK
1993	adds $len, #0x10
1994	beq .Lxts_enc_ret
1995	sub r6, $out, #0x10
1996
1997	.Lxts_enc_steal:
1998	ldrb r0, [$inp], #1
1999	ldrb r1, [$out, #-0x10]
2000	strb r0, [$out, #-0x10]
2001	strb r1, [$out], #1
2002
2003	subs $len, #1
2004	bhi .Lxts_enc_steal
2005
2006	vld1.8 {@XMM[0]}, [r6]
2007	mov r0, sp
2008	veor @XMM[0], @XMM[0], @XMM[8]
2009	mov r1, sp
2010	vst1.8 {@XMM[0]}, [sp,:128]
2011	mov r2, $key
2012	mov r4, $fp @ preserve fp
2013
2014	bl AES_encrypt
2015
2016	vld1.8 {@XMM[0]}, [sp,:128]
2017	veor @XMM[0], @XMM[0], @XMM[8]
2018	vst1.8 {@XMM[0]}, [r6]
2019	mov $fp, r4
2020	#endif
2021
2022	.Lxts_enc_ret:
2023	bic r0, $fp, #0xf
2024	vmov.i32 q0, #0
2025	vmov.i32 q1, #0
2026	#ifdef XTS_CHAIN_TWEAK
2027	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
2028	#endif
2029	.Lxts_enc_bzero: @ wipe key schedule [if any]
2030	vstmia sp!, {q0-q1}
2031	cmp sp, r0
2032	bne .Lxts_enc_bzero
2033
2034	mov sp, $fp
2035	#ifdef XTS_CHAIN_TWEAK
2036	vst1.8 {@XMM[8]}, [r1]
2037	#endif
2038	VFP_ABI_POP
2039	ldmia sp!, {r4-r10, pc} @ return
2040
2041	.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2042
2043	.globl bsaes_xts_decrypt
2044	.type bsaes_xts_decrypt,%function
2045	.align 4
2046	bsaes_xts_decrypt:
2047	mov ip, sp
2048	stmdb sp!, {r4-r10, lr} @ 0x20
2049	VFP_ABI_PUSH
2050	mov r6, sp @ future $fp
2051
2052	mov $inp, r0
2053	mov $out, r1
2054	mov $len, r2
2055	mov $key, r3
2056
2057	sub r0, sp, #0x10 @ 0x10
2058	bic r0, #0xf @ align at 16 bytes
2059	mov sp, r0
2060
2061	#ifdef XTS_CHAIN_TWEAK
2062	ldr r0, [ip] @ pointer to input tweak
2063	#else
2064	@ generate initial tweak
2065	ldr r0, [ip, #4] @ iv[]
2066	mov r1, sp
2067	ldr r2, [ip, #0] @ key2
2068	bl AES_encrypt
2069	mov r0, sp @ pointer to initial tweak
2070	#endif
2071
2072	ldr $rounds, [$key, #240] @ get # of rounds
2073	mov $fp, r6
2074	#ifndef BSAES_ASM_EXTENDED_KEY
2075	@ allocate the key schedule on the stack
2076	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
2077	@ add r12, #`128-32` @ size of bit-sliced key schedule
2078	sub r12, #`32+16` @ place for tweak[9]
2079
2080	@ populate the key schedule
2081	mov r4, $key @ pass key
2082	mov r5, $rounds @ pass # of rounds
2083	mov sp, r12
2084	add r12, #0x90 @ pass key schedule
2085	bl _bsaes_key_convert
2086	add r4, sp, #0x90
2087	vldmia r4, {@XMM[6]}
2088	vstmia r12, {@XMM[15]} @ save last round key
2089	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
2090	vstmia r4, {@XMM[7]}
2091	#else
2092	ldr r12, [$key, #244]
2093	eors r12, #1
2094	beq 0f
2095
2096	str r12, [$key, #244]
2097	mov r4, $key @ pass key
2098	mov r5, $rounds @ pass # of rounds
2099	add r12, $key, #248 @ pass key schedule
2100	bl _bsaes_key_convert
2101	add r4, $key, #248
2102	vldmia r4, {@XMM[6]}
2103	vstmia r12, {@XMM[15]} @ save last round key
2104	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
2105	vstmia r4, {@XMM[7]}
2106
2107	.align 2
2108	0: sub sp, #0x90 @ place for tweak[9]
2109	#endif
2110	vld1.8 {@XMM[8]}, [r0] @ initial tweak
2111	adr $magic, .Lxts_magic
2112
2113	#ifndef XTS_CHAIN_TWEAK
2114	tst $len, #0xf @ if not multiple of 16
2115	it ne @ Thumb2 thing, sanity check in ARM
2116	subne $len, #0x10 @ subtract another 16 bytes
2117	#endif
2118	subs $len, #0x80
2119
2120	blo .Lxts_dec_short
2121	b .Lxts_dec_loop
2122
2123	.align 4
2124	.Lxts_dec_loop:
2125	vldmia $magic, {$twmask} @ load XTS magic
2126	vshr.s64 @T[0], @XMM[8], #63
2127	mov r0, sp
2128	vand @T[0], @T[0], $twmask
2129	___
2130	for($i=9;$i<16;$i++) {
2131	$code.=<<___;
2132	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
2133	vst1.64 {@XMM[$i-1]}, [r0,:128]!
2134	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2135	vshr.s64 @T[1], @XMM[$i], #63
2136	veor @XMM[$i], @XMM[$i], @T[0]
2137	vand @T[1], @T[1], $twmask
2138	___
2139	@T=reverse(@T);
2140
2141	$code.=<<___ if ($i>=10);
2142	vld1.8 {@XMM[$i-10]}, [$inp]!
2143	___
2144	$code.=<<___ if ($i>=11);
2145	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2146	___
2147	}
2148	$code.=<<___;
2149	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
2150	vst1.64 {@XMM[15]}, [r0,:128]!
2151	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2152	veor @XMM[8], @XMM[8], @T[0]
2153	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2154
2155	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
2156	veor @XMM[5], @XMM[5], @XMM[13]
2157	#ifndef BSAES_ASM_EXTENDED_KEY
2158	add r4, sp, #0x90 @ pass key schedule
2159	#else
2160	add r4, $key, #248 @ pass key schedule
2161	#endif
2162	veor @XMM[6], @XMM[6], @XMM[14]
2163	mov r5, $rounds @ pass rounds
2164	veor @XMM[7], @XMM[7], @XMM[15]
2165	mov r0, sp
2166
2167	bl _bsaes_decrypt8
2168
2169	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2170	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2171	veor @XMM[0], @XMM[0], @XMM[ 8]
2172	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
2173	veor @XMM[1], @XMM[1], @XMM[ 9]
2174	veor @XMM[8], @XMM[6], @XMM[10]
2175	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2176	veor @XMM[9], @XMM[4], @XMM[11]
2177	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
2178	veor @XMM[10], @XMM[2], @XMM[12]
2179	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2180	veor @XMM[11], @XMM[7], @XMM[13]
2181	veor @XMM[12], @XMM[3], @XMM[14]
2182	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
2183	veor @XMM[13], @XMM[5], @XMM[15]
2184	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
2185
2186	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2187
2188	subs $len, #0x80
2189	bpl .Lxts_dec_loop
2190
2191	.Lxts_dec_short:
2192	adds $len, #0x70
2193	bmi .Lxts_dec_done
2194
2195	vldmia $magic, {$twmask} @ load XTS magic
2196	vshr.s64 @T[0], @XMM[8], #63
2197	mov r0, sp
2198	vand @T[0], @T[0], $twmask
2199	___
2200	for($i=9;$i<16;$i++) {
2201	$code.=<<___;
2202	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
2203	vst1.64 {@XMM[$i-1]}, [r0,:128]!
2204	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2205	vshr.s64 @T[1], @XMM[$i], #63
2206	veor @XMM[$i], @XMM[$i], @T[0]
2207	vand @T[1], @T[1], $twmask
2208	___
2209	@T=reverse(@T);
2210
2211	$code.=<<___ if ($i>=10);
2212	vld1.8 {@XMM[$i-10]}, [$inp]!
2213	subs $len, #0x10
2214	bmi .Lxts_dec_`$i-9`
2215	___
2216	$code.=<<___ if ($i>=11);
2217	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2218	___
2219	}
2220	$code.=<<___;
2221	sub $len, #0x10
2222	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
2223
2224	vld1.8 {@XMM[6]}, [$inp]!
2225	veor @XMM[5], @XMM[5], @XMM[13]
2226	#ifndef BSAES_ASM_EXTENDED_KEY
2227	add r4, sp, #0x90 @ pass key schedule
2228	#else
2229	add r4, $key, #248 @ pass key schedule
2230	#endif
2231	veor @XMM[6], @XMM[6], @XMM[14]
2232	mov r5, $rounds @ pass rounds
2233	mov r0, sp
2234
2235	bl _bsaes_decrypt8
2236
2237	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2238	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2239	veor @XMM[0], @XMM[0], @XMM[ 8]
2240	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
2241	veor @XMM[1], @XMM[1], @XMM[ 9]
2242	veor @XMM[8], @XMM[6], @XMM[10]
2243	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2244	veor @XMM[9], @XMM[4], @XMM[11]
2245	vld1.64 {@XMM[14]}, [r0,:128]!
2246	veor @XMM[10], @XMM[2], @XMM[12]
2247	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2248	veor @XMM[11], @XMM[7], @XMM[13]
2249	veor @XMM[12], @XMM[3], @XMM[14]
2250	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
2251	vst1.8 {@XMM[12]}, [$out]!
2252
2253	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2254	b .Lxts_dec_done
2255	.align 4
2256	.Lxts_dec_6:
2257	vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
2258
2259	veor @XMM[4], @XMM[4], @XMM[12]
2260	#ifndef BSAES_ASM_EXTENDED_KEY
2261	add r4, sp, #0x90 @ pass key schedule
2262	#else
2263	add r4, $key, #248 @ pass key schedule
2264	#endif
2265	veor @XMM[5], @XMM[5], @XMM[13]
2266	mov r5, $rounds @ pass rounds
2267	mov r0, sp
2268
2269	bl _bsaes_decrypt8
2270
2271	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2272	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2273	veor @XMM[0], @XMM[0], @XMM[ 8]
2274	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
2275	veor @XMM[1], @XMM[1], @XMM[ 9]
2276	veor @XMM[8], @XMM[6], @XMM[10]
2277	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2278	veor @XMM[9], @XMM[4], @XMM[11]
2279	veor @XMM[10], @XMM[2], @XMM[12]
2280	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2281	veor @XMM[11], @XMM[7], @XMM[13]
2282	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
2283
2284	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2285	b .Lxts_dec_done
2286	.align 4
2287	.Lxts_dec_5:
2288	veor @XMM[3], @XMM[3], @XMM[11]
2289	#ifndef BSAES_ASM_EXTENDED_KEY
2290	add r4, sp, #0x90 @ pass key schedule
2291	#else
2292	add r4, $key, #248 @ pass key schedule
2293	#endif
2294	veor @XMM[4], @XMM[4], @XMM[12]
2295	mov r5, $rounds @ pass rounds
2296	mov r0, sp
2297
2298	bl _bsaes_decrypt8
2299
2300	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2301	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2302	veor @XMM[0], @XMM[0], @XMM[ 8]
2303	vld1.64 {@XMM[12]}, [r0,:128]!
2304	veor @XMM[1], @XMM[1], @XMM[ 9]
2305	veor @XMM[8], @XMM[6], @XMM[10]
2306	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2307	veor @XMM[9], @XMM[4], @XMM[11]
2308	veor @XMM[10], @XMM[2], @XMM[12]
2309	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2310	vst1.8 {@XMM[10]}, [$out]!
2311
2312	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2313	b .Lxts_dec_done
2314	.align 4
2315	.Lxts_dec_4:
2316	veor @XMM[2], @XMM[2], @XMM[10]
2317	#ifndef BSAES_ASM_EXTENDED_KEY
2318	add r4, sp, #0x90 @ pass key schedule
2319	#else
2320	add r4, $key, #248 @ pass key schedule
2321	#endif
2322	veor @XMM[3], @XMM[3], @XMM[11]
2323	mov r5, $rounds @ pass rounds
2324	mov r0, sp
2325
2326	bl _bsaes_decrypt8
2327
2328	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2329	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2330	veor @XMM[0], @XMM[0], @XMM[ 8]
2331	veor @XMM[1], @XMM[1], @XMM[ 9]
2332	veor @XMM[8], @XMM[6], @XMM[10]
2333	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2334	veor @XMM[9], @XMM[4], @XMM[11]
2335	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2336
2337	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2338	b .Lxts_dec_done
2339	.align 4
2340	.Lxts_dec_3:
2341	veor @XMM[1], @XMM[1], @XMM[9]
2342	#ifndef BSAES_ASM_EXTENDED_KEY
2343	add r4, sp, #0x90 @ pass key schedule
2344	#else
2345	add r4, $key, #248 @ pass key schedule
2346	#endif
2347	veor @XMM[2], @XMM[2], @XMM[10]
2348	mov r5, $rounds @ pass rounds
2349	mov r0, sp
2350
2351	bl _bsaes_decrypt8
2352
2353	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
2354	vld1.64 {@XMM[10]}, [r0,:128]!
2355	veor @XMM[0], @XMM[0], @XMM[ 8]
2356	veor @XMM[1], @XMM[1], @XMM[ 9]
2357	veor @XMM[8], @XMM[6], @XMM[10]
2358	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2359	vst1.8 {@XMM[8]}, [$out]!
2360
2361	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2362	b .Lxts_dec_done
2363	.align 4
2364	.Lxts_dec_2:
2365	veor @XMM[0], @XMM[0], @XMM[8]
2366	#ifndef BSAES_ASM_EXTENDED_KEY
2367	add r4, sp, #0x90 @ pass key schedule
2368	#else
2369	add r4, $key, #248 @ pass key schedule
2370	#endif
2371	veor @XMM[1], @XMM[1], @XMM[9]
2372	mov r5, $rounds @ pass rounds
2373	mov r0, sp
2374
2375	bl _bsaes_decrypt8
2376
2377	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
2378	veor @XMM[0], @XMM[0], @XMM[ 8]
2379	veor @XMM[1], @XMM[1], @XMM[ 9]
2380	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2381
2382	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2383	b .Lxts_dec_done
2384	.align 4
2385	.Lxts_dec_1:
2386	mov r0, sp
2387	veor @XMM[0], @XMM[0], @XMM[8]
2388	mov r1, sp
2389	vst1.8 {@XMM[0]}, [sp,:128]
2390	mov r5, $magic @ preserve magic
2391	mov r2, $key
2392	mov r4, $fp @ preserve fp
2393
2394	bl AES_decrypt
2395
2396	vld1.8 {@XMM[0]}, [sp,:128]
2397	veor @XMM[0], @XMM[0], @XMM[8]
2398	vst1.8 {@XMM[0]}, [$out]!
2399	mov $fp, r4
2400	mov $magic, r5
2401
2402	vmov @XMM[8], @XMM[9] @ next round tweak
2403
2404	.Lxts_dec_done:
2405	#ifndef XTS_CHAIN_TWEAK
2406	adds $len, #0x10
2407	beq .Lxts_dec_ret
2408
2409	@ calculate one round of extra tweak for the stolen ciphertext
2410	vldmia $magic, {$twmask}
2411	vshr.s64 @XMM[6], @XMM[8], #63
2412	vand @XMM[6], @XMM[6], $twmask
2413	vadd.u64 @XMM[9], @XMM[8], @XMM[8]
2414	vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
2415	veor @XMM[9], @XMM[9], @XMM[6]
2416
2417	@ perform the final decryption with the last tweak value
2418	vld1.8 {@XMM[0]}, [$inp]!
2419	mov r0, sp
2420	veor @XMM[0], @XMM[0], @XMM[9]
2421	mov r1, sp
2422	vst1.8 {@XMM[0]}, [sp,:128]
2423	mov r2, $key
2424	mov r4, $fp @ preserve fp
2425
2426	bl AES_decrypt
2427
2428	vld1.8 {@XMM[0]}, [sp,:128]
2429	veor @XMM[0], @XMM[0], @XMM[9]
2430	vst1.8 {@XMM[0]}, [$out]
2431
2432	mov r6, $out
2433	.Lxts_dec_steal:
2434	ldrb r1, [$out]
2435	ldrb r0, [$inp], #1
2436	strb r1, [$out, #0x10]
2437	strb r0, [$out], #1
2438
2439	subs $len, #1
2440	bhi .Lxts_dec_steal
2441
2442	vld1.8 {@XMM[0]}, [r6]
2443	mov r0, sp
2444	veor @XMM[0], @XMM[8]
2445	mov r1, sp
2446	vst1.8 {@XMM[0]}, [sp,:128]
2447	mov r2, $key
2448
2449	bl AES_decrypt
2450
2451	vld1.8 {@XMM[0]}, [sp,:128]
2452	veor @XMM[0], @XMM[0], @XMM[8]
2453	vst1.8 {@XMM[0]}, [r6]
2454	mov $fp, r4
2455	#endif
2456
2457	.Lxts_dec_ret:
2458	bic r0, $fp, #0xf
2459	vmov.i32 q0, #0
2460	vmov.i32 q1, #0
2461	#ifdef XTS_CHAIN_TWEAK
2462	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
2463	#endif
2464	.Lxts_dec_bzero: @ wipe key schedule [if any]
2465	vstmia sp!, {q0-q1}
2466	cmp sp, r0
2467	bne .Lxts_dec_bzero
2468
2469	mov sp, $fp
2470	#ifdef XTS_CHAIN_TWEAK
2471	vst1.8 {@XMM[8]}, [r1]
2472	#endif
2473	VFP_ABI_POP
2474	ldmia sp!, {r4-r10, pc} @ return
2475
2476	.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2477	___
2478	}
# Append the final "#endif" to the generated text.
# NOTE(review): presumably this closes an #if opened earlier in $code; the
# matching directive is outside this hunk -- confirm.
$code.=<<___;
#endif
___

# Expand every `...` span left in the generated text by evaluating the
# enclosed Perl expression (e.g. computed label suffixes such as `$i-9`, or
# the &Dhi()/&Dlo() register helpers used in the templates above).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Copy this script's leading comment block to the output, rewriting the
# leading '#' of each line to '@' (the comment character used throughout the
# generated assembly).  The "#!" line is skipped, and copying stops at the
# first line that is neither a '#' comment nor empty.
#
# Three-argument open with a lexical handle keeps characters in the script
# path from being interpreted as an open mode.  The header is cosmetic, so a
# failed open is still tolerated silently, as before.
if (open(my $self, '<', $0)) {
    while(<$self>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
    }
    close $self;
}

print $code;

# Buffered write errors (disk full, broken pipe, ...) are only reported when
# the handle is closed.  Fail loudly so the caller does not continue with a
# truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/aes/asm/bsaes-armv7.pl@ 69881

Download in other formats: