aesfx-sparcv9.pl@ 83531

Last change on this file since 83531 was 83531, checked in by vboxsync, 5 years ago
setting svn:sync-process=export for openssl-1.1.1f, all files except tests
File size: 27.5 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the OpenSSL license (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9	#
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	# ====================================================================
16
17	# March 2016
18	#
19	# Initial support for Fujitsu SPARC64 X/X+ comprises minimally
20	# required key setup and single-block procedures.
21	#
22	# April 2016
23	#
24	# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
25	# that parallelizable nature of CBC decrypt and CTR is not utilized
26	# yet. CBC encrypt on the other hand is as good as it can possibly
27	# get processing one byte in 4.1 cycles with 128-bit key on SPARC64 X.
28	# This is ~6x faster than pure software implementation...
29	#
30	# July 2016
31	#
32	# Switch from faligndata to fshiftorx, which allows to omit alignaddr
33	# instructions and improve single-block and short-input performance
34	# with misaligned data.
35
# The last command-line argument names the file that receives the generated
# assembly; everything printed to STDOUT below ends up there.  When no name
# is supplied STDOUT is left untouched, so the output goes wherever the
# caller pointed it.
$output = pop;
if (defined($output) && length($output)) {
    # Three-argument open: the file name can never be mistaken for a mode,
    # and a failure is reported here instead of surfacing at the final close.
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
38
# Template for aes_fx_encrypt and aes_fx_decrypt.  The whole body is one
# interpolating here-document appended to $code; nothing is printed until the
# post-processing loop at the end of the script walks $code line by line.
39	{
# Perl-side names for the SPARC registers %o0..%o5; they are interpolated
# into the assembly text below.
40	my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));
41
42	$code.=<<___;
43	#include "sparc_arch.h"
44
45	#define LOCALS (STACK_BIAS+STACK_FRAME)
46
47	.text
48
49	.globl aes_fx_encrypt
50	.align 32
51	aes_fx_encrypt:
52	and $inp, 7, $tmp ! is input aligned?
53	andn $inp, 7, $inp
54	ldd [$key + 0], %f6 ! round[0]
55	ldd [$key + 8], %f8
56	mov %o7, %g1
57	ld [$key + 240], $rounds
58
59	1: call .+8
60	add %o7, .Linp_align-1b, %o7
61
62	sll $tmp, 3, $tmp
63	ldd [$inp + 0], %f0 ! load input
64	brz,pt $tmp, .Lenc_inp_aligned
65	ldd [$inp + 8], %f2
66
67	ldd [%o7 + $tmp], %f14 ! shift left params
68	ldd [$inp + 16], %f4
69	fshiftorx %f0, %f2, %f14, %f0
70	fshiftorx %f2, %f4, %f14, %f2
71
72	.Lenc_inp_aligned:
73	ldd [$key + 16], %f10 ! round[1]
74	ldd [$key + 24], %f12
75
76	fxor %f0, %f6, %f0 ! ^=round[0]
77	fxor %f2, %f8, %f2
78	ldd [$key + 32], %f6 ! round[2]
79	ldd [$key + 40], %f8
80	add $key, 32, $key
81	sub $rounds, 4, $rounds
82
83	.Loop_enc:
84	fmovd %f0, %f4
85	faesencx %f2, %f10, %f0
86	faesencx %f4, %f12, %f2
87	ldd [$key + 16], %f10
88	ldd [$key + 24], %f12
89	add $key, 32, $key
90
91	fmovd %f0, %f4
92	faesencx %f2, %f6, %f0
93	faesencx %f4, %f8, %f2
94	ldd [$key + 0], %f6
95	ldd [$key + 8], %f8
96
97	brnz,a $rounds, .Loop_enc
98	sub $rounds, 2, $rounds
99
100	andcc $out, 7, $tmp ! is output aligned?
101	andn $out, 7, $out
102	mov 0xff, $mask
103	srl $mask, $tmp, $mask
104	add %o7, 64, %o7
105	sll $tmp, 3, $tmp
106
107	fmovd %f0, %f4
108	faesencx %f2, %f10, %f0
109	faesencx %f4, %f12, %f2
110	ldd [%o7 + $tmp], %f14 ! shift right params
111
112	fmovd %f0, %f4
113	faesenclx %f2, %f6, %f0
114	faesenclx %f4, %f8, %f2
115
116	bnz,pn %icc, .Lenc_out_unaligned
117	mov %g1, %o7
118
119	std %f0, [$out + 0]
120	retl
121	std %f2, [$out + 8]
122
123	.align 16
124	.Lenc_out_unaligned:
125	add $out, 16, $inp
126	orn %g0, $mask, $tmp
127	fshiftorx %f0, %f0, %f14, %f4
128	fshiftorx %f0, %f2, %f14, %f6
129	fshiftorx %f2, %f2, %f14, %f8
130
131	stda %f4, [$out + $mask]0xc0 ! partial store
132	std %f6, [$out + 8]
133	stda %f8, [$inp + $tmp]0xc0 ! partial store
134	retl
135	nop
136	.type aes_fx_encrypt,#function
137	.size aes_fx_encrypt,.-aes_fx_encrypt
138
139	.globl aes_fx_decrypt
140	.align 32
141	aes_fx_decrypt:
142	and $inp, 7, $tmp ! is input aligned?
143	andn $inp, 7, $inp
144	ldd [$key + 0], %f6 ! round[0]
145	ldd [$key + 8], %f8
146	mov %o7, %g1
147	ld [$key + 240], $rounds
148
149	1: call .+8
150	add %o7, .Linp_align-1b, %o7
151
152	sll $tmp, 3, $tmp
153	ldd [$inp + 0], %f0 ! load input
154	brz,pt $tmp, .Ldec_inp_aligned
155	ldd [$inp + 8], %f2
156
157	ldd [%o7 + $tmp], %f14 ! shift left params
158	ldd [$inp + 16], %f4
159	fshiftorx %f0, %f2, %f14, %f0
160	fshiftorx %f2, %f4, %f14, %f2
161
162	.Ldec_inp_aligned:
163	ldd [$key + 16], %f10 ! round[1]
164	ldd [$key + 24], %f12
165
166	fxor %f0, %f6, %f0 ! ^=round[0]
167	fxor %f2, %f8, %f2
168	ldd [$key + 32], %f6 ! round[2]
169	ldd [$key + 40], %f8
170	add $key, 32, $key
171	sub $rounds, 4, $rounds
172
173	.Loop_dec:
174	fmovd %f0, %f4
175	faesdecx %f2, %f10, %f0
176	faesdecx %f4, %f12, %f2
177	ldd [$key + 16], %f10
178	ldd [$key + 24], %f12
179	add $key, 32, $key
180
181	fmovd %f0, %f4
182	faesdecx %f2, %f6, %f0
183	faesdecx %f4, %f8, %f2
184	ldd [$key + 0], %f6
185	ldd [$key + 8], %f8
186
187	brnz,a $rounds, .Loop_dec
188	sub $rounds, 2, $rounds
189
190	andcc $out, 7, $tmp ! is output aligned?
191	andn $out, 7, $out
192	mov 0xff, $mask
193	srl $mask, $tmp, $mask
194	add %o7, 64, %o7
195	sll $tmp, 3, $tmp
196
197	fmovd %f0, %f4
198	faesdecx %f2, %f10, %f0
199	faesdecx %f4, %f12, %f2
200	ldd [%o7 + $tmp], %f14 ! shift right params
201
202	fmovd %f0, %f4
203	faesdeclx %f2, %f6, %f0
204	faesdeclx %f4, %f8, %f2
205
206	bnz,pn %icc, .Ldec_out_unaligned
207	mov %g1, %o7
208
209	std %f0, [$out + 0]
210	retl
211	std %f2, [$out + 8]
212
213	.align 16
214	.Ldec_out_unaligned:
215	add $out, 16, $inp
216	orn %g0, $mask, $tmp
217	fshiftorx %f0, %f0, %f14, %f4
218	fshiftorx %f0, %f2, %f14, %f6
219	fshiftorx %f2, %f2, %f14, %f8
220
221	stda %f4, [$out + $mask]0xc0 ! partial store
222	std %f6, [$out + 8]
223	stda %f8, [$inp + $tmp]0xc0 ! partial store
224	retl
225	nop
226	.type aes_fx_decrypt,#function
227	.size aes_fx_decrypt,.-aes_fx_decrypt
228	___
229	}
# Template for aes_fx_set_encrypt_key and aes_fx_set_decrypt_key.  The
# decrypt entry point branches into the encrypt code with $inc set to -1
# (the encrypt entry sets it to 1); $inc is then scaled to +16/-16 and used
# as the stride with which the schedule is stored.
230	{
# Five names are taken from the six registers %o0..%o5; %o5 stays unnamed.
231	my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
232	$code.=<<___;
233	.globl aes_fx_set_decrypt_key
234	.align 32
235	aes_fx_set_decrypt_key:
236	b .Lset_encrypt_key
237	mov -1, $inc
238	retl
239	nop
240	.type aes_fx_set_decrypt_key,#function
241	.size aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key
242
243	.globl aes_fx_set_encrypt_key
244	.align 32
245	aes_fx_set_encrypt_key:
246	mov 1, $inc
247	nop
248	.Lset_encrypt_key:
249	and $inp, 7, $tmp
250	andn $inp, 7, $inp
251	sll $tmp, 3, $tmp
252	mov %o7, %g1
253
254	1: call .+8
255	add %o7, .Linp_align-1b, %o7
256
257	ldd [%o7 + $tmp], %f10 ! shift left params
258	mov %g1, %o7
259
260	cmp $bits, 192
261	ldd [$inp + 0], %f0
262	bl,pt %icc, .L128
263	ldd [$inp + 8], %f2
264
265	be,pt %icc, .L192
266	ldd [$inp + 16], %f4
267	brz,pt $tmp, .L256aligned
268	ldd [$inp + 24], %f6
269
270	ldd [$inp + 32], %f8
271	fshiftorx %f0, %f2, %f10, %f0
272	fshiftorx %f2, %f4, %f10, %f2
273	fshiftorx %f4, %f6, %f10, %f4
274	fshiftorx %f6, %f8, %f10, %f6
275
276	.L256aligned:
277	mov 14, $bits
278	and $inc, `14*16`, $tmp
279	st $bits, [$out + 240] ! store rounds
280	add $out, $tmp, $out ! start or end of key schedule
281	sllx $inc, 4, $inc ! 16 or -16
282	___
# 256-bit path: six unrolled expansion steps.  $i is interpolated into the
# text now; the back-quoted arithmetic (`0x10+$i`) is evaluated later by the
# post-processing loop.
283	for ($i=0; $i<6; $i++) {
284	$code.=<<___;
285	std %f0, [$out + 0]
286	faeskeyx %f6, `0x10+$i`, %f0
287	std %f2, [$out + 8]
288	add $out, $inc, $out
289	faeskeyx %f0, 0x00, %f2
290	std %f4, [$out + 0]
291	faeskeyx %f2, 0x01, %f4
292	std %f6, [$out + 8]
293	add $out, $inc, $out
294	faeskeyx %f4, 0x00, %f6
295	___
296	}
# $i is a package variable, so it still holds 6 here and the tail below uses
# `0x10+6` for the final step.
297	$code.=<<___;
298	std %f0, [$out + 0]
299	faeskeyx %f6, `0x10+$i`, %f0
300	std %f2, [$out + 8]
301	add $out, $inc, $out
302	faeskeyx %f0, 0x00, %f2
303	std %f4,[$out + 0]
304	std %f6,[$out + 8]
305	add $out, $inc, $out
306	std %f0,[$out + 0]
307	std %f2,[$out + 8]
308	retl
309	xor %o0, %o0, %o0 ! return 0
310
311	.align 16
312	.L192:
313	brz,pt $tmp, .L192aligned
314	nop
315
316	ldd [$inp + 24], %f6
317	fshiftorx %f0, %f2, %f10, %f0
318	fshiftorx %f2, %f4, %f10, %f2
319	fshiftorx %f4, %f6, %f10, %f4
320
321	.L192aligned:
322	mov 12, $bits
323	and $inc, `12*16`, $tmp
324	st $bits, [$out + 240] ! store rounds
325	add $out, $tmp, $out ! start or end of key schedule
326	sllx $inc, 4, $inc ! 16 or -16
327	___
# 192-bit path: four unrolled iterations, each consuming two immediates
# ($i and $i+1).  The trailing faeskeyx is left out of the last iteration.
328	for ($i=0; $i<8; $i+=2) {
329	$code.=<<___;
330	std %f0, [$out + 0]
331	faeskeyx %f4, `0x10+$i`, %f0
332	std %f2, [$out + 8]
333	add $out, $inc, $out
334	faeskeyx %f0, 0x00, %f2
335	std %f4, [$out + 0]
336	faeskeyx %f2, 0x00, %f4
337	std %f0, [$out + 8]
338	add $out, $inc, $out
339	faeskeyx %f4, `0x10+$i+1`, %f0
340	std %f2, [$out + 0]
341	faeskeyx %f0, 0x00, %f2
342	std %f4, [$out + 8]
343	add $out, $inc, $out
344	___
345	$code.=<<___ if ($i<6);
346	faeskeyx %f2, 0x00, %f4
347	___
348	}
349	$code.=<<___;
350	std %f0, [$out + 0]
351	std %f2, [$out + 8]
352	retl
353	xor %o0, %o0, %o0 ! return 0
354
355	.align 16
356	.L128:
357	brz,pt $tmp, .L128aligned
358	nop
359
360	ldd [$inp + 16], %f4
361	fshiftorx %f0, %f2, %f10, %f0
362	fshiftorx %f2, %f4, %f10, %f2
363
364	.L128aligned:
365	mov 10, $bits
366	and $inc, `10*16`, $tmp
367	st $bits, [$out + 240] ! store rounds
368	add $out, $tmp, $out ! start or end of key schedule
369	sllx $inc, 4, $inc ! 16 or -16
370	___
# 128-bit path: ten unrolled expansion steps, one per immediate 0x10..0x19.
371	for ($i=0; $i<10; $i++) {
372	$code.=<<___;
373	std %f0, [$out + 0]
374	faeskeyx %f2, `0x10+$i`, %f0
375	std %f2, [$out + 8]
376	add $out, $inc, $out
377	faeskeyx %f0, 0x00, %f2
378	___
379	}
380	$code.=<<___;
381	std %f0, [$out + 0]
382	std %f2, [$out + 8]
383	retl
384	xor %o0, %o0, %o0 ! return 0
385	.type aes_fx_set_encrypt_key,#function
386	.size aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
387	___
388	}
# Template for aes_fx_cbc_encrypt.  One entry point serves both directions:
# the code branches to .Lcbc_decrypt when $dir (%i5) is zero.
389	{
# Argument registers %i0..%i5.
390	my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
# Seven names for local registers %l0..%l6 (the list also yields %l7, which
# is left unnamed).
391	my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
# Even-numbered floating-point registers starting at %f16, in order.
392	my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
393	= map("%f$_",grep { !($_ & 1) } (16 .. 62));
# $ileft/$iright are second names for the registers behind $ialign/$oalign;
# the template overwrites them with shift counts.
394	my ($ileft,$iright) = ($ialign,$oalign);
395
396	$code.=<<___;
397	.globl aes_fx_cbc_encrypt
398	.align 32
399	aes_fx_cbc_encrypt:
400	save %sp, -STACK_FRAME-16, %sp
401	srln $len, 4, $len
402	and $inp, 7, $ialign
403	andn $inp, 7, $inp
404	brz,pn $len, .Lcbc_no_data
405	sll $ialign, 3, $ileft
406
407	1: call .+8
408	add %o7, .Linp_align-1b, %o7
409
410	ld [$key + 240], $rounds
411	and $out, 7, $oalign
412	ld [$ivp + 0], %f0 ! load ivec
413	andn $out, 7, $out
414	ld [$ivp + 4], %f1
415	sll $oalign, 3, $mask
416	ld [$ivp + 8], %f2
417	ld [$ivp + 12], %f3
418
419	sll $rounds, 4, $rounds
420	add $rounds, $key, $end
421	ldd [$key + 0], $r0hi ! round[0]
422	ldd [$key + 8], $r0lo
423
424	add $inp, 16, $inp
425	sub $len, 1, $len
426	ldd [$end + 0], $rlhi ! round[last]
427	ldd [$end + 8], $rllo
428
429	mov 16, $inc
430	movrz $len, 0, $inc
431	ldd [$key + 16], %f10 ! round[1]
432	ldd [$key + 24], %f12
433
434	ldd [%o7 + $ileft], $fshift ! shift left params
435	add %o7, 64, %o7
436	ldd [$inp - 16], $in0 ! load input
437	ldd [$inp - 8], $in1
438	ldda [$inp]0x82, $intail ! non-faulting load
439	brz $dir, .Lcbc_decrypt
440	add $inp, $inc, $inp ! inp+=16
441
442	fxor $r0hi, %f0, %f0 ! ivec^=round[0]
443	fxor $r0lo, %f2, %f2
444	fshiftorx $in0, $in1, $fshift, $in0
445	fshiftorx $in1, $intail, $fshift, $in1
446	nop
447
448	.Loop_cbc_enc:
449	fxor $in0, %f0, %f0 ! inp^ivec^round[0]
450	fxor $in1, %f2, %f2
451	ldd [$key + 32], %f6 ! round[2]
452	ldd [$key + 40], %f8
453	add $key, 32, $end
454	sub $rounds, 16*6, $inner
455
456	.Lcbc_enc:
457	fmovd %f0, %f4
458	faesencx %f2, %f10, %f0
459	faesencx %f4, %f12, %f2
460	ldd [$end + 16], %f10
461	ldd [$end + 24], %f12
462	add $end, 32, $end
463
464	fmovd %f0, %f4
465	faesencx %f2, %f6, %f0
466	faesencx %f4, %f8, %f2
467	ldd [$end + 0], %f6
468	ldd [$end + 8], %f8
469
470	brnz,a $inner, .Lcbc_enc
471	sub $inner, 16*2, $inner
472
473	fmovd %f0, %f4
474	faesencx %f2, %f10, %f0
475	faesencx %f4, %f12, %f2
476	ldd [$end + 16], %f10 ! round[last-1]
477	ldd [$end + 24], %f12
478
479	movrz $len, 0, $inc
480	fmovd $intail, $in0
481	ldd [$inp - 8], $in1 ! load next input block
482	ldda [$inp]0x82, $intail ! non-faulting load
483	add $inp, $inc, $inp ! inp+=16
484
485	fmovd %f0, %f4
486	faesencx %f2, %f6, %f0
487	faesencx %f4, %f8, %f2
488
489	fshiftorx $in0, $in1, $fshift, $in0
490	fshiftorx $in1, $intail, $fshift, $in1
491
492	fmovd %f0, %f4
493	faesencx %f2, %f10, %f0
494	faesencx %f4, %f12, %f2
495	ldd [$key + 16], %f10 ! round[1]
496	ldd [$key + 24], %f12
497
498	fxor $r0hi, $in0, $in0 ! inp^=round[0]
499	fxor $r0lo, $in1, $in1
500
501	fmovd %f0, %f4
502	faesenclx %f2, $rlhi, %f0
503	faesenclx %f4, $rllo, %f2
504
505	brnz,pn $oalign, .Lcbc_enc_unaligned_out
506	nop
507
508	std %f0, [$out + 0]
509	std %f2, [$out + 8]
510	add $out, 16, $out
511
512	brnz,a $len, .Loop_cbc_enc
513	sub $len, 1, $len
514
515	st %f0, [$ivp + 0] ! output ivec
516	st %f1, [$ivp + 4]
517	st %f2, [$ivp + 8]
518	st %f3, [$ivp + 12]
519
520	.Lcbc_no_data:
521	ret
522	restore
523
524	.align 32
525	.Lcbc_enc_unaligned_out:
526	ldd [%o7 + $mask], $fshift ! shift right params
527	mov 0xff, $mask
528	srl $mask, $oalign, $mask
529	sub %g0, $ileft, $iright
530
531	fshiftorx %f0, %f0, $fshift, %f6
532	fshiftorx %f0, %f2, $fshift, %f8
533
534	stda %f6, [$out + $mask]0xc0 ! partial store
535	orn %g0, $mask, $mask
536	std %f8, [$out + 8]
537	add $out, 16, $out
538	brz $len, .Lcbc_enc_unaligned_out_done
539	sub $len, 1, $len
540	b .Loop_cbc_enc_unaligned_out
541	nop
542
543	.align 32
544	.Loop_cbc_enc_unaligned_out:
545	fmovd %f2, $outhead
546	fxor $in0, %f0, %f0 ! inp^ivec^round[0]
547	fxor $in1, %f2, %f2
548	ldd [$key + 32], %f6 ! round[2]
549	ldd [$key + 40], %f8
550
551	fmovd %f0, %f4
552	faesencx %f2, %f10, %f0
553	faesencx %f4, %f12, %f2
554	ldd [$key + 48], %f10 ! round[3]
555	ldd [$key + 56], %f12
556
557	ldx [$inp - 16], %o0
558	ldx [$inp - 8], %o1
559	brz $ileft, .Lcbc_enc_aligned_inp
560	movrz $len, 0, $inc
561
562	ldx [$inp], %o2
563	sllx %o0, $ileft, %o0
564	srlx %o1, $iright, %g1
565	sllx %o1, $ileft, %o1
566	or %g1, %o0, %o0
567	srlx %o2, $iright, %o2
568	or %o2, %o1, %o1
569
570	.Lcbc_enc_aligned_inp:
571	fmovd %f0, %f4
572	faesencx %f2, %f6, %f0
573	faesencx %f4, %f8, %f2
574	ldd [$key + 64], %f6 ! round[4]
575	ldd [$key + 72], %f8
576	add $key, 64, $end
577	sub $rounds, 16*8, $inner
578
579	stx %o0, [%sp + LOCALS + 0]
580	stx %o1, [%sp + LOCALS + 8]
581	add $inp, $inc, $inp ! inp+=16
582	nop
583
584	.Lcbc_enc_unaligned:
585	fmovd %f0, %f4
586	faesencx %f2, %f10, %f0
587	faesencx %f4, %f12, %f2
588	ldd [$end + 16], %f10
589	ldd [$end + 24], %f12
590	add $end, 32, $end
591
592	fmovd %f0, %f4
593	faesencx %f2, %f6, %f0
594	faesencx %f4, %f8, %f2
595	ldd [$end + 0], %f6
596	ldd [$end + 8], %f8
597
598	brnz,a $inner, .Lcbc_enc_unaligned
599	sub $inner, 16*2, $inner
600
601	fmovd %f0, %f4
602	faesencx %f2, %f10, %f0
603	faesencx %f4, %f12, %f2
604	ldd [$end + 16], %f10 ! round[last-1]
605	ldd [$end + 24], %f12
606
607	fmovd %f0, %f4
608	faesencx %f2, %f6, %f0
609	faesencx %f4, %f8, %f2
610
611	ldd [%sp + LOCALS + 0], $in0
612	ldd [%sp + LOCALS + 8], $in1
613
614	fmovd %f0, %f4
615	faesencx %f2, %f10, %f0
616	faesencx %f4, %f12, %f2
617	ldd [$key + 16], %f10 ! round[1]
618	ldd [$key + 24], %f12
619
620	fxor $r0hi, $in0, $in0 ! inp^=round[0]
621	fxor $r0lo, $in1, $in1
622
623	fmovd %f0, %f4
624	faesenclx %f2, $rlhi, %f0
625	faesenclx %f4, $rllo, %f2
626
627	fshiftorx $outhead, %f0, $fshift, %f6
628	fshiftorx %f0, %f2, $fshift, %f8
629	std %f6, [$out + 0]
630	std %f8, [$out + 8]
631	add $out, 16, $out
632
633	brnz,a $len, .Loop_cbc_enc_unaligned_out
634	sub $len, 1, $len
635
636	.Lcbc_enc_unaligned_out_done:
637	fshiftorx %f2, %f2, $fshift, %f8
638	stda %f8, [$out + $mask]0xc0 ! partial store
639
640	st %f0, [$ivp + 0] ! output ivec
641	st %f1, [$ivp + 4]
642	st %f2, [$ivp + 8]
643	st %f3, [$ivp + 12]
644
645	ret
646	restore
647
648	.align 32
649	.Lcbc_decrypt:
650	fshiftorx $in0, $in1, $fshift, $in0
651	fshiftorx $in1, $intail, $fshift, $in1
652	fmovd %f0, $iv0
653	fmovd %f2, $iv1
654
655	.Loop_cbc_dec:
656	fxor $in0, $r0hi, %f0 ! inp^round[0]
657	fxor $in1, $r0lo, %f2
658	ldd [$key + 32], %f6 ! round[2]
659	ldd [$key + 40], %f8
660	add $key, 32, $end
661	sub $rounds, 16*6, $inner
662
663	.Lcbc_dec:
664	fmovd %f0, %f4
665	faesdecx %f2, %f10, %f0
666	faesdecx %f4, %f12, %f2
667	ldd [$end + 16], %f10
668	ldd [$end + 24], %f12
669	add $end, 32, $end
670
671	fmovd %f0, %f4
672	faesdecx %f2, %f6, %f0
673	faesdecx %f4, %f8, %f2
674	ldd [$end + 0], %f6
675	ldd [$end + 8], %f8
676
677	brnz,a $inner, .Lcbc_dec
678	sub $inner, 16*2, $inner
679
680	fmovd %f0, %f4
681	faesdecx %f2, %f10, %f0
682	faesdecx %f4, %f12, %f2
683	ldd [$end + 16], %f10 ! round[last-1]
684	ldd [$end + 24], %f12
685
686	fmovd %f0, %f4
687	faesdecx %f2, %f6, %f0
688	faesdecx %f4, %f8, %f2
689	fxor $iv0, $rlhi, %f6 ! ivec^round[last]
690	fxor $iv1, $rllo, %f8
691	fmovd $in0, $iv0
692	fmovd $in1, $iv1
693
694	movrz $len, 0, $inc
695	fmovd $intail, $in0
696	ldd [$inp - 8], $in1 ! load next input block
697	ldda [$inp]0x82, $intail ! non-faulting load
698	add $inp, $inc, $inp ! inp+=16
699
700	fmovd %f0, %f4
701	faesdecx %f2, %f10, %f0
702	faesdecx %f4, %f12, %f2
703	ldd [$key + 16], %f10 ! round[1]
704	ldd [$key + 24], %f12
705
706	fshiftorx $in0, $in1, $fshift, $in0
707	fshiftorx $in1, $intail, $fshift, $in1
708
709	fmovd %f0, %f4
710	faesdeclx %f2, %f6, %f0
711	faesdeclx %f4, %f8, %f2
712
713	brnz,pn $oalign, .Lcbc_dec_unaligned_out
714	nop
715
716	std %f0, [$out + 0]
717	std %f2, [$out + 8]
718	add $out, 16, $out
719
720	brnz,a $len, .Loop_cbc_dec
721	sub $len, 1, $len
722
723	st $iv0, [$ivp + 0] ! output ivec
724	st $iv0#lo, [$ivp + 4]
725	st $iv1, [$ivp + 8]
726	st $iv1#lo, [$ivp + 12]
727
728	ret
729	restore
730
731	.align 32
732	.Lcbc_dec_unaligned_out:
733	ldd [%o7 + $mask], $fshift ! shift right params
734	mov 0xff, $mask
735	srl $mask, $oalign, $mask
736	sub %g0, $ileft, $iright
737
738	fshiftorx %f0, %f0, $fshift, %f6
739	fshiftorx %f0, %f2, $fshift, %f8
740
741	stda %f6, [$out + $mask]0xc0 ! partial store
742	orn %g0, $mask, $mask
743	std %f8, [$out + 8]
744	add $out, 16, $out
745	brz $len, .Lcbc_dec_unaligned_out_done
746	sub $len, 1, $len
747	b .Loop_cbc_dec_unaligned_out
748	nop
749
750	.align 32
751	.Loop_cbc_dec_unaligned_out:
752	fmovd %f2, $outhead
753	fxor $in0, $r0hi, %f0 ! inp^round[0]
754	fxor $in1, $r0lo, %f2
755	ldd [$key + 32], %f6 ! round[2]
756	ldd [$key + 40], %f8
757
758	fmovd %f0, %f4
759	faesdecx %f2, %f10, %f0
760	faesdecx %f4, %f12, %f2
761	ldd [$key + 48], %f10 ! round[3]
762	ldd [$key + 56], %f12
763
764	ldx [$inp - 16], %o0
765	ldx [$inp - 8], %o1
766	brz $ileft, .Lcbc_dec_aligned_inp
767	movrz $len, 0, $inc
768
769	ldx [$inp], %o2
770	sllx %o0, $ileft, %o0
771	srlx %o1, $iright, %g1
772	sllx %o1, $ileft, %o1
773	or %g1, %o0, %o0
774	srlx %o2, $iright, %o2
775	or %o2, %o1, %o1
776
777	.Lcbc_dec_aligned_inp:
778	fmovd %f0, %f4
779	faesdecx %f2, %f6, %f0
780	faesdecx %f4, %f8, %f2
781	ldd [$key + 64], %f6 ! round[4]
782	ldd [$key + 72], %f8
783	add $key, 64, $end
784	sub $rounds, 16*8, $inner
785
786	stx %o0, [%sp + LOCALS + 0]
787	stx %o1, [%sp + LOCALS + 8]
788	add $inp, $inc, $inp ! inp+=16
789	nop
790
791	.Lcbc_dec_unaligned:
792	fmovd %f0, %f4
793	faesdecx %f2, %f10, %f0
794	faesdecx %f4, %f12, %f2
795	ldd [$end + 16], %f10
796	ldd [$end + 24], %f12
797	add $end, 32, $end
798
799	fmovd %f0, %f4
800	faesdecx %f2, %f6, %f0
801	faesdecx %f4, %f8, %f2
802	ldd [$end + 0], %f6
803	ldd [$end + 8], %f8
804
805	brnz,a $inner, .Lcbc_dec_unaligned
806	sub $inner, 16*2, $inner
807
808	fmovd %f0, %f4
809	faesdecx %f2, %f10, %f0
810	faesdecx %f4, %f12, %f2
811	ldd [$end + 16], %f10 ! round[last-1]
812	ldd [$end + 24], %f12
813
814	fmovd %f0, %f4
815	faesdecx %f2, %f6, %f0
816	faesdecx %f4, %f8, %f2
817
818	fxor $iv0, $rlhi, %f6 ! ivec^round[last]
819	fxor $iv1, $rllo, %f8
820	fmovd $in0, $iv0
821	fmovd $in1, $iv1
822	ldd [%sp + LOCALS + 0], $in0
823	ldd [%sp + LOCALS + 8], $in1
824
825	fmovd %f0, %f4
826	faesdecx %f2, %f10, %f0
827	faesdecx %f4, %f12, %f2
828	ldd [$key + 16], %f10 ! round[1]
829	ldd [$key + 24], %f12
830
831	fmovd %f0, %f4
832	faesdeclx %f2, %f6, %f0
833	faesdeclx %f4, %f8, %f2
834
835	fshiftorx $outhead, %f0, $fshift, %f6
836	fshiftorx %f0, %f2, $fshift, %f8
837	std %f6, [$out + 0]
838	std %f8, [$out + 8]
839	add $out, 16, $out
840
841	brnz,a $len, .Loop_cbc_dec_unaligned_out
842	sub $len, 1, $len
843
844	.Lcbc_dec_unaligned_out_done:
845	fshiftorx %f2, %f2, $fshift, %f8
846	stda %f8, [$out + $mask]0xc0 ! partial store
847
848	st $iv0, [$ivp + 0] ! output ivec
849	st $iv0#lo, [$ivp + 4]
850	st $iv1, [$ivp + 8]
851	st $iv1#lo, [$ivp + 12]
852
853	ret
854	restore
855	.type aes_fx_cbc_encrypt,#function
856	.size aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
857	___
858	}
# Template for aes_fx_ctr32_encrypt_blocks, followed by the data tables
# (.Linp_align, .Lout_align, .Lone) that every routine in this file reaches
# through %o7-relative loads.
859	{
# Five names are taken from the six registers %i0..%i5; %i5 stays unnamed.
860	my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
# Seven names for local registers %l0..%l6.
861	my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
# Even-numbered floating-point registers starting at %f16, in order.
862	my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
863	= map("%f$_",grep { !($_ & 1) } (16 .. 62));
# $ileft/$iright are second names for the registers behind $ialign/$oalign.
864	my ($ileft,$iright) = ($ialign, $oalign);
# Loaded from offset 128 past .Linp_align, which is where .Lone sits after
# the two 8-row, 8-byte tables; added to $ctr1 with fpadd32 for each block.
865	my $one = "%f14";
866
867	$code.=<<___;
868	.globl aes_fx_ctr32_encrypt_blocks
869	.align 32
870	aes_fx_ctr32_encrypt_blocks:
871	save %sp, -STACK_FRAME-16, %sp
872	srln $len, 0, $len
873	and $inp, 7, $ialign
874	andn $inp, 7, $inp
875	brz,pn $len, .Lctr32_no_data
876	sll $ialign, 3, $ileft
877
878	.Lpic: call .+8
879	add %o7, .Linp_align - .Lpic, %o7
880
881	ld [$key + 240], $rounds
882	and $out, 7, $oalign
883	ld [$ivp + 0], $ctr0 ! load counter
884	andn $out, 7, $out
885	ld [$ivp + 4], $ctr0#lo
886	sll $oalign, 3, $mask
887	ld [$ivp + 8], $ctr1
888	ld [$ivp + 12], $ctr1#lo
889	ldd [%o7 + 128], $one
890
891	sll $rounds, 4, $rounds
892	add $rounds, $key, $end
893	ldd [$key + 0], $r0hi ! round[0]
894	ldd [$key + 8], $r0lo
895
896	add $inp, 16, $inp
897	sub $len, 1, $len
898	ldd [$key + 16], %f10 ! round[1]
899	ldd [$key + 24], %f12
900
901	mov 16, $inc
902	movrz $len, 0, $inc
903	ldd [$end + 0], $rlhi ! round[last]
904	ldd [$end + 8], $rllo
905
906	ldd [%o7 + $ileft], $fshift ! shiftleft params
907	add %o7, 64, %o7
908	ldd [$inp - 16], $in0 ! load input
909	ldd [$inp - 8], $in1
910	ldda [$inp]0x82, $intail ! non-faulting load
911	add $inp, $inc, $inp ! inp+=16
912
913	fshiftorx $in0, $in1, $fshift, $in0
914	fshiftorx $in1, $intail, $fshift, $in1
915
916	.Loop_ctr32:
917	fxor $ctr0, $r0hi, %f0 ! counter^round[0]
918	fxor $ctr1, $r0lo, %f2
919	ldd [$key + 32], %f6 ! round[2]
920	ldd [$key + 40], %f8
921	add $key, 32, $end
922	sub $rounds, 16*6, $inner
923
924	.Lctr32_enc:
925	fmovd %f0, %f4
926	faesencx %f2, %f10, %f0
927	faesencx %f4, %f12, %f2
928	ldd [$end + 16], %f10
929	ldd [$end + 24], %f12
930	add $end, 32, $end
931
932	fmovd %f0, %f4
933	faesencx %f2, %f6, %f0
934	faesencx %f4, %f8, %f2
935	ldd [$end + 0], %f6
936	ldd [$end + 8], %f8
937
938	brnz,a $inner, .Lctr32_enc
939	sub $inner, 16*2, $inner
940
941	fmovd %f0, %f4
942	faesencx %f2, %f10, %f0
943	faesencx %f4, %f12, %f2
944	ldd [$end + 16], %f10 ! round[last-1]
945	ldd [$end + 24], %f12
946
947	fmovd %f0, %f4
948	faesencx %f2, %f6, %f0
949	faesencx %f4, %f8, %f2
950	fxor $in0, $rlhi, %f6 ! inp^round[last]
951	fxor $in1, $rllo, %f8
952
953	movrz $len, 0, $inc
954	fmovd $intail, $in0
955	ldd [$inp - 8], $in1 ! load next input block
956	ldda [$inp]0x82, $intail ! non-faulting load
957	add $inp, $inc, $inp ! inp+=16
958
959	fmovd %f0, %f4
960	faesencx %f2, %f10, %f0
961	faesencx %f4, %f12, %f2
962	ldd [$key + 16], %f10 ! round[1]
963	ldd [$key + 24], %f12
964
965	fshiftorx $in0, $in1, $fshift, $in0
966	fshiftorx $in1, $intail, $fshift, $in1
967	fpadd32 $ctr1, $one, $ctr1 ! increment counter
968
969	fmovd %f0, %f4
970	faesenclx %f2, %f6, %f0
971	faesenclx %f4, %f8, %f2
972
973	brnz,pn $oalign, .Lctr32_unaligned_out
974	nop
975
976	std %f0, [$out + 0]
977	std %f2, [$out + 8]
978	add $out, 16, $out
979
980	brnz,a $len, .Loop_ctr32
981	sub $len, 1, $len
982
983	.Lctr32_no_data:
984	ret
985	restore
986
987	.align 32
988	.Lctr32_unaligned_out:
989	ldd [%o7 + $mask], $fshift ! shift right params
990	mov 0xff, $mask
991	srl $mask, $oalign, $mask
992	sub %g0, $ileft, $iright
993
994	fshiftorx %f0, %f0, $fshift, %f6
995	fshiftorx %f0, %f2, $fshift, %f8
996
997	stda %f6, [$out + $mask]0xc0 ! partial store
998	orn %g0, $mask, $mask
999	std %f8, [$out + 8]
1000	add $out, 16, $out
1001	brz $len, .Lctr32_unaligned_out_done
1002	sub $len, 1, $len
1003	b .Loop_ctr32_unaligned_out
1004	nop
1005
1006	.align 32
1007	.Loop_ctr32_unaligned_out:
1008	fmovd %f2, $outhead
1009	fxor $ctr0, $r0hi, %f0 ! counter^round[0]
1010	fxor $ctr1, $r0lo, %f2
1011	ldd [$key + 32], %f6 ! round[2]
1012	ldd [$key + 40], %f8
1013
1014	fmovd %f0, %f4
1015	faesencx %f2, %f10, %f0
1016	faesencx %f4, %f12, %f2
1017	ldd [$key + 48], %f10 ! round[3]
1018	ldd [$key + 56], %f12
1019
1020	ldx [$inp - 16], %o0
1021	ldx [$inp - 8], %o1
1022	brz $ileft, .Lctr32_aligned_inp
1023	movrz $len, 0, $inc
1024
1025	ldx [$inp], %o2
1026	sllx %o0, $ileft, %o0
1027	srlx %o1, $iright, %g1
1028	sllx %o1, $ileft, %o1
1029	or %g1, %o0, %o0
1030	srlx %o2, $iright, %o2
1031	or %o2, %o1, %o1
1032
1033	.Lctr32_aligned_inp:
1034	fmovd %f0, %f4
1035	faesencx %f2, %f6, %f0
1036	faesencx %f4, %f8, %f2
1037	ldd [$key + 64], %f6 ! round[4]
1038	ldd [$key + 72], %f8
1039	add $key, 64, $end
1040	sub $rounds, 16*8, $inner
1041
1042	stx %o0, [%sp + LOCALS + 0]
1043	stx %o1, [%sp + LOCALS + 8]
1044	add $inp, $inc, $inp ! inp+=16
1045	nop
1046
1047	.Lctr32_enc_unaligned:
1048	fmovd %f0, %f4
1049	faesencx %f2, %f10, %f0
1050	faesencx %f4, %f12, %f2
1051	ldd [$end + 16], %f10
1052	ldd [$end + 24], %f12
1053	add $end, 32, $end
1054
1055	fmovd %f0, %f4
1056	faesencx %f2, %f6, %f0
1057	faesencx %f4, %f8, %f2
1058	ldd [$end + 0], %f6
1059	ldd [$end + 8], %f8
1060
1061	brnz,a $inner, .Lctr32_enc_unaligned
1062	sub $inner, 16*2, $inner
1063
1064	fmovd %f0, %f4
1065	faesencx %f2, %f10, %f0
1066	faesencx %f4, %f12, %f2
1067	ldd [$end + 16], %f10 ! round[last-1]
1068	ldd [$end + 24], %f12
1069	fpadd32 $ctr1, $one, $ctr1 ! increment counter
1070
1071	fmovd %f0, %f4
1072	faesencx %f2, %f6, %f0
1073	faesencx %f4, %f8, %f2
1074	fxor $in0, $rlhi, %f6 ! inp^round[last]
1075	fxor $in1, $rllo, %f8
1076	ldd [%sp + LOCALS + 0], $in0
1077	ldd [%sp + LOCALS + 8], $in1
1078
1079	fmovd %f0, %f4
1080	faesencx %f2, %f10, %f0
1081	faesencx %f4, %f12, %f2
1082	ldd [$key + 16], %f10 ! round[1]
1083	ldd [$key + 24], %f12
1084
1085	fmovd %f0, %f4
1086	faesenclx %f2, %f6, %f0
1087	faesenclx %f4, %f8, %f2
1088
1089	fshiftorx $outhead, %f0, $fshift, %f6
1090	fshiftorx %f0, %f2, $fshift, %f8
1091	std %f6, [$out + 0]
1092	std %f8, [$out + 8]
1093	add $out, 16, $out
1094
1095	brnz,a $len, .Loop_ctr32_unaligned_out
1096	sub $len, 1, $len
1097
1098	.Lctr32_unaligned_out_done:
1099	fshiftorx %f2, %f2, $fshift, %f8
1100	stda %f8, [$out + $mask]0xc0 ! partial store
1101
1102	ret
1103	restore
1104	.type aes_fx_ctr32_encrypt_blocks,#function
1105	.size aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks
1106
1107	.align 32
1108	.Linp_align: ! fshiftorx parameters for left shift toward %rs1
1109	.byte 0, 0, 64, 0, 0, 64, 0, -64
1110	.byte 0, 0, 56, 8, 0, 56, 8, -56
1111	.byte 0, 0, 48, 16, 0, 48, 16, -48
1112	.byte 0, 0, 40, 24, 0, 40, 24, -40
1113	.byte 0, 0, 32, 32, 0, 32, 32, -32
1114	.byte 0, 0, 24, 40, 0, 24, 40, -24
1115	.byte 0, 0, 16, 48, 0, 16, 48, -16
1116	.byte 0, 0, 8, 56, 0, 8, 56, -8
1117	.Lout_align: ! fshiftorx parameters for right shift toward %rs2
1118	.byte 0, 0, 0, 64, 0, 0, 64, 0
1119	.byte 0, 0, 8, 56, 0, 8, 56, -8
1120	.byte 0, 0, 16, 48, 0, 16, 48, -16
1121	.byte 0, 0, 24, 40, 0, 24, 40, -24
1122	.byte 0, 0, 32, 32, 0, 32, 32, -32
1123	.byte 0, 0, 40, 24, 0, 40, 24, -40
1124	.byte 0, 0, 48, 16, 0, 48, 16, -48
1125	.byte 0, 0, 56, 8, 0, 56, 8, -56
1126	.Lone:
1127	.word 0, 1
1128	.asciz "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
1129	.align 4
1130	___
1131	}
1132	# Purpose of these subroutines is to explicitly encode VIS instructions,
1133	# so that one can compile the module without having to specify VIS
1134	# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1135	# Idea is to reserve for option to produce "universal" binary and let
1136	# programmer detect if current CPU is VIS capable at run-time.
# Hand-encode a three-operand VIS floating-point instruction.
#
# Arguments: the mnemonic followed by the rs1, rs2 and rd operands as they
# appear in the template, e.g. ("fxor", "%f0", "%f6", "%f0").
#
# Returns ".word\t0x<encoding> !<instruction text>" when the mnemonic is
# listed in %visopf and every operand is an encodable %fN register.  In all
# other cases the instruction text is returned unchanged so the assembler
# deals with it.
sub unvis {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my ($ref,$opf);
    my %visopf = (
        "faligndata" => 0x048,
        "bshuffle"   => 0x04c,
        "fpadd32"    => 0x052,
        "fxor"       => 0x06c,
        "fsrc2"      => 0x078,
    );

    # Human-readable form; doubles as the fallback result and as the
    # trailing comment on the generated .word line.
    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        # The loop variable aliases $rs1/$rs2/$rd, so each operand string is
        # replaced in place by its 5-bit register field.
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                # Odd registers above %f31 are not encodable here.
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}
1166
# Hand-encode a three-operand VIS instruction that works on integer
# registers (alignaddr, bmask, alignaddrl).
#
# Arguments: the mnemonic followed by the rs1, rs2 and rd operands, each
# written as %gN, %oN, %lN or %iN.
#
# Returns ".word\t0x<encoding> !<instruction text>" when the mnemonic is
# listed in %visopf and all operands are integer registers; otherwise the
# instruction text is returned unchanged.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # Offset of each register window group within the 0..31 register number.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my ($ref,$opf);
    my %visopf = (
        "alignaddr"  => 0x018,
        "bmask"      => 0x019,
        "alignaddrl" => 0x01a,
    );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        # The loop variable aliases the operands; each one is replaced in
        # place by its register number.
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%([goli])([0-9])/);
            $_=$bias{$1}+$2;
        }

        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}
1190
# Hand-encode a Fujitsu AES instruction (faesencx, faesdecx, faesenclx,
# faesdeclx, faeskeyx).
#
# Arguments: the mnemonic followed by rs1, rs2 and rd.  rs1 and rd must be
# %fN registers; rs2 is either an even-numbered %fN register or a numeric
# immediate (a leading "0" selects hex/octal parsing via oct()).
#
# Returns ".word\t0x<encoding> !<instruction text>" for a known mnemonic
# with encodable rs1/rd; otherwise the instruction text is returned
# unchanged.
sub unfx {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my ($ref,$opf);
    my %aesopf = (
        "faesencx"  => 0x90,
        "faesdecx"  => 0x91,
        "faesenclx" => 0x92,
        "faesdeclx" => 0x93,
        "faeskeyx"  => 0x94,
    );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        # rs2: a register is folded into its 5-bit field, anything else is
        # kept as written and treated as an immediate.
        $rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
        $rs2 = oct($rs2) if ($rs2 =~ /^0/);

        # The loop variable aliases $rs1/$rd, so both are replaced in place
        # by their register fields.
        foreach ($rs1,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}
1223
# Hand-encode a four-operand (three sources plus destination) Fujitsu
# instruction; only fshiftorx is listed.
#
# Arguments: the mnemonic followed by rs1, rs2, rs3 and rd, all %fN
# registers.
#
# Returns ".word\t0x<encoding> !<instruction text>" for a known mnemonic
# with encodable operands; otherwise the instruction text is returned
# unchanged.
sub unfx3src {
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
    my ($ref,$opf);
    my %aesopf = ( "fshiftorx" => 0x0b );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        # The loop variable aliases the operands; each one is replaced in
        # place by its 5-bit register field.
        foreach ($rs1,$rs2,$rs3,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}
1249
# Post-process the accumulated template one line at a time and print it:
#   1. evaluate back-quoted Perl arithmetic (e.g. `0x10+3`);
#   2. turn "%fN#lo" into the next single-precision register, %f(N+1);
#   3. hand instructions the assembler may not know to the matching encoder.
#      The alternatives are chained with "or", so the first pattern that
#      matches a line wins.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    # The mnemonic captures need their "*" quantifier: without it they can
    # only match a two-character mnemonic (or the literal "alignaddrl"), so
    # fshiftorx, fxor, fpadd32 and alignaddr would never be encoded.
    # Operand separators accept any amount of whitespace after the comma.
    s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
        unfx($1,$2,$3,$4)
     /ge or
    s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
        unfx3src($1,$2,$3,$4,$5)
     /ge or
    s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
        unvis($1,$2,$3,$4)
     /ge or
    s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
        unvis3($1,$2,$3,$4)
     /ge;
    print $_,"\n";
}

# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-1.1.1f/crypto/aes/asm/aesfx-sparcv9.pl@ 83531

Download in other formats: