aesfx-sparcv9.pl@ 102427

Last change on this file since 102427 was 101211, checked in by vboxsync, 17 months ago
openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527
File size: 27.6 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the Apache License 2.0 (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9	#
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	# ====================================================================
16
17	# March 2016
18	#
19	# Initial support for Fujitsu SPARC64 X/X+ comprises minimally
20	# required key setup and single-block procedures.
21	#
22	# April 2016
23	#
24	# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
25	# that parallelizable nature of CBC decrypt and CTR is not utilized
26	# yet. CBC encrypt on the other hand is as good as it can possibly
27	# get processing one byte in 4.1 cycles with 128-bit key on SPARC64 X.
28	# This is ~6x faster than pure software implementation...
29	#
30	# July 2016
31	#
32	# Switch from faligndata to fshiftorx, which makes it possible to omit
33	# alignaddr instructions and improves single-block and short-input
34	# performance with misaligned data.
35
# The last command-line argument, when present and true, names the file
# that receives the generated assembly; otherwise output goes to the
# inherited STDOUT. Three-argument open keeps the file name from being
# interpreted as part of the mode, and a failed open is now reported
# instead of silently writing to the original STDOUT.
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
37
# Single-block routines aes_fx_encrypt and aes_fx_decrypt.
38	{
# Symbolic names for the SPARC output registers %o0..%o5; they are
# interpolated into the assembly template below.
39	my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));
40
# Append the template to $code. Both routines reference .Linp_align and,
# 64 bytes further on, .Lout_align: the fshiftorx parameter tables that a
# later block of this script emits. The two routines are identical except
# for the faesenc*/faesdec* mnemonics and label names.
41	$code.=<<___;
42	#ifndef __ASSEMBLER__
43	# define __ASSEMBLER__ 1
44	#endif
45	#include "crypto/sparc_arch.h"
46
47	#define LOCALS (STACK_BIAS+STACK_FRAME)
48
49	.text
50
51	.globl aes_fx_encrypt
52	.align 32
53	aes_fx_encrypt:
54	and $inp, 7, $tmp ! is input aligned?
55	andn $inp, 7, $inp
56	ldd [$key + 0], %f6 ! round[0]
57	ldd [$key + 8], %f8
58	mov %o7, %g1
59	ld [$key + 240], $rounds
60
61	1: call .+8
62	add %o7, .Linp_align-1b, %o7
63
64	sll $tmp, 3, $tmp
65	ldd [$inp + 0], %f0 ! load input
66	brz,pt $tmp, .Lenc_inp_aligned
67	ldd [$inp + 8], %f2
68
69	ldd [%o7 + $tmp], %f14 ! shift left params
70	ldd [$inp + 16], %f4
71	fshiftorx %f0, %f2, %f14, %f0
72	fshiftorx %f2, %f4, %f14, %f2
73
74	.Lenc_inp_aligned:
75	ldd [$key + 16], %f10 ! round[1]
76	ldd [$key + 24], %f12
77
78	fxor %f0, %f6, %f0 ! ^=round[0]
79	fxor %f2, %f8, %f2
80	ldd [$key + 32], %f6 ! round[2]
81	ldd [$key + 40], %f8
82	add $key, 32, $key
83	sub $rounds, 4, $rounds
84
85	.Loop_enc:
86	fmovd %f0, %f4
87	faesencx %f2, %f10, %f0
88	faesencx %f4, %f12, %f2
89	ldd [$key + 16], %f10
90	ldd [$key + 24], %f12
91	add $key, 32, $key
92
93	fmovd %f0, %f4
94	faesencx %f2, %f6, %f0
95	faesencx %f4, %f8, %f2
96	ldd [$key + 0], %f6
97	ldd [$key + 8], %f8
98
99	brnz,a $rounds, .Loop_enc
100	sub $rounds, 2, $rounds
101
102	andcc $out, 7, $tmp ! is output aligned?
103	andn $out, 7, $out
104	mov 0xff, $mask
105	srl $mask, $tmp, $mask
106	add %o7, 64, %o7
107	sll $tmp, 3, $tmp
108
109	fmovd %f0, %f4
110	faesencx %f2, %f10, %f0
111	faesencx %f4, %f12, %f2
112	ldd [%o7 + $tmp], %f14 ! shift right params
113
114	fmovd %f0, %f4
115	faesenclx %f2, %f6, %f0
116	faesenclx %f4, %f8, %f2
117
118	bnz,pn %icc, .Lenc_out_unaligned
119	mov %g1, %o7
120
121	std %f0, [$out + 0]
122	retl
123	std %f2, [$out + 8]
124
125	.align 16
126	.Lenc_out_unaligned:
127	add $out, 16, $inp
128	orn %g0, $mask, $tmp
129	fshiftorx %f0, %f0, %f14, %f4
130	fshiftorx %f0, %f2, %f14, %f6
131	fshiftorx %f2, %f2, %f14, %f8
132
133	stda %f4, [$out + $mask]0xc0 ! partial store
134	std %f6, [$out + 8]
135	stda %f8, [$inp + $tmp]0xc0 ! partial store
136	retl
137	nop
138	.type aes_fx_encrypt,#function
139	.size aes_fx_encrypt,.-aes_fx_encrypt
140
141	.globl aes_fx_decrypt
142	.align 32
143	aes_fx_decrypt:
144	and $inp, 7, $tmp ! is input aligned?
145	andn $inp, 7, $inp
146	ldd [$key + 0], %f6 ! round[0]
147	ldd [$key + 8], %f8
148	mov %o7, %g1
149	ld [$key + 240], $rounds
150
151	1: call .+8
152	add %o7, .Linp_align-1b, %o7
153
154	sll $tmp, 3, $tmp
155	ldd [$inp + 0], %f0 ! load input
156	brz,pt $tmp, .Ldec_inp_aligned
157	ldd [$inp + 8], %f2
158
159	ldd [%o7 + $tmp], %f14 ! shift left params
160	ldd [$inp + 16], %f4
161	fshiftorx %f0, %f2, %f14, %f0
162	fshiftorx %f2, %f4, %f14, %f2
163
164	.Ldec_inp_aligned:
165	ldd [$key + 16], %f10 ! round[1]
166	ldd [$key + 24], %f12
167
168	fxor %f0, %f6, %f0 ! ^=round[0]
169	fxor %f2, %f8, %f2
170	ldd [$key + 32], %f6 ! round[2]
171	ldd [$key + 40], %f8
172	add $key, 32, $key
173	sub $rounds, 4, $rounds
174
175	.Loop_dec:
176	fmovd %f0, %f4
177	faesdecx %f2, %f10, %f0
178	faesdecx %f4, %f12, %f2
179	ldd [$key + 16], %f10
180	ldd [$key + 24], %f12
181	add $key, 32, $key
182
183	fmovd %f0, %f4
184	faesdecx %f2, %f6, %f0
185	faesdecx %f4, %f8, %f2
186	ldd [$key + 0], %f6
187	ldd [$key + 8], %f8
188
189	brnz,a $rounds, .Loop_dec
190	sub $rounds, 2, $rounds
191
192	andcc $out, 7, $tmp ! is output aligned?
193	andn $out, 7, $out
194	mov 0xff, $mask
195	srl $mask, $tmp, $mask
196	add %o7, 64, %o7
197	sll $tmp, 3, $tmp
198
199	fmovd %f0, %f4
200	faesdecx %f2, %f10, %f0
201	faesdecx %f4, %f12, %f2
202	ldd [%o7 + $tmp], %f14 ! shift right params
203
204	fmovd %f0, %f4
205	faesdeclx %f2, %f6, %f0
206	faesdeclx %f4, %f8, %f2
207
208	bnz,pn %icc, .Ldec_out_unaligned
209	mov %g1, %o7
210
211	std %f0, [$out + 0]
212	retl
213	std %f2, [$out + 8]
214
215	.align 16
216	.Ldec_out_unaligned:
217	add $out, 16, $inp
218	orn %g0, $mask, $tmp
219	fshiftorx %f0, %f0, %f14, %f4
220	fshiftorx %f0, %f2, %f14, %f6
221	fshiftorx %f2, %f2, %f14, %f8
222
223	stda %f4, [$out + $mask]0xc0 ! partial store
224	std %f6, [$out + 8]
225	stda %f8, [$inp + $tmp]0xc0 ! partial store
226	retl
227	nop
228	.type aes_fx_decrypt,#function
229	.size aes_fx_decrypt,.-aes_fx_decrypt
230	___
231	}
# Key setup routines aes_fx_set_encrypt_key and aes_fx_set_decrypt_key.
232	{
# Register names: $inp=%o0, $bits=%o1, $out=%o2, $tmp=%o3, $inc=%o4.
# map() yields six values; the last one (%o5) is not bound to a name.
233	my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
# Common prologue. aes_fx_set_decrypt_key branches to .Lset_encrypt_key
# with $inc = -1, aes_fx_set_encrypt_key reaches it with $inc = 1. The
# template then dispatches on $bits (compared against 192) and continues
# with the 256-bit path up to the point where round keys get stored.
234	$code.=<<___;
235	.globl aes_fx_set_decrypt_key
236	.align 32
237	aes_fx_set_decrypt_key:
238	b .Lset_encrypt_key
239	mov -1, $inc
240	retl
241	nop
242	.type aes_fx_set_decrypt_key,#function
243	.size aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key
244
245	.globl aes_fx_set_encrypt_key
246	.align 32
247	aes_fx_set_encrypt_key:
248	mov 1, $inc
249	nop
250	.Lset_encrypt_key:
251	and $inp, 7, $tmp
252	andn $inp, 7, $inp
253	sll $tmp, 3, $tmp
254	mov %o7, %g1
255
256	1: call .+8
257	add %o7, .Linp_align-1b, %o7
258
259	ldd [%o7 + $tmp], %f10 ! shift left params
260	mov %g1, %o7
261
262	cmp $bits, 192
263	ldd [$inp + 0], %f0
264	bl,pt %icc, .L128
265	ldd [$inp + 8], %f2
266
267	be,pt %icc, .L192
268	ldd [$inp + 16], %f4
269	brz,pt $tmp, .L256aligned
270	ldd [$inp + 24], %f6
271
272	ldd [$inp + 32], %f8
273	fshiftorx %f0, %f2, %f10, %f0
274	fshiftorx %f2, %f4, %f10, %f2
275	fshiftorx %f4, %f6, %f10, %f4
276	fshiftorx %f6, %f8, %f10, %f6
277
278	.L256aligned:
279	mov 14, $bits
280	and $inc, `14*16`, $tmp
281	st $bits, [$out + 240] ! store rounds
282	add $out, $tmp, $out ! start or end of key schedule
283	sllx $inc, 4, $inc ! 16 or -16
284	___
# 256-bit schedule, unrolled: six copies of the step below. $i is
# interpolated into the text now; the back-ticked arithmetic (for example
# `0x10+$i`) is evaluated later by the output loop at the end of the script.
285	for ($i=0; $i<6; $i++) {
286	$code.=<<___;
287	std %f0, [$out + 0]
288	faeskeyx %f6, `0x10+$i`, %f0
289	std %f2, [$out + 8]
290	add $out, $inc, $out
291	faeskeyx %f0, 0x00, %f2
292	std %f4, [$out + 0]
293	faeskeyx %f2, 0x01, %f4
294	std %f6, [$out + 8]
295	add $out, $inc, $out
296	faeskeyx %f4, 0x00, %f6
297	___
298	}
# Tail of the 256-bit schedule ($i is 6 here, left over from the loop
# above), followed by the prologue of the 192-bit path.
299	$code.=<<___;
300	std %f0, [$out + 0]
301	faeskeyx %f6, `0x10+$i`, %f0
302	std %f2, [$out + 8]
303	add $out, $inc, $out
304	faeskeyx %f0, 0x00, %f2
305	std %f4,[$out + 0]
306	std %f6,[$out + 8]
307	add $out, $inc, $out
308	std %f0,[$out + 0]
309	std %f2,[$out + 8]
310	retl
311	xor %o0, %o0, %o0 ! return 0
312
313	.align 16
314	.L192:
315	brz,pt $tmp, .L192aligned
316	nop
317
318	ldd [$inp + 24], %f6
319	fshiftorx %f0, %f2, %f10, %f0
320	fshiftorx %f2, %f4, %f10, %f2
321	fshiftorx %f4, %f6, %f10, %f4
322
323	.L192aligned:
324	mov 12, $bits
325	and $inc, `12*16`, $tmp
326	st $bits, [$out + 240] ! store rounds
327	add $out, $tmp, $out ! start or end of key schedule
328	sllx $inc, 4, $inc ! 16 or -16
329	___
# 192-bit schedule, unrolled: $i takes the values 0, 2, 4 and 6, and each
# copy uses the constants 0x10+$i and 0x10+$i+1.
330	for ($i=0; $i<8; $i+=2) {
331	$code.=<<___;
332	std %f0, [$out + 0]
333	faeskeyx %f4, `0x10+$i`, %f0
334	std %f2, [$out + 8]
335	add $out, $inc, $out
336	faeskeyx %f0, 0x00, %f2
337	std %f4, [$out + 0]
338	faeskeyx %f2, 0x00, %f4
339	std %f0, [$out + 8]
340	add $out, $inc, $out
341	faeskeyx %f4, `0x10+$i+1`, %f0
342	std %f2, [$out + 0]
343	faeskeyx %f0, 0x00, %f2
344	std %f4, [$out + 8]
345	add $out, $inc, $out
346	___
# The trailing faeskeyx is emitted for every copy except the last ($i == 6).
347	$code.=<<___ if ($i<6);
348	faeskeyx %f2, 0x00, %f4
349	___
350	}
# Tail of the 192-bit schedule, followed by the prologue of the 128-bit path.
351	$code.=<<___;
352	std %f0, [$out + 0]
353	std %f2, [$out + 8]
354	retl
355	xor %o0, %o0, %o0 ! return 0
356
357	.align 16
358	.L128:
359	brz,pt $tmp, .L128aligned
360	nop
361
362	ldd [$inp + 16], %f4
363	fshiftorx %f0, %f2, %f10, %f0
364	fshiftorx %f2, %f4, %f10, %f2
365
366	.L128aligned:
367	mov 10, $bits
368	and $inc, `10*16`, $tmp
369	st $bits, [$out + 240] ! store rounds
370	add $out, $tmp, $out ! start or end of key schedule
371	sllx $inc, 4, $inc ! 16 or -16
372	___
# 128-bit schedule, unrolled: ten copies with constants 0x10 .. 0x19.
373	for ($i=0; $i<10; $i++) {
374	$code.=<<___;
375	std %f0, [$out + 0]
376	faeskeyx %f2, `0x10+$i`, %f0
377	std %f2, [$out + 8]
378	add $out, $inc, $out
379	faeskeyx %f0, 0x00, %f2
380	___
381	}
# Store the final round key and close the function.
382	$code.=<<___;
383	std %f0, [$out + 0]
384	std %f2, [$out + 8]
385	retl
386	xor %o0, %o0, %o0 ! return 0
387	.type aes_fx_set_encrypt_key,#function
388	.size aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
389	___
390	}
# CBC routine aes_fx_cbc_encrypt (encrypt and decrypt paths, selected at
# run time by $dir).
391	{
# Argument registers: $inp=%i0, $out=%i1, $len=%i2, $key=%i3, $ivp=%i4,
# $dir=%i5.
392	my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
# Local registers %l0..%l6 (map() yields eight values, the last is unused).
393	my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
# Even-numbered floating-point registers, assigned in order starting at
# %f16: $iv0=%f16, $iv1=%f18, ... $fshift=%f36.
394	my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
395	= map("%f$_",grep { !($_ & 1) } (16 .. 62));
# $ileft and $iright are second names for the registers behind $ialign and
# $oalign; they do not allocate new registers.
396	my ($ileft,$iright) = ($ialign,$oalign);
397
# Append the whole routine to $code. Sections, in order: common prologue,
# aligned-output encrypt loop, unaligned-output encrypt loop,
# aligned-output decrypt loop, unaligned-output decrypt loop.
398	$code.=<<___;
399	.globl aes_fx_cbc_encrypt
400	.align 32
401	aes_fx_cbc_encrypt:
402	save %sp, -STACK_FRAME-16, %sp
403	srln $len, 4, $len
404	and $inp, 7, $ialign
405	andn $inp, 7, $inp
406	brz,pn $len, .Lcbc_no_data
407	sll $ialign, 3, $ileft
408
409	1: call .+8
410	add %o7, .Linp_align-1b, %o7
411
412	ld [$key + 240], $rounds
413	and $out, 7, $oalign
414	ld [$ivp + 0], %f0 ! load ivec
415	andn $out, 7, $out
416	ld [$ivp + 4], %f1
417	sll $oalign, 3, $mask
418	ld [$ivp + 8], %f2
419	ld [$ivp + 12], %f3
420
421	sll $rounds, 4, $rounds
422	add $rounds, $key, $end
423	ldd [$key + 0], $r0hi ! round[0]
424	ldd [$key + 8], $r0lo
425
426	add $inp, 16, $inp
427	sub $len, 1, $len
428	ldd [$end + 0], $rlhi ! round[last]
429	ldd [$end + 8], $rllo
430
431	mov 16, $inc
432	movrz $len, 0, $inc
433	ldd [$key + 16], %f10 ! round[1]
434	ldd [$key + 24], %f12
435
436	ldd [%o7 + $ileft], $fshift ! shift left params
437	add %o7, 64, %o7
438	ldd [$inp - 16], $in0 ! load input
439	ldd [$inp - 8], $in1
440	ldda [$inp]0x82, $intail ! non-faulting load
441	brz $dir, .Lcbc_decrypt
442	add $inp, $inc, $inp ! inp+=16
443
444	fxor $r0hi, %f0, %f0 ! ivec^=round[0]
445	fxor $r0lo, %f2, %f2
446	fshiftorx $in0, $in1, $fshift, $in0
447	fshiftorx $in1, $intail, $fshift, $in1
448	nop
449
450	.Loop_cbc_enc:
451	fxor $in0, %f0, %f0 ! inp^ivec^round[0]
452	fxor $in1, %f2, %f2
453	ldd [$key + 32], %f6 ! round[2]
454	ldd [$key + 40], %f8
455	add $key, 32, $end
456	sub $rounds, 16*6, $inner
457
458	.Lcbc_enc:
459	fmovd %f0, %f4
460	faesencx %f2, %f10, %f0
461	faesencx %f4, %f12, %f2
462	ldd [$end + 16], %f10
463	ldd [$end + 24], %f12
464	add $end, 32, $end
465
466	fmovd %f0, %f4
467	faesencx %f2, %f6, %f0
468	faesencx %f4, %f8, %f2
469	ldd [$end + 0], %f6
470	ldd [$end + 8], %f8
471
472	brnz,a $inner, .Lcbc_enc
473	sub $inner, 16*2, $inner
474
475	fmovd %f0, %f4
476	faesencx %f2, %f10, %f0
477	faesencx %f4, %f12, %f2
478	ldd [$end + 16], %f10 ! round[last-1]
479	ldd [$end + 24], %f12
480
481	movrz $len, 0, $inc
482	fmovd $intail, $in0
483	ldd [$inp - 8], $in1 ! load next input block
484	ldda [$inp]0x82, $intail ! non-faulting load
485	add $inp, $inc, $inp ! inp+=16
486
487	fmovd %f0, %f4
488	faesencx %f2, %f6, %f0
489	faesencx %f4, %f8, %f2
490
491	fshiftorx $in0, $in1, $fshift, $in0
492	fshiftorx $in1, $intail, $fshift, $in1
493
494	fmovd %f0, %f4
495	faesencx %f2, %f10, %f0
496	faesencx %f4, %f12, %f2
497	ldd [$key + 16], %f10 ! round[1]
498	ldd [$key + 24], %f12
499
500	fxor $r0hi, $in0, $in0 ! inp^=round[0]
501	fxor $r0lo, $in1, $in1
502
503	fmovd %f0, %f4
504	faesenclx %f2, $rlhi, %f0
505	faesenclx %f4, $rllo, %f2
506
507	brnz,pn $oalign, .Lcbc_enc_unaligned_out
508	nop
509
510	std %f0, [$out + 0]
511	std %f2, [$out + 8]
512	add $out, 16, $out
513
514	brnz,a $len, .Loop_cbc_enc
515	sub $len, 1, $len
516
517	st %f0, [$ivp + 0] ! output ivec
518	st %f1, [$ivp + 4]
519	st %f2, [$ivp + 8]
520	st %f3, [$ivp + 12]
521
522	.Lcbc_no_data:
523	ret
524	restore
525
526	.align 32
527	.Lcbc_enc_unaligned_out:
528	ldd [%o7 + $mask], $fshift ! shift right params
529	mov 0xff, $mask
530	srl $mask, $oalign, $mask
531	sub %g0, $ileft, $iright
532
533	fshiftorx %f0, %f0, $fshift, %f6
534	fshiftorx %f0, %f2, $fshift, %f8
535
536	stda %f6, [$out + $mask]0xc0 ! partial store
537	orn %g0, $mask, $mask
538	std %f8, [$out + 8]
539	add $out, 16, $out
540	brz $len, .Lcbc_enc_unaligned_out_done
541	sub $len, 1, $len
542	b .Loop_cbc_enc_unaligned_out
543	nop
544
545	.align 32
546	.Loop_cbc_enc_unaligned_out:
547	fmovd %f2, $outhead
548	fxor $in0, %f0, %f0 ! inp^ivec^round[0]
549	fxor $in1, %f2, %f2
550	ldd [$key + 32], %f6 ! round[2]
551	ldd [$key + 40], %f8
552
553	fmovd %f0, %f4
554	faesencx %f2, %f10, %f0
555	faesencx %f4, %f12, %f2
556	ldd [$key + 48], %f10 ! round[3]
557	ldd [$key + 56], %f12
558
559	ldx [$inp - 16], %o0
560	ldx [$inp - 8], %o1
561	brz $ileft, .Lcbc_enc_aligned_inp
562	movrz $len, 0, $inc
563
564	ldx [$inp], %o2
565	sllx %o0, $ileft, %o0
566	srlx %o1, $iright, %g1
567	sllx %o1, $ileft, %o1
568	or %g1, %o0, %o0
569	srlx %o2, $iright, %o2
570	or %o2, %o1, %o1
571
572	.Lcbc_enc_aligned_inp:
573	fmovd %f0, %f4
574	faesencx %f2, %f6, %f0
575	faesencx %f4, %f8, %f2
576	ldd [$key + 64], %f6 ! round[4]
577	ldd [$key + 72], %f8
578	add $key, 64, $end
579	sub $rounds, 16*8, $inner
580
581	stx %o0, [%sp + LOCALS + 0]
582	stx %o1, [%sp + LOCALS + 8]
583	add $inp, $inc, $inp ! inp+=16
584	nop
585
586	.Lcbc_enc_unaligned:
587	fmovd %f0, %f4
588	faesencx %f2, %f10, %f0
589	faesencx %f4, %f12, %f2
590	ldd [$end + 16], %f10
591	ldd [$end + 24], %f12
592	add $end, 32, $end
593
594	fmovd %f0, %f4
595	faesencx %f2, %f6, %f0
596	faesencx %f4, %f8, %f2
597	ldd [$end + 0], %f6
598	ldd [$end + 8], %f8
599
600	brnz,a $inner, .Lcbc_enc_unaligned
601	sub $inner, 16*2, $inner
602
603	fmovd %f0, %f4
604	faesencx %f2, %f10, %f0
605	faesencx %f4, %f12, %f2
606	ldd [$end + 16], %f10 ! round[last-1]
607	ldd [$end + 24], %f12
608
609	fmovd %f0, %f4
610	faesencx %f2, %f6, %f0
611	faesencx %f4, %f8, %f2
612
613	ldd [%sp + LOCALS + 0], $in0
614	ldd [%sp + LOCALS + 8], $in1
615
616	fmovd %f0, %f4
617	faesencx %f2, %f10, %f0
618	faesencx %f4, %f12, %f2
619	ldd [$key + 16], %f10 ! round[1]
620	ldd [$key + 24], %f12
621
622	fxor $r0hi, $in0, $in0 ! inp^=round[0]
623	fxor $r0lo, $in1, $in1
624
625	fmovd %f0, %f4
626	faesenclx %f2, $rlhi, %f0
627	faesenclx %f4, $rllo, %f2
628
629	fshiftorx $outhead, %f0, $fshift, %f6
630	fshiftorx %f0, %f2, $fshift, %f8
631	std %f6, [$out + 0]
632	std %f8, [$out + 8]
633	add $out, 16, $out
634
635	brnz,a $len, .Loop_cbc_enc_unaligned_out
636	sub $len, 1, $len
637
638	.Lcbc_enc_unaligned_out_done:
639	fshiftorx %f2, %f2, $fshift, %f8
640	stda %f8, [$out + $mask]0xc0 ! partial store
641
642	st %f0, [$ivp + 0] ! output ivec
643	st %f1, [$ivp + 4]
644	st %f2, [$ivp + 8]
645	st %f3, [$ivp + 12]
646
647	ret
648	restore
649
650	.align 32
651	.Lcbc_decrypt:
652	fshiftorx $in0, $in1, $fshift, $in0
653	fshiftorx $in1, $intail, $fshift, $in1
654	fmovd %f0, $iv0
655	fmovd %f2, $iv1
656
657	.Loop_cbc_dec:
658	fxor $in0, $r0hi, %f0 ! inp^round[0]
659	fxor $in1, $r0lo, %f2
660	ldd [$key + 32], %f6 ! round[2]
661	ldd [$key + 40], %f8
662	add $key, 32, $end
663	sub $rounds, 16*6, $inner
664
665	.Lcbc_dec:
666	fmovd %f0, %f4
667	faesdecx %f2, %f10, %f0
668	faesdecx %f4, %f12, %f2
669	ldd [$end + 16], %f10
670	ldd [$end + 24], %f12
671	add $end, 32, $end
672
673	fmovd %f0, %f4
674	faesdecx %f2, %f6, %f0
675	faesdecx %f4, %f8, %f2
676	ldd [$end + 0], %f6
677	ldd [$end + 8], %f8
678
679	brnz,a $inner, .Lcbc_dec
680	sub $inner, 16*2, $inner
681
682	fmovd %f0, %f4
683	faesdecx %f2, %f10, %f0
684	faesdecx %f4, %f12, %f2
685	ldd [$end + 16], %f10 ! round[last-1]
686	ldd [$end + 24], %f12
687
688	fmovd %f0, %f4
689	faesdecx %f2, %f6, %f0
690	faesdecx %f4, %f8, %f2
691	fxor $iv0, $rlhi, %f6 ! ivec^round[last]
692	fxor $iv1, $rllo, %f8
693	fmovd $in0, $iv0
694	fmovd $in1, $iv1
695
696	movrz $len, 0, $inc
697	fmovd $intail, $in0
698	ldd [$inp - 8], $in1 ! load next input block
699	ldda [$inp]0x82, $intail ! non-faulting load
700	add $inp, $inc, $inp ! inp+=16
701
702	fmovd %f0, %f4
703	faesdecx %f2, %f10, %f0
704	faesdecx %f4, %f12, %f2
705	ldd [$key + 16], %f10 ! round[1]
706	ldd [$key + 24], %f12
707
708	fshiftorx $in0, $in1, $fshift, $in0
709	fshiftorx $in1, $intail, $fshift, $in1
710
711	fmovd %f0, %f4
712	faesdeclx %f2, %f6, %f0
713	faesdeclx %f4, %f8, %f2
714
715	brnz,pn $oalign, .Lcbc_dec_unaligned_out
716	nop
717
718	std %f0, [$out + 0]
719	std %f2, [$out + 8]
720	add $out, 16, $out
721
722	brnz,a $len, .Loop_cbc_dec
723	sub $len, 1, $len
724
725	st $iv0, [$ivp + 0] ! output ivec
726	st $iv0#lo, [$ivp + 4]
727	st $iv1, [$ivp + 8]
728	st $iv1#lo, [$ivp + 12]
729
730	ret
731	restore
732
733	.align 32
734	.Lcbc_dec_unaligned_out:
735	ldd [%o7 + $mask], $fshift ! shift right params
736	mov 0xff, $mask
737	srl $mask, $oalign, $mask
738	sub %g0, $ileft, $iright
739
740	fshiftorx %f0, %f0, $fshift, %f6
741	fshiftorx %f0, %f2, $fshift, %f8
742
743	stda %f6, [$out + $mask]0xc0 ! partial store
744	orn %g0, $mask, $mask
745	std %f8, [$out + 8]
746	add $out, 16, $out
747	brz $len, .Lcbc_dec_unaligned_out_done
748	sub $len, 1, $len
749	b .Loop_cbc_dec_unaligned_out
750	nop
751
752	.align 32
753	.Loop_cbc_dec_unaligned_out:
754	fmovd %f2, $outhead
755	fxor $in0, $r0hi, %f0 ! inp^round[0]
756	fxor $in1, $r0lo, %f2
757	ldd [$key + 32], %f6 ! round[2]
758	ldd [$key + 40], %f8
759
760	fmovd %f0, %f4
761	faesdecx %f2, %f10, %f0
762	faesdecx %f4, %f12, %f2
763	ldd [$key + 48], %f10 ! round[3]
764	ldd [$key + 56], %f12
765
766	ldx [$inp - 16], %o0
767	ldx [$inp - 8], %o1
768	brz $ileft, .Lcbc_dec_aligned_inp
769	movrz $len, 0, $inc
770
771	ldx [$inp], %o2
772	sllx %o0, $ileft, %o0
773	srlx %o1, $iright, %g1
774	sllx %o1, $ileft, %o1
775	or %g1, %o0, %o0
776	srlx %o2, $iright, %o2
777	or %o2, %o1, %o1
778
779	.Lcbc_dec_aligned_inp:
780	fmovd %f0, %f4
781	faesdecx %f2, %f6, %f0
782	faesdecx %f4, %f8, %f2
783	ldd [$key + 64], %f6 ! round[4]
784	ldd [$key + 72], %f8
785	add $key, 64, $end
786	sub $rounds, 16*8, $inner
787
788	stx %o0, [%sp + LOCALS + 0]
789	stx %o1, [%sp + LOCALS + 8]
790	add $inp, $inc, $inp ! inp+=16
791	nop
792
793	.Lcbc_dec_unaligned:
794	fmovd %f0, %f4
795	faesdecx %f2, %f10, %f0
796	faesdecx %f4, %f12, %f2
797	ldd [$end + 16], %f10
798	ldd [$end + 24], %f12
799	add $end, 32, $end
800
801	fmovd %f0, %f4
802	faesdecx %f2, %f6, %f0
803	faesdecx %f4, %f8, %f2
804	ldd [$end + 0], %f6
805	ldd [$end + 8], %f8
806
807	brnz,a $inner, .Lcbc_dec_unaligned
808	sub $inner, 16*2, $inner
809
810	fmovd %f0, %f4
811	faesdecx %f2, %f10, %f0
812	faesdecx %f4, %f12, %f2
813	ldd [$end + 16], %f10 ! round[last-1]
814	ldd [$end + 24], %f12
815
816	fmovd %f0, %f4
817	faesdecx %f2, %f6, %f0
818	faesdecx %f4, %f8, %f2
819
820	fxor $iv0, $rlhi, %f6 ! ivec^round[last]
821	fxor $iv1, $rllo, %f8
822	fmovd $in0, $iv0
823	fmovd $in1, $iv1
824	ldd [%sp + LOCALS + 0], $in0
825	ldd [%sp + LOCALS + 8], $in1
826
827	fmovd %f0, %f4
828	faesdecx %f2, %f10, %f0
829	faesdecx %f4, %f12, %f2
830	ldd [$key + 16], %f10 ! round[1]
831	ldd [$key + 24], %f12
832
833	fmovd %f0, %f4
834	faesdeclx %f2, %f6, %f0
835	faesdeclx %f4, %f8, %f2
836
837	fshiftorx $outhead, %f0, $fshift, %f6
838	fshiftorx %f0, %f2, $fshift, %f8
839	std %f6, [$out + 0]
840	std %f8, [$out + 8]
841	add $out, 16, $out
842
843	brnz,a $len, .Loop_cbc_dec_unaligned_out
844	sub $len, 1, $len
845
846	.Lcbc_dec_unaligned_out_done:
847	fshiftorx %f2, %f2, $fshift, %f8
848	stda %f8, [$out + $mask]0xc0 ! partial store
849
850	st $iv0, [$ivp + 0] ! output ivec
851	st $iv0#lo, [$ivp + 4]
852	st $iv1, [$ivp + 8]
853	st $iv1#lo, [$ivp + 12]
854
855	ret
856	restore
857	.type aes_fx_cbc_encrypt,#function
858	.size aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
859	___
860	}
# CTR routine aes_fx_ctr32_encrypt_blocks, plus the data tables shared by
# every routine in this script.
861	{
# Argument registers: $inp=%i0, $out=%i1, $len=%i2, $key=%i3, $ivp=%i4
# (map() yields six values; %i5 is not bound to a name).
862	my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
# Local registers %l0..%l6 (the eighth value from map() is unused).
863	my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
# Even-numbered floating-point registers, assigned in order starting at
# %f16: $ctr0=%f16, $ctr1=%f18, ... $fshift=%f36.
864	my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
865	= map("%f$_",grep { !($_ & 1) } (16 .. 62));
# $ileft and $iright are second names for the registers behind $ialign and
# $oalign; they do not allocate new registers.
866	my ($ileft,$iright) = ($ialign, $oalign);
# Holds the counter increment loaded from .Linp_align+128, which is the
# .Lone entry at the end of the tables below (two 64-byte tables precede it).
867	my $one = "%f14";
868
# Append the routine and the trailing data to $code. The data section
# defines .Linp_align and .Lout_align (eight 8-byte fshiftorx parameter
# rows each) and .Lone.
869	$code.=<<___;
870	.globl aes_fx_ctr32_encrypt_blocks
871	.align 32
872	aes_fx_ctr32_encrypt_blocks:
873	save %sp, -STACK_FRAME-16, %sp
874	srln $len, 0, $len
875	and $inp, 7, $ialign
876	andn $inp, 7, $inp
877	brz,pn $len, .Lctr32_no_data
878	sll $ialign, 3, $ileft
879
880	.Lpic: call .+8
881	add %o7, .Linp_align - .Lpic, %o7
882
883	ld [$key + 240], $rounds
884	and $out, 7, $oalign
885	ld [$ivp + 0], $ctr0 ! load counter
886	andn $out, 7, $out
887	ld [$ivp + 4], $ctr0#lo
888	sll $oalign, 3, $mask
889	ld [$ivp + 8], $ctr1
890	ld [$ivp + 12], $ctr1#lo
891	ldd [%o7 + 128], $one
892
893	sll $rounds, 4, $rounds
894	add $rounds, $key, $end
895	ldd [$key + 0], $r0hi ! round[0]
896	ldd [$key + 8], $r0lo
897
898	add $inp, 16, $inp
899	sub $len, 1, $len
900	ldd [$key + 16], %f10 ! round[1]
901	ldd [$key + 24], %f12
902
903	mov 16, $inc
904	movrz $len, 0, $inc
905	ldd [$end + 0], $rlhi ! round[last]
906	ldd [$end + 8], $rllo
907
908	ldd [%o7 + $ileft], $fshift ! shiftleft params
909	add %o7, 64, %o7
910	ldd [$inp - 16], $in0 ! load input
911	ldd [$inp - 8], $in1
912	ldda [$inp]0x82, $intail ! non-faulting load
913	add $inp, $inc, $inp ! inp+=16
914
915	fshiftorx $in0, $in1, $fshift, $in0
916	fshiftorx $in1, $intail, $fshift, $in1
917
918	.Loop_ctr32:
919	fxor $ctr0, $r0hi, %f0 ! counter^round[0]
920	fxor $ctr1, $r0lo, %f2
921	ldd [$key + 32], %f6 ! round[2]
922	ldd [$key + 40], %f8
923	add $key, 32, $end
924	sub $rounds, 16*6, $inner
925
926	.Lctr32_enc:
927	fmovd %f0, %f4
928	faesencx %f2, %f10, %f0
929	faesencx %f4, %f12, %f2
930	ldd [$end + 16], %f10
931	ldd [$end + 24], %f12
932	add $end, 32, $end
933
934	fmovd %f0, %f4
935	faesencx %f2, %f6, %f0
936	faesencx %f4, %f8, %f2
937	ldd [$end + 0], %f6
938	ldd [$end + 8], %f8
939
940	brnz,a $inner, .Lctr32_enc
941	sub $inner, 16*2, $inner
942
943	fmovd %f0, %f4
944	faesencx %f2, %f10, %f0
945	faesencx %f4, %f12, %f2
946	ldd [$end + 16], %f10 ! round[last-1]
947	ldd [$end + 24], %f12
948
949	fmovd %f0, %f4
950	faesencx %f2, %f6, %f0
951	faesencx %f4, %f8, %f2
952	fxor $in0, $rlhi, %f6 ! inp^round[last]
953	fxor $in1, $rllo, %f8
954
955	movrz $len, 0, $inc
956	fmovd $intail, $in0
957	ldd [$inp - 8], $in1 ! load next input block
958	ldda [$inp]0x82, $intail ! non-faulting load
959	add $inp, $inc, $inp ! inp+=16
960
961	fmovd %f0, %f4
962	faesencx %f2, %f10, %f0
963	faesencx %f4, %f12, %f2
964	ldd [$key + 16], %f10 ! round[1]
965	ldd [$key + 24], %f12
966
967	fshiftorx $in0, $in1, $fshift, $in0
968	fshiftorx $in1, $intail, $fshift, $in1
969	fpadd32 $ctr1, $one, $ctr1 ! increment counter
970
971	fmovd %f0, %f4
972	faesenclx %f2, %f6, %f0
973	faesenclx %f4, %f8, %f2
974
975	brnz,pn $oalign, .Lctr32_unaligned_out
976	nop
977
978	std %f0, [$out + 0]
979	std %f2, [$out + 8]
980	add $out, 16, $out
981
982	brnz,a $len, .Loop_ctr32
983	sub $len, 1, $len
984
985	.Lctr32_no_data:
986	ret
987	restore
988
989	.align 32
990	.Lctr32_unaligned_out:
991	ldd [%o7 + $mask], $fshift ! shift right params
992	mov 0xff, $mask
993	srl $mask, $oalign, $mask
994	sub %g0, $ileft, $iright
995
996	fshiftorx %f0, %f0, $fshift, %f6
997	fshiftorx %f0, %f2, $fshift, %f8
998
999	stda %f6, [$out + $mask]0xc0 ! partial store
1000	orn %g0, $mask, $mask
1001	std %f8, [$out + 8]
1002	add $out, 16, $out
1003	brz $len, .Lctr32_unaligned_out_done
1004	sub $len, 1, $len
1005	b .Loop_ctr32_unaligned_out
1006	nop
1007
1008	.align 32
1009	.Loop_ctr32_unaligned_out:
1010	fmovd %f2, $outhead
1011	fxor $ctr0, $r0hi, %f0 ! counter^round[0]
1012	fxor $ctr1, $r0lo, %f2
1013	ldd [$key + 32], %f6 ! round[2]
1014	ldd [$key + 40], %f8
1015
1016	fmovd %f0, %f4
1017	faesencx %f2, %f10, %f0
1018	faesencx %f4, %f12, %f2
1019	ldd [$key + 48], %f10 ! round[3]
1020	ldd [$key + 56], %f12
1021
1022	ldx [$inp - 16], %o0
1023	ldx [$inp - 8], %o1
1024	brz $ileft, .Lctr32_aligned_inp
1025	movrz $len, 0, $inc
1026
1027	ldx [$inp], %o2
1028	sllx %o0, $ileft, %o0
1029	srlx %o1, $iright, %g1
1030	sllx %o1, $ileft, %o1
1031	or %g1, %o0, %o0
1032	srlx %o2, $iright, %o2
1033	or %o2, %o1, %o1
1034
1035	.Lctr32_aligned_inp:
1036	fmovd %f0, %f4
1037	faesencx %f2, %f6, %f0
1038	faesencx %f4, %f8, %f2
1039	ldd [$key + 64], %f6 ! round[4]
1040	ldd [$key + 72], %f8
1041	add $key, 64, $end
1042	sub $rounds, 16*8, $inner
1043
1044	stx %o0, [%sp + LOCALS + 0]
1045	stx %o1, [%sp + LOCALS + 8]
1046	add $inp, $inc, $inp ! inp+=16
1047	nop
1048
1049	.Lctr32_enc_unaligned:
1050	fmovd %f0, %f4
1051	faesencx %f2, %f10, %f0
1052	faesencx %f4, %f12, %f2
1053	ldd [$end + 16], %f10
1054	ldd [$end + 24], %f12
1055	add $end, 32, $end
1056
1057	fmovd %f0, %f4
1058	faesencx %f2, %f6, %f0
1059	faesencx %f4, %f8, %f2
1060	ldd [$end + 0], %f6
1061	ldd [$end + 8], %f8
1062
1063	brnz,a $inner, .Lctr32_enc_unaligned
1064	sub $inner, 16*2, $inner
1065
1066	fmovd %f0, %f4
1067	faesencx %f2, %f10, %f0
1068	faesencx %f4, %f12, %f2
1069	ldd [$end + 16], %f10 ! round[last-1]
1070	ldd [$end + 24], %f12
1071	fpadd32 $ctr1, $one, $ctr1 ! increment counter
1072
1073	fmovd %f0, %f4
1074	faesencx %f2, %f6, %f0
1075	faesencx %f4, %f8, %f2
1076	fxor $in0, $rlhi, %f6 ! inp^round[last]
1077	fxor $in1, $rllo, %f8
1078	ldd [%sp + LOCALS + 0], $in0
1079	ldd [%sp + LOCALS + 8], $in1
1080
1081	fmovd %f0, %f4
1082	faesencx %f2, %f10, %f0
1083	faesencx %f4, %f12, %f2
1084	ldd [$key + 16], %f10 ! round[1]
1085	ldd [$key + 24], %f12
1086
1087	fmovd %f0, %f4
1088	faesenclx %f2, %f6, %f0
1089	faesenclx %f4, %f8, %f2
1090
1091	fshiftorx $outhead, %f0, $fshift, %f6
1092	fshiftorx %f0, %f2, $fshift, %f8
1093	std %f6, [$out + 0]
1094	std %f8, [$out + 8]
1095	add $out, 16, $out
1096
1097	brnz,a $len, .Loop_ctr32_unaligned_out
1098	sub $len, 1, $len
1099
1100	.Lctr32_unaligned_out_done:
1101	fshiftorx %f2, %f2, $fshift, %f8
1102	stda %f8, [$out + $mask]0xc0 ! partial store
1103
1104	ret
1105	restore
1106	.type aes_fx_ctr32_encrypt_blocks,#function
1107	.size aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks
1108
1109	.align 32
1110	.Linp_align: ! fshiftorx parameters for left shift toward %rs1
1111	.byte 0, 0, 64, 0, 0, 64, 0, -64
1112	.byte 0, 0, 56, 8, 0, 56, 8, -56
1113	.byte 0, 0, 48, 16, 0, 48, 16, -48
1114	.byte 0, 0, 40, 24, 0, 40, 24, -40
1115	.byte 0, 0, 32, 32, 0, 32, 32, -32
1116	.byte 0, 0, 24, 40, 0, 24, 40, -24
1117	.byte 0, 0, 16, 48, 0, 16, 48, -16
1118	.byte 0, 0, 8, 56, 0, 8, 56, -8
1119	.Lout_align: ! fshiftorx parameters for right shift toward %rs2
1120	.byte 0, 0, 0, 64, 0, 0, 64, 0
1121	.byte 0, 0, 8, 56, 0, 8, 56, -8
1122	.byte 0, 0, 16, 48, 0, 16, 48, -16
1123	.byte 0, 0, 24, 40, 0, 24, 40, -24
1124	.byte 0, 0, 32, 32, 0, 32, 32, -32
1125	.byte 0, 0, 40, 24, 0, 40, 24, -40
1126	.byte 0, 0, 48, 16, 0, 48, 16, -48
1127	.byte 0, 0, 56, 8, 0, 56, 8, -56
1128	.Lone:
1129	.word 0, 1
1130	.asciz "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
1131	.align 4
1132	___
1133	}
1134	# The purpose of these subroutines is to explicitly encode VIS instructions,
1135	# so that one can compile the module without having to specify VIS
1136	# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1137	# The idea is to keep open the option of producing a "universal" binary
1138	# and letting the programmer detect at run-time whether the CPU is VIS-capable.
# unvis($mnemonic, $rs1, $rs2, $rd)
#
# Encode a three-operand VIS floating-point instruction as a raw ".word"
# directive so the assembler does not need to know the mnemonic.
#
#   $mnemonic        instruction name; only the names in %visopf are encoded
#   $rs1, $rs2, $rd  operands, each expected to look like "%fN"
#
# Returns ".word\t0x........ !<original text>" on success. Returns the
# original instruction text unchanged when the mnemonic is not in the table,
# an operand is not an %f register, or an operand is an odd-numbered
# register at or above %f32 (not addressable as a double register).
sub unvis {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my %visopf = (	"faligndata"	=> 0x048,
			"bshuffle"	=> 0x04c,
			"fpadd32"	=> 0x052,
			"fxor"		=> 0x06c,
			"fsrc2"		=> 0x078	);

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless ($opf);

    # The loop variable aliases the operands, so each "%fN" string is
    # replaced in place by its 5-bit register field.
    foreach my $reg ($rs1,$rs2,$rd) {
	return $ref if ($reg !~ /%f([0-9]{1,2})/);
	my $num = $1;
	if ($num>=32) {
	    return $ref if ($num&1);
	    # re-encode for upper double register addressing
	    $num = ($num|$num>>5)&31;
	}
	$reg = $num;
    }

    return sprintf ".word\t0x%08x !%s",
		   0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
		   $ref;
}
1168
# unvis3($mnemonic, $rs1, $rs2, $rd)
#
# Encode a VIS instruction that takes integer registers (alignaddr, bmask,
# alignaddrl) as a raw ".word" directive.
#
#   $mnemonic        instruction name; only the names in %visopf are encoded
#   $rs1, $rs2, $rd  operands, each expected to look like %g/%o/%l/%i + digit
#
# Returns ".word\t0x........ !<original text>" on success, or the original
# instruction text unchanged when the mnemonic is not in the table or an
# operand is not an integer register.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # Register-number offset of each integer register group.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf = (	"alignaddr"	=> 0x018,
			"bmask"		=> 0x019,
			"alignaddrl"	=> 0x01a	);

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless ($opf);

    # The loop variable aliases the operands, so each register name is
    # replaced in place by its 5-bit register number.
    foreach my $reg ($rs1,$rs2,$rd) {
	return $ref if ($reg !~ /%([goli])([0-9])/);
	$reg = $bias{$1}+$2;
    }

    return sprintf ".word\t0x%08x !%s",
		   0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
		   $ref;
}
1192
# unfx($mnemonic, $rs1, $rs2, $rd)
#
# Encode one of the faes* instructions listed in %aesopf as a raw ".word"
# directive.
#
#   $mnemonic   instruction name; only the names in %aesopf are encoded
#   $rs1, $rd   operands, each expected to look like "%fN"
#   $rs2        either an even "%fN" register or a numeric immediate
#               (a value with a leading 0 is converted with oct(), which
#               also accepts the "0x.." form)
#
# Returns ".word\t0x........ !<original text>" on success. Returns the
# original instruction text unchanged when the mnemonic is not in the table,
# $rs1/$rd is not an %f register, or $rs1/$rd is an odd-numbered register at
# or above %f32.
sub unfx {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my %aesopf = (	"faesencx"	=> 0x90,
			"faesdecx"	=> 0x91,
			"faesenclx"	=> 0x92,
			"faesdeclx"	=> 0x93,
			"faeskeyx"	=> 0x94	);

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $aesopf{$mnemonic};
    return $ref unless (defined($opf));

    # A register operand is folded to its 5-bit field; anything else is
    # treated as an immediate.
    if ($rs2 =~ /%f([0-6]*[02468])/) {
	$rs2 = ($1|$1>>5)&31;
    }
    $rs2 = oct($rs2) if ($rs2 =~ /^0/);

    # The loop variable aliases the operands, so each "%fN" string is
    # replaced in place by its 5-bit register field.
    foreach my $reg ($rs1,$rd) {
	return $ref if ($reg !~ /%f([0-9]{1,2})/);
	my $num = $1;
	if ($num>=32) {
	    return $ref if ($num&1);
	    # re-encode for upper double register addressing
	    $num = ($num|$num>>5)&31;
	}
	$reg = $num;
    }

    return sprintf ".word\t0x%08x !%s",
		   2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
		   $ref;
}
1225
# unfx3src($mnemonic, $rs1, $rs2, $rs3, $rd)
#
# Encode a four-operand instruction (currently only fshiftorx) as a raw
# ".word" directive.
#
#   $mnemonic              instruction name; only names in %aesopf are encoded
#   $rs1, $rs2, $rs3, $rd  operands, each expected to look like "%fN"
#
# Returns ".word\t0x........ !<original text>" on success. Returns the
# original instruction text unchanged when the mnemonic is not in the table,
# an operand is not an %f register, or an operand is an odd-numbered
# register at or above %f32.
sub unfx3src {
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
    my %aesopf = (	"fshiftorx"	=> 0x0b	);

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my $opf = $aesopf{$mnemonic};
    return $ref unless (defined($opf));

    # The loop variable aliases the operands, so each "%fN" string is
    # replaced in place by its 5-bit register field.
    foreach my $reg ($rs1,$rs2,$rs3,$rd) {
	return $ref if ($reg !~ /%f([0-9]{1,2})/);
	my $num = $1;
	if ($num>=32) {
	    return $ref if ($num&1);
	    # re-encode for upper double register addressing
	    $num = ($num|$num>>5)&31;
	}
	$reg = $num;
    }

    return sprintf ".word\t0x%08x !%s",
		   2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
		   $ref;
}
1251
# Post-process the accumulated template one line at a time and print it.
foreach (split("\n",$code)) {
    # Evaluate back-ticked arithmetic left in the template (for example
    # `14*16`). The text comes from this script's own here-documents.
    s/\`([^\`]*)\`/eval $1/ge;

    # "%fN#lo" names the single-precision register that follows %fN.
    s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    # Replace instructions the assembler may not know with ".word"
    # encodings. The first pattern that matches wins. Mnemonics are matched
    # at full length and the whitespace after each comma is optional.
    s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		unfx($1,$2,$3,$4)
     /ge or
    s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		unfx3src($1,$2,$3,$4,$5)
     /ge or
    s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		unvis($1,$2,$3,$4)
     /ge or
    s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		unvis3($1,$2,$3,$4)
     /ge;
    print $_,"\n";
}

# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/aes/asm/aesfx-sparcv9.pl@ 102427

Download in other formats: