sha1-sparcv9.pl@ 101021

Last change on this file since 101021 was 101021, checked in by vboxsync, 18 months ago
openssl-3.1.2: Applied and adjusted our OpenSSL changes to 3.1.0. bugref:10519
File size: 9.3 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2007-2021 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the Apache License 2.0 (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	#
16	# Hardware SPARC T4 support by David S. Miller
17	# ====================================================================
18
19	# Performance improvement is not really impressive on pre-T1 CPU: +8%
20	# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
21	# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
22	# >2x faster than 64-bit code generated by gcc 3.4. And there is a gimmick.
23	# X[16] vector is packed to 8 64-bit registers and as result nothing
24	# is spilled on stack. In addition input data is loaded in compact
25	# instruction sequence, thus minimizing the window when the code is
26	# subject to [inter-thread] cache-thrashing hazard. The goal is to
27	# ensure scalability on UltraSPARC T1, or rather to avoid decay when
28	# amount of active threads exceeds the number of physical cores.
29
30	# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
31	# faster than software. Multi-process benchmark saturates at 11x
32	# single-process result on 8-core processor, or ~9GBps per 2.85GHz
33	# socket.
34
# The last command-line argument, when present and true, names the file that
# receives the generated assembly; without it the inherited STDOUT is used.
$output=pop;
if ($output) {
    # Three-argument open: the name is never parsed for mode characters,
    # and a failed open is reported immediately.
    open STDOUT,'>',$output or die "can't open $output: $!";
}

# Register assignment for the software code path.  These stay package
# globals because the round generators interpolate them by name.

# X[16] packed two 32-bit words per 64-bit register.
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";		# mask (1<<32)|1 used by Xupdate's per-half rotate
$tmp64="%g3";		# extra doubleword fetched for unaligned input
$Xi="%g4";		# scratch for the message word of the current round
# The five working variables; @V is rotated after every round.
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
# Round constants, selected per round as @K[$i/20].
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);

# Incoming arguments (after "save") and three temporaries.
$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
59
# Emit the instructions for one of rounds 0..15 and append them to $code.
#
#   $i       round number
#   $a..$e   names of the registers currently holding the working variables
#
# The emitted code adds to $e: the round constant @K[$i/20], $a shifted left
# by 5 and right by 27 (the two halves of a 5-bit rotate), the message word,
# and (b AND c) OR (d AND NOT b).  $b is replaced by itself rotated left by
# 30.  Lines tagged "!!" mark the start of a round in the output.
sub BODY_00_15 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    # Odd rounds add the packed @X register itself; even rounds add $Xi,
    # which holds a packed register shifted right by 32.
    my $xi=($i&1)?@X[($i/2)%8]:$Xi;

    $code.=<<___;
sll $a,5,$tmp0 !! $i
add @K[$i/20],$e,$e
srl $a,27,$tmp1
add $tmp0,$e,$e
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
andn $d,$b,$tmp1
srl $b,2,$b
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $xi,$e,$e
___
    if ($i&1 && $i<15) {
        # Stage the upper half of the next packed register in $Xi for the
        # even round that follows.
        $code.=
            " srlx @X[(($i+1)/2)%8],32,$Xi\n";
    }
    $code.=<<___;
add $tmp1,$e,$e
___
}
86
# Emit the message-schedule update used by rounds 16..79, interleaved with
# the first three instructions of the round (tagged "!!"), and append the
# text to $code.
#
#   $i       round number
#   $a, $e   registers read/updated by the interleaved round instructions
#   $b..$d   accepted for a uniform call signature; not used here
#
# The schedule is packed two words per register, so only even rounds update
# it, and they update both halves of @X[$j%8] at once:
#   X[j] ^= X[j+1] ^ X[j+4] ^ (X[j+6]<<32 | X[j+7]>>32)
# followed by a rotate-left-by-1 of each 32-bit half, done as a shift left
# by one (add x,x,x) combined with the bits shifted right by 31, using the
# $rot1m mask to keep the halves apart.
sub Xupdate {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i/2;		# packed-register index; only used when $i is even

    if ($i&1) {
        # Odd round: the pair was already updated by the preceding even
        # round, so only the round-start instructions are emitted.
        $code.=<<___;
sll $a,5,$tmp0 !! $i
add @K[$i/20],$e,$e
srl $a,27,$tmp1
___
    } else {
        $code.=<<___;
sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
srlx @X[($j+7)%8],32,$tmp1
xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
sll $a,5,$tmp0 !! $i
or $tmp1,$Xi,$Xi
add @K[$i/20],$e,$e !!
xor $Xi,@X[$j%8],@X[$j%8]
srlx @X[$j%8],31,$Xi
add @X[$j%8],@X[$j%8],@X[$j%8]
and $Xi,$rot1m,$Xi
andn @X[$j%8],$rot1m,@X[$j%8]
srl $a,27,$tmp1 !!
or $Xi,@X[$j%8],@X[$j%8]
___
    }
}
116
# Emit one of rounds 16..19: the schedule update plus round start from
# Xupdate, then the rest of a round with the same function as rounds 0..15,
# (b AND c) OR (d AND NOT b).  Text is appended to $code.
#
#   $i       round number
#   $a..$e   names of the registers currently holding the working variables
sub BODY_16_19 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    # Declared lexically, as in BODY_20_39 and BODY_40_59, so the choice
    # below does not leak into a package global.
    my $xi;

    Xupdate(@_);
    if ($i&1) {
        # Odd round: add the packed register itself.
        $xi=@X[($i/2)%8];
    } else {
        # Even round: bring the upper half down into $Xi first.
        $xi=$Xi;
        $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
    }
    $code.=<<___;
add $tmp0,$e,$e !!
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
add $xi,$e,$e
andn $d,$b,$tmp1
srl $b,2,$b
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
___
}
140
# Emit one round with the parity function b XOR c XOR d: the schedule
# update plus round start from Xupdate, then the remainder of the round.
# The driver uses this generator for rounds 20..39 and again for 60..79;
# the round constant follows from @K[$i/20] inside Xupdate.  Text is
# appended to $code.
#
#   $i       round number
#   $a..$e   names of the registers currently holding the working variables
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $xi;
    Xupdate(@_);
    if ($i&1) {
        # Odd round: add the packed register itself.
        $xi=@X[($i/2)%8];
    } else {
        # Even round: bring the upper half down into $Xi first.
        $xi=$Xi;
        $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
    }
    $code.=<<___;
add $tmp0,$e,$e !!
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $xi,$e,$e
___
}
163
# Emit one of rounds 40..59: the schedule update plus round start from
# Xupdate, then the remainder of the round with the majority function,
# computed as (b AND c) OR (d AND (b OR c)).  Text is appended to $code.
#
#   $i       round number
#   $a..$e   names of the registers currently holding the working variables
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $xi;
    Xupdate(@_);
    if ($i&1) {
        # Odd round: add the packed register itself.
        $xi=@X[($i/2)%8];
    } else {
        # Even round: bring the upper half down into $Xi first.
        $xi=$Xi;
        $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
    }
    $code.=<<___;
add $tmp0,$e,$e !!
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
or $c,$b,$tmp1
srl $b,2,$b
and $d,$tmp1,$tmp1
add $xi,$e,$e
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
___
}
188
# Module prologue, the hardware path, and the start of the software path.
#
# - sha1_block_data_order tests the CFR_SHA1 bit of OPENSSL_sparcv9cap_P[1]
#   and branches to .Lsoftware when it is clear.
# - .Lhw_loop handles input whose address has the low three bits clear: each
#   pass loads one 64-byte block into %f8..%f23 and issues the SHA1 opcode
#   as a raw .word.
# - .Lhwunaligned handles other addresses with alignaddr/faligndata.  Those
#   mnemonics, and the "for" used as a register move in the branch delay
#   slot, are meant to be re-encoded as .word by the post-processing loop
#   at the end of this script.
# - .Lsoftware opens a register window, turns the block count in $len into
#   an end pointer, builds the $rot1m mask, loads the context and the four
#   round constants.  .Lloop then fetches a block from the input address
#   rounded down to 8 bytes and leaves the shift amounts for the unaligned
#   fix-up in $tmp1/$tmp2.
#
# Everything between the heredoc markers is emitted verbatim (after variable
# interpolation), so no comments may be added inside it.
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif

.section ".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.align 32
.globl sha1_block_data_order
sha1_block_data_order:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]

andcc %g1, CFR_SHA1, %g0
be .Lsoftware
nop

ld [%o0 + 0x00], %f0 ! load context
ld [%o0 + 0x04], %f1
ld [%o0 + 0x08], %f2
andcc %o1, 0x7, %g0
ld [%o0 + 0x0c], %f3
bne,pn %icc, .Lhwunaligned
ld [%o0 + 0x10], %f4

.Lhw_loop:
ldd [%o1 + 0x00], %f8
ldd [%o1 + 0x08], %f10
ldd [%o1 + 0x10], %f12
ldd [%o1 + 0x18], %f14
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20

.word 0x81b02820 ! SHA1

bne,pt SIZE_T_CC, .Lhw_loop
nop

.Lhwfinish:
st %f0, [%o0 + 0x00] ! store context
st %f1, [%o0 + 0x04]
st %f2, [%o0 + 0x08]
st %f3, [%o0 + 0x0c]
retl
st %f4, [%o0 + 0x10]

.align 8
.Lhwunaligned:
alignaddr %o1, %g0, %o1

ldd [%o1 + 0x00], %f10
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f12
ldd [%o1 + 0x10], %f14
ldd [%o1 + 0x18], %f16
ldd [%o1 + 0x20], %f18
ldd [%o1 + 0x28], %f20
ldd [%o1 + 0x30], %f22
ldd [%o1 + 0x38], %f24
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x40], %f26
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20

faligndata %f10, %f12, %f8
faligndata %f12, %f14, %f10
faligndata %f14, %f16, %f12
faligndata %f16, %f18, %f14
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22

.word 0x81b02820 ! SHA1

bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26

ba .Lhwfinish
nop

.align 16
.Lsoftware:
save %sp,-STACK_FRAME,%sp
sllx $len,6,$len
add $inp,$len,$len

or %g0,1,$rot1m
sllx $rot1m,32,$rot1m
or $rot1m,1,$rot1m

ld [$ctx+0],$A
ld [$ctx+4],$B
ld [$ctx+8],$C
ld [$ctx+12],$D
ld [$ctx+16],$E
andn $inp,7,$tmp0

sethi %hi(0x5a827999),$K_00_19
or $K_00_19,%lo(0x5a827999),$K_00_19
sethi %hi(0x6ed9eba1),$K_20_39
or $K_20_39,%lo(0x6ed9eba1),$K_20_39
sethi %hi(0x8f1bbcdc),$K_40_59
or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
sethi %hi(0xca62c1d6),$K_60_79
or $K_60_79,%lo(0xca62c1d6),$K_60_79

.Lloop:
ldx [$tmp0+0],@X[0]
ldx [$tmp0+16],@X[2]
ldx [$tmp0+32],@X[4]
ldx [$tmp0+48],@X[6]
and $inp,7,$tmp1
ldx [$tmp0+8],@X[1]
sll $tmp1,3,$tmp1
ldx [$tmp0+24],@X[3]
subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
ldx [$tmp0+40],@X[5]
bz,pt %icc,.Laligned
ldx [$tmp0+56],@X[7]

sllx @X[0],$tmp1,@X[0]
ldx [$tmp0+64],$tmp64
___
# Fix-up for input that is not 8-byte aligned (reached when the preceding
# bz to .Laligned is not taken).  Each packed register is shifted left by
# $tmp1 bits and topped up with the bits shifted out of its successor
# (shifted right by $tmp2); the last register is completed from the extra
# doubleword in $tmp64.  At .Laligned the upper half of @X[0] is staged in
# $Xi for round 0.
for($i=0;$i<7;$i++)
{   $code.=<<___;
srlx @X[$i+1],$tmp2,$Xi
sllx @X[$i+1],$tmp1,@X[$i+1]
or $Xi,@X[$i],@X[$i]
___
}
$code.=<<___;
srlx $tmp64,$tmp2,$tmp64
or $tmp64,@X[7],@X[7]
.Laligned:
srlx @X[0],32,$Xi
___
# Generate all 80 rounds.  $i runs on from one loop to the next, and after
# every round @V is rotated by one position, (A,B,C,D,E) -> (E,A,B,C,D), so
# the register roles shift without any move instructions being emitted.
# Rounds 60..79 reuse BODY_20_39.
for ($i=0;$i<16;$i++)	{ BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++)	{ BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# End of one block in the software path: reload the previous context words
# into @X[0..4] (the schedule registers are free at this point), add them to
# the working variables and store the sums back.  $inp advances by 64 and
# the loop repeats until it equals the end pointer in $len; the delay slot
# of the branch recomputes the 8-byte-aligned load address in $tmp0.  The
# function then returns through ret/restore, followed by the symbol type and
# size directives and the identification string.
#
# Everything between the heredoc markers is emitted verbatim (after variable
# interpolation), so no comments may be added inside it.
$code.=<<___;

ld [$ctx+0],@X[0]
ld [$ctx+4],@X[1]
ld [$ctx+8],@X[2]
ld [$ctx+12],@X[3]
add $inp,64,$inp
ld [$ctx+16],@X[4]
cmp $inp,$len

add $A,@X[0],$A
st $A,[$ctx+0]
add $B,@X[1],$B
st $B,[$ctx+4]
add $C,@X[2],$C
st $C,[$ctx+8]
add $D,@X[3],$D
st $D,[$ctx+12]
add $E,@X[4],$E
st $E,[$ctx+16]

bne SIZE_T_CC,.Lloop
andn $inp,7,$tmp0

ret
restore
.type sha1_block_data_order,#function
.size sha1_block_data_order,(.-sha1_block_data_order)
.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
377
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.

# Encode a three-operand VIS floating-point instruction as a ".word".
#
#   $mnemonic        instruction name; only "faligndata" and "for" are encoded
#   $rs1, $rs2, $rd  operands written as %fN
#
# Returns ".word\t0x<encoding> !<original text>" when the instruction can be
# encoded, otherwise the original text "$mnemonic\t$rs1,$rs2,$rd" unchanged
# (unknown mnemonic, an operand that is not %fN, or an odd register number
# of 32 or above).
sub unvis {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # Parenthesised so that both variables are lexical; "my $ref,$opf"
    # would declare only $ref.
    my ($ref,$opf);
    my %visopf = (	"faligndata"	=> 0x048,
			"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        # $_ aliases each operand in turn, so assigning to it replaces the
        # register name by its 5-bit field value.
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
        return $ref;
    }
}
# Encode an "alignaddr" instruction as a ".word".
#
#   $mnemonic        instruction name, used only for the trailing comment
#   $rs1, $rs2, $rd  integer registers written as %gN, %oN, %lN or %iN
#
# Returns ".word\t0x<encoding> !<original text>", or the original text
# "$mnemonic\t$rs1,$rs2,$rd" unchanged when an operand is not such a
# register.
sub unalignaddr {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # Register number = bank offset + index within the bank.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my $ref="$mnemonic\t$rs1,$rs2,$rd";

    # $_ aliases each operand in turn, so assigning to it replaces the
    # register name by its number.
    foreach ($rs1,$rs2,$rd) {
        if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
        else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}
422
# Post-process the generated text line by line and print it.
#
# 1. Text between backticks is replaced by the result of evaluating it as
#    Perl.
# 2. Three-operand instructions whose mnemonic starts with "f" and whose
#    operands are all %fN are passed to unvis().
# 3. "alignaddr" with three integer-register operands is passed to
#    unalignaddr().
#
# The mnemonic pattern is f[^\s]* so that it covers whole names such as
# "faligndata" and "for", and whitespace after each comma is optional.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
        &unvis($1,$2,$3,$4)
     /ge;
    s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
        &unalignaddr($1,$2,$3,$4)
     /ge;

    print $_,"\n";
}

# Buffered write errors only show up when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-3.1.2/crypto/sha/asm/sha1-sparcv9.pl@ 101021

Download in other formats: