ghash-sparcv9.pl@ 69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago
Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified. bugref:8070: src/libs maintenance
Property svn:eol-style set to `LF` Property svn:executable set to ``*
File size: 12.7 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the OpenSSL license (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	# ====================================================================
16
17	# March 2010
18	#
19	# The module implements "4-bit" GCM GHASH function and underlying
20	# single multiplication operation in GF(2^128). "4-bit" means that it
21	# uses 256 bytes per-key table [+128 bytes shared table]. Performance
22	# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
23	# and are expressed in cycles per processed byte, less is better:
24	#
25	#                 gcc 3.3.x    cc 5.2    this assembler
26	#
27	# 32-bit build    81.4         43.3      12.6 (+546%/+244%)
28	# 64-bit build    20.2         21.2      12.6 (+60%/+68%)
29	#
30	# Here is data collected on UltraSPARC T1 system running Linux:
31	#
32	#                 gcc 4.4.1    this assembler
33	#
34	# 32-bit build    566          50 (+1000%)
35	# 64-bit build    56           50 (+12%)
36	#
37	# I don't quite understand why difference between 32-bit and 64-bit
38	# compiler-generated code is so big. Compilers were instructed to
39	# generate code for UltraSPARC and should have used 64-bit registers
40	# for Z vector (see C code) even in 32-bit build... Oh well, it only
41	# means more impressive improvement coefficients for this assembler
42	# module;-) Loops are aggressively modulo-scheduled in respect to
43	# references to input data and Z.hi updates to achieve 12 cycles
44	# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
45	# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
46	#
47	# October 2012
48	#
49	# Add VIS3 lookup-table-free implementation using polynomial
50	# multiplication xmulx[hi] and extended addition addxc[cc]
51	# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
52	# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
53	# saturates at ~15.5x single-process result on 8-core processor,
54	# or ~20.5GBps per 2.85GHz socket.
55
# The last command-line argument names the file that receives the
# generated assembly. STDOUT is redirected there so that the emit loop
# at the end of this file can simply print. The three-argument form of
# open keeps characters in the file name from being read as a mode, and
# a failure is reported instead of silently writing somewhere else.
# When no file name is given STDOUT is left untouched.
$output=pop;
if (defined($output) && length($output)) {
    open(STDOUT,'>',$output) or die "can't open $output: $!";
}
58
# Symbolic stack frame size and bias; the names are interpolated into
# the generated assembly as-is.
($frame,$bias)=("STACK_FRAME","STACK_BIAS");

# 64-bit values live in the "out" registers %o0..%o5.
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp)=map("%o$_",(0..5));

# Small values and pointers live in the "local" registers %l0..%l7.
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt)=map("%l$_",(0..7));

# Input argument block: the "in" registers %i0..%i3.
($Xi,$Htbl,$inp,$len)=map("%i$_",(0..3));
82
# Table-driven "4-bit" code path, part 1: the shared rem_4bit reduction
# table followed by gcm_ghash_4bit. The register variables interpolated
# below ($Xi, $Htbl, $inp, $len, $Zhi, ...) are the names assigned
# earlier in this file. Expressions enclosed in backticks (the rem_4bit
# table entries) are not evaluated here; the emit loop at the end of the
# file runs them through eval when printing.
#
# gcm_ghash_4bit turns $len into an end pointer ($inp+$len) up front and
# advances $inp by 16 per .Louter iteration until the two are equal; the
# result is stored to [$Xi] and [$Xi+8].
83	$code.=<<___;
84	#include "sparc_arch.h"
85
86	#ifdef __arch64__
87	.register %g2,#scratch
88	.register %g3,#scratch
89	#endif
90
91	.section ".text",#alloc,#execinstr
92
93	.align 64
94	rem_4bit:
95	.long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
96	.long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
97	.long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
98	.long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
99	.type rem_4bit,#object
100	.size rem_4bit,(.-rem_4bit)
101
102	.globl gcm_ghash_4bit
103	.align 32
104	gcm_ghash_4bit:
105	save %sp,-$frame,%sp
106	ldub [$inp+15],$nlo
107	ldub [$Xi+15],$xi0
108	ldub [$Xi+14],$xi1
109	add $len,$inp,$len
110	add $Htbl,8,$Htblo
111
112	1: call .+8
113	add %o7,rem_4bit-1b,$rem_4bit
114
115	.Louter:
116	xor $xi0,$nlo,$nlo
117	and $nlo,0xf0,$nhi
118	and $nlo,0x0f,$nlo
119	sll $nlo,4,$nlo
120	ldx [$Htblo+$nlo],$Zlo
121	ldx [$Htbl+$nlo],$Zhi
122
123	ldub [$inp+14],$nlo
124
125	ldx [$Htblo+$nhi],$Tlo
126	and $Zlo,0xf,$remi
127	ldx [$Htbl+$nhi],$Thi
128	sll $remi,3,$remi
129	ldx [$rem_4bit+$remi],$rem
130	srlx $Zlo,4,$Zlo
131	mov 13,$cnt
132	sllx $Zhi,60,$tmp
133	xor $Tlo,$Zlo,$Zlo
134	srlx $Zhi,4,$Zhi
135	xor $Zlo,$tmp,$Zlo
136
137	xor $xi1,$nlo,$nlo
138	and $Zlo,0xf,$remi
139	and $nlo,0xf0,$nhi
140	and $nlo,0x0f,$nlo
141	ba .Lghash_inner
142	sll $nlo,4,$nlo
143	.align 32
144	.Lghash_inner:
145	ldx [$Htblo+$nlo],$Tlo
146	sll $remi,3,$remi
147	xor $Thi,$Zhi,$Zhi
148	ldx [$Htbl+$nlo],$Thi
149	srlx $Zlo,4,$Zlo
150	xor $rem,$Zhi,$Zhi
151	ldx [$rem_4bit+$remi],$rem
152	sllx $Zhi,60,$tmp
153	xor $Tlo,$Zlo,$Zlo
154	ldub [$inp+$cnt],$nlo
155	srlx $Zhi,4,$Zhi
156	xor $Zlo,$tmp,$Zlo
157	ldub [$Xi+$cnt],$xi1
158	xor $Thi,$Zhi,$Zhi
159	and $Zlo,0xf,$remi
160
161	ldx [$Htblo+$nhi],$Tlo
162	sll $remi,3,$remi
163	xor $rem,$Zhi,$Zhi
164	ldx [$Htbl+$nhi],$Thi
165	srlx $Zlo,4,$Zlo
166	ldx [$rem_4bit+$remi],$rem
167	sllx $Zhi,60,$tmp
168	xor $xi1,$nlo,$nlo
169	srlx $Zhi,4,$Zhi
170	and $nlo,0xf0,$nhi
171	addcc $cnt,-1,$cnt
172	xor $Zlo,$tmp,$Zlo
173	and $nlo,0x0f,$nlo
174	xor $Tlo,$Zlo,$Zlo
175	sll $nlo,4,$nlo
176	blu .Lghash_inner
177	and $Zlo,0xf,$remi
178
179	ldx [$Htblo+$nlo],$Tlo
180	sll $remi,3,$remi
181	xor $Thi,$Zhi,$Zhi
182	ldx [$Htbl+$nlo],$Thi
183	srlx $Zlo,4,$Zlo
184	xor $rem,$Zhi,$Zhi
185	ldx [$rem_4bit+$remi],$rem
186	sllx $Zhi,60,$tmp
187	xor $Tlo,$Zlo,$Zlo
188	srlx $Zhi,4,$Zhi
189	xor $Zlo,$tmp,$Zlo
190	xor $Thi,$Zhi,$Zhi
191
192	add $inp,16,$inp
193	cmp $inp,$len
194	be,pn SIZE_T_CC,.Ldone
195	and $Zlo,0xf,$remi
196
197	ldx [$Htblo+$nhi],$Tlo
198	sll $remi,3,$remi
199	xor $rem,$Zhi,$Zhi
200	ldx [$Htbl+$nhi],$Thi
201	srlx $Zlo,4,$Zlo
202	ldx [$rem_4bit+$remi],$rem
203	sllx $Zhi,60,$tmp
204	xor $Tlo,$Zlo,$Zlo
205	ldub [$inp+15],$nlo
206	srlx $Zhi,4,$Zhi
207	xor $Zlo,$tmp,$Zlo
208	xor $Thi,$Zhi,$Zhi
209	stx $Zlo,[$Xi+8]
210	xor $rem,$Zhi,$Zhi
211	stx $Zhi,[$Xi]
212	srl $Zlo,8,$xi1
213	and $Zlo,0xff,$xi0
214	ba .Louter
215	and $xi1,0xff,$xi1
216	.align 32
217	.Ldone:
218	ldx [$Htblo+$nhi],$Tlo
219	sll $remi,3,$remi
220	xor $rem,$Zhi,$Zhi
221	ldx [$Htbl+$nhi],$Thi
222	srlx $Zlo,4,$Zlo
223	ldx [$rem_4bit+$remi],$rem
224	sllx $Zhi,60,$tmp
225	xor $Tlo,$Zlo,$Zlo
226	srlx $Zhi,4,$Zhi
227	xor $Zlo,$tmp,$Zlo
228	xor $Thi,$Zhi,$Zhi
229	stx $Zlo,[$Xi+8]
230	xor $rem,$Zhi,$Zhi
231	stx $Zhi,[$Xi]
232
233	ret
234	restore
235	.type gcm_ghash_4bit,#function
236	.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
237	___
238
# The heredoc that follows (gcm_gmult_4bit) references neither the input
# pointer nor the length, so both names are cleared here.
undef $_ foreach ($inp,$len);
241
# Table-driven "4-bit" code path, part 2: gcm_gmult_4bit. It reads the
# 16 bytes at $Xi (starting with byte 15 and counting $cnt down from
# 13), looks them up in $Htbl/$Htblo and rem_4bit, and stores the result
# back to [$Xi] and [$Xi+8]. The loop structure mirrors gcm_ghash_4bit
# but there is no input stream to xor into Xi.
242	$code.=<<___;
243	.globl gcm_gmult_4bit
244	.align 32
245	gcm_gmult_4bit:
246	save %sp,-$frame,%sp
247	ldub [$Xi+15],$nlo
248	add $Htbl,8,$Htblo
249
250	1: call .+8
251	add %o7,rem_4bit-1b,$rem_4bit
252
253	and $nlo,0xf0,$nhi
254	and $nlo,0x0f,$nlo
255	sll $nlo,4,$nlo
256	ldx [$Htblo+$nlo],$Zlo
257	ldx [$Htbl+$nlo],$Zhi
258
259	ldub [$Xi+14],$nlo
260
261	ldx [$Htblo+$nhi],$Tlo
262	and $Zlo,0xf,$remi
263	ldx [$Htbl+$nhi],$Thi
264	sll $remi,3,$remi
265	ldx [$rem_4bit+$remi],$rem
266	srlx $Zlo,4,$Zlo
267	mov 13,$cnt
268	sllx $Zhi,60,$tmp
269	xor $Tlo,$Zlo,$Zlo
270	srlx $Zhi,4,$Zhi
271	xor $Zlo,$tmp,$Zlo
272
273	and $Zlo,0xf,$remi
274	and $nlo,0xf0,$nhi
275	and $nlo,0x0f,$nlo
276	ba .Lgmult_inner
277	sll $nlo,4,$nlo
278	.align 32
279	.Lgmult_inner:
280	ldx [$Htblo+$nlo],$Tlo
281	sll $remi,3,$remi
282	xor $Thi,$Zhi,$Zhi
283	ldx [$Htbl+$nlo],$Thi
284	srlx $Zlo,4,$Zlo
285	xor $rem,$Zhi,$Zhi
286	ldx [$rem_4bit+$remi],$rem
287	sllx $Zhi,60,$tmp
288	xor $Tlo,$Zlo,$Zlo
289	ldub [$Xi+$cnt],$nlo
290	srlx $Zhi,4,$Zhi
291	xor $Zlo,$tmp,$Zlo
292	xor $Thi,$Zhi,$Zhi
293	and $Zlo,0xf,$remi
294
295	ldx [$Htblo+$nhi],$Tlo
296	sll $remi,3,$remi
297	xor $rem,$Zhi,$Zhi
298	ldx [$Htbl+$nhi],$Thi
299	srlx $Zlo,4,$Zlo
300	ldx [$rem_4bit+$remi],$rem
301	sllx $Zhi,60,$tmp
302	srlx $Zhi,4,$Zhi
303	and $nlo,0xf0,$nhi
304	addcc $cnt,-1,$cnt
305	xor $Zlo,$tmp,$Zlo
306	and $nlo,0x0f,$nlo
307	xor $Tlo,$Zlo,$Zlo
308	sll $nlo,4,$nlo
309	blu .Lgmult_inner
310	and $Zlo,0xf,$remi
311
312	ldx [$Htblo+$nlo],$Tlo
313	sll $remi,3,$remi
314	xor $Thi,$Zhi,$Zhi
315	ldx [$Htbl+$nlo],$Thi
316	srlx $Zlo,4,$Zlo
317	xor $rem,$Zhi,$Zhi
318	ldx [$rem_4bit+$remi],$rem
319	sllx $Zhi,60,$tmp
320	xor $Tlo,$Zlo,$Zlo
321	srlx $Zhi,4,$Zhi
322	xor $Zlo,$tmp,$Zlo
323	xor $Thi,$Zhi,$Zhi
324	and $Zlo,0xf,$remi
325
326	ldx [$Htblo+$nhi],$Tlo
327	sll $remi,3,$remi
328	xor $rem,$Zhi,$Zhi
329	ldx [$Htbl+$nhi],$Thi
330	srlx $Zlo,4,$Zlo
331	ldx [$rem_4bit+$remi],$rem
332	sllx $Zhi,60,$tmp
333	xor $Tlo,$Zlo,$Zlo
334	srlx $Zhi,4,$Zhi
335	xor $Zlo,$tmp,$Zlo
336	xor $Thi,$Zhi,$Zhi
337	stx $Zlo,[$Xi+8]
338	xor $rem,$Zhi,$Zhi
339	stx $Zhi,[$Xi]
340
341	ret
342	restore
343	.type gcm_gmult_4bit,#function
344	.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
345	___
346
347
348	{{{
349	# Straightforward 128x128-bit multiplication using Karatsuba algorithm
350	# followed by pair of 64-bit reductions [with a shortcut in first one,
351	# which allowed to break dependency between reductions and remove one
352	# multiplication from critical path]. While it might be suboptimal
353	# with regard to sheer number of multiplications, other methods [such
354	# as aggregate reduction] would require more 64-bit registers, which
355	# we don't have in 32-bit application context.
356
# Register allocation for the VIS3 routines. Incoming arguments are
# %i0..%i3; $inp and $len are assigned again here after having been
# cleared above.
357	($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
358
# The first seven names get %o0..%o5 and %o7 (%o6 is skipped; the
# generated code uses it as %sp), the last five get %g1..%g5.
359	($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
360	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
361
# Only the first two elements of the list are used: $shl is %l0 and
# $shr is %l1.
362	($shl,$shr)=map("%l$_",(0..7));
363
# The heredoc below emits three routines:
#   gcm_init_vis3  - loads H from [%i1], stores the "twisted" H to
#                    [%i0+0] and [%i0+8] and the 0xA040608020C0E000
#                    constant to [%i0+16];
#   gcm_gmult_vis3 - multiplies Xi by the twisted H and stores Xi back;
#   gcm_ghash_vis3 - does the same for each 16-byte block of input,
#                    subtracting 16 from $len per pass until it is zero;
#                    input that is not 8-byte aligned is realigned with
#                    the $shl/$shr shifts.
# The xmulx, xmulxhi and addxc instructions are meant to be rewritten
# into .word directives by unvis3 when the code is printed.
364	# For details regarding "twisted H" see ghash-x86.pl.
365	$code.=<<___;
366	.globl gcm_init_vis3
367	.align 32
368	gcm_init_vis3:
369	save %sp,-$frame,%sp
370
371	ldx [%i1+0],$Hhi
372	ldx [%i1+8],$Hlo
373	mov 0xE1,$Xhi
374	mov 1,$Xlo
375	sllx $Xhi,57,$Xhi
376	srax $Hhi,63,$C0 ! broadcast carry
377	addcc $Hlo,$Hlo,$Hlo ! H<<=1
378	addxc $Hhi,$Hhi,$Hhi
379	and $C0,$Xlo,$Xlo
380	and $C0,$Xhi,$Xhi
381	xor $Xlo,$Hlo,$Hlo
382	xor $Xhi,$Hhi,$Hhi
383	stx $Hlo,[%i0+8] ! save twisted H
384	stx $Hhi,[%i0+0]
385
386	sethi %hi(0xA0406080),$V
387	sethi %hi(0x20C0E000),%l0
388	or $V,%lo(0xA0406080),$V
389	or %l0,%lo(0x20C0E000),%l0
390	sllx $V,32,$V
391	or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
392	stx $V,[%i0+16]
393
394	ret
395	restore
396	.type gcm_init_vis3,#function
397	.size gcm_init_vis3,.-gcm_init_vis3
398
399	.globl gcm_gmult_vis3
400	.align 32
401	gcm_gmult_vis3:
402	save %sp,-$frame,%sp
403
404	ldx [$Xip+8],$Xlo ! load Xi
405	ldx [$Xip+0],$Xhi
406	ldx [$Htable+8],$Hlo ! load twisted H
407	ldx [$Htable+0],$Hhi
408
409	mov 0xE1,%l7
410	sllx %l7,57,$xE1 ! 57 is not a typo
411	ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
412
413	xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
414	xmulx $Xlo,$Hlo,$C0
415	xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
416	xmulx $C2,$Hhl,$C1
417	xmulxhi $Xlo,$Hlo,$Xlo
418	xmulxhi $C2,$Hhl,$C2
419	xmulxhi $Xhi,$Hhi,$C3
420	xmulx $Xhi,$Hhi,$Xhi
421
422	sll $C0,3,$sqr
423	srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
424	xor $C0,$sqr,$sqr
425	sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
426
427	xor $C0,$C1,$C1 ! Karatsuba post-processing
428	xor $Xlo,$C2,$C2
429	xor $sqr,$Xlo,$Xlo ! real destination is $C1
430	xor $C3,$C2,$C2
431	xor $Xlo,$C1,$C1
432	xor $Xhi,$C2,$C2
433	xor $Xhi,$C1,$C1
434
435	xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
436	xor $C0,$C2,$C2
437	xmulx $C1,$xE1,$C0
438	xor $C1,$C3,$C3
439	xmulxhi $C1,$xE1,$C1
440
441	xor $Xlo,$C2,$C2
442	xor $C0,$C2,$C2
443	xor $C1,$C3,$C3
444
445	stx $C2,[$Xip+8] ! save Xi
446	stx $C3,[$Xip+0]
447
448	ret
449	restore
450	.type gcm_gmult_vis3,#function
451	.size gcm_gmult_vis3,.-gcm_gmult_vis3
452
453	.globl gcm_ghash_vis3
454	.align 32
455	gcm_ghash_vis3:
456	save %sp,-$frame,%sp
457	nop
458	srln $len,0,$len ! needed on v8+, "nop" on v9
459
460	ldx [$Xip+8],$C2 ! load Xi
461	ldx [$Xip+0],$C3
462	ldx [$Htable+8],$Hlo ! load twisted H
463	ldx [$Htable+0],$Hhi
464
465	mov 0xE1,%l7
466	sllx %l7,57,$xE1 ! 57 is not a typo
467	ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
468
469	and $inp,7,$shl
470	andn $inp,7,$inp
471	sll $shl,3,$shl
472	prefetch [$inp+63], 20
473	sub %g0,$shl,$shr
474
475	xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
476	.Loop:
477	ldx [$inp+8],$Xlo
478	brz,pt $shl,1f
479	ldx [$inp+0],$Xhi
480
481	ldx [$inp+16],$C1 ! align data
482	srlx $Xlo,$shr,$C0
483	sllx $Xlo,$shl,$Xlo
484	sllx $Xhi,$shl,$Xhi
485	srlx $C1,$shr,$C1
486	or $C0,$Xhi,$Xhi
487	or $C1,$Xlo,$Xlo
488	1:
489	add $inp,16,$inp
490	sub $len,16,$len
491	xor $C2,$Xlo,$Xlo
492	xor $C3,$Xhi,$Xhi
493	prefetch [$inp+63], 20
494
495	xmulx $Xlo,$Hlo,$C0
496	xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
497	xmulx $C2,$Hhl,$C1
498	xmulxhi $Xlo,$Hlo,$Xlo
499	xmulxhi $C2,$Hhl,$C2
500	xmulxhi $Xhi,$Hhi,$C3
501	xmulx $Xhi,$Hhi,$Xhi
502
503	sll $C0,3,$sqr
504	srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
505	xor $C0,$sqr,$sqr
506	sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
507
508	xor $C0,$C1,$C1 ! Karatsuba post-processing
509	xor $Xlo,$C2,$C2
510	xor $sqr,$Xlo,$Xlo ! real destination is $C1
511	xor $C3,$C2,$C2
512	xor $Xlo,$C1,$C1
513	xor $Xhi,$C2,$C2
514	xor $Xhi,$C1,$C1
515
516	xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
517	xor $C0,$C2,$C2
518	xmulx $C1,$xE1,$C0
519	xor $C1,$C3,$C3
520	xmulxhi $C1,$xE1,$C1
521
522	xor $Xlo,$C2,$C2
523	xor $C0,$C2,$C2
524	brnz,pt $len,.Loop
525	xor $C1,$C3,$C3
526
527	stx $C2,[$Xip+8] ! save Xi
528	stx $C3,[$Xip+0]
529
530	ret
531	restore
532	.type gcm_ghash_vis3,#function
533	.size gcm_ghash_vis3,.-gcm_ghash_vis3
534	___
535	}}}
# Append an identification string to the generated assembly and restore
# 4-byte alignment after it. The "@" is escaped because the heredoc
# interpolates.
536	$code.=<<___;
537	.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
538	.align 4
539	___
540
541
542
# Purpose of this subroutine is to explicitly encode VIS3 instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep the option of producing a "universal" binary and
# let the programmer detect at run-time whether the current CPU is VIS
# capable.
#
# Arguments: the mnemonic followed by rs1, rs2 and rd, each written as a
# %g/%o/%l/%i register name.
#
# Returns ".word\t0x<encoding> !<original instruction>" when the
# mnemonic is one of addxc, addxccc, xmulx or xmulxhi and all three
# operands are recognised registers. In every other case the instruction
# is returned as plain text, "<mnemonic>\t<rs1>,<rs2>,<rd>", for the
# assembler to handle.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;

    # Number of the first register in each bank: %g0 is 0, %o0 is 8,
    # %l0 is 16 and %i0 is 24.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    # opf field values of the instructions that are encoded by hand.
    my %visopf = ( "addxc"   => 0x011,
                   "addxccc" => 0x013,
                   "xmulx"   => 0x115,
                   "xmulxhi" => 0x116 );

    # Textual form; it is the fallback result and is also appended as a
    # comment to the encoded word.
    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless ($opf);

    # Replace every register name by its number, in place. An operand
    # that is not a plain register leaves the instruction untouched.
    foreach ($rs1,$rs2,$rd) {
        return $ref if (!/%([goli])([0-9])/);
        $_=$bias{$1}+$2;
    }

    # rd goes to bit 25, rs1 to bit 14, opf to bit 5 and rs2 to bit 0.
    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|($rd<<25)|($rs1<<14)|($opf<<5)|$rs2,
                   $ref;
}
572
# Post-process the accumulated assembly one line at a time and print it.
foreach my $line (split("\n",$code)) {
    # Evaluate constant expressions enclosed in backticks, such as the
    # rem_4bit table entries.
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    # Replace VIS3 instructions by their hand-encoded form. The mnemonic
    # alternatives are xmulx/xmulxhi and addxc/addxccc, and whitespace
    # after the commas is optional because the generated code has none.
    $line =~ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/unvis3($1,$2,$3,$4)/ge;

    print $line,"\n";
}

# Buffered write errors only surface when the handle is closed, so a
# failure here means the output file is incomplete.
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/modes/asm/ghash-sparcv9.pl@ 69890

Download in other formats: