poly1305-sparcv9.pl@ 101021

Last change on this file since 101021 was 101021, checked in by vboxsync, 18 months ago
openssl-3.1.2: Applied and adjusted our OpenSSL changes to 3.1.0. bugref:10519
Property svn:executable set to ``*
File size: 23.9 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the Apache License 2.0 (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9	#
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	# ====================================================================
16	#
17	# This module implements Poly1305 hash for SPARCv9, vanilla, as well
18	# as VIS3 and FMA extensions.
19	#
20	# May, August 2015
21	#
22	# Numbers are cycles per processed byte with poly1305_blocks alone.
23	#
24	#			IALU(*)		FMA
25	#
26	# UltraSPARC III	12.3(**)
27	# SPARC T3		7.92
28	# SPARC T4		1.70(***)	6.55
29	# SPARC64 X		5.60		3.64
30	#
31	# (*) Comparison to compiler-generated code is really problematic,
32	# because latter's performance varies too much depending on too
33	# many variables. For example, one can measure from 5x to 15x
34	# improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
35	# unfair comparison, because compiler doesn't use VIS3, but
36	# given same initial conditions coefficient varies from 3x to 9x.
37	# (**) Pre-III performance should be even worse; floating-point
38	# performance for UltraSPARC I-IV on the other hand is reported
39	# to be 4.25 for hand-coded assembly, but they are just too old
40	# to care about.
41	# (***) Multi-process benchmark saturates at ~12.5x single-process
42	# result on 8-core processor, or ~21GBps per 2.85GHz socket.
43
# $output is the last argument if it looks like a file (it has an extension)
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# Redirect the generated assembly to that file; with no such argument the
# script keeps writing to the inherited STDOUT.  The three-argument open
# keeps the file name from being parsed as part of the mode, and a failed
# open is fatal instead of silently emitting to the wrong place (this
# matches the checked close of STDOUT at the end of the script).
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
48
# Symbolic names for the SPARC registers used by the integer code paths.
# Each variable holds a register name string ("%i0", "%l3", ...) that is
# interpolated into the assembly heredocs that follow.
#
# %i0-%i5: context pointer, input pointer, length, pad bit, and the
# left/right shift counts used to realign unaligned input.
49	my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));
# %l0-%l7: key words r0-r3, the derived values s1-s3 (computed in
# poly1305_blocks as r + (r >> 2)), and the top hash word h4.
50	my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));
# %o0-%o5 and %o7: hash words h0-h3 and temporaries t0-t2.  The list
# (0..5,7) omits %o6 -- presumably because that is the stack pointer.
51	my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));
# %g1-%g4: d0-d3, used for the loaded input words and then for the
# partial-product accumulators.
52	my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));
53
54	$code.=<<___;
55	#ifndef __ASSEMBLER__
56	# define __ASSEMBLER__ 1
57	#endif
58	#include "crypto/sparc_arch.h"
59
60	#ifdef __arch64__
61	.register %g2,#scratch
62	.register %g3,#scratch
63	# define STPTR stx
64	# define SIZE_T 8
65	#else
66	# define STPTR st
67	# define SIZE_T 4
68	#endif
69	#define LOCALS (STACK_BIAS+STACK_FRAME)
70
71	.section ".text",#alloc,#execinstr
72
73	#ifdef __PIC__
74	SPARC_PIC_THUNK(%g1)
75	#endif
76
77	.globl poly1305_init
78	.align 32
79	poly1305_init:
80	save %sp,-STACK_FRAME-16,%sp
81	nop
82
83	SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
84	ld [%g1],%g1
85
86	and %g1,SPARCV9_FMADD\|SPARCV9_VIS3,%g1
87	cmp %g1,SPARCV9_FMADD
88	be .Lpoly1305_init_fma
89	nop
90
91	stx %g0,[$ctx+0]
92	stx %g0,[$ctx+8] ! zero hash value
93	brz,pn $inp,.Lno_key
94	stx %g0,[$ctx+16]
95
96	and $inp,7,$shr ! alignment factor
97	andn $inp,7,$inp
98	sll $shr,3,$shr ! *8
99	neg $shr,$shl
100
101	sethi %hi(0x0ffffffc),$t0
102	set 8,$h1
103	or $t0,%lo(0x0ffffffc),$t0
104	set 16,$h2
105	sllx $t0,32,$t1
106	or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
107	or $t1,3,$t0 ! 0x0ffffffc0fffffff
108
109	ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
110	brz,pt $shr,.Lkey_aligned
111	ldxa [$inp+$h1]0x88,$h1
112
113	ldxa [$inp+$h2]0x88,$h2
114	srlx $h0,$shr,$h0
115	sllx $h1,$shl,$t2
116	srlx $h1,$shr,$h1
117	or $t2,$h0,$h0
118	sllx $h2,$shl,$h2
119	or $h2,$h1,$h1
120
121	.Lkey_aligned:
122	and $t0,$h0,$h0
123	and $t1,$h1,$h1
124	stx $h0,[$ctx+32+0] ! store key
125	stx $h1,[$ctx+32+8]
126
127	andcc %g1,SPARCV9_VIS3,%g0
128	be .Lno_key
129	nop
130
131	1: call .+8
132	add %o7,poly1305_blocks_vis3-1b,%o7
133
134	add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
135	STPTR %o7,[%i2]
136	STPTR %o5,[%i2+SIZE_T]
137
138	ret
139	restore %g0,1,%o0 ! return 1
140
141	.Lno_key:
142	ret
143	restore %g0,%g0,%o0 ! return 0
144	.type poly1305_init,#function
145	.size poly1305_init,.-poly1305_init
146
147	.globl poly1305_blocks
148	.align 32
149	poly1305_blocks:
150	save %sp,-STACK_FRAME,%sp
151	srln $len,4,$len
152
153	brz,pn $len,.Lno_data
154	nop
155
156	ld [$ctx+32+0],$r1 ! load key
157	ld [$ctx+32+4],$r0
158	ld [$ctx+32+8],$r3
159	ld [$ctx+32+12],$r2
160
161	ld [$ctx+0],$h1 ! load hash value
162	ld [$ctx+4],$h0
163	ld [$ctx+8],$h3
164	ld [$ctx+12],$h2
165	ld [$ctx+16],$h4
166
167	and $inp,7,$shr ! alignment factor
168	andn $inp,7,$inp
169	set 8,$d1
170	sll $shr,3,$shr ! *8
171	set 16,$d2
172	neg $shr,$shl
173
174	srl $r1,2,$s1
175	srl $r2,2,$s2
176	add $r1,$s1,$s1
177	srl $r3,2,$s3
178	add $r2,$s2,$s2
179	add $r3,$s3,$s3
180
181	.Loop:
182	ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
183	brz,pt $shr,.Linp_aligned
184	ldxa [$inp+$d1]0x88,$d1
185
186	ldxa [$inp+$d2]0x88,$d2
187	srlx $d0,$shr,$d0
188	sllx $d1,$shl,$t1
189	srlx $d1,$shr,$d1
190	or $t1,$d0,$d0
191	sllx $d2,$shl,$d2
192	or $d2,$d1,$d1
193
194	.Linp_aligned:
195	srlx $d0,32,$t0
196	addcc $d0,$h0,$h0 ! accumulate input
197	srlx $d1,32,$t1
198	addccc $t0,$h1,$h1
199	addccc $d1,$h2,$h2
200	addccc $t1,$h3,$h3
201	addc $padbit,$h4,$h4
202
203	umul $r0,$h0,$d0
204	umul $r1,$h0,$d1
205	umul $r2,$h0,$d2
206	umul $r3,$h0,$d3
207	sub $len,1,$len
208	add $inp,16,$inp
209
210	umul $s3,$h1,$t0
211	umul $r0,$h1,$t1
212	umul $r1,$h1,$t2
213	add $t0,$d0,$d0
214	add $t1,$d1,$d1
215	umul $r2,$h1,$t0
216	add $t2,$d2,$d2
217	add $t0,$d3,$d3
218
219	umul $s2,$h2,$t1
220	umul $s3,$h2,$t2
221	umul $r0,$h2,$t0
222	add $t1,$d0,$d0
223	add $t2,$d1,$d1
224	umul $r1,$h2,$t1
225	add $t0,$d2,$d2
226	add $t1,$d3,$d3
227
228	umul $s1,$h3,$t2
229	umul $s2,$h3,$t0
230	umul $s3,$h3,$t1
231	add $t2,$d0,$d0
232	add $t0,$d1,$d1
233	umul $r0,$h3,$t2
234	add $t1,$d2,$d2
235	add $t2,$d3,$d3
236
237	umul $s1,$h4,$t0
238	umul $s2,$h4,$t1
239	umul $s3,$h4,$t2
240	umul $r0,$h4,$h4
241	add $t0,$d1,$d1
242	add $t1,$d2,$d2
243	srlx $d0,32,$h1
244	add $t2,$d3,$d3
245	srlx $d1,32,$h2
246
247	addcc $d1,$h1,$h1
248	srlx $d2,32,$h3
249	set 8,$d1
250	addccc $d2,$h2,$h2
251	srlx $d3,32,$t0
252	set 16,$d2
253	addccc $d3,$h3,$h3
254	addc $t0,$h4,$h4
255
256	srl $h4,2,$t0 ! final reduction step
257	andn $h4,3,$t1
258	and $h4,3,$h4
259	add $t1,$t0,$t0
260
261	addcc $t0,$d0,$h0
262	addccc %g0,$h1,$h1
263	addccc %g0,$h2,$h2
264	addccc %g0,$h3,$h3
265	brnz,pt $len,.Loop
266	addc %g0,$h4,$h4
267
268	st $h1,[$ctx+0] ! store hash value
269	st $h0,[$ctx+4]
270	st $h3,[$ctx+8]
271	st $h2,[$ctx+12]
272	st $h4,[$ctx+16]
273
274	.Lno_data:
275	ret
276	restore
277	.type poly1305_blocks,#function
278	.size poly1305_blocks,.-poly1305_blocks
279	___
280	########################################################################
281	# VIS3 has umulxhi and addxc...
282	{
# Register names for poly1305_blocks_vis3, scoped to this block.
# %o0-%o5 and %o7: hash limbs H0-H2, key limbs R0/R1, S1 (computed in the
# code below as R1 + (R1 >> 2)), and temporary T1.  %o6 is omitted from
# the list.
283	my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
# %g1-%g4: D0-D2 hold the loaded input and then the product accumulators;
# T0 is a second temporary.
284	my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
285
286	$code.=<<___;
287	.align 32
288	poly1305_blocks_vis3:
289	save %sp,-STACK_FRAME,%sp
290	srln $len,4,$len
291
292	brz,pn $len,.Lno_data
293	nop
294
295	ldx [$ctx+32+0],$R0 ! load key
296	ldx [$ctx+32+8],$R1
297
298	ldx [$ctx+0],$H0 ! load hash value
299	ldx [$ctx+8],$H1
300	ld [$ctx+16],$H2
301
302	and $inp,7,$shr ! alignment factor
303	andn $inp,7,$inp
304	set 8,$r1
305	sll $shr,3,$shr ! *8
306	set 16,$r2
307	neg $shr,$shl
308
309	srlx $R1,2,$S1
310	b .Loop_vis3
311	add $R1,$S1,$S1
312
313	.Loop_vis3:
314	ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
315	brz,pt $shr,.Linp_aligned_vis3
316	ldxa [$inp+$r1]0x88,$D1
317
318	ldxa [$inp+$r2]0x88,$D2
319	srlx $D0,$shr,$D0
320	sllx $D1,$shl,$T1
321	srlx $D1,$shr,$D1
322	or $T1,$D0,$D0
323	sllx $D2,$shl,$D2
324	or $D2,$D1,$D1
325
326	.Linp_aligned_vis3:
327	addcc $D0,$H0,$H0 ! accumulate input
328	sub $len,1,$len
329	addxccc $D1,$H1,$H1
330	add $inp,16,$inp
331
332	mulx $R0,$H0,$D0 ! r0*h0
333	addxc $padbit,$H2,$H2
334	umulxhi $R0,$H0,$D1
335	mulx $S1,$H1,$T0 ! s1*h1
336	umulxhi $S1,$H1,$T1
337	addcc $T0,$D0,$D0
338	mulx $R1,$H0,$T0 ! r1*h0
339	addxc $T1,$D1,$D1
340	umulxhi $R1,$H0,$D2
341	addcc $T0,$D1,$D1
342	mulx $R0,$H1,$T0 ! r0*h1
343	addxc %g0,$D2,$D2
344	umulxhi $R0,$H1,$T1
345	addcc $T0,$D1,$D1
346	mulx $S1,$H2,$T0 ! s1*h2
347	addxc $T1,$D2,$D2
348	mulx $R0,$H2,$T1 ! r0*h2
349	addcc $T0,$D1,$D1
350	addxc $T1,$D2,$D2
351
352	srlx $D2,2,$T0 ! final reduction step
353	andn $D2,3,$T1
354	and $D2,3,$H2
355	add $T1,$T0,$T0
356
357	addcc $T0,$D0,$H0
358	addxccc %g0,$D1,$H1
359	brnz,pt $len,.Loop_vis3
360	addxc %g0,$H2,$H2
361
362	stx $H0,[$ctx+0] ! store hash value
363	stx $H1,[$ctx+8]
364	st $H2,[$ctx+16]
365
366	ret
367	restore
368	.type poly1305_blocks_vis3,#function
369	.size poly1305_blocks_vis3,.-poly1305_blocks_vis3
370	___
371	}
# poly1305_emit reads its second and third arguments from the same
# registers that the block functions use for $inp and $len; alias them
# as $mac (where the result bytes are stored) and $nonce (loaded and
# added to the hash) for the heredoc that follows.
372	my ($mac,$nonce) = ($inp,$len);
373
374	$code.=<<___;
375	.globl poly1305_emit
376	.align 32
377	poly1305_emit:
378	save %sp,-STACK_FRAME,%sp
379
380	ld [$ctx+0],$h1 ! load hash value
381	ld [$ctx+4],$h0
382	ld [$ctx+8],$h3
383	ld [$ctx+12],$h2
384	ld [$ctx+16],$h4
385
386	addcc $h0,5,$r0 ! compare to modulus
387	addccc $h1,0,$r1
388	addccc $h2,0,$r2
389	addccc $h3,0,$r3
390	addc $h4,0,$h4
391	andcc $h4,4,%g0 ! did it carry/borrow?
392
393	movnz %icc,$r0,$h0
394	ld [$nonce+0],$r0 ! load nonce
395	movnz %icc,$r1,$h1
396	ld [$nonce+4],$r1
397	movnz %icc,$r2,$h2
398	ld [$nonce+8],$r2
399	movnz %icc,$r3,$h3
400	ld [$nonce+12],$r3
401
402	addcc $r0,$h0,$h0 ! accumulate nonce
403	addccc $r1,$h1,$h1
404	addccc $r2,$h2,$h2
405	addc $r3,$h3,$h3
406
407	srl $h0,8,$r0
408	stb $h0,[$mac+0] ! store little-endian result
409	srl $h0,16,$r1
410	stb $r0,[$mac+1]
411	srl $h0,24,$r2
412	stb $r1,[$mac+2]
413	stb $r2,[$mac+3]
414
415	srl $h1,8,$r0
416	stb $h1,[$mac+4]
417	srl $h1,16,$r1
418	stb $r0,[$mac+5]
419	srl $h1,24,$r2
420	stb $r1,[$mac+6]
421	stb $r2,[$mac+7]
422
423	srl $h2,8,$r0
424	stb $h2,[$mac+8]
425	srl $h2,16,$r1
426	stb $r0,[$mac+9]
427	srl $h2,24,$r2
428	stb $r1,[$mac+10]
429	stb $r2,[$mac+11]
430
431	srl $h3,8,$r0
432	stb $h3,[$mac+12]
433	srl $h3,16,$r1
434	stb $r0,[$mac+13]
435	srl $h3,24,$r2
436	stb $r1,[$mac+14]
437	stb $r2,[$mac+15]
438
439	ret
440	restore
441	.type poly1305_emit,#function
442	.size poly1305_emit,.-poly1305_emit
443	___
444
445	{
# Register names for the FMA code path, scoped to this block; they shadow
# the outer integer-path names.
# %i0-%i3: context pointer, input pointer, length, pad bit.
446	my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
# %o0-%o4: raw 64-bit input words and their shifted/split pieces.
447	my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
# Only the first four of the eight generated names (%l0-%l3) are bound;
# the remaining ones are discarded by the list assignment.
448	my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
# $i2 is another name for the register held in $step.
449	my $i2=$step;
450
# The 32 even-numbered floating-point registers %f0, %f2, ..., %f62:
# hash limbs (lo/hi halves), the 2^n constants, key limbs r0-r2,
# s2/s3, and the carry registers c0-c3.
451	my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
452	$two0,$two32,$two64,$two96,$two130,$five_two130,
453	$r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
454	$s2lo,$s2hi,$s3lo,$s3hi,
455	$c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
456	# borrowings
# The names below reuse the c0-c3 registers rather than getting registers
# of their own, so r3/s1, x0-x3 and y0-y3 overlap with the carries (and
# with each other).  Note that y2/y3 map to c3hi/c3lo, in that order.
457	my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
458	my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
459	my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
460
461	$code.=<<___;
462	.align 32
463	poly1305_init_fma:
464	save %sp,-STACK_FRAME-16,%sp
465	nop
466
467	.Lpoly1305_init_fma:
468	1: call .+8
469	add %o7,.Lconsts_fma-1b,%o7
470
471	ldd [%o7+8*0],$two0 ! load constants
472	ldd [%o7+8*1],$two32
473	ldd [%o7+8*2],$two64
474	ldd [%o7+8*3],$two96
475	ldd [%o7+8*5],$five_two130
476
477	std $two0,[$ctx+8*0] ! initial hash value, biased 0
478	std $two32,[$ctx+8*1]
479	std $two64,[$ctx+8*2]
480	std $two96,[$ctx+8*3]
481
482	brz,pn $inp,.Lno_key_fma
483	nop
484
485	stx %fsr,[%sp+LOCALS] ! save original %fsr
486	ldx [%o7+8*6],%fsr ! load new %fsr
487
488	std $two0,[$ctx+8*4] ! key "template"
489	std $two32,[$ctx+8*5]
490	std $two64,[$ctx+8*6]
491	std $two96,[$ctx+8*7]
492
493	and $inp,7,$shr
494	andn $inp,7,$inp ! align pointer
495	mov 8,$i1
496	sll $shr,3,$shr
497	mov 16,$i2
498	neg $shr,$shl
499
500	ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
501	ldxa [$inp+$i1]0x88,$in2
502
503	brz $shr,.Lkey_aligned_fma
504	sethi %hi(0xf0000000),$i1 ! 0xf0000000
505
506	ldxa [$inp+$i2]0x88,$in4
507
508	srlx $in0,$shr,$in0 ! align data
509	sllx $in2,$shl,$in1
510	srlx $in2,$shr,$in2
511	or $in1,$in0,$in0
512	sllx $in4,$shl,$in3
513	or $in3,$in2,$in2
514
515	.Lkey_aligned_fma:
516	or $i1,3,$i2 ! 0xf0000003
517	srlx $in0,32,$in1
518	andn $in0,$i1,$in0 ! &=0x0fffffff
519	andn $in1,$i2,$in1 ! &=0x0ffffffc
520	srlx $in2,32,$in3
521	andn $in2,$i2,$in2
522	andn $in3,$i2,$in3
523
524	st $in0,[$ctx+`8*4+4`] ! fill "template"
525	st $in1,[$ctx+`8*5+4`]
526	st $in2,[$ctx+`8*6+4`]
527	st $in3,[$ctx+`8*7+4`]
528
529	ldd [$ctx+8*4],$h0lo ! load [biased] key
530	ldd [$ctx+8*5],$h1lo
531	ldd [$ctx+8*6],$h2lo
532	ldd [$ctx+8*7],$h3lo
533
534	fsubd $h0lo,$two0, $h0lo ! r0
535	ldd [%o7+8*7],$two0 ! more constants
536	fsubd $h1lo,$two32,$h1lo ! r1
537	ldd [%o7+8*8],$two32
538	fsubd $h2lo,$two64,$h2lo ! r2
539	ldd [%o7+8*9],$two64
540	fsubd $h3lo,$two96,$h3lo ! r3
541	ldd [%o7+8*10],$two96
542
543	fmuld $five_two130,$h1lo,$s1lo ! s1
544	fmuld $five_two130,$h2lo,$s2lo ! s2
545	fmuld $five_two130,$h3lo,$s3lo ! s3
546
547	faddd $h0lo,$two0, $h0hi
548	faddd $h1lo,$two32,$h1hi
549	faddd $h2lo,$two64,$h2hi
550	faddd $h3lo,$two96,$h3hi
551
552	fsubd $h0hi,$two0, $h0hi
553	ldd [%o7+8*11],$two0 ! more constants
554	fsubd $h1hi,$two32,$h1hi
555	ldd [%o7+8*12],$two32
556	fsubd $h2hi,$two64,$h2hi
557	ldd [%o7+8*13],$two64
558	fsubd $h3hi,$two96,$h3hi
559
560	fsubd $h0lo,$h0hi,$h0lo
561	std $h0hi,[$ctx+8*5] ! r0hi
562	fsubd $h1lo,$h1hi,$h1lo
563	std $h1hi,[$ctx+8*7] ! r1hi
564	fsubd $h2lo,$h2hi,$h2lo
565	std $h2hi,[$ctx+8*9] ! r2hi
566	fsubd $h3lo,$h3hi,$h3lo
567	std $h3hi,[$ctx+8*11] ! r3hi
568
569	faddd $s1lo,$two0, $s1hi
570	faddd $s2lo,$two32,$s2hi
571	faddd $s3lo,$two64,$s3hi
572
573	fsubd $s1hi,$two0, $s1hi
574	fsubd $s2hi,$two32,$s2hi
575	fsubd $s3hi,$two64,$s3hi
576
577	fsubd $s1lo,$s1hi,$s1lo
578	fsubd $s2lo,$s2hi,$s2lo
579	fsubd $s3lo,$s3hi,$s3lo
580
581	ldx [%sp+LOCALS],%fsr ! restore %fsr
582
583	std $h0lo,[$ctx+8*4] ! r0lo
584	std $h1lo,[$ctx+8*6] ! r1lo
585	std $h2lo,[$ctx+8*8] ! r2lo
586	std $h3lo,[$ctx+8*10] ! r3lo
587
588	std $s1hi,[$ctx+8*13]
589	std $s2hi,[$ctx+8*15]
590	std $s3hi,[$ctx+8*17]
591
592	std $s1lo,[$ctx+8*12]
593	std $s2lo,[$ctx+8*14]
594	std $s3lo,[$ctx+8*16]
595
596	add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
597	add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
598	STPTR %o0,[%i2]
599	STPTR %o1,[%i2+SIZE_T]
600
601	ret
602	restore %g0,1,%o0 ! return 1
603
604	.Lno_key_fma:
605	ret
606	restore %g0,%g0,%o0 ! return 0
607	.type poly1305_init_fma,#function
608	.size poly1305_init_fma,.-poly1305_init_fma
609
610	.align 32
611	poly1305_blocks_fma:
612	save %sp,-STACK_FRAME-48,%sp
613	srln $len,4,$len
614
615	brz,pn $len,.Labort
616	sub $len,1,$len
617
618	1: call .+8
619	add %o7,.Lconsts_fma-1b,%o7
620
621	ldd [%o7+8*0],$two0 ! load constants
622	ldd [%o7+8*1],$two32
623	ldd [%o7+8*2],$two64
624	ldd [%o7+8*3],$two96
625	ldd [%o7+8*4],$two130
626	ldd [%o7+8*5],$five_two130
627
628	ldd [$ctx+8*0],$h0lo ! load [biased] hash value
629	ldd [$ctx+8*1],$h1lo
630	ldd [$ctx+8*2],$h2lo
631	ldd [$ctx+8*3],$h3lo
632
633	std $two0,[%sp+LOCALS+8*0] ! input "template"
634	sethi %hi((1023+52+96)<<20),$in3
635	std $two32,[%sp+LOCALS+8*1]
636	or $padbit,$in3,$in3
637	std $two64,[%sp+LOCALS+8*2]
638	st $in3,[%sp+LOCALS+8*3]
639
640	and $inp,7,$shr
641	andn $inp,7,$inp ! align pointer
642	mov 8,$i1
643	sll $shr,3,$shr
644	mov 16,$step
645	neg $shr,$shl
646
647	ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
648	brz $shr,.Linp_aligned_fma
649	ldxa [$inp+$i1]0x88,$in2
650
651	ldxa [$inp+$step]0x88,$in4
652	add $inp,8,$inp
653
654	srlx $in0,$shr,$in0 ! align data
655	sllx $in2,$shl,$in1
656	srlx $in2,$shr,$in2
657	or $in1,$in0,$in0
658	sllx $in4,$shl,$in3
659	srlx $in4,$shr,$in4 ! pre-shift
660	or $in3,$in2,$in2
661
662	.Linp_aligned_fma:
663	srlx $in0,32,$in1
664	movrz $len,0,$step
665	srlx $in2,32,$in3
666	add $step,$inp,$inp ! conditional advance
667
668	st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
669	st $in1,[%sp+LOCALS+8*1+4]
670	st $in2,[%sp+LOCALS+8*2+4]
671	st $in3,[%sp+LOCALS+8*3+4]
672
673	ldd [$ctx+8*4],$r0lo ! load key
674	ldd [$ctx+8*5],$r0hi
675	ldd [$ctx+8*6],$r1lo
676	ldd [$ctx+8*7],$r1hi
677	ldd [$ctx+8*8],$r2lo
678	ldd [$ctx+8*9],$r2hi
679	ldd [$ctx+8*10],$r3lo
680	ldd [$ctx+8*11],$r3hi
681	ldd [$ctx+8*12],$s1lo
682	ldd [$ctx+8*13],$s1hi
683	ldd [$ctx+8*14],$s2lo
684	ldd [$ctx+8*15],$s2hi
685	ldd [$ctx+8*16],$s3lo
686	ldd [$ctx+8*17],$s3hi
687
688	stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
689	ldx [%o7+8*6],%fsr ! load new %fsr
690
691	subcc $len,1,$len
692	movrz $len,0,$step
693
694	ldd [%sp+LOCALS+8*0],$x0 ! load biased input
695	ldd [%sp+LOCALS+8*1],$x1
696	ldd [%sp+LOCALS+8*2],$x2
697	ldd [%sp+LOCALS+8*3],$x3
698
699	fsubd $h0lo,$two0, $h0lo ! de-bias hash value
700	fsubd $h1lo,$two32,$h1lo
701	ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
702	fsubd $h2lo,$two64,$h2lo
703	fsubd $h3lo,$two96,$h3lo
704	ldxa [$inp+$i1]0x88,$in2
705
706	fsubd $x0,$two0, $x0 ! de-bias input
707	fsubd $x1,$two32,$x1
708	fsubd $x2,$two64,$x2
709	fsubd $x3,$two96,$x3
710
711	brz $shr,.Linp_aligned_fma2
712	add $step,$inp,$inp ! conditional advance
713
714	sllx $in0,$shl,$in1 ! align data
715	srlx $in0,$shr,$in3
716	or $in1,$in4,$in0
717	sllx $in2,$shl,$in1
718	srlx $in2,$shr,$in4 ! pre-shift
719	or $in3,$in1,$in2
720	.Linp_aligned_fma2:
721	srlx $in0,32,$in1
722	srlx $in2,32,$in3
723
724	faddd $h0lo,$x0,$x0 ! accumulate input
725	stw $in0,[%sp+LOCALS+8*0+4]
726	faddd $h1lo,$x1,$x1
727	stw $in1,[%sp+LOCALS+8*1+4]
728	faddd $h2lo,$x2,$x2
729	stw $in2,[%sp+LOCALS+8*2+4]
730	faddd $h3lo,$x3,$x3
731	stw $in3,[%sp+LOCALS+8*3+4]
732
733	b .Lentry_fma
734	nop
735
736	.align 16
737	.Loop_fma:
738	ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
739	ldxa [$inp+$i1]0x88,$in2
740	movrz $len,0,$step
741
742	faddd $y0,$h0lo,$h0lo ! accumulate input
743	faddd $y1,$h0hi,$h0hi
744	faddd $y2,$h2lo,$h2lo
745	faddd $y3,$h2hi,$h2hi
746
747	brz,pn $shr,.Linp_aligned_fma3
748	add $step,$inp,$inp ! conditional advance
749
750	sllx $in0,$shl,$in1 ! align data
751	srlx $in0,$shr,$in3
752	or $in1,$in4,$in0
753	sllx $in2,$shl,$in1
754	srlx $in2,$shr,$in4 ! pre-shift
755	or $in3,$in1,$in2
756
757	.Linp_aligned_fma3:
758	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
759	faddd $two64,$h1lo,$c1lo
760	srlx $in0,32,$in1
761	faddd $two64,$h1hi,$c1hi
762	srlx $in2,32,$in3
763	faddd $two130,$h3lo,$c3lo
764	st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
765	faddd $two130,$h3hi,$c3hi
766	st $in1,[%sp+LOCALS+8*1+4]
767	faddd $two32,$h0lo,$c0lo
768	st $in2,[%sp+LOCALS+8*2+4]
769	faddd $two32,$h0hi,$c0hi
770	st $in3,[%sp+LOCALS+8*3+4]
771	faddd $two96,$h2lo,$c2lo
772	faddd $two96,$h2hi,$c2hi
773
774	fsubd $c1lo,$two64,$c1lo
775	fsubd $c1hi,$two64,$c1hi
776	fsubd $c3lo,$two130,$c3lo
777	fsubd $c3hi,$two130,$c3hi
778	fsubd $c0lo,$two32,$c0lo
779	fsubd $c0hi,$two32,$c0hi
780	fsubd $c2lo,$two96,$c2lo
781	fsubd $c2hi,$two96,$c2hi
782
783	fsubd $h1lo,$c1lo,$h1lo
784	fsubd $h1hi,$c1hi,$h1hi
785	fsubd $h3lo,$c3lo,$h3lo
786	fsubd $h3hi,$c3hi,$h3hi
787	fsubd $h2lo,$c2lo,$h2lo
788	fsubd $h2hi,$c2hi,$h2hi
789	fsubd $h0lo,$c0lo,$h0lo
790	fsubd $h0hi,$c0hi,$h0hi
791
792	faddd $h1lo,$c0lo,$h1lo
793	faddd $h1hi,$c0hi,$h1hi
794	faddd $h3lo,$c2lo,$h3lo
795	faddd $h3hi,$c2hi,$h3hi
796	faddd $h2lo,$c1lo,$h2lo
797	faddd $h2hi,$c1hi,$h2hi
798	fmaddd $five_two130,$c3lo,$h0lo,$h0lo
799	fmaddd $five_two130,$c3hi,$h0hi,$h0hi
800
801	faddd $h1lo,$h1hi,$x1
802	ldd [$ctx+8*12],$s1lo ! reload constants
803	faddd $h3lo,$h3hi,$x3
804	ldd [$ctx+8*13],$s1hi
805	faddd $h2lo,$h2hi,$x2
806	ldd [$ctx+8*10],$r3lo
807	faddd $h0lo,$h0hi,$x0
808	ldd [$ctx+8*11],$r3hi
809
810	.Lentry_fma:
811	fmuld $x1,$s3lo,$h0lo
812	fmuld $x1,$s3hi,$h0hi
813	fmuld $x1,$r1lo,$h2lo
814	fmuld $x1,$r1hi,$h2hi
815	fmuld $x1,$r0lo,$h1lo
816	fmuld $x1,$r0hi,$h1hi
817	fmuld $x1,$r2lo,$h3lo
818	fmuld $x1,$r2hi,$h3hi
819
820	fmaddd $x3,$s1lo,$h0lo,$h0lo
821	fmaddd $x3,$s1hi,$h0hi,$h0hi
822	fmaddd $x3,$s3lo,$h2lo,$h2lo
823	fmaddd $x3,$s3hi,$h2hi,$h2hi
824	fmaddd $x3,$s2lo,$h1lo,$h1lo
825	fmaddd $x3,$s2hi,$h1hi,$h1hi
826	fmaddd $x3,$r0lo,$h3lo,$h3lo
827	fmaddd $x3,$r0hi,$h3hi,$h3hi
828
829	fmaddd $x2,$s2lo,$h0lo,$h0lo
830	fmaddd $x2,$s2hi,$h0hi,$h0hi
831	fmaddd $x2,$r0lo,$h2lo,$h2lo
832	fmaddd $x2,$r0hi,$h2hi,$h2hi
833	fmaddd $x2,$s3lo,$h1lo,$h1lo
834	ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
835	fmaddd $x2,$s3hi,$h1hi,$h1hi
836	ldd [%sp+LOCALS+8*1],$y1
837	fmaddd $x2,$r1lo,$h3lo,$h3lo
838	ldd [%sp+LOCALS+8*2],$y2
839	fmaddd $x2,$r1hi,$h3hi,$h3hi
840	ldd [%sp+LOCALS+8*3],$y3
841
842	fmaddd $x0,$r0lo,$h0lo,$h0lo
843	fsubd $y0,$two0, $y0 ! de-bias input
844	fmaddd $x0,$r0hi,$h0hi,$h0hi
845	fsubd $y1,$two32,$y1
846	fmaddd $x0,$r2lo,$h2lo,$h2lo
847	fsubd $y2,$two64,$y2
848	fmaddd $x0,$r2hi,$h2hi,$h2hi
849	fsubd $y3,$two96,$y3
850	fmaddd $x0,$r1lo,$h1lo,$h1lo
851	fmaddd $x0,$r1hi,$h1hi,$h1hi
852	fmaddd $x0,$r3lo,$h3lo,$h3lo
853	fmaddd $x0,$r3hi,$h3hi,$h3hi
854
855	bcc SIZE_T_CC,.Loop_fma
856	subcc $len,1,$len
857
858	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
859	faddd $h0lo,$two32,$c0lo
860	faddd $h0hi,$two32,$c0hi
861	faddd $h2lo,$two96,$c2lo
862	faddd $h2hi,$two96,$c2hi
863	faddd $h1lo,$two64,$c1lo
864	faddd $h1hi,$two64,$c1hi
865	faddd $h3lo,$two130,$c3lo
866	faddd $h3hi,$two130,$c3hi
867
868	fsubd $c0lo,$two32,$c0lo
869	fsubd $c0hi,$two32,$c0hi
870	fsubd $c2lo,$two96,$c2lo
871	fsubd $c2hi,$two96,$c2hi
872	fsubd $c1lo,$two64,$c1lo
873	fsubd $c1hi,$two64,$c1hi
874	fsubd $c3lo,$two130,$c3lo
875	fsubd $c3hi,$two130,$c3hi
876
877	fsubd $h1lo,$c1lo,$h1lo
878	fsubd $h1hi,$c1hi,$h1hi
879	fsubd $h3lo,$c3lo,$h3lo
880	fsubd $h3hi,$c3hi,$h3hi
881	fsubd $h2lo,$c2lo,$h2lo
882	fsubd $h2hi,$c2hi,$h2hi
883	fsubd $h0lo,$c0lo,$h0lo
884	fsubd $h0hi,$c0hi,$h0hi
885
886	faddd $h1lo,$c0lo,$h1lo
887	faddd $h1hi,$c0hi,$h1hi
888	faddd $h3lo,$c2lo,$h3lo
889	faddd $h3hi,$c2hi,$h3hi
890	faddd $h2lo,$c1lo,$h2lo
891	faddd $h2hi,$c1hi,$h2hi
892	fmaddd $five_two130,$c3lo,$h0lo,$h0lo
893	fmaddd $five_two130,$c3hi,$h0hi,$h0hi
894
895	faddd $h1lo,$h1hi,$x1
896	faddd $h3lo,$h3hi,$x3
897	faddd $h2lo,$h2hi,$x2
898	faddd $h0lo,$h0hi,$x0
899
900	faddd $x1,$two32,$x1 ! bias
901	faddd $x3,$two96,$x3
902	faddd $x2,$two64,$x2
903	faddd $x0,$two0, $x0
904
905	ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
906
907	std $x1,[$ctx+8*1] ! store [biased] hash value
908	std $x3,[$ctx+8*3]
909	std $x2,[$ctx+8*2]
910	std $x0,[$ctx+8*0]
911
912	.Labort:
913	ret
914	restore
915	.type poly1305_blocks_fma,#function
916	.size poly1305_blocks_fma,.-poly1305_blocks_fma
917	___
918	{
# Register names for poly1305_emit_fma, scoped to this inner block.
# $mac and $nonce are aliases for the registers named $inp and $len.
919	my ($mac,$nonce)=($inp,$len);
920
# The right-hand side yields eleven names (%l0-%l5 followed by %o0-%o4)
# for ten variables: h0-h4 and d0 take %l0-%l5, d1-d3 take %o0-%o2,
# mask takes %o3, and the trailing %o4 is discarded.
921	my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
922	) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
923
924	$code.=<<___;
925	.align 32
926	poly1305_emit_fma:
927	save %sp,-STACK_FRAME,%sp
928
929	ld [$ctx+8*0+0],$d0 ! load hash
930	ld [$ctx+8*0+4],$h0
931	ld [$ctx+8*1+0],$d1
932	ld [$ctx+8*1+4],$h1
933	ld [$ctx+8*2+0],$d2
934	ld [$ctx+8*2+4],$h2
935	ld [$ctx+8*3+0],$d3
936	ld [$ctx+8*3+4],$h3
937
938	sethi %hi(0xfff00000),$mask
939	andn $d0,$mask,$d0 ! mask exponent
940	andn $d1,$mask,$d1
941	andn $d2,$mask,$d2
942	andn $d3,$mask,$d3 ! can be partially reduced...
943	mov 3,$mask
944
945	srl $d3,2,$padbit ! ... so reduce
946	and $d3,$mask,$h4
947	andn $d3,$mask,$d3
948	add $padbit,$d3,$d3
949
950	addcc $d3,$h0,$h0
951	addccc $d0,$h1,$h1
952	addccc $d1,$h2,$h2
953	addccc $d2,$h3,$h3
954	addc %g0,$h4,$h4
955
956	addcc $h0,5,$d0 ! compare to modulus
957	addccc $h1,0,$d1
958	addccc $h2,0,$d2
959	addccc $h3,0,$d3
960	addc $h4,0,$mask
961
962	srl $mask,2,$mask ! did it carry/borrow?
963	neg $mask,$mask
964	sra $mask,31,$mask ! mask
965
966	andn $h0,$mask,$h0
967	and $d0,$mask,$d0
968	andn $h1,$mask,$h1
969	and $d1,$mask,$d1
970	or $d0,$h0,$h0
971	ld [$nonce+0],$d0 ! load nonce
972	andn $h2,$mask,$h2
973	and $d2,$mask,$d2
974	or $d1,$h1,$h1
975	ld [$nonce+4],$d1
976	andn $h3,$mask,$h3
977	and $d3,$mask,$d3
978	or $d2,$h2,$h2
979	ld [$nonce+8],$d2
980	or $d3,$h3,$h3
981	ld [$nonce+12],$d3
982
983	addcc $d0,$h0,$h0 ! accumulate nonce
984	addccc $d1,$h1,$h1
985	addccc $d2,$h2,$h2
986	addc $d3,$h3,$h3
987
988	stb $h0,[$mac+0] ! write little-endian result
989	srl $h0,8,$h0
990	stb $h1,[$mac+4]
991	srl $h1,8,$h1
992	stb $h2,[$mac+8]
993	srl $h2,8,$h2
994	stb $h3,[$mac+12]
995	srl $h3,8,$h3
996
997	stb $h0,[$mac+1]
998	srl $h0,8,$h0
999	stb $h1,[$mac+5]
1000	srl $h1,8,$h1
1001	stb $h2,[$mac+9]
1002	srl $h2,8,$h2
1003	stb $h3,[$mac+13]
1004	srl $h3,8,$h3
1005
1006	stb $h0,[$mac+2]
1007	srl $h0,8,$h0
1008	stb $h1,[$mac+6]
1009	srl $h1,8,$h1
1010	stb $h2,[$mac+10]
1011	srl $h2,8,$h2
1012	stb $h3,[$mac+14]
1013	srl $h3,8,$h3
1014
1015	stb $h0,[$mac+3]
1016	stb $h1,[$mac+7]
1017	stb $h2,[$mac+11]
1018	stb $h3,[$mac+15]
1019
1020	ret
1021	restore
1022	.type poly1305_emit_fma,#function
1023	.size poly1305_emit_fma,.-poly1305_emit_fma
1024	___
1025	}
1026
1027	$code.=<<___;
1028	.align 64
1029	.Lconsts_fma:
1030	.word 0x43300000,0x00000000 ! 2^(52+0)
1031	.word 0x45300000,0x00000000 ! 2^(52+32)
1032	.word 0x47300000,0x00000000 ! 2^(52+64)
1033	.word 0x49300000,0x00000000 ! 2^(52+96)
1034	.word 0x4b500000,0x00000000 ! 2^(52+130)
1035
1036	.word 0x37f40000,0x00000000 ! 5/2^130
1037	.word 0,1<<30 ! fsr: truncate, no exceptions
1038
1039	.word 0x44300000,0x00000000 ! 2^(52+16+0)
1040	.word 0x46300000,0x00000000 ! 2^(52+16+32)
1041	.word 0x48300000,0x00000000 ! 2^(52+16+64)
1042	.word 0x4a300000,0x00000000 ! 2^(52+16+96)
1043	.word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
1044	.word 0x40300000,0x00000000 ! 2^(52+16+32-96)
1045	.word 0x42300000,0x00000000 ! 2^(52+16+64-96)
1046	.asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
1047	.align 4
1048	___
1049	}
1050
1051
1052	# Purpose of these subroutines is to explicitly encode VIS instructions,
1053	# so that one can compile the module without having to specify VIS
1054	# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1055	# Idea is to reserve for option to produce "universal" binary and let
1056	# programmer detect if current CPU is VIS capable at run-time.
# unvis3($mnemonic, $rs1, $rs2, $rd)
#
# Hand-encode one three-operand VIS3 integer instruction (addxc, addxccc
# or umulxhi) as a raw ".word", so that the assembler does not need to
# know the VIS3 extension.
#
# Parameters: the mnemonic and the three operands, each a register string
# of the form %gN, %oN, %lN or %iN.
# Returns: ".word\t0x<opcode> !<original instruction>" when the mnemonic
# is in the opcode table and every operand is such a register; otherwise
# the original text "<mnemonic>\t<rs1>,<rs2>,<rd>" unchanged, which is
# then left for the assembler to handle.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # Register number of the first register in each group.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my ($ref,$opf);
    my %visopf = ( "addxc"   => 0x011,
                   "addxccc" => 0x013,
                   "umulxhi" => 0x016 );

    # Built before the loop below overwrites the operands with numbers.
    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        # $_ aliases each operand, so the assignment replaces the register
        # name with its register number in place.
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%([goli])([0-9])/);
            $_=$bias{$1}+$2;
        }

        # rd at bit 25, rs1 at bit 14, opf at bit 5, rs2 in the low bits.
        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}
1080
# unfma($mnemonic, $rs1, $rs2, $rs3, $rd)
#
# Hand-encode one fused multiply-add/subtract instruction (fmadds, fmaddd,
# fmsubs or fmsubd) as a raw ".word", so that the assembler does not need
# to know the FMA extension.
#
# Parameters: the mnemonic and the four operands, each a floating-point
# register string of the form %fN.
# Returns: ".word\t0x<opcode> !<original instruction>" when the mnemonic
# is in the opcode table and every operand can be encoded; otherwise the
# original text "<mnemonic>\t<rs1>,<rs2>,<rs3>,<rd>" unchanged.  An
# odd-numbered register at or above %f32 cannot be encoded and causes
# the original text to be returned.
sub unfma {
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
    my ($ref,$opf);
    my %fmaopf = ( "fmadds" => 0x1,
                   "fmaddd" => 0x2,
                   "fmsubs" => 0x5,
                   "fmsubd" => 0x6 );

    # Built before the loop below overwrites the operands with numbers.
    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if ($opf=$fmaopf{$mnemonic}) {
        # $_ aliases each operand, so the assignments replace the register
        # name with its 5-bit field value in place.
        foreach ($rs1,$rs2,$rs3,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        # rd at bit 25, rs1 at bit 14, rs3 at bit 9, opf at bit 5, rs2 in
        # the low bits.
        return sprintf ".word\t0x%08x !%s",
                       0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}
1109
# Post-process the accumulated assembly text one line at a time and print
# it to STDOUT.
foreach (split("\n",$code)) {
    # Replace every `expr` with the value of the Perl expression, e.g. the
    # `8*4+4` offsets used in the FMA code.
    s/\`([^\`]*)\`/eval $1/ge;

    # Replace VIS3 and FMA instructions with their hand-encoded .word form.
    # The mnemonic alternatives are separated by a plain "|", and "\s*"
    # after each comma accepts the generated operands, which are written
    # without any space after the comma.  The second substitution is only
    # attempted when the first one did not match.
    s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
        unvis3($1,$2,$3,$4)
    /ge or
    s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
        unfma($1,$2,$3,$4,$5)
    /ge;

    print $_,"\n";
}

# Buffered write errors only surface at close, so a failed close is fatal.
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-3.1.2/crypto/poly1305/asm/poly1305-sparcv9.pl@ 101021

Download in other formats: