#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike integer multiplier, which simply stalls whole CPU,
# FPU is fully pipelined and can effectively emit 48 bit partial
# product every cycle. Why not blended SPARC v9? One can argue that
# making this module dependent on UltraSPARC VIS extension limits its
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
# implementations from compatibility matrix. But the rest, whole Sun
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
# VIS extension instructions used in this module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.

# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves few percents for shorter keys and worsens few
# percents for longer keys. This is because USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below]. It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)

# The implementation implies following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, simply
# doesn't give any performance gain.

# TODO:
# - modulo-schedule inner loop for better performance (on in-order
#   execution core such as UltraSPARC this shall result in further
#   noticeable(!) improvement);
# - dedicated squaring procedure[?];

######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow to interleave floating point and
# integer instructions and minimize Read-After-Write penalties. This
# results in *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.

65 | # $output is the last argument if it looks like a file (it has an extension)
|
---|
66 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
|
---|
67 |
|
---|
68 | $output and open STDOUT,">$output";
|
---|
69 |
|
---|
70 | $fname="bn_mul_mont_fpu";
|
---|
71 |
|
---|
72 | $frame="STACK_FRAME";
|
---|
73 | $bias="STACK_BIAS";
|
---|
74 | $locals=64;
|
---|
75 |
|
---|
76 | # In order to provide for 32-/64-bit ABI duality, I keep integers wider
|
---|
77 | # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
|
---|
78 | # exclusively for pointers, indexes and other small values...
|
---|
79 | # int bn_mul_mont(
|
---|
80 | $rp="%i0"; # BN_ULONG *rp,
|
---|
81 | $ap="%i1"; # const BN_ULONG *ap,
|
---|
82 | $bp="%i2"; # const BN_ULONG *bp,
|
---|
83 | $np="%i3"; # const BN_ULONG *np,
|
---|
84 | $n0="%i4"; # const BN_ULONG *n0,
|
---|
85 | $num="%i5"; # int num);
|
---|
86 |
|
---|
87 | $tp="%l0"; # t[num]
|
---|
88 | $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
|
---|
89 | $ap_h="%l2"; # to these four vectors as double-precision FP values.
|
---|
90 | $np_l="%l3"; # This way a bunch of fxtods are eliminated in second
|
---|
91 | $np_h="%l4"; # loop and L1-cache aliasing is minimized...
|
---|
92 | $i="%l5";
|
---|
93 | $j="%l6";
|
---|
94 | $mask="%l7"; # 16-bit mask, 0xffff
|
---|
95 |
|
---|
96 | $n0="%g4"; # reassigned(!) to "64-bit" register
|
---|
97 | $carry="%i4"; # %i4 reused(!) for a carry bit
|
---|
98 |
|
---|
99 | # FP register naming chart
|
---|
100 | #
|
---|
101 | # ..HILO
|
---|
102 | # dcba
|
---|
103 | # --------
|
---|
104 | # LOa
|
---|
105 | # LOb
|
---|
106 | # LOc
|
---|
107 | # LOd
|
---|
108 | # HIa
|
---|
109 | # HIb
|
---|
110 | # HIc
|
---|
111 | # HId
|
---|
112 | # ..a
|
---|
113 | # ..b
|
---|
114 | $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
|
---|
115 | $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
|
---|
116 | $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
|
---|
117 | $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
|
---|
118 |
|
---|
119 | $dota="%f24"; $dotb="%f26";
|
---|
120 |
|
---|
121 | $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
|
---|
122 | $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
|
---|
123 | $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
|
---|
124 | $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
|
---|
125 |
|
---|
126 | $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
|
---|
127 |
|
---|
128 | $code=<<___;
|
---|
129 | #ifndef __ASSEMBLER__
|
---|
130 | # define __ASSEMBLER__ 1
|
---|
131 | #endif
|
---|
132 | #include "crypto/sparc_arch.h"
|
---|
133 |
|
---|
134 | .section ".text",#alloc,#execinstr
|
---|
135 |
|
---|
136 | .global $fname
|
---|
137 | .align 32
|
---|
138 | $fname:
|
---|
139 | save %sp,-$frame-$locals,%sp
|
---|
140 |
|
---|
141 | cmp $num,4
|
---|
142 | bl,a,pn %icc,.Lret
|
---|
143 | clr %i0
|
---|
144 | andcc $num,1,%g0 ! $num has to be even...
|
---|
145 | bnz,a,pn %icc,.Lret
|
---|
146 | clr %i0 ! signal "unsupported input value"
|
---|
147 |
|
---|
148 | srl $num,1,$num
|
---|
149 | sethi %hi(0xffff),$mask
|
---|
150 | ld [%i4+0],$n0 ! $n0 reassigned, remember?
|
---|
151 | or $mask,%lo(0xffff),$mask
|
---|
152 | ld [%i4+4],%o0
|
---|
153 | sllx %o0,32,%o0
|
---|
154 | or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
|
---|
155 |
|
---|
156 | sll $num,3,$num ! num*=8
|
---|
157 |
|
---|
158 | add %sp,$bias,%o0 ! real top of stack
|
---|
159 | sll $num,2,%o1
|
---|
160 | add %o1,$num,%o1 ! %o1=num*5
|
---|
161 | sub %o0,%o1,%o0
|
---|
162 | and %o0,-2048,%o0 ! optimize TLB utilization
|
---|
163 | sub %o0,$bias,%sp ! alloca(5*num*8)
|
---|
164 |
|
---|
165 | rd %asi,%o7 ! save %asi
|
---|
166 | add %sp,$bias+$frame+$locals,$tp
|
---|
167 | add $tp,$num,$ap_l
|
---|
168 | add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
|
---|
169 | add $ap_l,$num,$ap_h
|
---|
170 | add $ap_h,$num,$np_l
|
---|
171 | add $np_l,$num,$np_h
|
---|
172 |
|
---|
173 | wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
|
---|
174 |
|
---|
175 | add $rp,$num,$rp ! readjust input pointers to point
|
---|
176 | add $ap,$num,$ap ! at the ends too...
|
---|
177 | add $bp,$num,$bp
|
---|
178 | add $np,$num,$np
|
---|
179 |
|
---|
180 | stx %o7,[%sp+$bias+$frame+48] ! save %asi
|
---|
181 | |
---|
182 |
|
---|
183 | sub %g0,$num,$i ! i=-num
|
---|
184 | sub %g0,$num,$j ! j=-num
|
---|
185 |
|
---|
186 | add $ap,$j,%o3
|
---|
187 | add $bp,$i,%o4
|
---|
188 |
|
---|
189 | ld [%o3+4],%g1 ! bp[0]
|
---|
190 | ld [%o3+0],%o0
|
---|
191 | ld [%o4+4],%g5 ! ap[0]
|
---|
192 | sllx %g1,32,%g1
|
---|
193 | ld [%o4+0],%o1
|
---|
194 | sllx %g5,32,%g5
|
---|
195 | or %g1,%o0,%o0
|
---|
196 | or %g5,%o1,%o1
|
---|
197 |
|
---|
198 | add $np,$j,%o5
|
---|
199 |
|
---|
200 | mulx %o1,%o0,%o0 ! ap[0]*bp[0]
|
---|
201 | mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
|
---|
202 | stx %o0,[%sp+$bias+$frame+0]
|
---|
203 |
|
---|
204 | ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
|
---|
205 | fzeros $alo
|
---|
206 | ld [%o3+4],$ahi_
|
---|
207 | fzeros $ahi
|
---|
208 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
---|
209 | fzeros $nlo
|
---|
210 | ld [%o5+4],$nhi_
|
---|
211 | fzeros $nhi
|
---|
212 |
|
---|
213 | ! transfer b[i] to FPU as 4x16-bit values
|
---|
214 | ldda [%o4+2]%asi,$ba
|
---|
215 | fxtod $alo,$alo
|
---|
216 | ldda [%o4+0]%asi,$bb
|
---|
217 | fxtod $ahi,$ahi
|
---|
218 | ldda [%o4+6]%asi,$bc
|
---|
219 | fxtod $nlo,$nlo
|
---|
220 | ldda [%o4+4]%asi,$bd
|
---|
221 | fxtod $nhi,$nhi
|
---|
222 |
|
---|
223 | ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
|
---|
224 | ldda [%sp+$bias+$frame+6]%asi,$na
|
---|
225 | fxtod $ba,$ba
|
---|
226 | ldda [%sp+$bias+$frame+4]%asi,$nb
|
---|
227 | fxtod $bb,$bb
|
---|
228 | ldda [%sp+$bias+$frame+2]%asi,$nc
|
---|
229 | fxtod $bc,$bc
|
---|
230 | ldda [%sp+$bias+$frame+0]%asi,$nd
|
---|
231 | fxtod $bd,$bd
|
---|
232 |
|
---|
233 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
---|
234 | fxtod $na,$na
|
---|
235 | std $ahi,[$ap_h+$j]
|
---|
236 | fxtod $nb,$nb
|
---|
237 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
---|
238 | fxtod $nc,$nc
|
---|
239 | std $nhi,[$np_h+$j]
|
---|
240 | fxtod $nd,$nd
|
---|
241 |
|
---|
242 | fmuld $alo,$ba,$aloa
|
---|
243 | fmuld $nlo,$na,$nloa
|
---|
244 | fmuld $alo,$bb,$alob
|
---|
245 | fmuld $nlo,$nb,$nlob
|
---|
246 | fmuld $alo,$bc,$aloc
|
---|
247 | faddd $aloa,$nloa,$nloa
|
---|
248 | fmuld $nlo,$nc,$nloc
|
---|
249 | fmuld $alo,$bd,$alod
|
---|
250 | faddd $alob,$nlob,$nlob
|
---|
251 | fmuld $nlo,$nd,$nlod
|
---|
252 | fmuld $ahi,$ba,$ahia
|
---|
253 | faddd $aloc,$nloc,$nloc
|
---|
254 | fmuld $nhi,$na,$nhia
|
---|
255 | fmuld $ahi,$bb,$ahib
|
---|
256 | faddd $alod,$nlod,$nlod
|
---|
257 | fmuld $nhi,$nb,$nhib
|
---|
258 | fmuld $ahi,$bc,$ahic
|
---|
259 | faddd $ahia,$nhia,$nhia
|
---|
260 | fmuld $nhi,$nc,$nhic
|
---|
261 | fmuld $ahi,$bd,$ahid
|
---|
262 | faddd $ahib,$nhib,$nhib
|
---|
263 | fmuld $nhi,$nd,$nhid
|
---|
264 |
|
---|
265 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
266 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
267 |
|
---|
268 | faddd $nloc,$nhia,$nloc
|
---|
269 | faddd $nlod,$nhib,$nlod
|
---|
270 |
|
---|
271 | fdtox $nloa,$nloa
|
---|
272 | fdtox $nlob,$nlob
|
---|
273 | fdtox $nloc,$nloc
|
---|
274 | fdtox $nlod,$nlod
|
---|
275 |
|
---|
276 | std $nloa,[%sp+$bias+$frame+0]
|
---|
277 | add $j,8,$j
|
---|
278 | std $nlob,[%sp+$bias+$frame+8]
|
---|
279 | add $ap,$j,%o4
|
---|
280 | std $nloc,[%sp+$bias+$frame+16]
|
---|
281 | add $np,$j,%o5
|
---|
282 | std $nlod,[%sp+$bias+$frame+24]
|
---|
283 | |
---|
284 |
|
---|
285 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
---|
286 | fzeros $alo
|
---|
287 | ld [%o4+4],$ahi_
|
---|
288 | fzeros $ahi
|
---|
289 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
---|
290 | fzeros $nlo
|
---|
291 | ld [%o5+4],$nhi_
|
---|
292 | fzeros $nhi
|
---|
293 |
|
---|
294 | fxtod $alo,$alo
|
---|
295 | fxtod $ahi,$ahi
|
---|
296 | fxtod $nlo,$nlo
|
---|
297 | fxtod $nhi,$nhi
|
---|
298 |
|
---|
299 | ldx [%sp+$bias+$frame+0],%o0
|
---|
300 | fmuld $alo,$ba,$aloa
|
---|
301 | ldx [%sp+$bias+$frame+8],%o1
|
---|
302 | fmuld $nlo,$na,$nloa
|
---|
303 | ldx [%sp+$bias+$frame+16],%o2
|
---|
304 | fmuld $alo,$bb,$alob
|
---|
305 | ldx [%sp+$bias+$frame+24],%o3
|
---|
306 | fmuld $nlo,$nb,$nlob
|
---|
307 |
|
---|
308 | srlx %o0,16,%o7
|
---|
309 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
---|
310 | fmuld $alo,$bc,$aloc
|
---|
311 | add %o7,%o1,%o1
|
---|
312 | std $ahi,[$ap_h+$j]
|
---|
313 | faddd $aloa,$nloa,$nloa
|
---|
314 | fmuld $nlo,$nc,$nloc
|
---|
315 | srlx %o1,16,%o7
|
---|
316 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
---|
317 | fmuld $alo,$bd,$alod
|
---|
318 | add %o7,%o2,%o2
|
---|
319 | std $nhi,[$np_h+$j]
|
---|
320 | faddd $alob,$nlob,$nlob
|
---|
321 | fmuld $nlo,$nd,$nlod
|
---|
322 | srlx %o2,16,%o7
|
---|
323 | fmuld $ahi,$ba,$ahia
|
---|
324 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
325 | faddd $aloc,$nloc,$nloc
|
---|
326 | fmuld $nhi,$na,$nhia
|
---|
327 | !and %o0,$mask,%o0
|
---|
328 | !and %o1,$mask,%o1
|
---|
329 | !and %o2,$mask,%o2
|
---|
330 | !sllx %o1,16,%o1
|
---|
331 | !sllx %o2,32,%o2
|
---|
332 | !sllx %o3,48,%o7
|
---|
333 | !or %o1,%o0,%o0
|
---|
334 | !or %o2,%o0,%o0
|
---|
335 | !or %o7,%o0,%o0 ! 64-bit result
|
---|
336 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
337 | fmuld $ahi,$bb,$ahib
|
---|
338 |
|
---|
339 | faddd $alod,$nlod,$nlod
|
---|
340 | fmuld $nhi,$nb,$nhib
|
---|
341 | fmuld $ahi,$bc,$ahic
|
---|
342 | faddd $ahia,$nhia,$nhia
|
---|
343 | fmuld $nhi,$nc,$nhic
|
---|
344 | fmuld $ahi,$bd,$ahid
|
---|
345 | faddd $ahib,$nhib,$nhib
|
---|
346 | fmuld $nhi,$nd,$nhid
|
---|
347 |
|
---|
348 | faddd $dota,$nloa,$nloa
|
---|
349 | faddd $dotb,$nlob,$nlob
|
---|
350 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
351 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
352 |
|
---|
353 | faddd $nloc,$nhia,$nloc
|
---|
354 | faddd $nlod,$nhib,$nlod
|
---|
355 |
|
---|
356 | fdtox $nloa,$nloa
|
---|
357 | fdtox $nlob,$nlob
|
---|
358 | fdtox $nloc,$nloc
|
---|
359 | fdtox $nlod,$nlod
|
---|
360 |
|
---|
361 | std $nloa,[%sp+$bias+$frame+0]
|
---|
362 | std $nlob,[%sp+$bias+$frame+8]
|
---|
363 | addcc $j,8,$j
|
---|
364 | std $nloc,[%sp+$bias+$frame+16]
|
---|
365 | bz,pn %icc,.L1stskip
|
---|
366 | std $nlod,[%sp+$bias+$frame+24]
|
---|
367 | |
---|
368 |
|
---|
369 | .align 32 ! incidentally already aligned !
|
---|
370 | .L1st:
|
---|
371 | add $ap,$j,%o4
|
---|
372 | add $np,$j,%o5
|
---|
373 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
---|
374 | fzeros $alo
|
---|
375 | ld [%o4+4],$ahi_
|
---|
376 | fzeros $ahi
|
---|
377 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
---|
378 | fzeros $nlo
|
---|
379 | ld [%o5+4],$nhi_
|
---|
380 | fzeros $nhi
|
---|
381 |
|
---|
382 | fxtod $alo,$alo
|
---|
383 | fxtod $ahi,$ahi
|
---|
384 | fxtod $nlo,$nlo
|
---|
385 | fxtod $nhi,$nhi
|
---|
386 |
|
---|
387 | ldx [%sp+$bias+$frame+0],%o0
|
---|
388 | fmuld $alo,$ba,$aloa
|
---|
389 | ldx [%sp+$bias+$frame+8],%o1
|
---|
390 | fmuld $nlo,$na,$nloa
|
---|
391 | ldx [%sp+$bias+$frame+16],%o2
|
---|
392 | fmuld $alo,$bb,$alob
|
---|
393 | ldx [%sp+$bias+$frame+24],%o3
|
---|
394 | fmuld $nlo,$nb,$nlob
|
---|
395 |
|
---|
396 | srlx %o0,16,%o7
|
---|
397 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
---|
398 | fmuld $alo,$bc,$aloc
|
---|
399 | add %o7,%o1,%o1
|
---|
400 | std $ahi,[$ap_h+$j]
|
---|
401 | faddd $aloa,$nloa,$nloa
|
---|
402 | fmuld $nlo,$nc,$nloc
|
---|
403 | srlx %o1,16,%o7
|
---|
404 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
---|
405 | fmuld $alo,$bd,$alod
|
---|
406 | add %o7,%o2,%o2
|
---|
407 | std $nhi,[$np_h+$j]
|
---|
408 | faddd $alob,$nlob,$nlob
|
---|
409 | fmuld $nlo,$nd,$nlod
|
---|
410 | srlx %o2,16,%o7
|
---|
411 | fmuld $ahi,$ba,$ahia
|
---|
412 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
413 | and %o0,$mask,%o0
|
---|
414 | faddd $aloc,$nloc,$nloc
|
---|
415 | fmuld $nhi,$na,$nhia
|
---|
416 | and %o1,$mask,%o1
|
---|
417 | and %o2,$mask,%o2
|
---|
418 | fmuld $ahi,$bb,$ahib
|
---|
419 | sllx %o1,16,%o1
|
---|
420 | faddd $alod,$nlod,$nlod
|
---|
421 | fmuld $nhi,$nb,$nhib
|
---|
422 | sllx %o2,32,%o2
|
---|
423 | fmuld $ahi,$bc,$ahic
|
---|
424 | sllx %o3,48,%o7
|
---|
425 | or %o1,%o0,%o0
|
---|
426 | faddd $ahia,$nhia,$nhia
|
---|
427 | fmuld $nhi,$nc,$nhic
|
---|
428 | or %o2,%o0,%o0
|
---|
429 | fmuld $ahi,$bd,$ahid
|
---|
430 | or %o7,%o0,%o0 ! 64-bit result
|
---|
431 | faddd $ahib,$nhib,$nhib
|
---|
432 | fmuld $nhi,$nd,$nhid
|
---|
433 | addcc %g1,%o0,%o0
|
---|
434 | faddd $dota,$nloa,$nloa
|
---|
435 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
436 | faddd $dotb,$nlob,$nlob
|
---|
437 | bcs,a %xcc,.+8
|
---|
438 | add %g1,1,%g1
|
---|
439 |
|
---|
440 | stx %o0,[$tp] ! tp[j-1]=
|
---|
441 |
|
---|
442 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
443 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
444 |
|
---|
445 | faddd $nloc,$nhia,$nloc
|
---|
446 | faddd $nlod,$nhib,$nlod
|
---|
447 |
|
---|
448 | fdtox $nloa,$nloa
|
---|
449 | fdtox $nlob,$nlob
|
---|
450 | fdtox $nloc,$nloc
|
---|
451 | fdtox $nlod,$nlod
|
---|
452 |
|
---|
453 | std $nloa,[%sp+$bias+$frame+0]
|
---|
454 | std $nlob,[%sp+$bias+$frame+8]
|
---|
455 | std $nloc,[%sp+$bias+$frame+16]
|
---|
456 | std $nlod,[%sp+$bias+$frame+24]
|
---|
457 |
|
---|
458 | addcc $j,8,$j
|
---|
459 | bnz,pt %icc,.L1st
|
---|
460 | add $tp,8,$tp
|
---|
461 | |
---|
462 |
|
---|
463 | .L1stskip:
|
---|
464 | fdtox $dota,$dota
|
---|
465 | fdtox $dotb,$dotb
|
---|
466 |
|
---|
467 | ldx [%sp+$bias+$frame+0],%o0
|
---|
468 | ldx [%sp+$bias+$frame+8],%o1
|
---|
469 | ldx [%sp+$bias+$frame+16],%o2
|
---|
470 | ldx [%sp+$bias+$frame+24],%o3
|
---|
471 |
|
---|
472 | srlx %o0,16,%o7
|
---|
473 | std $dota,[%sp+$bias+$frame+32]
|
---|
474 | add %o7,%o1,%o1
|
---|
475 | std $dotb,[%sp+$bias+$frame+40]
|
---|
476 | srlx %o1,16,%o7
|
---|
477 | add %o7,%o2,%o2
|
---|
478 | srlx %o2,16,%o7
|
---|
479 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
480 | and %o0,$mask,%o0
|
---|
481 | and %o1,$mask,%o1
|
---|
482 | and %o2,$mask,%o2
|
---|
483 | sllx %o1,16,%o1
|
---|
484 | sllx %o2,32,%o2
|
---|
485 | sllx %o3,48,%o7
|
---|
486 | or %o1,%o0,%o0
|
---|
487 | or %o2,%o0,%o0
|
---|
488 | or %o7,%o0,%o0 ! 64-bit result
|
---|
489 | ldx [%sp+$bias+$frame+32],%o4
|
---|
490 | addcc %g1,%o0,%o0
|
---|
491 | ldx [%sp+$bias+$frame+40],%o5
|
---|
492 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
493 | bcs,a %xcc,.+8
|
---|
494 | add %g1,1,%g1
|
---|
495 |
|
---|
496 | stx %o0,[$tp] ! tp[j-1]=
|
---|
497 | add $tp,8,$tp
|
---|
498 |
|
---|
499 | srlx %o4,16,%o7
|
---|
500 | add %o7,%o5,%o5
|
---|
501 | and %o4,$mask,%o4
|
---|
502 | sllx %o5,16,%o7
|
---|
503 | or %o7,%o4,%o4
|
---|
504 | addcc %g1,%o4,%o4
|
---|
505 | srlx %o5,48,%g1
|
---|
506 | bcs,a %xcc,.+8
|
---|
507 | add %g1,1,%g1
|
---|
508 |
|
---|
509 | mov %g1,$carry
|
---|
510 | stx %o4,[$tp] ! tp[num-1]=
|
---|
511 | |
---|
512 |
|
---|
513 | ba .Louter
|
---|
514 | add $i,8,$i
|
---|
515 | .align 32
|
---|
516 | .Louter:
|
---|
517 | sub %g0,$num,$j ! j=-num
|
---|
518 | add %sp,$bias+$frame+$locals,$tp
|
---|
519 |
|
---|
520 | add $ap,$j,%o3
|
---|
521 | add $bp,$i,%o4
|
---|
522 |
|
---|
523 | ld [%o3+4],%g1 ! bp[i]
|
---|
524 | ld [%o3+0],%o0
|
---|
525 | ld [%o4+4],%g5 ! ap[0]
|
---|
526 | sllx %g1,32,%g1
|
---|
527 | ld [%o4+0],%o1
|
---|
528 | sllx %g5,32,%g5
|
---|
529 | or %g1,%o0,%o0
|
---|
530 | or %g5,%o1,%o1
|
---|
531 |
|
---|
532 | ldx [$tp],%o2 ! tp[0]
|
---|
533 | mulx %o1,%o0,%o0
|
---|
534 | addcc %o2,%o0,%o0
|
---|
535 | mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
|
---|
536 | stx %o0,[%sp+$bias+$frame+0]
|
---|
537 |
|
---|
538 | ! transfer b[i] to FPU as 4x16-bit values
|
---|
539 | ldda [%o4+2]%asi,$ba
|
---|
540 | ldda [%o4+0]%asi,$bb
|
---|
541 | ldda [%o4+6]%asi,$bc
|
---|
542 | ldda [%o4+4]%asi,$bd
|
---|
543 |
|
---|
544 | ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
|
---|
545 | ldda [%sp+$bias+$frame+6]%asi,$na
|
---|
546 | fxtod $ba,$ba
|
---|
547 | ldda [%sp+$bias+$frame+4]%asi,$nb
|
---|
548 | fxtod $bb,$bb
|
---|
549 | ldda [%sp+$bias+$frame+2]%asi,$nc
|
---|
550 | fxtod $bc,$bc
|
---|
551 | ldda [%sp+$bias+$frame+0]%asi,$nd
|
---|
552 | fxtod $bd,$bd
|
---|
553 | ldd [$ap_l+$j],$alo ! load a[j] in double format
|
---|
554 | fxtod $na,$na
|
---|
555 | ldd [$ap_h+$j],$ahi
|
---|
556 | fxtod $nb,$nb
|
---|
557 | ldd [$np_l+$j],$nlo ! load n[j] in double format
|
---|
558 | fxtod $nc,$nc
|
---|
559 | ldd [$np_h+$j],$nhi
|
---|
560 | fxtod $nd,$nd
|
---|
561 |
|
---|
562 | fmuld $alo,$ba,$aloa
|
---|
563 | fmuld $nlo,$na,$nloa
|
---|
564 | fmuld $alo,$bb,$alob
|
---|
565 | fmuld $nlo,$nb,$nlob
|
---|
566 | fmuld $alo,$bc,$aloc
|
---|
567 | faddd $aloa,$nloa,$nloa
|
---|
568 | fmuld $nlo,$nc,$nloc
|
---|
569 | fmuld $alo,$bd,$alod
|
---|
570 | faddd $alob,$nlob,$nlob
|
---|
571 | fmuld $nlo,$nd,$nlod
|
---|
572 | fmuld $ahi,$ba,$ahia
|
---|
573 | faddd $aloc,$nloc,$nloc
|
---|
574 | fmuld $nhi,$na,$nhia
|
---|
575 | fmuld $ahi,$bb,$ahib
|
---|
576 | faddd $alod,$nlod,$nlod
|
---|
577 | fmuld $nhi,$nb,$nhib
|
---|
578 | fmuld $ahi,$bc,$ahic
|
---|
579 | faddd $ahia,$nhia,$nhia
|
---|
580 | fmuld $nhi,$nc,$nhic
|
---|
581 | fmuld $ahi,$bd,$ahid
|
---|
582 | faddd $ahib,$nhib,$nhib
|
---|
583 | fmuld $nhi,$nd,$nhid
|
---|
584 |
|
---|
585 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
586 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
587 |
|
---|
588 | faddd $nloc,$nhia,$nloc
|
---|
589 | faddd $nlod,$nhib,$nlod
|
---|
590 |
|
---|
591 | fdtox $nloa,$nloa
|
---|
592 | fdtox $nlob,$nlob
|
---|
593 | fdtox $nloc,$nloc
|
---|
594 | fdtox $nlod,$nlod
|
---|
595 |
|
---|
596 | std $nloa,[%sp+$bias+$frame+0]
|
---|
597 | std $nlob,[%sp+$bias+$frame+8]
|
---|
598 | std $nloc,[%sp+$bias+$frame+16]
|
---|
599 | add $j,8,$j
|
---|
600 | std $nlod,[%sp+$bias+$frame+24]
|
---|
601 | |
---|
602 |
|
---|
603 | ldd [$ap_l+$j],$alo ! load a[j] in double format
|
---|
604 | ldd [$ap_h+$j],$ahi
|
---|
605 | ldd [$np_l+$j],$nlo ! load n[j] in double format
|
---|
606 | ldd [$np_h+$j],$nhi
|
---|
607 |
|
---|
608 | fmuld $alo,$ba,$aloa
|
---|
609 | fmuld $nlo,$na,$nloa
|
---|
610 | fmuld $alo,$bb,$alob
|
---|
611 | fmuld $nlo,$nb,$nlob
|
---|
612 | fmuld $alo,$bc,$aloc
|
---|
613 | ldx [%sp+$bias+$frame+0],%o0
|
---|
614 | faddd $aloa,$nloa,$nloa
|
---|
615 | fmuld $nlo,$nc,$nloc
|
---|
616 | ldx [%sp+$bias+$frame+8],%o1
|
---|
617 | fmuld $alo,$bd,$alod
|
---|
618 | ldx [%sp+$bias+$frame+16],%o2
|
---|
619 | faddd $alob,$nlob,$nlob
|
---|
620 | fmuld $nlo,$nd,$nlod
|
---|
621 | ldx [%sp+$bias+$frame+24],%o3
|
---|
622 | fmuld $ahi,$ba,$ahia
|
---|
623 |
|
---|
624 | srlx %o0,16,%o7
|
---|
625 | faddd $aloc,$nloc,$nloc
|
---|
626 | fmuld $nhi,$na,$nhia
|
---|
627 | add %o7,%o1,%o1
|
---|
628 | fmuld $ahi,$bb,$ahib
|
---|
629 | srlx %o1,16,%o7
|
---|
630 | faddd $alod,$nlod,$nlod
|
---|
631 | fmuld $nhi,$nb,$nhib
|
---|
632 | add %o7,%o2,%o2
|
---|
633 | fmuld $ahi,$bc,$ahic
|
---|
634 | srlx %o2,16,%o7
|
---|
635 | faddd $ahia,$nhia,$nhia
|
---|
636 | fmuld $nhi,$nc,$nhic
|
---|
637 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
638 | ! why?
|
---|
639 | and %o0,$mask,%o0
|
---|
640 | fmuld $ahi,$bd,$ahid
|
---|
641 | and %o1,$mask,%o1
|
---|
642 | and %o2,$mask,%o2
|
---|
643 | faddd $ahib,$nhib,$nhib
|
---|
644 | fmuld $nhi,$nd,$nhid
|
---|
645 | sllx %o1,16,%o1
|
---|
646 | faddd $dota,$nloa,$nloa
|
---|
647 | sllx %o2,32,%o2
|
---|
648 | faddd $dotb,$nlob,$nlob
|
---|
649 | sllx %o3,48,%o7
|
---|
650 | or %o1,%o0,%o0
|
---|
651 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
652 | or %o2,%o0,%o0
|
---|
653 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
654 | or %o7,%o0,%o0 ! 64-bit result
|
---|
655 | ldx [$tp],%o7
|
---|
656 | faddd $nloc,$nhia,$nloc
|
---|
657 | addcc %o7,%o0,%o0
|
---|
658 | ! end-of-why?
|
---|
659 | faddd $nlod,$nhib,$nlod
|
---|
660 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
661 | fdtox $nloa,$nloa
|
---|
662 | bcs,a %xcc,.+8
|
---|
663 | add %g1,1,%g1
|
---|
664 |
|
---|
665 | fdtox $nlob,$nlob
|
---|
666 | fdtox $nloc,$nloc
|
---|
667 | fdtox $nlod,$nlod
|
---|
668 |
|
---|
669 | std $nloa,[%sp+$bias+$frame+0]
|
---|
670 | std $nlob,[%sp+$bias+$frame+8]
|
---|
671 | addcc $j,8,$j
|
---|
672 | std $nloc,[%sp+$bias+$frame+16]
|
---|
673 | bz,pn %icc,.Linnerskip
|
---|
674 | std $nlod,[%sp+$bias+$frame+24]
|
---|
675 | |
---|
676 |
|
---|
677 | ba .Linner
|
---|
678 | nop
|
---|
679 | .align 32
|
---|
680 | .Linner:
|
---|
681 | ldd [$ap_l+$j],$alo ! load a[j] in double format
|
---|
682 | ldd [$ap_h+$j],$ahi
|
---|
683 | ldd [$np_l+$j],$nlo ! load n[j] in double format
|
---|
684 | ldd [$np_h+$j],$nhi
|
---|
685 |
|
---|
686 | fmuld $alo,$ba,$aloa
|
---|
687 | fmuld $nlo,$na,$nloa
|
---|
688 | fmuld $alo,$bb,$alob
|
---|
689 | fmuld $nlo,$nb,$nlob
|
---|
690 | fmuld $alo,$bc,$aloc
|
---|
691 | ldx [%sp+$bias+$frame+0],%o0
|
---|
692 | faddd $aloa,$nloa,$nloa
|
---|
693 | fmuld $nlo,$nc,$nloc
|
---|
694 | ldx [%sp+$bias+$frame+8],%o1
|
---|
695 | fmuld $alo,$bd,$alod
|
---|
696 | ldx [%sp+$bias+$frame+16],%o2
|
---|
697 | faddd $alob,$nlob,$nlob
|
---|
698 | fmuld $nlo,$nd,$nlod
|
---|
699 | ldx [%sp+$bias+$frame+24],%o3
|
---|
700 | fmuld $ahi,$ba,$ahia
|
---|
701 |
|
---|
702 | srlx %o0,16,%o7
|
---|
703 | faddd $aloc,$nloc,$nloc
|
---|
704 | fmuld $nhi,$na,$nhia
|
---|
705 | add %o7,%o1,%o1
|
---|
706 | fmuld $ahi,$bb,$ahib
|
---|
707 | srlx %o1,16,%o7
|
---|
708 | faddd $alod,$nlod,$nlod
|
---|
709 | fmuld $nhi,$nb,$nhib
|
---|
710 | add %o7,%o2,%o2
|
---|
711 | fmuld $ahi,$bc,$ahic
|
---|
712 | srlx %o2,16,%o7
|
---|
713 | faddd $ahia,$nhia,$nhia
|
---|
714 | fmuld $nhi,$nc,$nhic
|
---|
715 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
716 | and %o0,$mask,%o0
|
---|
717 | fmuld $ahi,$bd,$ahid
|
---|
718 | and %o1,$mask,%o1
|
---|
719 | and %o2,$mask,%o2
|
---|
720 | faddd $ahib,$nhib,$nhib
|
---|
721 | fmuld $nhi,$nd,$nhid
|
---|
722 | sllx %o1,16,%o1
|
---|
723 | faddd $dota,$nloa,$nloa
|
---|
724 | sllx %o2,32,%o2
|
---|
725 | faddd $dotb,$nlob,$nlob
|
---|
726 | sllx %o3,48,%o7
|
---|
727 | or %o1,%o0,%o0
|
---|
728 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
729 | or %o2,%o0,%o0
|
---|
730 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
731 | or %o7,%o0,%o0 ! 64-bit result
|
---|
732 | faddd $nloc,$nhia,$nloc
|
---|
733 | addcc %g1,%o0,%o0
|
---|
734 | ldx [$tp+8],%o7 ! tp[j]
|
---|
735 | faddd $nlod,$nhib,$nlod
|
---|
736 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
737 | fdtox $nloa,$nloa
|
---|
738 | bcs,a %xcc,.+8
|
---|
739 | add %g1,1,%g1
|
---|
740 | fdtox $nlob,$nlob
|
---|
741 | addcc %o7,%o0,%o0
|
---|
742 | fdtox $nloc,$nloc
|
---|
743 | bcs,a %xcc,.+8
|
---|
744 | add %g1,1,%g1
|
---|
745 |
|
---|
746 | stx %o0,[$tp] ! tp[j-1]
|
---|
747 | fdtox $nlod,$nlod
|
---|
748 |
|
---|
749 | std $nloa,[%sp+$bias+$frame+0]
|
---|
750 | std $nlob,[%sp+$bias+$frame+8]
|
---|
751 | std $nloc,[%sp+$bias+$frame+16]
|
---|
752 | addcc $j,8,$j
|
---|
753 | std $nlod,[%sp+$bias+$frame+24]
|
---|
754 | bnz,pt %icc,.Linner
|
---|
755 | add $tp,8,$tp
|
---|
756 | |
---|
757 |
|
---|
758 | .Linnerskip:
|
---|
759 | fdtox $dota,$dota
|
---|
760 | fdtox $dotb,$dotb
|
---|
761 |
|
---|
762 | ldx [%sp+$bias+$frame+0],%o0
|
---|
763 | ldx [%sp+$bias+$frame+8],%o1
|
---|
764 | ldx [%sp+$bias+$frame+16],%o2
|
---|
765 | ldx [%sp+$bias+$frame+24],%o3
|
---|
766 |
|
---|
767 | srlx %o0,16,%o7
|
---|
768 | std $dota,[%sp+$bias+$frame+32]
|
---|
769 | add %o7,%o1,%o1
|
---|
770 | std $dotb,[%sp+$bias+$frame+40]
|
---|
771 | srlx %o1,16,%o7
|
---|
772 | add %o7,%o2,%o2
|
---|
773 | srlx %o2,16,%o7
|
---|
774 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
775 | and %o0,$mask,%o0
|
---|
776 | and %o1,$mask,%o1
|
---|
777 | and %o2,$mask,%o2
|
---|
778 | sllx %o1,16,%o1
|
---|
779 | sllx %o2,32,%o2
|
---|
780 | sllx %o3,48,%o7
|
---|
781 | or %o1,%o0,%o0
|
---|
782 | or %o2,%o0,%o0
|
---|
783 | ldx [%sp+$bias+$frame+32],%o4
|
---|
784 | or %o7,%o0,%o0 ! 64-bit result
|
---|
785 | ldx [%sp+$bias+$frame+40],%o5
|
---|
786 | addcc %g1,%o0,%o0
|
---|
787 | ldx [$tp+8],%o7 ! tp[j]
|
---|
788 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
789 | bcs,a %xcc,.+8
|
---|
790 | add %g1,1,%g1
|
---|
791 |
|
---|
792 | addcc %o7,%o0,%o0
|
---|
793 | bcs,a %xcc,.+8
|
---|
794 | add %g1,1,%g1
|
---|
795 |
|
---|
796 | stx %o0,[$tp] ! tp[j-1]
|
---|
797 | add $tp,8,$tp
|
---|
798 |
|
---|
799 | srlx %o4,16,%o7
|
---|
800 | add %o7,%o5,%o5
|
---|
801 | and %o4,$mask,%o4
|
---|
802 | sllx %o5,16,%o7
|
---|
803 | or %o7,%o4,%o4
|
---|
804 | addcc %g1,%o4,%o4
|
---|
805 | srlx %o5,48,%g1
|
---|
806 | bcs,a %xcc,.+8
|
---|
807 | add %g1,1,%g1
|
---|
808 |
|
---|
809 | addcc $carry,%o4,%o4
|
---|
810 | stx %o4,[$tp] ! tp[num-1]
|
---|
811 | mov %g1,$carry
|
---|
812 | bcs,a %xcc,.+8
|
---|
813 | add $carry,1,$carry
|
---|
814 |
|
---|
815 | addcc $i,8,$i
|
---|
816 | bnz %icc,.Louter
|
---|
817 | nop
|
---|
818 | |
---|
819 |
|
---|
820 | add $tp,8,$tp ! adjust tp to point at the end
|
---|
821 | orn %g0,%g0,%g4
|
---|
822 | sub %g0,$num,%o7 ! n=-num
|
---|
823 | ba .Lsub
|
---|
824 | subcc %g0,%g0,%g0 ! clear %icc.c
|
---|
825 |
|
---|
826 | .align 32
|
---|
827 | .Lsub:
|
---|
828 | ldx [$tp+%o7],%o0
|
---|
829 | add $np,%o7,%g1
|
---|
830 | ld [%g1+0],%o2
|
---|
831 | ld [%g1+4],%o3
|
---|
832 | srlx %o0,32,%o1
|
---|
833 | subccc %o0,%o2,%o2
|
---|
834 | add $rp,%o7,%g1
|
---|
835 | subccc %o1,%o3,%o3
|
---|
836 | st %o2,[%g1+0]
|
---|
837 | add %o7,8,%o7
|
---|
838 | brnz,pt %o7,.Lsub
|
---|
839 | st %o3,[%g1+4]
|
---|
840 | subc $carry,0,%g4
|
---|
841 | sub %g0,$num,%o7 ! n=-num
|
---|
842 | ba .Lcopy
|
---|
843 | nop
|
---|
844 |
|
---|
845 | .align 32
|
---|
846 | .Lcopy:
|
---|
847 | ldx [$tp+%o7],%o0
|
---|
848 | add $rp,%o7,%g1
|
---|
849 | ld [%g1+0],%o2
|
---|
850 | ld [%g1+4],%o3
|
---|
851 | stx %g0,[$tp+%o7]
|
---|
852 | and %o0,%g4,%o0
|
---|
853 | srlx %o0,32,%o1
|
---|
854 | andn %o2,%g4,%o2
|
---|
855 | andn %o3,%g4,%o3
|
---|
856 | or %o2,%o0,%o0
|
---|
857 | or %o3,%o1,%o1
|
---|
858 | st %o0,[%g1+0]
|
---|
859 | add %o7,8,%o7
|
---|
860 | brnz,pt %o7,.Lcopy
|
---|
861 | st %o1,[%g1+4]
|
---|
862 | sub %g0,$num,%o7 ! n=-num
|
---|
863 |
|
---|
864 | .Lzap:
|
---|
865 | stx %g0,[$ap_l+%o7]
|
---|
866 | stx %g0,[$ap_h+%o7]
|
---|
867 | stx %g0,[$np_l+%o7]
|
---|
868 | stx %g0,[$np_h+%o7]
|
---|
869 | add %o7,8,%o7
|
---|
870 | brnz,pt %o7,.Lzap
|
---|
871 | nop
|
---|
872 |
|
---|
873 | ldx [%sp+$bias+$frame+48],%o7
|
---|
874 | wr %g0,%o7,%asi ! restore %asi
|
---|
875 |
|
---|
876 | mov 1,%i0
|
---|
877 | .Lret:
|
---|
878 | ret
|
---|
879 | restore
|
---|
880 | .type $fname,#function
|
---|
881 | .size $fname,(.-$fname)
|
---|
882 | .asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
883 | .align 32
|
---|
884 | ___
|
---|
885 |
|
---|
886 | $code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
---|
887 |
|
---|
888 | # Below substitution makes it possible to compile without demanding
|
---|
889 | # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
|
---|
890 | # dare to do this, because VIS capability is detected at run-time now
|
---|
891 | # and this routine is not called on CPU not capable to execute it. Do
|
---|
892 | # note that fzeros is not the only VIS dependency! Another dependency
|
---|
893 | # is implicit and is just _a_ numerical value loaded to %asi register,
|
---|
894 | # which assembler can't recognize as VIS specific...
|
---|
895 | $code =~ s/fzeros\s+%f([0-9]+)/
|
---|
896 | sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
|
---|
897 | /gem;
|
---|
898 |
|
---|
899 | print $code;
|
---|
900 | # flush
|
---|
901 | close STDOUT or die "error closing STDOUT: $!";
|
---|