VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/bn/asm/sparcv9a-mont.pl@ 94082

Last change on this file since 94082 was 94082, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

  • Property svn:executable set to *
File size: 20.7 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# October 2005
18#
19# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
20# Because unlike integer multiplier, which simply stalls whole CPU,
21# FPU is fully pipelined and can effectively emit 48 bit partial
22# product every cycle. Why not blended SPARC v9? One can argue that
23# making this module dependent on UltraSPARC VIS extension limits its
24# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
25# implementations from compatibility matrix. But the rest, whole Sun
26# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
27# VIS extension instructions used in this module. This is considered
28# good enough to not care about HAL SPARC64 users [if any] who have
29# integer-only pure SPARCv9 module to "fall down" to.
30
31# USI&II cores currently exhibit uniform 2x improvement [over pre-
32# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
33# performance improves few percents for shorter keys and worsens few
34# percents for longer keys. This is because USIII integer multiplier
35# is >3x faster than USI&II one, which is harder to match [but see
36# TODO list below]. It should also be noted that SPARC64 V features
37# out-of-order execution, which *might* mean that integer multiplier
38# is pipelined, which in turn *might* be impossible to match... On
39# additional note, SPARC64 V implements FP Multiply-Add instruction,
40# which is perfectly usable in this context... In other words, as far
41# as Fujitsu SPARC64 V goes, talk to the author:-)
42
43# The implementation implies following "non-natural" limitations on
44# input arguments:
45# - num may not be less than 4;
46# - num has to be even;
47# Failure to meet either condition has no fatal effects, simply
48# doesn't give any performance gain.
49
50# TODO:
51# - modulo-schedule inner loop for better performance (on in-order
52# execution core such as UltraSPARC this shall result in further
53# noticeable(!) improvement);
54# - dedicated squaring procedure[?];
55
56######################################################################
57# November 2006
58#
59# Modulo-scheduled inner loops allow to interleave floating point and
60# integer instructions and minimize Read-After-Write penalties. This
61# results in *further* 20-50% performance improvement [depending on
62# key length, more for longer keys] on USI&II cores and 30-80% - on
63# USIII&IV.
64
# ----------------------------------------------------------------------
# Perl preamble: output-file selection plus the symbolic register map.
# Every scalar defined here ($fname, $rp, $ap, ... $nhid) is later
# interpolated into the $code assembly template below, so these names
# ARE the register-allocation documentation for the generated routine.
# ----------------------------------------------------------------------
65# $output is the last argument if it looks like a file (it has an extension)
66$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
67
# When an output file was given, route all generated code to it;
# otherwise the template is printed to the existing STDOUT.
68$output and open STDOUT,">$output";
69
70$fname="bn_mul_mont_fpu";
71
# STACK_FRAME/STACK_BIAS are emitted symbolically into the assembly;
# they are presumably resolved by the "crypto/sparc_arch.h" header that
# the template #includes — confirm against that header.
72$frame="STACK_FRAME";
73$bias="STACK_BIAS";
74$locals=64;
75
76# In order to provide for 32-/64-bit ABI duality, I keep integers wider
77# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
78# exclusively for pointers, indexes and other small values...
79# int bn_mul_mont(
80$rp="%i0"; # BN_ULONG *rp,
81$ap="%i1"; # const BN_ULONG *ap,
82$bp="%i2"; # const BN_ULONG *bp,
83$np="%i3"; # const BN_ULONG *np,
84$n0="%i4"; # const BN_ULONG *n0,
85$num="%i5"; # int num);
86
87$tp="%l0"; # t[num]
88$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
89$ap_h="%l2"; # to these four vectors as double-precision FP values.
90$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
91$np_h="%l4"; # loop and L1-cache aliasing is minimized...
92$i="%l5";
93$j="%l6";
94$mask="%l7"; # 16-bit mask, 0xffff
95
# NOTE: $n0 and $carry deliberately re-bind names used above — after
# this point $n0 means the combined 64-bit n0 value in %g4, and %i4
# (no longer needed as the n0 pointer) carries the inter-iteration carry.
96$n0="%g4"; # reassigned(!) to "64-bit" register
97$carry="%i4"; # %i4 reused(!) for a carry bit
98
99# FP register naming chart
100#
101# ..HILO
102# dcba
103# --------
104# LOa
105# LOb
106# LOc
107# LOd
108# HIa
109# HIb
110# HIc
111# HId
112# ..a
113# ..b
# %f0-%f14: the four 16-bit limbs of b[i] and of the n0-product;
# %f16-%f23: a[j]/n[j] halves (odd-numbered aliases are the 32-bit
# single-precision views used by the 'ld' instructions in the template).
114$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
115$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
116$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
117$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
118
119$dota="%f24"; $dotb="%f26";
120
# %f32-%f62: the 16 partial products (a*b and n*n0 halves) accumulated
# by the fmuld/faddd chains in the template.
121$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
122$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
123$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
124$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
125
# Alternate address-space identifier used by the ldda ...%asi loads
# below to fetch 16-bit quantities straight into FP registers.
126$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
127
# ----------------------------------------------------------------------
# Assembly template for bn_mul_mont_fpu.  Everything between <<___ and
# the terminating ___ is ONE interpolated Perl string: the $-names refer
# to the scalars defined above and are substituted into the emitted
# SPARC v9 code.  NOTE(review): the inner loops (.L1st/.Linner) are
# modulo-scheduled to interleave FPU and integer work (see the November
# 2006 note in the header), so instruction order inside the template is
# significant — do not reorder lines when editing, and remember that
# any text added inside the heredoc ends up in the generated assembly.
# ----------------------------------------------------------------------
128$code=<<___;
129#ifndef __ASSEMBLER__
130# define __ASSEMBLER__ 1
131#endif
132#include "crypto/sparc_arch.h"
133
134.section ".text",#alloc,#execinstr
135
136.global $fname
137.align 32
138$fname:
139 save %sp,-$frame-$locals,%sp
140
141 cmp $num,4
142 bl,a,pn %icc,.Lret
143 clr %i0
144 andcc $num,1,%g0 ! $num has to be even...
145 bnz,a,pn %icc,.Lret
146 clr %i0 ! signal "unsupported input value"
147
148 srl $num,1,$num
149 sethi %hi(0xffff),$mask
150 ld [%i4+0],$n0 ! $n0 reassigned, remember?
151 or $mask,%lo(0xffff),$mask
152 ld [%i4+4],%o0
153 sllx %o0,32,%o0
154 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
155
156 sll $num,3,$num ! num*=8
157
158 add %sp,$bias,%o0 ! real top of stack
159 sll $num,2,%o1
160 add %o1,$num,%o1 ! %o1=num*5
161 sub %o0,%o1,%o0
162 and %o0,-2048,%o0 ! optimize TLB utilization
163 sub %o0,$bias,%sp ! alloca(5*num*8)
164
165 rd %asi,%o7 ! save %asi
166 add %sp,$bias+$frame+$locals,$tp
167 add $tp,$num,$ap_l
168 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
169 add $ap_l,$num,$ap_h
170 add $ap_h,$num,$np_l
171 add $np_l,$num,$np_h
172
173 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
174
175 add $rp,$num,$rp ! readjust input pointers to point
176 add $ap,$num,$ap ! at the ends too...
177 add $bp,$num,$bp
178 add $np,$num,$np
179
180 stx %o7,[%sp+$bias+$frame+48] ! save %asi
181
182
183 sub %g0,$num,$i ! i=-num
184 sub %g0,$num,$j ! j=-num
185
186 add $ap,$j,%o3
187 add $bp,$i,%o4
188
189 ld [%o3+4],%g1 ! bp[0]
190 ld [%o3+0],%o0
191 ld [%o4+4],%g5 ! ap[0]
192 sllx %g1,32,%g1
193 ld [%o4+0],%o1
194 sllx %g5,32,%g5
195 or %g1,%o0,%o0
196 or %g5,%o1,%o1
197
198 add $np,$j,%o5
199
200 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
201 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
202 stx %o0,[%sp+$bias+$frame+0]
203
204 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
205 fzeros $alo
206 ld [%o3+4],$ahi_
207 fzeros $ahi
208 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
209 fzeros $nlo
210 ld [%o5+4],$nhi_
211 fzeros $nhi
212
213 ! transfer b[i] to FPU as 4x16-bit values
214 ldda [%o4+2]%asi,$ba
215 fxtod $alo,$alo
216 ldda [%o4+0]%asi,$bb
217 fxtod $ahi,$ahi
218 ldda [%o4+6]%asi,$bc
219 fxtod $nlo,$nlo
220 ldda [%o4+4]%asi,$bd
221 fxtod $nhi,$nhi
222
223 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
224 ldda [%sp+$bias+$frame+6]%asi,$na
225 fxtod $ba,$ba
226 ldda [%sp+$bias+$frame+4]%asi,$nb
227 fxtod $bb,$bb
228 ldda [%sp+$bias+$frame+2]%asi,$nc
229 fxtod $bc,$bc
230 ldda [%sp+$bias+$frame+0]%asi,$nd
231 fxtod $bd,$bd
232
233 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
234 fxtod $na,$na
235 std $ahi,[$ap_h+$j]
236 fxtod $nb,$nb
237 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
238 fxtod $nc,$nc
239 std $nhi,[$np_h+$j]
240 fxtod $nd,$nd
241
242 fmuld $alo,$ba,$aloa
243 fmuld $nlo,$na,$nloa
244 fmuld $alo,$bb,$alob
245 fmuld $nlo,$nb,$nlob
246 fmuld $alo,$bc,$aloc
247 faddd $aloa,$nloa,$nloa
248 fmuld $nlo,$nc,$nloc
249 fmuld $alo,$bd,$alod
250 faddd $alob,$nlob,$nlob
251 fmuld $nlo,$nd,$nlod
252 fmuld $ahi,$ba,$ahia
253 faddd $aloc,$nloc,$nloc
254 fmuld $nhi,$na,$nhia
255 fmuld $ahi,$bb,$ahib
256 faddd $alod,$nlod,$nlod
257 fmuld $nhi,$nb,$nhib
258 fmuld $ahi,$bc,$ahic
259 faddd $ahia,$nhia,$nhia
260 fmuld $nhi,$nc,$nhic
261 fmuld $ahi,$bd,$ahid
262 faddd $ahib,$nhib,$nhib
263 fmuld $nhi,$nd,$nhid
264
265 faddd $ahic,$nhic,$dota ! $nhic
266 faddd $ahid,$nhid,$dotb ! $nhid
267
268 faddd $nloc,$nhia,$nloc
269 faddd $nlod,$nhib,$nlod
270
271 fdtox $nloa,$nloa
272 fdtox $nlob,$nlob
273 fdtox $nloc,$nloc
274 fdtox $nlod,$nlod
275
276 std $nloa,[%sp+$bias+$frame+0]
277 add $j,8,$j
278 std $nlob,[%sp+$bias+$frame+8]
279 add $ap,$j,%o4
280 std $nloc,[%sp+$bias+$frame+16]
281 add $np,$j,%o5
282 std $nlod,[%sp+$bias+$frame+24]
283
284
285 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
286 fzeros $alo
287 ld [%o4+4],$ahi_
288 fzeros $ahi
289 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
290 fzeros $nlo
291 ld [%o5+4],$nhi_
292 fzeros $nhi
293
294 fxtod $alo,$alo
295 fxtod $ahi,$ahi
296 fxtod $nlo,$nlo
297 fxtod $nhi,$nhi
298
299 ldx [%sp+$bias+$frame+0],%o0
300 fmuld $alo,$ba,$aloa
301 ldx [%sp+$bias+$frame+8],%o1
302 fmuld $nlo,$na,$nloa
303 ldx [%sp+$bias+$frame+16],%o2
304 fmuld $alo,$bb,$alob
305 ldx [%sp+$bias+$frame+24],%o3
306 fmuld $nlo,$nb,$nlob
307
308 srlx %o0,16,%o7
309 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
310 fmuld $alo,$bc,$aloc
311 add %o7,%o1,%o1
312 std $ahi,[$ap_h+$j]
313 faddd $aloa,$nloa,$nloa
314 fmuld $nlo,$nc,$nloc
315 srlx %o1,16,%o7
316 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
317 fmuld $alo,$bd,$alod
318 add %o7,%o2,%o2
319 std $nhi,[$np_h+$j]
320 faddd $alob,$nlob,$nlob
321 fmuld $nlo,$nd,$nlod
322 srlx %o2,16,%o7
323 fmuld $ahi,$ba,$ahia
324 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
325 faddd $aloc,$nloc,$nloc
326 fmuld $nhi,$na,$nhia
327 !and %o0,$mask,%o0
328 !and %o1,$mask,%o1
329 !and %o2,$mask,%o2
330 !sllx %o1,16,%o1
331 !sllx %o2,32,%o2
332 !sllx %o3,48,%o7
333 !or %o1,%o0,%o0
334 !or %o2,%o0,%o0
335 !or %o7,%o0,%o0 ! 64-bit result
336 srlx %o3,16,%g1 ! 34-bit carry
337 fmuld $ahi,$bb,$ahib
338
339 faddd $alod,$nlod,$nlod
340 fmuld $nhi,$nb,$nhib
341 fmuld $ahi,$bc,$ahic
342 faddd $ahia,$nhia,$nhia
343 fmuld $nhi,$nc,$nhic
344 fmuld $ahi,$bd,$ahid
345 faddd $ahib,$nhib,$nhib
346 fmuld $nhi,$nd,$nhid
347
348 faddd $dota,$nloa,$nloa
349 faddd $dotb,$nlob,$nlob
350 faddd $ahic,$nhic,$dota ! $nhic
351 faddd $ahid,$nhid,$dotb ! $nhid
352
353 faddd $nloc,$nhia,$nloc
354 faddd $nlod,$nhib,$nlod
355
356 fdtox $nloa,$nloa
357 fdtox $nlob,$nlob
358 fdtox $nloc,$nloc
359 fdtox $nlod,$nlod
360
361 std $nloa,[%sp+$bias+$frame+0]
362 std $nlob,[%sp+$bias+$frame+8]
363 addcc $j,8,$j
364 std $nloc,[%sp+$bias+$frame+16]
365 bz,pn %icc,.L1stskip
366 std $nlod,[%sp+$bias+$frame+24]
367
368
369.align 32 ! incidentally already aligned !
370.L1st:
371 add $ap,$j,%o4
372 add $np,$j,%o5
373 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
374 fzeros $alo
375 ld [%o4+4],$ahi_
376 fzeros $ahi
377 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
378 fzeros $nlo
379 ld [%o5+4],$nhi_
380 fzeros $nhi
381
382 fxtod $alo,$alo
383 fxtod $ahi,$ahi
384 fxtod $nlo,$nlo
385 fxtod $nhi,$nhi
386
387 ldx [%sp+$bias+$frame+0],%o0
388 fmuld $alo,$ba,$aloa
389 ldx [%sp+$bias+$frame+8],%o1
390 fmuld $nlo,$na,$nloa
391 ldx [%sp+$bias+$frame+16],%o2
392 fmuld $alo,$bb,$alob
393 ldx [%sp+$bias+$frame+24],%o3
394 fmuld $nlo,$nb,$nlob
395
396 srlx %o0,16,%o7
397 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
398 fmuld $alo,$bc,$aloc
399 add %o7,%o1,%o1
400 std $ahi,[$ap_h+$j]
401 faddd $aloa,$nloa,$nloa
402 fmuld $nlo,$nc,$nloc
403 srlx %o1,16,%o7
404 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
405 fmuld $alo,$bd,$alod
406 add %o7,%o2,%o2
407 std $nhi,[$np_h+$j]
408 faddd $alob,$nlob,$nlob
409 fmuld $nlo,$nd,$nlod
410 srlx %o2,16,%o7
411 fmuld $ahi,$ba,$ahia
412 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
413 and %o0,$mask,%o0
414 faddd $aloc,$nloc,$nloc
415 fmuld $nhi,$na,$nhia
416 and %o1,$mask,%o1
417 and %o2,$mask,%o2
418 fmuld $ahi,$bb,$ahib
419 sllx %o1,16,%o1
420 faddd $alod,$nlod,$nlod
421 fmuld $nhi,$nb,$nhib
422 sllx %o2,32,%o2
423 fmuld $ahi,$bc,$ahic
424 sllx %o3,48,%o7
425 or %o1,%o0,%o0
426 faddd $ahia,$nhia,$nhia
427 fmuld $nhi,$nc,$nhic
428 or %o2,%o0,%o0
429 fmuld $ahi,$bd,$ahid
430 or %o7,%o0,%o0 ! 64-bit result
431 faddd $ahib,$nhib,$nhib
432 fmuld $nhi,$nd,$nhid
433 addcc %g1,%o0,%o0
434 faddd $dota,$nloa,$nloa
435 srlx %o3,16,%g1 ! 34-bit carry
436 faddd $dotb,$nlob,$nlob
437 bcs,a %xcc,.+8
438 add %g1,1,%g1
439
440 stx %o0,[$tp] ! tp[j-1]=
441
442 faddd $ahic,$nhic,$dota ! $nhic
443 faddd $ahid,$nhid,$dotb ! $nhid
444
445 faddd $nloc,$nhia,$nloc
446 faddd $nlod,$nhib,$nlod
447
448 fdtox $nloa,$nloa
449 fdtox $nlob,$nlob
450 fdtox $nloc,$nloc
451 fdtox $nlod,$nlod
452
453 std $nloa,[%sp+$bias+$frame+0]
454 std $nlob,[%sp+$bias+$frame+8]
455 std $nloc,[%sp+$bias+$frame+16]
456 std $nlod,[%sp+$bias+$frame+24]
457
458 addcc $j,8,$j
459 bnz,pt %icc,.L1st
460 add $tp,8,$tp
461
462
463.L1stskip:
464 fdtox $dota,$dota
465 fdtox $dotb,$dotb
466
467 ldx [%sp+$bias+$frame+0],%o0
468 ldx [%sp+$bias+$frame+8],%o1
469 ldx [%sp+$bias+$frame+16],%o2
470 ldx [%sp+$bias+$frame+24],%o3
471
472 srlx %o0,16,%o7
473 std $dota,[%sp+$bias+$frame+32]
474 add %o7,%o1,%o1
475 std $dotb,[%sp+$bias+$frame+40]
476 srlx %o1,16,%o7
477 add %o7,%o2,%o2
478 srlx %o2,16,%o7
479 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
480 and %o0,$mask,%o0
481 and %o1,$mask,%o1
482 and %o2,$mask,%o2
483 sllx %o1,16,%o1
484 sllx %o2,32,%o2
485 sllx %o3,48,%o7
486 or %o1,%o0,%o0
487 or %o2,%o0,%o0
488 or %o7,%o0,%o0 ! 64-bit result
489 ldx [%sp+$bias+$frame+32],%o4
490 addcc %g1,%o0,%o0
491 ldx [%sp+$bias+$frame+40],%o5
492 srlx %o3,16,%g1 ! 34-bit carry
493 bcs,a %xcc,.+8
494 add %g1,1,%g1
495
496 stx %o0,[$tp] ! tp[j-1]=
497 add $tp,8,$tp
498
499 srlx %o4,16,%o7
500 add %o7,%o5,%o5
501 and %o4,$mask,%o4
502 sllx %o5,16,%o7
503 or %o7,%o4,%o4
504 addcc %g1,%o4,%o4
505 srlx %o5,48,%g1
506 bcs,a %xcc,.+8
507 add %g1,1,%g1
508
509 mov %g1,$carry
510 stx %o4,[$tp] ! tp[num-1]=
511
512
513 ba .Louter
514 add $i,8,$i
515.align 32
516.Louter:
517 sub %g0,$num,$j ! j=-num
518 add %sp,$bias+$frame+$locals,$tp
519
520 add $ap,$j,%o3
521 add $bp,$i,%o4
522
523 ld [%o3+4],%g1 ! bp[i]
524 ld [%o3+0],%o0
525 ld [%o4+4],%g5 ! ap[0]
526 sllx %g1,32,%g1
527 ld [%o4+0],%o1
528 sllx %g5,32,%g5
529 or %g1,%o0,%o0
530 or %g5,%o1,%o1
531
532 ldx [$tp],%o2 ! tp[0]
533 mulx %o1,%o0,%o0
534 addcc %o2,%o0,%o0
535 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
536 stx %o0,[%sp+$bias+$frame+0]
537
538 ! transfer b[i] to FPU as 4x16-bit values
539 ldda [%o4+2]%asi,$ba
540 ldda [%o4+0]%asi,$bb
541 ldda [%o4+6]%asi,$bc
542 ldda [%o4+4]%asi,$bd
543
544 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
545 ldda [%sp+$bias+$frame+6]%asi,$na
546 fxtod $ba,$ba
547 ldda [%sp+$bias+$frame+4]%asi,$nb
548 fxtod $bb,$bb
549 ldda [%sp+$bias+$frame+2]%asi,$nc
550 fxtod $bc,$bc
551 ldda [%sp+$bias+$frame+0]%asi,$nd
552 fxtod $bd,$bd
553 ldd [$ap_l+$j],$alo ! load a[j] in double format
554 fxtod $na,$na
555 ldd [$ap_h+$j],$ahi
556 fxtod $nb,$nb
557 ldd [$np_l+$j],$nlo ! load n[j] in double format
558 fxtod $nc,$nc
559 ldd [$np_h+$j],$nhi
560 fxtod $nd,$nd
561
562 fmuld $alo,$ba,$aloa
563 fmuld $nlo,$na,$nloa
564 fmuld $alo,$bb,$alob
565 fmuld $nlo,$nb,$nlob
566 fmuld $alo,$bc,$aloc
567 faddd $aloa,$nloa,$nloa
568 fmuld $nlo,$nc,$nloc
569 fmuld $alo,$bd,$alod
570 faddd $alob,$nlob,$nlob
571 fmuld $nlo,$nd,$nlod
572 fmuld $ahi,$ba,$ahia
573 faddd $aloc,$nloc,$nloc
574 fmuld $nhi,$na,$nhia
575 fmuld $ahi,$bb,$ahib
576 faddd $alod,$nlod,$nlod
577 fmuld $nhi,$nb,$nhib
578 fmuld $ahi,$bc,$ahic
579 faddd $ahia,$nhia,$nhia
580 fmuld $nhi,$nc,$nhic
581 fmuld $ahi,$bd,$ahid
582 faddd $ahib,$nhib,$nhib
583 fmuld $nhi,$nd,$nhid
584
585 faddd $ahic,$nhic,$dota ! $nhic
586 faddd $ahid,$nhid,$dotb ! $nhid
587
588 faddd $nloc,$nhia,$nloc
589 faddd $nlod,$nhib,$nlod
590
591 fdtox $nloa,$nloa
592 fdtox $nlob,$nlob
593 fdtox $nloc,$nloc
594 fdtox $nlod,$nlod
595
596 std $nloa,[%sp+$bias+$frame+0]
597 std $nlob,[%sp+$bias+$frame+8]
598 std $nloc,[%sp+$bias+$frame+16]
599 add $j,8,$j
600 std $nlod,[%sp+$bias+$frame+24]
601
602
603 ldd [$ap_l+$j],$alo ! load a[j] in double format
604 ldd [$ap_h+$j],$ahi
605 ldd [$np_l+$j],$nlo ! load n[j] in double format
606 ldd [$np_h+$j],$nhi
607
608 fmuld $alo,$ba,$aloa
609 fmuld $nlo,$na,$nloa
610 fmuld $alo,$bb,$alob
611 fmuld $nlo,$nb,$nlob
612 fmuld $alo,$bc,$aloc
613 ldx [%sp+$bias+$frame+0],%o0
614 faddd $aloa,$nloa,$nloa
615 fmuld $nlo,$nc,$nloc
616 ldx [%sp+$bias+$frame+8],%o1
617 fmuld $alo,$bd,$alod
618 ldx [%sp+$bias+$frame+16],%o2
619 faddd $alob,$nlob,$nlob
620 fmuld $nlo,$nd,$nlod
621 ldx [%sp+$bias+$frame+24],%o3
622 fmuld $ahi,$ba,$ahia
623
624 srlx %o0,16,%o7
625 faddd $aloc,$nloc,$nloc
626 fmuld $nhi,$na,$nhia
627 add %o7,%o1,%o1
628 fmuld $ahi,$bb,$ahib
629 srlx %o1,16,%o7
630 faddd $alod,$nlod,$nlod
631 fmuld $nhi,$nb,$nhib
632 add %o7,%o2,%o2
633 fmuld $ahi,$bc,$ahic
634 srlx %o2,16,%o7
635 faddd $ahia,$nhia,$nhia
636 fmuld $nhi,$nc,$nhic
637 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
638 ! why?
639 and %o0,$mask,%o0
640 fmuld $ahi,$bd,$ahid
641 and %o1,$mask,%o1
642 and %o2,$mask,%o2
643 faddd $ahib,$nhib,$nhib
644 fmuld $nhi,$nd,$nhid
645 sllx %o1,16,%o1
646 faddd $dota,$nloa,$nloa
647 sllx %o2,32,%o2
648 faddd $dotb,$nlob,$nlob
649 sllx %o3,48,%o7
650 or %o1,%o0,%o0
651 faddd $ahic,$nhic,$dota ! $nhic
652 or %o2,%o0,%o0
653 faddd $ahid,$nhid,$dotb ! $nhid
654 or %o7,%o0,%o0 ! 64-bit result
655 ldx [$tp],%o7
656 faddd $nloc,$nhia,$nloc
657 addcc %o7,%o0,%o0
658 ! end-of-why?
659 faddd $nlod,$nhib,$nlod
660 srlx %o3,16,%g1 ! 34-bit carry
661 fdtox $nloa,$nloa
662 bcs,a %xcc,.+8
663 add %g1,1,%g1
664
665 fdtox $nlob,$nlob
666 fdtox $nloc,$nloc
667 fdtox $nlod,$nlod
668
669 std $nloa,[%sp+$bias+$frame+0]
670 std $nlob,[%sp+$bias+$frame+8]
671 addcc $j,8,$j
672 std $nloc,[%sp+$bias+$frame+16]
673 bz,pn %icc,.Linnerskip
674 std $nlod,[%sp+$bias+$frame+24]
675
676
677 ba .Linner
678 nop
679.align 32
680.Linner:
681 ldd [$ap_l+$j],$alo ! load a[j] in double format
682 ldd [$ap_h+$j],$ahi
683 ldd [$np_l+$j],$nlo ! load n[j] in double format
684 ldd [$np_h+$j],$nhi
685
686 fmuld $alo,$ba,$aloa
687 fmuld $nlo,$na,$nloa
688 fmuld $alo,$bb,$alob
689 fmuld $nlo,$nb,$nlob
690 fmuld $alo,$bc,$aloc
691 ldx [%sp+$bias+$frame+0],%o0
692 faddd $aloa,$nloa,$nloa
693 fmuld $nlo,$nc,$nloc
694 ldx [%sp+$bias+$frame+8],%o1
695 fmuld $alo,$bd,$alod
696 ldx [%sp+$bias+$frame+16],%o2
697 faddd $alob,$nlob,$nlob
698 fmuld $nlo,$nd,$nlod
699 ldx [%sp+$bias+$frame+24],%o3
700 fmuld $ahi,$ba,$ahia
701
702 srlx %o0,16,%o7
703 faddd $aloc,$nloc,$nloc
704 fmuld $nhi,$na,$nhia
705 add %o7,%o1,%o1
706 fmuld $ahi,$bb,$ahib
707 srlx %o1,16,%o7
708 faddd $alod,$nlod,$nlod
709 fmuld $nhi,$nb,$nhib
710 add %o7,%o2,%o2
711 fmuld $ahi,$bc,$ahic
712 srlx %o2,16,%o7
713 faddd $ahia,$nhia,$nhia
714 fmuld $nhi,$nc,$nhic
715 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
716 and %o0,$mask,%o0
717 fmuld $ahi,$bd,$ahid
718 and %o1,$mask,%o1
719 and %o2,$mask,%o2
720 faddd $ahib,$nhib,$nhib
721 fmuld $nhi,$nd,$nhid
722 sllx %o1,16,%o1
723 faddd $dota,$nloa,$nloa
724 sllx %o2,32,%o2
725 faddd $dotb,$nlob,$nlob
726 sllx %o3,48,%o7
727 or %o1,%o0,%o0
728 faddd $ahic,$nhic,$dota ! $nhic
729 or %o2,%o0,%o0
730 faddd $ahid,$nhid,$dotb ! $nhid
731 or %o7,%o0,%o0 ! 64-bit result
732 faddd $nloc,$nhia,$nloc
733 addcc %g1,%o0,%o0
734 ldx [$tp+8],%o7 ! tp[j]
735 faddd $nlod,$nhib,$nlod
736 srlx %o3,16,%g1 ! 34-bit carry
737 fdtox $nloa,$nloa
738 bcs,a %xcc,.+8
739 add %g1,1,%g1
740 fdtox $nlob,$nlob
741 addcc %o7,%o0,%o0
742 fdtox $nloc,$nloc
743 bcs,a %xcc,.+8
744 add %g1,1,%g1
745
746 stx %o0,[$tp] ! tp[j-1]
747 fdtox $nlod,$nlod
748
749 std $nloa,[%sp+$bias+$frame+0]
750 std $nlob,[%sp+$bias+$frame+8]
751 std $nloc,[%sp+$bias+$frame+16]
752 addcc $j,8,$j
753 std $nlod,[%sp+$bias+$frame+24]
754 bnz,pt %icc,.Linner
755 add $tp,8,$tp
756
757
758.Linnerskip:
759 fdtox $dota,$dota
760 fdtox $dotb,$dotb
761
762 ldx [%sp+$bias+$frame+0],%o0
763 ldx [%sp+$bias+$frame+8],%o1
764 ldx [%sp+$bias+$frame+16],%o2
765 ldx [%sp+$bias+$frame+24],%o3
766
767 srlx %o0,16,%o7
768 std $dota,[%sp+$bias+$frame+32]
769 add %o7,%o1,%o1
770 std $dotb,[%sp+$bias+$frame+40]
771 srlx %o1,16,%o7
772 add %o7,%o2,%o2
773 srlx %o2,16,%o7
774 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
775 and %o0,$mask,%o0
776 and %o1,$mask,%o1
777 and %o2,$mask,%o2
778 sllx %o1,16,%o1
779 sllx %o2,32,%o2
780 sllx %o3,48,%o7
781 or %o1,%o0,%o0
782 or %o2,%o0,%o0
783 ldx [%sp+$bias+$frame+32],%o4
784 or %o7,%o0,%o0 ! 64-bit result
785 ldx [%sp+$bias+$frame+40],%o5
786 addcc %g1,%o0,%o0
787 ldx [$tp+8],%o7 ! tp[j]
788 srlx %o3,16,%g1 ! 34-bit carry
789 bcs,a %xcc,.+8
790 add %g1,1,%g1
791
792 addcc %o7,%o0,%o0
793 bcs,a %xcc,.+8
794 add %g1,1,%g1
795
796 stx %o0,[$tp] ! tp[j-1]
797 add $tp,8,$tp
798
799 srlx %o4,16,%o7
800 add %o7,%o5,%o5
801 and %o4,$mask,%o4
802 sllx %o5,16,%o7
803 or %o7,%o4,%o4
804 addcc %g1,%o4,%o4
805 srlx %o5,48,%g1
806 bcs,a %xcc,.+8
807 add %g1,1,%g1
808
809 addcc $carry,%o4,%o4
810 stx %o4,[$tp] ! tp[num-1]
811 mov %g1,$carry
812 bcs,a %xcc,.+8
813 add $carry,1,$carry
814
815 addcc $i,8,$i
816 bnz %icc,.Louter
817 nop
818
819
820 add $tp,8,$tp ! adjust tp to point at the end
821 orn %g0,%g0,%g4
822 sub %g0,$num,%o7 ! n=-num
823 ba .Lsub
824 subcc %g0,%g0,%g0 ! clear %icc.c
825
826.align 32
827.Lsub:
828 ldx [$tp+%o7],%o0
829 add $np,%o7,%g1
830 ld [%g1+0],%o2
831 ld [%g1+4],%o3
832 srlx %o0,32,%o1
833 subccc %o0,%o2,%o2
834 add $rp,%o7,%g1
835 subccc %o1,%o3,%o3
836 st %o2,[%g1+0]
837 add %o7,8,%o7
838 brnz,pt %o7,.Lsub
839 st %o3,[%g1+4]
840 subc $carry,0,%g4
841 sub %g0,$num,%o7 ! n=-num
842 ba .Lcopy
843 nop
844
845.align 32
846.Lcopy:
847 ldx [$tp+%o7],%o0
848 add $rp,%o7,%g1
849 ld [%g1+0],%o2
850 ld [%g1+4],%o3
851 stx %g0,[$tp+%o7]
852 and %o0,%g4,%o0
853 srlx %o0,32,%o1
854 andn %o2,%g4,%o2
855 andn %o3,%g4,%o3
856 or %o2,%o0,%o0
857 or %o3,%o1,%o1
858 st %o0,[%g1+0]
859 add %o7,8,%o7
860 brnz,pt %o7,.Lcopy
861 st %o1,[%g1+4]
862 sub %g0,$num,%o7 ! n=-num
863
864.Lzap:
865 stx %g0,[$ap_l+%o7]
866 stx %g0,[$ap_h+%o7]
867 stx %g0,[$np_l+%o7]
868 stx %g0,[$np_h+%o7]
869 add %o7,8,%o7
870 brnz,pt %o7,.Lzap
871 nop
872
873 ldx [%sp+$bias+$frame+48],%o7
874 wr %g0,%o7,%asi ! restore %asi
875
876 mov 1,%i0
877.Lret:
878 ret
879 restore
880.type $fname,#function
881.size $fname,(.-$fname)
882.asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
883.align 32
884___
885
# ----------------------------------------------------------------------
# Post-processing of the generated template before it is printed.
# ----------------------------------------------------------------------
# Evaluate any Perl expressions quoted in backticks inside $code.
# (No backticks appear in this particular template; the substitution is
# presumably kept for uniformity with the sibling *-mont.pl modules —
# TODO confirm.)
886$code =~ s/\`([^\`]*)\`/eval($1)/gem;
887
888# Below substitution makes it possible to compile without demanding
889# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
890# dare to do this, because VIS capability is detected at run-time now
891# and this routine is not called on CPU not capable to execute it. Do
892# note that fzeros is not the only VIS dependency! Another dependency
893# is implicit and is just _a_ numerical value loaded to %asi register,
894# which assembler can't recognize as VIS specific...
# Replace each "fzeros %fN" with a raw .word encoding: base opcode
# 0x81b00c20 with the destination register number N placed in the
# rd field (bits 29:25, hence the <<25), so a plain v9 assembler
# accepts the instruction without VIS support.
895$code =~ s/fzeros\s+%f([0-9]+)/
896 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
897 /gem;
898
# Emit the finished assembly to STDOUT (redirected to $output when one
# was supplied on the command line), and fail loudly if the final flush
# to the output file does not succeed.
899print $code;
900# flush
901close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette