VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/bn/asm/sparcv9-mont.pl@ 94082

Last change on this file since 94082 was 94082, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

File size: 13.8 KB
#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaking this effort are multiple. First of all, UltraSPARC is
# not the whole SPARCv9 universe, and other VIS-free implementations
# deserve optimized code just as much. Secondly, the newly introduced
# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
# FPU-intensive code paths, such as sparcv9a-mont, would simply sink it.
# Yes, T1 is equipped with several integrated RSA/DSA accelerator
# circuits accessible through a kernel driver [only(*)], but having a
# decent user-land software implementation is important too. Finally,
# there was a desire to experiment with a dedicated squaring procedure.
# Yes, this module implements one, because it was easiest to draft in
# SPARCv9 instructions...

# (*)	An engine accessing the driver in question is on my TODO list.
#	For reference, the accelerator is estimated to give a 6-10x
#	improvement on single-threaded RSA sign. It should be noted
#	that the 6-10x improvement factor does not actually mean
#	anything extraordinary in terms of absolute [single-threaded]
#	performance, as the SPARCv9 instruction set is by all means the
#	least suitable for high-performance crypto among 64-bit
#	platforms. The 6-10x factor simply places T1 in the same
#	performance domain as, say, AMD64 and IA-64. The improvement in
#	RSA verify doesn't appear impressive at all, but it's the sign
#	operation that is far more critical/interesting.

# You might notice that the inner loops are modulo-scheduled:-) This has
# an essentially negligible impact on UltraSPARC performance; it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that the sparcv9a
# module still has hidden potential [see the TODO list there], which is
# estimated to be larger than 20%...
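#
# Modulo scheduling (software pipelining) simply means that every loop
# iteration also issues the loads/multiplies needed by the *next*
# iteration; the first and last iterations are peeled off into the
# !prologue!/!epilogue! code seen below. A rough Perl rendition of the
# transformation on a toy loop (illustrative only, not part of this
# module; @x and the doubling are arbitrary stand-ins):
#
#	my @x   = (1 .. 8);
#	my @out;
#	my $cur = 2 * $x[0];              # !prologue! - start iteration 0 early
#	for my $j (1 .. $#x) {
#	    my $next = 2 * $x[$j];        # kick off iteration j ...
#	    push @out, $cur;              # ... while finishing iteration j-1
#	    $cur = $next;
#	}
#	push @out, $cur;                  # !epilogue! - drain the pipeline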
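# The generated assembly is written to the file named by the last
# command-line argument (if any); otherwise it goes to standard output.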
$output = pop and open STDOUT,">$output";

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

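# What the routine computes, as a plain-Perl reference model (a minimal
# sketch for illustration only, not part of this module): rp is set to
# ap*bp*R^-1 mod np, where R = 2^(32*num) and the 32-bit word n0 is the
# precomputed Montgomery constant -np^-1 mod 2^32 supplied by the
# caller. ap, bp, np are little-endian arrays of 32-bit words; the
# helper names bn_mul_mont_ref() and words_to_int() are hypothetical.
# The real routine stores the 32-bit result words to rp and returns 1
# (or 0 when num < 4).
#
#	use Math::BigInt;
#
#	sub words_to_int {                   # little-endian 32-bit words -> integer
#	    my ($words) = @_;
#	    my $x = Math::BigInt->bzero();
#	    $x->blsft(32)->badd($_) for reverse @$words;
#	    return $x;
#	}
#
#	sub bn_mul_mont_ref {
#	    my ($ap, $bp, $np, $n0, $num) = @_;
#	    my $W = Math::BigInt->new(2)->bpow(32);          # word base, 2^32
#	    my ($a, $b, $n) = map { words_to_int($_) } ($ap, $bp, $np);
#	    my $t = $a->copy->bmul($b);                      # double-width product
#	    for my $i (0 .. $num - 1) {                      # word-serial reduction
#	        my $ti = $t->copy->brsft(32 * $i)->bmod($W); # current low word
#	        my $m  = $ti->bmul($n0)->bmod($W);           # m = t[i]*n0 mod 2^32
#	        $t->badd($m->bmul($n)->blsft(32 * $i));      # t += m*np << (32*i)
#	    }
#	    $t->brsft(32 * $num);                            # exact division by R
#	    $t->bsub($n) if $t->bcmp($n) >= 0;               # final conditional subtract
#	    return $t;                                       # == ap*bp*R^-1 mod np
#	}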
$frame="STACK_FRAME";
$bias="STACK_BIAS";

$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

$fname="bn_mul_mont_int";

$code=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

.section ".text",#alloc,#execinstr

.global $fname
.align 32
$fname:
	cmp %o5,4 ! 128 bits minimum
	bge,pt %icc,.Lenter
	sethi %hi(0xffffffff),$mask
	retl
	clr %o0
.align 32
.Lenter:
	save %sp,-$frame,%sp
	sll $num,2,$num ! num*=4
	or $mask,%lo(0xffffffff),$mask
	ld [$n0],$n0
	cmp $ap,$bp
	and $num,$mask,$num
	ld [$bp],$mul0 ! bp[0]
	nop

	add %sp,$bias,%o7 ! real top of stack
	ld [$ap],$car0 ! ap[0] ! redundant in squaring context
	sub %o7,$num,%o7
	ld [$ap+4],$apj ! ap[1]
	and %o7,-1024,%o7
	ld [$np],$car1 ! np[0]
	sub %o7,$bias,%sp ! alloca
	ld [$np+4],$npj ! np[1]
	be,pt SIZE_T_CC,.Lbn_sqr_mont
	mov 12,$j

	mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
	mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
	and $car0,$mask,$acc0
	add %sp,$bias+$frame,$tp
	ld [$ap+8],$apj !prologue!

	mulx $n0,$acc0,$mul1 ! "t[0]"*n0
	and $mul1,$mask,$mul1

	mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
	mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
	srlx $car0,32,$car0
	add $acc0,$car1,$car1
	ld [$np+8],$npj !prologue!
	srlx $car1,32,$car1
	mov $tmp0,$acc0 !prologue!

.L1st:
	mulx $apj,$mul0,$tmp0
	mulx $npj,$mul1,$tmp1
	add $acc0,$car0,$car0
	ld [$ap+$j],$apj ! ap[j]
	and $car0,$mask,$acc0
	add $acc1,$car1,$car1
	ld [$np+$j],$npj ! np[j]
	srlx $car0,32,$car0
	add $acc0,$car1,$car1
	add $j,4,$j ! j++
	mov $tmp0,$acc0
	st $car1,[$tp]
	cmp $j,$num
	mov $tmp1,$acc1
	srlx $car1,32,$car1
	bl %icc,.L1st
	add $tp,4,$tp ! tp++
!.L1st

	mulx $apj,$mul0,$tmp0 !epilogue!
	mulx $npj,$mul1,$tmp1
	add $acc0,$car0,$car0
	and $car0,$mask,$acc0
	add $acc1,$car1,$car1
	srlx $car0,32,$car0
	add $acc0,$car1,$car1
	st $car1,[$tp]
	srlx $car1,32,$car1

	add $tmp0,$car0,$car0
	and $car0,$mask,$acc0
	add $tmp1,$car1,$car1
	srlx $car0,32,$car0
	add $acc0,$car1,$car1
	st $car1,[$tp+4]
	srlx $car1,32,$car1

	add $car0,$car1,$car1
	st $car1,[$tp+8]
	srlx $car1,32,$car2


	mov 4,$i ! i++
	ld [$bp+4],$mul0 ! bp[1]
.Louter:
	add %sp,$bias+$frame,$tp
	ld [$ap],$car0 ! ap[0]
	ld [$ap+4],$apj ! ap[1]
	ld [$np],$car1 ! np[0]
	ld [$np+4],$npj ! np[1]
	ld [$tp],$tmp1 ! tp[0]
	ld [$tp+4],$tpj ! tp[1]
	mov 12,$j

	mulx $car0,$mul0,$car0
	mulx $apj,$mul0,$tmp0 !prologue!
	add $tmp1,$car0,$car0
	ld [$ap+8],$apj !prologue!
	and $car0,$mask,$acc0

	mulx $n0,$acc0,$mul1
	and $mul1,$mask,$mul1

	mulx $car1,$mul1,$car1
	mulx $npj,$mul1,$acc1 !prologue!
	srlx $car0,32,$car0
	add $acc0,$car1,$car1
	ld [$np+8],$npj !prologue!
	srlx $car1,32,$car1
	mov $tmp0,$acc0 !prologue!

.Linner:
	mulx $apj,$mul0,$tmp0
	mulx $npj,$mul1,$tmp1
	add $tpj,$car0,$car0
	ld [$ap+$j],$apj ! ap[j]
	add $acc0,$car0,$car0
	add $acc1,$car1,$car1
	ld [$np+$j],$npj ! np[j]
	and $car0,$mask,$acc0
	ld [$tp+8],$tpj ! tp[j]
	srlx $car0,32,$car0
	add $acc0,$car1,$car1
	add $j,4,$j ! j++
	mov $tmp0,$acc0
	st $car1,[$tp] ! tp[j-1]
	srlx $car1,32,$car1
	mov $tmp1,$acc1
	cmp $j,$num
	bl %icc,.Linner
	add $tp,4,$tp ! tp++
!.Linner

	mulx $apj,$mul0,$tmp0 !epilogue!
	mulx $npj,$mul1,$tmp1
	add $tpj,$car0,$car0
	add $acc0,$car0,$car0
	ld [$tp+8],$tpj ! tp[j]
	and $car0,$mask,$acc0
	add $acc1,$car1,$car1
	srlx $car0,32,$car0
	add $acc0,$car1,$car1
	st $car1,[$tp] ! tp[j-1]
	srlx $car1,32,$car1

	add $tpj,$car0,$car0
	add $tmp0,$car0,$car0
	and $car0,$mask,$acc0
	add $tmp1,$car1,$car1
	add $acc0,$car1,$car1
	st $car1,[$tp+4] ! tp[j-1]
	srlx $car0,32,$car0
	add $i,4,$i ! i++
	srlx $car1,32,$car1

	add $car0,$car1,$car1
	cmp $i,$num
	add $car2,$car1,$car1
	st $car1,[$tp+8]

	srlx $car1,32,$car2
	bl,a %icc,.Louter
	ld [$bp+$i],$mul0 ! bp[i]
!.Louter

	add $tp,12,$tp


.Ltail:
	add $np,$num,$np
	add $rp,$num,$rp
	sub %g0,$num,%o7 ! k=-num
	ba .Lsub
	subcc %g0,%g0,%g0 ! clear %icc.c
.align 16
.Lsub:
	ld [$tp+%o7],%o0
	ld [$np+%o7],%o1
	subccc %o0,%o1,%o1 ! tp[j]-np[j]
	add $rp,%o7,$i
	add %o7,4,%o7
	brnz %o7,.Lsub
	st %o1,[$i]
	subccc $car2,0,$car2 ! handle upmost overflow bit
	sub %g0,$num,%o7

.Lcopy:
	ld [$tp+%o7],%o1 ! conditional copy
	ld [$rp+%o7],%o0
	st %g0,[$tp+%o7] ! zap tp
	movcs %icc,%o1,%o0
	st %o0,[$rp+%o7]
	add %o7,4,%o7
	brnz %o7,.Lcopy
	nop
	mov 1,%i0
	ret
	restore
___
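
# The .Ltail/.Lsub/.Lcopy sequence above performs the final Montgomery
# step: subtract np once from tp with borrow propagation, fold in the
# upmost carry bit ($car2), and then keep either the difference or the
# original tp in rp depending on whether the subtraction underflowed
# (tp is zeroed either way). A rough Perl rendition (illustrative only;
# mont_tail() is a hypothetical helper, and the array refs hold the
# 32-bit words of tp, np and rp):
#
#	sub mont_tail {
#	    my ($tp, $np, $rp, $car2, $num) = @_;
#	    my $borrow = 0;
#	    for my $j (0 .. $num - 1) {
#	        my $d = $tp->[$j] - $np->[$j] - $borrow;   # subccc: tp[j]-np[j]-borrow
#	        $borrow = $d < 0 ? 1 : 0;
#	        $rp->[$j] = $d & 0xffffffff;
#	    }
#	    $borrow = $car2 < $borrow ? 1 : 0;             # fold in upmost overflow bit
#	    for my $j (0 .. $num - 1) {
#	        $rp->[$j] = $tp->[$j] if $borrow;          # movcs: keep tp if tp < np
#	        $tp->[$j] = 0;                             # zap tp
#	    }
#	}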


########
######## .Lbn_sqr_mont below gives up to 20% *overall* improvement over
######## code without a dedicated squaring procedure.
########
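#
# The dedicated squaring path exploits the fact that in ap*ap every
# cross product ap[i]*ap[j] with i != j appears twice, so it is computed
# once and doubled (the bit shifted out by the doubling is what $sbit
# carries from word to word). A tiny Math::BigInt self-check of the
# underlying identity for two 32-bit limbs (illustrative only; the limb
# values are arbitrary):
#
#	use Math::BigInt;
#	my $W = Math::BigInt->new(2)->bpow(32);                    # word base
#	my ($a0, $a1) = map { Math::BigInt->new($_) } (0x89abcdef, 0x01234567);
#	my $a   = $a1->copy->bmul($W)->badd($a0);                  # a = a1*W + a0
#	my $lhs = $a->copy->bmul($a);                              # a*a computed directly
#	my $rhs = $a0->copy->bmul($a0)                             # a0^2
#	        ->badd($a0->copy->bmul($a1)->bmul(2)->bmul($W))    # + 2*a0*a1*W
#	        ->badd($a1->copy->bmul($a1)->bmul($W)->bmul($W));  # + a1^2*W^2
#	die "squaring identity broken" unless $lhs->bcmp($rhs) == 0;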
$sbit="%o5";

$code.=<<___;
.align 32
.Lbn_sqr_mont:
	mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
	mulx $apj,$mul0,$tmp0 !prologue!
	and $car0,$mask,$acc0
	add %sp,$bias+$frame,$tp
	ld [$ap+8],$apj !prologue!

	mulx $n0,$acc0,$mul1 ! "t[0]"*n0
	srlx $car0,32,$car0
	and $mul1,$mask,$mul1

	mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
	mulx $npj,$mul1,$acc1 !prologue!
	and $car0,1,$sbit
	ld [$np+8],$npj !prologue!
	srlx $car0,1,$car0
	add $acc0,$car1,$car1
	srlx $car1,32,$car1
	mov $tmp0,$acc0 !prologue!

.Lsqr_1st:
	mulx $apj,$mul0,$tmp0
	mulx $npj,$mul1,$tmp1
	add $acc0,$car0,$car0 ! ap[j]*a0+c0
	add $acc1,$car1,$car1
	ld [$ap+$j],$apj ! ap[j]
	and $car0,$mask,$acc0
	ld [$np+$j],$npj ! np[j]
	srlx $car0,32,$car0
	add $acc0,$acc0,$acc0
	or $sbit,$acc0,$acc0
	mov $tmp1,$acc1
	srlx $acc0,32,$sbit
	add $j,4,$j ! j++
	and $acc0,$mask,$acc0
	cmp $j,$num
	add $acc0,$car1,$car1
	st $car1,[$tp]
	mov $tmp0,$acc0
	srlx $car1,32,$car1
	bl %icc,.Lsqr_1st
	add $tp,4,$tp ! tp++
!.Lsqr_1st

	mulx $apj,$mul0,$tmp0 ! epilogue
	mulx $npj,$mul1,$tmp1
	add $acc0,$car0,$car0 ! ap[j]*a0+c0
	add $acc1,$car1,$car1
	and $car0,$mask,$acc0
	srlx $car0,32,$car0
	add $acc0,$acc0,$acc0
	or $sbit,$acc0,$acc0
	srlx $acc0,32,$sbit
	and $acc0,$mask,$acc0
	add $acc0,$car1,$car1
	st $car1,[$tp]
	srlx $car1,32,$car1

	add $tmp0,$car0,$car0 ! ap[j]*a0+c0
	add $tmp1,$car1,$car1
	and $car0,$mask,$acc0
	srlx $car0,32,$car0
	add $acc0,$acc0,$acc0
	or $sbit,$acc0,$acc0
	srlx $acc0,32,$sbit
	and $acc0,$mask,$acc0
	add $acc0,$car1,$car1
	st $car1,[$tp+4]
	srlx $car1,32,$car1

	add $car0,$car0,$car0
	or $sbit,$car0,$car0
	add $car0,$car1,$car1
	st $car1,[$tp+8]
	srlx $car1,32,$car2


	ld [%sp+$bias+$frame],$tmp0 ! tp[0]
	ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
	ld [%sp+$bias+$frame+8],$tpj ! tp[2]
	ld [$ap+4],$mul0 ! ap[1]
	ld [$ap+8],$apj ! ap[2]
	ld [$np],$car1 ! np[0]
	ld [$np+4],$npj ! np[1]
	mulx $n0,$tmp0,$mul1

	mulx $mul0,$mul0,$car0
	and $mul1,$mask,$mul1

	mulx $car1,$mul1,$car1
	mulx $npj,$mul1,$acc1
	add $tmp0,$car1,$car1
	and $car0,$mask,$acc0
	ld [$np+8],$npj ! np[2]
	srlx $car1,32,$car1
	add $tmp1,$car1,$car1
	srlx $car0,32,$car0
	add $acc0,$car1,$car1
	and $car0,1,$sbit
	add $acc1,$car1,$car1
	srlx $car0,1,$car0
	mov 12,$j
	st $car1,[%sp+$bias+$frame] ! tp[0]=
	srlx $car1,32,$car1
	add %sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx $apj,$mul0,$acc0
	mulx $npj,$mul1,$acc1
	add $acc0,$car0,$car0
	add $tpj,$sbit,$sbit
	ld [$ap+$j],$apj ! ap[j]
	and $car0,$mask,$acc0
	ld [$np+$j],$npj ! np[j]
	srlx $car0,32,$car0
	add $acc1,$car1,$car1
	ld [$tp+8],$tpj ! tp[j]
	add $acc0,$acc0,$acc0
	add $j,4,$j ! j++
	add $sbit,$acc0,$acc0
	srlx $acc0,32,$sbit
	and $acc0,$mask,$acc0
	cmp $j,$num
	add $acc0,$car1,$car1
	st $car1,[$tp] ! tp[j-1]
	srlx $car1,32,$car1
	bl %icc,.Lsqr_2nd
	add $tp,4,$tp ! tp++
!.Lsqr_2nd

	mulx $apj,$mul0,$acc0
	mulx $npj,$mul1,$acc1
	add $acc0,$car0,$car0
	add $tpj,$sbit,$sbit
	and $car0,$mask,$acc0
	srlx $car0,32,$car0
	add $acc1,$car1,$car1
	add $acc0,$acc0,$acc0
	add $sbit,$acc0,$acc0
	srlx $acc0,32,$sbit
	and $acc0,$mask,$acc0
	add $acc0,$car1,$car1
	st $car1,[$tp] ! tp[j-1]
	srlx $car1,32,$car1

	add $car0,$car0,$car0
	add $sbit,$car0,$car0
	add $car0,$car1,$car1
	add $car2,$car1,$car1
	st $car1,[$tp+4]
	srlx $car1,32,$car2


	ld [%sp+$bias+$frame],$tmp1 ! tp[0]
	ld [%sp+$bias+$frame+4],$tpj ! tp[1]
	ld [$ap+8],$mul0 ! ap[2]
	ld [$np],$car1 ! np[0]
	ld [$np+4],$npj ! np[1]
	mulx $n0,$tmp1,$mul1
	and $mul1,$mask,$mul1
	mov 8,$i

	mulx $mul0,$mul0,$car0
	mulx $car1,$mul1,$car1
	and $car0,$mask,$acc0
	add $tmp1,$car1,$car1
	srlx $car0,32,$car0
	add %sp,$bias+$frame,$tp
	srlx $car1,32,$car1
	and $car0,1,$sbit
	srlx $car0,1,$car0
	mov 4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx $npj,$mul1,$acc1
	add $tpj,$car1,$car1
	add $j,4,$j
	ld [$tp+8],$tpj
	cmp $j,$i
	add $acc1,$car1,$car1
	ld [$np+$j],$npj
	st $car1,[$tp]
	srlx $car1,32,$car1
	bl %icc,.Lsqr_inner1
	add $tp,4,$tp
!.Lsqr_inner1

	add $j,4,$j
	ld [$ap+$j],$apj ! ap[j]
	mulx $npj,$mul1,$acc1
	add $tpj,$car1,$car1
	ld [$np+$j],$npj ! np[j]
	srlx $car1,32,$tmp0
	and $car1,$mask,$car1
	add $tmp0,$sbit,$sbit
	add $acc0,$car1,$car1
	ld [$tp+8],$tpj ! tp[j]
	add $acc1,$car1,$car1
	st $car1,[$tp]
	srlx $car1,32,$car1

	add $j,4,$j
	cmp $j,$num
	be,pn %icc,.Lsqr_no_inner2
	add $tp,4,$tp

.Lsqr_inner2:
	mulx $apj,$mul0,$acc0
	mulx $npj,$mul1,$acc1
	add $tpj,$sbit,$sbit
	add $acc0,$car0,$car0
	ld [$ap+$j],$apj ! ap[j]
	and $car0,$mask,$acc0
	ld [$np+$j],$npj ! np[j]
	srlx $car0,32,$car0
	add $acc0,$acc0,$acc0
	ld [$tp+8],$tpj ! tp[j]
	add $sbit,$acc0,$acc0
	add $j,4,$j ! j++
	srlx $acc0,32,$sbit
	and $acc0,$mask,$acc0
	cmp $j,$num
	add $acc0,$car1,$car1
	add $acc1,$car1,$car1
	st $car1,[$tp] ! tp[j-1]
	srlx $car1,32,$car1
	bl %icc,.Lsqr_inner2
	add $tp,4,$tp ! tp++

.Lsqr_no_inner2:
	mulx $apj,$mul0,$acc0
	mulx $npj,$mul1,$acc1
	add $tpj,$sbit,$sbit
	add $acc0,$car0,$car0
	and $car0,$mask,$acc0
	srlx $car0,32,$car0
	add $acc0,$acc0,$acc0
	add $sbit,$acc0,$acc0
	srlx $acc0,32,$sbit
	and $acc0,$mask,$acc0
	add $acc0,$car1,$car1
	add $acc1,$car1,$car1
	st $car1,[$tp] ! tp[j-1]
	srlx $car1,32,$car1

	add $car0,$car0,$car0
	add $sbit,$car0,$car0
	add $car0,$car1,$car1
	add $car2,$car1,$car1
	st $car1,[$tp+4]
	srlx $car1,32,$car2


	add $i,4,$i ! i++
	ld [%sp+$bias+$frame],$tmp1 ! tp[0]
	ld [%sp+$bias+$frame+4],$tpj ! tp[1]
	ld [$ap+$i],$mul0 ! ap[j]
	ld [$np],$car1 ! np[0]
	ld [$np+4],$npj ! np[1]
	mulx $n0,$tmp1,$mul1
	and $mul1,$mask,$mul1
	add $i,4,$tmp0

	mulx $mul0,$mul0,$car0
	mulx $car1,$mul1,$car1
	and $car0,$mask,$acc0
	add $tmp1,$car1,$car1
	srlx $car0,32,$car0
	add %sp,$bias+$frame,$tp
	srlx $car1,32,$car1
	and $car0,1,$sbit
	srlx $car0,1,$car0

	cmp $tmp0,$num ! i<num-1
	bl %icc,.Lsqr_outer
	mov 4,$j


.Lsqr_last:
	mulx $npj,$mul1,$acc1
	add $tpj,$car1,$car1
	add $j,4,$j
	ld [$tp+8],$tpj
	cmp $j,$i
	add $acc1,$car1,$car1
	ld [$np+$j],$npj
	st $car1,[$tp]
	srlx $car1,32,$car1
	bl %icc,.Lsqr_last
	add $tp,4,$tp
!.Lsqr_last

	mulx $npj,$mul1,$acc1
	add $tpj,$acc0,$acc0
	srlx $acc0,32,$tmp0
	and $acc0,$mask,$acc0
	add $tmp0,$sbit,$sbit
	add $acc0,$car1,$car1
	add $acc1,$car1,$car1
	st $car1,[$tp]
	srlx $car1,32,$car1

	add $car0,$car0,$car0 ! recover $car0
	add $sbit,$car0,$car0
	add $car0,$car1,$car1
	add $car2,$car1,$car1
	st $car1,[$tp+4]
	srlx $car1,32,$car2

	ba .Ltail
	add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
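
# The substitution below is the usual perlasm idiom: any `...` fragment
# left in $code is evaluated as Perl before the result is printed (this
# particular module happens to contain none, so it is effectively a
# no-op here).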
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";