VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1f/crypto/bn/asm/sparcv9-mont.pl@ 83531

Last change on this file since 83531 was 83531, checked in by vboxsync, 5 years ago

setting svn:sync-process=export for openssl-1.1.1f, all files except tests

File size: 13.8 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# December 2005
18#
19# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
20# for undertaking this effort are multiple. First of all, UltraSPARC is not
21# the whole SPARCv9 universe and other VIS-free implementations deserve
22# optimized code as much. Secondly, newly introduced UltraSPARC T1,
23# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
24# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
25# several integrated RSA/DSA accelerator circuits accessible through
26# kernel driver [only(*)], but having decent user-land software
27# implementation is important too. Finally, there was the desire to
28# experiment with a dedicated squaring procedure. Yes, this module
29# implements one, because it was easiest to draft it in SPARCv9
30# instructions...
31
32# (*) Engine accessing the driver in question is on my TODO list.
33# For reference, accelerator is estimated to give 6 to 10 times
34# improvement on single-threaded RSA sign. It should be noted
35# that 6-10x improvement coefficient does not actually mean
36# something extraordinary in terms of absolute [single-threaded]
37# performance, as SPARCv9 instruction set is by all means least
38# suitable for high performance crypto among other 64 bit
39# platforms. 6-10x factor simply places T1 in same performance
40# domain as, say, AMD64 and IA-64. Improvement of RSA verify doesn't
41# appear impressive at all, but it's the sign operation which is
42# far more critical/interesting.
43
44# You might notice that inner loops are modulo-scheduled:-) This has
45# essentially negligible impact on UltraSPARC performance, it's
46# Fujitsu SPARC64 V users who should notice and hopefully appreciate
47# the advantage... Currently this module surpasses sparcv9a-mont.pl
48# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
49# module still has hidden potential [see TODO list there], which is
50# estimated to be larger than 20%...
51
# The output file name is the last command-line argument. STDOUT is only
# redirected when an argument was actually supplied; otherwise the generated
# assembly goes to the inherited STDOUT. The three-argument form of open
# keeps characters in the file name from being read as a mode, and a failed
# open is reported immediately rather than being discovered (or missed)
# later, in line with the checked close at the end of the script.
52$output = pop;
53if (defined $output) { open STDOUT,">",$output or die "can't open $output: $!"; }
54
# Register and symbol names interpolated into the generated assembly below.
#
# The six arguments are named by their "in" registers because the main body
# runs after a `save`; the very first instructions of the generated routine
# still see num in %o5.
55# int bn_mul_mont(
56$rp="%i0"; # BN_ULONG *rp,
57$ap="%i1"; # const BN_ULONG *ap,
58$bp="%i2"; # const BN_ULONG *bp,
59$np="%i3"; # const BN_ULONG *np,
60$n0="%i4"; # const BN_ULONG *n0,
61$num="%i5"; # int num);
# Note: the generated code overwrites $n0 with the 32-bit word it points to
# and turns $num into a byte count (num*4) right after the `save`.
62
# Emitted verbatim as symbols; presumably macros supplied by sparc_arch.h
# (not visible in this file - verify there).
63$frame="STACK_FRAME";
64$bias="STACK_BIAS";
65
66$car0="%o0"; # running carry of the ap[]*multiplier column
67$car1="%o1"; # running carry of the np[]*mul1 column plus the column sum
68$car2="%o2"; # 1 bit
69$acc0="%o3"; # product / low word being folded into $car0
70$acc1="%o4"; # product being folded into $car1
71$mask="%g1"; # 32 bits, what a waste...
72$tmp0="%g4"; # scratch; holds the next ap product in the pipelined loops
73$tmp1="%g5"; # scratch; holds the next np product in the pipelined loops
74
75$i="%l0"; # outer-loop byte offset (multiplier word index * 4)
76$j="%l1"; # inner-loop byte offset into ap[]/np[]
77$mul0="%l2"; # current multiplier word: bp[i] (ap[i] when squaring)
78$mul1="%l3"; # reduction multiplier: (t[0]*n0) truncated to 32 bits
79$tp="%l4"; # pointer walking the temporary vector on the stack
80$apj="%l5"; # prefetched ap[j]
81$npj="%l6"; # prefetched np[j]
82$tpj="%l7"; # prefetched tp[j]
83
# Name of the generated global function.
84$fname="bn_mul_mont_int";
85
# Generated code, part 1: entry, stack setup, the general (ap != bp)
# Montgomery multiplication and the reduction tail that the squaring path
# also jumps to. All limbs are 32-bit words held in 64-bit registers, so a
# mulx product plus carries never overflows and the high half of a register
# (srlx by 32) is the carry into the next column.
#
# Entry / .Lenter
#   - num is tested while still in %o5 (no register window yet); with fewer
#     than 4 words the routine returns 0 via retl/clr. The sethi in the
#     branch delay slot starts building $mask on either path.
#   - After `save`: $num becomes a byte count, $mask becomes 0xffffffff and
#     the n0 register is reloaded with the word it pointed to.
#   - The temporary vector lives on the stack:
#       %sp = ((%sp + $bias - $num) & -1024) - $bias
#     and tp starts at %sp + $bias + $frame.
#   - `cmp $ap,$bp` sets the flags consumed by the later `be` to
#     .Lbn_sqr_mont; every instruction in between is a non-cc form.
#     `mov 12,$j` is in that branch's delay slot, so both paths begin with
#     $j = 12 (byte offset of word 3; words 0..2 are preloaded).
#
# .L1st (first pass, multiplier bp[0])
#   - $mul1 = ((ap[0]*bp[0]) & mask) * n0, truncated to 32 bits.
#   - The loop is software pipelined: each iteration issues the two mulx
#     for the next word into $tmp0/$tmp1 while finishing the additions for
#     the current one, hence the "!prologue!" lines before the label and
#     the "!epilogue!" block after it.
#   - Each iteration stores the low word of $car1 to tp; the epilogue
#     stores the last three words and leaves the overflow bit in $car2.
#
# .Louter / .Linner (multipliers bp[1] .. bp[num-1])
#   - $i is a byte offset into bp, starting at 4. Each pass restarts tp at
#     the base of the temporary vector and repeats the .L1st pattern with
#     the previous tp[j] ($tpj) added into the ap column.
#   - $car2 from the previous pass is added into the top word and then
#     regenerated.
#   - `bl,a` annuls the delay-slot load of bp[i] when the loop exits.
#   - The final `add $tp,12,$tp` leaves tp just past the last word of the
#     temporary vector, which is what .Ltail indexes from.
#
# .Ltail (.Lsub / .Lcopy)
#   - $np and $rp are advanced by $num so that one negative byte index in
#     %o7 (from -num up to 0) addresses tp, np and rp alike.
#   - .Lsub writes rp = tp - np with the borrow carried in the carry flag
#     (cleared by the subcc in the `ba` delay slot; add/brnz/st leave it
#     alone). `subccc $car2,0,$car2` then folds the final borrow into the
#     overflow bit.
#   - .Lcopy uses movcs instead of a branch: with carry set the tp word is
#     written to rp, otherwise the difference already in rp is kept. Every
#     tp word is overwritten with zero as it is read.
#   - Returns 1 in %i0 (the caller's %o0 after `restore`).
86$code=<<___;
87#include "sparc_arch.h"
88
89.section ".text",#alloc,#execinstr
90
91.global $fname
92.align 32
93$fname:
94 cmp %o5,4 ! 128 bits minimum
95 bge,pt %icc,.Lenter
96 sethi %hi(0xffffffff),$mask
97 retl
98 clr %o0
99.align 32
100.Lenter:
101 save %sp,-$frame,%sp
102 sll $num,2,$num ! num*=4
103 or $mask,%lo(0xffffffff),$mask
104 ld [$n0],$n0
105 cmp $ap,$bp
106 and $num,$mask,$num
107 ld [$bp],$mul0 ! bp[0]
108 nop
109
110 add %sp,$bias,%o7 ! real top of stack
111 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
112 sub %o7,$num,%o7
113 ld [$ap+4],$apj ! ap[1]
114 and %o7,-1024,%o7
115 ld [$np],$car1 ! np[0]
116 sub %o7,$bias,%sp ! alloca
117 ld [$np+4],$npj ! np[1]
118 be,pt SIZE_T_CC,.Lbn_sqr_mont
119 mov 12,$j
120
121 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
122 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
123 and $car0,$mask,$acc0
124 add %sp,$bias+$frame,$tp
125 ld [$ap+8],$apj !prologue!
126
127 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
128 and $mul1,$mask,$mul1
129
130 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
131 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
132 srlx $car0,32,$car0
133 add $acc0,$car1,$car1
134 ld [$np+8],$npj !prologue!
135 srlx $car1,32,$car1
136 mov $tmp0,$acc0 !prologue!
137
138.L1st:
139 mulx $apj,$mul0,$tmp0
140 mulx $npj,$mul1,$tmp1
141 add $acc0,$car0,$car0
142 ld [$ap+$j],$apj ! ap[j]
143 and $car0,$mask,$acc0
144 add $acc1,$car1,$car1
145 ld [$np+$j],$npj ! np[j]
146 srlx $car0,32,$car0
147 add $acc0,$car1,$car1
148 add $j,4,$j ! j++
149 mov $tmp0,$acc0
150 st $car1,[$tp]
151 cmp $j,$num
152 mov $tmp1,$acc1
153 srlx $car1,32,$car1
154 bl %icc,.L1st
155 add $tp,4,$tp ! tp++
156!.L1st
157
158 mulx $apj,$mul0,$tmp0 !epilogue!
159 mulx $npj,$mul1,$tmp1
160 add $acc0,$car0,$car0
161 and $car0,$mask,$acc0
162 add $acc1,$car1,$car1
163 srlx $car0,32,$car0
164 add $acc0,$car1,$car1
165 st $car1,[$tp]
166 srlx $car1,32,$car1
167
168 add $tmp0,$car0,$car0
169 and $car0,$mask,$acc0
170 add $tmp1,$car1,$car1
171 srlx $car0,32,$car0
172 add $acc0,$car1,$car1
173 st $car1,[$tp+4]
174 srlx $car1,32,$car1
175
176 add $car0,$car1,$car1
177 st $car1,[$tp+8]
178 srlx $car1,32,$car2
179
180
181 mov 4,$i ! i++
182 ld [$bp+4],$mul0 ! bp[1]
183.Louter:
184 add %sp,$bias+$frame,$tp
185 ld [$ap],$car0 ! ap[0]
186 ld [$ap+4],$apj ! ap[1]
187 ld [$np],$car1 ! np[0]
188 ld [$np+4],$npj ! np[1]
189 ld [$tp],$tmp1 ! tp[0]
190 ld [$tp+4],$tpj ! tp[1]
191 mov 12,$j
192
193 mulx $car0,$mul0,$car0
194 mulx $apj,$mul0,$tmp0 !prologue!
195 add $tmp1,$car0,$car0
196 ld [$ap+8],$apj !prologue!
197 and $car0,$mask,$acc0
198
199 mulx $n0,$acc0,$mul1
200 and $mul1,$mask,$mul1
201
202 mulx $car1,$mul1,$car1
203 mulx $npj,$mul1,$acc1 !prologue!
204 srlx $car0,32,$car0
205 add $acc0,$car1,$car1
206 ld [$np+8],$npj !prologue!
207 srlx $car1,32,$car1
208 mov $tmp0,$acc0 !prologue!
209
210.Linner:
211 mulx $apj,$mul0,$tmp0
212 mulx $npj,$mul1,$tmp1
213 add $tpj,$car0,$car0
214 ld [$ap+$j],$apj ! ap[j]
215 add $acc0,$car0,$car0
216 add $acc1,$car1,$car1
217 ld [$np+$j],$npj ! np[j]
218 and $car0,$mask,$acc0
219 ld [$tp+8],$tpj ! tp[j]
220 srlx $car0,32,$car0
221 add $acc0,$car1,$car1
222 add $j,4,$j ! j++
223 mov $tmp0,$acc0
224 st $car1,[$tp] ! tp[j-1]
225 srlx $car1,32,$car1
226 mov $tmp1,$acc1
227 cmp $j,$num
228 bl %icc,.Linner
229 add $tp,4,$tp ! tp++
230!.Linner
231
232 mulx $apj,$mul0,$tmp0 !epilogue!
233 mulx $npj,$mul1,$tmp1
234 add $tpj,$car0,$car0
235 add $acc0,$car0,$car0
236 ld [$tp+8],$tpj ! tp[j]
237 and $car0,$mask,$acc0
238 add $acc1,$car1,$car1
239 srlx $car0,32,$car0
240 add $acc0,$car1,$car1
241 st $car1,[$tp] ! tp[j-1]
242 srlx $car1,32,$car1
243
244 add $tpj,$car0,$car0
245 add $tmp0,$car0,$car0
246 and $car0,$mask,$acc0
247 add $tmp1,$car1,$car1
248 add $acc0,$car1,$car1
249 st $car1,[$tp+4] ! tp[j-1]
250 srlx $car0,32,$car0
251 add $i,4,$i ! i++
252 srlx $car1,32,$car1
253
254 add $car0,$car1,$car1
255 cmp $i,$num
256 add $car2,$car1,$car1
257 st $car1,[$tp+8]
258
259 srlx $car1,32,$car2
260 bl,a %icc,.Louter
261 ld [$bp+$i],$mul0 ! bp[i]
262!.Louter
263
264 add $tp,12,$tp
265
266
267.Ltail:
268 add $np,$num,$np
269 add $rp,$num,$rp
270 sub %g0,$num,%o7 ! k=-num
271 ba .Lsub
272 subcc %g0,%g0,%g0 ! clear %icc.c
273.align 16
274.Lsub:
275 ld [$tp+%o7],%o0
276 ld [$np+%o7],%o1
277 subccc %o0,%o1,%o1 ! tp[j]-np[j]
278 add $rp,%o7,$i
279 add %o7,4,%o7
280 brnz %o7,.Lsub
281 st %o1,[$i]
282 subccc $car2,0,$car2 ! handle upmost overflow bit
283 sub %g0,$num,%o7
284
285.Lcopy:
286 ld [$tp+%o7],%o1 ! conditional copy
287 ld [$rp+%o7],%o0
288 st %g0,[$tp+%o7] ! zap tp
289 movcs %icc,%o1,%o0
290 st %o0,[$rp+%o7]
291 add %o7,4,%o7
292 brnz %o7,.Lcopy
293 nop
294 mov 1,%i0
295 ret
296 restore
297___
298
299
300########
301######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
302######## code without following dedicated squaring procedure.
303########
# Generated code, part 2: the squaring path, entered from .Lenter when ap
# and bp are the same pointer. It uses the same register roles and stack
# temporary as part 1 and finishes by branching to .Ltail.
#
# Doubling scheme
#   A square needs each cross product ap[j]*ap[i] twice but the square
#   term ap[i]*ap[i] only once. The code therefore keeps the ap-column
#   carry $car0 halved: after the square term its high half is split into
#   its lowest bit ($sbit) and the rest shifted right by one. Each 32-bit
#   column word is then doubled on the way out (add $acc0,$acc0,$acc0) and
#   combined with $sbit; whatever spills past bit 31 becomes the next
#   $sbit. At the top word $car0 itself is doubled back before being
#   merged with $car1.
#
# .Lbn_sqr_mont / .Lsqr_1st (multiplier ap[0])
#   $mul0 was loaded from bp[0], which is ap[0] on this path. Pipelined
#   like .L1st, with prologue before and epilogue after the loop; ends
#   with the overflow bit in $car2.
#
# .Lsqr_2nd (multiplier ap[1])
#   tp[0] is handled in straight-line code: the reduction multiplier
#   $mul1 is derived from the old tp[0], and the first stored word takes
#   old tp[1], the low word of ap[1]*ap[1] and np[1]*mul1. The loop then
#   adds doubled ap[j]*ap[1], np[j]*mul1 and the old tp[j]; inside the
#   loop $tpj is accumulated into $sbit before the doubled word is added.
#
# .Lsqr_outer (multiplier ap[i], $i a byte offset starting at 8)
#   .Lsqr_inner1  words below i: only np[j]*mul1 and the old tp[j] are
#                 added - no ap products appear in this loop.
#   (fallthrough) the word where the low half of ap[i]*ap[i] ($acc0)
#                 enters the sum.
#   .Lsqr_inner2  remaining words, with doubled ap[j]*ap[i] added as well;
#                 bypassed through .Lsqr_no_inner2 when $j has already
#                 reached $num.
#   The code after the top-word store sets up the next multiplier. Note
#   that the load annotated "! ap[j]" there is indexed by $i, i.e. it
#   fetches ap[i]. The closing `bl` repeats while i+4 is below num.
#
# .Lsqr_last (final multiplier)
#   Reduction-only loop over the lower words, then the square term is
#   folded in and the top word written. `ba .Ltail` has `add $tp,8,$tp`
#   in its delay slot so that tp points just past the temporary vector,
#   matching what the multiplication path hands to .Ltail.
304$sbit="%o5"; # carry bit passed between doubled words (see header above)
305
306$code.=<<___;
307.align 32
308.Lbn_sqr_mont:
309 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
310 mulx $apj,$mul0,$tmp0 !prologue!
311 and $car0,$mask,$acc0
312 add %sp,$bias+$frame,$tp
313 ld [$ap+8],$apj !prologue!
314
315 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
316 srlx $car0,32,$car0
317 and $mul1,$mask,$mul1
318
319 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
320 mulx $npj,$mul1,$acc1 !prologue!
321 and $car0,1,$sbit
322 ld [$np+8],$npj !prologue!
323 srlx $car0,1,$car0
324 add $acc0,$car1,$car1
325 srlx $car1,32,$car1
326 mov $tmp0,$acc0 !prologue!
327
328.Lsqr_1st:
329 mulx $apj,$mul0,$tmp0
330 mulx $npj,$mul1,$tmp1
331 add $acc0,$car0,$car0 ! ap[j]*a0+c0
332 add $acc1,$car1,$car1
333 ld [$ap+$j],$apj ! ap[j]
334 and $car0,$mask,$acc0
335 ld [$np+$j],$npj ! np[j]
336 srlx $car0,32,$car0
337 add $acc0,$acc0,$acc0
338 or $sbit,$acc0,$acc0
339 mov $tmp1,$acc1
340 srlx $acc0,32,$sbit
341 add $j,4,$j ! j++
342 and $acc0,$mask,$acc0
343 cmp $j,$num
344 add $acc0,$car1,$car1
345 st $car1,[$tp]
346 mov $tmp0,$acc0
347 srlx $car1,32,$car1
348 bl %icc,.Lsqr_1st
349 add $tp,4,$tp ! tp++
350!.Lsqr_1st
351
352 mulx $apj,$mul0,$tmp0 ! epilogue
353 mulx $npj,$mul1,$tmp1
354 add $acc0,$car0,$car0 ! ap[j]*a0+c0
355 add $acc1,$car1,$car1
356 and $car0,$mask,$acc0
357 srlx $car0,32,$car0
358 add $acc0,$acc0,$acc0
359 or $sbit,$acc0,$acc0
360 srlx $acc0,32,$sbit
361 and $acc0,$mask,$acc0
362 add $acc0,$car1,$car1
363 st $car1,[$tp]
364 srlx $car1,32,$car1
365
366 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
367 add $tmp1,$car1,$car1
368 and $car0,$mask,$acc0
369 srlx $car0,32,$car0
370 add $acc0,$acc0,$acc0
371 or $sbit,$acc0,$acc0
372 srlx $acc0,32,$sbit
373 and $acc0,$mask,$acc0
374 add $acc0,$car1,$car1
375 st $car1,[$tp+4]
376 srlx $car1,32,$car1
377
378 add $car0,$car0,$car0
379 or $sbit,$car0,$car0
380 add $car0,$car1,$car1
381 st $car1,[$tp+8]
382 srlx $car1,32,$car2
383
384
385 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
386 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
387 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
388 ld [$ap+4],$mul0 ! ap[1]
389 ld [$ap+8],$apj ! ap[2]
390 ld [$np],$car1 ! np[0]
391 ld [$np+4],$npj ! np[1]
392 mulx $n0,$tmp0,$mul1
393
394 mulx $mul0,$mul0,$car0
395 and $mul1,$mask,$mul1
396
397 mulx $car1,$mul1,$car1
398 mulx $npj,$mul1,$acc1
399 add $tmp0,$car1,$car1
400 and $car0,$mask,$acc0
401 ld [$np+8],$npj ! np[2]
402 srlx $car1,32,$car1
403 add $tmp1,$car1,$car1
404 srlx $car0,32,$car0
405 add $acc0,$car1,$car1
406 and $car0,1,$sbit
407 add $acc1,$car1,$car1
408 srlx $car0,1,$car0
409 mov 12,$j
410 st $car1,[%sp+$bias+$frame] ! tp[0]=
411 srlx $car1,32,$car1
412 add %sp,$bias+$frame+4,$tp
413
414.Lsqr_2nd:
415 mulx $apj,$mul0,$acc0
416 mulx $npj,$mul1,$acc1
417 add $acc0,$car0,$car0
418 add $tpj,$sbit,$sbit
419 ld [$ap+$j],$apj ! ap[j]
420 and $car0,$mask,$acc0
421 ld [$np+$j],$npj ! np[j]
422 srlx $car0,32,$car0
423 add $acc1,$car1,$car1
424 ld [$tp+8],$tpj ! tp[j]
425 add $acc0,$acc0,$acc0
426 add $j,4,$j ! j++
427 add $sbit,$acc0,$acc0
428 srlx $acc0,32,$sbit
429 and $acc0,$mask,$acc0
430 cmp $j,$num
431 add $acc0,$car1,$car1
432 st $car1,[$tp] ! tp[j-1]
433 srlx $car1,32,$car1
434 bl %icc,.Lsqr_2nd
435 add $tp,4,$tp ! tp++
436!.Lsqr_2nd
437
438 mulx $apj,$mul0,$acc0
439 mulx $npj,$mul1,$acc1
440 add $acc0,$car0,$car0
441 add $tpj,$sbit,$sbit
442 and $car0,$mask,$acc0
443 srlx $car0,32,$car0
444 add $acc1,$car1,$car1
445 add $acc0,$acc0,$acc0
446 add $sbit,$acc0,$acc0
447 srlx $acc0,32,$sbit
448 and $acc0,$mask,$acc0
449 add $acc0,$car1,$car1
450 st $car1,[$tp] ! tp[j-1]
451 srlx $car1,32,$car1
452
453 add $car0,$car0,$car0
454 add $sbit,$car0,$car0
455 add $car0,$car1,$car1
456 add $car2,$car1,$car1
457 st $car1,[$tp+4]
458 srlx $car1,32,$car2
459
460
461 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
462 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
463 ld [$ap+8],$mul0 ! ap[2]
464 ld [$np],$car1 ! np[0]
465 ld [$np+4],$npj ! np[1]
466 mulx $n0,$tmp1,$mul1
467 and $mul1,$mask,$mul1
468 mov 8,$i
469
470 mulx $mul0,$mul0,$car0
471 mulx $car1,$mul1,$car1
472 and $car0,$mask,$acc0
473 add $tmp1,$car1,$car1
474 srlx $car0,32,$car0
475 add %sp,$bias+$frame,$tp
476 srlx $car1,32,$car1
477 and $car0,1,$sbit
478 srlx $car0,1,$car0
479 mov 4,$j
480
481.Lsqr_outer:
482.Lsqr_inner1:
483 mulx $npj,$mul1,$acc1
484 add $tpj,$car1,$car1
485 add $j,4,$j
486 ld [$tp+8],$tpj
487 cmp $j,$i
488 add $acc1,$car1,$car1
489 ld [$np+$j],$npj
490 st $car1,[$tp]
491 srlx $car1,32,$car1
492 bl %icc,.Lsqr_inner1
493 add $tp,4,$tp
494!.Lsqr_inner1
495
496 add $j,4,$j
497 ld [$ap+$j],$apj ! ap[j]
498 mulx $npj,$mul1,$acc1
499 add $tpj,$car1,$car1
500 ld [$np+$j],$npj ! np[j]
501 srlx $car1,32,$tmp0
502 and $car1,$mask,$car1
503 add $tmp0,$sbit,$sbit
504 add $acc0,$car1,$car1
505 ld [$tp+8],$tpj ! tp[j]
506 add $acc1,$car1,$car1
507 st $car1,[$tp]
508 srlx $car1,32,$car1
509
510 add $j,4,$j
511 cmp $j,$num
512 be,pn %icc,.Lsqr_no_inner2
513 add $tp,4,$tp
514
515.Lsqr_inner2:
516 mulx $apj,$mul0,$acc0
517 mulx $npj,$mul1,$acc1
518 add $tpj,$sbit,$sbit
519 add $acc0,$car0,$car0
520 ld [$ap+$j],$apj ! ap[j]
521 and $car0,$mask,$acc0
522 ld [$np+$j],$npj ! np[j]
523 srlx $car0,32,$car0
524 add $acc0,$acc0,$acc0
525 ld [$tp+8],$tpj ! tp[j]
526 add $sbit,$acc0,$acc0
527 add $j,4,$j ! j++
528 srlx $acc0,32,$sbit
529 and $acc0,$mask,$acc0
530 cmp $j,$num
531 add $acc0,$car1,$car1
532 add $acc1,$car1,$car1
533 st $car1,[$tp] ! tp[j-1]
534 srlx $car1,32,$car1
535 bl %icc,.Lsqr_inner2
536 add $tp,4,$tp ! tp++
537
538.Lsqr_no_inner2:
539 mulx $apj,$mul0,$acc0
540 mulx $npj,$mul1,$acc1
541 add $tpj,$sbit,$sbit
542 add $acc0,$car0,$car0
543 and $car0,$mask,$acc0
544 srlx $car0,32,$car0
545 add $acc0,$acc0,$acc0
546 add $sbit,$acc0,$acc0
547 srlx $acc0,32,$sbit
548 and $acc0,$mask,$acc0
549 add $acc0,$car1,$car1
550 add $acc1,$car1,$car1
551 st $car1,[$tp] ! tp[j-1]
552 srlx $car1,32,$car1
553
554 add $car0,$car0,$car0
555 add $sbit,$car0,$car0
556 add $car0,$car1,$car1
557 add $car2,$car1,$car1
558 st $car1,[$tp+4]
559 srlx $car1,32,$car2
560
561
562 add $i,4,$i ! i++
563 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
564 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
565 ld [$ap+$i],$mul0 ! ap[j]
566 ld [$np],$car1 ! np[0]
567 ld [$np+4],$npj ! np[1]
568 mulx $n0,$tmp1,$mul1
569 and $mul1,$mask,$mul1
570 add $i,4,$tmp0
571
572 mulx $mul0,$mul0,$car0
573 mulx $car1,$mul1,$car1
574 and $car0,$mask,$acc0
575 add $tmp1,$car1,$car1
576 srlx $car0,32,$car0
577 add %sp,$bias+$frame,$tp
578 srlx $car1,32,$car1
579 and $car0,1,$sbit
580 srlx $car0,1,$car0
581
582 cmp $tmp0,$num ! i<num-1
583 bl %icc,.Lsqr_outer
584 mov 4,$j
585
586
587.Lsqr_last:
588 mulx $npj,$mul1,$acc1
589 add $tpj,$car1,$car1
590 add $j,4,$j
591 ld [$tp+8],$tpj
592 cmp $j,$i
593 add $acc1,$car1,$car1
594 ld [$np+$j],$npj
595 st $car1,[$tp]
596 srlx $car1,32,$car1
597 bl %icc,.Lsqr_last
598 add $tp,4,$tp
599!.Lsqr_last
600
601 mulx $npj,$mul1,$acc1
602 add $tpj,$acc0,$acc0
603 srlx $acc0,32,$tmp0
604 and $acc0,$mask,$acc0
605 add $tmp0,$sbit,$sbit
606 add $acc0,$car1,$car1
607 add $acc1,$car1,$car1
608 st $car1,[$tp]
609 srlx $car1,32,$car1
610
611 add $car0,$car0,$car0 ! recover $car0
612 add $sbit,$car0,$car0
613 add $car0,$car1,$car1
614 add $car2,$car1,$car1
615 st $car1,[$tp+4]
616 srlx $car1,32,$car2
617
618 ba .Ltail
619 add $tp,8,$tp
620.type $fname,#function
621.size $fname,(.-$fname)
622.asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
623.align 32
624___
# Post-process and emit: every back-quoted span in the generated text is
# replaced by the result of evaluating it as Perl, the text is printed, and
# a failure to flush/close the output is treated as fatal.
625$code =~ s{`([^`]*)`}{eval($1)}gem;
626print($code);
627close(STDOUT) || die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette