VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/bn/asm/sparct4-mont.pl@ 94081

Last change on this file since 94081 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov
# The module is licensed under 2-clause BSD license.
# November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of the above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA are dominated by the single-op ones. A special note about
# the 4096-bit RSA verify result: the operands are too long for the
# dedicated hardware, so they are handled by the VIS3 code, which is
# why you don't see any improvement there. It could surely be improved
# [by deploying the 'mpmul' instruction], maybe in the future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#                       sign      verify    sign/s  verify/s
# rsa 1024 bits    0.000628s   0.000028s    1592.4   35434.4
# rsa 2048 bits    0.003282s   0.000106s     304.7    9438.3
# rsa 4096 bits    0.025866s   0.000340s      38.7    2940.9
# dsa 1024 bits    0.000301s   0.000332s    3323.7    3013.9
# dsa 2048 bits    0.001056s   0.001233s     946.9     810.8
#
# 64-bit process, this module:
#                       sign      verify    sign/s  verify/s
# rsa 1024 bits    0.000256s   0.000016s    3904.4   61411.9
# rsa 2048 bits    0.000946s   0.000029s    1056.8   34292.7
# rsa 4096 bits    0.005061s   0.000340s     197.6    2940.5
# dsa 1024 bits    0.000176s   0.000195s    5674.7    5130.5
# dsa 2048 bits    0.000296s   0.000354s    3383.2    2827.6
#
######################################################################
# 32-bit process, VIS3:
#                       sign      verify    sign/s  verify/s
# rsa 1024 bits    0.000665s   0.000028s    1504.8   35233.3
# rsa 2048 bits    0.003349s   0.000106s     298.6    9433.4
# rsa 4096 bits    0.025959s   0.000341s      38.5    2934.8
# dsa 1024 bits    0.000320s   0.000341s    3123.3    2929.6
# dsa 2048 bits    0.001101s   0.001260s     908.2     793.4
#
# 32-bit process, this module:
#                       sign      verify    sign/s  verify/s
# rsa 1024 bits    0.000301s   0.000017s    3317.1   60240.0
# rsa 2048 bits    0.001034s   0.000030s     966.9   33812.7
# rsa 4096 bits    0.005244s   0.000341s     190.7    2935.4
# dsa 1024 bits    0.000201s   0.000205s    4976.1    4879.2
# dsa 2048 bits    0.000328s   0.000360s    3051.1    2774.2
#
# 32-bit code is prone to performance degradation as the interrupt
# rate on the CPU executing it grows. This is because, when an
# interrupt is handled in a 32-bit process context the standard way,
# the upper halves of most integer registers used as input or output
# are zeroed. This renders the result invalid, and the operation has
# to be re-run. If the CPU is "bothered" by timer interrupts only, the
# penalty is hardly measurable. But to mitigate this problem at higher
# interrupt rates, contemporary Linux kernels recognize the biased
# stack even in a 32-bit process context and preserve the full
# register contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.
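#
# Convention used throughout: the routines below compute the
# Montgomery product rp = ap*bp*2^(-64*num) mod np, where num is the
# operand length in 64-bit words and n0 is the precomputed
# -np^(-1) mod 2^64 supplied by the BIGNUM layer.  The dedicated
# montmul/montsqr paths are generated for num = 8, 16, 24 and 32,
# i.e. 512-, 1024-, 1536- and 2048-bit operands.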

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$output = pop;
open STDOUT,">$output";

$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif

.section ".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
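# The montmul/montsqr operands are spread across several register
# windows (hence the chains of nested "save" instructions below) and,
# for the longer lengths, across even-numbered floating-point
# registers.  @A, @B and @N name the registers that receive ap, bp and
# np in that layout, while @R names the double registers used when the
# result is stored back to memory.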


########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#                         const u64 *np,const BN_ULONG *n0);
#
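# Returns 1 with the Montgomery product of ap and bp stored to rp[] on
# success, or 0 without touching rp[] if the hardware path could not
# complete (the caller then falls back to the VIS3 routine below).
# The case ap == bp is detected and routed to montsqr.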
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

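# $sentinel implements the safety net for 32-bit builds described at
# the top of this file: on 64-bit builds it stays zero, otherwise it
# is a mask covering the upper 32 bits that gets OR-ed into every
# frame pointer.  If an interrupt zeroes the upper register halves,
# the mask disappears from %fp, the checks below notice it and the
# routine returns failure so that the caller can retry.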
$code.=<<___;
.globl bn_mul_mont_t4_$NUM
.align 32
bn_mul_mont_t4_$NUM:
#ifdef __arch64__
	mov 0,$sentinel
	mov -128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
	mov -2047,%g4
	and %g1,SPARCV9_64BIT_STACK,%g1
	movrz %g1,0,%g4
	mov -1,$sentinel
	add %g4,-128,%g4
#else
	mov -1,$sentinel
	mov -128,%g4
#endif
	sllx $sentinel,32,$sentinel
	save %sp,%g4,%sp
#ifndef __arch64__
	save %sp,-128,%sp ! warm it up
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and %sp,1,%g4
	or $sentinel,%fp,%fp
	or %g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov %i0,$rp
	mov %i1,$ap
	mov %i2,$bp
	mov %i3,$np
	ld [%i4+0],%f1 ! load *n0
	ld [%i4+4],%f0
	fsrc2 %f0,%f60
___


# load ap[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld [$ap+$i*8+0],$lo
	ld [$ap+$i*8+4],@A[$i]
	sllx @A[$i],32,@A[$i]
	or $lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld [$ap+$i*8+0],$lo
	ld [$ap+$i*8+4],$hi
	fsrc2 $hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld [$np+$i*8+0],$lo
	ld [$np+$i*8+4],@N[$i]
	sllx @N[$i],32,@N[$i]
	or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld [$np+$i*8+0],$lo
	ld [$np+$i*8+4],@N[$i]
	sllx @N[$i],32,@N[$i]
	or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld [$np+$i*8+0],$lo
	ld [$np+$i*8+4],@N[$i]
	sllx @N[$i],32,@N[$i]
	or $lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	cmp $ap,$bp
	be SIZE_T_CC,.Lmsquare_$NUM
	nop
___


# load bp[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld [$bp+$i*8+0],$lo
	ld [$bp+$i*8+4],@B[$i]
	sllx @B[$i],32,@B[$i]
	or $lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld [$bp+$i*8+0],$lo
	ld [$bp+$i*8+4],@B[$i]
	sllx @B[$i],32,@B[$i]
	or $lo,@B[$i],@B[$i]
___
}
# magic ################################################################
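# The montmul/montsqr instructions are emitted as raw .word values so
# that the module also builds with assemblers that do not know the
# mnemonics; the length of the operation in 64-bit words minus one is
# added into the low bits of the opcode.  The fbu branch on %fcc3
# right after each instruction catches the hardware reporting that the
# operation did not complete, in which case the routine unwinds and
# returns failure.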
$code.=<<___;
	.word 0x81b02920+$NUM-1 ! montmul $NUM-1
.Lmresume_$NUM:
	fbu,pn %fcc3,.Lmabort_$NUM
#ifndef __arch64__
	and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef __arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Lmabort1_$NUM
	restore
#endif
___


# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod @A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
	restore
#else
	and %fp,$sentinel,$sentinel
	restore
	and $sentinel,1,%o7
	and %fp,$sentinel,$sentinel
	srl %fp,0,%fp ! just in case?
	or %o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov 0,%i0 ! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st $lo,[$rp+$i*8+0]
	st @R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2 @R[$i],$hi
	st $lo,[$rp+$i*8+0]
	st $hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov 1,%i0 ! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov 0,%i0 ! return failure
	ret
	restore

.align 32
.Lmsquare_$NUM:
	save %sp,-128,%sp; or $sentinel,%fp,%fp
	save %sp,-128,%sp; or $sentinel,%fp,%fp
	.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
	ba .Lmresume_$NUM
	nop
.type bn_mul_mont_t4_$NUM, #function
.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}


########################################################################
#
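# Helpers for constant-time access to the power table used by the
# "gather5" routines: load_ccr turns the requested index into a
# one-hot condition-code mask (plus a byte offset within the first
# cache line), and load_b/load_b_pair then read every table entry and
# keep only the requested one via conditional moves, so that the
# memory access pattern does not depend on the secret exponent bits.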
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl $pwr, 2, %o4
	and $pwr, 3, %o5
	and %o4, 7, %o4
	sll %o5, 3, %o5 ! offset within first cache line
	add %o5, $ptbl, $ptbl ! of the pwrtbl
	or %g0, 1, %o5
	sll %o5, %o4, $ccr
___
$code.=<<___ if (!$skip_wr);
	wr $ccr, %g0, %ccr
___
}
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx [$pwrtbl+0*32], $B0
	ldx [$pwrtbl+8*32], $B1
	ldx [$pwrtbl+1*32], %o4
	ldx [$pwrtbl+9*32], %o5
	movvs %icc, %o4, $B0
	ldx [$pwrtbl+2*32], %o4
	movvs %icc, %o5, $B1
	ldx [$pwrtbl+10*32],%o5
	move %icc, %o4, $B0
	ldx [$pwrtbl+3*32], %o4
	move %icc, %o5, $B1
	ldx [$pwrtbl+11*32],%o5
	movneg %icc, %o4, $B0
	ldx [$pwrtbl+4*32], %o4
	movneg %icc, %o5, $B1
	ldx [$pwrtbl+12*32],%o5
	movcs %xcc, %o4, $B0
	ldx [$pwrtbl+5*32],%o4
	movcs %xcc, %o5, $B1
	ldx [$pwrtbl+13*32],%o5
	movvs %xcc, %o4, $B0
	ldx [$pwrtbl+6*32], %o4
	movvs %xcc, %o5, $B1
	ldx [$pwrtbl+14*32],%o5
	move %xcc, %o4, $B0
	ldx [$pwrtbl+7*32], %o4
	move %xcc, %o5, $B1
	ldx [$pwrtbl+15*32],%o5
	movneg %xcc, %o4, $B0
	add $pwrtbl,16*32, $pwrtbl
	movneg %xcc, %o5, $B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx [$pwrtbl+0*32], $Bi
	ldx [$pwrtbl+1*32], %o4
	ldx [$pwrtbl+2*32], %o5
	movvs %icc, %o4, $Bi
	ldx [$pwrtbl+3*32], %o4
	move %icc, %o5, $Bi
	ldx [$pwrtbl+4*32], %o5
	movneg %icc, %o4, $Bi
	ldx [$pwrtbl+5*32], %o4
	movcs %xcc, %o5, $Bi
	ldx [$pwrtbl+6*32], %o5
	movvs %xcc, %o4, $Bi
	ldx [$pwrtbl+7*32], %o4
	move %xcc, %o5, $Bi
	add $pwrtbl,8*32, $pwrtbl
	movneg %xcc, %o4, $Bi
___
}


########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#                          const u64 *pwrtbl,int pwr,int stride);
#
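# Multi-op subroutine used by the constant-time modular exponentiation
# path: it repeatedly performs 5 Montgomery squarings of tp[] followed
# by one Montgomery multiplication with a value gathered from the
# power table through the load_ccr/load_b_pair helpers above, so the
# exponent window never influences the access pattern.  Returns 1 on
# success with the result left in tp[], or 0 if the hardware path
# could not complete.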
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl bn_pwr5_mont_t4_$NUM
.align 32
bn_pwr5_mont_t4_$NUM:
#ifdef __arch64__
	mov 0,$sentinel
	mov -128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
	mov -2047,%g4
	and %g1,SPARCV9_64BIT_STACK,%g1
	movrz %g1,0,%g4
	mov -1,$sentinel
	add %g4,-128,%g4
#else
	mov -1,$sentinel
	mov -128,%g4
#endif
	sllx $sentinel,32,$sentinel
	save %sp,%g4,%sp
#ifndef __arch64__
	save %sp,-128,%sp ! warm it up
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	save %sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and %sp,1,%g4
	or $sentinel,%fp,%fp
	or %g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov %i0,$tp
	mov %i1,$np
	ld [%i2+0],%f1 ! load *n0
	ld [%i2+4],%f0
	mov %i3,$pwrtbl
	srl %i4,%g0,%i4 ! pack last arguments
	sllx %i5,32,$pwr
	or %i4,$pwr,$pwr
	fsrc2 %f0,%f60
___


# load tp[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx [$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd [$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx [$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx [$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx [$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp

	srlx $pwr, 32, %o4 ! unpack $pwr
	srl $pwr, %g0, %o5
	sub %o4, 5, %o4
	mov $pwrtbl, %o7
	sllx %o4, 32, $pwr ! re-pack $pwr
	or %o5, $pwr, $pwr
	srl %o5, %o4, %o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b .Lstride_$NUM
	nop
.align 16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save %sp,-128,%sp; or $sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax $pwr, 32, %o4 ! unpack $pwr
	srl $pwr, %g0, %o5
	sub %o4, 5, %o4
	mov $pwrtbl, %i7
	sllx %o4, 32, $pwr ! re-pack $pwr
	or %o5, $pwr, $pwr
	srl %o5, %o4, %o5
___
	&load_ccr("%i7","%o5","%o4",1);


# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
	fbu,pn %fcc3,.Labort_$NUM
#ifndef __arch64__
	and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr %o4, %g0, %ccr
	.word 0x81b02920+$NUM-1 ! montmul $NUM-1
	fbu,pn %fcc3,.Labort_$NUM
#ifndef __arch64__
	and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Labort_$NUM
#endif

	srax $pwr, 32, %o4
#ifdef __arch64__
	brgez %o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez %o4,.Lstride_$NUM
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	restore; and %fp,$sentinel,$sentinel
	brz,pn $sentinel,.Labort1_$NUM
	restore
#endif
___


# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod @A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
	restore
#else
	and %fp,$sentinel,$sentinel
	restore
	and $sentinel,1,%o7
	and %fp,$sentinel,$sentinel
	srl %fp,0,%fp ! just in case?
	or %o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov 0,%i0 ! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std @R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov 1,%i0 ! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov 0,%i0 ! return failure
	ret
	restore
.type bn_pwr5_mont_t4_$NUM, #function
.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}


{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
$code.=<<___;
.globl bn_mul_mont_t4
.align 32
bn_mul_mont_t4:
	add %sp, STACK_BIAS, %g4 ! real top of stack
	sll $num, 3, $num ! size in bytes
	add $num, 63, %g1
	andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
	sub %g4, %g1, %g1
	andn %g1, 63, %g1 ! align at 64 byte
	sub %g1, STACK_FRAME, %g1 ! new top of stack
	sub %g1, %g4, %g1

	save %sp, %g1, %sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
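# Classic word-by-word Montgomery multiplication: the first pass
# (.L1st) computes tp[] = ap[]*bp[0] plus the multiple of np[] that
# clears the lowest word, each outer iteration (.Louter/.Linner) then
# accumulates ap[]*bp[i] into tp[] and reduces again, .Lsub subtracts
# np[] from the result, and .Lcopy selects either the difference or
# tp[] depending on the final borrow, zeroing the temporary as it
# goes.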
$code.=<<___;
	ld [$n0p+0], $t0 ! pull n0[0..1] value
	ld [$n0p+4], $t1
	add %sp, STACK_BIAS+STACK_FRAME, $tp
	ldx [$bp+0], $m0 ! m0=bp[0]
	sllx $t1, 32, $n0
	add $bp, 8, $bp
	or $t0, $n0, $n0


	ldx [$ap+0], $aj ! ap[0]

	mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
	umulxhi $aj, $m0, $hi0

	ldx [$ap+8], $aj ! ap[1]
	add $ap, 16, $ap
	ldx [$np+0], $nj ! np[0]

	mulx $lo0, $n0, $m1 ! "tp[0]"*n0

	mulx $aj, $m0, $alo ! ap[1]*bp[0]
	umulxhi $aj, $m0, $aj ! ahi=aj

	mulx $nj, $m1, $lo1 ! np[0]*m1
	umulxhi $nj, $m1, $hi1

	ldx [$np+8], $nj ! np[1]

	addcc $lo0, $lo1, $lo1
	add $np, 16, $np
	addxc %g0, $hi1, $hi1

	mulx $nj, $m1, $nlo ! np[1]*m1
	umulxhi $nj, $m1, $nj ! nhi=nj


	ba .L1st
	sub $num, 24, $cnt ! cnt=num-3

.align 16
.L1st:
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0

	ldx [$ap+0], $aj ! ap[j]
	addcc $nlo, $hi1, $lo1
	add $ap, 8, $ap
	addxc $nj, %g0, $hi1 ! nhi=nj

	ldx [$np+0], $nj ! np[j]
	mulx $aj, $m0, $alo ! ap[j]*bp[0]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj ! ahi=aj

	mulx $nj, $m1, $nlo ! np[j]*m1
	addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
	umulxhi $nj, $m1, $nj ! nhi=nj
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2 ! tp[j-1]
	add $tp, 8, $tp ! tp++

	brnz,pt $cnt, .L1st
	sub $cnt, 8, $cnt ! j--
!.L1st
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0 ! ahi=aj

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1
	addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2 ! tp[j-1]
	add $tp, 8, $tp

	addcc $hi0, $hi1, $hi1
	addxc %g0, %g0, $ovf ! upmost overflow bit
	stxa $hi1, [$tp]0xe2
	add $tp, 8, $tp


	ba .Louter
	sub $num, 16, $i ! i=num-2

.align 16
.Louter:
	ldx [$bp+0], $m0 ! m0=bp[i]
	add $bp, 8, $bp

	sub $ap, $num, $ap ! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp

	ldx [$ap+0], $aj ! ap[0]
	ldx [$np+0], $nj ! np[0]

	mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
	ldx [$tp], $tj ! tp[0]
	umulxhi $aj, $m0, $hi0
	ldx [$ap+8], $aj ! ap[1]
	addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
	mulx $aj, $m0, $alo ! ap[1]*bp[i]
	addxc %g0, $hi0, $hi0
	mulx $lo0, $n0, $m1 ! tp[0]*n0
	umulxhi $aj, $m0, $aj ! ahi=aj
	mulx $nj, $m1, $lo1 ! np[0]*m1
	add $ap, 16, $ap
	umulxhi $nj, $m1, $hi1
	ldx [$np+8], $nj ! np[1]
	add $np, 16, $np
	addcc $lo1, $lo0, $lo1
	mulx $nj, $m1, $nlo ! np[1]*m1
	addxc %g0, $hi1, $hi1
	umulxhi $nj, $m1, $nj ! nhi=nj


	ba .Linner
	sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner:
	addcc $alo, $hi0, $lo0
	ldx [$tp+8], $tj ! tp[j]
	addxc $aj, %g0, $hi0 ! ahi=aj
	ldx [$ap+0], $aj ! ap[j]
	add $ap, 8, $ap
	addcc $nlo, $hi1, $lo1
	mulx $aj, $m0, $alo ! ap[j]*bp[i]
	addxc $nj, %g0, $hi1 ! nhi=nj
	ldx [$np+0], $nj ! np[j]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj ! ahi=aj
	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
	mulx $nj, $m1, $nlo ! np[j]*m1
	addxc %g0, $hi0, $hi0
	umulxhi $nj, $m1, $nj ! nhi=nj
	addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp] ! tp[j-1]
	add $tp, 8, $tp
	brnz,pt $cnt, .Linner
	sub $cnt, 8, $cnt
!.Linner
	ldx [$tp+8], $tj ! tp[j]
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0 ! ahi=aj
	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
	addxc %g0, $hi0, $hi0

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1 ! nhi=nj
	addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp] ! tp[j-1]

	subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
	addxccc $hi1, $hi0, $hi1
	addxc %g0, %g0, $ovf
	stx $hi1, [$tp+8]
	add $tp, 16, $tp

	brnz,pt $i, .Louter
	sub $i, 8, $i


	sub $ap, $num, $ap ! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp
	ba .Lsub
	subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc

.align 16
.Lsub:
	ldx [$tp], $tj
	add $tp, 8, $tp
	ldx [$np+0], $nj
	add $np, 8, $np
	subccc $tj, $nj, $t2 ! tp[j]-np[j]
	srlx $tj, 32, $tj
	srlx $nj, 32, $nj
	subccc $tj, $nj, $t3
	add $rp, 8, $rp
	st $t2, [$rp-4] ! reverse order
	st $t3, [$rp-8]
	brnz,pt $cnt, .Lsub
	sub $cnt, 8, $cnt

	sub $np, $num, $np ! rewind
	sub $tp, $num, $tp
	sub $rp, $num, $rp

	subccc $ovf, %g0, $ovf ! handle upmost overflow bit
	ba .Lcopy
	sub $num, 8, $cnt

.align 16
.Lcopy: ! conditional copy
	ldx [$tp], $tj
	ldx [$rp+0], $t2
	stx %g0, [$tp] ! zap
	add $tp, 8, $tp
	movcs %icc, $tj, $t2
	stx $t2, [$rp+0]
	add $rp, 8, $rp
	brnz $cnt, .Lcopy
	sub $cnt, 8, $cnt

	mov 1, %o0
	ret
	restore
.type bn_mul_mont_t4, #function
.size bn_mul_mont_t4, .-bn_mul_mont_t4
___


# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
$code.=<<___;
.globl bn_mul_mont_gather5_t4
.align 32
bn_mul_mont_gather5_t4:
	add %sp, STACK_BIAS, %g4 ! real top of stack
	sll $num, 3, $num ! size in bytes
	add $num, 63, %g1
	andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
	sub %g4, %g1, %g1
	andn %g1, 63, %g1 ! align at 64 byte
	sub %g1, STACK_FRAME, %g1 ! new top of stack
	sub %g1, %g4, %g1
	LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument

	save %sp, %g1, %sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");	# m0=bp[0]

$code.=<<___;
	ld [$n0p+0], $t0 ! pull n0[0..1] value
	ld [$n0p+4], $t1
	add %sp, STACK_BIAS+STACK_FRAME, $tp
	sllx $t1, 32, $n0
	or $t0, $n0, $n0


	ldx [$ap+0], $aj ! ap[0]

	mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
	umulxhi $aj, $m0, $hi0

	ldx [$ap+8], $aj ! ap[1]
	add $ap, 16, $ap
	ldx [$np+0], $nj ! np[0]

	mulx $lo0, $n0, $m1 ! "tp[0]"*n0

	mulx $aj, $m0, $alo ! ap[1]*bp[0]
	umulxhi $aj, $m0, $aj ! ahi=aj

	mulx $nj, $m1, $lo1 ! np[0]*m1
	umulxhi $nj, $m1, $hi1

	ldx [$np+8], $nj ! np[1]

	addcc $lo0, $lo1, $lo1
	add $np, 16, $np
	addxc %g0, $hi1, $hi1

	mulx $nj, $m1, $nlo ! np[1]*m1
	umulxhi $nj, $m1, $nj ! nhi=nj


	ba .L1st_g5
	sub $num, 24, $cnt ! cnt=num-3

.align 16
.L1st_g5:
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0

	ldx [$ap+0], $aj ! ap[j]
	addcc $nlo, $hi1, $lo1
	add $ap, 8, $ap
	addxc $nj, %g0, $hi1 ! nhi=nj

	ldx [$np+0], $nj ! np[j]
	mulx $aj, $m0, $alo ! ap[j]*bp[0]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj ! ahi=aj

	mulx $nj, $m1, $nlo ! np[j]*m1
	addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
	umulxhi $nj, $m1, $nj ! nhi=nj
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2 ! tp[j-1]
	add $tp, 8, $tp ! tp++

	brnz,pt $cnt, .L1st_g5
	sub $cnt, 8, $cnt ! j--
!.L1st_g5
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0 ! ahi=aj

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1
	addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
	addxc %g0, $hi1, $hi1
	stxa $lo1, [$tp]0xe2 ! tp[j-1]
	add $tp, 8, $tp

	addcc $hi0, $hi1, $hi1
	addxc %g0, %g0, $ovf ! upmost overflow bit
	stxa $hi1, [$tp]0xe2
	add $tp, 8, $tp


	ba .Louter_g5
	sub $num, 16, $i ! i=num-2

.align 16
.Louter_g5:
	wr $ccr, %g0, %ccr
___
	&load_b($bp,$m0);	# m0=bp[i]
$code.=<<___;
	sub $ap, $num, $ap ! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp

	ldx [$ap+0], $aj ! ap[0]
	ldx [$np+0], $nj ! np[0]

	mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
	ldx [$tp], $tj ! tp[0]
	umulxhi $aj, $m0, $hi0
	ldx [$ap+8], $aj ! ap[1]
	addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
	mulx $aj, $m0, $alo ! ap[1]*bp[i]
	addxc %g0, $hi0, $hi0
	mulx $lo0, $n0, $m1 ! tp[0]*n0
	umulxhi $aj, $m0, $aj ! ahi=aj
	mulx $nj, $m1, $lo1 ! np[0]*m1
	add $ap, 16, $ap
	umulxhi $nj, $m1, $hi1
	ldx [$np+8], $nj ! np[1]
	add $np, 16, $np
	addcc $lo1, $lo0, $lo1
	mulx $nj, $m1, $nlo ! np[1]*m1
	addxc %g0, $hi1, $hi1
	umulxhi $nj, $m1, $nj ! nhi=nj


	ba .Linner_g5
	sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner_g5:
	addcc $alo, $hi0, $lo0
	ldx [$tp+8], $tj ! tp[j]
	addxc $aj, %g0, $hi0 ! ahi=aj
	ldx [$ap+0], $aj ! ap[j]
	add $ap, 8, $ap
	addcc $nlo, $hi1, $lo1
	mulx $aj, $m0, $alo ! ap[j]*bp[i]
	addxc $nj, %g0, $hi1 ! nhi=nj
	ldx [$np+0], $nj ! np[j]
	add $np, 8, $np
	umulxhi $aj, $m0, $aj ! ahi=aj
	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
	mulx $nj, $m1, $nlo ! np[j]*m1
	addxc %g0, $hi0, $hi0
	umulxhi $nj, $m1, $nj ! nhi=nj
	addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp] ! tp[j-1]
	add $tp, 8, $tp
	brnz,pt $cnt, .Linner_g5
	sub $cnt, 8, $cnt
!.Linner_g5
	ldx [$tp+8], $tj ! tp[j]
	addcc $alo, $hi0, $lo0
	addxc $aj, %g0, $hi0 ! ahi=aj
	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
	addxc %g0, $hi0, $hi0

	addcc $nlo, $hi1, $lo1
	addxc $nj, %g0, $hi1 ! nhi=nj
	addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc %g0, $hi1, $hi1
	stx $lo1, [$tp] ! tp[j-1]

	subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
	addxccc $hi1, $hi0, $hi1
	addxc %g0, %g0, $ovf
	stx $hi1, [$tp+8]
	add $tp, 16, $tp

	brnz,pt $i, .Louter_g5
	sub $i, 8, $i


	sub $ap, $num, $ap ! rewind
	sub $np, $num, $np
	sub $tp, $num, $tp
	ba .Lsub_g5
	subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc

.align 16
.Lsub_g5:
	ldx [$tp], $tj
	add $tp, 8, $tp
	ldx [$np+0], $nj
	add $np, 8, $np
	subccc $tj, $nj, $t2 ! tp[j]-np[j]
	srlx $tj, 32, $tj
	srlx $nj, 32, $nj
	subccc $tj, $nj, $t3
	add $rp, 8, $rp
	st $t2, [$rp-4] ! reverse order
	st $t3, [$rp-8]
	brnz,pt $cnt, .Lsub_g5
	sub $cnt, 8, $cnt

	sub $np, $num, $np ! rewind
	sub $tp, $num, $tp
	sub $rp, $num, $rp

	subccc $ovf, %g0, $ovf ! handle upmost overflow bit
	ba .Lcopy_g5
	sub $num, 8, $cnt

.align 16
.Lcopy_g5: ! conditional copy
	ldx [$tp], $tj
	ldx [$rp+0], $t2
	stx %g0, [$tp] ! zap
	add $tp, 8, $tp
	movcs %icc, $tj, $t2
	stx $t2, [$rp+0]
	add $rp, 8, $rp
	brnz $cnt, .Lcopy_g5
	sub $cnt, 8, $cnt

	mov 1, %o0
	ret
	restore
.type bn_mul_mont_gather5_t4, #function
.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}


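# Small helpers used by the BIGNUM glue code: bn_flip_t4 swaps the
# 32-bit halves of each 64-bit word when converting between the
# 32-bit-limb BIGNUM layout and the 64-bit vectors the routines above
# operate on; bn_flip_n_scatter5_t4 performs the same conversion while
# scattering the value into slot "pwr" of the power table, which
# interleaves 32 values one 64-bit word at a time; bn_gather5_t4 reads
# one of those values back using the constant-time selection from
# load_ccr/load_b.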
$code.=<<___;
.globl bn_flip_t4
.align 32
bn_flip_t4:
.Loop_flip:
	ld [%o1+0], %o4
	sub %o2, 1, %o2
	ld [%o1+4], %o5
	add %o1, 8, %o1
	st %o5, [%o0+0]
	st %o4, [%o0+4]
	brnz %o2, .Loop_flip
	add %o0, 8, %o0
	retl
	nop
.type bn_flip_t4, #function
.size bn_flip_t4, .-bn_flip_t4

.globl bn_flip_n_scatter5_t4
.align 32
bn_flip_n_scatter5_t4:
	sll %o3, 3, %o3
	srl %o1, 1, %o1
	add %o3, %o2, %o2 ! &pwrtbl[pwr]
	sub %o1, 1, %o1
.Loop_flip_n_scatter5:
	ld [%o0+0], %o4 ! inp[i]
	ld [%o0+4], %o5
	add %o0, 8, %o0
	sllx %o5, 32, %o5
	or %o4, %o5, %o5
	stx %o5, [%o2]
	add %o2, 32*8, %o2
	brnz %o1, .Loop_flip_n_scatter5
	sub %o1, 1, %o1
	retl
	nop
.type bn_flip_n_scatter5_t4, #function
.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl bn_gather5_t4
.align 32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub %o1, 1, %o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx %g1, [%o0]
	add %o0, 8, %o0
	brnz %o1, .Loop_gather5
	sub %o1, 1, %o1

	retl
	nop
.type bn_gather5_t4, #function
.size bn_gather5_t4, .-bn_gather5_t4

.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___

&emit_assembler();

close STDOUT or die "error closing STDOUT: $!";