VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/bn/asm/sparct4-mont.pl@94082

Last change on this file was in revision 94082, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

  • Property svn:executable set to *
File size: 27.2 KB
1#! /usr/bin/env perl
2# Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by David S. Miller and Andy Polyakov
12# The module is licensed under 2-clause BSD license.
13# November 2012. All rights reserved.
14# ====================================================================
15
16######################################################################
17# Montgomery squaring-n-multiplication module for SPARC T4.
18#
19# The module consists of three parts:
20#
21# 1) a collection of "single-op" subroutines that perform a single
22# operation, Montgomery squaring or multiplication, on 512-,
23# 1024-, 1536- and 2048-bit operands;
24# 2) a collection of "multi-op" subroutines that perform 5 squaring
25# operations and 1 multiplication on operands of the above lengths;
26# 3) fall-back and helper VIS3 subroutines.
27#
28# RSA signing is dominated by the multi-op subroutines, while RSA
29# verification and DSA are dominated by the single-op ones. A special
30# note about the 4096-bit RSA verify result: the operands are too long
31# for the dedicated hardware and are handled by the VIS3 code, which
32# is why no improvement is seen. It could surely be improved [by
33# deploying the 'mpmul' instruction], maybe in the future...
34#
35# Performance improvement.
36#
37# 64-bit process, VIS3:
38# sign verify sign/s verify/s
39# rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4
40# rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3
41# rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9
42# dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9
43# dsa 2048 bits 0.001056s 0.001233s 946.9 810.8
44#
45# 64-bit process, this module:
46# sign verify sign/s verify/s
47# rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9
48# rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7
49# rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5
50# dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5
51# dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6
52#
53######################################################################
54# 32-bit process, VIS3:
55# sign verify sign/s verify/s
56# rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3
57# rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4
58# rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8
59# dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6
60# dsa 2048 bits 0.001101s 0.001260s 908.2 793.4
61#
62# 32-bit process, this module:
63# sign verify sign/s verify/s
64# rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0
65# rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7
66# rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4
67# dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2
68# dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2
69#
70# 32-bit code is prone to performance degradation as the interrupt
71# rate on the CPU executing the code grows. This is because, when an
72# interrupt is handled in a 32-bit process context, the upper halves
73# of most integer registers used as input or output are zeroed. This
74# renders the result invalid, and the operation has to be re-run. If
75# the CPU is "bothered" by timer interrupts only, the penalty is
76# hardly measurable. But to mitigate this problem at higher interrupt
77# rates, contemporary Linux kernels recognize a biased stack even in
78# 32-bit process context and preserve full register contents.
79# See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
80# for details.
81
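# ----------------------------------------------------------------------
# Reference sketch of what the montmul/montsqr-based routines below
# compute, namely the Montgomery product a*b*R^-1 mod n with
# R = 2^(64*NUM). It is expressed with Math::BigInt (core Perl) purely
# for clarity, is never called, and is not part of the emitted code.
# $a, $b and $n are Math::BigInt objects, $num is the limb count
# (8, 16, 24 or 32).
use Math::BigInt;

sub mont_mul_ref {
    my ($a, $b, $n, $num) = @_;
    my $w  = Math::BigInt->new(2)->bpow(64);             # limb radix 2^64
    my $n0 = $n->copy->bmodinv($w)->bneg->bmod($w);      # -n^-1 mod 2^64
    my $t  = Math::BigInt->bzero();
    for my $i (0 .. $num - 1) {
        my $bi = $b->copy->brsft(64 * $i)->bmod($w);     # i-th limb of b
        $t->badd($a->copy->bmul($bi));                   # t += a*b[i]
        my $m = $t->copy->bmod($w)->bmul($n0)->bmod($w); # m = t*n0 mod 2^64
        $t->badd($m->bmul($n))->brsft(64);               # t = (t + m*n)/2^64
    }
    $t->bsub($n) if $t->bcmp($n) >= 0;                   # final subtraction
    return $t;                                           # a*b*2^(-64*$num) mod n
}
# ----------------------------------------------------------------------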
82$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
83push(@INC,"${dir}","${dir}../../perlasm");
84require "sparcv9_modes.pl";
85
86$output = pop and open STDOUT,">$output";
87
88$code.=<<___;
89#ifndef __ASSEMBLER__
90# define __ASSEMBLER__ 1
91#endif
92#include "crypto/sparc_arch.h"
93
94#ifdef __arch64__
95.register %g2,#scratch
96.register %g3,#scratch
97#endif
98
99.section ".text",#alloc,#execinstr
100
101#ifdef __PIC__
102SPARC_PIC_THUNK(%g1)
103#endif
104___
105
106########################################################################
107# Register layout for mont[mul|sqr] instructions.
108# For details see "Oracle SPARC Architecture 2011" manual at
109# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
110#
111my @R=map("%f".2*$_,(0..11,30,31,12..29));
112my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
113my @A=(@N[0..13],@R[14..31]);
114my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
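# Note: with the maps above, @R expands to the even-numbered
# floating-point registers %f0,%f2,...,%f22,%f60,%f62,%f24,...,%f58;
# @N cycles %l0-%l7,%o0-%o5 twice plus %l0-%l3 to cover 32 limbs;
# @A is @N[0..13] followed by @R[14..31], i.e. %f24 through %f58;
# and @B cycles %i0-%i5,%l0-%l7 twice plus %o0-%o3.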
115
116
117########################################################################
118# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
119# const u64 *np,const BN_ULONG *n0);
120#
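# Note: as the generated bodies show, these routines return 1 on
# success and 0 on failure; in a 32-bit process the failure path fires
# when the sentinel OR-ed into %fp detects that an interrupt has zeroed
# the upper register halves, in which case the operation has to be
# re-run (see the header comment above).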
121sub generate_bn_mul_mont_t4() {
122my $NUM=shift;
123my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
124
125$code.=<<___;
126.globl bn_mul_mont_t4_$NUM
127.align 32
128bn_mul_mont_t4_$NUM:
129#ifdef __arch64__
130 mov 0,$sentinel
131 mov -128,%g4
132#elif defined(SPARCV9_64BIT_STACK)
133 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
134 ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
135 mov -2047,%g4
136 and %g1,SPARCV9_64BIT_STACK,%g1
137 movrz %g1,0,%g4
138 mov -1,$sentinel
139 add %g4,-128,%g4
140#else
141 mov -1,$sentinel
142 mov -128,%g4
143#endif
144 sllx $sentinel,32,$sentinel
145 save %sp,%g4,%sp
146#ifndef __arch64__
147 save %sp,-128,%sp ! warm it up
148 save %sp,-128,%sp
149 save %sp,-128,%sp
150 save %sp,-128,%sp
151 save %sp,-128,%sp
152 save %sp,-128,%sp
153 restore
154 restore
155 restore
156 restore
157 restore
158 restore
159#endif
160 and %sp,1,%g4
161 or $sentinel,%fp,%fp
162 or %g4,$sentinel,$sentinel
163
164 ! copy arguments to global registers
165 mov %i0,$rp
166 mov %i1,$ap
167 mov %i2,$bp
168 mov %i3,$np
169 ld [%i4+0],%f1 ! load *n0
170 ld [%i4+4],%f0
171 fsrc2 %f0,%f60
172___
173
174
175# load ap[$NUM] ########################################################
176$code.=<<___;
177 save %sp,-128,%sp; or $sentinel,%fp,%fp
178___
179for($i=0; $i<14 && $i<$NUM; $i++) {
180my $lo=$i<13?@A[$i+1]:"%o7";
181$code.=<<___;
182 ld [$ap+$i*8+0],$lo
183 ld [$ap+$i*8+4],@A[$i]
184 sllx @A[$i],32,@A[$i]
185 or $lo,@A[$i],@A[$i]
186___
187}
188for(; $i<$NUM; $i++) {
189my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
190$code.=<<___;
191 ld [$ap+$i*8+0],$lo
192 ld [$ap+$i*8+4],$hi
193 fsrc2 $hi,@A[$i]
194___
195}
196# load np[$NUM] ########################################################
197$code.=<<___;
198 save %sp,-128,%sp; or $sentinel,%fp,%fp
199___
200for($i=0; $i<14 && $i<$NUM; $i++) {
201my $lo=$i<13?@N[$i+1]:"%o7";
202$code.=<<___;
203 ld [$np+$i*8+0],$lo
204 ld [$np+$i*8+4],@N[$i]
205 sllx @N[$i],32,@N[$i]
206 or $lo,@N[$i],@N[$i]
207___
208}
209$code.=<<___;
210 save %sp,-128,%sp; or $sentinel,%fp,%fp
211___
212for(; $i<28 && $i<$NUM; $i++) {
213my $lo=$i<27?@N[$i+1]:"%o7";
214$code.=<<___;
215 ld [$np+$i*8+0],$lo
216 ld [$np+$i*8+4],@N[$i]
217 sllx @N[$i],32,@N[$i]
218 or $lo,@N[$i],@N[$i]
219___
220}
221$code.=<<___;
222 save %sp,-128,%sp; or $sentinel,%fp,%fp
223___
224for(; $i<$NUM; $i++) {
225my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
226$code.=<<___;
227 ld [$np+$i*8+0],$lo
228 ld [$np+$i*8+4],@N[$i]
229 sllx @N[$i],32,@N[$i]
230 or $lo,@N[$i],@N[$i]
231___
232}
233$code.=<<___;
234 cmp $ap,$bp
235 be SIZE_T_CC,.Lmsquare_$NUM
236 nop
237___
238
239
240# load bp[$NUM] ########################################################
241$code.=<<___;
242 save %sp,-128,%sp; or $sentinel,%fp,%fp
243___
244for($i=0; $i<14 && $i<$NUM; $i++) {
245my $lo=$i<13?@B[$i+1]:"%o7";
246$code.=<<___;
247 ld [$bp+$i*8+0],$lo
248 ld [$bp+$i*8+4],@B[$i]
249 sllx @B[$i],32,@B[$i]
250 or $lo,@B[$i],@B[$i]
251___
252}
253$code.=<<___;
254 save %sp,-128,%sp; or $sentinel,%fp,%fp
255___
256for(; $i<$NUM; $i++) {
257my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
258$code.=<<___;
259 ld [$bp+$i*8+0],$lo
260 ld [$bp+$i*8+4],@B[$i]
261 sllx @B[$i],32,@B[$i]
262 or $lo,@B[$i],@B[$i]
263___
264}
265# magic ################################################################
266$code.=<<___;
267 .word 0x81b02920+$NUM-1 ! montmul $NUM-1
268.Lmresume_$NUM:
269 fbu,pn %fcc3,.Lmabort_$NUM
270#ifndef __arch64__
271 and %fp,$sentinel,$sentinel
272 brz,pn $sentinel,.Lmabort_$NUM
273#endif
274 nop
275#ifdef __arch64__
276 restore
277 restore
278 restore
279 restore
280 restore
281#else
282 restore; and %fp,$sentinel,$sentinel
283 restore; and %fp,$sentinel,$sentinel
284 restore; and %fp,$sentinel,$sentinel
285 restore; and %fp,$sentinel,$sentinel
286 brz,pn $sentinel,.Lmabort1_$NUM
287 restore
288#endif
289___
290
291
292# save tp[$NUM] ########################################################
293for($i=0; $i<14 && $i<$NUM; $i++) {
294$code.=<<___;
295 movxtod @A[$i],@R[$i]
296___
297}
298$code.=<<___;
299#ifdef __arch64__
300 restore
301#else
302 and %fp,$sentinel,$sentinel
303 restore
304 and $sentinel,1,%o7
305 and %fp,$sentinel,$sentinel
306 srl %fp,0,%fp ! just in case?
307 or %o7,$sentinel,$sentinel
308 brz,a,pn $sentinel,.Lmdone_$NUM
309 mov 0,%i0 ! return failure
310#endif
311___
312for($i=0; $i<12 && $i<$NUM; $i++) {
313@R[$i] =~ /%f([0-9]+)/;
314my $lo = "%f".($1+1);
315$code.=<<___;
316 st $lo,[$rp+$i*8+0]
317 st @R[$i],[$rp+$i*8+4]
318___
319}
320for(; $i<$NUM; $i++) {
321my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
322$code.=<<___;
323 fsrc2 @R[$i],$hi
324 st $lo,[$rp+$i*8+0]
325 st $hi,[$rp+$i*8+4]
326___
327}
328$code.=<<___;
329 mov 1,%i0 ! return success
330.Lmdone_$NUM:
331 ret
332 restore
333
334.Lmabort_$NUM:
335 restore
336 restore
337 restore
338 restore
339 restore
340.Lmabort1_$NUM:
341 restore
342
343 mov 0,%i0 ! return failure
344 ret
345 restore
346
347.align 32
348.Lmsquare_$NUM:
349 save %sp,-128,%sp; or $sentinel,%fp,%fp
350 save %sp,-128,%sp; or $sentinel,%fp,%fp
351 .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
352 ba .Lmresume_$NUM
353 nop
354.type bn_mul_mont_t4_$NUM, #function
355.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
356___
357}
358
359for ($i=8;$i<=32;$i+=8) {
360 &generate_bn_mul_mont_t4($i);
361}
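# The loop above emits bn_mul_mont_t4_8, _16, _24 and _32, i.e. one
# single-op entry point per operand size of 512, 1024, 1536 and
# 2048 bits (8, 16, 24 and 32 64-bit limbs respectively).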
362
363
364########################################################################
365#
366sub load_ccr {
367my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
368$code.=<<___;
369 srl $pwr, 2, %o4
370 and $pwr, 3, %o5
371 and %o4, 7, %o4
372 sll %o5, 3, %o5 ! offset within first cache line
373 add %o5, $ptbl, $ptbl ! of the pwrtbl
374 or %g0, 1, %o5
375 sll %o5, %o4, $ccr
376___
377$code.=<<___ if (!$skip_wr);
378 wr $ccr, %g0, %ccr
379___
380}
381sub load_b_pair {
382my ($pwrtbl,$B0,$B1)=@_;
383
384$code.=<<___;
385 ldx [$pwrtbl+0*32], $B0
386 ldx [$pwrtbl+8*32], $B1
387 ldx [$pwrtbl+1*32], %o4
388 ldx [$pwrtbl+9*32], %o5
389 movvs %icc, %o4, $B0
390 ldx [$pwrtbl+2*32], %o4
391 movvs %icc, %o5, $B1
392 ldx [$pwrtbl+10*32],%o5
393 move %icc, %o4, $B0
394 ldx [$pwrtbl+3*32], %o4
395 move %icc, %o5, $B1
396 ldx [$pwrtbl+11*32],%o5
397 movneg %icc, %o4, $B0
398 ldx [$pwrtbl+4*32], %o4
399 movneg %icc, %o5, $B1
400 ldx [$pwrtbl+12*32],%o5
401 movcs %xcc, %o4, $B0
402 ldx [$pwrtbl+5*32],%o4
403 movcs %xcc, %o5, $B1
404 ldx [$pwrtbl+13*32],%o5
405 movvs %xcc, %o4, $B0
406 ldx [$pwrtbl+6*32], %o4
407 movvs %xcc, %o5, $B1
408 ldx [$pwrtbl+14*32],%o5
409 move %xcc, %o4, $B0
410 ldx [$pwrtbl+7*32], %o4
411 move %xcc, %o5, $B1
412 ldx [$pwrtbl+15*32],%o5
413 movneg %xcc, %o4, $B0
414 add $pwrtbl,16*32, $pwrtbl
415 movneg %xcc, %o5, $B1
416___
417}
418sub load_b {
419my ($pwrtbl,$Bi)=@_;
420
421$code.=<<___;
422 ldx [$pwrtbl+0*32], $Bi
423 ldx [$pwrtbl+1*32], %o4
424 ldx [$pwrtbl+2*32], %o5
425 movvs %icc, %o4, $Bi
426 ldx [$pwrtbl+3*32], %o4
427 move %icc, %o5, $Bi
428 ldx [$pwrtbl+4*32], %o5
429 movneg %icc, %o4, $Bi
430 ldx [$pwrtbl+5*32], %o4
431 movcs %xcc, %o5, $Bi
432 ldx [$pwrtbl+6*32], %o5
433 movvs %xcc, %o4, $Bi
434 ldx [$pwrtbl+7*32], %o4
435 move %xcc, %o5, $Bi
436 add $pwrtbl,8*32, $pwrtbl
437 movneg %xcc, %o4, $Bi
438___
439}
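# Note: load_ccr and the load_b* helpers implement a cache-timing-
# neutral gather from the power table. load_ccr computes
# ccr = 1 << ((pwr >> 2) & 7) and advances the table pointer by
# (pwr & 3)*8 bytes; the one-hot value written to %ccr sets exactly one
# condition flag, so the conditional moves in load_b/load_b_pair read
# all eight 32-byte-spaced candidates but retain only the one selected
# by pwr. E.g. pwr == 13 gives a byte offset of 8 and ccr == 1<<3 (the
# %icc negative flag), which keeps the fourth candidate; the memory
# access pattern itself is independent of pwr.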
440
441
442########################################################################
443# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
444# const u64 *pwrtbl,int pwr,int stride);
445#
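# Note: per 5-bit exponent window the generated code performs five
# montsqr operations followed by one montmul with an operand gathered
# from pwrtbl via load_ccr/load_b_pair, i.e. in terms of the reference
# sketch near the top of this file, roughly
#	$tp = mont_mul_ref($tp, $tp, $n, $num) for 1 .. 5;
#	$tp = mont_mul_ref($tp, $pwrtbl[$window], $n, $num);
# ($window being illustrative), repeated while the stride counter
# packed into the upper half of the pwr argument (decremented by 5 per
# pass) remains non-negative.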
446sub generate_bn_pwr5_mont_t4() {
447my $NUM=shift;
448my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
449
450$code.=<<___;
451.globl bn_pwr5_mont_t4_$NUM
452.align 32
453bn_pwr5_mont_t4_$NUM:
454#ifdef __arch64__
455 mov 0,$sentinel
456 mov -128,%g4
457#elif defined(SPARCV9_64BIT_STACK)
458 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
459 ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
460 mov -2047,%g4
461 and %g1,SPARCV9_64BIT_STACK,%g1
462 movrz %g1,0,%g4
463 mov -1,$sentinel
464 add %g4,-128,%g4
465#else
466 mov -1,$sentinel
467 mov -128,%g4
468#endif
469 sllx $sentinel,32,$sentinel
470 save %sp,%g4,%sp
471#ifndef __arch64__
472 save %sp,-128,%sp ! warm it up
473 save %sp,-128,%sp
474 save %sp,-128,%sp
475 save %sp,-128,%sp
476 save %sp,-128,%sp
477 save %sp,-128,%sp
478 restore
479 restore
480 restore
481 restore
482 restore
483 restore
484#endif
485 and %sp,1,%g4
486 or $sentinel,%fp,%fp
487 or %g4,$sentinel,$sentinel
488
489 ! copy arguments to global registers
490 mov %i0,$tp
491 mov %i1,$np
492 ld [%i2+0],%f1 ! load *n0
493 ld [%i2+4],%f0
494 mov %i3,$pwrtbl
495 srl %i4,%g0,%i4 ! pack last arguments
496 sllx %i5,32,$pwr
497 or %i4,$pwr,$pwr
498 fsrc2 %f0,%f60
499___
500
501
502# load tp[$NUM] ########################################################
503$code.=<<___;
504 save %sp,-128,%sp; or $sentinel,%fp,%fp
505___
506for($i=0; $i<14 && $i<$NUM; $i++) {
507$code.=<<___;
508 ldx [$tp+$i*8],@A[$i]
509___
510}
511for(; $i<$NUM; $i++) {
512$code.=<<___;
513 ldd [$tp+$i*8],@A[$i]
514___
515}
516# load np[$NUM] ########################################################
517$code.=<<___;
518 save %sp,-128,%sp; or $sentinel,%fp,%fp
519___
520for($i=0; $i<14 && $i<$NUM; $i++) {
521$code.=<<___;
522 ldx [$np+$i*8],@N[$i]
523___
524}
525$code.=<<___;
526 save %sp,-128,%sp; or $sentinel,%fp,%fp
527___
528for(; $i<28 && $i<$NUM; $i++) {
529$code.=<<___;
530 ldx [$np+$i*8],@N[$i]
531___
532}
533$code.=<<___;
534 save %sp,-128,%sp; or $sentinel,%fp,%fp
535___
536for(; $i<$NUM; $i++) {
537$code.=<<___;
538 ldx [$np+$i*8],@N[$i]
539___
540}
541# load pwrtbl[pwr] ########################################################
542$code.=<<___;
543 save %sp,-128,%sp; or $sentinel,%fp,%fp
544
545 srlx $pwr, 32, %o4 ! unpack $pwr
546 srl $pwr, %g0, %o5
547 sub %o4, 5, %o4
548 mov $pwrtbl, %o7
549 sllx %o4, 32, $pwr ! re-pack $pwr
550 or %o5, $pwr, $pwr
551 srl %o5, %o4, %o5
552___
553 &load_ccr("%o7","%o5","%o4");
554$code.=<<___;
555 b .Lstride_$NUM
556 nop
557.align 16
558.Lstride_$NUM:
559___
560for($i=0; $i<14 && $i<$NUM; $i+=2) {
561 &load_b_pair("%o7",@B[$i],@B[$i+1]);
562}
563$code.=<<___;
564 save %sp,-128,%sp; or $sentinel,%fp,%fp
565___
566for(; $i<$NUM; $i+=2) {
567 &load_b_pair("%i7",@B[$i],@B[$i+1]);
568}
569$code.=<<___;
570 srax $pwr, 32, %o4 ! unpack $pwr
571 srl $pwr, %g0, %o5
572 sub %o4, 5, %o4
573 mov $pwrtbl, %i7
574 sllx %o4, 32, $pwr ! re-pack $pwr
575 or %o5, $pwr, $pwr
576 srl %o5, %o4, %o5
577___
578 &load_ccr("%i7","%o5","%o4",1);
579
580
581# magic ################################################################
582for($i=0; $i<5; $i++) {
583$code.=<<___;
584 .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
585 fbu,pn %fcc3,.Labort_$NUM
586#ifndef __arch64__
587 and %fp,$sentinel,$sentinel
588 brz,pn $sentinel,.Labort_$NUM
589#endif
590 nop
591___
592}
593$code.=<<___;
594 wr %o4, %g0, %ccr
595 .word 0x81b02920+$NUM-1 ! montmul $NUM-1
596 fbu,pn %fcc3,.Labort_$NUM
597#ifndef __arch64__
598 and %fp,$sentinel,$sentinel
599 brz,pn $sentinel,.Labort_$NUM
600#endif
601
602 srax $pwr, 32, %o4
603#ifdef __arch64__
604 brgez %o4,.Lstride_$NUM
605 restore
606 restore
607 restore
608 restore
609 restore
610#else
611 brgez %o4,.Lstride_$NUM
612 restore; and %fp,$sentinel,$sentinel
613 restore; and %fp,$sentinel,$sentinel
614 restore; and %fp,$sentinel,$sentinel
615 restore; and %fp,$sentinel,$sentinel
616 brz,pn $sentinel,.Labort1_$NUM
617 restore
618#endif
619___
620
621
622# save tp[$NUM] ########################################################
623for($i=0; $i<14 && $i<$NUM; $i++) {
624$code.=<<___;
625 movxtod @A[$i],@R[$i]
626___
627}
628$code.=<<___;
629#ifdef __arch64__
630 restore
631#else
632 and %fp,$sentinel,$sentinel
633 restore
634 and $sentinel,1,%o7
635 and %fp,$sentinel,$sentinel
636 srl %fp,0,%fp ! just in case?
637 or %o7,$sentinel,$sentinel
638 brz,a,pn $sentinel,.Ldone_$NUM
639 mov 0,%i0 ! return failure
640#endif
641___
642for($i=0; $i<$NUM; $i++) {
643$code.=<<___;
644 std @R[$i],[$tp+$i*8]
645___
646}
647$code.=<<___;
648 mov 1,%i0 ! return success
649.Ldone_$NUM:
650 ret
651 restore
652
653.Labort_$NUM:
654 restore
655 restore
656 restore
657 restore
658 restore
659.Labort1_$NUM:
660 restore
661
662 mov 0,%i0 ! return failure
663 ret
664 restore
665.type bn_pwr5_mont_t4_$NUM, #function
666.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
667___
668}
669
670for ($i=8;$i<=32;$i+=8) {
671 &generate_bn_pwr5_mont_t4($i);
672}
673
674
675{
676########################################################################
677# Fall-back subroutines
678#
679# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
680#
681($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
682 (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
683
684# int bn_mul_mont(
685$rp="%o0"; # u64 *rp,
686$ap="%o1"; # const u64 *ap,
687$bp="%o2"; # const u64 *bp,
688$np="%o3"; # const u64 *np,
689$n0p="%o4"; # const BN_ULONG *n0,
690$num="%o5"; # int num); # caller ensures that num is >=3
691$code.=<<___;
692.globl bn_mul_mont_t4
693.align 32
694bn_mul_mont_t4:
695 add %sp, STACK_BIAS, %g4 ! real top of stack
696 sll $num, 3, $num ! size in bytes
697 add $num, 63, %g1
698 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
699 sub %g4, %g1, %g1
700 andn %g1, 63, %g1 ! align at 64 byte
701 sub %g1, STACK_FRAME, %g1 ! new top of stack
702 sub %g1, %g4, %g1
703
704 save %sp, %g1, %sp
705___
706# +-------------------------------+<----- %sp
707# . .
708# +-------------------------------+<----- aligned at 64 bytes
709# | __int64 tmp[0] |
710# +-------------------------------+
711# . .
712# . .
713# +-------------------------------+<----- aligned at 64 bytes
714# . .
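# A worked example of the sizing code above: for $num == 16 (a 1024-bit
# modulus) the tp[] buffer is 16*8 = 128 bytes, already a multiple of
# 64, so the register-window save reserves STACK_FRAME plus 128 bytes
# below a 64-byte-aligned point under the caller's stack top.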
715($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
716($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
717($ovf,$i)=($t0,$t1);
718$code.=<<___;
719 ld [$n0p+0], $t0 ! pull n0[0..1] value
720 ld [$n0p+4], $t1
721 add %sp, STACK_BIAS+STACK_FRAME, $tp
722 ldx [$bp+0], $m0 ! m0=bp[0]
723 sllx $t1, 32, $n0
724 add $bp, 8, $bp
725 or $t0, $n0, $n0
726
727
728 ldx [$ap+0], $aj ! ap[0]
729
730 mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
731 umulxhi $aj, $m0, $hi0
732
733 ldx [$ap+8], $aj ! ap[1]
734 add $ap, 16, $ap
735 ldx [$np+0], $nj ! np[0]
736
737 mulx $lo0, $n0, $m1 ! "tp[0]"*n0
738
739 mulx $aj, $m0, $alo ! ap[1]*bp[0]
740 umulxhi $aj, $m0, $aj ! ahi=aj
741
742 mulx $nj, $m1, $lo1 ! np[0]*m1
743 umulxhi $nj, $m1, $hi1
744
745 ldx [$np+8], $nj ! np[1]
746
747 addcc $lo0, $lo1, $lo1
748 add $np, 16, $np
749 addxc %g0, $hi1, $hi1
750
751 mulx $nj, $m1, $nlo ! np[1]*m1
752 umulxhi $nj, $m1, $nj ! nhi=nj
753
754
755 ba .L1st
756 sub $num, 24, $cnt ! cnt=num-3
757
758.align 16
759.L1st:
760 addcc $alo, $hi0, $lo0
761 addxc $aj, %g0, $hi0
762
763 ldx [$ap+0], $aj ! ap[j]
764 addcc $nlo, $hi1, $lo1
765 add $ap, 8, $ap
766 addxc $nj, %g0, $hi1 ! nhi=nj
767
768 ldx [$np+0], $nj ! np[j]
769 mulx $aj, $m0, $alo ! ap[j]*bp[0]
770 add $np, 8, $np
771 umulxhi $aj, $m0, $aj ! ahi=aj
772
773 mulx $nj, $m1, $nlo ! np[j]*m1
774 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
775 umulxhi $nj, $m1, $nj ! nhi=nj
776 addxc %g0, $hi1, $hi1
777 stxa $lo1, [$tp]0xe2 ! tp[j-1]
778 add $tp, 8, $tp ! tp++
779
780 brnz,pt $cnt, .L1st
781 sub $cnt, 8, $cnt ! j--
782!.L1st
783 addcc $alo, $hi0, $lo0
784 addxc $aj, %g0, $hi0 ! ahi=aj
785
786 addcc $nlo, $hi1, $lo1
787 addxc $nj, %g0, $hi1
788 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
789 addxc %g0, $hi1, $hi1
790 stxa $lo1, [$tp]0xe2 ! tp[j-1]
791 add $tp, 8, $tp
792
793 addcc $hi0, $hi1, $hi1
794 addxc %g0, %g0, $ovf ! upmost overflow bit
795 stxa $hi1, [$tp]0xe2
796 add $tp, 8, $tp
797
798
799 ba .Louter
800 sub $num, 16, $i ! i=num-2
801
802.align 16
803.Louter:
804 ldx [$bp+0], $m0 ! m0=bp[i]
805 add $bp, 8, $bp
806
807 sub $ap, $num, $ap ! rewind
808 sub $np, $num, $np
809 sub $tp, $num, $tp
810
811 ldx [$ap+0], $aj ! ap[0]
812 ldx [$np+0], $nj ! np[0]
813
814 mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
815 ldx [$tp], $tj ! tp[0]
816 umulxhi $aj, $m0, $hi0
817 ldx [$ap+8], $aj ! ap[1]
818 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
819 mulx $aj, $m0, $alo ! ap[1]*bp[i]
820 addxc %g0, $hi0, $hi0
821 mulx $lo0, $n0, $m1 ! tp[0]*n0
822 umulxhi $aj, $m0, $aj ! ahi=aj
823 mulx $nj, $m1, $lo1 ! np[0]*m1
824 add $ap, 16, $ap
825 umulxhi $nj, $m1, $hi1
826 ldx [$np+8], $nj ! np[1]
827 add $np, 16, $np
828 addcc $lo1, $lo0, $lo1
829 mulx $nj, $m1, $nlo ! np[1]*m1
830 addxc %g0, $hi1, $hi1
831 umulxhi $nj, $m1, $nj ! nhi=nj
832
833
834 ba .Linner
835 sub $num, 24, $cnt ! cnt=num-3
836.align 16
837.Linner:
838 addcc $alo, $hi0, $lo0
839 ldx [$tp+8], $tj ! tp[j]
840 addxc $aj, %g0, $hi0 ! ahi=aj
841 ldx [$ap+0], $aj ! ap[j]
842 add $ap, 8, $ap
843 addcc $nlo, $hi1, $lo1
844 mulx $aj, $m0, $alo ! ap[j]*bp[i]
845 addxc $nj, %g0, $hi1 ! nhi=nj
846 ldx [$np+0], $nj ! np[j]
847 add $np, 8, $np
848 umulxhi $aj, $m0, $aj ! ahi=aj
849 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
850 mulx $nj, $m1, $nlo ! np[j]*m1
851 addxc %g0, $hi0, $hi0
852 umulxhi $nj, $m1, $nj ! nhi=nj
853 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
854 addxc %g0, $hi1, $hi1
855 stx $lo1, [$tp] ! tp[j-1]
856 add $tp, 8, $tp
857 brnz,pt $cnt, .Linner
858 sub $cnt, 8, $cnt
859!.Linner
860 ldx [$tp+8], $tj ! tp[j]
861 addcc $alo, $hi0, $lo0
862 addxc $aj, %g0, $hi0 ! ahi=aj
863 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
864 addxc %g0, $hi0, $hi0
865
866 addcc $nlo, $hi1, $lo1
867 addxc $nj, %g0, $hi1 ! nhi=nj
868 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
869 addxc %g0, $hi1, $hi1
870 stx $lo1, [$tp] ! tp[j-1]
871
872 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
873 addxccc $hi1, $hi0, $hi1
874 addxc %g0, %g0, $ovf
875 stx $hi1, [$tp+8]
876 add $tp, 16, $tp
877
878 brnz,pt $i, .Louter
879 sub $i, 8, $i
880
881
882 sub $ap, $num, $ap ! rewind
883 sub $np, $num, $np
884 sub $tp, $num, $tp
885 ba .Lsub
886 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
887
888.align 16
889.Lsub:
890 ldx [$tp], $tj
891 add $tp, 8, $tp
892 ldx [$np+0], $nj
893 add $np, 8, $np
894 subccc $tj, $nj, $t2 ! tp[j]-np[j]
895 srlx $tj, 32, $tj
896 srlx $nj, 32, $nj
897 subccc $tj, $nj, $t3
898 add $rp, 8, $rp
899 st $t2, [$rp-4] ! reverse order
900 st $t3, [$rp-8]
901 brnz,pt $cnt, .Lsub
902 sub $cnt, 8, $cnt
903
904 sub $np, $num, $np ! rewind
905 sub $tp, $num, $tp
906 sub $rp, $num, $rp
907
908 subccc $ovf, %g0, $ovf ! handle upmost overflow bit
909 ba .Lcopy
910 sub $num, 8, $cnt
911
912.align 16
913.Lcopy: ! conditional copy
914 ldx [$tp], $tj
915 ldx [$rp+0], $t2
916 stx %g0, [$tp] ! zap
917 add $tp, 8, $tp
918 movcs %icc, $tj, $t2
919 stx $t2, [$rp+0]
920 add $rp, 8, $rp
921 brnz $cnt, .Lcopy
922 sub $cnt, 8, $cnt
923
924 mov 1, %o0
925 ret
926 restore
927.type bn_mul_mont_t4, #function
928.size bn_mul_mont_t4, .-bn_mul_mont_t4
929___
930
931
932# int bn_mul_mont_gather5(
933$rp="%o0"; # u64 *rp,
934$ap="%o1"; # const u64 *ap,
935$bp="%o2"; # const u64 *pwrtbl,
936$np="%o3"; # const u64 *np,
937$n0p="%o4"; # const BN_ULONG *n0,
938$num="%o5"; # int num, # caller ensures that num is >=3
939 # int power);
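# Note: this is the same algorithm as bn_mul_mont_t4 above, except that
# each bp[i] is fetched from the scattered power table with
# load_ccr/load_b, so the gather of pwrtbl[power] stays constant-time.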
940$code.=<<___;
941.globl bn_mul_mont_gather5_t4
942.align 32
943bn_mul_mont_gather5_t4:
944 add %sp, STACK_BIAS, %g4 ! real top of stack
945 sll $num, 3, $num ! size in bytes
946 add $num, 63, %g1
947 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
948 sub %g4, %g1, %g1
949 andn %g1, 63, %g1 ! align at 64 byte
950 sub %g1, STACK_FRAME, %g1 ! new top of stack
951 sub %g1, %g4, %g1
952 LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
953
954 save %sp, %g1, %sp
955___
956# +-------------------------------+<----- %sp
957# . .
958# +-------------------------------+<----- aligned at 64 bytes
959# | __int64 tmp[0] |
960# +-------------------------------+
961# . .
962# . .
963# +-------------------------------+<----- aligned at 64 bytes
964# . .
965($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
966($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
967($ovf,$i)=($t0,$t1);
968 &load_ccr($bp,"%g4",$ccr);
969 &load_b($bp,$m0,"%o7"); # m0=bp[0]
970
971$code.=<<___;
972 ld [$n0p+0], $t0 ! pull n0[0..1] value
973 ld [$n0p+4], $t1
974 add %sp, STACK_BIAS+STACK_FRAME, $tp
975 sllx $t1, 32, $n0
976 or $t0, $n0, $n0
977
978
979 ldx [$ap+0], $aj ! ap[0]
980
981 mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
982 umulxhi $aj, $m0, $hi0
983
984 ldx [$ap+8], $aj ! ap[1]
985 add $ap, 16, $ap
986 ldx [$np+0], $nj ! np[0]
987
988 mulx $lo0, $n0, $m1 ! "tp[0]"*n0
989
990 mulx $aj, $m0, $alo ! ap[1]*bp[0]
991 umulxhi $aj, $m0, $aj ! ahi=aj
992
993 mulx $nj, $m1, $lo1 ! np[0]*m1
994 umulxhi $nj, $m1, $hi1
995
996 ldx [$np+8], $nj ! np[1]
997
998 addcc $lo0, $lo1, $lo1
999 add $np, 16, $np
1000 addxc %g0, $hi1, $hi1
1001
1002 mulx $nj, $m1, $nlo ! np[1]*m1
1003 umulxhi $nj, $m1, $nj ! nhi=nj
1004
1005
1006 ba .L1st_g5
1007 sub $num, 24, $cnt ! cnt=num-3
1008
1009.align 16
1010.L1st_g5:
1011 addcc $alo, $hi0, $lo0
1012 addxc $aj, %g0, $hi0
1013
1014 ldx [$ap+0], $aj ! ap[j]
1015 addcc $nlo, $hi1, $lo1
1016 add $ap, 8, $ap
1017 addxc $nj, %g0, $hi1 ! nhi=nj
1018
1019 ldx [$np+0], $nj ! np[j]
1020 mulx $aj, $m0, $alo ! ap[j]*bp[0]
1021 add $np, 8, $np
1022 umulxhi $aj, $m0, $aj ! ahi=aj
1023
1024 mulx $nj, $m1, $nlo ! np[j]*m1
1025 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
1026 umulxhi $nj, $m1, $nj ! nhi=nj
1027 addxc %g0, $hi1, $hi1
1028 stxa $lo1, [$tp]0xe2 ! tp[j-1]
1029 add $tp, 8, $tp ! tp++
1030
1031 brnz,pt $cnt, .L1st_g5
1032 sub $cnt, 8, $cnt ! j--
1033!.L1st_g5
1034 addcc $alo, $hi0, $lo0
1035 addxc $aj, %g0, $hi0 ! ahi=aj
1036
1037 addcc $nlo, $hi1, $lo1
1038 addxc $nj, %g0, $hi1
1039 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
1040 addxc %g0, $hi1, $hi1
1041 stxa $lo1, [$tp]0xe2 ! tp[j-1]
1042 add $tp, 8, $tp
1043
1044 addcc $hi0, $hi1, $hi1
1045 addxc %g0, %g0, $ovf ! upmost overflow bit
1046 stxa $hi1, [$tp]0xe2
1047 add $tp, 8, $tp
1048
1049
1050 ba .Louter_g5
1051 sub $num, 16, $i ! i=num-2
1052
1053.align 16
1054.Louter_g5:
1055 wr $ccr, %g0, %ccr
1056___
1057 &load_b($bp,$m0); # m0=bp[i]
1058$code.=<<___;
1059 sub $ap, $num, $ap ! rewind
1060 sub $np, $num, $np
1061 sub $tp, $num, $tp
1062
1063 ldx [$ap+0], $aj ! ap[0]
1064 ldx [$np+0], $nj ! np[0]
1065
1066 mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
1067 ldx [$tp], $tj ! tp[0]
1068 umulxhi $aj, $m0, $hi0
1069 ldx [$ap+8], $aj ! ap[1]
1070 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
1071 mulx $aj, $m0, $alo ! ap[1]*bp[i]
1072 addxc %g0, $hi0, $hi0
1073 mulx $lo0, $n0, $m1 ! tp[0]*n0
1074 umulxhi $aj, $m0, $aj ! ahi=aj
1075 mulx $nj, $m1, $lo1 ! np[0]*m1
1076 add $ap, 16, $ap
1077 umulxhi $nj, $m1, $hi1
1078 ldx [$np+8], $nj ! np[1]
1079 add $np, 16, $np
1080 addcc $lo1, $lo0, $lo1
1081 mulx $nj, $m1, $nlo ! np[1]*m1
1082 addxc %g0, $hi1, $hi1
1083 umulxhi $nj, $m1, $nj ! nhi=nj
1084
1085
1086 ba .Linner_g5
1087 sub $num, 24, $cnt ! cnt=num-3
1088.align 16
1089.Linner_g5:
1090 addcc $alo, $hi0, $lo0
1091 ldx [$tp+8], $tj ! tp[j]
1092 addxc $aj, %g0, $hi0 ! ahi=aj
1093 ldx [$ap+0], $aj ! ap[j]
1094 add $ap, 8, $ap
1095 addcc $nlo, $hi1, $lo1
1096 mulx $aj, $m0, $alo ! ap[j]*bp[i]
1097 addxc $nj, %g0, $hi1 ! nhi=nj
1098 ldx [$np+0], $nj ! np[j]
1099 add $np, 8, $np
1100 umulxhi $aj, $m0, $aj ! ahi=aj
1101 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
1102 mulx $nj, $m1, $nlo ! np[j]*m1
1103 addxc %g0, $hi0, $hi0
1104 umulxhi $nj, $m1, $nj ! nhi=nj
1105 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
1106 addxc %g0, $hi1, $hi1
1107 stx $lo1, [$tp] ! tp[j-1]
1108 add $tp, 8, $tp
1109 brnz,pt $cnt, .Linner_g5
1110 sub $cnt, 8, $cnt
1111!.Linner_g5
1112 ldx [$tp+8], $tj ! tp[j]
1113 addcc $alo, $hi0, $lo0
1114 addxc $aj, %g0, $hi0 ! ahi=aj
1115 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
1116 addxc %g0, $hi0, $hi0
1117
1118 addcc $nlo, $hi1, $lo1
1119 addxc $nj, %g0, $hi1 ! nhi=nj
1120 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
1121 addxc %g0, $hi1, $hi1
1122 stx $lo1, [$tp] ! tp[j-1]
1123
1124 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
1125 addxccc $hi1, $hi0, $hi1
1126 addxc %g0, %g0, $ovf
1127 stx $hi1, [$tp+8]
1128 add $tp, 16, $tp
1129
1130 brnz,pt $i, .Louter_g5
1131 sub $i, 8, $i
1132
1133
1134 sub $ap, $num, $ap ! rewind
1135 sub $np, $num, $np
1136 sub $tp, $num, $tp
1137 ba .Lsub_g5
1138 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
1139
1140.align 16
1141.Lsub_g5:
1142 ldx [$tp], $tj
1143 add $tp, 8, $tp
1144 ldx [$np+0], $nj
1145 add $np, 8, $np
1146 subccc $tj, $nj, $t2 ! tp[j]-np[j]
1147 srlx $tj, 32, $tj
1148 srlx $nj, 32, $nj
1149 subccc $tj, $nj, $t3
1150 add $rp, 8, $rp
1151 st $t2, [$rp-4] ! reverse order
1152 st $t3, [$rp-8]
1153 brnz,pt $cnt, .Lsub_g5
1154 sub $cnt, 8, $cnt
1155
1156 sub $np, $num, $np ! rewind
1157 sub $tp, $num, $tp
1158 sub $rp, $num, $rp
1159
1160 subccc $ovf, %g0, $ovf ! handle upmost overflow bit
1161 ba .Lcopy_g5
1162 sub $num, 8, $cnt
1163
1164.align 16
1165.Lcopy_g5: ! conditional copy
1166 ldx [$tp], $tj
1167 ldx [$rp+0], $t2
1168 stx %g0, [$tp] ! zap
1169 add $tp, 8, $tp
1170 movcs %icc, $tj, $t2
1171 stx $t2, [$rp+0]
1172 add $rp, 8, $rp
1173 brnz $cnt, .Lcopy_g5
1174 sub $cnt, 8, $cnt
1175
1176 mov 1, %o0
1177 ret
1178 restore
1179.type bn_mul_mont_gather5_t4, #function
1180.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
1181___
1182}
1183
1184
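# Note on the helpers emitted below: bn_flip_t4 copies a vector of
# 64-bit words while swapping the 32-bit halves of each word;
# bn_flip_n_scatter5_t4 performs the same flip and scatters the result
# into the power table at index pwr with a 32*8-byte stride; and
# bn_gather5_t4 reads one such entry back using the constant-time
# load_ccr/load_b gather.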
1185$code.=<<___;
1186.globl bn_flip_t4
1187.align 32
1188bn_flip_t4:
1189.Loop_flip:
1190 ld [%o1+0], %o4
1191 sub %o2, 1, %o2
1192 ld [%o1+4], %o5
1193 add %o1, 8, %o1
1194 st %o5, [%o0+0]
1195 st %o4, [%o0+4]
1196 brnz %o2, .Loop_flip
1197 add %o0, 8, %o0
1198 retl
1199 nop
1200.type bn_flip_t4, #function
1201.size bn_flip_t4, .-bn_flip_t4
1202
1203.globl bn_flip_n_scatter5_t4
1204.align 32
1205bn_flip_n_scatter5_t4:
1206 sll %o3, 3, %o3
1207 srl %o1, 1, %o1
1208 add %o3, %o2, %o2 ! &pwrtbl[pwr]
1209 sub %o1, 1, %o1
1210.Loop_flip_n_scatter5:
1211 ld [%o0+0], %o4 ! inp[i]
1212 ld [%o0+4], %o5
1213 add %o0, 8, %o0
1214 sllx %o5, 32, %o5
1215 or %o4, %o5, %o5
1216 stx %o5, [%o2]
1217 add %o2, 32*8, %o2
1218 brnz %o1, .Loop_flip_n_scatter5
1219 sub %o1, 1, %o1
1220 retl
1221 nop
1222.type bn_flip_n_scatter5_t4, #function
1223.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
1224
1225.globl bn_gather5_t4
1226.align 32
1227bn_gather5_t4:
1228___
1229 &load_ccr("%o2","%o3","%g1");
1230$code.=<<___;
1231 sub %o1, 1, %o1
1232.Loop_gather5:
1233___
1234 &load_b("%o2","%g1");
1235$code.=<<___;
1236 stx %g1, [%o0]
1237 add %o0, 8, %o0
1238 brnz %o1, .Loop_gather5
1239 sub %o1, 1, %o1
1240
1241 retl
1242 nop
1243.type bn_gather5_t4, #function
1244.size bn_gather5_t4, .-bn_gather5_t4
1245
1246.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
1247.align 4
1248___
1249
1250&emit_assembler();
1251
1252close STDOUT or die "error closing STDOUT: $!";
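
Judging from the argument handling near the top of the script ($output = pop and open STDOUT, ">$output") and the closing &emit_assembler() call, the generator is run with the target assembler file as its final argument. A hypothetical invocation (the OpenSSL build supplies the real output path and runs the script where sparcv9_modes.pl can be found):

    perl sparct4-mont.pl sparct4-mont.S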