#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2012.
#
# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# lengths. This is without a dedicated squaring procedure. On T4 the
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.
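# Below is a minimal reference sketch (an editorial addition, not part
# of the original CRYPTOGAMS module) of the whole-operand Montgomery
# multiplication that the word-serial code computes. It is kept in
# comments so the generated assembly is unaffected; here the
# hypothetical n0 argument is -np^-1 mod R for the full R=2^(64*num),
# whereas the assembly uses the 64-bit word-level n0 and interleaves
# the reduction one word at a time:
#
#	use Math::BigInt;
#
#	sub mont_mul_ref {			# rp = ap*bp/R mod np
#	    my ($ap,$bp,$np,$n0,$num) = @_;	# Math::BigInt operands
#	    my $R = Math::BigInt->new(1)->blsft(64*$num);
#	    my $t = $ap->copy()->bmul($bp);		# t = ap*bp
#	    my $m = $t->copy()->bmul($n0)->bmod($R);	# m = t*n0 mod R
#	    $t->badd($m->bmul($np))->brsft(64*$num);	# t = (t+m*np)/R
#	    $t->bsub($np) if ($t->bcmp($np) >= 0);	# final subtraction
#	    return $t;				# t == ap*bp*R^-1 mod np
#	}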
$output = pop;
open STDOUT,">$output" or die "can't open $output: $!";

$frame = "STACK_FRAME";
$bias = "STACK_BIAS";

$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr
___

($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# BN_ULONG *rp,
$ap="%o1";	# const BN_ULONG *ap,
$bp="%o2";	# const BN_ULONG *bp,
$np="%o3";	# const BN_ULONG *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is even
				# and >=6
$code.=<<___;
.globl	bn_mul_mont_vis3
.align	32
bn_mul_mont_vis3:
	add	%sp,	$bias,	%g4	! real top of stack
	sll	$num,	2,	$num	! size in bytes
	add	$num,	63,	%g5
	andn	%g5,	63,	%g5	! buffer size rounded up to 64 bytes
	add	%g5,	%g5,	%g1
	add	%g5,	%g1,	%g1	! 3*buffer size
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1	! align at 64 bytes
	sub	%g1,	$frame,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1

	save	%sp,	%g1,	%sp
___
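# The save above leaves, below the register-window frame, room for
# three buffers of %g5 bytes each: the temporary tp[] plus the 64-bit
# converted ap[] and np[], which live interleaved at $anp: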

# +-------------------------------+<-----	%sp
# .				.
# +-------------------------------+<-----	aligned at 64 bytes
# | __int64 tmp[0]		|
# +-------------------------------+
# .				.
# .				.
# +-------------------------------+<-----	aligned at 64 bytes
# | __int64 ap[1..0]		|	converted ap[]
# +-------------------------------+
# | __int64 np[1..0]		|	converted np[]
# +-------------------------------+
# | __int64 ap[3..2]		|
# .				.
# .				.
# +-------------------------------+
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	add	%sp, $bias+$frame, $tp
	ld	[$n0p+4],	$t1
	add	$tp,	%g5,	$anp
	ld	[$bp+0],	$t2	! m0=bp[0]
	sllx	$t1,	32,	$n0
	ld	[$bp+4],	$t3
	or	$t0,	$n0,	$n0
	add	$bp,	8,	$bp

	ld	[$ap+0],	$t0	! ap[0]
	sllx	$t3,	32,	$m0
	ld	[$ap+4],	$t1
	or	$t2,	$m0,	$m0

	ld	[$ap+8],	$t2	! ap[1]
	sllx	$t1,	32,	$aj
	ld	[$ap+12],	$t3
	or	$t0,	$aj,	$aj
	add	$ap,	16,	$ap
	stx	$aj,	[$anp]		! converted ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ld	[$np+0],	$t0	! np[0]
	sllx	$t3,	32,	$aj
	ld	[$np+4],	$t1
	or	$t2,	$aj,	$aj

	ld	[$np+8],	$t2	! np[1]
	sllx	$t1,	32,	$nj
	ld	[$np+12],	$t3
	or	$t0,	$nj,	$nj
	add	$np,	16,	$np
	stx	$nj,	[$anp+8]	! converted np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
	stx	$aj,	[$anp+16]	! converted ap[1]

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	sllx	$t3,	32,	$nj
	or	$t2,	$nj,	$nj
	stx	$nj,	[$anp+24]	! converted np[1]
	add	$anp,	32,	$anp

	addcc	$lo0,	$lo1,	$lo1
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st
	sub	$num,	24,	$cnt	! cnt=num-3

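	! First pass: compute tp[] = ap[]*bp[0] + m1*np[] one 64-bit word
	! per iteration, where m1 is chosen so that the lowest word of the
	! sum cancels. Each iteration also assembles two 32-bit loads into
	! a 64-bit operand and stashes the converted ap[j]/np[j] at $anp
	! for the subsequent outer-loop passes.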
.align	16
.L1st:
	ld	[$ap+0],	$t0	! ap[j]
	addcc	$alo,	$hi0,	$lo0
	ld	[$ap+4],	$t1
	addxc	$aj,	%g0,	$hi0

	sllx	$t1,	32,	$aj
	add	$ap,	8,	$ap
	or	$t0,	$aj,	$aj
	stx	$aj,	[$anp]		! converted ap[j]

	ld	[$np+0],	$t2	! np[j]
	addcc	$nlo,	$hi1,	$lo1
	ld	[$np+4],	$t3
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	sllx	$t3,	32,	$nj
	add	$np,	8,	$np
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	or	$t2,	$nj,	$nj
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	stx	$nj,	[$anp+8]	! converted np[j]
	add	$anp,	16,	$anp	! anp++

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st
	sub	$cnt,	8,	$cnt	! j--
!.L1st
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stx	$hi1,	[$tp]
	add	$tp,	8,	$tp

	ba	.Louter
	sub	$num,	16,	$i	! i=num-2

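	! Outer loop: fold the next word bp[i] into the accumulator,
	! tp[] = (tp[] + ap[]*bp[i] + m1*np[])/2^64, reusing the 64-bit
	! ap[]/np[] converted during the first pass.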
.align	16
.Louter:
	ld	[$bp+0],	$t2	! m0=bp[i]
	ld	[$bp+4],	$t3

	sub	$anp,	$num,	$anp	! rewind
	sub	$tp,	$num,	$tp
	sub	$anp,	$num,	$anp

	add	$bp,	8,	$bp
	sllx	$t3,	32,	$m0
	ldx	[$anp+0],	$aj	! ap[0]
	or	$t2,	$m0,	$m0
	ldx	[$anp+8],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],	$tj		! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$anp+16],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$anp+24],	$nj	! np[1]
	add	$anp,	32,	$anp
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$anp+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$anp+8],	$nj	! np[j]
	add	$anp,	16,	$anp
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner
	sub	$cnt,	8,	$cnt
!.Linner
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter
	sub	$i,	8,	$i

	sub	$anp,	$num,	$anp	! rewind
	sub	$tp,	$num,	$tp
	sub	$anp,	$num,	$anp
	ba	.Lsub
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

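	! .Lsub writes tp[] - np[] to rp[] in 32-bit halves (hence the
	! "reverse order" stores); the final borrow is combined with $ovf
	! to decide in .Lcopy which of the two candidates is the result.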
.align	16
.Lsub:
	ldx	[$tp],	$tj
	add	$tp,	8,	$tp
	ldx	[$anp+8],	$nj
	add	$anp,	16,	$anp
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub
	sub	$cnt,	8,	$cnt

	sub	$anp,	$num,	$anp	! rewind
	sub	$tp,	$num,	$tp
	sub	$anp,	$num,	$anp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy
	sub	$num,	8,	$cnt

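	! Conditional copy: if the subtraction above borrowed (tp[] was
	! already fully reduced), movcs overwrites rp[] with tp[];
	! otherwise rp[] keeps tp[]-np[]. tp[] and the converted ap[]/np[]
	! are zapped on the way through.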
.align	16
.Lcopy:					! conditional copy
	ld	[$tp+0],	$t0
	ld	[$tp+4],	$t1
	ld	[$rp+0],	$t2
	ld	[$rp+4],	$t3
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	stx	%g0,	[$anp]		! zap
	stx	%g0,	[$anp+8]
	add	$anp,	16,	$anp
	movcs	%icc,	$t0,	$t2
	movcs	%icc,	$t1,	$t3
	st	$t3,	[$rp+0]		! flip order
	st	$t2,	[$rp+4]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_vis3, #function
.size	bn_mul_mont_vis3, .-bn_mul_mont_vis3
.asciz	"Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# The purpose of this subroutine is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option of producing a "universal" binary
# and letting the programmer detect at run-time whether the current CPU
# is VIS capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
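# A worked example of the encoding (an editorial illustration derived
# from the formula above, not output captured from the tool):
# "umulxhi %o1,%g4,%o3" maps to rd=11, rs1=9, rs2=4, opf=0x016, so
# unvis3() emits
#
#	.word	0x97b242c4 !umulxhi	%o1,%g4,%o3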
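# Post-process the generated code: evaluate any `...` constructs and
# replace the three VIS3 mnemonics with their explicit .word encodings
# via unvis3().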
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";