1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 |
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 |
|
---|
17 | # December 2005
|
---|
18 | #
|
---|
19 | # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
|
---|
20 | # for undertaking the effort are multiple. First of all, UltraSPARC is not
|
---|
21 | # the whole SPARCv9 universe and other VIS-free implementations deserve
|
---|
22 | # optimized code as much. Secondly, newly introduced UltraSPARC T1,
|
---|
23 | # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
|
---|
24 | # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
|
---|
25 | # several integrated RSA/DSA accelerator circuits accessible through
|
---|
26 | # kernel driver [only(*)], but having decent user-land software
|
---|
27 | # implementation is important too. Finally, there was the desire to
|
---|
28 | # experiment with a dedicated squaring procedure. Yes, this module
|
---|
29 | # implements one, because it was easiest to draft it in SPARCv9
|
---|
30 | # instructions...
|
---|
31 |
|
---|
32 | # (*) Engine accessing the driver in question is on my TODO list.
|
---|
33 | # For reference, accelerator is estimated to give 6 to 10 times
|
---|
34 | # improvement on single-threaded RSA sign. It should be noted
|
---|
35 | # that 6-10x improvement coefficient does not actually mean
|
---|
36 | # something extraordinary in terms of absolute [single-threaded]
|
---|
37 | # performance, as SPARCv9 instruction set is by all means least
|
---|
38 | # suitable for high performance crypto among other 64 bit
|
---|
39 | # platforms. 6-10x factor simply places T1 in same performance
|
---|
40 | # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
|
---|
41 | # appear impressive at all, but it's the sign operation which is
|
---|
42 | # far more critical/interesting.
|
---|
43 |
|
---|
44 | # You might notice that inner loops are modulo-scheduled:-) This has
|
---|
45 | # essentially negligible impact on UltraSPARC performance, it's
|
---|
46 | # Fujitsu SPARC64 V users who should notice and hopefully appreciate
|
---|
47 | # the advantage... Currently this module surpasses sparcv9a-mont.pl
|
---|
48 | # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
|
---|
49 | # module still has hidden potential [see TODO list there], which is
|
---|
50 | # estimated to be larger than 20%...
|
---|
51 |
|
---|
# The last command-line argument, when present, names the file that receives
# the generated assembly; STDOUT is redirected to it.  The three-argument
# form of open keeps the file name from being parsed for mode characters,
# and a failed open is reported here instead of having its result ignored.
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
|
---|
53 |
|
---|
# Register assignment for the generated routine, whose C prototype is
#
#	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#			const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# The six arguments are addressed through %i0..%i5, in declaration order.
($rp,$ap,$bp,$np,$n0,$num)=map("%i$_",(0..5));

# Frame size and stack bias stay symbolic; the names are emitted verbatim
# into the assembly text (presumably resolved by the preprocessor through
# the header the generated file includes -- confirm against sparc_arch.h).
($frame,$bias)=("STACK_FRAME","STACK_BIAS");

# Carries and accumulators live in %o0..%o4; $car2 only ever needs 1 bit.
($car0,$car1,$car2,$acc0,$acc1)=map("%o$_",(0..4));

# $mask keeps a 32-bit constant (what a waste...); $tmp0/$tmp1 are temporaries.
($mask,$tmp0,$tmp1)=map("%g$_",(1,4,5));

# Loop counters, the two multipliers, the tp pointer and the current
# ap/np/tp words occupy %l0..%l7.
($i,$j,$mul0,$mul1,$tp,$apj,$npj,$tpj)=map("%l$_",(0..7));

# Symbol name of the emitted function.
$fname="bn_mul_mont_int";
|
---|
# Assembly template for the general multiplication path of $fname.  The
# text is kept in $code and printed at the end of the script; the Perl
# register variables are interpolated into it.
#
# Flow of the emitted code, as far as the template itself shows:
#  - entry: returns 0 straight away when num (%o5) is below 4 words;
#  - .Lenter: opens a register window, turns num into a byte count (num*4),
#    replaces $n0 by the word it points to, and carves a scratch vector
#    (addressed through $tp) out of the stack, aligned down to a 1024-byte
#    boundary; when ap and bp are the same pointer it branches to
#    .Lbn_sqr_mont;
#  - .L1st: first pass using bp[0], storing the result words through $tp;
#  - .Louter/.Linner: one pass per remaining word bp[i], adding into tp[];
#  - .Ltail: .Lsub stores tp[]-np[] into rp[], then .Lcopy overwrites rp[]
#    with tp[] when the final borrow (carry flag) is set and zeroes tp[];
#    the routine then returns 1.
85 | $code=<<___;
|
---|
86 | #ifndef __ASSEMBLER__
|
---|
87 | # define __ASSEMBLER__ 1
|
---|
88 | #endif
|
---|
89 | #include "crypto/sparc_arch.h"
|
---|
90 |
|
---|
91 | .section ".text",#alloc,#execinstr
|
---|
92 |
|
---|
93 | .global $fname
|
---|
94 | .align 32
|
---|
95 | $fname:
|
---|
96 | cmp %o5,4 ! 128 bits minimum
|
---|
97 | bge,pt %icc,.Lenter
|
---|
98 | sethi %hi(0xffffffff),$mask
|
---|
99 | retl
|
---|
100 | clr %o0
|
---|
101 | .align 32
|
---|
102 | .Lenter:
|
---|
103 | save %sp,-$frame,%sp
|
---|
104 | sll $num,2,$num ! num*=4
|
---|
105 | or $mask,%lo(0xffffffff),$mask
|
---|
106 | ld [$n0],$n0
|
---|
107 | cmp $ap,$bp
|
---|
108 | and $num,$mask,$num
|
---|
109 | ld [$bp],$mul0 ! bp[0]
|
---|
110 | nop
|
---|
111 |
|
---|
112 | add %sp,$bias,%o7 ! real top of stack
|
---|
113 | ld [$ap],$car0 ! ap[0] ! redundant in squaring context
|
---|
114 | sub %o7,$num,%o7
|
---|
115 | ld [$ap+4],$apj ! ap[1]
|
---|
116 | and %o7,-1024,%o7
|
---|
117 | ld [$np],$car1 ! np[0]
|
---|
118 | sub %o7,$bias,%sp ! alloca
|
---|
119 | ld [$np+4],$npj ! np[1]
|
---|
120 | be,pt SIZE_T_CC,.Lbn_sqr_mont
|
---|
121 | mov 12,$j
|
---|
122 |
|
---|
123 | mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
|
---|
124 | mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
|
---|
125 | and $car0,$mask,$acc0
|
---|
126 | add %sp,$bias+$frame,$tp
|
---|
127 | ld [$ap+8],$apj !prologue!
|
---|
128 |
|
---|
129 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0
|
---|
130 | and $mul1,$mask,$mul1
|
---|
131 |
|
---|
132 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
|
---|
133 | mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
|
---|
134 | srlx $car0,32,$car0
|
---|
135 | add $acc0,$car1,$car1
|
---|
136 | ld [$np+8],$npj !prologue!
|
---|
137 | srlx $car1,32,$car1
|
---|
138 | mov $tmp0,$acc0 !prologue!
|
---|
139 |
|
---|
140 | .L1st:
|
---|
141 | mulx $apj,$mul0,$tmp0
|
---|
142 | mulx $npj,$mul1,$tmp1
|
---|
143 | add $acc0,$car0,$car0
|
---|
144 | ld [$ap+$j],$apj ! ap[j]
|
---|
145 | and $car0,$mask,$acc0
|
---|
146 | add $acc1,$car1,$car1
|
---|
147 | ld [$np+$j],$npj ! np[j]
|
---|
148 | srlx $car0,32,$car0
|
---|
149 | add $acc0,$car1,$car1
|
---|
150 | add $j,4,$j ! j++
|
---|
151 | mov $tmp0,$acc0
|
---|
152 | st $car1,[$tp]
|
---|
153 | cmp $j,$num
|
---|
154 | mov $tmp1,$acc1
|
---|
155 | srlx $car1,32,$car1
|
---|
156 | bl %icc,.L1st
|
---|
157 | add $tp,4,$tp ! tp++
|
---|
158 | !.L1st
|
---|
159 |
|
---|
160 | mulx $apj,$mul0,$tmp0 !epilogue!
|
---|
161 | mulx $npj,$mul1,$tmp1
|
---|
162 | add $acc0,$car0,$car0
|
---|
163 | and $car0,$mask,$acc0
|
---|
164 | add $acc1,$car1,$car1
|
---|
165 | srlx $car0,32,$car0
|
---|
166 | add $acc0,$car1,$car1
|
---|
167 | st $car1,[$tp]
|
---|
168 | srlx $car1,32,$car1
|
---|
169 |
|
---|
170 | add $tmp0,$car0,$car0
|
---|
171 | and $car0,$mask,$acc0
|
---|
172 | add $tmp1,$car1,$car1
|
---|
173 | srlx $car0,32,$car0
|
---|
174 | add $acc0,$car1,$car1
|
---|
175 | st $car1,[$tp+4]
|
---|
176 | srlx $car1,32,$car1
|
---|
177 |
|
---|
178 | add $car0,$car1,$car1
|
---|
179 | st $car1,[$tp+8]
|
---|
180 | srlx $car1,32,$car2
|
---|
181 | |
---|
182 |
|
---|
183 | mov 4,$i ! i++
|
---|
184 | ld [$bp+4],$mul0 ! bp[1]
|
---|
185 | .Louter:
|
---|
186 | add %sp,$bias+$frame,$tp
|
---|
187 | ld [$ap],$car0 ! ap[0]
|
---|
188 | ld [$ap+4],$apj ! ap[1]
|
---|
189 | ld [$np],$car1 ! np[0]
|
---|
190 | ld [$np+4],$npj ! np[1]
|
---|
191 | ld [$tp],$tmp1 ! tp[0]
|
---|
192 | ld [$tp+4],$tpj ! tp[1]
|
---|
193 | mov 12,$j
|
---|
194 |
|
---|
195 | mulx $car0,$mul0,$car0
|
---|
196 | mulx $apj,$mul0,$tmp0 !prologue!
|
---|
197 | add $tmp1,$car0,$car0
|
---|
198 | ld [$ap+8],$apj !prologue!
|
---|
199 | and $car0,$mask,$acc0
|
---|
200 |
|
---|
201 | mulx $n0,$acc0,$mul1
|
---|
202 | and $mul1,$mask,$mul1
|
---|
203 |
|
---|
204 | mulx $car1,$mul1,$car1
|
---|
205 | mulx $npj,$mul1,$acc1 !prologue!
|
---|
206 | srlx $car0,32,$car0
|
---|
207 | add $acc0,$car1,$car1
|
---|
208 | ld [$np+8],$npj !prologue!
|
---|
209 | srlx $car1,32,$car1
|
---|
210 | mov $tmp0,$acc0 !prologue!
|
---|
211 |
|
---|
212 | .Linner:
|
---|
213 | mulx $apj,$mul0,$tmp0
|
---|
214 | mulx $npj,$mul1,$tmp1
|
---|
215 | add $tpj,$car0,$car0
|
---|
216 | ld [$ap+$j],$apj ! ap[j]
|
---|
217 | add $acc0,$car0,$car0
|
---|
218 | add $acc1,$car1,$car1
|
---|
219 | ld [$np+$j],$npj ! np[j]
|
---|
220 | and $car0,$mask,$acc0
|
---|
221 | ld [$tp+8],$tpj ! tp[j]
|
---|
222 | srlx $car0,32,$car0
|
---|
223 | add $acc0,$car1,$car1
|
---|
224 | add $j,4,$j ! j++
|
---|
225 | mov $tmp0,$acc0
|
---|
226 | st $car1,[$tp] ! tp[j-1]
|
---|
227 | srlx $car1,32,$car1
|
---|
228 | mov $tmp1,$acc1
|
---|
229 | cmp $j,$num
|
---|
230 | bl %icc,.Linner
|
---|
231 | add $tp,4,$tp ! tp++
|
---|
232 | !.Linner
|
---|
233 |
|
---|
234 | mulx $apj,$mul0,$tmp0 !epilogue!
|
---|
235 | mulx $npj,$mul1,$tmp1
|
---|
236 | add $tpj,$car0,$car0
|
---|
237 | add $acc0,$car0,$car0
|
---|
238 | ld [$tp+8],$tpj ! tp[j]
|
---|
239 | and $car0,$mask,$acc0
|
---|
240 | add $acc1,$car1,$car1
|
---|
241 | srlx $car0,32,$car0
|
---|
242 | add $acc0,$car1,$car1
|
---|
243 | st $car1,[$tp] ! tp[j-1]
|
---|
244 | srlx $car1,32,$car1
|
---|
245 |
|
---|
246 | add $tpj,$car0,$car0
|
---|
247 | add $tmp0,$car0,$car0
|
---|
248 | and $car0,$mask,$acc0
|
---|
249 | add $tmp1,$car1,$car1
|
---|
250 | add $acc0,$car1,$car1
|
---|
251 | st $car1,[$tp+4] ! tp[j-1]
|
---|
252 | srlx $car0,32,$car0
|
---|
253 | add $i,4,$i ! i++
|
---|
254 | srlx $car1,32,$car1
|
---|
255 |
|
---|
256 | add $car0,$car1,$car1
|
---|
257 | cmp $i,$num
|
---|
258 | add $car2,$car1,$car1
|
---|
259 | st $car1,[$tp+8]
|
---|
260 |
|
---|
261 | srlx $car1,32,$car2
|
---|
262 | bl,a %icc,.Louter
|
---|
263 | ld [$bp+$i],$mul0 ! bp[i]
|
---|
264 | !.Louter
|
---|
265 |
|
---|
266 | add $tp,12,$tp
|
---|
267 | |
---|
268 |
|
---|
269 | .Ltail:
|
---|
270 | add $np,$num,$np
|
---|
271 | add $rp,$num,$rp
|
---|
272 | sub %g0,$num,%o7 ! k=-num
|
---|
273 | ba .Lsub
|
---|
274 | subcc %g0,%g0,%g0 ! clear %icc.c
|
---|
275 | .align 16
|
---|
276 | .Lsub:
|
---|
277 | ld [$tp+%o7],%o0
|
---|
278 | ld [$np+%o7],%o1
|
---|
279 | subccc %o0,%o1,%o1 ! tp[j]-np[j]
|
---|
280 | add $rp,%o7,$i
|
---|
281 | add %o7,4,%o7
|
---|
282 | brnz %o7,.Lsub
|
---|
283 | st %o1,[$i]
|
---|
284 | subccc $car2,0,$car2 ! handle upmost overflow bit
|
---|
285 | sub %g0,$num,%o7
|
---|
286 |
|
---|
287 | .Lcopy:
|
---|
288 | ld [$tp+%o7],%o1 ! conditional copy
|
---|
289 | ld [$rp+%o7],%o0
|
---|
290 | st %g0,[$tp+%o7] ! zap tp
|
---|
291 | movcs %icc,%o1,%o0
|
---|
292 | st %o0,[$rp+%o7]
|
---|
293 | add %o7,4,%o7
|
---|
294 | brnz %o7,.Lcopy
|
---|
295 | nop
|
---|
296 | mov 1,%i0
|
---|
297 | ret
|
---|
298 | restore
|
---|
299 | ___
|
---|
300 | |
---|
301 |
|
---|
302 | ########
|
---|
303 | ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
|
---|
304 | ######## code without following dedicated squaring procedure.
|
---|
305 | ########
|
---|
# $sbit: register the squaring code uses to carry the overflow from one
# word to the next while partial products are doubled.
306 | $sbit="%o5";
|
---|
307 |
|
---|
# Appends the dedicated squaring path to $code.  The multiplication path
# branches to .Lbn_sqr_mont when ap == bp; products of words of ap are
# doubled on the fly (add x,x,x with $sbit carrying the overflow), and the
# code rejoins the common .Ltail for the final subtract/copy step.  The
# .type/.size directives and the identification string that close the
# function are emitted from here as well.
308 | $code.=<<___;
|
---|
309 | .align 32
|
---|
310 | .Lbn_sqr_mont:
|
---|
311 | mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
|
---|
312 | mulx $apj,$mul0,$tmp0 !prologue!
|
---|
313 | and $car0,$mask,$acc0
|
---|
314 | add %sp,$bias+$frame,$tp
|
---|
315 | ld [$ap+8],$apj !prologue!
|
---|
316 |
|
---|
317 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0
|
---|
318 | srlx $car0,32,$car0
|
---|
319 | and $mul1,$mask,$mul1
|
---|
320 |
|
---|
321 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
|
---|
322 | mulx $npj,$mul1,$acc1 !prologue!
|
---|
323 | and $car0,1,$sbit
|
---|
324 | ld [$np+8],$npj !prologue!
|
---|
325 | srlx $car0,1,$car0
|
---|
326 | add $acc0,$car1,$car1
|
---|
327 | srlx $car1,32,$car1
|
---|
328 | mov $tmp0,$acc0 !prologue!
|
---|
329 |
|
---|
330 | .Lsqr_1st:
|
---|
331 | mulx $apj,$mul0,$tmp0
|
---|
332 | mulx $npj,$mul1,$tmp1
|
---|
333 | add $acc0,$car0,$car0 ! ap[j]*a0+c0
|
---|
334 | add $acc1,$car1,$car1
|
---|
335 | ld [$ap+$j],$apj ! ap[j]
|
---|
336 | and $car0,$mask,$acc0
|
---|
337 | ld [$np+$j],$npj ! np[j]
|
---|
338 | srlx $car0,32,$car0
|
---|
339 | add $acc0,$acc0,$acc0
|
---|
340 | or $sbit,$acc0,$acc0
|
---|
341 | mov $tmp1,$acc1
|
---|
342 | srlx $acc0,32,$sbit
|
---|
343 | add $j,4,$j ! j++
|
---|
344 | and $acc0,$mask,$acc0
|
---|
345 | cmp $j,$num
|
---|
346 | add $acc0,$car1,$car1
|
---|
347 | st $car1,[$tp]
|
---|
348 | mov $tmp0,$acc0
|
---|
349 | srlx $car1,32,$car1
|
---|
350 | bl %icc,.Lsqr_1st
|
---|
351 | add $tp,4,$tp ! tp++
|
---|
352 | !.Lsqr_1st
|
---|
353 |
|
---|
354 | mulx $apj,$mul0,$tmp0 ! epilogue
|
---|
355 | mulx $npj,$mul1,$tmp1
|
---|
356 | add $acc0,$car0,$car0 ! ap[j]*a0+c0
|
---|
357 | add $acc1,$car1,$car1
|
---|
358 | and $car0,$mask,$acc0
|
---|
359 | srlx $car0,32,$car0
|
---|
360 | add $acc0,$acc0,$acc0
|
---|
361 | or $sbit,$acc0,$acc0
|
---|
362 | srlx $acc0,32,$sbit
|
---|
363 | and $acc0,$mask,$acc0
|
---|
364 | add $acc0,$car1,$car1
|
---|
365 | st $car1,[$tp]
|
---|
366 | srlx $car1,32,$car1
|
---|
367 |
|
---|
368 | add $tmp0,$car0,$car0 ! ap[j]*a0+c0
|
---|
369 | add $tmp1,$car1,$car1
|
---|
370 | and $car0,$mask,$acc0
|
---|
371 | srlx $car0,32,$car0
|
---|
372 | add $acc0,$acc0,$acc0
|
---|
373 | or $sbit,$acc0,$acc0
|
---|
374 | srlx $acc0,32,$sbit
|
---|
375 | and $acc0,$mask,$acc0
|
---|
376 | add $acc0,$car1,$car1
|
---|
377 | st $car1,[$tp+4]
|
---|
378 | srlx $car1,32,$car1
|
---|
379 |
|
---|
380 | add $car0,$car0,$car0
|
---|
381 | or $sbit,$car0,$car0
|
---|
382 | add $car0,$car1,$car1
|
---|
383 | st $car1,[$tp+8]
|
---|
384 | srlx $car1,32,$car2
|
---|
385 | |
---|
386 |
|
---|
387 | ld [%sp+$bias+$frame],$tmp0 ! tp[0]
|
---|
388 | ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
|
---|
389 | ld [%sp+$bias+$frame+8],$tpj ! tp[2]
|
---|
390 | ld [$ap+4],$mul0 ! ap[1]
|
---|
391 | ld [$ap+8],$apj ! ap[2]
|
---|
392 | ld [$np],$car1 ! np[0]
|
---|
393 | ld [$np+4],$npj ! np[1]
|
---|
394 | mulx $n0,$tmp0,$mul1
|
---|
395 |
|
---|
396 | mulx $mul0,$mul0,$car0
|
---|
397 | and $mul1,$mask,$mul1
|
---|
398 |
|
---|
399 | mulx $car1,$mul1,$car1
|
---|
400 | mulx $npj,$mul1,$acc1
|
---|
401 | add $tmp0,$car1,$car1
|
---|
402 | and $car0,$mask,$acc0
|
---|
403 | ld [$np+8],$npj ! np[2]
|
---|
404 | srlx $car1,32,$car1
|
---|
405 | add $tmp1,$car1,$car1
|
---|
406 | srlx $car0,32,$car0
|
---|
407 | add $acc0,$car1,$car1
|
---|
408 | and $car0,1,$sbit
|
---|
409 | add $acc1,$car1,$car1
|
---|
410 | srlx $car0,1,$car0
|
---|
411 | mov 12,$j
|
---|
412 | st $car1,[%sp+$bias+$frame] ! tp[0]=
|
---|
413 | srlx $car1,32,$car1
|
---|
414 | add %sp,$bias+$frame+4,$tp
|
---|
415 |
|
---|
416 | .Lsqr_2nd:
|
---|
417 | mulx $apj,$mul0,$acc0
|
---|
418 | mulx $npj,$mul1,$acc1
|
---|
419 | add $acc0,$car0,$car0
|
---|
420 | add $tpj,$sbit,$sbit
|
---|
421 | ld [$ap+$j],$apj ! ap[j]
|
---|
422 | and $car0,$mask,$acc0
|
---|
423 | ld [$np+$j],$npj ! np[j]
|
---|
424 | srlx $car0,32,$car0
|
---|
425 | add $acc1,$car1,$car1
|
---|
426 | ld [$tp+8],$tpj ! tp[j]
|
---|
427 | add $acc0,$acc0,$acc0
|
---|
428 | add $j,4,$j ! j++
|
---|
429 | add $sbit,$acc0,$acc0
|
---|
430 | srlx $acc0,32,$sbit
|
---|
431 | and $acc0,$mask,$acc0
|
---|
432 | cmp $j,$num
|
---|
433 | add $acc0,$car1,$car1
|
---|
434 | st $car1,[$tp] ! tp[j-1]
|
---|
435 | srlx $car1,32,$car1
|
---|
436 | bl %icc,.Lsqr_2nd
|
---|
437 | add $tp,4,$tp ! tp++
|
---|
438 | !.Lsqr_2nd
|
---|
439 |
|
---|
440 | mulx $apj,$mul0,$acc0
|
---|
441 | mulx $npj,$mul1,$acc1
|
---|
442 | add $acc0,$car0,$car0
|
---|
443 | add $tpj,$sbit,$sbit
|
---|
444 | and $car0,$mask,$acc0
|
---|
445 | srlx $car0,32,$car0
|
---|
446 | add $acc1,$car1,$car1
|
---|
447 | add $acc0,$acc0,$acc0
|
---|
448 | add $sbit,$acc0,$acc0
|
---|
449 | srlx $acc0,32,$sbit
|
---|
450 | and $acc0,$mask,$acc0
|
---|
451 | add $acc0,$car1,$car1
|
---|
452 | st $car1,[$tp] ! tp[j-1]
|
---|
453 | srlx $car1,32,$car1
|
---|
454 |
|
---|
455 | add $car0,$car0,$car0
|
---|
456 | add $sbit,$car0,$car0
|
---|
457 | add $car0,$car1,$car1
|
---|
458 | add $car2,$car1,$car1
|
---|
459 | st $car1,[$tp+4]
|
---|
460 | srlx $car1,32,$car2
|
---|
461 | |
---|
462 |
|
---|
463 | ld [%sp+$bias+$frame],$tmp1 ! tp[0]
|
---|
464 | ld [%sp+$bias+$frame+4],$tpj ! tp[1]
|
---|
465 | ld [$ap+8],$mul0 ! ap[2]
|
---|
466 | ld [$np],$car1 ! np[0]
|
---|
467 | ld [$np+4],$npj ! np[1]
|
---|
468 | mulx $n0,$tmp1,$mul1
|
---|
469 | and $mul1,$mask,$mul1
|
---|
470 | mov 8,$i
|
---|
471 |
|
---|
472 | mulx $mul0,$mul0,$car0
|
---|
473 | mulx $car1,$mul1,$car1
|
---|
474 | and $car0,$mask,$acc0
|
---|
475 | add $tmp1,$car1,$car1
|
---|
476 | srlx $car0,32,$car0
|
---|
477 | add %sp,$bias+$frame,$tp
|
---|
478 | srlx $car1,32,$car1
|
---|
479 | and $car0,1,$sbit
|
---|
480 | srlx $car0,1,$car0
|
---|
481 | mov 4,$j
|
---|
482 |
|
---|
483 | .Lsqr_outer:
|
---|
484 | .Lsqr_inner1:
|
---|
485 | mulx $npj,$mul1,$acc1
|
---|
486 | add $tpj,$car1,$car1
|
---|
487 | add $j,4,$j
|
---|
488 | ld [$tp+8],$tpj
|
---|
489 | cmp $j,$i
|
---|
490 | add $acc1,$car1,$car1
|
---|
491 | ld [$np+$j],$npj
|
---|
492 | st $car1,[$tp]
|
---|
493 | srlx $car1,32,$car1
|
---|
494 | bl %icc,.Lsqr_inner1
|
---|
495 | add $tp,4,$tp
|
---|
496 | !.Lsqr_inner1
|
---|
497 |
|
---|
498 | add $j,4,$j
|
---|
499 | ld [$ap+$j],$apj ! ap[j]
|
---|
500 | mulx $npj,$mul1,$acc1
|
---|
501 | add $tpj,$car1,$car1
|
---|
502 | ld [$np+$j],$npj ! np[j]
|
---|
503 | srlx $car1,32,$tmp0
|
---|
504 | and $car1,$mask,$car1
|
---|
505 | add $tmp0,$sbit,$sbit
|
---|
506 | add $acc0,$car1,$car1
|
---|
507 | ld [$tp+8],$tpj ! tp[j]
|
---|
508 | add $acc1,$car1,$car1
|
---|
509 | st $car1,[$tp]
|
---|
510 | srlx $car1,32,$car1
|
---|
511 |
|
---|
512 | add $j,4,$j
|
---|
513 | cmp $j,$num
|
---|
514 | be,pn %icc,.Lsqr_no_inner2
|
---|
515 | add $tp,4,$tp
|
---|
516 |
|
---|
517 | .Lsqr_inner2:
|
---|
518 | mulx $apj,$mul0,$acc0
|
---|
519 | mulx $npj,$mul1,$acc1
|
---|
520 | add $tpj,$sbit,$sbit
|
---|
521 | add $acc0,$car0,$car0
|
---|
522 | ld [$ap+$j],$apj ! ap[j]
|
---|
523 | and $car0,$mask,$acc0
|
---|
524 | ld [$np+$j],$npj ! np[j]
|
---|
525 | srlx $car0,32,$car0
|
---|
526 | add $acc0,$acc0,$acc0
|
---|
527 | ld [$tp+8],$tpj ! tp[j]
|
---|
528 | add $sbit,$acc0,$acc0
|
---|
529 | add $j,4,$j ! j++
|
---|
530 | srlx $acc0,32,$sbit
|
---|
531 | and $acc0,$mask,$acc0
|
---|
532 | cmp $j,$num
|
---|
533 | add $acc0,$car1,$car1
|
---|
534 | add $acc1,$car1,$car1
|
---|
535 | st $car1,[$tp] ! tp[j-1]
|
---|
536 | srlx $car1,32,$car1
|
---|
537 | bl %icc,.Lsqr_inner2
|
---|
538 | add $tp,4,$tp ! tp++
|
---|
539 |
|
---|
540 | .Lsqr_no_inner2:
|
---|
541 | mulx $apj,$mul0,$acc0
|
---|
542 | mulx $npj,$mul1,$acc1
|
---|
543 | add $tpj,$sbit,$sbit
|
---|
544 | add $acc0,$car0,$car0
|
---|
545 | and $car0,$mask,$acc0
|
---|
546 | srlx $car0,32,$car0
|
---|
547 | add $acc0,$acc0,$acc0
|
---|
548 | add $sbit,$acc0,$acc0
|
---|
549 | srlx $acc0,32,$sbit
|
---|
550 | and $acc0,$mask,$acc0
|
---|
551 | add $acc0,$car1,$car1
|
---|
552 | add $acc1,$car1,$car1
|
---|
553 | st $car1,[$tp] ! tp[j-1]
|
---|
554 | srlx $car1,32,$car1
|
---|
555 |
|
---|
556 | add $car0,$car0,$car0
|
---|
557 | add $sbit,$car0,$car0
|
---|
558 | add $car0,$car1,$car1
|
---|
559 | add $car2,$car1,$car1
|
---|
560 | st $car1,[$tp+4]
|
---|
561 | srlx $car1,32,$car2
|
---|
562 | |
---|
563 |
|
---|
564 | add $i,4,$i ! i++
|
---|
565 | ld [%sp+$bias+$frame],$tmp1 ! tp[0]
|
---|
566 | ld [%sp+$bias+$frame+4],$tpj ! tp[1]
|
---|
567 | ld [$ap+$i],$mul0 ! ap[j]
|
---|
568 | ld [$np],$car1 ! np[0]
|
---|
569 | ld [$np+4],$npj ! np[1]
|
---|
570 | mulx $n0,$tmp1,$mul1
|
---|
571 | and $mul1,$mask,$mul1
|
---|
572 | add $i,4,$tmp0
|
---|
573 |
|
---|
574 | mulx $mul0,$mul0,$car0
|
---|
575 | mulx $car1,$mul1,$car1
|
---|
576 | and $car0,$mask,$acc0
|
---|
577 | add $tmp1,$car1,$car1
|
---|
578 | srlx $car0,32,$car0
|
---|
579 | add %sp,$bias+$frame,$tp
|
---|
580 | srlx $car1,32,$car1
|
---|
581 | and $car0,1,$sbit
|
---|
582 | srlx $car0,1,$car0
|
---|
583 |
|
---|
584 | cmp $tmp0,$num ! i<num-1
|
---|
585 | bl %icc,.Lsqr_outer
|
---|
586 | mov 4,$j
|
---|
587 | |
---|
588 |
|
---|
589 | .Lsqr_last:
|
---|
590 | mulx $npj,$mul1,$acc1
|
---|
591 | add $tpj,$car1,$car1
|
---|
592 | add $j,4,$j
|
---|
593 | ld [$tp+8],$tpj
|
---|
594 | cmp $j,$i
|
---|
595 | add $acc1,$car1,$car1
|
---|
596 | ld [$np+$j],$npj
|
---|
597 | st $car1,[$tp]
|
---|
598 | srlx $car1,32,$car1
|
---|
599 | bl %icc,.Lsqr_last
|
---|
600 | add $tp,4,$tp
|
---|
601 | !.Lsqr_last
|
---|
602 |
|
---|
603 | mulx $npj,$mul1,$acc1
|
---|
604 | add $tpj,$acc0,$acc0
|
---|
605 | srlx $acc0,32,$tmp0
|
---|
606 | and $acc0,$mask,$acc0
|
---|
607 | add $tmp0,$sbit,$sbit
|
---|
608 | add $acc0,$car1,$car1
|
---|
609 | add $acc1,$car1,$car1
|
---|
610 | st $car1,[$tp]
|
---|
611 | srlx $car1,32,$car1
|
---|
612 |
|
---|
613 | add $car0,$car0,$car0 ! recover $car0
|
---|
614 | add $sbit,$car0,$car0
|
---|
615 | add $car0,$car1,$car1
|
---|
616 | add $car2,$car1,$car1
|
---|
617 | st $car1,[$tp+4]
|
---|
618 | srlx $car1,32,$car2
|
---|
619 |
|
---|
620 | ba .Ltail
|
---|
621 | add $tp,8,$tp
|
---|
622 | .type $fname,#function
|
---|
623 | .size $fname,(.-$fname)
|
---|
624 | .asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
625 | .align 32
|
---|
626 | ___
|
---|
# Post-process and emit the template: every `...` span in $code is replaced
# by the result of evaluating its contents as Perl, then the text is written
# to STDOUT (redirected earlier in the script when an output file was
# named).  A failing close is fatal so that a short write is not mistaken
# for a complete output file.
$code =~ s{\`([^\`]*)\`}{eval($1)}gem;
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";
|
---|