#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaking the effort are multiple. First of all, UltraSPARC is
# not the whole SPARCv9 universe and other VIS-free implementations
# deserve optimized code as much. Secondly, newly introduced UltraSPARC
# T1, a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, there is the desire to
# experiment with a dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...

# (*)	Engine accessing the driver in question is on my TODO list.
#	For reference, accelerator is estimated to give 6 to 10 times
#	improvement on single-threaded RSA sign. It should be noted
#	that 6-10x improvement coefficient does not actually mean
#	something extraordinary in terms of absolute [single-threaded]
#	performance, as SPARCv9 instruction set is by all means least
#	suitable for high performance crypto among other 64 bit
#	platforms. 6-10x factor simply places T1 in same performance
#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
#	appear impressive at all, but it's the sign operation which is
#	far more critical/interesting.

# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still has hidden potential [see TODO list there], which is
# estimated to be larger than 20%...

use strict;
use warnings;

# The last command-line argument names the file that receives the
# generated assembly; any preceding arguments are ignored by this script.
my $output = pop;
defined $output && length $output
    or die "$0: missing output file argument\n";

# Three-argument open: the name is taken literally (no mode characters are
# parsed out of it) and a failure is reported here rather than at close.
open STDOUT, '>', $output
    or die "can't open $output: $!";

# Incoming arguments as seen after "save" (register window %i0-%i5):
# int bn_mul_mont(
my $rp="%i0";	# BN_ULONG *rp,
my $ap="%i1";	# const BN_ULONG *ap,
my $bp="%i2";	# const BN_ULONG *bp,
my $np="%i3";	# const BN_ULONG *np,
my $n0="%i4";	# const BN_ULONG *n0,
my $num="%i5";	# int num);

# Symbolic names substituted into the template as-is; the template
# includes "sparc_arch.h", which is presumably where they are defined.
my $frame="STACK_FRAME";
my $bias="STACK_BIAS";

# Carry and accumulator registers.
my $car0="%o0";
my $car1="%o1";
my $car2="%o2";	# 1 bit
my $acc0="%o3";
my $acc1="%o4";
my $mask="%g1";	# 32 bits, what a waste...
my $tmp0="%g4";
my $tmp1="%g5";

# Loop counters (byte offsets), current multipliers, temporary-vector
# pointer and the words pre-loaded for the next iteration.
my $i="%l0";
my $j="%l1";
my $mul0="%l2";
my $mul1="%l3";
my $tp="%l4";
my $apj="%l5";
my $npj="%l6";
my $tpj="%l7";

# Name of the emitted global symbol.
my $fname="bn_mul_mont_int";

# First part of the template: entry point, generic multiplication path
# (.L1st/.Louter/.Linner) and the common tail.
#  - Returns 0 without doing any work when num < 4 ("128 bits minimum"),
#    1 after a completed multiplication.
#  - The temporary vector tp is carved out of the stack ("alloca").
#  - When ap == bp control transfers to .Lbn_sqr_mont (second template).
#  - .Ltail subtracts np from tp into rp, then conditionally copies tp
#    over rp depending on the final borrow, zeroing tp as it goes.
# The text below is emitted verbatim; do not add Perl comments inside it.
my $code=<<___;
#include "sparc_arch.h"

.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4	! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num	! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0	! bp[0]
	nop

	add	%sp,$bias,%o7	! real top of stack
	ld	[$ap],$car0	! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj	! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1	! np[0]
	sub	%o7,$bias,%sp	! alloca
	ld	[$np+4],$npj	! np[1]
	be,pt	SIZE_T_CC,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj	!prologue!

	mulx	$n0,$acc0,$mul1	! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj	!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0	!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj	! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj	! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j	! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp	! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2


	mov	4,$i	! i++
	ld	[$bp+4],$mul0	! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0	! ap[0]
	ld	[$ap+4],$apj	! ap[1]
	ld	[$np],$car1	! np[0]
	ld	[$np+4],$npj	! np[1]
	ld	[$tp],$tmp1	! tp[0]
	ld	[$tp+4],$tpj	! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj	!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj	!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0	!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj	! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj	! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj	! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j	! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp	! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj	! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]	! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i	! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0	! bp[i]
!.Louter

	add	$tp,12,$tp


.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	sub	%g0,$num,%o7	! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0	! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1	! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subccc	$car2,0,$car2	! handle upmost overflow bit
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$tp+%o7],%o1	! conditional copy
	ld	[$rp+%o7],%o0
	st	%g0,[$tp+%o7]	! zap tp
	movcs	%icc,%o1,%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___


########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
my $sbit="%o5";	# extra register used only by the squaring path below

# Second part of the template: the dedicated squaring path, entered from
# the first template when ap == bp. It finishes by branching back to
# .Ltail, followed by the symbol type/size directives and the
# identification string. Emitted verbatim, as above.
$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0	! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0	!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj	!prologue!

	mulx	$n0,$acc0,$mul1	! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj	!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0	!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj	! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj	! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j	! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp	! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0	! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0	! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2


	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0	! ap[1]
	ld	[$ap+8],$apj	! ap[2]
	ld	[$np],$car1	! np[0]
	ld	[$np+4],$npj	! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj	! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj	! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj	! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj	! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j	! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp	! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2


	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0	! ap[2]
	ld	[$np],$car1	! np[0]
	ld	[$np+4],$npj	! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj	! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj	! np[j]
	srlx	$car1,32,$tmp0
	and	$car1,$mask,$car1
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj	! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj	! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj	! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj	! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j	! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp	! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]	! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2


	add	$i,4,$i	! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0	! ap[j]
	ld	[$np],$car1	! np[0]
	ld	[$np+4],$npj	! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num	! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j


.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0	! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

# Replace every `...` span in the template with the result of evaluating
# its contents as Perl, then emit the finished assembly.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;

# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";