#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike integer multiplier, which simply stalls whole CPU,
# FPU is fully pipelined and can effectively emit 48 bit partial
# product every cycle. Why not blended SPARC v9? One can argue that
# making this module dependent on UltraSPARC VIS extension limits its
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
# implementations from compatibility matrix. But the rest, whole Sun
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
# VIS extension instructions used in this module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.

# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves few percents for shorter keys and worsens few
# percents for longer keys. This is because USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below]. It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)

# The implementation implies following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, simply
# doesn't give any performance gain.

# TODO:
# - modulo-schedule inner loop for better performance (on in-order
#   execution core such as UltraSPARC this shall result in further
#   noticeable(!) improvement);
# - dedicated squaring procedure[?];

######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow to interleave floating point and
# integer instructions and minimize Read-After-Write penalties. This
# results in *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.

65 | # $output is the last argument if it looks like a file (it has an extension)
|
---|
66 | $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
|
---|
67 |
|
---|
68 | $output and open STDOUT,">$output";
|
---|
69 |
|
---|
70 | $fname="bn_mul_mont_fpu";
|
---|
71 |
|
---|
72 | $frame="STACK_FRAME";
|
---|
73 | $bias="STACK_BIAS";
|
---|
74 | $locals=64;
|
---|
75 |
|
---|
76 | # In order to provide for 32-/64-bit ABI duality, I keep integers wider
|
---|
77 | # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
|
---|
78 | # exclusively for pointers, indexes and other small values...
|
---|
79 | # int bn_mul_mont(
|
---|
80 | $rp="%i0"; # BN_ULONG *rp,
|
---|
81 | $ap="%i1"; # const BN_ULONG *ap,
|
---|
82 | $bp="%i2"; # const BN_ULONG *bp,
|
---|
83 | $np="%i3"; # const BN_ULONG *np,
|
---|
84 | $n0="%i4"; # const BN_ULONG *n0,
|
---|
85 | $num="%i5"; # int num);
|
---|
86 |
|
---|
87 | $tp="%l0"; # t[num]
|
---|
88 | $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
|
---|
89 | $ap_h="%l2"; # to these four vectors as double-precision FP values.
|
---|
90 | $np_l="%l3"; # This way a bunch of fxtods are eliminated in second
|
---|
91 | $np_h="%l4"; # loop and L1-cache aliasing is minimized...
|
---|
92 | $i="%l5";
|
---|
93 | $j="%l6";
|
---|
94 | $mask="%l7"; # 16-bit mask, 0xffff
|
---|
95 |
|
---|
96 | $n0="%g4"; # reassigned(!) to "64-bit" register
|
---|
97 | $carry="%i4"; # %i4 reused(!) for a carry bit
|
---|
98 |
|
---|
99 | # FP register naming chart
|
---|
100 | #
|
---|
101 | # ..HILO
|
---|
102 | # dcba
|
---|
103 | # --------
|
---|
104 | # LOa
|
---|
105 | # LOb
|
---|
106 | # LOc
|
---|
107 | # LOd
|
---|
108 | # HIa
|
---|
109 | # HIb
|
---|
110 | # HIc
|
---|
111 | # HId
|
---|
112 | # ..a
|
---|
113 | # ..b
|
---|
114 | $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
|
---|
115 | $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
|
---|
116 | $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
|
---|
117 | $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
|
---|
118 |
|
---|
119 | $dota="%f24"; $dotb="%f26";
|
---|
120 |
|
---|
121 | $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
|
---|
122 | $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
|
---|
123 | $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
|
---|
124 | $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
|
---|
125 |
|
---|
126 | $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
|
---|
127 |
|
---|
128 | $code=<<___;
|
---|
129 | #ifndef __ASSEMBLER__
|
---|
130 | # define __ASSEMBLER__ 1
|
---|
131 | #endif
|
---|
132 | #include "crypto/sparc_arch.h"
|
---|
133 |
|
---|
134 | .section ".text",#alloc,#execinstr
|
---|
135 |
|
---|
136 | .global $fname
|
---|
137 | .align 32
|
---|
138 | $fname:
|
---|
139 | save %sp,-$frame-$locals,%sp
|
---|
140 |
|
---|
141 | cmp $num,4
|
---|
142 | bl,a,pn %icc,.Lret
|
---|
143 | clr %i0
|
---|
144 | andcc $num,1,%g0 ! $num has to be even...
|
---|
145 | bnz,a,pn %icc,.Lret
|
---|
146 | clr %i0 ! signal "unsupported input value"
|
---|
147 |
|
---|
148 | srl $num,1,$num
|
---|
149 | sethi %hi(0xffff),$mask
|
---|
150 | ld [%i4+0],$n0 ! $n0 reassigned, remember?
|
---|
151 | or $mask,%lo(0xffff),$mask
|
---|
152 | ld [%i4+4],%o0
|
---|
153 | sllx %o0,32,%o0
|
---|
154 | or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
|
---|
155 |
|
---|
156 | sll $num,3,$num ! num*=8
|
---|
157 |
|
---|
158 | add %sp,$bias,%o0 ! real top of stack
|
---|
159 | sll $num,2,%o1
|
---|
160 | add %o1,$num,%o1 ! %o1=num*5
|
---|
161 | sub %o0,%o1,%o0
|
---|
162 | and %o0,-2048,%o0 ! optimize TLB utilization
|
---|
163 | sub %o0,$bias,%sp ! alloca(5*num*8)
|
---|
164 |
|
---|
165 | rd %asi,%o7 ! save %asi
|
---|
166 | add %sp,$bias+$frame+$locals,$tp
|
---|
167 | add $tp,$num,$ap_l
|
---|
168 | add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
|
---|
169 | add $ap_l,$num,$ap_h
|
---|
170 | add $ap_h,$num,$np_l
|
---|
171 | add $np_l,$num,$np_h
|
---|
172 |
|
---|
173 | wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
|
---|
174 |
|
---|
175 | add $rp,$num,$rp ! readjust input pointers to point
|
---|
176 | add $ap,$num,$ap ! at the ends too...
|
---|
177 | add $bp,$num,$bp
|
---|
178 | add $np,$num,$np
|
---|
179 |
|
---|
180 | stx %o7,[%sp+$bias+$frame+48] ! save %asi
|
---|
181 | |
---|
182 |
|
---|
183 | sub %g0,$num,$i ! i=-num
|
---|
184 | sub %g0,$num,$j ! j=-num
|
---|
185 |
|
---|
186 | add $ap,$j,%o3
|
---|
187 | add $bp,$i,%o4
|
---|
188 |
|
---|
189 | ld [%o3+4],%g1 ! bp[0]
|
---|
190 | ld [%o3+0],%o0
|
---|
191 | ld [%o4+4],%g5 ! ap[0]
|
---|
192 | sllx %g1,32,%g1
|
---|
193 | ld [%o4+0],%o1
|
---|
194 | sllx %g5,32,%g5
|
---|
195 | or %g1,%o0,%o0
|
---|
196 | or %g5,%o1,%o1
|
---|
197 |
|
---|
198 | add $np,$j,%o5
|
---|
199 |
|
---|
200 | mulx %o1,%o0,%o0 ! ap[0]*bp[0]
|
---|
201 | mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
|
---|
202 | stx %o0,[%sp+$bias+$frame+0]
|
---|
203 |
|
---|
204 | ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
|
---|
205 | fzeros $alo
|
---|
206 | ld [%o3+4],$ahi_
|
---|
207 | fzeros $ahi
|
---|
208 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
---|
209 | fzeros $nlo
|
---|
210 | ld [%o5+4],$nhi_
|
---|
211 | fzeros $nhi
|
---|
212 |
|
---|
213 | ! transfer b[i] to FPU as 4x16-bit values
|
---|
214 | ldda [%o4+2]%asi,$ba
|
---|
215 | fxtod $alo,$alo
|
---|
216 | ldda [%o4+0]%asi,$bb
|
---|
217 | fxtod $ahi,$ahi
|
---|
218 | ldda [%o4+6]%asi,$bc
|
---|
219 | fxtod $nlo,$nlo
|
---|
220 | ldda [%o4+4]%asi,$bd
|
---|
221 | fxtod $nhi,$nhi
|
---|
222 |
|
---|
223 | ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
|
---|
224 | ldda [%sp+$bias+$frame+6]%asi,$na
|
---|
225 | fxtod $ba,$ba
|
---|
226 | ldda [%sp+$bias+$frame+4]%asi,$nb
|
---|
227 | fxtod $bb,$bb
|
---|
228 | ldda [%sp+$bias+$frame+2]%asi,$nc
|
---|
229 | fxtod $bc,$bc
|
---|
230 | ldda [%sp+$bias+$frame+0]%asi,$nd
|
---|
231 | fxtod $bd,$bd
|
---|
232 |
|
---|
233 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
---|
234 | fxtod $na,$na
|
---|
235 | std $ahi,[$ap_h+$j]
|
---|
236 | fxtod $nb,$nb
|
---|
237 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
---|
238 | fxtod $nc,$nc
|
---|
239 | std $nhi,[$np_h+$j]
|
---|
240 | fxtod $nd,$nd
|
---|
241 |
|
---|
242 | fmuld $alo,$ba,$aloa
|
---|
243 | fmuld $nlo,$na,$nloa
|
---|
244 | fmuld $alo,$bb,$alob
|
---|
245 | fmuld $nlo,$nb,$nlob
|
---|
246 | fmuld $alo,$bc,$aloc
|
---|
247 | faddd $aloa,$nloa,$nloa
|
---|
248 | fmuld $nlo,$nc,$nloc
|
---|
249 | fmuld $alo,$bd,$alod
|
---|
250 | faddd $alob,$nlob,$nlob
|
---|
251 | fmuld $nlo,$nd,$nlod
|
---|
252 | fmuld $ahi,$ba,$ahia
|
---|
253 | faddd $aloc,$nloc,$nloc
|
---|
254 | fmuld $nhi,$na,$nhia
|
---|
255 | fmuld $ahi,$bb,$ahib
|
---|
256 | faddd $alod,$nlod,$nlod
|
---|
257 | fmuld $nhi,$nb,$nhib
|
---|
258 | fmuld $ahi,$bc,$ahic
|
---|
259 | faddd $ahia,$nhia,$nhia
|
---|
260 | fmuld $nhi,$nc,$nhic
|
---|
261 | fmuld $ahi,$bd,$ahid
|
---|
262 | faddd $ahib,$nhib,$nhib
|
---|
263 | fmuld $nhi,$nd,$nhid
|
---|
264 |
|
---|
265 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
266 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
267 |
|
---|
268 | faddd $nloc,$nhia,$nloc
|
---|
269 | faddd $nlod,$nhib,$nlod
|
---|
270 |
|
---|
271 | fdtox $nloa,$nloa
|
---|
272 | fdtox $nlob,$nlob
|
---|
273 | fdtox $nloc,$nloc
|
---|
274 | fdtox $nlod,$nlod
|
---|
275 |
|
---|
276 | std $nloa,[%sp+$bias+$frame+0]
|
---|
277 | add $j,8,$j
|
---|
278 | std $nlob,[%sp+$bias+$frame+8]
|
---|
279 | add $ap,$j,%o4
|
---|
280 | std $nloc,[%sp+$bias+$frame+16]
|
---|
281 | add $np,$j,%o5
|
---|
282 | std $nlod,[%sp+$bias+$frame+24]
|
---|
283 | |
---|
284 |
|
---|
285 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
---|
286 | fzeros $alo
|
---|
287 | ld [%o4+4],$ahi_
|
---|
288 | fzeros $ahi
|
---|
289 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
---|
290 | fzeros $nlo
|
---|
291 | ld [%o5+4],$nhi_
|
---|
292 | fzeros $nhi
|
---|
293 |
|
---|
294 | fxtod $alo,$alo
|
---|
295 | fxtod $ahi,$ahi
|
---|
296 | fxtod $nlo,$nlo
|
---|
297 | fxtod $nhi,$nhi
|
---|
298 |
|
---|
299 | ldx [%sp+$bias+$frame+0],%o0
|
---|
300 | fmuld $alo,$ba,$aloa
|
---|
301 | ldx [%sp+$bias+$frame+8],%o1
|
---|
302 | fmuld $nlo,$na,$nloa
|
---|
303 | ldx [%sp+$bias+$frame+16],%o2
|
---|
304 | fmuld $alo,$bb,$alob
|
---|
305 | ldx [%sp+$bias+$frame+24],%o3
|
---|
306 | fmuld $nlo,$nb,$nlob
|
---|
307 |
|
---|
308 | srlx %o0,16,%o7
|
---|
309 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
---|
310 | fmuld $alo,$bc,$aloc
|
---|
311 | add %o7,%o1,%o1
|
---|
312 | std $ahi,[$ap_h+$j]
|
---|
313 | faddd $aloa,$nloa,$nloa
|
---|
314 | fmuld $nlo,$nc,$nloc
|
---|
315 | srlx %o1,16,%o7
|
---|
316 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
---|
317 | fmuld $alo,$bd,$alod
|
---|
318 | add %o7,%o2,%o2
|
---|
319 | std $nhi,[$np_h+$j]
|
---|
320 | faddd $alob,$nlob,$nlob
|
---|
321 | fmuld $nlo,$nd,$nlod
|
---|
322 | srlx %o2,16,%o7
|
---|
323 | fmuld $ahi,$ba,$ahia
|
---|
324 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
325 | faddd $aloc,$nloc,$nloc
|
---|
326 | fmuld $nhi,$na,$nhia
|
---|
327 | !and %o0,$mask,%o0
|
---|
328 | !and %o1,$mask,%o1
|
---|
329 | !and %o2,$mask,%o2
|
---|
330 | !sllx %o1,16,%o1
|
---|
331 | !sllx %o2,32,%o2
|
---|
332 | !sllx %o3,48,%o7
|
---|
333 | !or %o1,%o0,%o0
|
---|
334 | !or %o2,%o0,%o0
|
---|
335 | !or %o7,%o0,%o0 ! 64-bit result
|
---|
336 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
337 | fmuld $ahi,$bb,$ahib
|
---|
338 |
|
---|
339 | faddd $alod,$nlod,$nlod
|
---|
340 | fmuld $nhi,$nb,$nhib
|
---|
341 | fmuld $ahi,$bc,$ahic
|
---|
342 | faddd $ahia,$nhia,$nhia
|
---|
343 | fmuld $nhi,$nc,$nhic
|
---|
344 | fmuld $ahi,$bd,$ahid
|
---|
345 | faddd $ahib,$nhib,$nhib
|
---|
346 | fmuld $nhi,$nd,$nhid
|
---|
347 |
|
---|
348 | faddd $dota,$nloa,$nloa
|
---|
349 | faddd $dotb,$nlob,$nlob
|
---|
350 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
351 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
352 |
|
---|
353 | faddd $nloc,$nhia,$nloc
|
---|
354 | faddd $nlod,$nhib,$nlod
|
---|
355 |
|
---|
356 | fdtox $nloa,$nloa
|
---|
357 | fdtox $nlob,$nlob
|
---|
358 | fdtox $nloc,$nloc
|
---|
359 | fdtox $nlod,$nlod
|
---|
360 |
|
---|
361 | std $nloa,[%sp+$bias+$frame+0]
|
---|
362 | std $nlob,[%sp+$bias+$frame+8]
|
---|
363 | addcc $j,8,$j
|
---|
364 | std $nloc,[%sp+$bias+$frame+16]
|
---|
365 | bz,pn %icc,.L1stskip
|
---|
366 | std $nlod,[%sp+$bias+$frame+24]
|
---|
367 | |
---|
368 |
|
---|
369 | .align 32 ! incidentally already aligned !
|
---|
370 | .L1st:
|
---|
371 | add $ap,$j,%o4
|
---|
372 | add $np,$j,%o5
|
---|
373 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
---|
374 | fzeros $alo
|
---|
375 | ld [%o4+4],$ahi_
|
---|
376 | fzeros $ahi
|
---|
377 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
---|
378 | fzeros $nlo
|
---|
379 | ld [%o5+4],$nhi_
|
---|
380 | fzeros $nhi
|
---|
381 |
|
---|
382 | fxtod $alo,$alo
|
---|
383 | fxtod $ahi,$ahi
|
---|
384 | fxtod $nlo,$nlo
|
---|
385 | fxtod $nhi,$nhi
|
---|
386 |
|
---|
387 | ldx [%sp+$bias+$frame+0],%o0
|
---|
388 | fmuld $alo,$ba,$aloa
|
---|
389 | ldx [%sp+$bias+$frame+8],%o1
|
---|
390 | fmuld $nlo,$na,$nloa
|
---|
391 | ldx [%sp+$bias+$frame+16],%o2
|
---|
392 | fmuld $alo,$bb,$alob
|
---|
393 | ldx [%sp+$bias+$frame+24],%o3
|
---|
394 | fmuld $nlo,$nb,$nlob
|
---|
395 |
|
---|
396 | srlx %o0,16,%o7
|
---|
397 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
---|
398 | fmuld $alo,$bc,$aloc
|
---|
399 | add %o7,%o1,%o1
|
---|
400 | std $ahi,[$ap_h+$j]
|
---|
401 | faddd $aloa,$nloa,$nloa
|
---|
402 | fmuld $nlo,$nc,$nloc
|
---|
403 | srlx %o1,16,%o7
|
---|
404 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
---|
405 | fmuld $alo,$bd,$alod
|
---|
406 | add %o7,%o2,%o2
|
---|
407 | std $nhi,[$np_h+$j]
|
---|
408 | faddd $alob,$nlob,$nlob
|
---|
409 | fmuld $nlo,$nd,$nlod
|
---|
410 | srlx %o2,16,%o7
|
---|
411 | fmuld $ahi,$ba,$ahia
|
---|
412 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
413 | and %o0,$mask,%o0
|
---|
414 | faddd $aloc,$nloc,$nloc
|
---|
415 | fmuld $nhi,$na,$nhia
|
---|
416 | and %o1,$mask,%o1
|
---|
417 | and %o2,$mask,%o2
|
---|
418 | fmuld $ahi,$bb,$ahib
|
---|
419 | sllx %o1,16,%o1
|
---|
420 | faddd $alod,$nlod,$nlod
|
---|
421 | fmuld $nhi,$nb,$nhib
|
---|
422 | sllx %o2,32,%o2
|
---|
423 | fmuld $ahi,$bc,$ahic
|
---|
424 | sllx %o3,48,%o7
|
---|
425 | or %o1,%o0,%o0
|
---|
426 | faddd $ahia,$nhia,$nhia
|
---|
427 | fmuld $nhi,$nc,$nhic
|
---|
428 | or %o2,%o0,%o0
|
---|
429 | fmuld $ahi,$bd,$ahid
|
---|
430 | or %o7,%o0,%o0 ! 64-bit result
|
---|
431 | faddd $ahib,$nhib,$nhib
|
---|
432 | fmuld $nhi,$nd,$nhid
|
---|
433 | addcc %g1,%o0,%o0
|
---|
434 | faddd $dota,$nloa,$nloa
|
---|
435 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
436 | faddd $dotb,$nlob,$nlob
|
---|
437 | bcs,a %xcc,.+8
|
---|
438 | add %g1,1,%g1
|
---|
439 |
|
---|
440 | stx %o0,[$tp] ! tp[j-1]=
|
---|
441 |
|
---|
442 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
443 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
444 |
|
---|
445 | faddd $nloc,$nhia,$nloc
|
---|
446 | faddd $nlod,$nhib,$nlod
|
---|
447 |
|
---|
448 | fdtox $nloa,$nloa
|
---|
449 | fdtox $nlob,$nlob
|
---|
450 | fdtox $nloc,$nloc
|
---|
451 | fdtox $nlod,$nlod
|
---|
452 |
|
---|
453 | std $nloa,[%sp+$bias+$frame+0]
|
---|
454 | std $nlob,[%sp+$bias+$frame+8]
|
---|
455 | std $nloc,[%sp+$bias+$frame+16]
|
---|
456 | std $nlod,[%sp+$bias+$frame+24]
|
---|
457 |
|
---|
458 | addcc $j,8,$j
|
---|
459 | bnz,pt %icc,.L1st
|
---|
460 | add $tp,8,$tp
|
---|
461 | |
---|
462 |
|
---|
463 | .L1stskip:
|
---|
464 | fdtox $dota,$dota
|
---|
465 | fdtox $dotb,$dotb
|
---|
466 |
|
---|
467 | ldx [%sp+$bias+$frame+0],%o0
|
---|
468 | ldx [%sp+$bias+$frame+8],%o1
|
---|
469 | ldx [%sp+$bias+$frame+16],%o2
|
---|
470 | ldx [%sp+$bias+$frame+24],%o3
|
---|
471 |
|
---|
472 | srlx %o0,16,%o7
|
---|
473 | std $dota,[%sp+$bias+$frame+32]
|
---|
474 | add %o7,%o1,%o1
|
---|
475 | std $dotb,[%sp+$bias+$frame+40]
|
---|
476 | srlx %o1,16,%o7
|
---|
477 | add %o7,%o2,%o2
|
---|
478 | srlx %o2,16,%o7
|
---|
479 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
480 | and %o0,$mask,%o0
|
---|
481 | and %o1,$mask,%o1
|
---|
482 | and %o2,$mask,%o2
|
---|
483 | sllx %o1,16,%o1
|
---|
484 | sllx %o2,32,%o2
|
---|
485 | sllx %o3,48,%o7
|
---|
486 | or %o1,%o0,%o0
|
---|
487 | or %o2,%o0,%o0
|
---|
488 | or %o7,%o0,%o0 ! 64-bit result
|
---|
489 | ldx [%sp+$bias+$frame+32],%o4
|
---|
490 | addcc %g1,%o0,%o0
|
---|
491 | ldx [%sp+$bias+$frame+40],%o5
|
---|
492 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
493 | bcs,a %xcc,.+8
|
---|
494 | add %g1,1,%g1
|
---|
495 |
|
---|
496 | stx %o0,[$tp] ! tp[j-1]=
|
---|
497 | add $tp,8,$tp
|
---|
498 |
|
---|
499 | srlx %o4,16,%o7
|
---|
500 | add %o7,%o5,%o5
|
---|
501 | and %o4,$mask,%o4
|
---|
502 | sllx %o5,16,%o7
|
---|
503 | or %o7,%o4,%o4
|
---|
504 | addcc %g1,%o4,%o4
|
---|
505 | srlx %o5,48,%g1
|
---|
506 | bcs,a %xcc,.+8
|
---|
507 | add %g1,1,%g1
|
---|
508 |
|
---|
509 | mov %g1,$carry
|
---|
510 | stx %o4,[$tp] ! tp[num-1]=
|
---|
511 | |
---|
512 |
|
---|
513 | ba .Louter
|
---|
514 | add $i,8,$i
|
---|
515 | .align 32
|
---|
516 | .Louter:
|
---|
517 | sub %g0,$num,$j ! j=-num
|
---|
518 | add %sp,$bias+$frame+$locals,$tp
|
---|
519 |
|
---|
520 | add $ap,$j,%o3
|
---|
521 | add $bp,$i,%o4
|
---|
522 |
|
---|
523 | ld [%o3+4],%g1 ! bp[i]
|
---|
524 | ld [%o3+0],%o0
|
---|
525 | ld [%o4+4],%g5 ! ap[0]
|
---|
526 | sllx %g1,32,%g1
|
---|
527 | ld [%o4+0],%o1
|
---|
528 | sllx %g5,32,%g5
|
---|
529 | or %g1,%o0,%o0
|
---|
530 | or %g5,%o1,%o1
|
---|
531 |
|
---|
532 | ldx [$tp],%o2 ! tp[0]
|
---|
533 | mulx %o1,%o0,%o0
|
---|
534 | addcc %o2,%o0,%o0
|
---|
535 | mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
|
---|
536 | stx %o0,[%sp+$bias+$frame+0]
|
---|
537 |
|
---|
538 | ! transfer b[i] to FPU as 4x16-bit values
|
---|
539 | ldda [%o4+2]%asi,$ba
|
---|
540 | ldda [%o4+0]%asi,$bb
|
---|
541 | ldda [%o4+6]%asi,$bc
|
---|
542 | ldda [%o4+4]%asi,$bd
|
---|
543 |
|
---|
544 | ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
|
---|
545 | ldda [%sp+$bias+$frame+6]%asi,$na
|
---|
546 | fxtod $ba,$ba
|
---|
547 | ldda [%sp+$bias+$frame+4]%asi,$nb
|
---|
548 | fxtod $bb,$bb
|
---|
549 | ldda [%sp+$bias+$frame+2]%asi,$nc
|
---|
550 | fxtod $bc,$bc
|
---|
551 | ldda [%sp+$bias+$frame+0]%asi,$nd
|
---|
552 | fxtod $bd,$bd
|
---|
553 | ldd [$ap_l+$j],$alo ! load a[j] in double format
|
---|
554 | fxtod $na,$na
|
---|
555 | ldd [$ap_h+$j],$ahi
|
---|
556 | fxtod $nb,$nb
|
---|
557 | ldd [$np_l+$j],$nlo ! load n[j] in double format
|
---|
558 | fxtod $nc,$nc
|
---|
559 | ldd [$np_h+$j],$nhi
|
---|
560 | fxtod $nd,$nd
|
---|
561 |
|
---|
562 | fmuld $alo,$ba,$aloa
|
---|
563 | fmuld $nlo,$na,$nloa
|
---|
564 | fmuld $alo,$bb,$alob
|
---|
565 | fmuld $nlo,$nb,$nlob
|
---|
566 | fmuld $alo,$bc,$aloc
|
---|
567 | faddd $aloa,$nloa,$nloa
|
---|
568 | fmuld $nlo,$nc,$nloc
|
---|
569 | fmuld $alo,$bd,$alod
|
---|
570 | faddd $alob,$nlob,$nlob
|
---|
571 | fmuld $nlo,$nd,$nlod
|
---|
572 | fmuld $ahi,$ba,$ahia
|
---|
573 | faddd $aloc,$nloc,$nloc
|
---|
574 | fmuld $nhi,$na,$nhia
|
---|
575 | fmuld $ahi,$bb,$ahib
|
---|
576 | faddd $alod,$nlod,$nlod
|
---|
577 | fmuld $nhi,$nb,$nhib
|
---|
578 | fmuld $ahi,$bc,$ahic
|
---|
579 | faddd $ahia,$nhia,$nhia
|
---|
580 | fmuld $nhi,$nc,$nhic
|
---|
581 | fmuld $ahi,$bd,$ahid
|
---|
582 | faddd $ahib,$nhib,$nhib
|
---|
583 | fmuld $nhi,$nd,$nhid
|
---|
584 |
|
---|
585 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
586 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
587 |
|
---|
588 | faddd $nloc,$nhia,$nloc
|
---|
589 | faddd $nlod,$nhib,$nlod
|
---|
590 |
|
---|
591 | fdtox $nloa,$nloa
|
---|
592 | fdtox $nlob,$nlob
|
---|
593 | fdtox $nloc,$nloc
|
---|
594 | fdtox $nlod,$nlod
|
---|
595 |
|
---|
596 | std $nloa,[%sp+$bias+$frame+0]
|
---|
597 | std $nlob,[%sp+$bias+$frame+8]
|
---|
598 | std $nloc,[%sp+$bias+$frame+16]
|
---|
599 | add $j,8,$j
|
---|
600 | std $nlod,[%sp+$bias+$frame+24]
|
---|
601 | |
---|
602 |
|
---|
603 | ldd [$ap_l+$j],$alo ! load a[j] in double format
|
---|
604 | ldd [$ap_h+$j],$ahi
|
---|
605 | ldd [$np_l+$j],$nlo ! load n[j] in double format
|
---|
606 | ldd [$np_h+$j],$nhi
|
---|
607 |
|
---|
608 | fmuld $alo,$ba,$aloa
|
---|
609 | fmuld $nlo,$na,$nloa
|
---|
610 | fmuld $alo,$bb,$alob
|
---|
611 | fmuld $nlo,$nb,$nlob
|
---|
612 | fmuld $alo,$bc,$aloc
|
---|
613 | ldx [%sp+$bias+$frame+0],%o0
|
---|
614 | faddd $aloa,$nloa,$nloa
|
---|
615 | fmuld $nlo,$nc,$nloc
|
---|
616 | ldx [%sp+$bias+$frame+8],%o1
|
---|
617 | fmuld $alo,$bd,$alod
|
---|
618 | ldx [%sp+$bias+$frame+16],%o2
|
---|
619 | faddd $alob,$nlob,$nlob
|
---|
620 | fmuld $nlo,$nd,$nlod
|
---|
621 | ldx [%sp+$bias+$frame+24],%o3
|
---|
622 | fmuld $ahi,$ba,$ahia
|
---|
623 |
|
---|
624 | srlx %o0,16,%o7
|
---|
625 | faddd $aloc,$nloc,$nloc
|
---|
626 | fmuld $nhi,$na,$nhia
|
---|
627 | add %o7,%o1,%o1
|
---|
628 | fmuld $ahi,$bb,$ahib
|
---|
629 | srlx %o1,16,%o7
|
---|
630 | faddd $alod,$nlod,$nlod
|
---|
631 | fmuld $nhi,$nb,$nhib
|
---|
632 | add %o7,%o2,%o2
|
---|
633 | fmuld $ahi,$bc,$ahic
|
---|
634 | srlx %o2,16,%o7
|
---|
635 | faddd $ahia,$nhia,$nhia
|
---|
636 | fmuld $nhi,$nc,$nhic
|
---|
637 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
638 | ! why?
|
---|
639 | and %o0,$mask,%o0
|
---|
640 | fmuld $ahi,$bd,$ahid
|
---|
641 | and %o1,$mask,%o1
|
---|
642 | and %o2,$mask,%o2
|
---|
643 | faddd $ahib,$nhib,$nhib
|
---|
644 | fmuld $nhi,$nd,$nhid
|
---|
645 | sllx %o1,16,%o1
|
---|
646 | faddd $dota,$nloa,$nloa
|
---|
647 | sllx %o2,32,%o2
|
---|
648 | faddd $dotb,$nlob,$nlob
|
---|
649 | sllx %o3,48,%o7
|
---|
650 | or %o1,%o0,%o0
|
---|
651 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
652 | or %o2,%o0,%o0
|
---|
653 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
654 | or %o7,%o0,%o0 ! 64-bit result
|
---|
655 | ldx [$tp],%o7
|
---|
656 | faddd $nloc,$nhia,$nloc
|
---|
657 | addcc %o7,%o0,%o0
|
---|
658 | ! end-of-why?
|
---|
659 | faddd $nlod,$nhib,$nlod
|
---|
660 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
661 | fdtox $nloa,$nloa
|
---|
662 | bcs,a %xcc,.+8
|
---|
663 | add %g1,1,%g1
|
---|
664 |
|
---|
665 | fdtox $nlob,$nlob
|
---|
666 | fdtox $nloc,$nloc
|
---|
667 | fdtox $nlod,$nlod
|
---|
668 |
|
---|
669 | std $nloa,[%sp+$bias+$frame+0]
|
---|
670 | std $nlob,[%sp+$bias+$frame+8]
|
---|
671 | addcc $j,8,$j
|
---|
672 | std $nloc,[%sp+$bias+$frame+16]
|
---|
673 | bz,pn %icc,.Linnerskip
|
---|
674 | std $nlod,[%sp+$bias+$frame+24]
|
---|
675 | |
---|
676 |
|
---|
677 | ba .Linner
|
---|
678 | nop
|
---|
679 | .align 32
|
---|
680 | .Linner:
|
---|
681 | ldd [$ap_l+$j],$alo ! load a[j] in double format
|
---|
682 | ldd [$ap_h+$j],$ahi
|
---|
683 | ldd [$np_l+$j],$nlo ! load n[j] in double format
|
---|
684 | ldd [$np_h+$j],$nhi
|
---|
685 |
|
---|
686 | fmuld $alo,$ba,$aloa
|
---|
687 | fmuld $nlo,$na,$nloa
|
---|
688 | fmuld $alo,$bb,$alob
|
---|
689 | fmuld $nlo,$nb,$nlob
|
---|
690 | fmuld $alo,$bc,$aloc
|
---|
691 | ldx [%sp+$bias+$frame+0],%o0
|
---|
692 | faddd $aloa,$nloa,$nloa
|
---|
693 | fmuld $nlo,$nc,$nloc
|
---|
694 | ldx [%sp+$bias+$frame+8],%o1
|
---|
695 | fmuld $alo,$bd,$alod
|
---|
696 | ldx [%sp+$bias+$frame+16],%o2
|
---|
697 | faddd $alob,$nlob,$nlob
|
---|
698 | fmuld $nlo,$nd,$nlod
|
---|
699 | ldx [%sp+$bias+$frame+24],%o3
|
---|
700 | fmuld $ahi,$ba,$ahia
|
---|
701 |
|
---|
702 | srlx %o0,16,%o7
|
---|
703 | faddd $aloc,$nloc,$nloc
|
---|
704 | fmuld $nhi,$na,$nhia
|
---|
705 | add %o7,%o1,%o1
|
---|
706 | fmuld $ahi,$bb,$ahib
|
---|
707 | srlx %o1,16,%o7
|
---|
708 | faddd $alod,$nlod,$nlod
|
---|
709 | fmuld $nhi,$nb,$nhib
|
---|
710 | add %o7,%o2,%o2
|
---|
711 | fmuld $ahi,$bc,$ahic
|
---|
712 | srlx %o2,16,%o7
|
---|
713 | faddd $ahia,$nhia,$nhia
|
---|
714 | fmuld $nhi,$nc,$nhic
|
---|
715 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
716 | and %o0,$mask,%o0
|
---|
717 | fmuld $ahi,$bd,$ahid
|
---|
718 | and %o1,$mask,%o1
|
---|
719 | and %o2,$mask,%o2
|
---|
720 | faddd $ahib,$nhib,$nhib
|
---|
721 | fmuld $nhi,$nd,$nhid
|
---|
722 | sllx %o1,16,%o1
|
---|
723 | faddd $dota,$nloa,$nloa
|
---|
724 | sllx %o2,32,%o2
|
---|
725 | faddd $dotb,$nlob,$nlob
|
---|
726 | sllx %o3,48,%o7
|
---|
727 | or %o1,%o0,%o0
|
---|
728 | faddd $ahic,$nhic,$dota ! $nhic
|
---|
729 | or %o2,%o0,%o0
|
---|
730 | faddd $ahid,$nhid,$dotb ! $nhid
|
---|
731 | or %o7,%o0,%o0 ! 64-bit result
|
---|
732 | faddd $nloc,$nhia,$nloc
|
---|
733 | addcc %g1,%o0,%o0
|
---|
734 | ldx [$tp+8],%o7 ! tp[j]
|
---|
735 | faddd $nlod,$nhib,$nlod
|
---|
736 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
737 | fdtox $nloa,$nloa
|
---|
738 | bcs,a %xcc,.+8
|
---|
739 | add %g1,1,%g1
|
---|
740 | fdtox $nlob,$nlob
|
---|
741 | addcc %o7,%o0,%o0
|
---|
742 | fdtox $nloc,$nloc
|
---|
743 | bcs,a %xcc,.+8
|
---|
744 | add %g1,1,%g1
|
---|
745 |
|
---|
746 | stx %o0,[$tp] ! tp[j-1]
|
---|
747 | fdtox $nlod,$nlod
|
---|
748 |
|
---|
749 | std $nloa,[%sp+$bias+$frame+0]
|
---|
750 | std $nlob,[%sp+$bias+$frame+8]
|
---|
751 | std $nloc,[%sp+$bias+$frame+16]
|
---|
752 | addcc $j,8,$j
|
---|
753 | std $nlod,[%sp+$bias+$frame+24]
|
---|
754 | bnz,pt %icc,.Linner
|
---|
755 | add $tp,8,$tp
|
---|
756 | |
---|
757 |
|
---|
758 | .Linnerskip:
|
---|
759 | fdtox $dota,$dota
|
---|
760 | fdtox $dotb,$dotb
|
---|
761 |
|
---|
762 | ldx [%sp+$bias+$frame+0],%o0
|
---|
763 | ldx [%sp+$bias+$frame+8],%o1
|
---|
764 | ldx [%sp+$bias+$frame+16],%o2
|
---|
765 | ldx [%sp+$bias+$frame+24],%o3
|
---|
766 |
|
---|
767 | srlx %o0,16,%o7
|
---|
768 | std $dota,[%sp+$bias+$frame+32]
|
---|
769 | add %o7,%o1,%o1
|
---|
770 | std $dotb,[%sp+$bias+$frame+40]
|
---|
771 | srlx %o1,16,%o7
|
---|
772 | add %o7,%o2,%o2
|
---|
773 | srlx %o2,16,%o7
|
---|
774 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
---|
775 | and %o0,$mask,%o0
|
---|
776 | and %o1,$mask,%o1
|
---|
777 | and %o2,$mask,%o2
|
---|
778 | sllx %o1,16,%o1
|
---|
779 | sllx %o2,32,%o2
|
---|
780 | sllx %o3,48,%o7
|
---|
781 | or %o1,%o0,%o0
|
---|
782 | or %o2,%o0,%o0
|
---|
783 | ldx [%sp+$bias+$frame+32],%o4
|
---|
784 | or %o7,%o0,%o0 ! 64-bit result
|
---|
785 | ldx [%sp+$bias+$frame+40],%o5
|
---|
786 | addcc %g1,%o0,%o0
|
---|
787 | ldx [$tp+8],%o7 ! tp[j]
|
---|
788 | srlx %o3,16,%g1 ! 34-bit carry
|
---|
789 | bcs,a %xcc,.+8
|
---|
790 | add %g1,1,%g1
|
---|
791 |
|
---|
792 | addcc %o7,%o0,%o0
|
---|
793 | bcs,a %xcc,.+8
|
---|
794 | add %g1,1,%g1
|
---|
795 |
|
---|
796 | stx %o0,[$tp] ! tp[j-1]
|
---|
797 | add $tp,8,$tp
|
---|
798 |
|
---|
799 | srlx %o4,16,%o7
|
---|
800 | add %o7,%o5,%o5
|
---|
801 | and %o4,$mask,%o4
|
---|
802 | sllx %o5,16,%o7
|
---|
803 | or %o7,%o4,%o4
|
---|
804 | addcc %g1,%o4,%o4
|
---|
805 | srlx %o5,48,%g1
|
---|
806 | bcs,a %xcc,.+8
|
---|
807 | add %g1,1,%g1
|
---|
808 |
|
---|
809 | addcc $carry,%o4,%o4
|
---|
810 | stx %o4,[$tp] ! tp[num-1]
|
---|
811 | mov %g1,$carry
|
---|
812 | bcs,a %xcc,.+8
|
---|
813 | add $carry,1,$carry
|
---|
814 |
|
---|
815 | addcc $i,8,$i
|
---|
816 | bnz %icc,.Louter
|
---|
817 | nop
|
---|
818 | |
---|
819 |
|
---|
820 | add $tp,8,$tp ! adjust tp to point at the end
|
---|
821 | orn %g0,%g0,%g4
|
---|
822 | sub %g0,$num,%o7 ! n=-num
|
---|
823 | ba .Lsub
|
---|
824 | subcc %g0,%g0,%g0 ! clear %icc.c
|
---|
825 |
|
---|
826 | .align 32
|
---|
827 | .Lsub:
|
---|
828 | ldx [$tp+%o7],%o0
|
---|
829 | add $np,%o7,%g1
|
---|
830 | ld [%g1+0],%o2
|
---|
831 | ld [%g1+4],%o3
|
---|
832 | srlx %o0,32,%o1
|
---|
833 | subccc %o0,%o2,%o2
|
---|
834 | add $rp,%o7,%g1
|
---|
835 | subccc %o1,%o3,%o3
|
---|
836 | st %o2,[%g1+0]
|
---|
837 | add %o7,8,%o7
|
---|
838 | brnz,pt %o7,.Lsub
|
---|
839 | st %o3,[%g1+4]
|
---|
840 | subc $carry,0,%g4
|
---|
841 | sub %g0,$num,%o7 ! n=-num
|
---|
842 | ba .Lcopy
|
---|
843 | nop
|
---|
844 |
|
---|
845 | .align 32
|
---|
846 | .Lcopy:
|
---|
847 | ldx [$tp+%o7],%o0
|
---|
848 | add $rp,%o7,%g1
|
---|
849 | ld [%g1+0],%o2
|
---|
850 | ld [%g1+4],%o3
|
---|
851 | stx %g0,[$tp+%o7]
|
---|
852 | and %o0,%g4,%o0
|
---|
853 | srlx %o0,32,%o1
|
---|
854 | andn %o2,%g4,%o2
|
---|
855 | andn %o3,%g4,%o3
|
---|
856 | or %o2,%o0,%o0
|
---|
857 | or %o3,%o1,%o1
|
---|
858 | st %o0,[%g1+0]
|
---|
859 | add %o7,8,%o7
|
---|
860 | brnz,pt %o7,.Lcopy
|
---|
861 | st %o1,[%g1+4]
|
---|
862 | sub %g0,$num,%o7 ! n=-num
|
---|
863 |
|
---|
864 | .Lzap:
|
---|
865 | stx %g0,[$ap_l+%o7]
|
---|
866 | stx %g0,[$ap_h+%o7]
|
---|
867 | stx %g0,[$np_l+%o7]
|
---|
868 | stx %g0,[$np_h+%o7]
|
---|
869 | add %o7,8,%o7
|
---|
870 | brnz,pt %o7,.Lzap
|
---|
871 | nop
|
---|
872 |
|
---|
873 | ldx [%sp+$bias+$frame+48],%o7
|
---|
874 | wr %g0,%o7,%asi ! restore %asi
|
---|
875 |
|
---|
876 | mov 1,%i0
|
---|
877 | .Lret:
|
---|
878 | ret
|
---|
879 | restore
|
---|
880 | .type $fname,#function
|
---|
881 | .size $fname,(.-$fname)
|
---|
882 | .asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
883 | .align 32
|
---|
884 | ___
|
---|
885 |
|
---|
886 | $code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
---|
887 |
|
---|
888 | # Below substitution makes it possible to compile without demanding
|
---|
889 | # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
|
---|
890 | # dare to do this, because VIS capability is detected at run-time now
|
---|
891 | # and this routine is not called on CPU not capable to execute it. Do
|
---|
892 | # note that fzeros is not the only VIS dependency! Another dependency
|
---|
893 | # is implicit and is just _a_ numerical value loaded to %asi register,
|
---|
894 | # which assembler can't recognize as VIS specific...
|
---|
895 | $code =~ s/fzeros\s+%f([0-9]+)/
|
---|
896 | sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
|
---|
897 | /gem;
|
---|
898 |
|
---|
899 | print $code;
|
---|
900 | # flush
|
---|
901 | close STDOUT or die "error closing STDOUT: $!";
|
---|