1 | ; Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
|
---|
2 | ;
|
---|
3 | ; Licensed under the OpenSSL license (the "License"). You may not use
|
---|
4 | ; this file except in compliance with the License. You can obtain a copy
|
---|
5 | ; in the file LICENSE in the source distribution or at
|
---|
6 | ; https://www.openssl.org/source/license.html
|
---|
7 |
|
---|
8 | ;
|
---|
9 | ; PA-RISC 64-bit implementation of bn_asm code
|
---|
10 | ;
|
---|
11 | ; This code is approximately 2x faster than the C version
|
---|
12 | ; for RSA/DSA.
|
---|
13 | ;
|
---|
14 | ; See http://devresource.hp.com/ for more details on the PA-RISC
|
---|
15 | ; architecture. Also see the book "PA-RISC 2.0 Architecture"
|
---|
16 | ; by Gerry Kane for information on the instruction set architecture.
|
---|
17 | ;
|
---|
18 | ; Code written by Chris Ruemmler (with some help from the HP C
|
---|
19 | ; compiler).
|
---|
20 | ;
|
---|
21 | ; The code compiles with HP's assembler
|
---|
22 | ;
|
---|
23 |
|
---|
24 | .level 2.0W
|
---|
25 | .space $TEXT$
|
---|
26 | .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
|
---|
27 |
|
---|
28 | ;
|
---|
29 | ; Global Register definitions used for the routines.
|
---|
30 | ;
|
---|
31 | ; Some information about HP's runtime architecture for 64-bits.
|
---|
32 | ;
|
---|
33 | ; "Caller save" means the calling function must save the register
|
---|
34 | ; if it wants the register to be preserved.
|
---|
35 | ; "Callee save" means if a function uses the register, it must save
|
---|
36 | ; the value before using it.
|
---|
37 | ;
|
---|
38 | ; For the floating point registers
|
---|
39 | ;
|
---|
40 | ; "caller save" registers: fr4-fr11, fr22-fr31
|
---|
41 | ; "callee save" registers: fr12-fr21
|
---|
42 | ; "special" registers: fr0-fr3 (status and exception registers)
|
---|
43 | ;
|
---|
44 | ; For the integer registers
|
---|
45 | ; value zero : r0
|
---|
46 | ; "caller save" registers: r1,r19-r26
|
---|
47 | ; "callee save" registers: r3-r18
|
---|
48 | ; return register : r2 (rp)
|
---|
49 | ; return values ; r28 (ret0,ret1)
|
---|
50 | ; Stack pointer ; r30 (sp)
|
---|
51 | ; global data pointer ; r27 (dp)
|
---|
52 | ; argument pointer ; r29 (ap)
|
---|
53 | ; millicode return ptr ; r31 (also a caller save register)
|
---|
54 |
|
---|
55 |
|
---|
56 | ;
|
---|
57 | ; Arguments to the routines
|
---|
58 | ;
|
---|
; Argument register aliases.  Two pairs of names share a register because
; the routines take different third/fourth arguments:
;   b_ptr and num are both %r24, w and n are both %r23.
59 | r_ptr .reg %r26 ; arg0: result pointer
|
---|
60 | a_ptr .reg %r25 ; arg1: first source pointer
|
---|
61 | b_ptr .reg %r24 ; arg2 of bn_add_words/bn_sub_words (second source pointer)
|
---|
62 | num .reg %r24 ; arg2 of the mul/sqr routines (word count); same register as b_ptr
|
---|
63 | w .reg %r23 ; arg3 of the mul routines (multiplier word)
|
---|
64 | n .reg %r23 ; arg3 of bn_add_words/bn_sub_words (word count); same register as w
|
---|
65 |
|
---|
66 |
|
---|
67 | ;
|
---|
68 | ; Globals used in some routines
|
---|
69 | ;
|
---|
70 |
|
---|
71 | top_overflow .reg %r29 ; loaded with 1 << 32 by the multiply routines
|
---|
72 | high_mask .reg %r22 ; value 0xffffffff80000000L
|
---|
73 |
|
---|
74 |
|
---|
75 | ;------------------------------------------------------------------------------
|
---|
76 | ;
|
---|
77 | ; bn_mul_add_words
|
---|
78 | ;
|
---|
79 | ;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
|
---|
80 | ; int num, BN_ULONG w)
|
---|
81 | ;
|
---|
82 | ; arg0 = r_ptr
|
---|
83 | ; arg1 = a_ptr
|
---|
84 | ; arg2 = num
|
---|
85 | ; arg3 = w
|
---|
86 | ;
|
---|
87 | ; Local register definitions
|
---|
88 | ;
|
---|
89 |
|
---|
; Register aliases shared by bn_mul_add_words, bn_mul_words and bn_sqr_words.
; Names without a suffix (or with _0) belong to the first word of an unrolled
; pair, names ending in _1 to the second word.
;
; FP registers that receive the 64-bit XMPYU partial products.
90 | fm1 .reg %fr22
|
---|
91 | fm .reg %fr23
|
---|
92 | ht_temp .reg %fr24
|
---|
93 | ht_temp_1 .reg %fr25
|
---|
94 | lt_temp .reg %fr26
|
---|
95 | lt_temp_1 .reg %fr27
|
---|
96 | fm1_1 .reg %fr28
|
---|
97 | fm_1 .reg %fr29
|
---|
98 |
|
---|
; The multiplier w: whole register plus its two 32-bit halves.
99 | fw_h .reg %fr7L
|
---|
100 | fw_l .reg %fr7R
|
---|
101 | fw .reg %fr7
|
---|
102 |
|
---|
; First source word: whole register plus its two 32-bit halves.
103 | fht_0 .reg %fr8L
|
---|
104 | flt_0 .reg %fr8R
|
---|
105 | t_float_0 .reg %fr8
|
---|
106 |
|
---|
; Second source word: whole register plus its two 32-bit halves.
107 | fht_1 .reg %fr9L
|
---|
108 | flt_1 .reg %fr9R
|
---|
109 | t_float_1 .reg %fr9
|
---|
110 |
|
---|
; Integer scratch registers.  %r3-%r9 fall in the callee-save range listed
; in the file header, so each routine stores the ones it uses on entry.
111 | tmp_0 .reg %r31
|
---|
112 | tmp_1 .reg %r21
|
---|
113 | m_0 .reg %r20
|
---|
114 | m_1 .reg %r19
|
---|
115 | ht_0 .reg %r1
|
---|
116 | ht_1 .reg %r3
|
---|
117 | lt_0 .reg %r4
|
---|
118 | lt_1 .reg %r5
|
---|
119 | m1_0 .reg %r6
|
---|
120 | m1_1 .reg %r7
|
---|
121 | rp_val .reg %r8
|
---|
122 | rp_val_1 .reg %r9
|
---|
123 |
|
---|
; bn_mul_add_words: for each of the num words, computes a[i]*w + r[i] + c,
; stores the low 64 bits back to r[i] and keeps the high 64 bits as the
; carry c for the next word.  The final carry is returned in %ret0
; (0 when num <= 0).
;
; Frame: %r3-%r9 are stored at 0..48(%sp) and w at 56(%sp), then %sp is
; advanced by 128, so afterwards they sit at -128..-80(%sp) and -72(%sp).
; -8..-64(%sp) are scratch slots used to move XMPYU products from the FP
; registers to the integer registers.
124 | bn_mul_add_words
|
---|
125 | .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
|
---|
126 | .proc
|
---|
127 | .callinfo frame=128
|
---|
128 | .entry
|
---|
129 | .align 64
|
---|
130 |
|
---|
131 | STD %r3,0(%sp) ; save r3
|
---|
132 | STD %r4,8(%sp) ; save r4
|
---|
133 | NOP ; Needed to make the loop 16-byte aligned
|
---|
134 | NOP ; Needed to make the loop 16-byte aligned
|
---|
135 |
|
---|
136 | STD %r5,16(%sp) ; save r5
|
---|
137 | STD %r6,24(%sp) ; save r6
|
---|
138 | STD %r7,32(%sp) ; save r7
|
---|
139 | STD %r8,40(%sp) ; save r8
|
---|
140 |
|
---|
141 | STD %r9,48(%sp) ; save r9
|
---|
142 | COPY %r0,%ret0 ; return 0 by default
|
---|
143 | DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
|
---|
144 | STD w,56(%sp) ; store w on stack
|
---|
145 |
|
---|
146 | CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
|
---|
147 | LDO 128(%sp),%sp ; bump stack (delay slot: the exit path restores relative to the bumped sp)
|
---|
148 |
|
---|
149 | ;
|
---|
150 | ; The loop is unrolled twice, so if there is only 1 number
|
---|
151 | ; then go straight to the cleanup code.
|
---|
152 | ;
|
---|
153 | CMPIB,= 1,num,bn_mul_add_words_single_top
|
---|
154 | FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l); -72 is the 56(sp) slot before the bump
|
---|
155 |
|
---|
156 | ;
|
---|
157 | ; This loop is unrolled 2 times (64-byte aligned as well)
|
---|
158 | ;
|
---|
159 | ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
|
---|
160 | ; two 32-bit multiplies can be issued per cycle.
|
---|
161 | ;
|
---|
162 | bn_mul_add_words_unroll2
|
---|
163 |
|
---|
164 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
|
---|
165 | FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9) ht(L)/lt(R)
|
---|
166 | LDD 0(r_ptr),rp_val ; rp[0]
|
---|
167 | LDD 8(r_ptr),rp_val_1 ; rp[1]
|
---|
168 |
|
---|
169 | XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
|
---|
170 | XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
|
---|
171 | FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
|
---|
172 | FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
|
---|
173 |
|
---|
174 | XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
|
---|
175 | XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
|
---|
176 | FSTD fm,-8(%sp) ; -8(sp) = m[0]
|
---|
177 | FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
|
---|
178 |
|
---|
179 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
|
---|
180 | XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
|
---|
181 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
|
---|
182 | FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
|
---|
183 |
|
---|
184 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l
|
---|
185 | XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
|
---|
186 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
|
---|
187 | FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
|
---|
188 |
|
---|
189 | LDD -8(%sp),m_0 ; m[0]
|
---|
190 | LDD -40(%sp),m_1 ; m[1]
|
---|
191 | LDD -16(%sp),m1_0 ; m1[0]
|
---|
192 | LDD -48(%sp),m1_1 ; m1[1]
|
---|
193 |
|
---|
194 | LDD -24(%sp),ht_0 ; ht[0]
|
---|
195 | LDD -56(%sp),ht_1 ; ht[1]
|
---|
196 | ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
|
---|
197 | ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
|
---|
198 |
|
---|
199 | LDD -32(%sp),lt_0 ; lt[0]
|
---|
200 | LDD -64(%sp),lt_1 ; lt[1]
|
---|
201 | CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] + m1[0] wrapped, i.e. sum < m1[0])
|
---|
202 | ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
|
---|
203 |
|
---|
204 | CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] + m1[1] wrapped, i.e. sum < m1[1])
|
---|
205 | ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
|
---|
206 | EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
|
---|
207 | DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
|
---|
208 |
|
---|
209 | EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
|
---|
210 | DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
|
---|
211 | ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
|
---|
212 | ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
|
---|
213 |
|
---|
214 | ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
|
---|
215 | ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
|
---|
216 | ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
|
---|
217 | ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
|
---|
218 |
|
---|
219 | ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
|
---|
220 | ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
|
---|
221 | ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
|
---|
222 | ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
|
---|
223 |
|
---|
224 | LDO -2(num),num ; num = num - 2;
|
---|
225 | ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
|
---|
226 | ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
|
---|
227 | STD lt_0,0(r_ptr) ; rp[0] = lt[0]
|
---|
228 |
|
---|
229 | ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
|
---|
230 | ADD,DC ht_1,%r0,%ret0 ; c = ht[1] + carry (carry into the next pair / return value)
|
---|
231 | LDO 16(a_ptr),a_ptr ; a_ptr += 2
|
---|
232 |
|
---|
233 | STD lt_1,8(r_ptr) ; rp[1] = lt[1]
|
---|
234 | CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
|
---|
235 | LDO 16(r_ptr),r_ptr ; r_ptr += 2
|
---|
236 |
|
---|
237 | CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
|
---|
238 |
|
---|
239 | ;
|
---|
240 | ; Top of loop aligned on 64-byte boundary
|
---|
241 | ;
|
---|
; Handles the single remaining word (odd num, or num == 1).
242 | bn_mul_add_words_single_top
|
---|
243 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
|
---|
244 | LDD 0(r_ptr),rp_val ; rp[0]
|
---|
245 | LDO 8(a_ptr),a_ptr ; a_ptr++
|
---|
246 | XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
|
---|
247 | FSTD fm1,-16(%sp) ; -16(sp) = m1
|
---|
248 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h
|
---|
249 | FSTD fm,-8(%sp) ; -8(sp) = m
|
---|
250 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
|
---|
251 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht
|
---|
252 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
|
---|
253 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt
|
---|
254 |
|
---|
255 | LDD -8(%sp),m_0 ; m
|
---|
256 | LDD -16(%sp),m1_0 ; m1 = temp1
|
---|
257 | ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
|
---|
258 | LDD -24(%sp),ht_0 ; ht
|
---|
259 | LDD -32(%sp),lt_0 ; lt
|
---|
260 |
|
---|
261 | CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m + m1 wrapped, i.e. sum < m1)
|
---|
262 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
|
---|
263 |
|
---|
264 | EXTRD,U tmp_0,31,32,m_0 ; m>>32
|
---|
265 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
|
---|
266 |
|
---|
267 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
|
---|
268 | ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
|
---|
269 | ADD,DC ht_0,%r0,ht_0 ; ht += carry
|
---|
270 | ADD %ret0,tmp_0,lt_0 ; lt = lt + c;
|
---|
271 | ADD,DC ht_0,%r0,ht_0 ; ht += carry
|
---|
272 | ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
|
---|
273 | ADD,DC ht_0,%r0,%ret0 ; return value = ht + carry
|
---|
274 | STD lt_0,0(r_ptr) ; rp[0] = lt
|
---|
275 |
|
---|
276 | bn_mul_add_words_exit
|
---|
277 | .EXIT
|
---|
278 | LDD -80(%sp),%r9 ; restore r9
|
---|
279 | LDD -88(%sp),%r8 ; restore r8
|
---|
280 | LDD -96(%sp),%r7 ; restore r7
|
---|
281 | LDD -104(%sp),%r6 ; restore r6
|
---|
282 | LDD -112(%sp),%r5 ; restore r5
|
---|
283 | LDD -120(%sp),%r4 ; restore r4
|
---|
284 | BVE (%rp)
|
---|
285 | LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (sp -= 128)
|
---|
286 | .PROCEND ;in=23,24,25,26,29;out=28;
|
---|
287 |
|
---|
288 | ;----------------------------------------------------------------------------
|
---|
289 | ;
|
---|
290 | ;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
|
---|
291 | ;
|
---|
292 | ; arg0 = rp
|
---|
293 | ; arg1 = ap
|
---|
294 | ; arg2 = num
|
---|
295 | ; arg3 = w
|
---|
296 |
|
---|
; bn_mul_words: for each of the num words, computes a[i]*w + c, stores the
; low 64 bits to r[i] and keeps the high 64 bits as the carry c for the
; next word.  The final carry is returned in %ret0 (0 when num <= 0).
;
; Frame: %r3-%r7 are stored at 0..32(%sp) and w at 56(%sp), then %sp is
; advanced by 128.  -8..-64(%sp) are scratch slots used to move XMPYU
; products from the FP registers to the integer registers.
297 | bn_mul_words
|
---|
298 | .proc
|
---|
299 | .callinfo frame=128
|
---|
300 | .entry
|
---|
301 | .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
|
---|
302 | .align 64
|
---|
303 |
|
---|
304 | STD %r3,0(%sp) ; save r3
|
---|
305 | STD %r4,8(%sp) ; save r4
|
---|
306 | STD %r5,16(%sp) ; save r5
|
---|
307 | STD %r6,24(%sp) ; save r6
|
---|
308 |
|
---|
309 | STD %r7,32(%sp) ; save r7
|
---|
310 | COPY %r0,%ret0 ; return 0 by default
|
---|
311 | DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
|
---|
312 | STD w,56(%sp) ; w on stack
|
---|
313 |
|
---|
314 | CMPIB,>= 0,num,bn_mul_words_exit ; if (num <= 0) then exit
|
---|
315 | LDO 128(%sp),%sp ; bump stack (delay slot: the exit path restores relative to the bumped sp)
|
---|
316 |
|
---|
317 | ;
|
---|
318 | ; See if only 1 word to do, thus just do cleanup
|
---|
319 | ;
|
---|
320 | CMPIB,= 1,num,bn_mul_words_single_top
|
---|
321 | FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
|
---|
322 |
|
---|
323 | ;
|
---|
324 | ; This loop is unrolled 2 times (64-byte aligned as well)
|
---|
325 | ;
|
---|
326 | ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
|
---|
327 | ; two 32-bit multiplies can be issued per cycle.
|
---|
328 | ;
|
---|
329 | bn_mul_words_unroll2
|
---|
330 |
|
---|
331 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
|
---|
332 | FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9) ht(L)/lt(R)
|
---|
333 | XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
|
---|
334 | XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
|
---|
335 |
|
---|
336 | FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
|
---|
337 | FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
|
---|
338 | XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
|
---|
339 | XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
|
---|
340 |
|
---|
341 | FSTD fm,-8(%sp) ; -8(sp) = m[0]
|
---|
342 | FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
|
---|
343 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
|
---|
344 | XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
|
---|
345 |
|
---|
346 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht[0]
|
---|
347 | FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht[1]
|
---|
348 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l
|
---|
349 | XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
|
---|
350 |
|
---|
351 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt[0]
|
---|
352 | FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt[1]
|
---|
353 | LDD -8(%sp),m_0 ; m[0]
|
---|
354 | LDD -40(%sp),m_1 ; m[1]
|
---|
355 |
|
---|
356 | LDD -16(%sp),m1_0 ; m1[0]
|
---|
357 | LDD -48(%sp),m1_1 ; m1[1]
|
---|
358 | LDD -24(%sp),ht_0 ; ht[0]
|
---|
359 | LDD -56(%sp),ht_1 ; ht[1]
|
---|
360 |
|
---|
361 | ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
|
---|
362 | ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
|
---|
363 | LDD -32(%sp),lt_0 ; lt[0]
|
---|
364 | LDD -64(%sp),lt_1 ; lt[1]
|
---|
365 |
|
---|
366 | CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m + m1 wrapped, i.e. sum < m1)
|
---|
367 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
|
---|
368 | CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m + m1 wrapped, i.e. sum < m1)
|
---|
369 | ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
|
---|
370 |
|
---|
371 | EXTRD,U tmp_0,31,32,m_0 ; m>>32
|
---|
372 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
|
---|
373 | EXTRD,U tmp_1,31,32,m_1 ; m>>32
|
---|
374 | DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
|
---|
375 |
|
---|
376 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
|
---|
377 | ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
|
---|
378 | ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
|
---|
379 | ADD,DC ht_0,%r0,ht_0 ; ht += carry
|
---|
380 |
|
---|
381 | ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
|
---|
382 | ADD,DC ht_1,%r0,ht_1 ; ht += carry
|
---|
383 | ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0);
|
---|
384 | ADD,DC ht_0,%r0,ht_0 ; ht += carry
|
---|
385 |
|
---|
386 | ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
|
---|
387 | ADD,DC ht_1,%r0,ht_1 ; ht += carry
|
---|
388 | STD lt_0,0(r_ptr) ; rp[0] = lt
|
---|
389 | STD lt_1,8(r_ptr) ; rp[1] = lt
|
---|
390 |
|
---|
391 | COPY ht_1,%ret0 ; carry = ht
|
---|
392 | LDO -2(num),num ; num = num - 2;
|
---|
393 | LDO 16(a_ptr),a_ptr ; ap += 2
|
---|
394 | CMPIB,<= 2,num,bn_mul_words_unroll2 ; go again if at least 2 words remain
|
---|
395 | LDO 16(r_ptr),r_ptr ; rp += 2
|
---|
396 |
|
---|
397 | CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
|
---|
398 |
|
---|
399 | ;
|
---|
400 | ; Top of loop aligned on 64-byte boundary
|
---|
401 | ;
|
---|
; Handles the single remaining word (odd num, or num == 1).
402 | bn_mul_words_single_top
|
---|
403 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
|
---|
404 |
|
---|
405 | XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
|
---|
406 | FSTD fm1,-16(%sp) ; -16(sp) = m1
|
---|
407 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h
|
---|
408 | FSTD fm,-8(%sp) ; -8(sp) = m
|
---|
409 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
|
---|
410 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht
|
---|
411 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
|
---|
412 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt
|
---|
413 |
|
---|
414 | LDD -8(%sp),m_0 ; m
|
---|
415 | LDD -16(%sp),m1_0 ; m1
|
---|
416 | ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
|
---|
417 | LDD -24(%sp),ht_0 ; ht
|
---|
418 | LDD -32(%sp),lt_0 ; lt
|
---|
419 |
|
---|
420 | CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m + m1 wrapped, i.e. sum < m1)
|
---|
421 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
|
---|
422 |
|
---|
423 | EXTRD,U tmp_0,31,32,m_0 ; m>>32
|
---|
424 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
|
---|
425 |
|
---|
426 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
|
---|
427 | ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
|
---|
428 | ADD,DC ht_0,%r0,ht_0 ; ht += carry
|
---|
429 |
|
---|
430 | ADD %ret0,lt_0,lt_0 ; lt = lt + c;
|
---|
431 | ADD,DC ht_0,%r0,ht_0 ; ht += carry
|
---|
432 |
|
---|
433 | COPY ht_0,%ret0 ; copy carry
|
---|
434 | STD lt_0,0(r_ptr) ; rp[0] = lt
|
---|
435 |
|
---|
436 | bn_mul_words_exit
|
---|
437 | .EXIT
|
---|
438 | LDD -96(%sp),%r7 ; restore r7
|
---|
439 | LDD -104(%sp),%r6 ; restore r6
|
---|
440 | LDD -112(%sp),%r5 ; restore r5
|
---|
441 | LDD -120(%sp),%r4 ; restore r4
|
---|
442 | BVE (%rp)
|
---|
443 | LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (sp -= 128)
|
---|
444 | .PROCEND ;in=23,24,25,26,29;out=28;
|
---|
445 |
|
---|
446 | ;----------------------------------------------------------------------------
|
---|
447 | ;
|
---|
448 | ;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
|
---|
449 | ;
|
---|
450 | ; arg0 = rp
|
---|
451 | ; arg1 = ap
|
---|
452 | ; arg2 = num
|
---|
453 | ;
|
---|
454 |
|
---|
; bn_sqr_words: squares each of the num source words.  Word i produces two
; result words: the low 64 bits of a[i]^2 go to r[2*i] and the high 64 bits
; to r[2*i+1].  Nothing is carried between words and no value is returned.
;
; Per word the three 32x32 products are lt = l*l, ht = h*h and the cross
; product m = h*l.  2*m*2^32 is folded in as (m << 33) added to lt and
; ((m & high_mask) >> 31) added to ht, plus the carry out of the low add.
;
; Frame: %r3-%r5 are stored at 0..16(%sp), then %sp is advanced by 128.
455 | bn_sqr_words
|
---|
456 | .proc
|
---|
457 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
|
---|
458 | .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
|
---|
459 | .entry
|
---|
460 | .align 64
|
---|
461 |
|
---|
462 | STD %r3,0(%sp) ; save r3
|
---|
463 | STD %r4,8(%sp) ; save r4
|
---|
464 | NOP
|
---|
465 | STD %r5,16(%sp) ; save r5
|
---|
466 |
|
---|
467 | CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) then exit
|
---|
468 | LDO 128(%sp),%sp ; bump stack (delay slot: the exit path restores relative to the bumped sp)
|
---|
469 |
|
---|
470 | ;
|
---|
471 | ; If only 1, then go straight to cleanup
|
---|
472 | ;
|
---|
473 | CMPIB,= 1,num,bn_sqr_words_single_top
|
---|
474 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (delay slot: needed by both paths)
|
---|
475 |
|
---|
476 | ;
|
---|
477 | ; This loop is unrolled 2 times (64-byte aligned as well)
|
---|
478 | ;
|
---|
479 |
|
---|
480 | bn_sqr_words_unroll2
|
---|
481 | FLDD 0(a_ptr),t_float_0 ; a[0]
|
---|
482 | FLDD 8(a_ptr),t_float_1 ; a[1]
|
---|
483 | XMPYU fht_0,flt_0,fm ; m[0]
|
---|
484 | XMPYU fht_1,flt_1,fm_1 ; m[1]
|
---|
485 |
|
---|
486 | FSTD fm,-24(%sp) ; store m[0]
|
---|
487 | FSTD fm_1,-56(%sp) ; store m[1]
|
---|
488 | XMPYU flt_0,flt_0,lt_temp ; lt[0]
|
---|
489 | XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
|
---|
490 |
|
---|
491 | FSTD lt_temp,-16(%sp) ; store lt[0]
|
---|
492 | FSTD lt_temp_1,-48(%sp) ; store lt[1]
|
---|
493 | XMPYU fht_0,fht_0,ht_temp ; ht[0]
|
---|
494 | XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
|
---|
495 |
|
---|
496 | FSTD ht_temp,-8(%sp) ; store ht[0]
|
---|
497 | FSTD ht_temp_1,-40(%sp) ; store ht[1]
|
---|
498 | LDD -24(%sp),m_0 ; m[0]
|
---|
499 | LDD -56(%sp),m_1 ; m[1]
|
---|
500 |
|
---|
501 | AND m_0,high_mask,tmp_0 ; m[0] & Mask
|
---|
502 | AND m_1,high_mask,tmp_1 ; m[1] & Mask
|
---|
503 | DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
|
---|
504 | DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
|
---|
505 |
|
---|
506 | LDD -16(%sp),lt_0 ; lt[0]
|
---|
507 | LDD -48(%sp),lt_1 ; lt[1]
|
---|
508 | EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
|
---|
509 | EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
|
---|
510 |
|
---|
511 | LDD -8(%sp),ht_0 ; ht[0]
|
---|
512 | LDD -40(%sp),ht_1 ; ht[1]
|
---|
513 | ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
|
---|
514 | ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
|
---|
515 |
|
---|
516 | ADD lt_0,m_0,lt_0 ; lt = lt+m
|
---|
517 | ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
|
---|
518 | STD lt_0,0(r_ptr) ; rp[0] = lt[0]
|
---|
519 | STD ht_0,8(r_ptr) ; rp[1] = ht[0]
|
---|
520 |
|
---|
521 | ADD lt_1,m_1,lt_1 ; lt = lt+m
|
---|
522 | ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
|
---|
523 | STD lt_1,16(r_ptr) ; rp[2] = lt[1]
|
---|
524 | STD ht_1,24(r_ptr) ; rp[3] = ht[1]
|
---|
525 |
|
---|
526 | LDO -2(num),num ; num = num - 2;
|
---|
527 | LDO 16(a_ptr),a_ptr ; ap += 2
|
---|
528 | CMPIB,<= 2,num,bn_sqr_words_unroll2 ; go again if at least 2 words remain
|
---|
529 | LDO 32(r_ptr),r_ptr ; rp += 4
|
---|
530 |
|
---|
531 | CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
|
---|
532 |
|
---|
533 | ;
|
---|
534 | ; Top of loop aligned on 64-byte boundary
|
---|
535 | ;
|
---|
; Handles the single remaining word (odd num, or num == 1).
536 | bn_sqr_words_single_top
|
---|
537 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
|
---|
538 |
|
---|
539 | XMPYU fht_0,flt_0,fm ; m
|
---|
540 | FSTD fm,-24(%sp) ; store m
|
---|
541 |
|
---|
542 | XMPYU flt_0,flt_0,lt_temp ; lt
|
---|
543 | FSTD lt_temp,-16(%sp) ; store lt
|
---|
544 |
|
---|
545 | XMPYU fht_0,fht_0,ht_temp ; ht
|
---|
546 | FSTD ht_temp,-8(%sp) ; store ht
|
---|
547 |
|
---|
548 | LDD -24(%sp),m_0 ; load m
|
---|
549 | AND m_0,high_mask,tmp_0 ; m & Mask
|
---|
550 | DEPD,Z m_0,30,31,m_0 ; m << 32+1
|
---|
551 | LDD -16(%sp),lt_0 ; lt
|
---|
552 |
|
---|
553 | LDD -8(%sp),ht_0 ; ht
|
---|
554 | EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
|
---|
555 | ADD m_0,lt_0,lt_0 ; lt = lt+m
|
---|
556 | ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
|
---|
557 | ADD,DC ht_0,%r0,ht_0 ; ht += carry from the lt add two lines up
|
---|
558 |
|
---|
559 | STD lt_0,0(r_ptr) ; rp[0] = lt
|
---|
560 | STD ht_0,8(r_ptr) ; rp[1] = ht
|
---|
561 |
|
---|
562 | bn_sqr_words_exit
|
---|
563 | .EXIT
|
---|
564 | LDD -112(%sp),%r5 ; restore r5
|
---|
565 | LDD -120(%sp),%r4 ; restore r4
|
---|
566 | BVE (%rp)
|
---|
567 | LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (sp -= 128)
|
---|
568 | .PROCEND ;in=23,24,25,26,29;out=28;
|
---|
569 |
|
---|
570 |
|
---|
571 | ;----------------------------------------------------------------------------
|
---|
572 | ;
|
---|
573 | ;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
|
---|
574 | ;
|
---|
575 | ; arg0 = rp
|
---|
576 | ; arg1 = ap
|
---|
577 | ; arg2 = bp
|
---|
578 | ; arg3 = n
|
---|
579 |
|
---|
; Scratch registers for bn_add_words.
580 | t .reg %r22 ; current word of a, then a[i] + c
|
---|
581 | b .reg %r21 ; current word of b
|
---|
582 | l .reg %r20 ; result word written to r
|
---|
583 |
|
---|
; bn_add_words: r[i] = a[i] + b[i] + c for i in 0..n-1, where c is the carry
; out of the previous word (0 for the first).  The final carry is returned
; in %ret0 (0 when n <= 0).  Uses no stack and no callee-save registers.
584 | bn_add_words
|
---|
585 | .proc
|
---|
586 | .entry
|
---|
587 | .callinfo
|
---|
588 | .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
|
---|
589 | .align 64
|
---|
590 |
|
---|
591 | CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit
|
---|
592 | COPY %r0,%ret0 ; return 0 by default
|
---|
593 |
|
---|
594 | ;
|
---|
595 | ; If 2 or more numbers do the loop
|
---|
596 | ;
|
---|
597 | CMPIB,= 1,n,bn_add_words_single_top
|
---|
598 | NOP
|
---|
599 |
|
---|
600 | ;
|
---|
601 | ; This loop is unrolled 2 times (64-byte aligned as well)
|
---|
602 | ;
|
---|
603 | bn_add_words_unroll2
|
---|
604 | LDD 0(a_ptr),t ; t = a[0]
|
---|
605 | LDD 0(b_ptr),b ; b = b[0]
|
---|
606 | ADD t,%ret0,t ; t = t+c;
|
---|
607 | ADD,DC %r0,%r0,%ret0 ; set c to carry
|
---|
608 | ADD t,b,l ; l = t + b[0]
|
---|
609 | ADD,DC %ret0,%r0,%ret0 ; c+= carry
|
---|
610 | STD l,0(r_ptr) ; r[0] = l
|
---|
611 |
|
---|
612 | LDD 8(a_ptr),t ; t = a[1]
|
---|
613 | LDD 8(b_ptr),b ; b = b[1]
|
---|
614 | ADD t,%ret0,t ; t = t+c;
|
---|
615 | ADD,DC %r0,%r0,%ret0 ; set c to carry
|
---|
616 | ADD t,b,l ; l = t + b[1]
|
---|
617 | ADD,DC %ret0,%r0,%ret0 ; c+= carry
|
---|
618 | STD l,8(r_ptr) ; r[1] = l
|
---|
619 |
|
---|
620 | LDO -2(n),n ; n -= 2
|
---|
621 | LDO 16(a_ptr),a_ptr ; a += 2
|
---|
622 | LDO 16(b_ptr),b_ptr ; b += 2
|
---|
623 |
|
---|
624 | CMPIB,<= 2,n,bn_add_words_unroll2 ; go again if at least 2 words remain
|
---|
625 | LDO 16(r_ptr),r_ptr ; r += 2
|
---|
626 |
|
---|
627 | CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
|
---|
628 |
|
---|
; Handles the single remaining word (odd n, or n == 1).
629 | bn_add_words_single_top
|
---|
630 | LDD 0(a_ptr),t ; t = a[0]
|
---|
631 | LDD 0(b_ptr),b ; b = b[0]
|
---|
632 |
|
---|
633 | ADD t,%ret0,t ; t = t+c;
|
---|
634 | ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??)
|
---|
635 | ADD t,b,l ; l = t + b[0]
|
---|
636 | ADD,DC %ret0,%r0,%ret0 ; c+= carry
|
---|
637 | STD l,0(r_ptr) ; r[0] = l
|
---|
638 |
|
---|
639 | bn_add_words_exit
|
---|
640 | .EXIT
|
---|
641 | BVE (%rp)
|
---|
642 | NOP
|
---|
643 | .PROCEND ;in=23,24,25,26,29;out=28;
|
---|
644 |
|
---|
645 | ;----------------------------------------------------------------------------
|
---|
646 | ;
|
---|
647 | ;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
|
---|
648 | ;
|
---|
649 | ; arg0 = rp
|
---|
650 | ; arg1 = ap
|
---|
651 | ; arg2 = bp
|
---|
652 | ; arg3 = n
|
---|
653 |
|
---|
; Scratch registers for bn_sub_words.
654 | t1 .reg %r22 ; current word of a
|
---|
655 | t2 .reg %r21 ; current word of b
|
---|
656 | sub_tmp1 .reg %r20 ; result word written to r
|
---|
657 | sub_tmp2 .reg %r19 ; candidate borrow: 0 if t1 > t2, else 1
|
---|
658 |
|
---|
659 |
|
---|
; bn_sub_words: r[i] = a[i] - b[i] - c for i in 0..n-1, where c is the borrow
; from the previous word (0 for the first).  The final borrow is returned in
; %ret0 (0 when n <= 0).  Uses no stack and no callee-save registers.
;
; Borrow update per word, as coded below: sub_tmp2 is cleared and the
; following LDO (which would set it to 1) is skipped when t1 > t2; the COPY
; into c is skipped when t1 == t2, so the previous borrow is kept in that
; case.  Net effect: if (t1 != t2) c = (t1 < t2).
660 | bn_sub_words
|
---|
661 | .proc
|
---|
662 | .callinfo
|
---|
663 | .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
|
---|
664 | .entry
|
---|
665 | .align 64
|
---|
666 |
|
---|
667 | CMPIB,>= 0,n,bn_sub_words_exit ; if (n <= 0) then exit
|
---|
668 | COPY %r0,%ret0 ; return 0 by default
|
---|
669 |
|
---|
670 | ;
|
---|
671 | ; If 2 or more numbers do the loop
|
---|
672 | ;
|
---|
673 | CMPIB,= 1,n,bn_sub_words_single_top
|
---|
674 | NOP
|
---|
675 |
|
---|
676 | ;
|
---|
677 | ; This loop is unrolled 2 times (64-byte aligned as well)
|
---|
678 | ;
|
---|
679 | bn_sub_words_unroll2
|
---|
680 | LDD 0(a_ptr),t1 ; t1 = a[0]
|
---|
681 | LDD 0(b_ptr),t2 ; t2 = b[0]
|
---|
682 | SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
|
---|
683 | SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
|
---|
684 |
|
---|
685 | CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
|
---|
686 | LDO 1(%r0),sub_tmp2 ; otherwise sub_tmp2 = 1
|
---|
687 |
|
---|
688 | CMPCLR,*= t1,t2,%r0 ; if (t1 == t2) keep the old c
|
---|
689 | COPY sub_tmp2,%ret0 ; else c = sub_tmp2
|
---|
690 | STD sub_tmp1,0(r_ptr) ; r[0] = t3
|
---|
691 |
|
---|
692 | LDD 8(a_ptr),t1 ; t1 = a[1]
|
---|
693 | LDD 8(b_ptr),t2 ; t2 = b[1]
|
---|
694 | SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
|
---|
695 | SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
|
---|
696 | CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
|
---|
697 | LDO 1(%r0),sub_tmp2 ; otherwise sub_tmp2 = 1
|
---|
698 |
|
---|
699 | CMPCLR,*= t1,t2,%r0 ; if (t1 == t2) keep the old c
|
---|
700 | COPY sub_tmp2,%ret0 ; else c = sub_tmp2
|
---|
701 | STD sub_tmp1,8(r_ptr) ; r[1] = t3
|
---|
702 |
|
---|
703 | LDO -2(n),n ; n -= 2
|
---|
704 | LDO 16(a_ptr),a_ptr ; a += 2
|
---|
705 | LDO 16(b_ptr),b_ptr ; b += 2
|
---|
706 |
|
---|
707 | CMPIB,<= 2,n,bn_sub_words_unroll2 ; go again if at least 2 words remain
|
---|
708 | LDO 16(r_ptr),r_ptr ; r += 2
|
---|
709 |
|
---|
710 | CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
|
---|
711 |
|
---|
; Handles the single remaining word (odd n, or n == 1).
712 | bn_sub_words_single_top
|
---|
713 | LDD 0(a_ptr),t1 ; t1 = a[0]
|
---|
714 | LDD 0(b_ptr),t2 ; t2 = b[0]
|
---|
715 | SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
|
---|
716 | SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
|
---|
717 | CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
|
---|
718 | LDO 1(%r0),sub_tmp2 ; otherwise sub_tmp2 = 1
|
---|
719 |
|
---|
720 | CMPCLR,*= t1,t2,%r0 ; if (t1 == t2) keep the old c
|
---|
721 | COPY sub_tmp2,%ret0 ; else c = sub_tmp2
|
---|
722 |
|
---|
723 | STD sub_tmp1,0(r_ptr) ; r[0] = t3
|
---|
724 |
|
---|
725 | bn_sub_words_exit
|
---|
726 | .EXIT
|
---|
727 | BVE (%rp)
|
---|
728 | NOP
|
---|
729 | .PROCEND ;in=23,24,25,26,29;out=28;
|
---|
730 |
|
---|
731 | ;------------------------------------------------------------------------------
|
---|
732 | ;
|
---|
733 | ; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
|
---|
734 | ;
|
---|
735 | ; arg0 = h
|
---|
736 | ; arg1 = l
|
---|
737 | ; arg2 = d
|
---|
738 | ;
|
---|
739 | ; This is mainly just modified assembly from the compiler, thus the
|
---|
740 | ; lack of variable names.
|
---|
741 | ;
|
---|
742 | ;------------------------------------------------------------------------------
|
---|
; bn_div_words(h, l, d): h arrives in %r26, l in %r25, d in %r24; the result
; is returned in %ret0 (%r28).  Returns -1 immediately when d == 0.
; Otherwise d is normalised using BN_num_bits_word(d) and the quotient is
; built in two 32-bit halves (loop counter %r9 starts at 2), each half
; estimated with the $$div2U millicode routine and then corrected.
; If h is larger than the bound computed from the bit count, the routine
; prints a message with fprintf and calls abort.
;
; Frame: %rp is stored at -16(%sp) before the 352-byte bump (so it is
; reloaded from -368(%sp)); %r3-%r10 go to -352..-296(%sp), gp (%r27) to
; -288(%sp), and -280..-232(%sp) are scratch slots for FP<->integer moves.
;
; Register roles below are taken from the existing comments; the code is
; compiler output, so the remaining temporaries are unnamed.
743 | bn_div_words
|
---|
744 | .proc
|
---|
745 | .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
|
---|
746 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
|
---|
747 | .IMPORT BN_num_bits_word,CODE,NO_RELOCATION
|
---|
748 | .IMPORT __iob,DATA
|
---|
749 | .IMPORT fprintf,CODE,NO_RELOCATION
|
---|
750 | .IMPORT abort,CODE,NO_RELOCATION
|
---|
751 | .IMPORT $$div2U,MILLICODE
|
---|
752 | .entry
|
---|
753 | STD %r2,-16(%r30) ; save return pointer
|
---|
754 | STD,MA %r3,352(%r30) ; save r3 at the old sp, then sp += 352
|
---|
755 | STD %r4,-344(%r30) ; save r4
|
---|
756 | STD %r5,-336(%r30) ; save r5
|
---|
757 | STD %r6,-328(%r30) ; save r6
|
---|
758 | STD %r7,-320(%r30) ; save r7
|
---|
759 | STD %r8,-312(%r30) ; save r8
|
---|
760 | STD %r9,-304(%r30) ; save r9
|
---|
761 | STD %r10,-296(%r30) ; save r10
|
---|
762 |
|
---|
763 | STD %r27,-288(%r30) ; save gp
|
---|
764 |
|
---|
765 | COPY %r24,%r3 ; save d
|
---|
766 | COPY %r26,%r4 ; save h (high 64-bits)
|
---|
767 | LDO -1(%r0),%ret0 ; return -1 by default
|
---|
768 |
|
---|
769 | CMPB,*= %r0,%arg2,$D3 ; if (d == 0)
|
---|
770 | COPY %r25,%r5 ; save l (low 64-bits)
|
---|
771 |
|
---|
772 | LDO -48(%r30),%r29 ; create ap
|
---|
773 | .CALL ;in=26,29;out=28;
|
---|
774 | B,L BN_num_bits_word,%r2
|
---|
775 | COPY %r3,%r26 ; arg0 = d (delay slot)
|
---|
776 | LDD -288(%r30),%r27 ; restore gp
|
---|
777 | LDI 64,%r21
|
---|
778 |
|
---|
779 | CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward)
|
---|
780 | COPY %ret0,%r24 ; i
|
---|
781 | MTSARCM %r24
|
---|
782 | DEPDI,Z -1,%sar,1,%r29
|
---|
783 | CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
|
---|
784 |
|
---|
785 | $00000012
|
---|
786 | SUBI 64,%r24,%r31 ; i = 64 - i;
|
---|
787 | CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d)
|
---|
788 | SUB %r4,%r3,%r4 ; h -= d
|
---|
789 | CMPB,= %r31,%r0,$0000001A ; if (i)
|
---|
790 | COPY %r0,%r10 ; ret = 0
|
---|
791 | MTSARCM %r31 ; i to shift
|
---|
792 | DEPD,Z %r3,%sar,64,%r3 ; d <<= i;
|
---|
793 | SUBI 64,%r31,%r19 ; 64 - i; redundant
|
---|
794 | MTSAR %r19 ; (64 -i) to shift
|
---|
795 | SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i)
|
---|
796 | MTSARCM %r31 ; i to shift
|
---|
797 | DEPD,Z %r5,%sar,64,%r5 ; l <<= i;
|
---|
798 |
|
---|
799 | $0000001A
|
---|
800 | DEPDI,Z -1,31,32,%r19 ; r19 = 0xffffffff00000000 (high-half mask)
|
---|
801 | EXTRD,U %r3,31,32,%r6 ; dh = d >> 32 (high 32 bits of d)
|
---|
802 | EXTRD,U %r3,63,32,%r8 ; dl = d & 0xffffffff (low 32 bits of d)
|
---|
803 | LDO 2(%r0),%r9 ; count = 2 (two 32-bit quotient halves)
|
---|
804 | STD %r3,-280(%r30) ; "d" to stack
|
---|
805 |
|
---|
806 | $0000001C
|
---|
807 | DEPDI,Z -1,63,32,%r29 ; q = 0xffffffff (used when (h>>32) == dh)
|
---|
808 | EXTRD,U %r4,31,32,%r31 ; h >> 32
|
---|
809 | CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div
|
---|
810 | COPY %r4,%r26
|
---|
811 | EXTRD,U %r4,31,32,%r25
|
---|
812 | COPY %r6,%r24
|
---|
813 | .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
|
---|
814 | B,L $$div2U,%r2
|
---|
815 | EXTRD,U %r6,31,32,%r23
|
---|
816 | DEPD %r28,31,32,%r29
|
---|
817 | $D2
|
---|
818 | STD %r29,-272(%r30) ; q
|
---|
819 | AND %r5,%r19,%r24 ; t & 0xffffffff00000000;
|
---|
820 | EXTRD,U %r24,31,32,%r24 ; move the masked high 32 bits of l down to the low half
|
---|
821 | FLDD -272(%r30),%fr7 ; q
|
---|
822 | FLDD -280(%r30),%fr8 ; d
|
---|
823 | XMPYU %fr8L,%fr7L,%fr10
|
---|
824 | FSTD %fr10,-256(%r30)
|
---|
825 | XMPYU %fr8L,%fr7R,%fr22
|
---|
826 | FSTD %fr22,-264(%r30)
|
---|
827 | XMPYU %fr8R,%fr7L,%fr11
|
---|
828 | XMPYU %fr8R,%fr7R,%fr23
|
---|
829 | FSTD %fr11,-232(%r30)
|
---|
830 | FSTD %fr23,-240(%r30)
|
---|
831 | LDD -256(%r30),%r28
|
---|
832 | DEPD,Z %r28,31,32,%r2
|
---|
833 | LDD -264(%r30),%r20
|
---|
834 | ADD,L %r20,%r2,%r31
|
---|
835 | LDD -232(%r30),%r22
|
---|
836 | DEPD,Z %r22,31,32,%r22
|
---|
837 | LDD -240(%r30),%r21
|
---|
838 | B $00000024 ; enter loop
|
---|
839 | ADD,L %r21,%r22,%r23
|
---|
840 |
|
---|
841 | $0000002A
|
---|
842 | LDO -1(%r29),%r29 ; q--
|
---|
843 | SUB %r23,%r8,%r23
|
---|
844 | $00000024
|
---|
845 | SUB %r4,%r31,%r25
|
---|
846 | AND %r25,%r19,%r26
|
---|
847 | CMPB,*<>,N %r0,%r26,$00000046 ; (forward)
|
---|
848 | DEPD,Z %r25,31,32,%r20
|
---|
849 | OR %r20,%r24,%r21
|
---|
850 | CMPB,*<<,N %r21,%r23,$0000002A ;(backward)
|
---|
851 | SUB %r31,%r6,%r31
|
---|
852 | ;-------------Break path---------------------
|
---|
853 |
|
---|
854 | $00000046
|
---|
855 | DEPD,Z %r23,31,32,%r25 ;tl
|
---|
856 | EXTRD,U %r23,31,32,%r26 ;t
|
---|
857 | AND %r25,%r19,%r24 ;tl = (tl<<32)&0xffffffff00000000L
|
---|
858 | ADD,L %r31,%r26,%r31 ;th += t;
|
---|
859 | CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl)
|
---|
860 | LDO 1(%r31),%r31 ; th++;
|
---|
861 | CMPB,*<<=,N %r31,%r4,$00000036 ;if (n < th) (forward)
|
---|
862 | LDO -1(%r29),%r29 ;q--;
|
---|
863 | ADD,L %r4,%r3,%r4 ;h += d;
|
---|
864 | $00000036
|
---|
865 | ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward)
|
---|
866 | SUB %r5,%r24,%r28 ; l -= tl;
|
---|
867 | SUB %r4,%r31,%r24 ; h -= th;
|
---|
868 | SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32));
|
---|
869 | DEPD,Z %r29,31,32,%r10 ; ret = q<<32
|
---|
870 | b $0000001C
|
---|
871 | DEPD,Z %r28,31,32,%r5 ; l = l << 32
|
---|
872 |
|
---|
873 | $D1
|
---|
874 | OR %r10,%r29,%r28 ; ret |= q
|
---|
875 | $D3
|
---|
876 | LDD -368(%r30),%r2 ; reload return pointer (stored at -16(sp) before the 352-byte bump)
|
---|
877 | $D0
|
---|
878 | LDD -296(%r30),%r10 ; restore r10
|
---|
879 | LDD -304(%r30),%r9 ; restore r9
|
---|
880 | LDD -312(%r30),%r8 ; restore r8
|
---|
881 | LDD -320(%r30),%r7 ; restore r7
|
---|
882 | LDD -328(%r30),%r6 ; restore r6
|
---|
883 | LDD -336(%r30),%r5 ; restore r5
|
---|
884 | LDD -344(%r30),%r4 ; restore r4
|
---|
885 | BVE (%r2)
|
---|
886 | .EXIT
|
---|
887 | LDD,MB -352(%r30),%r3 ; restore r3 and pop the 352-byte frame (sp -= 352)
|
---|
888 |
|
---|
; Error path: builds the address of the C$4 string relative to this code,
; calls fprintf on the stream at __iob + 64, then calls abort.
; NOTE(review): C$4 is defined outside this view, and __iob + 64 is presumably stderr - confirm.
889 | bn_div_err_case
|
---|
890 | MFIA %r6
|
---|
891 | ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1
|
---|
892 | LDO R'bn_div_words-bn_div_err_case(%r1),%r6
|
---|
893 | ADDIL LT'__iob,%r27,%r1
|
---|
894 | LDD RT'__iob(%r1),%r26
|
---|
895 | ADDIL L'C$4-bn_div_words,%r6,%r1
|
---|
896 | LDO R'C$4-bn_div_words(%r1),%r25
|
---|
897 | LDO 64(%r26),%r26
|
---|
898 | .CALL ;in=24,25,26,29;out=28;
|
---|
899 | B,L fprintf,%r2
|
---|
900 | LDO -48(%r30),%r29
|
---|
901 | LDD -288(%r30),%r27 ; restore gp
|
---|
902 | .CALL ;in=29;
|
---|
903 | B,L abort,%r2
|
---|
904 | LDO -48(%r30),%r29
|
---|
905 | LDD -288(%r30),%r27 ; restore gp
|
---|
906 | B $D0
|
---|
907 | LDD -368(%r30),%r2 ; reload return pointer (delay slot)
|
---|
908 | .PROCEND ;in=24,25,26,29;out=28;
|
---|
909 |
|
---|
910 | ;----------------------------------------------------------------------------
|
---|
911 | ;
|
---|
912 | ; Registers to hold 64-bit values to manipulate. The "L" part
|
---|
913 | ; of the register corresponds to the upper 32-bits, while the "R"
|
---|
914 | ; part corresponds to the lower 32-bits
|
---|
915 | ;
|
---|
916 | ; Note, that when using b6 and b7, the code must save these before
|
---|
917 | ; using them because they are callee save registers
|
---|
918 | ;
|
---|
919 | ;
|
---|
920 | ; Floating point registers to use to save values that
|
---|
921 | ; are manipulated. These don't collide with ftemp1-6 and
|
---|
922 | ; are all caller save registers
|
---|
923 | ;
|
---|
924 | a0 .reg %fr22 ; a[0], whole 64-bit word
|
---|
925 | a0L .reg %fr22L ; a[0], upper 32 bits
|
---|
926 | a0R .reg %fr22R ; a[0], lower 32 bits
|
---|
927 |
|
---|
928 | a1 .reg %fr23 ; a[1]
|
---|
929 | a1L .reg %fr23L
|
---|
930 | a1R .reg %fr23R
|
---|
931 |
|
---|
932 | a2 .reg %fr24 ; a[2]
|
---|
933 | a2L .reg %fr24L
|
---|
934 | a2R .reg %fr24R
|
---|
935 |
|
---|
936 | a3 .reg %fr25 ; a[3]
|
---|
937 | a3L .reg %fr25L
|
---|
938 | a3R .reg %fr25R
|
---|
939 |
|
---|
940 | a4 .reg %fr26 ; a[4]
|
---|
941 | a4L .reg %fr26L
|
---|
942 | a4R .reg %fr26R
|
---|
943 |
|
---|
944 | a5 .reg %fr27 ; a[5]
|
---|
945 | a5L .reg %fr27L
|
---|
946 | a5R .reg %fr27R
|
---|
947 |
|
---|
948 | a6 .reg %fr28 ; a[6]
|
---|
949 | a6L .reg %fr28L
|
---|
950 | a6R .reg %fr28R
|
---|
951 |
|
---|
952 | a7 .reg %fr29 ; a[7]
|
---|
953 | a7L .reg %fr29L
|
---|
954 | a7R .reg %fr29R
|
---|
955 |
|
---|
956 | b0 .reg %fr30 ; b[0]
|
---|
957 | b0L .reg %fr30L
|
---|
958 | b0R .reg %fr30R
|
---|
959 |
|
---|
960 | b1 .reg %fr31 ; b[1]
|
---|
961 | b1L .reg %fr31L
|
---|
962 | b1R .reg %fr31R
|
---|
963 |
|
---|
964 | ;
|
---|
965 | ; Temporary floating point variables, these are all caller save
|
---|
966 | ; registers
|
---|
967 | ;
|
---|
968 | ftemp1 .reg %fr4 ; XMPYU result scratch (spilled to the stack by the macros)
|
---|
969 | ftemp2 .reg %fr5 ; XMPYU result scratch
|
---|
970 | ftemp3 .reg %fr6 ; XMPYU result scratch
|
---|
971 | ftemp4 .reg %fr7 ; XMPYU result scratch
|
---|
972 |
|
---|
973 | ;
|
---|
974 | ; The B set of registers when used (b[2]..b[7] for the bn_mul_comba routines).
|
---|
975 | ;
|
---|
976 |
|
---|
977 | b2 .reg %fr8 ; b[2]
|
---|
978 | b2L .reg %fr8L
|
---|
979 | b2R .reg %fr8R
|
---|
980 |
|
---|
981 | b3 .reg %fr9 ; b[3]
|
---|
982 | b3L .reg %fr9L
|
---|
983 | b3R .reg %fr9R
|
---|
984 |
|
---|
985 | b4 .reg %fr10 ; b[4]
|
---|
986 | b4L .reg %fr10L
|
---|
987 | b4R .reg %fr10R
|
---|
988 |
|
---|
989 | b5 .reg %fr11 ; b[5]
|
---|
990 | b5L .reg %fr11L
|
---|
991 | b5R .reg %fr11R
|
---|
992 |
|
---|
993 | b6 .reg %fr12 ; b[6]; fr12 is callee-save, so save/restore it around use
|
---|
994 | b6L .reg %fr12L
|
---|
995 | b6R .reg %fr12R
|
---|
996 |
|
---|
997 | b7 .reg %fr13 ; b[7]; fr13 is callee-save, so save/restore it around use
|
---|
998 | b7L .reg %fr13L
|
---|
999 | b7R .reg %fr13R
|
---|
1000 |
|
---|
1001 | c1 .reg %r21 ; only reg
|
---|
1002 | temp1 .reg %r20 ; only reg
|
---|
1003 | temp2 .reg %r19 ; only reg
|
---|
1004 | temp3 .reg %r31 ; only reg
|
---|
1005 |
|
---|
1006 | m1 .reg %r28 ; second cross product in SQR_ADD_C2 / MUL_ADD_C
|
---|
1007 | c2 .reg %r23 ; column accumulator word (rotates with c1 and c3)
|
---|
1008 | high_one .reg %r1 ; holds 1 << 32 (set up with DEPDI,Z 1,31,1)
|
---|
1009 | ht .reg %r6 ; high*high product; r3-r18 are callee-save
|
---|
1010 | lt .reg %r5 ; low*low product (callee-save register)
|
---|
1011 | m .reg %r4 ; cross (middle) product (callee-save register)
|
---|
1012 | c3 .reg %r3 ; column accumulator word (callee-save register)
|
---|
1013 | ; SQR_ADD_C: (C3:C2:C1) += (A0L:A0R)^2. Uses ftemp1-3, m, lt, ht, temp1-3, high_mask and -24..-1(%sp).
|
---|
1014 | SQR_ADD_C .macro A0L,A0R,C1,C2,C3
|
---|
1015 | XMPYU A0L,A0R,ftemp1 ; m = hi*lo (cross product, counted twice below)
|
---|
1016 | FSTD ftemp1,-24(%sp) ; store m
|
---|
1017 |
|
---|
1018 | XMPYU A0R,A0R,ftemp2 ; lt = lo*lo
|
---|
1019 | FSTD ftemp2,-16(%sp) ; store lt
|
---|
1020 |
|
---|
1021 | XMPYU A0L,A0L,ftemp3 ; ht = hi*hi
|
---|
1022 | FSTD ftemp3,-8(%sp) ; store ht
|
---|
1023 | ; the products are moved FP -> integer registers through the stack
|
---|
1024 | LDD -24(%sp),m ; load m
|
---|
1025 | AND m,high_mask,temp2 ; m & Mask
|
---|
1026 | DEPD,Z m,30,31,temp3 ; m << 32+1 (low word of 2*m*2^32)
|
---|
1027 | LDD -16(%sp),lt ; lt
|
---|
1028 |
|
---|
1029 | LDD -8(%sp),ht ; ht
|
---|
1030 | EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 (high word of 2*m*2^32)
|
---|
1031 | ADD temp3,lt,lt ; lt = lt+m, sets the carry
|
---|
1032 | ADD,L ht,temp1,ht ; ht += temp1 (ADD,L keeps the carry from the line above)
|
---|
1033 | ADD,DC ht,%r0,ht ; ht += carry out of lt
|
---|
1034 |
|
---|
1035 | ADD C1,lt,C1 ; c1=c1+lt
|
---|
1036 | ADD,DC ht,%r0,ht ; ht += carry out of c1
|
---|
1037 |
|
---|
1038 | ADD C2,ht,C2 ; c2=c2+ht
|
---|
1039 | ADD,DC C3,%r0,C3 ; c3 += carry out of c2
|
---|
1040 | .endm
|
---|
1041 | ; SQR_ADD_C2: (C3:C2:C1) += 2 * (A0L:A0R) * (A1L:A1R). Needs high_one = 1<<32; uses -32..-1(%sp).
|
---|
1042 | SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
|
---|
1043 | XMPYU A0L,A1R,ftemp1 ; m1 = A0.hi * A1.lo
|
---|
1044 | FSTD ftemp1,-16(%sp) ;
|
---|
1045 | XMPYU A0R,A1L,ftemp2 ; m = A0.lo * A1.hi
|
---|
1046 | FSTD ftemp2,-8(%sp) ;
|
---|
1047 | XMPYU A0R,A1R,ftemp3 ; lt = A0.lo * A1.lo
|
---|
1048 | FSTD ftemp3,-32(%sp)
|
---|
1049 | XMPYU A0L,A1L,ftemp4 ; ht = A0.hi * A1.hi
|
---|
1050 | FSTD ftemp4,-24(%sp) ;
|
---|
1051 |
|
---|
1052 | LDD -8(%sp),m ; m (register r4)
|
---|
1053 | LDD -16(%sp),m1 ; m1 (register r28)
|
---|
1054 | ADD,L m,m1,m ; m+m1
|
---|
1055 |
|
---|
1056 | DEPD,Z m,31,32,temp3 ; (m+m1<<32)
|
---|
1057 | LDD -24(%sp),ht ; ht (register r6)
|
---|
1058 | ; if m+m1 wrapped around, the lost bit is worth 1<<32 in ht
|
---|
1059 | CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
|
---|
1060 | ADD,L ht,high_one,ht ; ht+=high_one
|
---|
1061 |
|
---|
1062 | EXTRD,U m,31,32,temp1 ; m >> 32
|
---|
1063 | LDD -32(%sp),lt ; lt
|
---|
1064 | ADD,L ht,temp1,ht ; ht+= m>>32
|
---|
1065 | ADD lt,temp3,lt ; lt += (m+m1)<<32, sets the carry
|
---|
1066 | ADD,DC ht,%r0,ht ; ht += carry out of lt
|
---|
1067 | ; double the 128-bit product ht:lt; bits shifted out go to C3
|
---|
1068 | ADD ht,ht,ht ; ht=ht+ht;
|
---|
1069 | ADD,DC C3,%r0,C3 ; add in carry (c3++)
|
---|
1070 |
|
---|
1071 | ADD lt,lt,lt ; lt=lt+lt;
|
---|
1072 | ADD,DC ht,%r0,ht ; add in carry (ht++)
|
---|
1073 |
|
---|
1074 | ADD C1,lt,C1 ; c1=c1+lt
|
---|
1075 | ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
|
---|
1076 | LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
|
---|
1077 |
|
---|
1078 | ADD C2,ht,C2 ; c2 = c2 + ht
|
---|
1079 | ADD,DC C3,%r0,C3 ; add in carry (c3++)
|
---|
1080 | .endm
|
---|
1081 |
|
---|
1082 | ;
|
---|
1083 | ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
|
---|
1084 | ; arg0 = r_ptr (r[0]..r[15] are written)
|
---|
1085 | ; arg1 = a_ptr (a[0]..a[7] are read)
|
---|
1086 | ; Squares a[] one result column at a time; c1,c2,c3 rotate as the 3-word column accumulator.
|
---|
1087 | ; Frame is 128 bytes: saved r3-r6 at the bottom, macro scratch in the top 32 bytes.
|
---|
1088 | bn_sqr_comba8
|
---|
1089 | .PROC
|
---|
1090 | .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
|
---|
1091 | .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
|
---|
1092 | .ENTRY
|
---|
1093 | .align 64
|
---|
1094 | ; r3-r6 (aliases c3, m, lt, ht) are callee-save and are used below
|
---|
1095 | STD %r3,0(%sp) ; save r3
|
---|
1096 | STD %r4,8(%sp) ; save r4
|
---|
1097 | STD %r5,16(%sp) ; save r5
|
---|
1098 | STD %r6,24(%sp) ; save r6
|
---|
1099 |
|
---|
1100 | ;
|
---|
1101 | ; Zero out carries
|
---|
1102 | ;
|
---|
1103 | COPY %r0,c1
|
---|
1104 | COPY %r0,c2
|
---|
1105 | COPY %r0,c3
|
---|
1106 |
|
---|
1107 | LDO 128(%sp),%sp ; bump stack
|
---|
1108 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
|
---|
1109 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
|
---|
1110 |
|
---|
1111 | ;
|
---|
1112 | ; Load up all of the values we are going to use
|
---|
1113 | ;
|
---|
1114 | FLDD 0(a_ptr),a0
|
---|
1115 | FLDD 8(a_ptr),a1
|
---|
1116 | FLDD 16(a_ptr),a2
|
---|
1117 | FLDD 24(a_ptr),a3
|
---|
1118 | FLDD 32(a_ptr),a4
|
---|
1119 | FLDD 40(a_ptr),a5
|
---|
1120 | FLDD 48(a_ptr),a6
|
---|
1121 | FLDD 56(a_ptr),a7
|
---|
1122 | ; column 0: a0*a0
|
---|
1123 | SQR_ADD_C a0L,a0R,c1,c2,c3
|
---|
1124 | STD c1,0(r_ptr) ; r[0] = c1;
|
---|
1125 | COPY %r0,c1
|
---|
1126 | ; column 1: 2*a1*a0
|
---|
1127 | SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
|
---|
1128 | STD c2,8(r_ptr) ; r[1] = c2;
|
---|
1129 | COPY %r0,c2
|
---|
1130 | ; column 2: a1*a1 + 2*a2*a0
|
---|
1131 | SQR_ADD_C a1L,a1R,c3,c1,c2
|
---|
1132 | SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
|
---|
1133 | STD c3,16(r_ptr) ; r[2] = c3;
|
---|
1134 | COPY %r0,c3
|
---|
1135 | ; column 3: 2*(a3*a0 + a2*a1)
|
---|
1136 | SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
|
---|
1137 | SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
|
---|
1138 | STD c1,24(r_ptr) ; r[3] = c1;
|
---|
1139 | COPY %r0,c1
|
---|
1140 | ; column 4: a2*a2 + 2*(a3*a1 + a4*a0)
|
---|
1141 | SQR_ADD_C a2L,a2R,c2,c3,c1
|
---|
1142 | SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
|
---|
1143 | SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
|
---|
1144 | STD c2,32(r_ptr) ; r[4] = c2;
|
---|
1145 | COPY %r0,c2
|
---|
1146 | ; column 5: 2*(a5*a0 + a4*a1 + a3*a2)
|
---|
1147 | SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
|
---|
1148 | SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
|
---|
1149 | SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
|
---|
1150 | STD c3,40(r_ptr) ; r[5] = c3;
|
---|
1151 | COPY %r0,c3
|
---|
1152 | ; column 6: a3*a3 + 2*(a4*a2 + a5*a1 + a6*a0)
|
---|
1153 | SQR_ADD_C a3L,a3R,c1,c2,c3
|
---|
1154 | SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
|
---|
1155 | SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
|
---|
1156 | SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
|
---|
1157 | STD c1,48(r_ptr) ; r[6] = c1;
|
---|
1158 | COPY %r0,c1
|
---|
1159 | ; column 7: 2*(a7*a0 + a6*a1 + a5*a2 + a4*a3)
|
---|
1160 | SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
|
---|
1161 | SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
|
---|
1162 | SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
|
---|
1163 | SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
|
---|
1164 | STD c2,56(r_ptr) ; r[7] = c2;
|
---|
1165 | COPY %r0,c2
|
---|
1166 | ; column 8: a4*a4 + 2*(a5*a3 + a6*a2 + a7*a1)
|
---|
1167 | SQR_ADD_C a4L,a4R,c3,c1,c2
|
---|
1168 | SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
|
---|
1169 | SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
|
---|
1170 | SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
|
---|
1171 | STD c3,64(r_ptr) ; r[8] = c3;
|
---|
1172 | COPY %r0,c3
|
---|
1173 | ; column 9: 2*(a7*a2 + a6*a3 + a5*a4)
|
---|
1174 | SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
|
---|
1175 | SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
|
---|
1176 | SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
|
---|
1177 | STD c1,72(r_ptr) ; r[9] = c1;
|
---|
1178 | COPY %r0,c1
|
---|
1179 | ; column 10: a5*a5 + 2*(a6*a4 + a7*a3)
|
---|
1180 | SQR_ADD_C a5L,a5R,c2,c3,c1
|
---|
1181 | SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
|
---|
1182 | SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
|
---|
1183 | STD c2,80(r_ptr) ; r[10] = c2;
|
---|
1184 | COPY %r0,c2
|
---|
1185 | ; column 11: 2*(a7*a4 + a6*a5)
|
---|
1186 | SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
|
---|
1187 | SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
|
---|
1188 | STD c3,88(r_ptr) ; r[11] = c3;
|
---|
1189 | COPY %r0,c3
|
---|
1190 | ; column 12: a6*a6 + 2*a7*a5
|
---|
1191 | SQR_ADD_C a6L,a6R,c1,c2,c3
|
---|
1192 | SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
|
---|
1193 | STD c1,96(r_ptr) ; r[12] = c1;
|
---|
1194 | COPY %r0,c1
|
---|
1195 | ; column 13: 2*a7*a6
|
---|
1196 | SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
|
---|
1197 | STD c2,104(r_ptr) ; r[13] = c2;
|
---|
1198 | COPY %r0,c2
|
---|
1199 | ; column 14: a7*a7; the word carried out of it becomes r[15]
|
---|
1200 | SQR_ADD_C a7L,a7R,c3,c1,c2
|
---|
1201 | STD c3, 112(r_ptr) ; r[14] = c3
|
---|
1202 | STD c1, 120(r_ptr) ; r[15] = c1
|
---|
1203 | ; restore the callee-save registers stored on entry
|
---|
1204 | .EXIT
|
---|
1205 | LDD -104(%sp),%r6 ; restore r6
|
---|
1206 | LDD -112(%sp),%r5 ; restore r5
|
---|
1207 | LDD -120(%sp),%r4 ; restore r4
|
---|
1208 | BVE (%rp) ; return to caller
|
---|
1209 | LDD,MB -128(%sp),%r3 ; restore r3 and move %sp back down by 128
|
---|
1210 |
|
---|
1211 | .PROCEND
|
---|
1212 |
|
---|
1213 | ;-----------------------------------------------------------------------------
|
---|
1214 | ;
|
---|
1215 | ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
|
---|
1216 | ; arg0 = r_ptr (r[0]..r[7] are written)
|
---|
1217 | ; arg1 = a_ptr (only a[0]..a[3] are read)
|
---|
1218 | ; Squares a[] one result column at a time; c1,c2,c3 rotate as the 3-word column accumulator.
|
---|
1219 |
|
---|
1220 | bn_sqr_comba4
|
---|
1221 | .proc
|
---|
1222 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
|
---|
1223 | .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
|
---|
1224 | .entry
|
---|
1225 | .align 64
|
---|
1226 | STD %r3,0(%sp) ; save r3
|
---|
1227 | STD %r4,8(%sp) ; save r4
|
---|
1228 | STD %r5,16(%sp) ; save r5
|
---|
1229 | STD %r6,24(%sp) ; save r6
|
---|
1230 |
|
---|
1231 | ;
|
---|
1232 | ; Zero out carries
|
---|
1233 | ;
|
---|
1234 | COPY %r0,c1
|
---|
1235 | COPY %r0,c2
|
---|
1236 | COPY %r0,c3
|
---|
1237 |
|
---|
1238 | LDO 128(%sp),%sp ; bump stack
|
---|
1239 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
|
---|
1240 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
|
---|
1241 |
|
---|
1242 | ;
|
---|
1243 | ; Load up all of the values we are going to use
|
---|
1244 | ;
|
---|
1245 | FLDD 0(a_ptr),a0
|
---|
1246 | FLDD 8(a_ptr),a1
|
---|
1247 | FLDD 16(a_ptr),a2
|
---|
1248 | FLDD 24(a_ptr),a3
|
---|
1249 | ; a4-a7 are never referenced by this 4-word routine, so a[4]..a[7]
|
---|
1250 | ; are deliberately not loaded: the operand is only 4 words long and
|
---|
1251 | ; reading the following 32 bytes could touch memory past its end.
|
---|
1252 |
|
---|
1253 | ; column 0: a0*a0
|
---|
1254 | SQR_ADD_C a0L,a0R,c1,c2,c3
|
---|
1255 |
|
---|
1256 | STD c1,0(r_ptr) ; r[0] = c1;
|
---|
1257 | COPY %r0,c1
|
---|
1258 | ; column 1: 2*a1*a0
|
---|
1259 | SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
|
---|
1260 |
|
---|
1261 | STD c2,8(r_ptr) ; r[1] = c2;
|
---|
1262 | COPY %r0,c2
|
---|
1263 | ; column 2: a1*a1 + 2*a2*a0
|
---|
1264 | SQR_ADD_C a1L,a1R,c3,c1,c2
|
---|
1265 | SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
|
---|
1266 |
|
---|
1267 | STD c3,16(r_ptr) ; r[2] = c3;
|
---|
1268 | COPY %r0,c3
|
---|
1269 | ; column 3: 2*(a3*a0 + a2*a1)
|
---|
1270 | SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
|
---|
1271 | SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
|
---|
1272 |
|
---|
1273 | STD c1,24(r_ptr) ; r[3] = c1;
|
---|
1274 | COPY %r0,c1
|
---|
1275 | ; column 4: a2*a2 + 2*a3*a1
|
---|
1276 | SQR_ADD_C a2L,a2R,c2,c3,c1
|
---|
1277 | SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
|
---|
1278 |
|
---|
1279 | STD c2,32(r_ptr) ; r[4] = c2;
|
---|
1280 | COPY %r0,c2
|
---|
1281 | ; column 5: 2*a3*a2
|
---|
1282 | SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
|
---|
1283 | STD c3,40(r_ptr) ; r[5] = c3;
|
---|
1284 | COPY %r0,c3
|
---|
1285 | ; column 6: a3*a3; the word carried out of it becomes r[7]
|
---|
1286 | SQR_ADD_C a3L,a3R,c1,c2,c3
|
---|
1287 | STD c1,48(r_ptr) ; r[6] = c1;
|
---|
1288 | STD c2,56(r_ptr) ; r[7] = c2;
|
---|
1289 |
|
---|
1290 | .EXIT
|
---|
1291 | LDD -104(%sp),%r6 ; restore r6
|
---|
1292 | LDD -112(%sp),%r5 ; restore r5
|
---|
1293 | LDD -120(%sp),%r4 ; restore r4
|
---|
1294 | BVE (%rp) ; return to caller
|
---|
1295 | LDD,MB -128(%sp),%r3 ; restore r3 and move %sp back down by 128
|
---|
1296 |
|
---|
1297 | .PROCEND
|
---|
1298 |
|
---|
1299 |
|
---|
1300 | ;---------------------------------------------------------------------------
|
---|
1301 | ; MUL_ADD_C: (C3:C2:C1) += (A0L:A0R) * (B0L:B0R). Needs high_one = 1<<32; uses -32..-1(%sp).
|
---|
1302 | MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
|
---|
1303 | XMPYU A0L,B0R,ftemp1 ; m1 = A0.hi * B0.lo
|
---|
1304 | FSTD ftemp1,-16(%sp) ;
|
---|
1305 | XMPYU A0R,B0L,ftemp2 ; m = A0.lo * B0.hi
|
---|
1306 | FSTD ftemp2,-8(%sp) ;
|
---|
1307 | XMPYU A0R,B0R,ftemp3 ; lt = A0.lo * B0.lo
|
---|
1308 | FSTD ftemp3,-32(%sp)
|
---|
1309 | XMPYU A0L,B0L,ftemp4 ; ht = A0.hi * B0.hi
|
---|
1310 | FSTD ftemp4,-24(%sp) ;
|
---|
1311 |
|
---|
1312 | LDD -8(%sp),m ; m (register r4)
|
---|
1313 | LDD -16(%sp),m1 ; m1 (register r28)
|
---|
1314 | ADD,L m,m1,m ; m+m1
|
---|
1315 |
|
---|
1316 | DEPD,Z m,31,32,temp3 ; (m+m1<<32)
|
---|
1317 | LDD -24(%sp),ht ; ht (register r6)
|
---|
1318 | ; if m+m1 wrapped around, the lost bit is worth 1<<32 in ht
|
---|
1319 | CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
|
---|
1320 | ADD,L ht,high_one,ht ; ht+=high_one
|
---|
1321 |
|
---|
1322 | EXTRD,U m,31,32,temp1 ; m >> 32
|
---|
1323 | LDD -32(%sp),lt ; lt
|
---|
1324 | ADD,L ht,temp1,ht ; ht+= m>>32
|
---|
1325 | ADD lt,temp3,lt ; lt += (m+m1)<<32, sets the carry
|
---|
1326 | ADD,DC ht,%r0,ht ; ht += carry out of lt
|
---|
1327 |
|
---|
1328 | ADD C1,lt,C1 ; c1=c1+lt
|
---|
1329 | ADD,DC ht,%r0,ht ; ht += carry out of c1
|
---|
1330 |
|
---|
1331 | ADD C2,ht,C2 ; c2 = c2 + ht
|
---|
1332 | ADD,DC C3,%r0,C3 ; add in carry (c3++)
|
---|
1333 | .endm
|
---|
1334 |
|
---|
1335 |
|
---|
1336 | ;
|
---|
1337 | ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
---|
1338 | ; arg0 = r_ptr (r[0]..r[15] are written)
|
---|
1339 | ; arg1 = a_ptr (a[0]..a[7] are read)
|
---|
1340 | ; arg2 = b_ptr (b[0]..b[7] are read)
|
---|
1341 | ; Column k sums every a[i]*b[j] with i+j = k; c1,c2,c3 rotate as the 3-word accumulator.
|
---|
1342 |
|
---|
1343 | bn_mul_comba8
|
---|
1344 | .proc
|
---|
1345 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
|
---|
1346 | .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
|
---|
1347 | .entry
|
---|
1348 | .align 64
|
---|
1349 | ; r3-r6 and fr12/fr13 (aliases b6/b7) are callee-save and are used below
|
---|
1350 | STD %r3,0(%sp) ; save r3
|
---|
1351 | STD %r4,8(%sp) ; save r4
|
---|
1352 | STD %r5,16(%sp) ; save r5
|
---|
1353 | STD %r6,24(%sp) ; save r6
|
---|
1354 | FSTD %fr12,32(%sp) ; save fr12 (b6)
|
---|
1355 | FSTD %fr13,40(%sp) ; save fr13 (b7)
|
---|
1356 |
|
---|
1357 | ;
|
---|
1358 | ; Zero out carries
|
---|
1359 | ;
|
---|
1360 | COPY %r0,c1
|
---|
1361 | COPY %r0,c2
|
---|
1362 | COPY %r0,c3
|
---|
1363 |
|
---|
1364 | LDO 128(%sp),%sp ; bump stack
|
---|
1365 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
|
---|
1366 |
|
---|
1367 | ;
|
---|
1368 | ; Load up all of the values we are going to use
|
---|
1369 | ;
|
---|
1370 | FLDD 0(a_ptr),a0
|
---|
1371 | FLDD 8(a_ptr),a1
|
---|
1372 | FLDD 16(a_ptr),a2
|
---|
1373 | FLDD 24(a_ptr),a3
|
---|
1374 | FLDD 32(a_ptr),a4
|
---|
1375 | FLDD 40(a_ptr),a5
|
---|
1376 | FLDD 48(a_ptr),a6
|
---|
1377 | FLDD 56(a_ptr),a7
|
---|
1378 |
|
---|
1379 | FLDD 0(b_ptr),b0
|
---|
1380 | FLDD 8(b_ptr),b1
|
---|
1381 | FLDD 16(b_ptr),b2
|
---|
1382 | FLDD 24(b_ptr),b3
|
---|
1383 | FLDD 32(b_ptr),b4
|
---|
1384 | FLDD 40(b_ptr),b5
|
---|
1385 | FLDD 48(b_ptr),b6
|
---|
1386 | FLDD 56(b_ptr),b7
|
---|
1387 | ; column 0 (i+j = 0)
|
---|
1388 | MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
|
---|
1389 | STD c1,0(r_ptr) ; r[0] = c1
|
---|
1390 | COPY %r0,c1
|
---|
1391 | ; column 1 (i+j = 1)
|
---|
1392 | MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
|
---|
1393 | MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
|
---|
1394 | STD c2,8(r_ptr) ; r[1] = c2
|
---|
1395 | COPY %r0,c2
|
---|
1396 | ; column 2 (i+j = 2)
|
---|
1397 | MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
|
---|
1398 | MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
|
---|
1399 | MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
|
---|
1400 | STD c3,16(r_ptr) ; r[2] = c3
|
---|
1401 | COPY %r0,c3
|
---|
1402 | ; column 3 (i+j = 3)
|
---|
1403 | MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
|
---|
1404 | MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
|
---|
1405 | MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
|
---|
1406 | MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
|
---|
1407 | STD c1,24(r_ptr) ; r[3] = c1
|
---|
1408 | COPY %r0,c1
|
---|
1409 | ; column 4 (i+j = 4)
|
---|
1410 | MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
|
---|
1411 | MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
|
---|
1412 | MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
|
---|
1413 | MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
|
---|
1414 | MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
|
---|
1415 | STD c2,32(r_ptr) ; r[4] = c2
|
---|
1416 | COPY %r0,c2
|
---|
1417 | ; column 5 (i+j = 5)
|
---|
1418 | MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
|
---|
1419 | MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
|
---|
1420 | MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
|
---|
1421 | MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
|
---|
1422 | MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
|
---|
1423 | MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
|
---|
1424 | STD c3,40(r_ptr) ; r[5] = c3
|
---|
1425 | COPY %r0,c3
|
---|
1426 | ; column 6 (i+j = 6)
|
---|
1427 | MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
|
---|
1428 | MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
|
---|
1429 | MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
|
---|
1430 | MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
|
---|
1431 | MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
|
---|
1432 | MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
|
---|
1433 | MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
|
---|
1434 | STD c1,48(r_ptr) ; r[6] = c1
|
---|
1435 | COPY %r0,c1
|
---|
1436 | ; column 7 (i+j = 7)
|
---|
1437 | MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
|
---|
1438 | MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
|
---|
1439 | MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
|
---|
1440 | MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
|
---|
1441 | MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
|
---|
1442 | MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
|
---|
1443 | MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
|
---|
1444 | MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
|
---|
1445 | STD c2,56(r_ptr) ; r[7] = c2
|
---|
1446 | COPY %r0,c2
|
---|
1447 | ; column 8 (i+j = 8)
|
---|
1448 | MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
|
---|
1449 | MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
|
---|
1450 | MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
|
---|
1451 | MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
|
---|
1452 | MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
|
---|
1453 | MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
|
---|
1454 | MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
|
---|
1455 | STD c3,64(r_ptr) ; r[8] = c3
|
---|
1456 | COPY %r0,c3
|
---|
1457 | ; column 9 (i+j = 9)
|
---|
1458 | MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
|
---|
1459 | MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
|
---|
1460 | MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
|
---|
1461 | MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
|
---|
1462 | MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
|
---|
1463 | MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
|
---|
1464 | STD c1,72(r_ptr) ; r[9] = c1
|
---|
1465 | COPY %r0,c1
|
---|
1466 | ; column 10 (i+j = 10)
|
---|
1467 | MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
|
---|
1468 | MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
|
---|
1469 | MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
|
---|
1470 | MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
|
---|
1471 | MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
|
---|
1472 | STD c2,80(r_ptr) ; r[10] = c2
|
---|
1473 | COPY %r0,c2
|
---|
1474 | ; column 11 (i+j = 11)
|
---|
1475 | MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
|
---|
1476 | MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
|
---|
1477 | MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
|
---|
1478 | MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
|
---|
1479 | STD c3,88(r_ptr) ; r[11] = c3
|
---|
1480 | COPY %r0,c3
|
---|
1481 | ; column 12 (i+j = 12)
|
---|
1482 | MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
|
---|
1483 | MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
|
---|
1484 | MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
|
---|
1485 | STD c1,96(r_ptr) ; r[12] = c1
|
---|
1486 | COPY %r0,c1
|
---|
1487 | ; column 13 (i+j = 13)
|
---|
1488 | MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
|
---|
1489 | MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
|
---|
1490 | STD c2,104(r_ptr) ; r[13] = c2
|
---|
1491 | COPY %r0,c2
|
---|
1492 | ; column 14 (i+j = 14); the word carried out of it becomes r[15]
|
---|
1493 | MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
|
---|
1494 | STD c3,112(r_ptr) ; r[14] = c3
|
---|
1495 | STD c1,120(r_ptr) ; r[15] = c1
|
---|
1496 | ; restore the callee-save registers stored on entry
|
---|
1497 | .EXIT
|
---|
1498 | FLDD -88(%sp),%fr13 ; restore fr13
|
---|
1499 | FLDD -96(%sp),%fr12 ; restore fr12
|
---|
1500 | LDD -104(%sp),%r6 ; restore r6
|
---|
1501 | LDD -112(%sp),%r5 ; restore r5
|
---|
1502 | LDD -120(%sp),%r4 ; restore r4
|
---|
1503 | BVE (%rp) ; return to caller
|
---|
1504 | LDD,MB -128(%sp),%r3 ; restore r3 and move %sp back down by 128
|
---|
1505 |
|
---|
1506 | .PROCEND
|
---|
1507 |
|
---|
1508 | ;-----------------------------------------------------------------------------
|
---|
1509 | ;
|
---|
1510 | ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
---|
1511 | ; arg0 = r_ptr (r[0]..r[7] are written)
|
---|
1512 | ; arg1 = a_ptr (a[0]..a[3] are read)
|
---|
1513 | ; arg2 = b_ptr (b[0]..b[3] are read)
|
---|
1514 | ; Column k sums every a[i]*b[j] with i+j = k; c1,c2,c3 rotate as the 3-word accumulator.
|
---|
1515 |
|
---|
1516 | bn_mul_comba4
|
---|
1517 | .proc
|
---|
1518 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
|
---|
1519 | .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
|
---|
1520 | .entry
|
---|
1521 | .align 64
|
---|
1522 |
|
---|
1523 | STD %r3,0(%sp) ; save r3
|
---|
1524 | STD %r4,8(%sp) ; save r4
|
---|
1525 | STD %r5,16(%sp) ; save r5
|
---|
1526 | STD %r6,24(%sp) ; save r6
|
---|
1527 | FSTD %fr12,32(%sp) ; save fr12
|
---|
1528 | FSTD %fr13,40(%sp) ; save fr13
|
---|
1529 | ; NOTE(review): b6/b7 (fr12/fr13) are not referenced in this routine; the save/restore looks unnecessary - confirm
|
---|
1530 | ;
|
---|
1531 | ; Zero out carries
|
---|
1532 | ;
|
---|
1533 | COPY %r0,c1
|
---|
1534 | COPY %r0,c2
|
---|
1535 | COPY %r0,c3
|
---|
1536 |
|
---|
1537 | LDO 128(%sp),%sp ; bump stack
|
---|
1538 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
|
---|
1539 |
|
---|
1540 | ;
|
---|
1541 | ; Load up all of the values we are going to use
|
---|
1542 | ;
|
---|
1543 | FLDD 0(a_ptr),a0
|
---|
1544 | FLDD 8(a_ptr),a1
|
---|
1545 | FLDD 16(a_ptr),a2
|
---|
1546 | FLDD 24(a_ptr),a3
|
---|
1547 |
|
---|
1548 | FLDD 0(b_ptr),b0
|
---|
1549 | FLDD 8(b_ptr),b1
|
---|
1550 | FLDD 16(b_ptr),b2
|
---|
1551 | FLDD 24(b_ptr),b3
|
---|
1552 | ; column 0 (i+j = 0)
|
---|
1553 | MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
|
---|
1554 | STD c1,0(r_ptr) ; r[0] = c1
|
---|
1555 | COPY %r0,c1
|
---|
1556 | ; column 1 (i+j = 1)
|
---|
1557 | MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
|
---|
1558 | MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
|
---|
1559 | STD c2,8(r_ptr) ; r[1] = c2
|
---|
1560 | COPY %r0,c2
|
---|
1561 | ; column 2 (i+j = 2)
|
---|
1562 | MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
|
---|
1563 | MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
|
---|
1564 | MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
|
---|
1565 | STD c3,16(r_ptr) ; r[2] = c3
|
---|
1566 | COPY %r0,c3
|
---|
1567 | ; column 3 (i+j = 3)
|
---|
1568 | MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
|
---|
1569 | MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
|
---|
1570 | MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
|
---|
1571 | MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
|
---|
1572 | STD c1,24(r_ptr) ; r[3] = c1
|
---|
1573 | COPY %r0,c1
|
---|
1574 | ; column 4 (i+j = 4)
|
---|
1575 | MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
|
---|
1576 | MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
|
---|
1577 | MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
|
---|
1578 | STD c2,32(r_ptr) ; r[4] = c2
|
---|
1579 | COPY %r0,c2
|
---|
1580 | ; column 5 (i+j = 5)
|
---|
1581 | MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
|
---|
1582 | MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
|
---|
1583 | STD c3,40(r_ptr) ; r[5] = c3
|
---|
1584 | COPY %r0,c3
|
---|
1585 | ; column 6 (i+j = 6); the word carried out of it becomes r[7]
|
---|
1586 | MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
|
---|
1587 | STD c1,48(r_ptr) ; r[6] = c1
|
---|
1588 | STD c2,56(r_ptr) ; r[7] = c2
|
---|
1589 |
|
---|
1590 | .EXIT
|
---|
1591 | FLDD -88(%sp),%fr13 ; restore fr13
|
---|
1592 | FLDD -96(%sp),%fr12 ; restore fr12
|
---|
1593 | LDD -104(%sp),%r6 ; restore r6
|
---|
1594 | LDD -112(%sp),%r5 ; restore r5
|
---|
1595 | LDD -120(%sp),%r4 ; restore r4
|
---|
1596 | BVE (%rp) ; return to caller
|
---|
1597 | LDD,MB -128(%sp),%r3 ; restore r3 and move %sp back down by 128
|
---|
1598 |
|
---|
1599 | .PROCEND
|
---|
1600 |
|
---|
1601 |
|
---|
1602 | .SPACE $TEXT$
|
---|
1603 | .SUBSPA $CODE$
|
---|
1604 | .SPACE $PRIVATE$,SORT=16
|
---|
1605 | .IMPORT $global$,DATA
|
---|
1606 | .SPACE $TEXT$
|
---|
1607 | .SUBSPA $CODE$
|
---|
1608 | .SUBSPA $LIT$,ACCESS=0x2c ; C$4 below is the format string bn_div_err_case hands to fprintf
|
---|
1609 | C$4
|
---|
1610 | .ALIGN 8 ; NOTE(review): the label precedes this alignment; confirm no padding lands between C$4 and the string
|
---|
1611 | .STRINGZ "Division would overflow (%d)\n"
|
---|
1612 | .END
|
---|