1 | ;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
|
---|
2 | ;;
|
---|
3 | ;; Licensed under the OpenSSL license (the "License"). You may not use
|
---|
4 | ;; this file except in compliance with the License. You can obtain a copy
|
---|
5 | ;; in the file LICENSE in the source distribution or at
|
---|
6 | ;; https://www.openssl.org/source/license.html
|
---|
7 | ;;
|
---|
8 | ;;====================================================================
|
---|
9 | ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
---|
10 | ;; project.
|
---|
11 | ;;
|
---|
12 | ;; Rights for redistribution and usage in source and binary forms are
|
---|
13 | ;; granted according to the OpenSSL license. Warranty of any kind is
|
---|
14 | ;; disclaimed.
|
---|
15 | ;;====================================================================
|
---|
16 | ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
|
---|
17 | ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
|
---|
18 | ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
|
---|
19 | ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
|
---|
20 | ;;====================================================================
|
---|
;; Assembler set-up: code section, symbol naming and register aliases.
21 | .text
|
---|
22 |
|
---|
;; Assemblers older than version 7.0.0 get __TI_EABI__ defined as 0 here,
;; presumably because they do not predefine it -- NOTE(review): confirm.
23 | .if .ASSEMBLER_VERSION<7000000
|
---|
24 | .asg 0,__TI_EABI__
|
---|
25 | .endif
|
---|
;; When __TI_EABI__ is non-zero, each underscore-prefixed name used in this
;; file is substituted with the same name without the leading underscore.
26 | .if __TI_EABI__
|
---|
27 | .asg bn_mul_add_words,_bn_mul_add_words
|
---|
28 | .asg bn_mul_words,_bn_mul_words
|
---|
29 | .asg bn_sqr_words,_bn_sqr_words
|
---|
30 | .asg bn_add_words,_bn_add_words
|
---|
31 | .asg bn_sub_words,_bn_sub_words
|
---|
32 | .asg bn_div_words,_bn_div_words
|
---|
33 | .asg bn_sqr_comba8,_bn_sqr_comba8
|
---|
34 | .asg bn_mul_comba8,_bn_mul_comba8
|
---|
35 | .asg bn_sqr_comba4,_bn_sqr_comba4
|
---|
36 | .asg bn_mul_comba4,_bn_mul_comba4
|
---|
37 | .endif
|
---|
38 |
|
---|
;; Register aliases. RA is the target of every "BNOP RA" return in this file.
;; ARG0..ARG5 alternate between the A and B register files starting at A4,
;; and RET shares A4 with ARG0, so writing RET destroys ARG0.
39 | .asg B3,RA
|
---|
40 | .asg A4,ARG0
|
---|
41 | .asg B4,ARG1
|
---|
42 | .asg A6,ARG2
|
---|
43 | .asg B6,ARG3
|
---|
44 | .asg A8,ARG4
|
---|
45 | .asg B8,ARG5
|
---|
46 | .asg A4,RET
|
---|
;; FP, DP and SP are not referenced by the code visible in this file;
;; presumably frame, data-page and stack pointer -- NOTE(review): confirm
;; against the C6000 calling convention.
47 | .asg A15,FP
|
---|
48 | .asg B14,DP
|
---|
49 | .asg B15,SP
|
---|
50 |
|
---|
;; bn_mul_add_words: rp[i] += ap[i] * w for each of n 32-bit words, with the
;; carry propagated from word to word.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = n (word count), ARG3 = w (multiplier)
;; Out: RET = final carry word, or 0 when n is 0
;; The instruction order below is cycle-exact; do not reorder.
51 | .global _bn_mul_add_words
|
---|
52 | _bn_mul_add_words:
|
---|
53 | .asmfunc
|
---|
54 | MV ARG2,B0 ; B0 = n, also the predicate for the set-up below
|
---|
55 | [!B0] BNOP RA ; n == 0: return straight away...
|
---|
56 | ||[!B0] MVK 0,RET ; ...with 0 as the result
|
---|
57 | [B0] MVC B0,ILC ; SPLOOP iteration count = n
|
---|
58 | [B0] ZERO A19 ; high part of accumulator
|
---|
59 | || [B0] MV ARG0,A2 ; A2 = rp store pointer, ARG0 stays the rp load pointer
|
---|
60 | || [B0] MV ARG3,A3 ; A3 = w, copied to the A side
|
---|
61 | NOP 3 ; padding before SPLOOP; also covers the early-return branch
|
---|
62 |
|
---|
63 | SPLOOP 2 ; 2*n+10
|
---|
64 | ;;====================================================================
|
---|
65 | LDW *ARG1++,B7 ; ap[i]
|
---|
66 | NOP 3
|
---|
67 | LDW *ARG0++,A7 ; rp[i]
|
---|
68 | MPY32U B7,A3,A17:A16 ; A17:A16 = ap[i]*w
|
---|
69 | NOP 3 ; [2,0] in epilogue
|
---|
70 | ADDU A16,A7,A21:A20 ; low product word + rp[i]
|
---|
71 | ADDU A19,A21:A20,A19:A18 ; + carry-in; A18 = result word
|
---|
72 | || MV.S A17,A23 ; keep high product word
|
---|
73 | SPKERNEL 2,1 ; leave slot for "return value"
|
---|
74 | || STW A18,*A2++ ; rp[i]
|
---|
75 | || ADD A19,A23,A19 ; carry-out = carry bits + high product word
|
---|
76 | ;;====================================================================
|
---|
77 | BNOP RA,4
|
---|
78 | MV A19,RET ; return value
|
---|
79 | .endasmfunc
|
---|
80 |
|
---|
;; bn_mul_words: rp[i] = low word of (ap[i] * w + carry) for each of n 32-bit
;; words; the high word becomes the carry into the next word.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = n (word count), ARG3 = w (multiplier)
;; Out: RET = final carry word, or 0 when n is 0
;; The instruction order below is cycle-exact; do not reorder.
81 | .global _bn_mul_words
|
---|
82 | _bn_mul_words:
|
---|
83 | .asmfunc
|
---|
84 | MV ARG2,B0 ; B0 = n, also the predicate for the set-up below
|
---|
85 | [!B0] BNOP RA ; n == 0: return straight away...
|
---|
86 | ||[!B0] MVK 0,RET ; ...with 0 as the result
|
---|
87 | [B0] MVC B0,ILC ; SPLOOP iteration count = n
|
---|
88 | [B0] ZERO A19 ; high part of accumulator
|
---|
89 | NOP 3 ; padding before SPLOOP; also covers the early-return branch
|
---|
90 |
|
---|
91 | SPLOOP 2 ; 2*n+10
|
---|
92 | ;;====================================================================
|
---|
93 | LDW *ARG1++,A7 ; ap[i]
|
---|
94 | NOP 4
|
---|
95 | MPY32U A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w
|
---|
96 | NOP 4 ; [2,0] in epilogue
|
---|
97 | ADDU A19,A16,A19:A18 ; carry-in + low product word; A18 = result word
|
---|
98 | || MV.S A17,A21 ; keep high product word
|
---|
99 | SPKERNEL 2,1 ; leave slot for "return value"
|
---|
100 | || STW A18,*ARG0++ ; rp[i]
|
---|
101 | || ADD.L A19,A21,A19 ; carry-out = carry bit + high product word
|
---|
102 | ;;====================================================================
|
---|
103 | BNOP RA,4
|
---|
104 | MV A19,RET ; return value
|
---|
105 | .endasmfunc
|
---|
106 |
|
---|
;; bn_sqr_words: rp[2*i] and rp[2*i+1] receive the low and high word of
;; ap[i]^2 for each of n 32-bit input words.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = n (word count)
;; Out: RET is set to 0 only on the n == 0 path; otherwise A4 is simply left
;;      holding the advanced rp pointer.
;; The instruction order below is cycle-exact; do not reorder.
107 | .global _bn_sqr_words
|
---|
108 | _bn_sqr_words:
|
---|
109 | .asmfunc
|
---|
110 | MV ARG2,B0 ; B0 = n, also the predicate for the set-up below
|
---|
111 | [!B0] BNOP RA ; n == 0: return straight away
|
---|
112 | ||[!B0] MVK 0,RET
|
---|
113 | [B0] MVC B0,ILC ; SPLOOP iteration count = n
|
---|
114 | [B0] MV ARG0,B2 ; B2 -> rp[0], used for the even (low) words
|
---|
115 | || [B0] ADD 4,ARG0,ARG0 ; ARG0 -> rp[1], used for the odd (high) words
|
---|
116 | NOP 3 ; padding before SPLOOP; also covers the early-return branch
|
---|
117 |
|
---|
118 | SPLOOP 2 ; 2*n+10
|
---|
119 | ;;====================================================================
|
---|
120 | LDW *ARG1++,B7 ; ap[i]
|
---|
121 | NOP 4
|
---|
122 | MPY32U B7,B7,B1:B0 ; B1:B0 = ap[i]^2 (B0 no longer holds n)
|
---|
123 | NOP 3 ; [2,0] in epilogue
|
---|
124 | STW B0,*B2++(8) ; rp[2*i]
|
---|
125 | MV B1,A1 ; high word to the A side for the ARG0-based store
|
---|
126 | SPKERNEL 2,0 ; fully overlap BNOP RA,5
|
---|
127 | || STW A1,*ARG0++(8) ; rp[2*i+1]
|
---|
128 | ;;====================================================================
|
---|
129 | BNOP RA,5
|
---|
130 | .endasmfunc
|
---|
131 |
|
---|
;; bn_add_words: rp[i] = ap[i] + bp[i] + carry for each of n 32-bit words.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = n (word count)
;; Out: RET = carry out of the last word, or 0 when n is 0
;; The instruction order below is cycle-exact; do not reorder.
132 | .global _bn_add_words
|
---|
133 | _bn_add_words:
|
---|
134 | .asmfunc
|
---|
135 | MV ARG3,B0 ; B0 = n, also the predicate for the set-up below
|
---|
136 | [!B0] BNOP RA ; n == 0: return straight away...
|
---|
137 | ||[!B0] MVK 0,RET ; ...with 0 as the result
|
---|
138 | [B0] MVC B0,ILC ; SPLOOP iteration count = n
|
---|
139 | [B0] ZERO A1 ; carry flag
|
---|
140 | || [B0] MV ARG0,A3 ; A3 = rp store pointer; A4 is reused as RET in the loop
|
---|
141 | NOP 3 ; padding before SPLOOP; also covers the early-return branch
|
---|
142 |
|
---|
143 | SPLOOP 2 ; 2*n+6
|
---|
144 | ;;====================================================================
|
---|
145 | LDW *ARG2++,A7 ; bp[i]
|
---|
146 | || LDW *ARG1++,B7 ; ap[i]
|
---|
147 | NOP 4
|
---|
148 | ADDU A7,B7,A9:A8 ; A9:A8 = ap[i] + bp[i]
|
---|
149 | ADDU A1,A9:A8,A1:A0 ; + carry-in; A0 = result word, A1 = carry-out
|
---|
150 | SPKERNEL 0,0 ; fully overlap BNOP RA,5
|
---|
151 | || STW A0,*A3++ ; write result
|
---|
152 | || MV A1,RET ; keep carry flag in RET
|
---|
153 | ;;====================================================================
|
---|
154 | BNOP RA,5
|
---|
155 | .endasmfunc
|
---|
156 |
|
---|
;; bn_sub_words: rp[i] = ap[i] - bp[i] - borrow for each of n 32-bit words.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = n (word count)
;; Out: RET = borrow out of the last word (0 or 1), or 0 when n is 0
;; The instruction order below is cycle-exact; do not reorder.
157 | .global _bn_sub_words
|
---|
158 | _bn_sub_words:
|
---|
159 | .asmfunc
|
---|
160 | MV ARG3,B0 ; B0 = n, also the predicate for the set-up below
|
---|
161 | [!B0] BNOP RA ; n == 0: return straight away...
|
---|
162 | ||[!B0] MVK 0,RET ; ...with 0 as the result
|
---|
163 | [B0] MVC B0,ILC ; SPLOOP iteration count = n
|
---|
164 | [B0] ZERO A2 ; borrow flag
|
---|
165 | || [B0] MV ARG0,A3 ; A3 = rp store pointer
|
---|
166 | NOP 3 ; padding before SPLOOP; also covers the early-return branch
|
---|
167 |
|
---|
168 | SPLOOP 2 ; 2*n+6
|
---|
169 | ;;====================================================================
|
---|
170 | LDW *ARG2++,A7 ; bp[i]
|
---|
171 | || LDW *ARG1++,B7 ; ap[i]
|
---|
172 | NOP 4
|
---|
173 | SUBU B7,A7,A1:A0 ; A1:A0 = ap[i] - bp[i]
|
---|
174 | [A2] SUB A1:A0,1,A1:A0 ; subtract the incoming borrow, if any
|
---|
175 | SPKERNEL 0,1 ; leave slot for "return borrow flag"
|
---|
176 | || STW A0,*A3++ ; write result
|
---|
177 | || AND 1,A1,A2 ; pass on borrow flag
|
---|
178 | ;;====================================================================
|
---|
179 | BNOP RA,4
|
---|
180 | AND 1,A1,RET ; return borrow flag
|
---|
181 | .endasmfunc
|
---|
182 |
|
---|
;; bn_div_words: shift-and-subtract division of the double word hi:lo by dv,
;; producing one quotient bit per step.
;; In:  A4 = hi, B4 = lo, A6 = dv
;; Out: A4 = quotient, or -1 when hi has fewer leading zero bits than dv
;;      (the "overflow" case, returned without dividing)
;; The first step is executed ahead of the SPLOOP; the loop count is
;; 32 plus the number of leading zero bits in dv.
;; The instruction order below is cycle-exact; do not reorder.
183 | .global _bn_div_words
|
---|
184 | _bn_div_words:
|
---|
185 | .asmfunc
|
---|
186 | LMBD 1,A6,A0 ; leading zero bits in dv
|
---|
187 | LMBD 1,A4,A1 ; leading zero bits in hi
|
---|
188 | || MVK 32,B0
|
---|
189 | CMPLTU A1,A0,A2 ; A2 = hi has fewer leading zeros than dv
|
---|
190 | || ADD A0,B0,B0 ; B0 = 32 + leading zero bits in dv
|
---|
191 | [ A2] BNOP RA
|
---|
192 | ||[ A2] MVK -1,A4 ; return overflow
|
---|
193 | ||[!A2] MV A4,A3 ; reassign hi
|
---|
194 | [!A2] MV B4,A4 ; reassign lo, will be quotient
|
---|
195 | ||[!A2] MVC B0,ILC ; SPLOOP iteration count
|
---|
196 | [!A2] SHL A6,A0,A6 ; normalize dv
|
---|
197 | || MVK 1,A1 ; A1=1 appears to keep the [!A1] pair below idle on the overflow path
|
---|
198 |
|
---|
199 | [!A2] CMPLTU A3,A6,A1 ; hi<dv?
|
---|
200 | ||[!A2] SHL A4,1,A5:A4 ; lo<<1
|
---|
201 | [!A1] SUB A3,A6,A3 ; hi-=dv
|
---|
202 | ||[!A1] OR 1,A4,A4 ; set quotient bit
|
---|
203 | [!A2] SHRU A3,31,A1 ; upper bit
|
---|
204 | ||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
|
---|
205 |
|
---|
206 | SPLOOP 3
|
---|
207 | [!A1] CMPLTU A3,A6,A1 ; hi<dv?
|
---|
208 | ||[ A1] ZERO A1 ; upper bit was shifted out: force the subtract below
|
---|
209 | || SHL A4,1,A5:A4 ; lo<<1
|
---|
210 | [!A1] SUB A3,A6,A3 ; hi-=dv
|
---|
211 | ||[!A1] OR 1,A4,A4 ; quotient
|
---|
212 | SHRU A3,31,A1 ; upper bit
|
---|
213 | || ADDAH A5,A3,A3 ; hi<<1|lo>>31
|
---|
214 | SPKERNEL
|
---|
215 |
|
---|
216 | BNOP RA,5
|
---|
217 | .endasmfunc
|
---|
218 |
|
---|
219 | ;;====================================================================
|
---|
220 | ;; Not really Comba algorithm, just straightforward NxM... Dedicated
|
---|
221 | ;; fully unrolled real Comba implementations are asymptotically 2x
|
---|
222 | ;; faster, but naturally larger undertaking. Purpose of this exercise
|
---|
223 | ;; was rather to learn to master nested SPLOOPs...
|
---|
224 | ;;====================================================================
|
---|
;; bn_mul_comba8 / bn_sqr_comba8: 8x8-word multiplication built from an
;; outer loop around the multiply-and-add SPLOOP.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp (bn_mul_comba8)
;;      ARG0 = rp, ARG1 = ap            (bn_sqr_comba8, which sets bp = ap)
;; Each outer pass multiplies all of bp[] by one ap word and accumulates into
;; rp[]; the first pass does not load rp[] and accumulates into zero instead.
;; No return value is produced. The label sploopNxM? is also the target of
;; the disabled (".if 0") path in bn_mul_comba4.
;; The instruction order below is cycle-exact; do not reorder.
225 | .global _bn_sqr_comba8
|
---|
226 | .global _bn_mul_comba8
|
---|
227 | _bn_sqr_comba8:
|
---|
228 | MV ARG1,ARG2 ; squaring: bp = ap, then fall through
|
---|
229 | _bn_mul_comba8:
|
---|
230 | .asmfunc
|
---|
231 | MVK 8,B0 ; N, RILC
|
---|
232 | || MVK 8,A0 ; M, outer loop counter
|
---|
233 | || MV ARG1,A5 ; copy ap
|
---|
234 | || MV ARG0,B4 ; copy rp
|
---|
235 | || ZERO B19 ; high part of accumulator
|
---|
236 | MVC B0,RILC
|
---|
237 | || SUB B0,2,B1 ; N-2, initial ILC
|
---|
238 | || SUB B0,1,B2 ; const B2=N-1
|
---|
239 | || LDW *A5++,B6 ; ap[0]
|
---|
240 | || MV A0,A3 ; const A3=M
|
---|
241 | sploopNxM?: ; for best performance arrange M<=N
|
---|
242 | [A0] SPLOOPD 2 ; 2*n+10
|
---|
243 | || MVC B1,ILC
|
---|
244 | || ADDAW B4,B0,B5 ; B5 = B4 + N words, the rp load pointer
|
---|
245 | || ZERO B7 ; rp[i] counts as 0 while its loads are disabled
|
---|
246 | || LDW *A5++,A9 ; pre-fetch ap[1]
|
---|
247 | || ZERO A1 ; A1=0: rp[i] loads disabled for the first pass
|
---|
248 | || SUB A0,1,A0
|
---|
249 | ;;====================================================================
|
---|
250 | ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
|
---|
251 | ;; This is because of Advisory 15 from TI publication SPRZ247I.
|
---|
252 | LDW *ARG2++,A7 ; bp[i]
|
---|
253 | NOP 3
|
---|
254 | [A1] LDW *B5++,B7 ; rp[i]
|
---|
255 | MPY32U A7,B6,B17:B16 ; B17:B16 = bp[i] * current ap word
|
---|
256 | NOP 3
|
---|
257 | ADDU B16,B7,B21:B20 ; low product word + rp[i]
|
---|
258 | ADDU B19,B21:B20,B19:B18 ; + carry-in; B18 = result word
|
---|
259 | || MV.S B17,B23 ; keep high product word
|
---|
260 | SPKERNEL
|
---|
261 | || STW B18,*B4++ ; rp[i]
|
---|
262 | || ADD.S B19,B23,B19 ; carry-out = carry bits + high product word
|
---|
263 | ;;====================================================================
|
---|
264 | outer?: ; m*2*(n+1)+10
|
---|
265 | SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
|
---|
266 | SPMASKR
|
---|
267 | || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
|
---|
268 | MVD A9,B6 ; move through .M unit(*)
|
---|
269 | [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
|
---|
270 | SUBAW B5,B2,B5 ; rewind rp to rp[1]
|
---|
271 | MVK 1,A1 ; enable rp[i] loads for the following passes
|
---|
272 | [A0] BNOP.S1 outer?,4
|
---|
273 | || [A0] SUB.L A0,1,A0
|
---|
274 | STW B19,*B4--[B2] ; store top word, then rewind rp to rp[1]
|
---|
275 | || ZERO.S B19 ; high part of accumulator
|
---|
276 | ;; end of outer?
|
---|
277 | BNOP RA,5 ; return
|
---|
278 | .endasmfunc
|
---|
279 | ;; (*) It should be noted that B6 is used as input to MPY32U in
|
---|
280 | ;; chronologically next cycle in *preceding* SPLOOP iteration.
|
---|
281 | ;; Normally such arrangement would require DINT, but at this
|
---|
282 | ;; point SPLOOP is draining and interrupts are disabled
|
---|
283 | ;; implicitly.
|
---|
284 |
|
---|
;; bn_mul_comba4 / bn_sqr_comba4: 4x4-word multiplication, fully unrolled.
;; In:  ARG0 = rp (8 result words are stored), ARG1 = ap, ARG2 = bp
;;      bn_sqr_comba4 sets bp = ap and falls through.
;; The a[] words are loaded into B16..B19 and the b[] words into A16..A19.
;; Products are summed column by column: low halves on the A side, high
;; halves on the B side, with B0/B8 carrying the B-side sum into the next
;; column. The ".if 0" branch is a disabled variant that would reuse the
;; nested SPLOOP at sploopNxM? with N=M=4.
;; No return value is produced.
;; The instruction order below is cycle-exact; do not reorder.
285 | .global _bn_sqr_comba4
|
---|
286 | .global _bn_mul_comba4
|
---|
287 | _bn_sqr_comba4:
|
---|
288 | MV ARG1,ARG2 ; squaring: bp = ap, then fall through
|
---|
289 | _bn_mul_comba4:
|
---|
290 | .asmfunc
|
---|
291 | .if 0
|
---|
292 | BNOP sploopNxM?,3
|
---|
293 | ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
|
---|
294 | ;; because of low-counter effect, when prologue phase finishes
|
---|
295 | ;; before SPKERNEL instruction is reached. As result it's 25%
|
---|
296 | ;; slower than expected...
|
---|
297 | MVK 4,B0 ; N, RILC
|
---|
298 | || MVK 4,A0 ; M, outer loop counter
|
---|
299 | || MV ARG1,A5 ; copy ap
|
---|
300 | || MV ARG0,B4 ; copy rp
|
---|
301 | || ZERO B19 ; high part of accumulator
|
---|
302 | MVC B0,RILC
|
---|
303 | || SUB B0,2,B1 ; first ILC
|
---|
304 | || SUB B0,1,B2 ; const B2=N-1
|
---|
305 | || LDW *A5++,B6 ; ap[0]
|
---|
306 | || MV A0,A3 ; const A3=M
|
---|
307 | .else
|
---|
308 | ;; This alternative is an exercise in fully unrolled Comba
|
---|
309 | ;; algorithm implementation that operates at n*(n+1)+12, or
|
---|
310 | ;; as little as 32 cycles...
|
---|
311 | LDW *ARG1[0],B16 ; a[0]
|
---|
312 | || LDW *ARG2[0],A16 ; b[0]
|
---|
313 | LDW *ARG1[1],B17 ; a[1]
|
---|
314 | || LDW *ARG2[1],A17 ; b[1]
|
---|
315 | LDW *ARG1[2],B18 ; a[2]
|
---|
316 | || LDW *ARG2[2],A18 ; b[2]
|
---|
317 | LDW *ARG1[3],B19 ; a[3]
|
---|
318 | || LDW *ARG2[3],A19 ; b[3]
|
---|
319 | NOP
|
---|
320 | MPY32U A16,B16,A1:A0 ; a[0]*b[0]
|
---|
321 | MPY32U A17,B16,A23:A22 ; a[0]*b[1]
|
---|
322 | MPY32U A16,B17,A25:A24 ; a[1]*b[0]
|
---|
323 | MPY32U A16,B18,A27:A26 ; a[2]*b[0]
|
---|
324 | STW A0,*ARG0[0] ; r[0]
|
---|
325 | || MPY32U A17,B17,A29:A28 ; a[1]*b[1]
|
---|
326 | MPY32U A18,B16,A31:A30 ; a[0]*b[2]
|
---|
327 | || ADDU A22,A1,A1:A0
|
---|
328 | MV A23,B0
|
---|
329 | || MPY32U A19,B16,A21:A20 ; a[0]*b[3]
|
---|
330 | || ADDU A24,A1:A0,A1:A0
|
---|
331 | ADDU A25,B0,B1:B0
|
---|
332 | || STW A0,*ARG0[1] ; r[1]
|
---|
333 | || MPY32U A18,B17,A23:A22 ; a[1]*b[2]
|
---|
334 | || ADDU A26,A1,A9:A8
|
---|
335 | ADDU A27,B1,B9:B8
|
---|
336 | || MPY32U A17,B18,A25:A24 ; a[2]*b[1]
|
---|
337 | || ADDU A28,A9:A8,A9:A8
|
---|
338 | ADDU A29,B9:B8,B9:B8
|
---|
339 | || MPY32U A16,B19,A27:A26 ; a[3]*b[0]
|
---|
340 | || ADDU A30,A9:A8,A9:A8
|
---|
341 | ADDU A31,B9:B8,B9:B8
|
---|
342 | || ADDU B0,A9:A8,A9:A8
|
---|
343 | STW A8,*ARG0[2] ; r[2]
|
---|
344 | || ADDU A20,A9,A1:A0
|
---|
345 | ADDU A21,B9,B1:B0
|
---|
346 | || MPY32U A19,B17,A21:A20 ; a[1]*b[3]
|
---|
347 | || ADDU A22,A1:A0,A1:A0
|
---|
348 | ADDU A23,B1:B0,B1:B0
|
---|
349 | || MPY32U A18,B18,A23:A22 ; a[2]*b[2]
|
---|
350 | || ADDU A24,A1:A0,A1:A0
|
---|
351 | ADDU A25,B1:B0,B1:B0
|
---|
352 | || MPY32U A17,B19,A25:A24 ; a[3]*b[1]
|
---|
353 | || ADDU A26,A1:A0,A1:A0
|
---|
354 | ADDU A27,B1:B0,B1:B0
|
---|
355 | || ADDU B8,A1:A0,A1:A0
|
---|
356 | STW A0,*ARG0[3] ; r[3]
|
---|
357 | || MPY32U A19,B18,A27:A26 ; a[2]*b[3]
|
---|
358 | || ADDU A20,A1,A9:A8
|
---|
359 | ADDU A21,B1,B9:B8
|
---|
360 | || MPY32U A18,B19,A29:A28 ; a[3]*b[2]
|
---|
361 | || ADDU A22,A9:A8,A9:A8
|
---|
362 | ADDU A23,B9:B8,B9:B8
|
---|
363 | || MPY32U A19,B19,A31:A30 ; a[3]*b[3]
|
---|
364 | || ADDU A24,A9:A8,A9:A8
|
---|
365 | ADDU A25,B9:B8,B9:B8
|
---|
366 | || ADDU B0,A9:A8,A9:A8
|
---|
367 | STW A8,*ARG0[4] ; r[4]
|
---|
368 | || ADDU A26,A9,A1:A0
|
---|
369 | ADDU A27,B9,B1:B0
|
---|
370 | || ADDU A28,A1:A0,A1:A0
|
---|
371 | ADDU A29,B1:B0,B1:B0
|
---|
372 | || BNOP RA ; return; the remaining packets run in its delay slots
|
---|
373 | || ADDU B8,A1:A0,A1:A0
|
---|
374 | STW A0,*ARG0[5] ; r[5]
|
---|
375 | || ADDU A30,A1,A9:A8
|
---|
376 | ADD A31,B1,B8
|
---|
377 | ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below
|
---|
378 | ADD B8,A9,A9
|
---|
379 | || STW A8,*ARG0[6] ; r[6]
|
---|
380 | STW A9,*ARG0[7] ; r[7]
|
---|
381 | .endif
|
---|
382 | .endasmfunc
|
---|