VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/bn/asm/bn-c64xplus.asm@ 69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago

Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified.
bugref:8070: src/libs maintenance

  • Property svn:eol-style set to native
File size: 9.9 KB
Line 
1;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
2;;
3;; Licensed under the OpenSSL license (the "License"). You may not use
4;; this file except in compliance with the License. You can obtain a copy
5;; in the file LICENSE in the source distribution or at
6;; https://www.openssl.org/source/license.html
7;;
8;;====================================================================
9;; Written by Andy Polyakov <[email protected]> for the OpenSSL
10;; project.
11;;
12;; Rights for redistribution and usage in source and binary forms are
13;; granted according to the OpenSSL license. Warranty of any kind is
14;; disclaimed.
15;;====================================================================
16;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
17;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
18;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
19;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
20;;====================================================================
;; Code section, followed by assembly-time aliases (.asg) for the exported
;; function names and for the registers the routines below use.
21 .text
22
23 .if .ASSEMBLER_VERSION<7000000
24 .asg 0,__TI_EABI__ ; .ASSEMBLER_VERSION below 7000000: define __TI_EABI__ as 0
25 .endif
26 .if __TI_EABI__ ; when set, each _-prefixed name below is substituted by the unprefixed one
27 .asg bn_mul_add_words,_bn_mul_add_words
28 .asg bn_mul_words,_bn_mul_words
29 .asg bn_sqr_words,_bn_sqr_words
30 .asg bn_add_words,_bn_add_words
31 .asg bn_sub_words,_bn_sub_words
32 .asg bn_div_words,_bn_div_words
33 .asg bn_sqr_comba8,_bn_sqr_comba8
34 .asg bn_mul_comba8,_bn_mul_comba8
35 .asg bn_sqr_comba4,_bn_sqr_comba4
36 .asg bn_mul_comba4,_bn_mul_comba4
37 .endif
38
39 .asg B3,RA ; return address: target of every "BNOP RA" below
40 .asg A4,ARG0 ; 1st argument
41 .asg B4,ARG1 ; 2nd argument
42 .asg A6,ARG2 ; 3rd argument
43 .asg B6,ARG3 ; 4th argument
44 .asg A8,ARG4 ; ARG4/ARG5 are defined but not used by the routines in this file
45 .asg B8,ARG5
46 .asg A4,RET ; return value; same physical register as ARG0
47 .asg A15,FP ; FP, DP and SP are defined but not referenced by the routines in this file
48 .asg B14,DP
49 .asg B15,SP
50
50
;;--------------------------------------------------------------------
;; _bn_mul_add_words: rp[i] += ap[i] * w for i = 0..n-1, with carry
;; propagated from word to word.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = n (word count), ARG3 = w
;; Out: RET  = carry word left after the last iteration (0 when n == 0)
;; Uses A2, A3, A7, A16-A21, A23, B0, B7 and ILC.
;;--------------------------------------------------------------------
51 .global _bn_mul_add_words
52_bn_mul_add_words:
53 .asmfunc
54 MV ARG2,B0 ; B0 = n, used as predicate and as loop count
55 [!B0] BNOP RA ; n == 0: return; the [B0] setup and NOP 3 below fall in its delay slots
56||[!B0] MVK 0,RET ; ... returning 0
57 [B0] MVC B0,ILC ; inner loop count = n
58 [B0] ZERO A19 ; high part of accumulator
59|| [B0] MV ARG0,A2 ; A2 = separate store pointer into rp (ARG0 is the load pointer)
60|| [B0] MV ARG3,A3 ; A3 = w
61 NOP 3
62
63 SPLOOP 2 ; 2*n+10
64;;====================================================================
65 LDW *ARG1++,B7 ; ap[i]
66 NOP 3
67 LDW *ARG0++,A7 ; rp[i]
68 MPY32U B7,A3,A17:A16 ; A17:A16 = ap[i] * w (64-bit unsigned product)
69 NOP 3 ; [2,0] in epilogue
70 ADDU A16,A7,A21:A20 ; low product word + rp[i]
71 ADDU A19,A21:A20,A19:A18 ; + accumulator carried in; A18 = result word
72|| MV.S A17,A23 ; keep the high product word for the next packet
73 SPKERNEL 2,1 ; leave slot for "return value"
74|| STW A18,*A2++ ; rp[i]
75|| ADD A19,A23,A19 ; accumulator for next word = carries + high product word
76;;====================================================================
77 BNOP RA,4
78 MV A19,RET ; return value
79 .endasmfunc
80
;;--------------------------------------------------------------------
;; _bn_mul_words: rp[i] = low word of (ap[i] * w + carry) for
;; i = 0..n-1; the high part becomes the carry into the next word.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = n (word count), ARG3 = w
;; Out: RET  = carry word left after the last iteration (0 when n == 0)
;; Uses A7, A16-A19, A21, B0 and ILC; ARG0 itself is the store pointer.
;;--------------------------------------------------------------------
81 .global _bn_mul_words
82_bn_mul_words:
83 .asmfunc
84 MV ARG2,B0 ; B0 = n, used as predicate and as loop count
85 [!B0] BNOP RA ; n == 0: return; the [B0] setup and NOP 3 below fall in its delay slots
86||[!B0] MVK 0,RET ; ... returning 0
87 [B0] MVC B0,ILC ; inner loop count = n
88 [B0] ZERO A19 ; high part of accumulator
89 NOP 3
90
91 SPLOOP 2 ; 2*n+10
92;;====================================================================
93 LDW *ARG1++,A7 ; ap[i]
94 NOP 4
95 MPY32U A7,ARG3,A17:A16 ; A17:A16 = ap[i] * w (64-bit unsigned product)
96 NOP 4 ; [2,0] in epilogue
97 ADDU A19,A16,A19:A18 ; low product word + accumulator; A18 = result word
98|| MV.S A17,A21 ; keep the high product word for the next packet
99 SPKERNEL 2,1 ; leave slot for "return value"
100|| STW A18,*ARG0++ ; rp[i]
101|| ADD.L A19,A21,A19 ; accumulator for next word = carry + high product word
102;;====================================================================
103 BNOP RA,4
104 MV A19,RET ; return value
105 .endasmfunc
106
;;--------------------------------------------------------------------
;; _bn_sqr_words: for i = 0..n-1 store the 64-bit square of ap[i] as
;; two words, rp[2*i] (low) and rp[2*i+1] (high).
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = n (word count)
;; Out: RET is set (to 0) only on the n == 0 path; otherwise A4 is left
;;      holding the advanced odd-word store pointer.
;; Uses A1, B0, B1, B2, B7 and ILC.
;;--------------------------------------------------------------------
107 .global _bn_sqr_words
108_bn_sqr_words:
109 .asmfunc
110 MV ARG2,B0 ; B0 = n, used as predicate and as loop count
111 [!B0] BNOP RA ; n == 0: return; the [B0] setup and NOP 3 below fall in its delay slots
112||[!B0] MVK 0,RET
113 [B0] MVC B0,ILC ; inner loop count = n
114 [B0] MV ARG0,B2 ; B2 -> rp[0], pointer for the even (low) words
115|| [B0] ADD 4,ARG0,ARG0 ; ARG0 -> rp[1], pointer for the odd (high) words
116 NOP 3
117
118 SPLOOP 2 ; 2*n+10
119;;====================================================================
120 LDW *ARG1++,B7 ; ap[i]
121 NOP 4
122 MPY32U B7,B7,B1:B0 ; B1:B0 = ap[i]^2 (B0 is free again: ILC already holds n)
123 NOP 3 ; [2,0] in epilogue
124 STW B0,*B2++(8) ; rp[2*i]
125 MV B1,A1 ; copy the high word to the A side for the A-side store
126 SPKERNEL 2,0 ; fully overlap BNOP RA,5
127|| STW A1,*ARG0++(8) ; rp[2*i+1]
128;;====================================================================
129 BNOP RA,5
130 .endasmfunc
131
;;--------------------------------------------------------------------
;; _bn_add_words: rp[i] = ap[i] + bp[i] + carry for i = 0..n-1.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = n (word count)
;; Out: RET  = carry out of the last word (0 when n == 0)
;; Uses A0, A1, A3, A7-A9, B0, B7 and ILC.
;;--------------------------------------------------------------------
132 .global _bn_add_words
133_bn_add_words:
134 .asmfunc
135 MV ARG3,B0 ; B0 = n, used as predicate and as loop count
136 [!B0] BNOP RA ; n == 0: return; the [B0] setup and NOP 3 below fall in its delay slots
137||[!B0] MVK 0,RET ; ... returning 0
138 [B0] MVC B0,ILC ; inner loop count = n
139 [B0] ZERO A1 ; carry flag
140|| [B0] MV ARG0,A3 ; A3 = rp store pointer (A4 is rewritten as RET inside the loop)
141 NOP 3
142
143 SPLOOP 2 ; 2*n+6
144;;====================================================================
145 LDW *ARG2++,A7 ; bp[i]
146|| LDW *ARG1++,B7 ; ap[i]
147 NOP 4
148 ADDU A7,B7,A9:A8 ; A9:A8 = ap[i] + bp[i], carry kept in A9
149 ADDU A1,A9:A8,A1:A0 ; + incoming carry; A0 = result word, A1 = carry out
150 SPKERNEL 0,0 ; fully overlap BNOP RA,5
151|| STW A0,*A3++ ; write result
152|| MV A1,RET ; keep carry flag in RET
153;;====================================================================
154 BNOP RA,5
155 .endasmfunc
156
;;--------------------------------------------------------------------
;; _bn_sub_words: rp[i] = ap[i] - bp[i] - borrow for i = 0..n-1.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = n (word count)
;; Out: RET  = borrow out of the last word, 0 or 1 (0 when n == 0)
;; Uses A0-A3, A7, B0, B7 and ILC.
;;--------------------------------------------------------------------
157 .global _bn_sub_words
158_bn_sub_words:
159 .asmfunc
160 MV ARG3,B0 ; B0 = n, used as predicate and as loop count
161 [!B0] BNOP RA ; n == 0: return; the [B0] setup and NOP 3 below fall in its delay slots
162||[!B0] MVK 0,RET ; ... returning 0
163 [B0] MVC B0,ILC ; inner loop count = n
164 [B0] ZERO A2 ; borrow flag
165|| [B0] MV ARG0,A3 ; A3 = rp store pointer
166 NOP 3
167
168 SPLOOP 2 ; 2*n+6
169;;====================================================================
170 LDW *ARG2++,A7 ; bp[i]
171|| LDW *ARG1++,B7 ; ap[i]
172 NOP 4
173 SUBU B7,A7,A1:A0 ; A1:A0 = ap[i] - bp[i] as a 40-bit result
174 [A2] SUB A1:A0,1,A1:A0 ; subtract the incoming borrow, if any
175 SPKERNEL 0,1 ; leave slot for "return borrow flag"
176|| STW A0,*A3++ ; write result
177|| AND 1,A1,A2 ; pass on borrow flag
178;;====================================================================
179 BNOP RA,4
180 AND 1,A1,RET ; return borrow flag
181 .endasmfunc
182
;;--------------------------------------------------------------------
;; _bn_div_words: bit-serial (shift-and-subtract) division of the
;; two-word value hi:lo by dv.
;; In:  A4 = hi, B4 = lo, A6 = dv
;; Out: A4 = quotient, or -1 when hi has fewer leading zero bits than
;;      dv (reported as overflow).
;; One quotient bit is produced by the straight-line step before the
;; SPLOOP and one per SPLOOP iteration; ILC = 32 + leading zeros of dv.
;; Uses A0-A6, B0 and ILC.
;; NOTE(review): no explicit test for dv == 0 is visible here.
;;--------------------------------------------------------------------
183 .global _bn_div_words
184_bn_div_words:
185 .asmfunc
186 LMBD 1,A6,A0 ; leading zero bits in dv
187 LMBD 1,A4,A1 ; leading zero bits in hi
188|| MVK 32,B0
189 CMPLTU A1,A0,A2 ; A2 = 1 when hi has fewer leading zeros than dv
190|| ADD A0,B0,B0 ; B0 = 32 + leading zeros of dv (loop count)
191 [ A2] BNOP RA ; overflow: return; the five packets up to SPLOOP are its delay slots
192||[ A2] MVK -1,A4 ; return overflow
193||[!A2] MV A4,A3 ; reassign hi
194 [!A2] MV B4,A4 ; reassign lo, will be quotient
195||[!A2] MVC B0,ILC
196 [!A2] SHL A6,A0,A6 ; normalize dv
197|| MVK 1,A1 ; on the overflow path A1 stays 1, so the [!A1] pair below leaves A4 = -1 intact
198
199 [!A2] CMPLTU A3,A6,A1 ; hi<dv?
200||[!A2] SHL A4,1,A5:A4 ; lo<<1
201 [!A1] SUB A3,A6,A3 ; hi-=dv
202||[!A1] OR 1,A4,A4 ; set the quotient bit
203 [!A2] SHRU A3,31,A1 ; upper bit
204||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
205
206 SPLOOP 3
207 [!A1] CMPLTU A3,A6,A1 ; hi<dv?
208||[ A1] ZERO A1 ; a bit was shifted out of hi: clear A1 so the subtraction below always runs
209|| SHL A4,1,A5:A4 ; lo<<1
210 [!A1] SUB A3,A6,A3 ; hi-=dv
211||[!A1] OR 1,A4,A4 ; quotient
212 SHRU A3,31,A1 ; upper bit
213|| ADDAH A5,A3,A3 ; hi<<1|lo>>31
214 SPKERNEL
215
216 BNOP RA,5
217 .endasmfunc
218
219;;====================================================================
220;; Not really Comba algorithm, just straightforward NxM... Dedicated
221;; fully unrolled real Comba implementations are asymptotically 2x
222;; faster, but naturally larger undertaking. Purpose of this exercise
223;; was rather to learn to master nested SPLOOPs...
224;;====================================================================
;;--------------------------------------------------------------------
;; _bn_mul_comba8: rp[] = ap[] * bp[] with N = M = 8 words per operand,
;; built as an outer loop over ap[] around the multiply-and-add SPLOOP.
;; _bn_sqr_comba8: copies ARG1 into ARG2 (bp = ap) and falls through
;; into _bn_mul_comba8.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp
;; Out: no value is moved into RET.
;; During the first outer pass A1 = 0, so rp[] is not loaded and B7
;; stays 0; A1 is set to 1 before later passes.
;;--------------------------------------------------------------------
225 .global _bn_sqr_comba8
226 .global _bn_mul_comba8
227_bn_sqr_comba8:
228 MV ARG1,ARG2 ; squaring: second operand = first operand
229_bn_mul_comba8:
230 .asmfunc
231 MVK 8,B0 ; N, RILC
232|| MVK 8,A0 ; M, outer loop counter
233|| MV ARG1,A5 ; copy ap
234|| MV ARG0,B4 ; copy rp
235|| ZERO B19 ; high part of accumulator
236 MVC B0,RILC
237|| SUB B0,2,B1 ; N-2, initial ILC
238|| SUB B0,1,B2 ; const B2=N-1
239|| LDW *A5++,B6 ; ap[0]
240|| MV A0,A3 ; const A3=M
241sploopNxM?: ; for best performance arrange M<=N
242 [A0] SPLOOPD 2 ; 2*n+10
243|| MVC B1,ILC
244|| ADDAW B4,B0,B5 ; B5 = B4 + N words: pointer used for the rp[i] loads
245|| ZERO B7 ; rp[i] addend is 0 while the loads are predicated off
246|| LDW *A5++,A9 ; pre-fetch ap[1]
247|| ZERO A1 ; A1 = 0: skip the rp[i] loads on the first pass
248|| SUB A0,1,A0
249;;====================================================================
250;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
251;; This is because of Advisory 15 from TI publication SPRZ247I.
252 LDW *ARG2++,A7 ; bp[i]
253 NOP 3
254 [A1] LDW *B5++,B7 ; rp[i]
255 MPY32U A7,B6,B17:B16 ; B17:B16 = bp[i] * current ap word (B6)
256 NOP 3
257 ADDU B16,B7,B21:B20 ; low product word + rp[i]
258 ADDU B19,B21:B20,B19:B18 ; + accumulator carried in; B18 = result word
259|| MV.S B17,B23 ; keep the high product word for the next packet
260 SPKERNEL
261|| STW B18,*B4++ ; rp[i]
262|| ADD.S B19,B23,B19 ; accumulator for next word = carries + high product word
263;;====================================================================
264outer?: ; m*2*(n+1)+10
265 SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
266 SPMASKR
267|| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
268 MVD A9,B6 ; move through .M unit(*)
269 [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
270 SUBAW B5,B2,B5 ; rewind rp to rp[1]
271 MVK 1,A1 ; from now on the SPLOOP loads rp[i]
272 [A0] BNOP.S1 outer?,4
273|| [A0] SUB.L A0,1,A0
274 STW B19,*B4--[B2] ; store the carry word, then rewind rp to rp[1]
275|| ZERO.S B19 ; high part of accumulator
276;; end of outer?
277 BNOP RA,5 ; return
278 .endasmfunc
279;; (*) It should be noted that B6 is used as input to MPY32U in
280;; chronologically next cycle in *preceding* SPLOOP iteration.
281;; Normally such arrangement would require DINT, but at this
282;; point SPLOOP is draining and interrupts are disabled
283;; implicitly.
284
;;--------------------------------------------------------------------
;; _bn_mul_comba4: rp[0..7] = ap[0..3] * bp[0..3], fully unrolled.
;; _bn_sqr_comba4: copies ARG1 into ARG2 (bp = ap) and falls through
;; into _bn_mul_comba4.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp
;; Out: no value is moved into RET.
;; Register roles in the active (.else) branch:
;;   B16..B19 = a[0..3], A16..A19 = b[0..3]
;;   A-side register pairs sum the low product words of each result
;;   column plus the carried-in B-side sum; B-side pairs sum the high
;;   product words, which belong to the next column.
;; The product comments below name the operands by that register mapping.
;;--------------------------------------------------------------------
285 .global _bn_sqr_comba4
286 .global _bn_mul_comba4
287_bn_sqr_comba4:
288 MV ARG1,ARG2 ; squaring: second operand = first operand
289_bn_mul_comba4:
290 .asmfunc
291 .if 0
;; Disabled alternative: reuse the generic NxM loop at sploopNxM? with N = M = 4.
292 BNOP sploopNxM?,3
293 ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
294 ;; because of low-counter effect, when prologue phase finishes
295 ;; before SPKERNEL instruction is reached. As result it's 25%
296 ;; slower than expected...
297 MVK 4,B0 ; N, RILC
298|| MVK 4,A0 ; M, outer loop counter
299|| MV ARG1,A5 ; copy ap
300|| MV ARG0,B4 ; copy rp
301|| ZERO B19 ; high part of accumulator
302 MVC B0,RILC
303|| SUB B0,2,B1 ; first ILC
304|| SUB B0,1,B2 ; const B2=N-1
305|| LDW *A5++,B6 ; ap[0]
306|| MV A0,A3 ; const A3=M
307 .else
308 ;; This alternative is an exercise in fully unrolled Comba
309 ;; algorithm implementation that operates at n*(n+1)+12, or
310 ;; as little as 32 cycles...
311 LDW *ARG1[0],B16 ; a[0]
312|| LDW *ARG2[0],A16 ; b[0]
313 LDW *ARG1[1],B17 ; a[1]
314|| LDW *ARG2[1],A17 ; b[1]
315 LDW *ARG1[2],B18 ; a[2]
316|| LDW *ARG2[2],A18 ; b[2]
317 LDW *ARG1[3],B19 ; a[3]
318|| LDW *ARG2[3],A19 ; b[3]
319 NOP
320 MPY32U A16,B16,A1:A0 ; a[0]*b[0]
321 MPY32U A17,B16,A23:A22 ; a[0]*b[1]
322 MPY32U A16,B17,A25:A24 ; a[1]*b[0]
323 MPY32U A16,B18,A27:A26 ; a[2]*b[0]
324 STW A0,*ARG0[0] ; rp[0] = low word of a[0]*b[0]
325|| MPY32U A17,B17,A29:A28 ; a[1]*b[1]
326 MPY32U A18,B16,A31:A30 ; a[0]*b[2]
327|| ADDU A22,A1,A1:A0 ; start column 1
328 MV A23,B0
329|| MPY32U A19,B16,A21:A20 ; a[0]*b[3]
330|| ADDU A24,A1:A0,A1:A0
331 ADDU A25,B0,B1:B0
332|| STW A0,*ARG0[1] ; rp[1]
333|| MPY32U A18,B17,A23:A22 ; a[1]*b[2]
334|| ADDU A26,A1,A9:A8 ; start column 2
335 ADDU A27,B1,B9:B8
336|| MPY32U A17,B18,A25:A24 ; a[2]*b[1]
337|| ADDU A28,A9:A8,A9:A8
338 ADDU A29,B9:B8,B9:B8
339|| MPY32U A16,B19,A27:A26 ; a[3]*b[0]
340|| ADDU A30,A9:A8,A9:A8
341 ADDU A31,B9:B8,B9:B8
342|| ADDU B0,A9:A8,A9:A8
343 STW A8,*ARG0[2] ; rp[2]
344|| ADDU A20,A9,A1:A0 ; start column 3
345 ADDU A21,B9,B1:B0
346|| MPY32U A19,B17,A21:A20 ; a[1]*b[3]
347|| ADDU A22,A1:A0,A1:A0
348 ADDU A23,B1:B0,B1:B0
349|| MPY32U A18,B18,A23:A22 ; a[2]*b[2]
350|| ADDU A24,A1:A0,A1:A0
351 ADDU A25,B1:B0,B1:B0
352|| MPY32U A17,B19,A25:A24 ; a[3]*b[1]
353|| ADDU A26,A1:A0,A1:A0
354 ADDU A27,B1:B0,B1:B0
355|| ADDU B8,A1:A0,A1:A0
356 STW A0,*ARG0[3] ; rp[3]
357|| MPY32U A19,B18,A27:A26 ; a[2]*b[3]
358|| ADDU A20,A1,A9:A8 ; start column 4
359 ADDU A21,B1,B9:B8
360|| MPY32U A18,B19,A29:A28 ; a[3]*b[2]
361|| ADDU A22,A9:A8,A9:A8
362 ADDU A23,B9:B8,B9:B8
363|| MPY32U A19,B19,A31:A30 ; a[3]*b[3]
364|| ADDU A24,A9:A8,A9:A8
365 ADDU A25,B9:B8,B9:B8
366|| ADDU B0,A9:A8,A9:A8
367 STW A8,*ARG0[4] ; rp[4]
368|| ADDU A26,A9,A1:A0 ; start column 5
369 ADDU A27,B9,B1:B0
370|| ADDU A28,A1:A0,A1:A0
371 ADDU A29,B1:B0,B1:B0
372|| BNOP RA ; return; the five packets that follow are its delay slots
373|| ADDU B8,A1:A0,A1:A0
374 STW A0,*ARG0[5] ; rp[5]
375|| ADDU A30,A1,A9:A8 ; start column 6
376 ADD A31,B1,B8 ; B8 = high word of a[3]*b[3] + B-side carry
377 ADDU B0,A9:A8,A9:A8 ; issued in its own packet (not || with the ADD above): leaves one packet between the B8 write and its cross-path read below, and gives BNOP RA five following packets. NOTE(review): confirm against the C64x+ cross-path-stall and delay-slot rules before re-pairing
378 ADD B8,A9,A9 ; top word = B8 + carry out of column 6
379|| STW A8,*ARG0[6] ; rp[6]
380 STW A9,*ARG0[7] ; rp[7]
381 .endif
382 .endasmfunc
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette