bn-c64xplus.asm@ 69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago
Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified. bugref:8070: src/libs maintenance
Property svn:eol-style set to `native`
File size: 9.9 KB

Line
1	;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
2	;;
3	;; Licensed under the OpenSSL license (the "License"). You may not use
4	;; this file except in compliance with the License. You can obtain a copy
5	;; in the file LICENSE in the source distribution or at
6	;; https://www.openssl.org/source/license.html
7	;;
8	;;====================================================================
9	;; Written by Andy Polyakov <[email protected]> for the OpenSSL
10	;; project.
11	;;
12	;; Rights for redistribution and usage in source and binary forms are
13	;; granted according to the OpenSSL license. Warranty of any kind is
14	;; disclaimed.
15	;;====================================================================
16	;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
17	;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
18	;; unrolled SPLOOP-free loops - at ~8n and ~5n. Below assembler
19	;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
20	;;====================================================================
21	.text
22
23	.if .ASSEMBLER_VERSION<7000000 ; assemblers older than 7.0.0: define __TI_EABI__ as 0 (presumably it is not predefined there - confirm)
24	.asg 0,__TI_EABI__
25	.endif
26	.if __TI_EABI__ ; EABI build: each underscore-prefixed name used below is substituted by its plain bn_* name
27	.asg bn_mul_add_words,_bn_mul_add_words
28	.asg bn_mul_words,_bn_mul_words
29	.asg bn_sqr_words,_bn_sqr_words
30	.asg bn_add_words,_bn_add_words
31	.asg bn_sub_words,_bn_sub_words
32	.asg bn_div_words,_bn_div_words
33	.asg bn_sqr_comba8,_bn_sqr_comba8
34	.asg bn_mul_comba8,_bn_mul_comba8
35	.asg bn_sqr_comba4,_bn_sqr_comba4
36	.asg bn_mul_comba4,_bn_mul_comba4
37	.endif
38
39	.asg B3,RA ; RA = B3, the target of every "BNOP RA" return below
40	.asg A4,ARG0 ; ARG0..ARG5 = A4,B4,A6,B6,A8,B8, the argument registers as used below
41	.asg B4,ARG1
42	.asg A6,ARG2
43	.asg B6,ARG3
44	.asg A8,ARG4 ; ARG4 and ARG5 are not referenced by the routines visible in this file
45	.asg B8,ARG5
46	.asg A4,RET ; RET = A4, the return value register; note it is the same register as ARG0
47	.asg A15,FP ; FP, DP and SP are defined but not referenced by the routines visible in this file
48	.asg B14,DP
49	.asg B15,SP
50
51	.global _bn_mul_add_words ; rp[i] += ap[i]*w for i in [0,num); returns the final high (carry) word
52	_bn_mul_add_words: ; in: ARG0=rp, ARG1=ap, ARG2=num (word count), ARG3=w; out: RET
53	.asmfunc
54	MV ARG2,B0 ; B0 = num, used both as predicate and as loop count
55	[!B0] BNOP RA ; num==0: return at once ...
56	\|\|[!B0] MVK 0,RET ; ... with RET=0
57	[B0] MVC B0,ILC ; ILC = num iterations for the SPLOOP below
58	[B0] ZERO A19 ; high part of accumulator
59	\|\| [B0] MV ARG0,A2 ; A2 = separate store pointer into rp (ARG0 itself is advanced by the loads)
60	\|\| [B0] MV ARG3,A3 ; A3 = w, the multiplier
61	NOP 3 ; NOTE(review): presumably pads the ILC write before SPLOOP and fills the delay slots of the early return - confirm
62
63	SPLOOP 2 ; 2*n+10
64	;;====================================================================
65	LDW *ARG1++,B7 ; ap[i]
66	NOP 3
67	LDW *ARG0++,A7 ; rp[i]
68	MPY32U B7,A3,A17:A16 ; A17:A16 = ap[i]*w
69	NOP 3 ; [2,0] in epilogue
70	ADDU A16,A7,A21:A20 ; A21:A20 = low product word + rp[i]
71	ADDU A19,A21:A20,A19:A18 ; A19:A18 = that sum + high part carried in from the previous word
72	\|\| MV.S A17,A23 ; A23 = high product word
73	SPKERNEL 2,1 ; leave slot for "return value"
74	\|\| STW A18,*A2++ ; rp[i]
75	\|\| ADD A19,A23,A19 ; high part passed on to the next word
76	;;====================================================================
77	BNOP RA,4
78	MV A19,RET ; return value
79	.endasmfunc
80
81	.global _bn_mul_words ; rp[i] = low word of (ap[i]*w + carry) for i in [0,num); returns the final high (carry) word
82	_bn_mul_words: ; in: ARG0=rp, ARG1=ap, ARG2=num (word count), ARG3=w; out: RET
83	.asmfunc
84	MV ARG2,B0 ; B0 = num, used both as predicate and as loop count
85	[!B0] BNOP RA ; num==0: return at once ...
86	\|\|[!B0] MVK 0,RET ; ... with RET=0
87	[B0] MVC B0,ILC ; ILC = num iterations for the SPLOOP below
88	[B0] ZERO A19 ; high part of accumulator
89	NOP 3 ; NOTE(review): presumably pads the ILC write before SPLOOP and fills the delay slots of the early return - confirm
90
91	SPLOOP 2 ; 2*n+10
92	;;====================================================================
93	LDW *ARG1++,A7 ; ap[i]
94	NOP 4
95	MPY32U A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w
96	NOP 4 ; [2,0] in epilogue
97	ADDU A19,A16,A19:A18 ; A19:A18 = low product word + high part carried in from the previous word
98	\|\| MV.S A17,A21 ; A21 = high product word
99	SPKERNEL 2,1 ; leave slot for "return value"
100	\|\| STW A18,*ARG0++ ; rp[i]
101	\|\| ADD.L A19,A21,A19 ; high part passed on to the next word
102	;;====================================================================
103	BNOP RA,4
104	MV A19,RET ; return value
105	.endasmfunc
106
107	.global _bn_sqr_words ; rp[2*i],rp[2*i+1] = low,high word of ap[i]*ap[i] for i in [0,num)
108	_bn_sqr_words: ; in: ARG0=rp, ARG1=ap, ARG2=num (word count); no result is placed in RET on the num!=0 path
109	.asmfunc
110	MV ARG2,B0 ; B0 = num, used both as predicate and as loop count
111	[!B0] BNOP RA ; num==0: return at once
112	\|\|[!B0] MVK 0,RET
113	[B0] MVC B0,ILC ; ILC = num iterations for the SPLOOP below
114	[B0] MV ARG0,B2 ; B2 -> rp[0], walks the even (low-word) slots
115	\|\| [B0] ADD 4,ARG0,ARG0 ; ARG0 -> rp[1], walks the odd (high-word) slots
116	NOP 3
117
118	SPLOOP 2 ; 2*n+10
119	;;====================================================================
120	LDW *ARG1++,B7 ; ap[i]
121	NOP 4
122	MPY32U B7,B7,B1:B0 ; B1:B0 = ap[i]*ap[i]; B0 may be reused because num already sits in ILC
123	NOP 3 ; [2,0] in epilogue
124	STW B0,*B2++(8) ; rp[2*i]: store through B2, then step 8 bytes to the next even slot
125	MV B1,A1 ; high word copied to the A side for the store through ARG0 (A4)
126	SPKERNEL 2,0 ; fully overlap BNOP RA,5
127	\|\| STW A1,*ARG0++(8) ; rp[2*i+1]: store through ARG0, then step 8 bytes to the next odd slot
128	;;====================================================================
129	BNOP RA,5
130	.endasmfunc
131
132	.global _bn_add_words ; rp[i] = ap[i] + bp[i] + carry for i in [0,num); returns the final carry
133	_bn_add_words: ; in: ARG0=rp, ARG1=ap, ARG2=bp, ARG3=num (word count); out: RET
134	.asmfunc
135	MV ARG3,B0 ; B0 = num, used both as predicate and as loop count
136	[!B0] BNOP RA ; num==0: return at once ...
137	\|\|[!B0] MVK 0,RET ; ... with RET=0
138	[B0] MVC B0,ILC ; ILC = num iterations for the SPLOOP below
139	[B0] ZERO A1 ; carry flag
140	\|\| [B0] MV ARG0,A3 ; A3 = store pointer; rp must leave A4 because RET (also A4) is written inside the loop
141	NOP 3
142
143	SPLOOP 2 ; 2*n+6
144	;;====================================================================
145	LDW *ARG2++,A7 ; bp[i]
146	\|\| LDW *ARG1++,B7 ; ap[i]
147	NOP 4
148	ADDU A7,B7,A9:A8 ; A9:A8 = bp[i] + ap[i]
149	ADDU A1,A9:A8,A1:A0 ; A1:A0 = that sum + incoming carry; A1 becomes the outgoing carry
150	SPKERNEL 0,0 ; fully overlap BNOP RA,5
151	\|\| STW A0,*A3++ ; write result
152	\|\| MV A1,RET ; keep carry flag in RET
153	;;====================================================================
154	BNOP RA,5
155	.endasmfunc
156
157	.global _bn_sub_words ; rp[i] = ap[i] - bp[i] - borrow for i in [0,num); returns the final borrow
158	_bn_sub_words: ; in: ARG0=rp, ARG1=ap, ARG2=bp, ARG3=num (word count); out: RET
159	.asmfunc
160	MV ARG3,B0 ; B0 = num, used both as predicate and as loop count
161	[!B0] BNOP RA ; num==0: return at once ...
162	\|\|[!B0] MVK 0,RET ; ... with RET=0
163	[B0] MVC B0,ILC ; ILC = num iterations for the SPLOOP below
164	[B0] ZERO A2 ; borrow flag
165	\|\| [B0] MV ARG0,A3 ; A3 = store pointer into rp
166	NOP 3
167
168	SPLOOP 2 ; 2*n+6
169	;;====================================================================
170	LDW *ARG2++,A7 ; bp[i]
171	\|\| LDW *ARG1++,B7 ; ap[i]
172	NOP 4
173	SUBU B7,A7,A1:A0 ; A1:A0 = ap[i] - bp[i]
174	[A2] SUB A1:A0,1,A1:A0 ; subtract the incoming borrow, if any
175	SPKERNEL 0,1 ; leave slot for "return borrow flag"
176	\|\| STW A0,*A3++ ; write result
177	\|\| AND 1,A1,A2 ; pass on borrow flag
178	;;====================================================================
179	BNOP RA,4
180	AND 1,A1,RET ; return borrow flag
181	.endasmfunc
182
183	.global _bn_div_words ; bit-by-bit (compare, subtract, shift) division of hi:lo by dv; quotient is built in A4
184	_bn_div_words: ; in: A4 (ARG0) = hi, B4 (ARG1) = lo, A6 (ARG2) = dv; out: A4 (RET) = quotient, or -1 on overflow
185	.asmfunc
186	LMBD 1,A6,A0 ; leading zero bits in dv
187	LMBD 1,A4,A1 ; leading zero bits in hi
188	\|\| MVK 32,B0
189	CMPLTU A1,A0,A2 ; A2 = 1 when hi has fewer leading zero bits than dv (overflow case)
190	\|\| ADD A0,B0,B0 ; B0 = 32 + leading zero bits in dv, the SPLOOP iteration count
191	[ A2] BNOP RA ; overflow: return; the packets up to SPLOOP sit behind this branch and are predicated off on this path
192	\|\|[ A2] MVK -1,A4 ; return overflow
193	\|\|[!A2] MV A4,A3 ; reassign hi
194	[!A2] MV B4,A4 ; reassign lo, will be quotient
195	\|\|[!A2] MVC B0,ILC
196	[!A2] SHL A6,A0,A6 ; normalize dv
197	\|\| MVK 1,A1 ; preset A1=1: on the overflow path the CMPLTU below is skipped, so the [!A1] packet stays off
198
199	[!A2] CMPLTU A3,A6,A1 ; hi<dv?
200	\|\|[!A2] SHL A4,1,A5:A4 ; lo<<1
201	[!A1] SUB A3,A6,A3 ; hi-=dv
202	\|\|[!A1] OR 1,A4,A4 ; set the quotient bit when dv was subtracted
203	[!A2] SHRU A3,31,A1 ; upper bit
204	\|\|[!A2] ADDAH A5,A3,A3 ; hi<<1\|lo>>31
205
206	SPLOOP 3 ; repeats the step above ILC times; a set upper bit (A1) forces the subtraction
207	[!A1] CMPLTU A3,A6,A1 ; hi<dv?
208	\|\|[ A1] ZERO A1 ; upper bit was shifted out of hi: skip the compare and subtract unconditionally
209	\|\| SHL A4,1,A5:A4 ; lo<<1
210	[!A1] SUB A3,A6,A3 ; hi-=dv
211	\|\|[!A1] OR 1,A4,A4 ; quotient
212	SHRU A3,31,A1 ; upper bit
213	\|\| ADDAH A5,A3,A3 ; hi<<1\|lo>>31
214	SPKERNEL
215
216	BNOP RA,5
217	.endasmfunc
218
219	;;====================================================================
220	;; Not really Comba algorithm, just straightforward NxM... Dedicated
221	;; fully unrolled real Comba implementations are asymptotically 2x
222	;; faster, but naturally larger undertaking. Purpose of this exercise
223	;; was rather to learn to master nested SPLOOPs...
224	;;====================================================================
225	.global _bn_sqr_comba8 ; square entry: copies ARG1 into ARG2 and falls through into the multiply
226	.global _bn_mul_comba8 ; NxM word multiply with N=M=8; in: ARG0=rp, ARG1=ap, ARG2=bp
227	_bn_sqr_comba8:
228	MV ARG1,ARG2 ; bp = ap
229	_bn_mul_comba8:
230	.asmfunc
231	MVK 8,B0 ; N, RILC
232	\|\| MVK 8,A0 ; M, outer loop counter
233	\|\| MV ARG1,A5 ; copy ap
234	\|\| MV ARG0,B4 ; copy rp
235	\|\| ZERO B19 ; high part of accumulator
236	MVC B0,RILC
237	\|\| SUB B0,2,B1 ; N-2, initial ILC
238	\|\| SUB B0,1,B2 ; const B2=N-1
239	\|\| LDW *A5++,B6 ; ap[0]
240	\|\| MV A0,A3 ; const A3=M
241	sploopNxM?: ; for best performance arrange M<=N
242	[A0] SPLOOPD 2 ; 2*n+10
243	\|\| MVC B1,ILC
244	\|\| ADDAW B4,B0,B5 ; B5 = rp + N words, the read pointer for rp[i]
245	\|\| ZERO B7 ; rp[i] reads as 0 while A1 is 0, since the load below is then predicated off
246	\|\| LDW *A5++,A9 ; pre-fetch ap[1]
247	\|\| ZERO A1 ; A1 = 0 until the outer code below sets it to 1
248	\|\| SUB A0,1,A0
249	;;====================================================================
250	;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
251	;; This is because of Advisory 15 from TI publication SPRZ247I.
252	LDW *ARG2++,A7 ; bp[i]
253	NOP 3
254	[A1] LDW *B5++,B7 ; rp[i]
255	MPY32U A7,B6,B17:B16 ; B17:B16 = bp[i] * current ap word (B6)
256	NOP 3
257	ADDU B16,B7,B21:B20 ; B21:B20 = low product word + rp[i]
258	ADDU B19,B21:B20,B19:B18 ; B19:B18 = that sum + high part carried in
259	\|\| MV.S B17,B23 ; B23 = high product word
260	SPKERNEL
261	\|\| STW B18,*B4++ ; rp[i]
262	\|\| ADD.S B19,B23,B19 ; high part passed on to the next word
263	;;====================================================================
264	outer?: ; m*2*(n+1)+10
265	SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
266	SPMASKR
267	\|\| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
268	MVD A9,B6 ; move through .M unit(*)
269	[A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
270	SUBAW B5,B2,B5 ; rewind rp to rp[1]
271	MVK 1,A1 ; from now on the inner loop loads rp[i]
272	[A0] BNOP.S1 outer?,4
273	\|\| [A0] SUB.L A0,1,A0
274	STW B19,*B4--[B2] ; rewind rp to rp[1]
275	\|\| ZERO.S B19 ; high part of accumulator
276	;; end of outer?
277	BNOP RA,5 ; return
278	.endasmfunc
279	;; (*) It should be noted that B6 is used as input to MPY32U in
280	;; chronologically next cycle in preceding SPLOOP iteration.
281	;; Normally such arrangement would require DINT, but at this
282	;; point SPLOOP is draining and interrupts are disabled
283	;; implicitly.
284
285	.global _bn_sqr_comba4 ; square entry: copies ARG1 into ARG2 and falls through into the multiply
286	.global _bn_mul_comba4 ; 4x4 word multiply; in: ARG0=rp (words 0..7 are stored), ARG1=a, ARG2=b
287	_bn_sqr_comba4:
288	MV ARG1,ARG2 ; b = a
289	_bn_mul_comba4:
290	.asmfunc
291	.if 0 ; disabled variant: jump into the generic sploopNxM? loop with N=M=4
292	BNOP sploopNxM?,3
293	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
294	;; because of low-counter effect, when prologue phase finishes
295	;; before SPKERNEL instruction is reached. As result it's 25%
296	;; slower than expected...
297	MVK 4,B0 ; N, RILC
298	\|\| MVK 4,A0 ; M, outer loop counter
299	\|\| MV ARG1,A5 ; copy ap
300	\|\| MV ARG0,B4 ; copy rp
301	\|\| ZERO B19 ; high part of accumulator
302	MVC B0,RILC
303	\|\| SUB B0,2,B1 ; first ILC
304	\|\| SUB B0,1,B2 ; const B2=N-1
305	\|\| LDW *A5++,B6 ; ap[0]
306	\|\| MV A0,A3 ; const A3=M
307	.else
308	;; This alternative is an exercise in fully unrolled Comba
309	;; algorithm implementation that operates at n*(n+1)+12, or
310	;; as little as 32 cycles...
311	LDW *ARG1[0],B16 ; a[0]
312	\|\| LDW *ARG2[0],A16 ; b[0]
313	LDW *ARG1[1],B17 ; a[1]
314	\|\| LDW *ARG2[1],A17 ; b[1]
315	LDW *ARG1[2],B18 ; a[2]
316	\|\| LDW *ARG2[2],A18 ; b[2]
317	LDW *ARG1[3],B19 ; a[3]
318	\|\| LDW *ARG2[3],A19 ; b[3]
319	NOP
320	MPY32U A16,B16,A1:A0 ; a[0]*b[0]
321	MPY32U A17,B16,A23:A22 ; a[0]*b[1]
322	MPY32U A16,B17,A25:A24 ; a[1]*b[0]
323	MPY32U A16,B18,A27:A26 ; a[2]*b[0]
324	STW A0,*ARG0[0] ; rp[0]
325	\|\| MPY32U A17,B17,A29:A28 ; a[1]*b[1]
326	MPY32U A18,B16,A31:A30 ; a[0]*b[2]
327	\|\| ADDU A22,A1,A1:A0
328	MV A23,B0
329	\|\| MPY32U A19,B16,A21:A20 ; a[3]*b[0]
330	\|\| ADDU A24,A1:A0,A1:A0
331	ADDU A25,B0,B1:B0
332	\|\| STW A0,*ARG0[1] ; rp[1]
333	\|\| MPY32U A18,B17,A23:A22 ; a[2]*b[1]
334	\|\| ADDU A26,A1,A9:A8
335	ADDU A27,B1,B9:B8
336	\|\| MPY32U A17,B18,A25:A24 ; a[1]*b[2]
337	\|\| ADDU A28,A9:A8,A9:A8
338	ADDU A29,B9:B8,B9:B8
339	\|\| MPY32U A16,B19,A27:A26 ; a[0]*b[3]
340	\|\| ADDU A30,A9:A8,A9:A8
341	ADDU A31,B9:B8,B9:B8
342	\|\| ADDU B0,A9:A8,A9:A8
343	STW A8,*ARG0[2] ; rp[2]
344	\|\| ADDU A20,A9,A1:A0
345	ADDU A21,B9,B1:B0
346	\|\| MPY32U A19,B17,A21:A20 ; a[3]*b[1]
347	\|\| ADDU A22,A1:A0,A1:A0
348	ADDU A23,B1:B0,B1:B0
349	\|\| MPY32U A18,B18,A23:A22 ; a[2]*b[2]
350	\|\| ADDU A24,A1:A0,A1:A0
351	ADDU A25,B1:B0,B1:B0
352	\|\| MPY32U A17,B19,A25:A24 ; a[1]*b[3]
353	\|\| ADDU A26,A1:A0,A1:A0
354	ADDU A27,B1:B0,B1:B0
355	\|\| ADDU B8,A1:A0,A1:A0
356	STW A0,*ARG0[3] ; rp[3]
357	\|\| MPY32U A19,B18,A27:A26 ; a[3]*b[2]
358	\|\| ADDU A20,A1,A9:A8
359	ADDU A21,B1,B9:B8
360	\|\| MPY32U A18,B19,A29:A28 ; a[2]*b[3]
361	\|\| ADDU A22,A9:A8,A9:A8
362	ADDU A23,B9:B8,B9:B8
363	\|\| MPY32U A19,B19,A31:A30 ; a[3]*b[3]
364	\|\| ADDU A24,A9:A8,A9:A8
365	ADDU A25,B9:B8,B9:B8
366	\|\| ADDU B0,A9:A8,A9:A8
367	STW A8,*ARG0[4] ; rp[4]
368	\|\| ADDU A26,A9,A1:A0
369	ADDU A27,B9,B1:B0
370	\|\| ADDU A28,A1:A0,A1:A0
371	ADDU A29,B1:B0,B1:B0
372	\|\| BNOP RA ; return branch; NOTE(review): the five execute packets that follow appear to be its delay slots - confirm before changing packet boundaries
373	\|\| ADDU B8,A1:A0,A1:A0
374	STW A0,*ARG0[5] ; rp[5]
375	\|\| ADDU A30,A1,A9:A8
376	ADD A31,B1,B8
377	ADDU B0,A9:A8,A9:A8 ; removed \|\| to avoid cross-path stall below
378	ADD B8,A9,A9 ; reads B8 (written two packets above) from the A side
379	\|\| STW A8,*ARG0[6] ; rp[6]
380	STW A9,*ARG0[7] ; rp[7]
381	.endif
382	.endasmfunc

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/bn/asm/bn-c64xplus.asm@ 69890

Download in other formats: