VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/bn/asm/ppc.pl@69881

Last change on this file since 69881 was 69881, checked in by vboxsync, 7 years ago

Update OpenSSL to 1.1.0g.
bugref:8070: src/libs maintenance

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
File size: 44.6 KB
1#! /usr/bin/env perl
2# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# Implemented as a Perl wrapper as we want to support several different
10# architectures with a single file. We pick up the target based on the
11# file name we are asked to generate.
12#
13# It should be noted though that this perl code is nothing like
14# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
15# as pre-processor to cover for platform differences in name decoration,
16# linker tables, 32-/64-bit instruction sets...
17#
18# As you might know there are several PowerPC ABIs in use. Most notably
19# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
20# are similar enough to implement leaf(!) functions, which would be ABI
21# neutral. And that's what you find here: ABI-neutral leaf functions.
22# In case you wonder what that is...
23#
24# AIX performance
25#
26# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
27#
28# The following is the performance of 32-bit compiler
29# generated code:
30#
31# OpenSSL 0.9.6c 21 dec 2001
32# built on: Tue Jun 11 11:06:51 EDT 2002
33# options:bn(64,32) ...
34#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
35# sign verify sign/s verify/s
36#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
37#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
38#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
39#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
40#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
41#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
42#
43# Same benchmark with this assembler code:
44#
45#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
46#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
47#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
48#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
49#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
50#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
51#
52# Number of operations increases by almost 75%
53#
54# Here are performance numbers for 64-bit compiler
55# generated code:
56#
57# OpenSSL 0.9.6g [engine] 9 Aug 2002
58# built on: Fri Apr 18 16:59:20 EDT 2003
59# options:bn(64,64) ...
60# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
61# sign verify sign/s verify/s
62#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
63#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
64#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
65#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
66#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
67#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
68#
69# Same benchmark with this assembler code:
70#
71#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
72#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
73#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
74#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
75#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
76#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
77#
78# Again, performance increases by about 75%
79#
80# Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
81# OpenSSL 0.9.7c 30 Sep 2003
82#
83# Original code.
84#
85#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
86#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
87#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
88#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
89#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
90#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
91#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
92#
93# Same benchmark with this assembler code:
94#
95#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
96#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
97#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
98#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
99#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
100#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
101#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
102#
103# Performance increase of ~60%
104#
105# If you have comments or suggestions to improve the code, send
106# me a note at schari@us.ibm.com
107#
108
109$flavour = shift;
110
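# A typical invocation (the output file name here is only illustrative):
#
#	perl ppc.pl linux32 bn-ppc.s
#	perl ppc.pl linux64 bn-ppc.s
#
# The first argument is the flavour matched below; the second is handed on
# to ppc-xlate.pl, which writes the final assembler file.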
111if ($flavour =~ /32/) {
112 $BITS= 32;
113 $BNSZ= $BITS/8;
114 $ISA= "\"ppc\"";
115
116 $LD= "lwz"; # load
117 $LDU= "lwzu"; # load and update
118 $ST= "stw"; # store
119 $STU= "stwu"; # store and update
120 $UMULL= "mullw"; # unsigned multiply low
121 $UMULH= "mulhwu"; # unsigned multiply high
122 $UDIV= "divwu"; # unsigned divide
123 $UCMPI= "cmplwi"; # unsigned compare with immediate
124 $UCMP= "cmplw"; # unsigned compare
125 $CNTLZ= "cntlzw"; # count leading zeros
126 $SHL= "slw"; # shift left
127 $SHR= "srw"; # unsigned shift right
128 $SHRI= "srwi"; # unsigned shift right by immediate
129 $SHLI= "slwi"; # shift left by immediate
130 $CLRU= "clrlwi"; # clear upper bits
131 $INSR= "insrwi"; # insert right
132 $ROTL= "rotlwi"; # rotate left by immediate
133 $TR= "tw"; # conditional trap
134} elsif ($flavour =~ /64/) {
135 $BITS= 64;
136 $BNSZ= $BITS/8;
137 $ISA= "\"ppc64\"";
138
139 # same as above, but 64-bit mnemonics...
140 $LD= "ld"; # load
141 $LDU= "ldu"; # load and update
142 $ST= "std"; # store
143 $STU= "stdu"; # store and update
144 $UMULL= "mulld"; # unsigned multiply low
145 $UMULH= "mulhdu"; # unsigned multiply high
146 $UDIV= "divdu"; # unsigned divide
147 $UCMPI= "cmpldi"; # unsigned compare with immediate
148 $UCMP= "cmpld"; # unsigned compare
149 $CNTLZ= "cntlzd"; # count leading zeros
150 $SHL= "sld"; # shift left
151 $SHR= "srd"; # unsigned shift right
152 $SHRI= "srdi"; # unsigned shift right by immediate
153 $SHLI= "sldi"; # shift left by immediate
154 $CLRU= "clrldi"; # clear upper bits
155 $INSR= "insrdi"; # insert right
156 $ROTL= "rotldi"; # rotate left by immediate
157 $TR= "td"; # conditional trap
158} else { die "nonsense $flavour"; }
159
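# As a concrete illustration (operand values only for show): with a 32-bit
# flavour a template line such as
#	$LD	r5,`0*$BNSZ`(r4)
# is emitted as "lwz r5,0(r4)", while a 64-bit flavour yields "ld r5,0(r4)".
# The backticked arithmetic is eval'ed just before the code is printed (see
# the substitution at the end of this file).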
160$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
161( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
162( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
163die "can't locate ppc-xlate.pl";
164
165open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
166
167$data=<<EOF;
168#--------------------------------------------------------------------
169#
170#
171#
172#
173# File: ppc32.s
174#
175# Created by: Suresh Chari
176# IBM Thomas J. Watson Research Library
177# Hawthorne, NY
178#
179#
180# Description: Optimized assembly routines for OpenSSL crypto
181# on the 32-bit PowerPC platform.
182#
183#
184# Version History
185#
186# 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
187# cleaned up code. Also made a single version which can
188# be used for both the AIX and Linux compilers. See NOTE
189# below.
190# 12/05/03 Suresh Chari
191# (with lots of help from) Andy Polyakov
192##
193# 1. Initial version 10/20/02 Suresh Chari
194#
195#
196# The following file works for the xlc,cc
197# and gcc compilers.
198#
199# NOTE: To get the file to link correctly with the gcc compiler
200# you have to change the names of the routines and remove
201# the first .(dot) character. This should automatically
202# be done in the build process.
203#
204# Hand optimized assembly code for the following routines
205#
206# bn_sqr_comba4
207# bn_sqr_comba8
208# bn_mul_comba4
209# bn_mul_comba8
210# bn_sub_words
211# bn_add_words
212# bn_div_words
213# bn_sqr_words
214# bn_mul_words
215# bn_mul_add_words
216#
217# NOTE: It is possible to optimize this code more for
218# specific PowerPC or Power architectures. On the Northstar
219# architecture the optimizations in this file do
220# NOT provide much improvement.
221#
222# If you have comments or suggestions to improve code send
223# me a note at schari\@us.ibm.com
224#
225#--------------------------------------------------------------------------
226#
227# Defines to be used in the assembly code.
228#
229#.set r0,0 # we use it as storage for value of 0
230#.set SP,1 # preserved
231#.set RTOC,2 # preserved
232#.set r3,3 # 1st argument/return value
233#.set r4,4 # 2nd argument/volatile register
234#.set r5,5 # 3rd argument/volatile register
235#.set r6,6 # ...
236#.set r7,7
237#.set r8,8
238#.set r9,9
239#.set r10,10
240#.set r11,11
241#.set r12,12
242#.set r13,13 # not used, nor any other "below" it...
243
244# Declare function names to be global
245# NOTE: For gcc these names MUST be changed to remove
246# the first . i.e. for example change ".bn_sqr_comba4"
247# to "bn_sqr_comba4". This should be automatically done
248# in the build.
249
250 .globl .bn_sqr_comba4
251 .globl .bn_sqr_comba8
252 .globl .bn_mul_comba4
253 .globl .bn_mul_comba8
254 .globl .bn_sub_words
255 .globl .bn_add_words
256 .globl .bn_div_words
257 .globl .bn_sqr_words
258 .globl .bn_mul_words
259 .globl .bn_mul_add_words
260
261# .text section
262
263 .machine "any"
264
265#
266# NOTE: The following label name should be changed to
267# "bn_sqr_comba4" i.e. remove the first dot
268# for the gcc compiler. This should be automatically
269# done in the build
270#
271
272.align 4
273.bn_sqr_comba4:
274#
275# Optimized version of bn_sqr_comba4.
276#
277# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
278# r3 contains r
279# r4 contains a
280#
281# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
282#
283# r5,r6 are the two BN_ULONGs being multiplied.
284# r7,r8 are the results of the 32x32 giving 64 bit multiply.
285# r9,r10, r11 are the equivalents of c1,c2, c3.
286# Here's the assembly
287#
288#
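# For orientation, the comba macros referenced in the comments below follow
# the usual OpenSSL convention (crypto/bn), roughly:
#	sqr_add_c(a,i,c1,c2,c3)	   adds the double-word product a[i]*a[i]
#	                           into the three-word accumulator c1 (low),
#	                           c2, c3 (high)
#	sqr_add_c2(a,i,j,c1,c2,c3) adds 2*a[i]*a[j] the same way
# After each column the low word is stored to r[] and c1,c2,c3 rotate roles.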
289 xor r0,r0,r0 # set r0 = 0. Used in the addze
290 # instructions below
291
292 #sqr_add_c(a,0,c1,c2,c3)
293 $LD r5,`0*$BNSZ`(r4)
294 $UMULL r9,r5,r5
295 $UMULH r10,r5,r5 #in first iteration. No need
296 #to add since c1=c2=c3=0.
297 # Note c3(r11) is NOT set to 0
298 # but will be.
299
300 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
301 # sqr_add_c2(a,1,0,c2,c3,c1);
302 $LD r6,`1*$BNSZ`(r4)
303 $UMULL r7,r5,r6
304 $UMULH r8,r5,r6
305
306 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
307 adde r8,r8,r8
308 addze r9,r0 # catch carry if any.
309 # r9= r0(=0) and carry
310
311 addc r10,r7,r10 # now add to temp result.
312 addze r11,r8 # r8 added to r11 which is 0
313 addze r9,r9
314
315 $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
316 #sqr_add_c(a,1,c3,c1,c2)
317 $UMULL r7,r6,r6
318 $UMULH r8,r6,r6
319 addc r11,r7,r11
320 adde r9,r8,r9
321 addze r10,r0
322 #sqr_add_c2(a,2,0,c3,c1,c2)
323 $LD r6,`2*$BNSZ`(r4)
324 $UMULL r7,r5,r6
325 $UMULH r8,r5,r6
326
327 addc r7,r7,r7
328 adde r8,r8,r8
329 addze r10,r10
330
331 addc r11,r7,r11
332 adde r9,r8,r9
333 addze r10,r10
334 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
335 #sqr_add_c2(a,3,0,c1,c2,c3);
336 $LD r6,`3*$BNSZ`(r4)
337 $UMULL r7,r5,r6
338 $UMULH r8,r5,r6
339 addc r7,r7,r7
340 adde r8,r8,r8
341 addze r11,r0
342
343 addc r9,r7,r9
344 adde r10,r8,r10
345 addze r11,r11
346 #sqr_add_c2(a,2,1,c1,c2,c3);
347 $LD r5,`1*$BNSZ`(r4)
348 $LD r6,`2*$BNSZ`(r4)
349 $UMULL r7,r5,r6
350 $UMULH r8,r5,r6
351
352 addc r7,r7,r7
353 adde r8,r8,r8
354 addze r11,r11
355 addc r9,r7,r9
356 adde r10,r8,r10
357 addze r11,r11
358 $ST r9,`3*$BNSZ`(r3) #r[3]=c1
359 #sqr_add_c(a,2,c2,c3,c1);
360 $UMULL r7,r6,r6
361 $UMULH r8,r6,r6
362 addc r10,r7,r10
363 adde r11,r8,r11
364 addze r9,r0
365 #sqr_add_c2(a,3,1,c2,c3,c1);
366 $LD r6,`3*$BNSZ`(r4)
367 $UMULL r7,r5,r6
368 $UMULH r8,r5,r6
369 addc r7,r7,r7
370 adde r8,r8,r8
371 addze r9,r9
372
373 addc r10,r7,r10
374 adde r11,r8,r11
375 addze r9,r9
376 $ST r10,`4*$BNSZ`(r3) #r[4]=c2
377 #sqr_add_c2(a,3,2,c3,c1,c2);
378 $LD r5,`2*$BNSZ`(r4)
379 $UMULL r7,r5,r6
380 $UMULH r8,r5,r6
381 addc r7,r7,r7
382 adde r8,r8,r8
383 addze r10,r0
384
385 addc r11,r7,r11
386 adde r9,r8,r9
387 addze r10,r10
388 $ST r11,`5*$BNSZ`(r3) #r[5] = c3
389 #sqr_add_c(a,3,c1,c2,c3);
390 $UMULL r7,r6,r6
391 $UMULH r8,r6,r6
392 addc r9,r7,r9
393 adde r10,r8,r10
394
395 $ST r9,`6*$BNSZ`(r3) #r[6]=c1
396 $ST r10,`7*$BNSZ`(r3) #r[7]=c2
397 blr
398 .long 0
399 .byte 0,12,0x14,0,0,0,2,0
400 .long 0
401.size .bn_sqr_comba4,.-.bn_sqr_comba4
402
403#
404# NOTE: The following label name should be changed to
405# "bn_sqr_comba8" i.e. remove the first dot
406# for the gcc compiler. This should be automatically
407# done in the build
408#
409
410.align 4
411.bn_sqr_comba8:
412#
413# This is an optimized version of the bn_sqr_comba8 routine.
414# It makes heavy use of the adde instruction.
415#
416#
417# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
418# r3 contains r
419# r4 contains a
420#
421# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
422#
423# r5,r6 are the two BN_ULONGs being multiplied.
424# r7,r8 are the results of the 32x32 giving 64 bit multiply.
425# r9,r10, r11 are the equivalents of c1,c2, c3.
426#
427# Loading all 8 longs of a into registers (a possible optimization)
428# doesn't provide any speedup.
429#
430
431 xor r0,r0,r0 #set r0 = 0.Used in addze
432 #instructions below.
433
434 #sqr_add_c(a,0,c1,c2,c3);
435 $LD r5,`0*$BNSZ`(r4)
436 $UMULL r9,r5,r5 #1st iteration: no carries.
437 $UMULH r10,r5,r5
438 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
439 #sqr_add_c2(a,1,0,c2,c3,c1);
440 $LD r6,`1*$BNSZ`(r4)
441 $UMULL r7,r5,r6
442 $UMULH r8,r5,r6
443
444 addc r10,r7,r10 #add the two register number
445 adde r11,r8,r0 # (r8,r7) to the three register
446 addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
447
448 addc r10,r7,r10 #add the two register number
449 adde r11,r8,r11 # (r8,r7) to the three register
450 addze r9,r9 # number (r9,r11,r10).
451
452 $ST r10,`1*$BNSZ`(r3) # r[1]=c2
453
454 #sqr_add_c(a,1,c3,c1,c2);
455 $UMULL r7,r6,r6
456 $UMULH r8,r6,r6
457 addc r11,r7,r11
458 adde r9,r8,r9
459 addze r10,r0
460 #sqr_add_c2(a,2,0,c3,c1,c2);
461 $LD r6,`2*$BNSZ`(r4)
462 $UMULL r7,r5,r6
463 $UMULH r8,r5,r6
464
465 addc r11,r7,r11
466 adde r9,r8,r9
467 addze r10,r10
468
469 addc r11,r7,r11
470 adde r9,r8,r9
471 addze r10,r10
472
473 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
474 #sqr_add_c2(a,3,0,c1,c2,c3);
475 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
476 $UMULL r7,r5,r6
477 $UMULH r8,r5,r6
478
479 addc r9,r7,r9
480 adde r10,r8,r10
481 addze r11,r0
482
483 addc r9,r7,r9
484 adde r10,r8,r10
485 addze r11,r11
486 #sqr_add_c2(a,2,1,c1,c2,c3);
487 $LD r5,`1*$BNSZ`(r4)
488 $LD r6,`2*$BNSZ`(r4)
489 $UMULL r7,r5,r6
490 $UMULH r8,r5,r6
491
492 addc r9,r7,r9
493 adde r10,r8,r10
494 addze r11,r11
495
496 addc r9,r7,r9
497 adde r10,r8,r10
498 addze r11,r11
499
500 $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
501 #sqr_add_c(a,2,c2,c3,c1);
502 $UMULL r7,r6,r6
503 $UMULH r8,r6,r6
504
505 addc r10,r7,r10
506 adde r11,r8,r11
507 addze r9,r0
508 #sqr_add_c2(a,3,1,c2,c3,c1);
509 $LD r6,`3*$BNSZ`(r4)
510 $UMULL r7,r5,r6
511 $UMULH r8,r5,r6
512
513 addc r10,r7,r10
514 adde r11,r8,r11
515 addze r9,r9
516
517 addc r10,r7,r10
518 adde r11,r8,r11
519 addze r9,r9
520 #sqr_add_c2(a,4,0,c2,c3,c1);
521 $LD r5,`0*$BNSZ`(r4)
522 $LD r6,`4*$BNSZ`(r4)
523 $UMULL r7,r5,r6
524 $UMULH r8,r5,r6
525
526 addc r10,r7,r10
527 adde r11,r8,r11
528 addze r9,r9
529
530 addc r10,r7,r10
531 adde r11,r8,r11
532 addze r9,r9
533 $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
534 #sqr_add_c2(a,5,0,c3,c1,c2);
535 $LD r6,`5*$BNSZ`(r4)
536 $UMULL r7,r5,r6
537 $UMULH r8,r5,r6
538
539 addc r11,r7,r11
540 adde r9,r8,r9
541 addze r10,r0
542
543 addc r11,r7,r11
544 adde r9,r8,r9
545 addze r10,r10
546 #sqr_add_c2(a,4,1,c3,c1,c2);
547 $LD r5,`1*$BNSZ`(r4)
548 $LD r6,`4*$BNSZ`(r4)
549 $UMULL r7,r5,r6
550 $UMULH r8,r5,r6
551
552 addc r11,r7,r11
553 adde r9,r8,r9
554 addze r10,r10
555
556 addc r11,r7,r11
557 adde r9,r8,r9
558 addze r10,r10
559 #sqr_add_c2(a,3,2,c3,c1,c2);
560 $LD r5,`2*$BNSZ`(r4)
561 $LD r6,`3*$BNSZ`(r4)
562 $UMULL r7,r5,r6
563 $UMULH r8,r5,r6
564
565 addc r11,r7,r11
566 adde r9,r8,r9
567 addze r10,r10
568
569 addc r11,r7,r11
570 adde r9,r8,r9
571 addze r10,r10
572 $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
573 #sqr_add_c(a,3,c1,c2,c3);
574 $UMULL r7,r6,r6
575 $UMULH r8,r6,r6
576 addc r9,r7,r9
577 adde r10,r8,r10
578 addze r11,r0
579 #sqr_add_c2(a,4,2,c1,c2,c3);
580 $LD r6,`4*$BNSZ`(r4)
581 $UMULL r7,r5,r6
582 $UMULH r8,r5,r6
583
584 addc r9,r7,r9
585 adde r10,r8,r10
586 addze r11,r11
587
588 addc r9,r7,r9
589 adde r10,r8,r10
590 addze r11,r11
591 #sqr_add_c2(a,5,1,c1,c2,c3);
592 $LD r5,`1*$BNSZ`(r4)
593 $LD r6,`5*$BNSZ`(r4)
594 $UMULL r7,r5,r6
595 $UMULH r8,r5,r6
596
597 addc r9,r7,r9
598 adde r10,r8,r10
599 addze r11,r11
600
601 addc r9,r7,r9
602 adde r10,r8,r10
603 addze r11,r11
604 #sqr_add_c2(a,6,0,c1,c2,c3);
605 $LD r5,`0*$BNSZ`(r4)
606 $LD r6,`6*$BNSZ`(r4)
607 $UMULL r7,r5,r6
608 $UMULH r8,r5,r6
609 addc r9,r7,r9
610 adde r10,r8,r10
611 addze r11,r11
612 addc r9,r7,r9
613 adde r10,r8,r10
614 addze r11,r11
615 $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
616 #sqr_add_c2(a,7,0,c2,c3,c1);
617 $LD r6,`7*$BNSZ`(r4)
618 $UMULL r7,r5,r6
619 $UMULH r8,r5,r6
620
621 addc r10,r7,r10
622 adde r11,r8,r11
623 addze r9,r0
624 addc r10,r7,r10
625 adde r11,r8,r11
626 addze r9,r9
627 #sqr_add_c2(a,6,1,c2,c3,c1);
628 $LD r5,`1*$BNSZ`(r4)
629 $LD r6,`6*$BNSZ`(r4)
630 $UMULL r7,r5,r6
631 $UMULH r8,r5,r6
632
633 addc r10,r7,r10
634 adde r11,r8,r11
635 addze r9,r9
636 addc r10,r7,r10
637 adde r11,r8,r11
638 addze r9,r9
639 #sqr_add_c2(a,5,2,c2,c3,c1);
640 $LD r5,`2*$BNSZ`(r4)
641 $LD r6,`5*$BNSZ`(r4)
642 $UMULL r7,r5,r6
643 $UMULH r8,r5,r6
644 addc r10,r7,r10
645 adde r11,r8,r11
646 addze r9,r9
647 addc r10,r7,r10
648 adde r11,r8,r11
649 addze r9,r9
650 #sqr_add_c2(a,4,3,c2,c3,c1);
651 $LD r5,`3*$BNSZ`(r4)
652 $LD r6,`4*$BNSZ`(r4)
653 $UMULL r7,r5,r6
654 $UMULH r8,r5,r6
655
656 addc r10,r7,r10
657 adde r11,r8,r11
658 addze r9,r9
659 addc r10,r7,r10
660 adde r11,r8,r11
661 addze r9,r9
662 $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
663 #sqr_add_c(a,4,c3,c1,c2);
664 $UMULL r7,r6,r6
665 $UMULH r8,r6,r6
666 addc r11,r7,r11
667 adde r9,r8,r9
668 addze r10,r0
669 #sqr_add_c2(a,5,3,c3,c1,c2);
670 $LD r6,`5*$BNSZ`(r4)
671 $UMULL r7,r5,r6
672 $UMULH r8,r5,r6
673 addc r11,r7,r11
674 adde r9,r8,r9
675 addze r10,r10
676 addc r11,r7,r11
677 adde r9,r8,r9
678 addze r10,r10
679 #sqr_add_c2(a,6,2,c3,c1,c2);
680 $LD r5,`2*$BNSZ`(r4)
681 $LD r6,`6*$BNSZ`(r4)
682 $UMULL r7,r5,r6
683 $UMULH r8,r5,r6
684 addc r11,r7,r11
685 adde r9,r8,r9
686 addze r10,r10
687
688 addc r11,r7,r11
689 adde r9,r8,r9
690 addze r10,r10
691 #sqr_add_c2(a,7,1,c3,c1,c2);
692 $LD r5,`1*$BNSZ`(r4)
693 $LD r6,`7*$BNSZ`(r4)
694 $UMULL r7,r5,r6
695 $UMULH r8,r5,r6
696 addc r11,r7,r11
697 adde r9,r8,r9
698 addze r10,r10
699 addc r11,r7,r11
700 adde r9,r8,r9
701 addze r10,r10
702 $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
703 #sqr_add_c2(a,7,2,c1,c2,c3);
704 $LD r5,`2*$BNSZ`(r4)
705 $UMULL r7,r5,r6
706 $UMULH r8,r5,r6
707
708 addc r9,r7,r9
709 adde r10,r8,r10
710 addze r11,r0
711 addc r9,r7,r9
712 adde r10,r8,r10
713 addze r11,r11
714 #sqr_add_c2(a,6,3,c1,c2,c3);
715 $LD r5,`3*$BNSZ`(r4)
716 $LD r6,`6*$BNSZ`(r4)
717 $UMULL r7,r5,r6
718 $UMULH r8,r5,r6
719 addc r9,r7,r9
720 adde r10,r8,r10
721 addze r11,r11
722 addc r9,r7,r9
723 adde r10,r8,r10
724 addze r11,r11
725 #sqr_add_c2(a,5,4,c1,c2,c3);
726 $LD r5,`4*$BNSZ`(r4)
727 $LD r6,`5*$BNSZ`(r4)
728 $UMULL r7,r5,r6
729 $UMULH r8,r5,r6
730 addc r9,r7,r9
731 adde r10,r8,r10
732 addze r11,r11
733 addc r9,r7,r9
734 adde r10,r8,r10
735 addze r11,r11
736 $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
737 #sqr_add_c(a,5,c2,c3,c1);
738 $UMULL r7,r6,r6
739 $UMULH r8,r6,r6
740 addc r10,r7,r10
741 adde r11,r8,r11
742 addze r9,r0
743 #sqr_add_c2(a,6,4,c2,c3,c1);
744 $LD r6,`6*$BNSZ`(r4)
745 $UMULL r7,r5,r6
746 $UMULH r8,r5,r6
747 addc r10,r7,r10
748 adde r11,r8,r11
749 addze r9,r9
750 addc r10,r7,r10
751 adde r11,r8,r11
752 addze r9,r9
753 #sqr_add_c2(a,7,3,c2,c3,c1);
754 $LD r5,`3*$BNSZ`(r4)
755 $LD r6,`7*$BNSZ`(r4)
756 $UMULL r7,r5,r6
757 $UMULH r8,r5,r6
758 addc r10,r7,r10
759 adde r11,r8,r11
760 addze r9,r9
761 addc r10,r7,r10
762 adde r11,r8,r11
763 addze r9,r9
764 $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
765 #sqr_add_c2(a,7,4,c3,c1,c2);
766 $LD r5,`4*$BNSZ`(r4)
767 $UMULL r7,r5,r6
768 $UMULH r8,r5,r6
769 addc r11,r7,r11
770 adde r9,r8,r9
771 addze r10,r0
772 addc r11,r7,r11
773 adde r9,r8,r9
774 addze r10,r10
775 #sqr_add_c2(a,6,5,c3,c1,c2);
776 $LD r5,`5*$BNSZ`(r4)
777 $LD r6,`6*$BNSZ`(r4)
778 $UMULL r7,r5,r6
779 $UMULH r8,r5,r6
780 addc r11,r7,r11
781 adde r9,r8,r9
782 addze r10,r10
783 addc r11,r7,r11
784 adde r9,r8,r9
785 addze r10,r10
786 $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
787 #sqr_add_c(a,6,c1,c2,c3);
788 $UMULL r7,r6,r6
789 $UMULH r8,r6,r6
790 addc r9,r7,r9
791 adde r10,r8,r10
792 addze r11,r0
793 #sqr_add_c2(a,7,5,c1,c2,c3)
794 $LD r6,`7*$BNSZ`(r4)
795 $UMULL r7,r5,r6
796 $UMULH r8,r5,r6
797 addc r9,r7,r9
798 adde r10,r8,r10
799 addze r11,r11
800 addc r9,r7,r9
801 adde r10,r8,r10
802 addze r11,r11
803 $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
804
805 #sqr_add_c2(a,7,6,c2,c3,c1)
806 $LD r5,`6*$BNSZ`(r4)
807 $UMULL r7,r5,r6
808 $UMULH r8,r5,r6
809 addc r10,r7,r10
810 adde r11,r8,r11
811 addze r9,r0
812 addc r10,r7,r10
813 adde r11,r8,r11
814 addze r9,r9
815 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
816 #sqr_add_c(a,7,c3,c1,c2);
817 $UMULL r7,r6,r6
818 $UMULH r8,r6,r6
819 addc r11,r7,r11
820 adde r9,r8,r9
821 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
822 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
823
824
825 blr
826 .long 0
827 .byte 0,12,0x14,0,0,0,2,0
828 .long 0
829.size .bn_sqr_comba8,.-.bn_sqr_comba8
830
831#
832# NOTE: The following label name should be changed to
833# "bn_mul_comba4" i.e. remove the first dot
834# for the gcc compiler. This should be automatically
835# done in the build
836#
837
838.align 4
839.bn_mul_comba4:
840#
841# This is an optimized version of the bn_mul_comba4 routine.
842#
843# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
844# r3 contains r
845# r4 contains a
846# r5 contains b
847# r6, r7 are the 2 BN_ULONGs being multiplied.
848# r8, r9 are the results of the 32x32 giving 64 bit multiply.
849# r10, r11, r12 are the equivalents of c1, c2, and c3.
850#
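# For orientation, mul_add_c follows the usual OpenSSL comba convention:
#	mul_add_c(a,b,c1,c2,c3)	adds the double-word product a*b into the
#	                        three-word accumulator c1 (low), c2, c3 (high)
# Each column of the 4x4 schoolbook product is accumulated this way and its
# low word is then stored to r[].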
851 xor r0,r0,r0 #r0=0. Used in addze below.
852 #mul_add_c(a[0],b[0],c1,c2,c3);
853 $LD r6,`0*$BNSZ`(r4)
854 $LD r7,`0*$BNSZ`(r5)
855 $UMULL r10,r6,r7
856 $UMULH r11,r6,r7
857 $ST r10,`0*$BNSZ`(r3) #r[0]=c1
858 #mul_add_c(a[0],b[1],c2,c3,c1);
859 $LD r7,`1*$BNSZ`(r5)
860 $UMULL r8,r6,r7
861 $UMULH r9,r6,r7
862 addc r11,r8,r11
863 adde r12,r9,r0
864 addze r10,r0
865 #mul_add_c(a[1],b[0],c2,c3,c1);
866 $LD r6, `1*$BNSZ`(r4)
867 $LD r7, `0*$BNSZ`(r5)
868 $UMULL r8,r6,r7
869 $UMULH r9,r6,r7
870 addc r11,r8,r11
871 adde r12,r9,r12
872 addze r10,r10
873 $ST r11,`1*$BNSZ`(r3) #r[1]=c2
874 #mul_add_c(a[2],b[0],c3,c1,c2);
875 $LD r6,`2*$BNSZ`(r4)
876 $UMULL r8,r6,r7
877 $UMULH r9,r6,r7
878 addc r12,r8,r12
879 adde r10,r9,r10
880 addze r11,r0
881 #mul_add_c(a[1],b[1],c3,c1,c2);
882 $LD r6,`1*$BNSZ`(r4)
883 $LD r7,`1*$BNSZ`(r5)
884 $UMULL r8,r6,r7
885 $UMULH r9,r6,r7
886 addc r12,r8,r12
887 adde r10,r9,r10
888 addze r11,r11
889 #mul_add_c(a[0],b[2],c3,c1,c2);
890 $LD r6,`0*$BNSZ`(r4)
891 $LD r7,`2*$BNSZ`(r5)
892 $UMULL r8,r6,r7
893 $UMULH r9,r6,r7
894 addc r12,r8,r12
895 adde r10,r9,r10
896 addze r11,r11
897 $ST r12,`2*$BNSZ`(r3) #r[2]=c3
898 #mul_add_c(a[0],b[3],c1,c2,c3);
899 $LD r7,`3*$BNSZ`(r5)
900 $UMULL r8,r6,r7
901 $UMULH r9,r6,r7
902 addc r10,r8,r10
903 adde r11,r9,r11
904 addze r12,r0
905 #mul_add_c(a[1],b[2],c1,c2,c3);
906 $LD r6,`1*$BNSZ`(r4)
907 $LD r7,`2*$BNSZ`(r5)
908 $UMULL r8,r6,r7
909 $UMULH r9,r6,r7
910 addc r10,r8,r10
911 adde r11,r9,r11
912 addze r12,r12
913 #mul_add_c(a[2],b[1],c1,c2,c3);
914 $LD r6,`2*$BNSZ`(r4)
915 $LD r7,`1*$BNSZ`(r5)
916 $UMULL r8,r6,r7
917 $UMULH r9,r6,r7
918 addc r10,r8,r10
919 adde r11,r9,r11
920 addze r12,r12
921 #mul_add_c(a[3],b[0],c1,c2,c3);
922 $LD r6,`3*$BNSZ`(r4)
923 $LD r7,`0*$BNSZ`(r5)
924 $UMULL r8,r6,r7
925 $UMULH r9,r6,r7
926 addc r10,r8,r10
927 adde r11,r9,r11
928 addze r12,r12
929 $ST r10,`3*$BNSZ`(r3) #r[3]=c1
930 #mul_add_c(a[3],b[1],c2,c3,c1);
931 $LD r7,`1*$BNSZ`(r5)
932 $UMULL r8,r6,r7
933 $UMULH r9,r6,r7
934 addc r11,r8,r11
935 adde r12,r9,r12
936 addze r10,r0
937 #mul_add_c(a[2],b[2],c2,c3,c1);
938 $LD r6,`2*$BNSZ`(r4)
939 $LD r7,`2*$BNSZ`(r5)
940 $UMULL r8,r6,r7
941 $UMULH r9,r6,r7
942 addc r11,r8,r11
943 adde r12,r9,r12
944 addze r10,r10
945 #mul_add_c(a[1],b[3],c2,c3,c1);
946 $LD r6,`1*$BNSZ`(r4)
947 $LD r7,`3*$BNSZ`(r5)
948 $UMULL r8,r6,r7
949 $UMULH r9,r6,r7
950 addc r11,r8,r11
951 adde r12,r9,r12
952 addze r10,r10
953 $ST r11,`4*$BNSZ`(r3) #r[4]=c2
954 #mul_add_c(a[2],b[3],c3,c1,c2);
955 $LD r6,`2*$BNSZ`(r4)
956 $UMULL r8,r6,r7
957 $UMULH r9,r6,r7
958 addc r12,r8,r12
959 adde r10,r9,r10
960 addze r11,r0
961 #mul_add_c(a[3],b[2],c3,c1,c2);
962 $LD r6,`3*$BNSZ`(r4)
963 $LD r7,`2*$BNSZ`(r5)
964 $UMULL r8,r6,r7
965 $UMULH r9,r6,r7
966 addc r12,r8,r12
967 adde r10,r9,r10
968 addze r11,r11
969 $ST r12,`5*$BNSZ`(r3) #r[5]=c3
970 #mul_add_c(a[3],b[3],c1,c2,c3);
971 $LD r7,`3*$BNSZ`(r5)
972 $UMULL r8,r6,r7
973 $UMULH r9,r6,r7
974 addc r10,r8,r10
975 adde r11,r9,r11
976
977 $ST r10,`6*$BNSZ`(r3) #r[6]=c1
978 $ST r11,`7*$BNSZ`(r3) #r[7]=c2
979 blr
980 .long 0
981 .byte 0,12,0x14,0,0,0,3,0
982 .long 0
983.size .bn_mul_comba4,.-.bn_mul_comba4
984
985#
986# NOTE: The following label name should be changed to
987# "bn_mul_comba8" i.e. remove the first dot
988# for the gcc compiler. This should be automatically
989# done in the build
990#
991
992.align 4
993.bn_mul_comba8:
994#
995# Optimized version of the bn_mul_comba8 routine.
996#
997# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
998# r3 contains r
999# r4 contains a
1000# r5 contains b
1001# r6, r7 are the 2 BN_ULONGs being multiplied.
1002# r8, r9 are the results of the 32x32 giving 64 bit multiply.
1003# r10, r11, r12 are the equivalents of c1, c2, and c3.
1004#
1005 xor r0,r0,r0 #r0=0. Used in addze below.
1006
1007 #mul_add_c(a[0],b[0],c1,c2,c3);
1008 $LD r6,`0*$BNSZ`(r4) #a[0]
1009 $LD r7,`0*$BNSZ`(r5) #b[0]
1010 $UMULL r10,r6,r7
1011 $UMULH r11,r6,r7
1012 $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
1013 #mul_add_c(a[0],b[1],c2,c3,c1);
1014 $LD r7,`1*$BNSZ`(r5)
1015 $UMULL r8,r6,r7
1016 $UMULH r9,r6,r7
1017 addc r11,r11,r8
1018 addze r12,r9 # since we didn't set r12 to zero before.
1019 addze r10,r0
1020 #mul_add_c(a[1],b[0],c2,c3,c1);
1021 $LD r6,`1*$BNSZ`(r4)
1022 $LD r7,`0*$BNSZ`(r5)
1023 $UMULL r8,r6,r7
1024 $UMULH r9,r6,r7
1025 addc r11,r11,r8
1026 adde r12,r12,r9
1027 addze r10,r10
1028 $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
1029 #mul_add_c(a[2],b[0],c3,c1,c2);
1030 $LD r6,`2*$BNSZ`(r4)
1031 $UMULL r8,r6,r7
1032 $UMULH r9,r6,r7
1033 addc r12,r12,r8
1034 adde r10,r10,r9
1035 addze r11,r0
1036 #mul_add_c(a[1],b[1],c3,c1,c2);
1037 $LD r6,`1*$BNSZ`(r4)
1038 $LD r7,`1*$BNSZ`(r5)
1039 $UMULL r8,r6,r7
1040 $UMULH r9,r6,r7
1041 addc r12,r12,r8
1042 adde r10,r10,r9
1043 addze r11,r11
1044 #mul_add_c(a[0],b[2],c3,c1,c2);
1045 $LD r6,`0*$BNSZ`(r4)
1046 $LD r7,`2*$BNSZ`(r5)
1047 $UMULL r8,r6,r7
1048 $UMULH r9,r6,r7
1049 addc r12,r12,r8
1050 adde r10,r10,r9
1051 addze r11,r11
1052 $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
1053 #mul_add_c(a[0],b[3],c1,c2,c3);
1054 $LD r7,`3*$BNSZ`(r5)
1055 $UMULL r8,r6,r7
1056 $UMULH r9,r6,r7
1057 addc r10,r10,r8
1058 adde r11,r11,r9
1059 addze r12,r0
1060 #mul_add_c(a[1],b[2],c1,c2,c3);
1061 $LD r6,`1*$BNSZ`(r4)
1062 $LD r7,`2*$BNSZ`(r5)
1063 $UMULL r8,r6,r7
1064 $UMULH r9,r6,r7
1065 addc r10,r10,r8
1066 adde r11,r11,r9
1067 addze r12,r12
1068
1069 #mul_add_c(a[2],b[1],c1,c2,c3);
1070 $LD r6,`2*$BNSZ`(r4)
1071 $LD r7,`1*$BNSZ`(r5)
1072 $UMULL r8,r6,r7
1073 $UMULH r9,r6,r7
1074 addc r10,r10,r8
1075 adde r11,r11,r9
1076 addze r12,r12
1077 #mul_add_c(a[3],b[0],c1,c2,c3);
1078 $LD r6,`3*$BNSZ`(r4)
1079 $LD r7,`0*$BNSZ`(r5)
1080 $UMULL r8,r6,r7
1081 $UMULH r9,r6,r7
1082 addc r10,r10,r8
1083 adde r11,r11,r9
1084 addze r12,r12
1085 $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
1086 #mul_add_c(a[4],b[0],c2,c3,c1);
1087 $LD r6,`4*$BNSZ`(r4)
1088 $UMULL r8,r6,r7
1089 $UMULH r9,r6,r7
1090 addc r11,r11,r8
1091 adde r12,r12,r9
1092 addze r10,r0
1093 #mul_add_c(a[3],b[1],c2,c3,c1);
1094 $LD r6,`3*$BNSZ`(r4)
1095 $LD r7,`1*$BNSZ`(r5)
1096 $UMULL r8,r6,r7
1097 $UMULH r9,r6,r7
1098 addc r11,r11,r8
1099 adde r12,r12,r9
1100 addze r10,r10
1101 #mul_add_c(a[2],b[2],c2,c3,c1);
1102 $LD r6,`2*$BNSZ`(r4)
1103 $LD r7,`2*$BNSZ`(r5)
1104 $UMULL r8,r6,r7
1105 $UMULH r9,r6,r7
1106 addc r11,r11,r8
1107 adde r12,r12,r9
1108 addze r10,r10
1109 #mul_add_c(a[1],b[3],c2,c3,c1);
1110 $LD r6,`1*$BNSZ`(r4)
1111 $LD r7,`3*$BNSZ`(r5)
1112 $UMULL r8,r6,r7
1113 $UMULH r9,r6,r7
1114 addc r11,r11,r8
1115 adde r12,r12,r9
1116 addze r10,r10
1117 #mul_add_c(a[0],b[4],c2,c3,c1);
1118 $LD r6,`0*$BNSZ`(r4)
1119 $LD r7,`4*$BNSZ`(r5)
1120 $UMULL r8,r6,r7
1121 $UMULH r9,r6,r7
1122 addc r11,r11,r8
1123 adde r12,r12,r9
1124 addze r10,r10
1125 $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
1126 #mul_add_c(a[0],b[5],c3,c1,c2);
1127 $LD r7,`5*$BNSZ`(r5)
1128 $UMULL r8,r6,r7
1129 $UMULH r9,r6,r7
1130 addc r12,r12,r8
1131 adde r10,r10,r9
1132 addze r11,r0
1133 #mul_add_c(a[1],b[4],c3,c1,c2);
1134 $LD r6,`1*$BNSZ`(r4)
1135 $LD r7,`4*$BNSZ`(r5)
1136 $UMULL r8,r6,r7
1137 $UMULH r9,r6,r7
1138 addc r12,r12,r8
1139 adde r10,r10,r9
1140 addze r11,r11
1141 #mul_add_c(a[2],b[3],c3,c1,c2);
1142 $LD r6,`2*$BNSZ`(r4)
1143 $LD r7,`3*$BNSZ`(r5)
1144 $UMULL r8,r6,r7
1145 $UMULH r9,r6,r7
1146 addc r12,r12,r8
1147 adde r10,r10,r9
1148 addze r11,r11
1149 #mul_add_c(a[3],b[2],c3,c1,c2);
1150 $LD r6,`3*$BNSZ`(r4)
1151 $LD r7,`2*$BNSZ`(r5)
1152 $UMULL r8,r6,r7
1153 $UMULH r9,r6,r7
1154 addc r12,r12,r8
1155 adde r10,r10,r9
1156 addze r11,r11
1157 #mul_add_c(a[4],b[1],c3,c1,c2);
1158 $LD r6,`4*$BNSZ`(r4)
1159 $LD r7,`1*$BNSZ`(r5)
1160 $UMULL r8,r6,r7
1161 $UMULH r9,r6,r7
1162 addc r12,r12,r8
1163 adde r10,r10,r9
1164 addze r11,r11
1165 #mul_add_c(a[5],b[0],c3,c1,c2);
1166 $LD r6,`5*$BNSZ`(r4)
1167 $LD r7,`0*$BNSZ`(r5)
1168 $UMULL r8,r6,r7
1169 $UMULH r9,r6,r7
1170 addc r12,r12,r8
1171 adde r10,r10,r9
1172 addze r11,r11
1173 $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
1174 #mul_add_c(a[6],b[0],c1,c2,c3);
1175 $LD r6,`6*$BNSZ`(r4)
1176 $UMULL r8,r6,r7
1177 $UMULH r9,r6,r7
1178 addc r10,r10,r8
1179 adde r11,r11,r9
1180 addze r12,r0
1181 #mul_add_c(a[5],b[1],c1,c2,c3);
1182 $LD r6,`5*$BNSZ`(r4)
1183 $LD r7,`1*$BNSZ`(r5)
1184 $UMULL r8,r6,r7
1185 $UMULH r9,r6,r7
1186 addc r10,r10,r8
1187 adde r11,r11,r9
1188 addze r12,r12
1189 #mul_add_c(a[4],b[2],c1,c2,c3);
1190 $LD r6,`4*$BNSZ`(r4)
1191 $LD r7,`2*$BNSZ`(r5)
1192 $UMULL r8,r6,r7
1193 $UMULH r9,r6,r7
1194 addc r10,r10,r8
1195 adde r11,r11,r9
1196 addze r12,r12
1197 #mul_add_c(a[3],b[3],c1,c2,c3);
1198 $LD r6,`3*$BNSZ`(r4)
1199 $LD r7,`3*$BNSZ`(r5)
1200 $UMULL r8,r6,r7
1201 $UMULH r9,r6,r7
1202 addc r10,r10,r8
1203 adde r11,r11,r9
1204 addze r12,r12
1205 #mul_add_c(a[2],b[4],c1,c2,c3);
1206 $LD r6,`2*$BNSZ`(r4)
1207 $LD r7,`4*$BNSZ`(r5)
1208 $UMULL r8,r6,r7
1209 $UMULH r9,r6,r7
1210 addc r10,r10,r8
1211 adde r11,r11,r9
1212 addze r12,r12
1213 #mul_add_c(a[1],b[5],c1,c2,c3);
1214 $LD r6,`1*$BNSZ`(r4)
1215 $LD r7,`5*$BNSZ`(r5)
1216 $UMULL r8,r6,r7
1217 $UMULH r9,r6,r7
1218 addc r10,r10,r8
1219 adde r11,r11,r9
1220 addze r12,r12
1221 #mul_add_c(a[0],b[6],c1,c2,c3);
1222 $LD r6,`0*$BNSZ`(r4)
1223 $LD r7,`6*$BNSZ`(r5)
1224 $UMULL r8,r6,r7
1225 $UMULH r9,r6,r7
1226 addc r10,r10,r8
1227 adde r11,r11,r9
1228 addze r12,r12
1229 $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
1230 #mul_add_c(a[0],b[7],c2,c3,c1);
1231 $LD r7,`7*$BNSZ`(r5)
1232 $UMULL r8,r6,r7
1233 $UMULH r9,r6,r7
1234 addc r11,r11,r8
1235 adde r12,r12,r9
1236 addze r10,r0
1237 #mul_add_c(a[1],b[6],c2,c3,c1);
1238 $LD r6,`1*$BNSZ`(r4)
1239 $LD r7,`6*$BNSZ`(r5)
1240 $UMULL r8,r6,r7
1241 $UMULH r9,r6,r7
1242 addc r11,r11,r8
1243 adde r12,r12,r9
1244 addze r10,r10
1245 #mul_add_c(a[2],b[5],c2,c3,c1);
1246 $LD r6,`2*$BNSZ`(r4)
1247 $LD r7,`5*$BNSZ`(r5)
1248 $UMULL r8,r6,r7
1249 $UMULH r9,r6,r7
1250 addc r11,r11,r8
1251 adde r12,r12,r9
1252 addze r10,r10
1253 #mul_add_c(a[3],b[4],c2,c3,c1);
1254 $LD r6,`3*$BNSZ`(r4)
1255 $LD r7,`4*$BNSZ`(r5)
1256 $UMULL r8,r6,r7
1257 $UMULH r9,r6,r7
1258 addc r11,r11,r8
1259 adde r12,r12,r9
1260 addze r10,r10
1261 #mul_add_c(a[4],b[3],c2,c3,c1);
1262 $LD r6,`4*$BNSZ`(r4)
1263 $LD r7,`3*$BNSZ`(r5)
1264 $UMULL r8,r6,r7
1265 $UMULH r9,r6,r7
1266 addc r11,r11,r8
1267 adde r12,r12,r9
1268 addze r10,r10
1269 #mul_add_c(a[5],b[2],c2,c3,c1);
1270 $LD r6,`5*$BNSZ`(r4)
1271 $LD r7,`2*$BNSZ`(r5)
1272 $UMULL r8,r6,r7
1273 $UMULH r9,r6,r7
1274 addc r11,r11,r8
1275 adde r12,r12,r9
1276 addze r10,r10
1277 #mul_add_c(a[6],b[1],c2,c3,c1);
1278 $LD r6,`6*$BNSZ`(r4)
1279 $LD r7,`1*$BNSZ`(r5)
1280 $UMULL r8,r6,r7
1281 $UMULH r9,r6,r7
1282 addc r11,r11,r8
1283 adde r12,r12,r9
1284 addze r10,r10
1285 #mul_add_c(a[7],b[0],c2,c3,c1);
1286 $LD r6,`7*$BNSZ`(r4)
1287 $LD r7,`0*$BNSZ`(r5)
1288 $UMULL r8,r6,r7
1289 $UMULH r9,r6,r7
1290 addc r11,r11,r8
1291 adde r12,r12,r9
1292 addze r10,r10
1293 $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
1294 #mul_add_c(a[7],b[1],c3,c1,c2);
1295 $LD r7,`1*$BNSZ`(r5)
1296 $UMULL r8,r6,r7
1297 $UMULH r9,r6,r7
1298 addc r12,r12,r8
1299 adde r10,r10,r9
1300 addze r11,r0
1301 #mul_add_c(a[6],b[2],c3,c1,c2);
1302 $LD r6,`6*$BNSZ`(r4)
1303 $LD r7,`2*$BNSZ`(r5)
1304 $UMULL r8,r6,r7
1305 $UMULH r9,r6,r7
1306 addc r12,r12,r8
1307 adde r10,r10,r9
1308 addze r11,r11
1309 #mul_add_c(a[5],b[3],c3,c1,c2);
1310 $LD r6,`5*$BNSZ`(r4)
1311 $LD r7,`3*$BNSZ`(r5)
1312 $UMULL r8,r6,r7
1313 $UMULH r9,r6,r7
1314 addc r12,r12,r8
1315 adde r10,r10,r9
1316 addze r11,r11
1317 #mul_add_c(a[4],b[4],c3,c1,c2);
1318 $LD r6,`4*$BNSZ`(r4)
1319 $LD r7,`4*$BNSZ`(r5)
1320 $UMULL r8,r6,r7
1321 $UMULH r9,r6,r7
1322 addc r12,r12,r8
1323 adde r10,r10,r9
1324 addze r11,r11
1325 #mul_add_c(a[3],b[5],c3,c1,c2);
1326 $LD r6,`3*$BNSZ`(r4)
1327 $LD r7,`5*$BNSZ`(r5)
1328 $UMULL r8,r6,r7
1329 $UMULH r9,r6,r7
1330 addc r12,r12,r8
1331 adde r10,r10,r9
1332 addze r11,r11
1333 #mul_add_c(a[2],b[6],c3,c1,c2);
1334 $LD r6,`2*$BNSZ`(r4)
1335 $LD r7,`6*$BNSZ`(r5)
1336 $UMULL r8,r6,r7
1337 $UMULH r9,r6,r7
1338 addc r12,r12,r8
1339 adde r10,r10,r9
1340 addze r11,r11
1341 #mul_add_c(a[1],b[7],c3,c1,c2);
1342 $LD r6,`1*$BNSZ`(r4)
1343 $LD r7,`7*$BNSZ`(r5)
1344 $UMULL r8,r6,r7
1345 $UMULH r9,r6,r7
1346 addc r12,r12,r8
1347 adde r10,r10,r9
1348 addze r11,r11
1349 $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
1350 #mul_add_c(a[2],b[7],c1,c2,c3);
1351 $LD r6,`2*$BNSZ`(r4)
1352 $UMULL r8,r6,r7
1353 $UMULH r9,r6,r7
1354 addc r10,r10,r8
1355 adde r11,r11,r9
1356 addze r12,r0
1357 #mul_add_c(a[3],b[6],c1,c2,c3);
1358 $LD r6,`3*$BNSZ`(r4)
1359 $LD r7,`6*$BNSZ`(r5)
1360 $UMULL r8,r6,r7
1361 $UMULH r9,r6,r7
1362 addc r10,r10,r8
1363 adde r11,r11,r9
1364 addze r12,r12
1365 #mul_add_c(a[4],b[5],c1,c2,c3);
1366 $LD r6,`4*$BNSZ`(r4)
1367 $LD r7,`5*$BNSZ`(r5)
1368 $UMULL r8,r6,r7
1369 $UMULH r9,r6,r7
1370 addc r10,r10,r8
1371 adde r11,r11,r9
1372 addze r12,r12
1373 #mul_add_c(a[5],b[4],c1,c2,c3);
1374 $LD r6,`5*$BNSZ`(r4)
1375 $LD r7,`4*$BNSZ`(r5)
1376 $UMULL r8,r6,r7
1377 $UMULH r9,r6,r7
1378 addc r10,r10,r8
1379 adde r11,r11,r9
1380 addze r12,r12
1381 #mul_add_c(a[6],b[3],c1,c2,c3);
1382 $LD r6,`6*$BNSZ`(r4)
1383 $LD r7,`3*$BNSZ`(r5)
1384 $UMULL r8,r6,r7
1385 $UMULH r9,r6,r7
1386 addc r10,r10,r8
1387 adde r11,r11,r9
1388 addze r12,r12
1389 #mul_add_c(a[7],b[2],c1,c2,c3);
1390 $LD r6,`7*$BNSZ`(r4)
1391 $LD r7,`2*$BNSZ`(r5)
1392 $UMULL r8,r6,r7
1393 $UMULH r9,r6,r7
1394 addc r10,r10,r8
1395 adde r11,r11,r9
1396 addze r12,r12
1397 $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
1398 #mul_add_c(a[7],b[3],c2,c3,c1);
1399 $LD r7,`3*$BNSZ`(r5)
1400 $UMULL r8,r6,r7
1401 $UMULH r9,r6,r7
1402 addc r11,r11,r8
1403 adde r12,r12,r9
1404 addze r10,r0
1405 #mul_add_c(a[6],b[4],c2,c3,c1);
1406 $LD r6,`6*$BNSZ`(r4)
1407 $LD r7,`4*$BNSZ`(r5)
1408 $UMULL r8,r6,r7
1409 $UMULH r9,r6,r7
1410 addc r11,r11,r8
1411 adde r12,r12,r9
1412 addze r10,r10
1413 #mul_add_c(a[5],b[5],c2,c3,c1);
1414 $LD r6,`5*$BNSZ`(r4)
1415 $LD r7,`5*$BNSZ`(r5)
1416 $UMULL r8,r6,r7
1417 $UMULH r9,r6,r7
1418 addc r11,r11,r8
1419 adde r12,r12,r9
1420 addze r10,r10
1421 #mul_add_c(a[4],b[6],c2,c3,c1);
1422 $LD r6,`4*$BNSZ`(r4)
1423 $LD r7,`6*$BNSZ`(r5)
1424 $UMULL r8,r6,r7
1425 $UMULH r9,r6,r7
1426 addc r11,r11,r8
1427 adde r12,r12,r9
1428 addze r10,r10
1429 #mul_add_c(a[3],b[7],c2,c3,c1);
1430 $LD r6,`3*$BNSZ`(r4)
1431 $LD r7,`7*$BNSZ`(r5)
1432 $UMULL r8,r6,r7
1433 $UMULH r9,r6,r7
1434 addc r11,r11,r8
1435 adde r12,r12,r9
1436 addze r10,r10
1437 $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
1438 #mul_add_c(a[4],b[7],c3,c1,c2);
1439 $LD r6,`4*$BNSZ`(r4)
1440 $UMULL r8,r6,r7
1441 $UMULH r9,r6,r7
1442 addc r12,r12,r8
1443 adde r10,r10,r9
1444 addze r11,r0
1445 #mul_add_c(a[5],b[6],c3,c1,c2);
1446 $LD r6,`5*$BNSZ`(r4)
1447 $LD r7,`6*$BNSZ`(r5)
1448 $UMULL r8,r6,r7
1449 $UMULH r9,r6,r7
1450 addc r12,r12,r8
1451 adde r10,r10,r9
1452 addze r11,r11
1453 #mul_add_c(a[6],b[5],c3,c1,c2);
1454 $LD r6,`6*$BNSZ`(r4)
1455 $LD r7,`5*$BNSZ`(r5)
1456 $UMULL r8,r6,r7
1457 $UMULH r9,r6,r7
1458 addc r12,r12,r8
1459 adde r10,r10,r9
1460 addze r11,r11
1461 #mul_add_c(a[7],b[4],c3,c1,c2);
1462 $LD r6,`7*$BNSZ`(r4)
1463 $LD r7,`4*$BNSZ`(r5)
1464 $UMULL r8,r6,r7
1465 $UMULH r9,r6,r7
1466 addc r12,r12,r8
1467 adde r10,r10,r9
1468 addze r11,r11
1469 $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
1470 #mul_add_c(a[7],b[5],c1,c2,c3);
1471 $LD r7,`5*$BNSZ`(r5)
1472 $UMULL r8,r6,r7
1473 $UMULH r9,r6,r7
1474 addc r10,r10,r8
1475 adde r11,r11,r9
1476 addze r12,r0
1477 #mul_add_c(a[6],b[6],c1,c2,c3);
1478 $LD r6,`6*$BNSZ`(r4)
1479 $LD r7,`6*$BNSZ`(r5)
1480 $UMULL r8,r6,r7
1481 $UMULH r9,r6,r7
1482 addc r10,r10,r8
1483 adde r11,r11,r9
1484 addze r12,r12
1485 #mul_add_c(a[5],b[7],c1,c2,c3);
1486 $LD r6,`5*$BNSZ`(r4)
1487 $LD r7,`7*$BNSZ`(r5)
1488 $UMULL r8,r6,r7
1489 $UMULH r9,r6,r7
1490 addc r10,r10,r8
1491 adde r11,r11,r9
1492 addze r12,r12
1493 $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
1494 #mul_add_c(a[6],b[7],c2,c3,c1);
1495 $LD r6,`6*$BNSZ`(r4)
1496 $UMULL r8,r6,r7
1497 $UMULH r9,r6,r7
1498 addc r11,r11,r8
1499 adde r12,r12,r9
1500 addze r10,r0
1501 #mul_add_c(a[7],b[6],c2,c3,c1);
1502 $LD r6,`7*$BNSZ`(r4)
1503 $LD r7,`6*$BNSZ`(r5)
1504 $UMULL r8,r6,r7
1505 $UMULH r9,r6,r7
1506 addc r11,r11,r8
1507 adde r12,r12,r9
1508 addze r10,r10
1509 $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
1510 #mul_add_c(a[7],b[7],c3,c1,c2);
1511 $LD r7,`7*$BNSZ`(r5)
1512 $UMULL r8,r6,r7
1513 $UMULH r9,r6,r7
1514 addc r12,r12,r8
1515 adde r10,r10,r9
1516 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
1517 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
1518 blr
1519 .long 0
1520 .byte 0,12,0x14,0,0,0,3,0
1521 .long 0
1522.size .bn_mul_comba8,.-.bn_mul_comba8
1523
1524#
1525# NOTE: The following label name should be changed to
1526# "bn_sub_words" i.e. remove the first dot
1527# for the gcc compiler. This should be automatically
1528# done in the build
1529#
1530#
1531.align 4
1532.bn_sub_words:
1533#
1534# Handcoded version of bn_sub_words
1535#
1536#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1537#
1538# r3 = r
1539# r4 = a
1540# r5 = b
1541# r6 = n
1542#
1543# Note: No loop unrolling done since this is not a performance
1544# critical loop.
1545
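# Roughly equivalent C, for orientation:
#	BN_ULONG borrow = 0, t;
#	for (i = 0; i < n; i++) {
#		t = a[i] - borrow;
#		borrow  = (t > a[i]);	/* borrowed while subtracting borrow */
#		r[i] = t - b[i];
#		borrow |= (r[i] > t);	/* borrowed while subtracting b[i]   */
#	}
#	return borrow;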
1546 xor r0,r0,r0 #set r0 = 0
1547#
1548# check for r6 = 0 AND set carry bit.
1549#
1550 subfc. r7,r0,r6 # If r6 is 0 then result is 0.
1551 # if r6 > 0 then result !=0
1552 # In either case carry bit is set.
1553 beq Lppcasm_sub_adios
1554 addi r4,r4,-$BNSZ
1555 addi r3,r3,-$BNSZ
1556 addi r5,r5,-$BNSZ
1557 mtctr r6
1558Lppcasm_sub_mainloop:
1559 $LDU r7,$BNSZ(r4)
1560 $LDU r8,$BNSZ(r5)
1561 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
1562 # if carry = 1 this is r7-r8. Else it
1563 # is r7-r8 -1 as we need.
1564 $STU r6,$BNSZ(r3)
1565 bdnz Lppcasm_sub_mainloop
1566Lppcasm_sub_adios:
1567 subfze r3,r0 # if carry bit is set then r3 = 0 else -1
1568 andi. r3,r3,1 # keep only last bit.
1569 blr
1570 .long 0
1571 .byte 0,12,0x14,0,0,0,4,0
1572 .long 0
1573.size .bn_sub_words,.-.bn_sub_words
1574
1575#
1576# NOTE: The following label name should be changed to
1577# "bn_add_words" i.e. remove the first dot
1578# for the gcc compiler. This should be automatically
1579# done in the build
1580#
1581
1582.align 4
1583.bn_add_words:
1584#
1585# Handcoded version of bn_add_words
1586#
1587#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1588#
1589# r3 = r
1590# r4 = a
1591# r5 = b
1592# r6 = n
1593#
1594# Note: No loop unrolling done since this is not a performance
1595# critical loop.
1596
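# Roughly equivalent C, for orientation:
#	BN_ULONG carry = 0, t;
#	for (i = 0; i < n; i++) {
#		t = a[i] + carry;
#		carry  = (t < carry);	/* carried while adding carry */
#		r[i] = t + b[i];
#		carry += (r[i] < t);	/* carried while adding b[i]  */
#	}
#	return carry;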
1597 xor r0,r0,r0
1598#
1599# check for r6 = 0. Is this needed?
1600#
1601 addic. r6,r6,0 #test r6 and clear carry bit.
1602 beq Lppcasm_add_adios
1603 addi r4,r4,-$BNSZ
1604 addi r3,r3,-$BNSZ
1605 addi r5,r5,-$BNSZ
1606 mtctr r6
1607Lppcasm_add_mainloop:
1608 $LDU r7,$BNSZ(r4)
1609 $LDU r8,$BNSZ(r5)
1610 adde r8,r7,r8
1611 $STU r8,$BNSZ(r3)
1612 bdnz Lppcasm_add_mainloop
1613Lppcasm_add_adios:
1614 addze r3,r0 #return carry bit.
1615 blr
1616 .long 0
1617 .byte 0,12,0x14,0,0,0,4,0
1618 .long 0
1619.size .bn_add_words,.-.bn_add_words
1620
1621#
1622# NOTE: The following label name should be changed to
1623# "bn_div_words" i.e. remove the first dot
1624# for the gcc compiler. This should be automatically
1625# done in the build
1626#
1627
1628.align 4
1629.bn_div_words:
1630#
1631# This is a cleaned-up version of code generated by
1632# the AIX compiler. The only optimization is to use
1633# the PPC instruction to count leading zeros instead
1634# of a call to num_bits_word. Since this was compiled
1635# only at level -O2, it could probably be squeezed further.
1636#
1637# r3 = h
1638# r4 = l
1639# r5 = d
1640
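# For orientation: this returns the single-word quotient of the two-word
# value (h:l) divided by d, i.e. roughly ((h << BN_BITS2) | l) / d, computed
# with two rounds of half-word (BN_BITS4) quotient estimation below.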
1641 $UCMPI 0,r5,0 # compare r5 and 0
1642 bne Lppcasm_div1 # proceed if d!=0
1643 li r3,-1 # d=0 return -1
1644 blr
1645Lppcasm_div1:
1646 xor r0,r0,r0 #r0=0
1647 li r8,$BITS
1648 $CNTLZ. r7,r5 #r7 = num leading 0s in d.
1649 beq Lppcasm_div2 #proceed if no leading zeros
1650 subf r8,r7,r8 #r8 = BN_num_bits_word(d)
1651 $SHR. r9,r3,r8 #are there any bits above r8'th?
1652 $TR 16,r9,r0 #if there're, signal to dump core...
1653Lppcasm_div2:
1654 $UCMP 0,r3,r5 #h>=d?
1655 blt Lppcasm_div3 #goto Lppcasm_div3 if not
1656 subf r3,r5,r3 #h-=d ;
1657Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
1658 cmpi 0,0,r7,0 # is (i == 0)?
1659 beq Lppcasm_div4
1660 $SHL r3,r3,r7 # h = (h<< i)
1661 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
1662 $SHL r5,r5,r7 # d<<=i
1663 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
1664 $SHL r4,r4,r7 # l <<=i
1665Lppcasm_div4:
1666 $SHRI r9,r5,`$BITS/2` # r9 = dh
1667 # dl will be computed when needed
1668 # as it saves registers.
1669 li r6,2 #r6=2
1670 mtctr r6 #counter will be in count.
1671Lppcasm_divouterloop:
1672 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
1673 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
1674 # compute here for innerloop.
1675 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
1676 bne Lppcasm_div5 # goto Lppcasm_div5 if not
1677
1678 li r8,-1
1679 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
1680 b Lppcasm_div6
1681Lppcasm_div5:
1682 $UDIV r8,r3,r9 #q = h/dh
1683Lppcasm_div6:
1684 $UMULL r12,r9,r8 #th = q*dh
1685 $CLRU r10,r5,`$BITS/2` #r10=dl
1686 $UMULL r6,r8,r10 #tl = q*dl
1687
1688Lppcasm_divinnerloop:
1689 subf r10,r12,r3 #t = h -th
1690 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
1691 addic. r7,r7,0 #test if r7 == 0. used below.
1692 # now want to compute
1693 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1694 # the following 2 instructions do that
1695 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
1696 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
1697 $UCMP cr1,r6,r7 # compare (tl <= r7)
1698 bne Lppcasm_divinnerexit
1699 ble cr1,Lppcasm_divinnerexit
1700 addi r8,r8,-1 #q--
1701 subf r12,r9,r12 #th -=dh
1702 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
1703 subf r6,r10,r6 #tl -=dl
1704 b Lppcasm_divinnerloop
1705Lppcasm_divinnerexit:
1706 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
1707 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
1708 $UCMP cr1,r4,r11 # compare l and tl
1709 add r12,r12,r10 # th+=t
1710 bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
1711 addi r12,r12,1 # th++
1712Lppcasm_div7:
1713 subf r11,r11,r4 #r11=l-tl
1714 $UCMP cr1,r3,r12 #compare h and th
1715 bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
1716 addi r8,r8,-1 # q--
1717 add r3,r5,r3 # h+=d
1718Lppcasm_div8:
1719 subf r12,r12,r3 #r12 = h-th
1720 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
1721 # want to compute
1722 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1723 # the following 2 instructions will do this.
1724 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
1725 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
1726 bdz Lppcasm_div9 #if (count==0) break ;
1727 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
1728 b Lppcasm_divouterloop
1729Lppcasm_div9:
1730 or r3,r8,r0
1731 blr
1732 .long 0
1733 .byte 0,12,0x14,0,0,0,3,0
1734 .long 0
1735.size .bn_div_words,.-.bn_div_words
1736
1737#
1738# NOTE: The following label name should be changed to
1739# "bn_sqr_words" i.e. remove the first dot
1740# for the gcc compiler. This should be automatically
1741# done in the build
1742#
1743.align 4
1744.bn_sqr_words:
1745#
1746# Optimized version of bn_sqr_words
1747#
1748# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1749#
1750# r3 = r
1751# r4 = a
1752# r5 = n
1753#
1754# r6 = a[i].
1755# r7,r8 = product.
1756#
1757# No unrolling done here. Not performance critical.
1758
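# Roughly equivalent C, for orientation (BN_ULLONG is a double-width type,
# as in the generic bn_asm.c fallback):
#	for (i = 0; i < n; i++) {
#		BN_ULLONG t = (BN_ULLONG)a[i] * a[i];
#		r[2*i]   = (BN_ULONG)t;
#		r[2*i+1] = (BN_ULONG)(t >> BN_BITS2);
#	}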
1759 addic. r5,r5,0 #test r5.
1760 beq Lppcasm_sqr_adios
1761 addi r4,r4,-$BNSZ
1762 addi r3,r3,-$BNSZ
1763 mtctr r5
1764Lppcasm_sqr_mainloop:
1765 #sqr(r[0],r[1],a[0]);
1766 $LDU r6,$BNSZ(r4)
1767 $UMULL r7,r6,r6
1768 $UMULH r8,r6,r6
1769 $STU r7,$BNSZ(r3)
1770 $STU r8,$BNSZ(r3)
1771 bdnz Lppcasm_sqr_mainloop
1772Lppcasm_sqr_adios:
1773 blr
1774 .long 0
1775 .byte 0,12,0x14,0,0,0,3,0
1776 .long 0
1777.size .bn_sqr_words,.-.bn_sqr_words
1778
1779#
1780# NOTE: The following label name should be changed to
1781# "bn_mul_words" i.e. remove the first dot
1782# for the gcc compiler. This should be automatically
1783# done in the build
1784#
1785
1786.align 4
1787.bn_mul_words:
1788#
1789# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1790#
1791# r3 = rp
1792# r4 = ap
1793# r5 = num
1794# r6 = w
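# Roughly equivalent C, for orientation (BN_ULLONG is a double-width type,
# as in the generic bn_asm.c fallback):
#	BN_ULONG carry = 0;
#	for (i = 0; i < num; i++) {
#		BN_ULLONG t = (BN_ULLONG)ap[i] * w + carry;
#		rp[i] = (BN_ULONG)t;
#		carry = (BN_ULONG)(t >> BN_BITS2);
#	}
#	return carry;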
1795 xor r0,r0,r0
1796 xor r12,r12,r12 # used for carry
1797 rlwinm. r7,r5,30,2,31 # num >> 2
1798 beq Lppcasm_mw_REM
1799 mtctr r7
1800Lppcasm_mw_LOOP:
1801 #mul(rp[0],ap[0],w,c1);
1802 $LD r8,`0*$BNSZ`(r4)
1803 $UMULL r9,r6,r8
1804 $UMULH r10,r6,r8
1805 addc r9,r9,r12
1806 #addze r10,r10 #carry is NOT ignored.
1807 #will be taken care of
1808 #in second spin below
1809 #using adde.
1810 $ST r9,`0*$BNSZ`(r3)
1811 #mul(rp[1],ap[1],w,c1);
1812 $LD r8,`1*$BNSZ`(r4)
1813 $UMULL r11,r6,r8
1814 $UMULH r12,r6,r8
1815 adde r11,r11,r10
1816 #addze r12,r12
1817 $ST r11,`1*$BNSZ`(r3)
1818 #mul(rp[2],ap[2],w,c1);
1819 $LD r8,`2*$BNSZ`(r4)
1820 $UMULL r9,r6,r8
1821 $UMULH r10,r6,r8
1822 adde r9,r9,r12
1823 #addze r10,r10
1824 $ST r9,`2*$BNSZ`(r3)
1825 #mul_add(rp[3],ap[3],w,c1);
1826 $LD r8,`3*$BNSZ`(r4)
1827 $UMULL r11,r6,r8
1828 $UMULH r12,r6,r8
1829 adde r11,r11,r10
1830 addze r12,r12 #this spin we collect carry into
1831 #r12
1832 $ST r11,`3*$BNSZ`(r3)
1833
1834 addi r3,r3,`4*$BNSZ`
1835 addi r4,r4,`4*$BNSZ`
1836 bdnz Lppcasm_mw_LOOP
1837
1838Lppcasm_mw_REM:
1839 andi. r5,r5,0x3
1840 beq Lppcasm_mw_OVER
1841 #mul(rp[0],ap[0],w,c1);
1842 $LD r8,`0*$BNSZ`(r4)
1843 $UMULL r9,r6,r8
1844 $UMULH r10,r6,r8
1845 addc r9,r9,r12
1846 addze r10,r10
1847 $ST r9,`0*$BNSZ`(r3)
1848 addi r12,r10,0
1849
1850 addi r5,r5,-1
1851 cmpli 0,0,r5,0
1852 beq Lppcasm_mw_OVER
1853
1854
1855 #mul(rp[1],ap[1],w,c1);
1856 $LD r8,`1*$BNSZ`(r4)
1857 $UMULL r9,r6,r8
1858 $UMULH r10,r6,r8
1859 addc r9,r9,r12
1860 addze r10,r10
1861 $ST r9,`1*$BNSZ`(r3)
1862 addi r12,r10,0
1863
1864 addi r5,r5,-1
1865 cmpli 0,0,r5,0
1866 beq Lppcasm_mw_OVER
1867
1868 #mul_add(rp[2],ap[2],w,c1);
1869 $LD r8,`2*$BNSZ`(r4)
1870 $UMULL r9,r6,r8
1871 $UMULH r10,r6,r8
1872 addc r9,r9,r12
1873 addze r10,r10
1874 $ST r9,`2*$BNSZ`(r3)
1875 addi r12,r10,0
1876
1877Lppcasm_mw_OVER:
1878 addi r3,r12,0
1879 blr
1880 .long 0
1881 .byte 0,12,0x14,0,0,0,4,0
1882 .long 0
1883.size bn_mul_words,.-bn_mul_words
1884
1885#
1886# NOTE: The following label name should be changed to
1887# "bn_mul_add_words" i.e. remove the first dot
1888# for the gcc compiler. This should be automatically
1889# done in the build
1890#
1891
1892.align 4
1893.bn_mul_add_words:
1894#
1895# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1896#
1897# r3 = rp
1898# r4 = ap
1899# r5 = num
1900# r6 = w
1901#
1902# empirical evidence suggests that unrolled version performs best!!
1903#
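# Roughly equivalent C, for orientation (BN_ULLONG is a double-width type,
# as in the generic bn_asm.c fallback):
#	BN_ULONG carry = 0;
#	for (i = 0; i < num; i++) {
#		BN_ULLONG t = (BN_ULLONG)ap[i] * w + rp[i] + carry;
#		rp[i] = (BN_ULONG)t;
#		carry = (BN_ULONG)(t >> BN_BITS2);
#	}
#	return carry;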
1904 xor r0,r0,r0 #r0 = 0
1905 xor r12,r12,r12 #r12 = 0 . used for carry
1906 rlwinm. r7,r5,30,2,31 # num >> 2
1907 beq Lppcasm_maw_leftover # if (num < 4) goto Lppcasm_maw_leftover
1908 mtctr r7
1909Lppcasm_maw_mainloop:
1910 #mul_add(rp[0],ap[0],w,c1);
1911 $LD r8,`0*$BNSZ`(r4)
1912 $LD r11,`0*$BNSZ`(r3)
1913 $UMULL r9,r6,r8
1914 $UMULH r10,r6,r8
1915 addc r9,r9,r12 #r12 is carry.
1916 addze r10,r10
1917 addc r9,r9,r11
1918 #addze r10,r10
1919 #the above instruction addze
1920 #is NOT needed. Carry will NOT
1921 #be ignored. It's not affected
1922 #by multiply and will be collected
1923 #in the next spin
1924 $ST r9,`0*$BNSZ`(r3)
1925
1926 #mul_add(rp[1],ap[1],w,c1);
1927 $LD r8,`1*$BNSZ`(r4)
1928 $LD r9,`1*$BNSZ`(r3)
1929 $UMULL r11,r6,r8
1930 $UMULH r12,r6,r8
1931 adde r11,r11,r10 #r10 is carry.
1932 addze r12,r12
1933 addc r11,r11,r9
1934 #addze r12,r12
1935 $ST r11,`1*$BNSZ`(r3)
1936
1937 #mul_add(rp[2],ap[2],w,c1);
1938 $LD r8,`2*$BNSZ`(r4)
1939 $UMULL r9,r6,r8
1940 $LD r11,`2*$BNSZ`(r3)
1941 $UMULH r10,r6,r8
1942 adde r9,r9,r12
1943 addze r10,r10
1944 addc r9,r9,r11
1945 #addze r10,r10
1946 $ST r9,`2*$BNSZ`(r3)
1947
1948 #mul_add(rp[3],ap[3],w,c1);
1949 $LD r8,`3*$BNSZ`(r4)
1950 $UMULL r11,r6,r8
1951 $LD r9,`3*$BNSZ`(r3)
1952 $UMULH r12,r6,r8
1953 adde r11,r11,r10
1954 addze r12,r12
1955 addc r11,r11,r9
1956 addze r12,r12
1957 $ST r11,`3*$BNSZ`(r3)
1958 addi r3,r3,`4*$BNSZ`
1959 addi r4,r4,`4*$BNSZ`
1960 bdnz Lppcasm_maw_mainloop
1961
1962Lppcasm_maw_leftover:
1963 andi. r5,r5,0x3
1964 beq Lppcasm_maw_adios
1965 addi r3,r3,-$BNSZ
1966 addi r4,r4,-$BNSZ
1967 #mul_add(rp[0],ap[0],w,c1);
1968 mtctr r5
1969 $LDU r8,$BNSZ(r4)
1970 $UMULL r9,r6,r8
1971 $UMULH r10,r6,r8
1972 $LDU r11,$BNSZ(r3)
1973 addc r9,r9,r11
1974 addze r10,r10
1975 addc r9,r9,r12
1976 addze r12,r10
1977 $ST r9,0(r3)
1978
1979 bdz Lppcasm_maw_adios
1980 #mul_add(rp[1],ap[1],w,c1);
1981 $LDU r8,$BNSZ(r4)
1982 $UMULL r9,r6,r8
1983 $UMULH r10,r6,r8
1984 $LDU r11,$BNSZ(r3)
1985 addc r9,r9,r11
1986 addze r10,r10
1987 addc r9,r9,r12
1988 addze r12,r10
1989 $ST r9,0(r3)
1990
1991 bdz Lppcasm_maw_adios
1992 #mul_add(rp[2],ap[2],w,c1);
1993 $LDU r8,$BNSZ(r4)
1994 $UMULL r9,r6,r8
1995 $UMULH r10,r6,r8
1996 $LDU r11,$BNSZ(r3)
1997 addc r9,r9,r11
1998 addze r10,r10
1999 addc r9,r9,r12
2000 addze r12,r10
2001 $ST r9,0(r3)
2002
2003Lppcasm_maw_adios:
2004 addi r3,r12,0
2005 blr
2006 .long 0
2007 .byte 0,12,0x14,0,0,0,4,0
2008 .long 0
2009.size .bn_mul_add_words,.-.bn_mul_add_words
2010 .align 4
2011EOF
2012$data =~ s/\`([^\`]*)\`/eval $1/gem;
2013print $data;
2014close STDOUT;