VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/bn/asm/ppc.pl @ 94081

Last change on this file since 94081 was 91772, checked in by vboxsync, 3 years ago:
openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

File size: 44.4 KB
1#! /usr/bin/env perl
2# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# Implemented as a Perl wrapper as we want to support several different
10# architectures with a single file. We pick up the target based on the
11# flavour argument we are invoked with.
12#
13# It should be noted though that this perl code is nothing like
14# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
15# as a pre-processor to cover for platform differences in name decoration,
16# linker tables, 32-/64-bit instruction sets...
17#
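# For illustration, a typical invocation looks like this (the flavour
# strings shown are assumptions based on common perlasm usage, not
# something defined in this file, which only checks the argument for
# /32/ or /64/):
#
#   perl ppc.pl linux32 bn-ppc.s
#   perl ppc.pl linux64 bn-ppc64.s
#
# The first argument selects the 32- or 64-bit mnemonic set defined
# below; the second argument is handed to ppc-xlate.pl together with
# the flavour, and that script post-processes everything we print.
#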
18# As you might know, there are several PowerPC ABIs in use. Most notably,
19# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
20# are similar enough to implement leaf(!) functions, which would be ABI
21# neutral. And that's what you find here: ABI-neutral leaf functions.
22# In case you wonder what that is...
23#
24# AIX performance
25#
26# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
27#
28# The following is the performance of 32-bit compiler
29# generated code:
30#
31# OpenSSL 0.9.6c 21 dec 2001
32# built on: Tue Jun 11 11:06:51 EDT 2002
33# options:bn(64,32) ...
34#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
35# sign verify sign/s verify/s
36#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
37#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
38#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
39#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
40#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
41#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
42#
43# Same benchmark with this assembler code:
44#
45#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
46#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
47#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
48#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
49#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
50#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
51#
52# Number of operations increases by almost 75%
53#
54# Here are performance numbers for 64-bit compiler
55# generated code:
56#
57# OpenSSL 0.9.6g [engine] 9 Aug 2002
58# built on: Fri Apr 18 16:59:20 EDT 2003
59# options:bn(64,64) ...
60# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
61# sign verify sign/s verify/s
62#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
63#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
64#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
65#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
66#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
67#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
68#
69# Same benchmark with this assembler code:
70#
71#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
72#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
73#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
74#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
75#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
76#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
77#
78# Again, performance increases by about 75%
79#
80# Mac OS X, Apple G5 1.8GHz (note: this is 32-bit code)
81# OpenSSL 0.9.7c 30 Sep 2003
82#
83# Original code.
84#
85#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
86#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
87#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
88#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
89#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
90#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
91#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
92#
93# Same benchmark with this assembler code:
94#
95#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
96#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
97#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
98#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
99#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
100#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
101#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
102#
103# Performance increase of ~60%
104# Based on a submission from Suresh N. Chari of IBM
105
106$flavour = shift;
107
108if ($flavour =~ /32/) {
109 $BITS= 32;
110 $BNSZ= $BITS/8;
111 $ISA= "\"ppc\"";
112
113 $LD= "lwz"; # load
114 $LDU= "lwzu"; # load and update
115 $ST= "stw"; # store
116 $STU= "stwu"; # store and update
117 $UMULL= "mullw"; # unsigned multiply low
118 $UMULH= "mulhwu"; # unsigned multiply high
119 $UDIV= "divwu"; # unsigned divide
120 $UCMPI= "cmplwi"; # unsigned compare with immediate
121 $UCMP= "cmplw"; # unsigned compare
122 $CNTLZ= "cntlzw"; # count leading zeros
123 $SHL= "slw"; # shift left
124 $SHR= "srw"; # unsigned shift right
125 $SHRI= "srwi"; # unsigned shift right by immediate
126 $SHLI= "slwi"; # shift left by immediate
127 $CLRU= "clrlwi"; # clear upper bits
128 $INSR= "insrwi"; # insert right
129 $ROTL= "rotlwi"; # rotate left by immediate
130 $TR= "tw"; # conditional trap
131} elsif ($flavour =~ /64/) {
132 $BITS= 64;
133 $BNSZ= $BITS/8;
134 $ISA= "\"ppc64\"";
135
136 # same as above, but 64-bit mnemonics...
137 $LD= "ld"; # load
138 $LDU= "ldu"; # load and update
139 $ST= "std"; # store
140 $STU= "stdu"; # store and update
141 $UMULL= "mulld"; # unsigned multiply low
142 $UMULH= "mulhdu"; # unsigned multiply high
143 $UDIV= "divdu"; # unsigned divide
144 $UCMPI= "cmpldi"; # unsigned compare with immediate
145 $UCMP= "cmpld"; # unsigned compare
146 $CNTLZ= "cntlzd"; # count leading zeros
147 $SHL= "sld"; # shift left
148 $SHR= "srd"; # unsigned shift right
149 $SHRI= "srdi"; # unsigned shift right by immediate
150 $SHLI= "sldi"; # shift left by immediate
151 $CLRU= "clrldi"; # clear upper bits
152 $INSR= "insrdi"; # insert right
153 $ROTL= "rotldi"; # rotate left by immediate
154 $TR= "td"; # conditional trap
155} else { die "nonsense $flavour"; }
156
157$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
158( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
159( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
160die "can't locate ppc-xlate.pl";
161
162open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
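# From this point on everything printed to STDOUT is piped through
# ppc-xlate.pl, which (as used here) expands the ABI-neutral mnemonics
# and directives and applies the flavour-specific symbol decoration
# before the final assembly file is written.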
163
164$data=<<EOF;
165#--------------------------------------------------------------------
166#
167#
168#
169#
170# File: ppc32.s
171#
172# Created by: Suresh Chari
173# IBM Thomas J. Watson Research Library
174# Hawthorne, NY
175#
176#
177# Description: Optimized assembly routines for OpenSSL crypto
178# on the 32-bit PowerPC platform.
179#
180#
181# Version History
182#
183# 2. Fixed bn_add, bn_sub and bn_div_words, added comments,
184# cleaned up code. Also made a single version which can
185# be used for both the AIX and Linux compilers. See NOTE
186# below.
187# 12/05/03 Suresh Chari
188# (with lots of help from) Andy Polyakov
189##
190# 1. Initial version 10/20/02 Suresh Chari
191#
192#
193# The following file works for the xlc, cc,
194# and gcc compilers.
195#
196# NOTE: To get the file to link correctly with the gcc compiler
197# you have to change the names of the routines and remove
198# the first .(dot) character. This should automatically
199# be done in the build process.
200#
201# Hand optimized assembly code for the following routines
202#
203# bn_sqr_comba4
204# bn_sqr_comba8
205# bn_mul_comba4
206# bn_mul_comba8
207# bn_sub_words
208# bn_add_words
209# bn_div_words
210# bn_sqr_words
211# bn_mul_words
212# bn_mul_add_words
213#
214# NOTE: It is possible to optimize this code more for
215# specific PowerPC or Power architectures. On the Northstar
216# architecture the optimizations in this file do
217# NOT provide much improvement.
218#
219# If you have comments or suggestions to improve code send
220# me a note at schari\@us.ibm.com
221#
222#--------------------------------------------------------------------------
223#
224# Defines to be used in the assembly code.
225#
226#.set r0,0 # we use it as storage for value of 0
227#.set SP,1 # preserved
228#.set RTOC,2 # preserved
229#.set r3,3 # 1st argument/return value
230#.set r4,4 # 2nd argument/volatile register
231#.set r5,5 # 3rd argument/volatile register
232#.set r6,6 # ...
233#.set r7,7
234#.set r8,8
235#.set r9,9
236#.set r10,10
237#.set r11,11
238#.set r12,12
239#.set r13,13 # not used, nor any other "below" it...
240
241# Declare function names to be global
242# NOTE: For gcc these names MUST be changed to remove
243# the first . i.e. for example change ".bn_sqr_comba4"
244# to "bn_sqr_comba4". This should be automatically done
245# in the build.
246
247 .globl .bn_sqr_comba4
248 .globl .bn_sqr_comba8
249 .globl .bn_mul_comba4
250 .globl .bn_mul_comba8
251 .globl .bn_sub_words
252 .globl .bn_add_words
253 .globl .bn_div_words
254 .globl .bn_sqr_words
255 .globl .bn_mul_words
256 .globl .bn_mul_add_words
257
258# .text section
259
260 .machine "any"
261 .text
262
263#
264# NOTE: The following label name should be changed to
265# "bn_sqr_comba4" i.e. remove the first dot
266# for the gcc compiler. This should be automatically
267# done in the build
268#
269
270.align 4
271.bn_sqr_comba4:
272#
273# Optimized version of bn_sqr_comba4.
274#
275# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
276# r3 contains r
277# r4 contains a
278#
279# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
280#
281# r5,r6 are the two BN_ULONGs being multiplied.
282# r7,r8 are the results of the 32x32 giving 64 bit multiply.
283# r9,r10, r11 are the equivalents of c1,c2, c3.
284# Here's the assembly
285#
286#
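# The sqr_add_c(a,i,c1,c2,c3) and sqr_add_c2(a,i,j,c1,c2,c3) names in
# the comments below are the column operations of the portable C comba
# code: the former accumulates the two-word product a[i]*a[i] into the
# three-word column accumulator (c1,c2,c3), the latter accumulates
# 2*a[i]*a[j].  In this routine the doubling is done on the product
# itself (the addc/adde pair on r7,r8) before it is folded into the
# accumulator.
#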
287 xor r0,r0,r0 # set r0 = 0. Used in the addze
288 # instructions below
289
290 #sqr_add_c(a,0,c1,c2,c3)
291 $LD r5,`0*$BNSZ`(r4)
292 $UMULL r9,r5,r5
293 $UMULH r10,r5,r5 #in first iteration. No need
294 #to add since c1=c2=c3=0.
295 # Note c3(r11) is NOT set to 0
296 # but will be.
297
298 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
299 # sqr_add_c2(a,1,0,c2,c3,c1);
300 $LD r6,`1*$BNSZ`(r4)
301 $UMULL r7,r5,r6
302 $UMULH r8,r5,r6
303
304 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
305 adde r8,r8,r8
306 addze r9,r0 # catch carry if any.
307 # r9= r0(=0) and carry
308
309 addc r10,r7,r10 # now add to temp result.
310 addze r11,r8 # r8 added to r11 which is 0
311 addze r9,r9
312
313 $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
314 #sqr_add_c(a,1,c3,c1,c2)
315 $UMULL r7,r6,r6
316 $UMULH r8,r6,r6
317 addc r11,r7,r11
318 adde r9,r8,r9
319 addze r10,r0
320 #sqr_add_c2(a,2,0,c3,c1,c2)
321 $LD r6,`2*$BNSZ`(r4)
322 $UMULL r7,r5,r6
323 $UMULH r8,r5,r6
324
325 addc r7,r7,r7
326 adde r8,r8,r8
327 addze r10,r10
328
329 addc r11,r7,r11
330 adde r9,r8,r9
331 addze r10,r10
332 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
333 #sqr_add_c2(a,3,0,c1,c2,c3);
334 $LD r6,`3*$BNSZ`(r4)
335 $UMULL r7,r5,r6
336 $UMULH r8,r5,r6
337 addc r7,r7,r7
338 adde r8,r8,r8
339 addze r11,r0
340
341 addc r9,r7,r9
342 adde r10,r8,r10
343 addze r11,r11
344 #sqr_add_c2(a,2,1,c1,c2,c3);
345 $LD r5,`1*$BNSZ`(r4)
346 $LD r6,`2*$BNSZ`(r4)
347 $UMULL r7,r5,r6
348 $UMULH r8,r5,r6
349
350 addc r7,r7,r7
351 adde r8,r8,r8
352 addze r11,r11
353 addc r9,r7,r9
354 adde r10,r8,r10
355 addze r11,r11
356 $ST r9,`3*$BNSZ`(r3) #r[3]=c1
357 #sqr_add_c(a,2,c2,c3,c1);
358 $UMULL r7,r6,r6
359 $UMULH r8,r6,r6
360 addc r10,r7,r10
361 adde r11,r8,r11
362 addze r9,r0
363 #sqr_add_c2(a,3,1,c2,c3,c1);
364 $LD r6,`3*$BNSZ`(r4)
365 $UMULL r7,r5,r6
366 $UMULH r8,r5,r6
367 addc r7,r7,r7
368 adde r8,r8,r8
369 addze r9,r9
370
371 addc r10,r7,r10
372 adde r11,r8,r11
373 addze r9,r9
374 $ST r10,`4*$BNSZ`(r3) #r[4]=c2
375 #sqr_add_c2(a,3,2,c3,c1,c2);
376 $LD r5,`2*$BNSZ`(r4)
377 $UMULL r7,r5,r6
378 $UMULH r8,r5,r6
379 addc r7,r7,r7
380 adde r8,r8,r8
381 addze r10,r0
382
383 addc r11,r7,r11
384 adde r9,r8,r9
385 addze r10,r10
386 $ST r11,`5*$BNSZ`(r3) #r[5] = c3
387 #sqr_add_c(a,3,c1,c2,c3);
388 $UMULL r7,r6,r6
389 $UMULH r8,r6,r6
390 addc r9,r7,r9
391 adde r10,r8,r10
392
393 $ST r9,`6*$BNSZ`(r3) #r[6]=c1
394 $ST r10,`7*$BNSZ`(r3) #r[7]=c2
395 blr
396 .long 0
397 .byte 0,12,0x14,0,0,0,2,0
398 .long 0
399.size .bn_sqr_comba4,.-.bn_sqr_comba4
400
401#
402# NOTE: The following label name should be changed to
403# "bn_sqr_comba8" i.e. remove the first dot
404# for the gcc compiler. This should be automatically
405# done in the build
406#
407
408.align 4
409.bn_sqr_comba8:
410#
411# This is an optimized version of the bn_sqr_comba8 routine.
412# Tightly uses the adde instruction
413#
414#
415# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
416# r3 contains r
417# r4 contains a
418#
419# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
420#
421# r5,r6 are the two BN_ULONGs being multiplied.
422# r7,r8 are the results of the 32x32 giving 64 bit multiply.
423# r9,r10, r11 are the equivalents of c1,c2, c3.
424#
425# Possible optimization of loading all 8 longs of a into registers
426# doesn't provide any speedup
427#
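# Unlike bn_sqr_comba4 above, the sqr_add_c2 terms here are doubled by
# adding the (r7,r8) product into the column accumulator twice rather
# than by doubling the product first; the resulting column sums are
# the same.
#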
428
429 xor r0,r0,r0 #set r0 = 0.Used in addze
430 #instructions below.
431
432 #sqr_add_c(a,0,c1,c2,c3);
433 $LD r5,`0*$BNSZ`(r4)
434 $UMULL r9,r5,r5 #1st iteration: no carries.
435 $UMULH r10,r5,r5
436 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
437 #sqr_add_c2(a,1,0,c2,c3,c1);
438 $LD r6,`1*$BNSZ`(r4)
439 $UMULL r7,r5,r6
440 $UMULH r8,r5,r6
441
442 addc r10,r7,r10 #add the two register number
443 adde r11,r8,r0 # (r8,r7) to the three register
444 addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
445
446 addc r10,r7,r10 #add the two register number
447 adde r11,r8,r11 # (r8,r7) to the three register
448 addze r9,r9 # number (r9,r11,r10).
449
450 $ST r10,`1*$BNSZ`(r3) # r[1]=c2
451
452 #sqr_add_c(a,1,c3,c1,c2);
453 $UMULL r7,r6,r6
454 $UMULH r8,r6,r6
455 addc r11,r7,r11
456 adde r9,r8,r9
457 addze r10,r0
458 #sqr_add_c2(a,2,0,c3,c1,c2);
459 $LD r6,`2*$BNSZ`(r4)
460 $UMULL r7,r5,r6
461 $UMULH r8,r5,r6
462
463 addc r11,r7,r11
464 adde r9,r8,r9
465 addze r10,r10
466
467 addc r11,r7,r11
468 adde r9,r8,r9
469 addze r10,r10
470
471 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
472 #sqr_add_c2(a,3,0,c1,c2,c3);
473 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
474 $UMULL r7,r5,r6
475 $UMULH r8,r5,r6
476
477 addc r9,r7,r9
478 adde r10,r8,r10
479 addze r11,r0
480
481 addc r9,r7,r9
482 adde r10,r8,r10
483 addze r11,r11
484 #sqr_add_c2(a,2,1,c1,c2,c3);
485 $LD r5,`1*$BNSZ`(r4)
486 $LD r6,`2*$BNSZ`(r4)
487 $UMULL r7,r5,r6
488 $UMULH r8,r5,r6
489
490 addc r9,r7,r9
491 adde r10,r8,r10
492 addze r11,r11
493
494 addc r9,r7,r9
495 adde r10,r8,r10
496 addze r11,r11
497
498 $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
499 #sqr_add_c(a,2,c2,c3,c1);
500 $UMULL r7,r6,r6
501 $UMULH r8,r6,r6
502
503 addc r10,r7,r10
504 adde r11,r8,r11
505 addze r9,r0
506 #sqr_add_c2(a,3,1,c2,c3,c1);
507 $LD r6,`3*$BNSZ`(r4)
508 $UMULL r7,r5,r6
509 $UMULH r8,r5,r6
510
511 addc r10,r7,r10
512 adde r11,r8,r11
513 addze r9,r9
514
515 addc r10,r7,r10
516 adde r11,r8,r11
517 addze r9,r9
518 #sqr_add_c2(a,4,0,c2,c3,c1);
519 $LD r5,`0*$BNSZ`(r4)
520 $LD r6,`4*$BNSZ`(r4)
521 $UMULL r7,r5,r6
522 $UMULH r8,r5,r6
523
524 addc r10,r7,r10
525 adde r11,r8,r11
526 addze r9,r9
527
528 addc r10,r7,r10
529 adde r11,r8,r11
530 addze r9,r9
531 $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
532 #sqr_add_c2(a,5,0,c3,c1,c2);
533 $LD r6,`5*$BNSZ`(r4)
534 $UMULL r7,r5,r6
535 $UMULH r8,r5,r6
536
537 addc r11,r7,r11
538 adde r9,r8,r9
539 addze r10,r0
540
541 addc r11,r7,r11
542 adde r9,r8,r9
543 addze r10,r10
544 #sqr_add_c2(a,4,1,c3,c1,c2);
545 $LD r5,`1*$BNSZ`(r4)
546 $LD r6,`4*$BNSZ`(r4)
547 $UMULL r7,r5,r6
548 $UMULH r8,r5,r6
549
550 addc r11,r7,r11
551 adde r9,r8,r9
552 addze r10,r10
553
554 addc r11,r7,r11
555 adde r9,r8,r9
556 addze r10,r10
557 #sqr_add_c2(a,3,2,c3,c1,c2);
558 $LD r5,`2*$BNSZ`(r4)
559 $LD r6,`3*$BNSZ`(r4)
560 $UMULL r7,r5,r6
561 $UMULH r8,r5,r6
562
563 addc r11,r7,r11
564 adde r9,r8,r9
565 addze r10,r10
566
567 addc r11,r7,r11
568 adde r9,r8,r9
569 addze r10,r10
570 $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
571 #sqr_add_c(a,3,c1,c2,c3);
572 $UMULL r7,r6,r6
573 $UMULH r8,r6,r6
574 addc r9,r7,r9
575 adde r10,r8,r10
576 addze r11,r0
577 #sqr_add_c2(a,4,2,c1,c2,c3);
578 $LD r6,`4*$BNSZ`(r4)
579 $UMULL r7,r5,r6
580 $UMULH r8,r5,r6
581
582 addc r9,r7,r9
583 adde r10,r8,r10
584 addze r11,r11
585
586 addc r9,r7,r9
587 adde r10,r8,r10
588 addze r11,r11
589 #sqr_add_c2(a,5,1,c1,c2,c3);
590 $LD r5,`1*$BNSZ`(r4)
591 $LD r6,`5*$BNSZ`(r4)
592 $UMULL r7,r5,r6
593 $UMULH r8,r5,r6
594
595 addc r9,r7,r9
596 adde r10,r8,r10
597 addze r11,r11
598
599 addc r9,r7,r9
600 adde r10,r8,r10
601 addze r11,r11
602 #sqr_add_c2(a,6,0,c1,c2,c3);
603 $LD r5,`0*$BNSZ`(r4)
604 $LD r6,`6*$BNSZ`(r4)
605 $UMULL r7,r5,r6
606 $UMULH r8,r5,r6
607 addc r9,r7,r9
608 adde r10,r8,r10
609 addze r11,r11
610 addc r9,r7,r9
611 adde r10,r8,r10
612 addze r11,r11
613 $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
614 #sqr_add_c2(a,7,0,c2,c3,c1);
615 $LD r6,`7*$BNSZ`(r4)
616 $UMULL r7,r5,r6
617 $UMULH r8,r5,r6
618
619 addc r10,r7,r10
620 adde r11,r8,r11
621 addze r9,r0
622 addc r10,r7,r10
623 adde r11,r8,r11
624 addze r9,r9
625 #sqr_add_c2(a,6,1,c2,c3,c1);
626 $LD r5,`1*$BNSZ`(r4)
627 $LD r6,`6*$BNSZ`(r4)
628 $UMULL r7,r5,r6
629 $UMULH r8,r5,r6
630
631 addc r10,r7,r10
632 adde r11,r8,r11
633 addze r9,r9
634 addc r10,r7,r10
635 adde r11,r8,r11
636 addze r9,r9
637 #sqr_add_c2(a,5,2,c2,c3,c1);
638 $LD r5,`2*$BNSZ`(r4)
639 $LD r6,`5*$BNSZ`(r4)
640 $UMULL r7,r5,r6
641 $UMULH r8,r5,r6
642 addc r10,r7,r10
643 adde r11,r8,r11
644 addze r9,r9
645 addc r10,r7,r10
646 adde r11,r8,r11
647 addze r9,r9
648 #sqr_add_c2(a,4,3,c2,c3,c1);
649 $LD r5,`3*$BNSZ`(r4)
650 $LD r6,`4*$BNSZ`(r4)
651 $UMULL r7,r5,r6
652 $UMULH r8,r5,r6
653
654 addc r10,r7,r10
655 adde r11,r8,r11
656 addze r9,r9
657 addc r10,r7,r10
658 adde r11,r8,r11
659 addze r9,r9
660 $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
661 #sqr_add_c(a,4,c3,c1,c2);
662 $UMULL r7,r6,r6
663 $UMULH r8,r6,r6
664 addc r11,r7,r11
665 adde r9,r8,r9
666 addze r10,r0
667 #sqr_add_c2(a,5,3,c3,c1,c2);
668 $LD r6,`5*$BNSZ`(r4)
669 $UMULL r7,r5,r6
670 $UMULH r8,r5,r6
671 addc r11,r7,r11
672 adde r9,r8,r9
673 addze r10,r10
674 addc r11,r7,r11
675 adde r9,r8,r9
676 addze r10,r10
677 #sqr_add_c2(a,6,2,c3,c1,c2);
678 $LD r5,`2*$BNSZ`(r4)
679 $LD r6,`6*$BNSZ`(r4)
680 $UMULL r7,r5,r6
681 $UMULH r8,r5,r6
682 addc r11,r7,r11
683 adde r9,r8,r9
684 addze r10,r10
685
686 addc r11,r7,r11
687 adde r9,r8,r9
688 addze r10,r10
689 #sqr_add_c2(a,7,1,c3,c1,c2);
690 $LD r5,`1*$BNSZ`(r4)
691 $LD r6,`7*$BNSZ`(r4)
692 $UMULL r7,r5,r6
693 $UMULH r8,r5,r6
694 addc r11,r7,r11
695 adde r9,r8,r9
696 addze r10,r10
697 addc r11,r7,r11
698 adde r9,r8,r9
699 addze r10,r10
700 $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
701 #sqr_add_c2(a,7,2,c1,c2,c3);
702 $LD r5,`2*$BNSZ`(r4)
703 $UMULL r7,r5,r6
704 $UMULH r8,r5,r6
705
706 addc r9,r7,r9
707 adde r10,r8,r10
708 addze r11,r0
709 addc r9,r7,r9
710 adde r10,r8,r10
711 addze r11,r11
712 #sqr_add_c2(a,6,3,c1,c2,c3);
713 $LD r5,`3*$BNSZ`(r4)
714 $LD r6,`6*$BNSZ`(r4)
715 $UMULL r7,r5,r6
716 $UMULH r8,r5,r6
717 addc r9,r7,r9
718 adde r10,r8,r10
719 addze r11,r11
720 addc r9,r7,r9
721 adde r10,r8,r10
722 addze r11,r11
723 #sqr_add_c2(a,5,4,c1,c2,c3);
724 $LD r5,`4*$BNSZ`(r4)
725 $LD r6,`5*$BNSZ`(r4)
726 $UMULL r7,r5,r6
727 $UMULH r8,r5,r6
728 addc r9,r7,r9
729 adde r10,r8,r10
730 addze r11,r11
731 addc r9,r7,r9
732 adde r10,r8,r10
733 addze r11,r11
734 $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
735 #sqr_add_c(a,5,c2,c3,c1);
736 $UMULL r7,r6,r6
737 $UMULH r8,r6,r6
738 addc r10,r7,r10
739 adde r11,r8,r11
740 addze r9,r0
741 #sqr_add_c2(a,6,4,c2,c3,c1);
742 $LD r6,`6*$BNSZ`(r4)
743 $UMULL r7,r5,r6
744 $UMULH r8,r5,r6
745 addc r10,r7,r10
746 adde r11,r8,r11
747 addze r9,r9
748 addc r10,r7,r10
749 adde r11,r8,r11
750 addze r9,r9
751 #sqr_add_c2(a,7,3,c2,c3,c1);
752 $LD r5,`3*$BNSZ`(r4)
753 $LD r6,`7*$BNSZ`(r4)
754 $UMULL r7,r5,r6
755 $UMULH r8,r5,r6
756 addc r10,r7,r10
757 adde r11,r8,r11
758 addze r9,r9
759 addc r10,r7,r10
760 adde r11,r8,r11
761 addze r9,r9
762 $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
763 #sqr_add_c2(a,7,4,c3,c1,c2);
764 $LD r5,`4*$BNSZ`(r4)
765 $UMULL r7,r5,r6
766 $UMULH r8,r5,r6
767 addc r11,r7,r11
768 adde r9,r8,r9
769 addze r10,r0
770 addc r11,r7,r11
771 adde r9,r8,r9
772 addze r10,r10
773 #sqr_add_c2(a,6,5,c3,c1,c2);
774 $LD r5,`5*$BNSZ`(r4)
775 $LD r6,`6*$BNSZ`(r4)
776 $UMULL r7,r5,r6
777 $UMULH r8,r5,r6
778 addc r11,r7,r11
779 adde r9,r8,r9
780 addze r10,r10
781 addc r11,r7,r11
782 adde r9,r8,r9
783 addze r10,r10
784 $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
785 #sqr_add_c(a,6,c1,c2,c3);
786 $UMULL r7,r6,r6
787 $UMULH r8,r6,r6
788 addc r9,r7,r9
789 adde r10,r8,r10
790 addze r11,r0
791 #sqr_add_c2(a,7,5,c1,c2,c3)
792 $LD r6,`7*$BNSZ`(r4)
793 $UMULL r7,r5,r6
794 $UMULH r8,r5,r6
795 addc r9,r7,r9
796 adde r10,r8,r10
797 addze r11,r11
798 addc r9,r7,r9
799 adde r10,r8,r10
800 addze r11,r11
801 $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
802
803 #sqr_add_c2(a,7,6,c2,c3,c1)
804 $LD r5,`6*$BNSZ`(r4)
805 $UMULL r7,r5,r6
806 $UMULH r8,r5,r6
807 addc r10,r7,r10
808 adde r11,r8,r11
809 addze r9,r0
810 addc r10,r7,r10
811 adde r11,r8,r11
812 addze r9,r9
813 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
814 #sqr_add_c(a,7,c3,c1,c2);
815 $UMULL r7,r6,r6
816 $UMULH r8,r6,r6
817 addc r11,r7,r11
818 adde r9,r8,r9
819 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
820 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
821
822
823 blr
824 .long 0
825 .byte 0,12,0x14,0,0,0,2,0
826 .long 0
827.size .bn_sqr_comba8,.-.bn_sqr_comba8
828
829#
830# NOTE: The following label name should be changed to
831# "bn_mul_comba4" i.e. remove the first dot
832# for the gcc compiler. This should be automatically
833# done in the build
834#
835
836.align 4
837.bn_mul_comba4:
838#
839# This is an optimized version of the bn_mul_comba4 routine.
840#
841# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
842# r3 contains r
843# r4 contains a
844# r5 contains b
845# r6, r7 are the 2 BN_ULONGs being multiplied.
846# r8, r9 are the results of the 32x32 giving 64 multiply.
847# r10, r11, r12 are the equivalents of c1, c2, and c3.
848#
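# mul_add_c(a[i],b[j],c1,c2,c3) in the comments below is the column
# operation of the portable C comba multiply: it accumulates the
# two-word product a[i]*b[j] into the three-word accumulator
# (c1,c2,c3).  Result word r[k] collects every product with i+j == k.
#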
849 xor r0,r0,r0 #r0=0. Used in addze below.
850 #mul_add_c(a[0],b[0],c1,c2,c3);
851 $LD r6,`0*$BNSZ`(r4)
852 $LD r7,`0*$BNSZ`(r5)
853 $UMULL r10,r6,r7
854 $UMULH r11,r6,r7
855 $ST r10,`0*$BNSZ`(r3) #r[0]=c1
856 #mul_add_c(a[0],b[1],c2,c3,c1);
857 $LD r7,`1*$BNSZ`(r5)
858 $UMULL r8,r6,r7
859 $UMULH r9,r6,r7
860 addc r11,r8,r11
861 adde r12,r9,r0
862 addze r10,r0
863 #mul_add_c(a[1],b[0],c2,c3,c1);
864 $LD r6, `1*$BNSZ`(r4)
865 $LD r7, `0*$BNSZ`(r5)
866 $UMULL r8,r6,r7
867 $UMULH r9,r6,r7
868 addc r11,r8,r11
869 adde r12,r9,r12
870 addze r10,r10
871 $ST r11,`1*$BNSZ`(r3) #r[1]=c2
872 #mul_add_c(a[2],b[0],c3,c1,c2);
873 $LD r6,`2*$BNSZ`(r4)
874 $UMULL r8,r6,r7
875 $UMULH r9,r6,r7
876 addc r12,r8,r12
877 adde r10,r9,r10
878 addze r11,r0
879 #mul_add_c(a[1],b[1],c3,c1,c2);
880 $LD r6,`1*$BNSZ`(r4)
881 $LD r7,`1*$BNSZ`(r5)
882 $UMULL r8,r6,r7
883 $UMULH r9,r6,r7
884 addc r12,r8,r12
885 adde r10,r9,r10
886 addze r11,r11
887 #mul_add_c(a[0],b[2],c3,c1,c2);
888 $LD r6,`0*$BNSZ`(r4)
889 $LD r7,`2*$BNSZ`(r5)
890 $UMULL r8,r6,r7
891 $UMULH r9,r6,r7
892 addc r12,r8,r12
893 adde r10,r9,r10
894 addze r11,r11
895 $ST r12,`2*$BNSZ`(r3) #r[2]=c3
896 #mul_add_c(a[0],b[3],c1,c2,c3);
897 $LD r7,`3*$BNSZ`(r5)
898 $UMULL r8,r6,r7
899 $UMULH r9,r6,r7
900 addc r10,r8,r10
901 adde r11,r9,r11
902 addze r12,r0
903 #mul_add_c(a[1],b[2],c1,c2,c3);
904 $LD r6,`1*$BNSZ`(r4)
905 $LD r7,`2*$BNSZ`(r5)
906 $UMULL r8,r6,r7
907 $UMULH r9,r6,r7
908 addc r10,r8,r10
909 adde r11,r9,r11
910 addze r12,r12
911 #mul_add_c(a[2],b[1],c1,c2,c3);
912 $LD r6,`2*$BNSZ`(r4)
913 $LD r7,`1*$BNSZ`(r5)
914 $UMULL r8,r6,r7
915 $UMULH r9,r6,r7
916 addc r10,r8,r10
917 adde r11,r9,r11
918 addze r12,r12
919 #mul_add_c(a[3],b[0],c1,c2,c3);
920 $LD r6,`3*$BNSZ`(r4)
921 $LD r7,`0*$BNSZ`(r5)
922 $UMULL r8,r6,r7
923 $UMULH r9,r6,r7
924 addc r10,r8,r10
925 adde r11,r9,r11
926 addze r12,r12
927 $ST r10,`3*$BNSZ`(r3) #r[3]=c1
928 #mul_add_c(a[3],b[1],c2,c3,c1);
929 $LD r7,`1*$BNSZ`(r5)
930 $UMULL r8,r6,r7
931 $UMULH r9,r6,r7
932 addc r11,r8,r11
933 adde r12,r9,r12
934 addze r10,r0
935 #mul_add_c(a[2],b[2],c2,c3,c1);
936 $LD r6,`2*$BNSZ`(r4)
937 $LD r7,`2*$BNSZ`(r5)
938 $UMULL r8,r6,r7
939 $UMULH r9,r6,r7
940 addc r11,r8,r11
941 adde r12,r9,r12
942 addze r10,r10
943 #mul_add_c(a[1],b[3],c2,c3,c1);
944 $LD r6,`1*$BNSZ`(r4)
945 $LD r7,`3*$BNSZ`(r5)
946 $UMULL r8,r6,r7
947 $UMULH r9,r6,r7
948 addc r11,r8,r11
949 adde r12,r9,r12
950 addze r10,r10
951 $ST r11,`4*$BNSZ`(r3) #r[4]=c2
952 #mul_add_c(a[2],b[3],c3,c1,c2);
953 $LD r6,`2*$BNSZ`(r4)
954 $UMULL r8,r6,r7
955 $UMULH r9,r6,r7
956 addc r12,r8,r12
957 adde r10,r9,r10
958 addze r11,r0
959 #mul_add_c(a[3],b[2],c3,c1,c2);
960 $LD r6,`3*$BNSZ`(r4)
961 $LD r7,`2*$BNSZ`(r5)
962 $UMULL r8,r6,r7
963 $UMULH r9,r6,r7
964 addc r12,r8,r12
965 adde r10,r9,r10
966 addze r11,r11
967 $ST r12,`5*$BNSZ`(r3) #r[5]=c3
968 #mul_add_c(a[3],b[3],c1,c2,c3);
969 $LD r7,`3*$BNSZ`(r5)
970 $UMULL r8,r6,r7
971 $UMULH r9,r6,r7
972 addc r10,r8,r10
973 adde r11,r9,r11
974
975 $ST r10,`6*$BNSZ`(r3) #r[6]=c1
976 $ST r11,`7*$BNSZ`(r3) #r[7]=c2
977 blr
978 .long 0
979 .byte 0,12,0x14,0,0,0,3,0
980 .long 0
981.size .bn_mul_comba4,.-.bn_mul_comba4
982
983#
984# NOTE: The following label name should be changed to
985# "bn_mul_comba8" i.e. remove the first dot
986# for the gcc compiler. This should be automatically
987# done in the build
988#
989
990.align 4
991.bn_mul_comba8:
992#
993# Optimized version of the bn_mul_comba8 routine.
994#
995# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
996# r3 contains r
997# r4 contains a
998# r5 contains b
999# r6, r7 are the 2 BN_ULONGs being multiplied.
1000# r8, r9 are the results of the 32x32 giving 64 multiply.
1001# r10, r11, r12 are the equivalents of c1, c2, and c3.
1002#
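# Same column-by-column (comba) scheme as bn_mul_comba4 above, extended
# to 8x8 words: columns k = 0..14 are accumulated into r[0]..r[14] and
# the final carry word is stored as r[15].
#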
1003 xor r0,r0,r0 #r0=0. Used in addze below.
1004
1005 #mul_add_c(a[0],b[0],c1,c2,c3);
1006 $LD r6,`0*$BNSZ`(r4) #a[0]
1007 $LD r7,`0*$BNSZ`(r5) #b[0]
1008 $UMULL r10,r6,r7
1009 $UMULH r11,r6,r7
1010 $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
1011 #mul_add_c(a[0],b[1],c2,c3,c1);
1012 $LD r7,`1*$BNSZ`(r5)
1013 $UMULL r8,r6,r7
1014 $UMULH r9,r6,r7
1015 addc r11,r11,r8
1016 addze r12,r9 # since we didn't set r12 to zero before.
1017 addze r10,r0
1018 #mul_add_c(a[1],b[0],c2,c3,c1);
1019 $LD r6,`1*$BNSZ`(r4)
1020 $LD r7,`0*$BNSZ`(r5)
1021 $UMULL r8,r6,r7
1022 $UMULH r9,r6,r7
1023 addc r11,r11,r8
1024 adde r12,r12,r9
1025 addze r10,r10
1026 $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
1027 #mul_add_c(a[2],b[0],c3,c1,c2);
1028 $LD r6,`2*$BNSZ`(r4)
1029 $UMULL r8,r6,r7
1030 $UMULH r9,r6,r7
1031 addc r12,r12,r8
1032 adde r10,r10,r9
1033 addze r11,r0
1034 #mul_add_c(a[1],b[1],c3,c1,c2);
1035 $LD r6,`1*$BNSZ`(r4)
1036 $LD r7,`1*$BNSZ`(r5)
1037 $UMULL r8,r6,r7
1038 $UMULH r9,r6,r7
1039 addc r12,r12,r8
1040 adde r10,r10,r9
1041 addze r11,r11
1042 #mul_add_c(a[0],b[2],c3,c1,c2);
1043 $LD r6,`0*$BNSZ`(r4)
1044 $LD r7,`2*$BNSZ`(r5)
1045 $UMULL r8,r6,r7
1046 $UMULH r9,r6,r7
1047 addc r12,r12,r8
1048 adde r10,r10,r9
1049 addze r11,r11
1050 $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
1051 #mul_add_c(a[0],b[3],c1,c2,c3);
1052 $LD r7,`3*$BNSZ`(r5)
1053 $UMULL r8,r6,r7
1054 $UMULH r9,r6,r7
1055 addc r10,r10,r8
1056 adde r11,r11,r9
1057 addze r12,r0
1058 #mul_add_c(a[1],b[2],c1,c2,c3);
1059 $LD r6,`1*$BNSZ`(r4)
1060 $LD r7,`2*$BNSZ`(r5)
1061 $UMULL r8,r6,r7
1062 $UMULH r9,r6,r7
1063 addc r10,r10,r8
1064 adde r11,r11,r9
1065 addze r12,r12
1066
1067 #mul_add_c(a[2],b[1],c1,c2,c3);
1068 $LD r6,`2*$BNSZ`(r4)
1069 $LD r7,`1*$BNSZ`(r5)
1070 $UMULL r8,r6,r7
1071 $UMULH r9,r6,r7
1072 addc r10,r10,r8
1073 adde r11,r11,r9
1074 addze r12,r12
1075 #mul_add_c(a[3],b[0],c1,c2,c3);
1076 $LD r6,`3*$BNSZ`(r4)
1077 $LD r7,`0*$BNSZ`(r5)
1078 $UMULL r8,r6,r7
1079 $UMULH r9,r6,r7
1080 addc r10,r10,r8
1081 adde r11,r11,r9
1082 addze r12,r12
1083 $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
1084 #mul_add_c(a[4],b[0],c2,c3,c1);
1085 $LD r6,`4*$BNSZ`(r4)
1086 $UMULL r8,r6,r7
1087 $UMULH r9,r6,r7
1088 addc r11,r11,r8
1089 adde r12,r12,r9
1090 addze r10,r0
1091 #mul_add_c(a[3],b[1],c2,c3,c1);
1092 $LD r6,`3*$BNSZ`(r4)
1093 $LD r7,`1*$BNSZ`(r5)
1094 $UMULL r8,r6,r7
1095 $UMULH r9,r6,r7
1096 addc r11,r11,r8
1097 adde r12,r12,r9
1098 addze r10,r10
1099 #mul_add_c(a[2],b[2],c2,c3,c1);
1100 $LD r6,`2*$BNSZ`(r4)
1101 $LD r7,`2*$BNSZ`(r5)
1102 $UMULL r8,r6,r7
1103 $UMULH r9,r6,r7
1104 addc r11,r11,r8
1105 adde r12,r12,r9
1106 addze r10,r10
1107 #mul_add_c(a[1],b[3],c2,c3,c1);
1108 $LD r6,`1*$BNSZ`(r4)
1109 $LD r7,`3*$BNSZ`(r5)
1110 $UMULL r8,r6,r7
1111 $UMULH r9,r6,r7
1112 addc r11,r11,r8
1113 adde r12,r12,r9
1114 addze r10,r10
1115 #mul_add_c(a[0],b[4],c2,c3,c1);
1116 $LD r6,`0*$BNSZ`(r4)
1117 $LD r7,`4*$BNSZ`(r5)
1118 $UMULL r8,r6,r7
1119 $UMULH r9,r6,r7
1120 addc r11,r11,r8
1121 adde r12,r12,r9
1122 addze r10,r10
1123 $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
1124 #mul_add_c(a[0],b[5],c3,c1,c2);
1125 $LD r7,`5*$BNSZ`(r5)
1126 $UMULL r8,r6,r7
1127 $UMULH r9,r6,r7
1128 addc r12,r12,r8
1129 adde r10,r10,r9
1130 addze r11,r0
1131 #mul_add_c(a[1],b[4],c3,c1,c2);
1132 $LD r6,`1*$BNSZ`(r4)
1133 $LD r7,`4*$BNSZ`(r5)
1134 $UMULL r8,r6,r7
1135 $UMULH r9,r6,r7
1136 addc r12,r12,r8
1137 adde r10,r10,r9
1138 addze r11,r11
1139 #mul_add_c(a[2],b[3],c3,c1,c2);
1140 $LD r6,`2*$BNSZ`(r4)
1141 $LD r7,`3*$BNSZ`(r5)
1142 $UMULL r8,r6,r7
1143 $UMULH r9,r6,r7
1144 addc r12,r12,r8
1145 adde r10,r10,r9
1146 addze r11,r11
1147 #mul_add_c(a[3],b[2],c3,c1,c2);
1148 $LD r6,`3*$BNSZ`(r4)
1149 $LD r7,`2*$BNSZ`(r5)
1150 $UMULL r8,r6,r7
1151 $UMULH r9,r6,r7
1152 addc r12,r12,r8
1153 adde r10,r10,r9
1154 addze r11,r11
1155 #mul_add_c(a[4],b[1],c3,c1,c2);
1156 $LD r6,`4*$BNSZ`(r4)
1157 $LD r7,`1*$BNSZ`(r5)
1158 $UMULL r8,r6,r7
1159 $UMULH r9,r6,r7
1160 addc r12,r12,r8
1161 adde r10,r10,r9
1162 addze r11,r11
1163 #mul_add_c(a[5],b[0],c3,c1,c2);
1164 $LD r6,`5*$BNSZ`(r4)
1165 $LD r7,`0*$BNSZ`(r5)
1166 $UMULL r8,r6,r7
1167 $UMULH r9,r6,r7
1168 addc r12,r12,r8
1169 adde r10,r10,r9
1170 addze r11,r11
1171 $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
1172 #mul_add_c(a[6],b[0],c1,c2,c3);
1173 $LD r6,`6*$BNSZ`(r4)
1174 $UMULL r8,r6,r7
1175 $UMULH r9,r6,r7
1176 addc r10,r10,r8
1177 adde r11,r11,r9
1178 addze r12,r0
1179 #mul_add_c(a[5],b[1],c1,c2,c3);
1180 $LD r6,`5*$BNSZ`(r4)
1181 $LD r7,`1*$BNSZ`(r5)
1182 $UMULL r8,r6,r7
1183 $UMULH r9,r6,r7
1184 addc r10,r10,r8
1185 adde r11,r11,r9
1186 addze r12,r12
1187 #mul_add_c(a[4],b[2],c1,c2,c3);
1188 $LD r6,`4*$BNSZ`(r4)
1189 $LD r7,`2*$BNSZ`(r5)
1190 $UMULL r8,r6,r7
1191 $UMULH r9,r6,r7
1192 addc r10,r10,r8
1193 adde r11,r11,r9
1194 addze r12,r12
1195 #mul_add_c(a[3],b[3],c1,c2,c3);
1196 $LD r6,`3*$BNSZ`(r4)
1197 $LD r7,`3*$BNSZ`(r5)
1198 $UMULL r8,r6,r7
1199 $UMULH r9,r6,r7
1200 addc r10,r10,r8
1201 adde r11,r11,r9
1202 addze r12,r12
1203 #mul_add_c(a[2],b[4],c1,c2,c3);
1204 $LD r6,`2*$BNSZ`(r4)
1205 $LD r7,`4*$BNSZ`(r5)
1206 $UMULL r8,r6,r7
1207 $UMULH r9,r6,r7
1208 addc r10,r10,r8
1209 adde r11,r11,r9
1210 addze r12,r12
1211 #mul_add_c(a[1],b[5],c1,c2,c3);
1212 $LD r6,`1*$BNSZ`(r4)
1213 $LD r7,`5*$BNSZ`(r5)
1214 $UMULL r8,r6,r7
1215 $UMULH r9,r6,r7
1216 addc r10,r10,r8
1217 adde r11,r11,r9
1218 addze r12,r12
1219 #mul_add_c(a[0],b[6],c1,c2,c3);
1220 $LD r6,`0*$BNSZ`(r4)
1221 $LD r7,`6*$BNSZ`(r5)
1222 $UMULL r8,r6,r7
1223 $UMULH r9,r6,r7
1224 addc r10,r10,r8
1225 adde r11,r11,r9
1226 addze r12,r12
1227 $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
1228 #mul_add_c(a[0],b[7],c2,c3,c1);
1229 $LD r7,`7*$BNSZ`(r5)
1230 $UMULL r8,r6,r7
1231 $UMULH r9,r6,r7
1232 addc r11,r11,r8
1233 adde r12,r12,r9
1234 addze r10,r0
1235 #mul_add_c(a[1],b[6],c2,c3,c1);
1236 $LD r6,`1*$BNSZ`(r4)
1237 $LD r7,`6*$BNSZ`(r5)
1238 $UMULL r8,r6,r7
1239 $UMULH r9,r6,r7
1240 addc r11,r11,r8
1241 adde r12,r12,r9
1242 addze r10,r10
1243 #mul_add_c(a[2],b[5],c2,c3,c1);
1244 $LD r6,`2*$BNSZ`(r4)
1245 $LD r7,`5*$BNSZ`(r5)
1246 $UMULL r8,r6,r7
1247 $UMULH r9,r6,r7
1248 addc r11,r11,r8
1249 adde r12,r12,r9
1250 addze r10,r10
1251 #mul_add_c(a[3],b[4],c2,c3,c1);
1252 $LD r6,`3*$BNSZ`(r4)
1253 $LD r7,`4*$BNSZ`(r5)
1254 $UMULL r8,r6,r7
1255 $UMULH r9,r6,r7
1256 addc r11,r11,r8
1257 adde r12,r12,r9
1258 addze r10,r10
1259 #mul_add_c(a[4],b[3],c2,c3,c1);
1260 $LD r6,`4*$BNSZ`(r4)
1261 $LD r7,`3*$BNSZ`(r5)
1262 $UMULL r8,r6,r7
1263 $UMULH r9,r6,r7
1264 addc r11,r11,r8
1265 adde r12,r12,r9
1266 addze r10,r10
1267 #mul_add_c(a[5],b[2],c2,c3,c1);
1268 $LD r6,`5*$BNSZ`(r4)
1269 $LD r7,`2*$BNSZ`(r5)
1270 $UMULL r8,r6,r7
1271 $UMULH r9,r6,r7
1272 addc r11,r11,r8
1273 adde r12,r12,r9
1274 addze r10,r10
1275 #mul_add_c(a[6],b[1],c2,c3,c1);
1276 $LD r6,`6*$BNSZ`(r4)
1277 $LD r7,`1*$BNSZ`(r5)
1278 $UMULL r8,r6,r7
1279 $UMULH r9,r6,r7
1280 addc r11,r11,r8
1281 adde r12,r12,r9
1282 addze r10,r10
1283 #mul_add_c(a[7],b[0],c2,c3,c1);
1284 $LD r6,`7*$BNSZ`(r4)
1285 $LD r7,`0*$BNSZ`(r5)
1286 $UMULL r8,r6,r7
1287 $UMULH r9,r6,r7
1288 addc r11,r11,r8
1289 adde r12,r12,r9
1290 addze r10,r10
1291 $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
1292 #mul_add_c(a[7],b[1],c3,c1,c2);
1293 $LD r7,`1*$BNSZ`(r5)
1294 $UMULL r8,r6,r7
1295 $UMULH r9,r6,r7
1296 addc r12,r12,r8
1297 adde r10,r10,r9
1298 addze r11,r0
1299 #mul_add_c(a[6],b[2],c3,c1,c2);
1300 $LD r6,`6*$BNSZ`(r4)
1301 $LD r7,`2*$BNSZ`(r5)
1302 $UMULL r8,r6,r7
1303 $UMULH r9,r6,r7
1304 addc r12,r12,r8
1305 adde r10,r10,r9
1306 addze r11,r11
1307 #mul_add_c(a[5],b[3],c3,c1,c2);
1308 $LD r6,`5*$BNSZ`(r4)
1309 $LD r7,`3*$BNSZ`(r5)
1310 $UMULL r8,r6,r7
1311 $UMULH r9,r6,r7
1312 addc r12,r12,r8
1313 adde r10,r10,r9
1314 addze r11,r11
1315 #mul_add_c(a[4],b[4],c3,c1,c2);
1316 $LD r6,`4*$BNSZ`(r4)
1317 $LD r7,`4*$BNSZ`(r5)
1318 $UMULL r8,r6,r7
1319 $UMULH r9,r6,r7
1320 addc r12,r12,r8
1321 adde r10,r10,r9
1322 addze r11,r11
1323 #mul_add_c(a[3],b[5],c3,c1,c2);
1324 $LD r6,`3*$BNSZ`(r4)
1325 $LD r7,`5*$BNSZ`(r5)
1326 $UMULL r8,r6,r7
1327 $UMULH r9,r6,r7
1328 addc r12,r12,r8
1329 adde r10,r10,r9
1330 addze r11,r11
1331 #mul_add_c(a[2],b[6],c3,c1,c2);
1332 $LD r6,`2*$BNSZ`(r4)
1333 $LD r7,`6*$BNSZ`(r5)
1334 $UMULL r8,r6,r7
1335 $UMULH r9,r6,r7
1336 addc r12,r12,r8
1337 adde r10,r10,r9
1338 addze r11,r11
1339 #mul_add_c(a[1],b[7],c3,c1,c2);
1340 $LD r6,`1*$BNSZ`(r4)
1341 $LD r7,`7*$BNSZ`(r5)
1342 $UMULL r8,r6,r7
1343 $UMULH r9,r6,r7
1344 addc r12,r12,r8
1345 adde r10,r10,r9
1346 addze r11,r11
1347 $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
1348 #mul_add_c(a[2],b[7],c1,c2,c3);
1349 $LD r6,`2*$BNSZ`(r4)
1350 $UMULL r8,r6,r7
1351 $UMULH r9,r6,r7
1352 addc r10,r10,r8
1353 adde r11,r11,r9
1354 addze r12,r0
1355 #mul_add_c(a[3],b[6],c1,c2,c3);
1356 $LD r6,`3*$BNSZ`(r4)
1357 $LD r7,`6*$BNSZ`(r5)
1358 $UMULL r8,r6,r7
1359 $UMULH r9,r6,r7
1360 addc r10,r10,r8
1361 adde r11,r11,r9
1362 addze r12,r12
1363 #mul_add_c(a[4],b[5],c1,c2,c3);
1364 $LD r6,`4*$BNSZ`(r4)
1365 $LD r7,`5*$BNSZ`(r5)
1366 $UMULL r8,r6,r7
1367 $UMULH r9,r6,r7
1368 addc r10,r10,r8
1369 adde r11,r11,r9
1370 addze r12,r12
1371 #mul_add_c(a[5],b[4],c1,c2,c3);
1372 $LD r6,`5*$BNSZ`(r4)
1373 $LD r7,`4*$BNSZ`(r5)
1374 $UMULL r8,r6,r7
1375 $UMULH r9,r6,r7
1376 addc r10,r10,r8
1377 adde r11,r11,r9
1378 addze r12,r12
1379 #mul_add_c(a[6],b[3],c1,c2,c3);
1380 $LD r6,`6*$BNSZ`(r4)
1381 $LD r7,`3*$BNSZ`(r5)
1382 $UMULL r8,r6,r7
1383 $UMULH r9,r6,r7
1384 addc r10,r10,r8
1385 adde r11,r11,r9
1386 addze r12,r12
1387 #mul_add_c(a[7],b[2],c1,c2,c3);
1388 $LD r6,`7*$BNSZ`(r4)
1389 $LD r7,`2*$BNSZ`(r5)
1390 $UMULL r8,r6,r7
1391 $UMULH r9,r6,r7
1392 addc r10,r10,r8
1393 adde r11,r11,r9
1394 addze r12,r12
1395 $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
1396 #mul_add_c(a[7],b[3],c2,c3,c1);
1397 $LD r7,`3*$BNSZ`(r5)
1398 $UMULL r8,r6,r7
1399 $UMULH r9,r6,r7
1400 addc r11,r11,r8
1401 adde r12,r12,r9
1402 addze r10,r0
1403 #mul_add_c(a[6],b[4],c2,c3,c1);
1404 $LD r6,`6*$BNSZ`(r4)
1405 $LD r7,`4*$BNSZ`(r5)
1406 $UMULL r8,r6,r7
1407 $UMULH r9,r6,r7
1408 addc r11,r11,r8
1409 adde r12,r12,r9
1410 addze r10,r10
1411 #mul_add_c(a[5],b[5],c2,c3,c1);
1412 $LD r6,`5*$BNSZ`(r4)
1413 $LD r7,`5*$BNSZ`(r5)
1414 $UMULL r8,r6,r7
1415 $UMULH r9,r6,r7
1416 addc r11,r11,r8
1417 adde r12,r12,r9
1418 addze r10,r10
1419 #mul_add_c(a[4],b[6],c2,c3,c1);
1420 $LD r6,`4*$BNSZ`(r4)
1421 $LD r7,`6*$BNSZ`(r5)
1422 $UMULL r8,r6,r7
1423 $UMULH r9,r6,r7
1424 addc r11,r11,r8
1425 adde r12,r12,r9
1426 addze r10,r10
1427 #mul_add_c(a[3],b[7],c2,c3,c1);
1428 $LD r6,`3*$BNSZ`(r4)
1429 $LD r7,`7*$BNSZ`(r5)
1430 $UMULL r8,r6,r7
1431 $UMULH r9,r6,r7
1432 addc r11,r11,r8
1433 adde r12,r12,r9
1434 addze r10,r10
1435 $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
1436 #mul_add_c(a[4],b[7],c3,c1,c2);
1437 $LD r6,`4*$BNSZ`(r4)
1438 $UMULL r8,r6,r7
1439 $UMULH r9,r6,r7
1440 addc r12,r12,r8
1441 adde r10,r10,r9
1442 addze r11,r0
1443 #mul_add_c(a[5],b[6],c3,c1,c2);
1444 $LD r6,`5*$BNSZ`(r4)
1445 $LD r7,`6*$BNSZ`(r5)
1446 $UMULL r8,r6,r7
1447 $UMULH r9,r6,r7
1448 addc r12,r12,r8
1449 adde r10,r10,r9
1450 addze r11,r11
1451 #mul_add_c(a[6],b[5],c3,c1,c2);
1452 $LD r6,`6*$BNSZ`(r4)
1453 $LD r7,`5*$BNSZ`(r5)
1454 $UMULL r8,r6,r7
1455 $UMULH r9,r6,r7
1456 addc r12,r12,r8
1457 adde r10,r10,r9
1458 addze r11,r11
1459 #mul_add_c(a[7],b[4],c3,c1,c2);
1460 $LD r6,`7*$BNSZ`(r4)
1461 $LD r7,`4*$BNSZ`(r5)
1462 $UMULL r8,r6,r7
1463 $UMULH r9,r6,r7
1464 addc r12,r12,r8
1465 adde r10,r10,r9
1466 addze r11,r11
1467 $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
1468 #mul_add_c(a[7],b[5],c1,c2,c3);
1469 $LD r7,`5*$BNSZ`(r5)
1470 $UMULL r8,r6,r7
1471 $UMULH r9,r6,r7
1472 addc r10,r10,r8
1473 adde r11,r11,r9
1474 addze r12,r0
1475 #mul_add_c(a[6],b[6],c1,c2,c3);
1476 $LD r6,`6*$BNSZ`(r4)
1477 $LD r7,`6*$BNSZ`(r5)
1478 $UMULL r8,r6,r7
1479 $UMULH r9,r6,r7
1480 addc r10,r10,r8
1481 adde r11,r11,r9
1482 addze r12,r12
1483 #mul_add_c(a[5],b[7],c1,c2,c3);
1484 $LD r6,`5*$BNSZ`(r4)
1485 $LD r7,`7*$BNSZ`(r5)
1486 $UMULL r8,r6,r7
1487 $UMULH r9,r6,r7
1488 addc r10,r10,r8
1489 adde r11,r11,r9
1490 addze r12,r12
1491 $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
1492 #mul_add_c(a[6],b[7],c2,c3,c1);
1493 $LD r6,`6*$BNSZ`(r4)
1494 $UMULL r8,r6,r7
1495 $UMULH r9,r6,r7
1496 addc r11,r11,r8
1497 adde r12,r12,r9
1498 addze r10,r0
1499 #mul_add_c(a[7],b[6],c2,c3,c1);
1500 $LD r6,`7*$BNSZ`(r4)
1501 $LD r7,`6*$BNSZ`(r5)
1502 $UMULL r8,r6,r7
1503 $UMULH r9,r6,r7
1504 addc r11,r11,r8
1505 adde r12,r12,r9
1506 addze r10,r10
1507 $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
1508 #mul_add_c(a[7],b[7],c3,c1,c2);
1509 $LD r7,`7*$BNSZ`(r5)
1510 $UMULL r8,r6,r7
1511 $UMULH r9,r6,r7
1512 addc r12,r12,r8
1513 adde r10,r10,r9
1514 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
1515 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
1516 blr
1517 .long 0
1518 .byte 0,12,0x14,0,0,0,3,0
1519 .long 0
1520.size .bn_mul_comba8,.-.bn_mul_comba8
1521
1522#
1523# NOTE: The following label name should be changed to
1524# "bn_sub_words" i.e. remove the first dot
1525# for the gcc compiler. This should be automatically
1526# done in the build
1527#
1528#
1529.align 4
1530.bn_sub_words:
1531#
1532# Handcoded version of bn_sub_words
1533#
1534#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1535#
1536# r3 = r
1537# r4 = a
1538# r5 = b
1539# r6 = n
1540#
1541# Note: No loop unrolling done since this is not a performance
1542# critical loop.
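#
# In C terms this computes r[i] = a[i] - b[i] - borrow for i = 0..n-1
# and returns the final borrow (0 or 1).  The PowerPC carry bit is the
# complement of a borrow, so the code seeds CA with subfc., chains the
# borrow through subfe, and converts CA back into a borrow at the end
# with subfze/andi.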
1543
1544 xor r0,r0,r0 #set r0 = 0
1545#
1546# check for r6 = 0 AND set carry bit.
1547#
1548 subfc. r7,r0,r6 # If r6 is 0 then result is 0.
1549 # if r6 > 0 then result !=0
1550 # In either case carry bit is set.
1551 beq Lppcasm_sub_adios
1552 addi r4,r4,-$BNSZ
1553 addi r3,r3,-$BNSZ
1554 addi r5,r5,-$BNSZ
1555 mtctr r6
1556Lppcasm_sub_mainloop:
1557 $LDU r7,$BNSZ(r4)
1558 $LDU r8,$BNSZ(r5)
1559 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
1560 # if carry = 1 this is r7-r8. Else it
1561 # is r7-r8 -1 as we need.
1562 $STU r6,$BNSZ(r3)
1563 bdnz Lppcasm_sub_mainloop
1564Lppcasm_sub_adios:
1565 subfze r3,r0 # if carry bit is set then r3 = 0 else -1
1566 andi. r3,r3,1 # keep only last bit.
1567 blr
1568 .long 0
1569 .byte 0,12,0x14,0,0,0,4,0
1570 .long 0
1571.size .bn_sub_words,.-.bn_sub_words
1572
1573#
1574# NOTE: The following label name should be changed to
1575# "bn_add_words" i.e. remove the first dot
1576# for the gcc compiler. This should be automatically
1577# done in the build
1578#
1579
1580.align 4
1581.bn_add_words:
1582#
1583# Handcoded version of bn_add_words
1584#
1585#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1586#
1587# r3 = r
1588# r4 = a
1589# r5 = b
1590# r6 = n
1591#
1592# Note: No loop unrolling done since this is not a performance
1593# critical loop.
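#
# In C terms this computes r[i] = a[i] + b[i] + carry for i = 0..n-1
# and returns the final carry (0 or 1), produced by the addze at
# Lppcasm_add_adios.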
1594
1595 xor r0,r0,r0
1596#
1597# check for r6 = 0. Is this needed?
1598#
1599 addic. r6,r6,0 #test r6 and clear carry bit.
1600 beq Lppcasm_add_adios
1601 addi r4,r4,-$BNSZ
1602 addi r3,r3,-$BNSZ
1603 addi r5,r5,-$BNSZ
1604 mtctr r6
1605Lppcasm_add_mainloop:
1606 $LDU r7,$BNSZ(r4)
1607 $LDU r8,$BNSZ(r5)
1608 adde r8,r7,r8
1609 $STU r8,$BNSZ(r3)
1610 bdnz Lppcasm_add_mainloop
1611Lppcasm_add_adios:
1612 addze r3,r0 #return carry bit.
1613 blr
1614 .long 0
1615 .byte 0,12,0x14,0,0,0,4,0
1616 .long 0
1617.size .bn_add_words,.-.bn_add_words
1618
1619#
1620# NOTE: The following label name should be changed to
1621# "bn_div_words" i.e. remove the first dot
1622# for the gcc compiler. This should be automatically
1623# done in the build
1624#
1625
1626.align 4
1627.bn_div_words:
1628#
1629# This is a cleaned-up version of code generated by
1630# the AIX compiler. The only optimization is to use
1631# the PPC instruction to count leading zeros instead
1632# of a call to num_bits_word. Since this was compiled
1633# only at level -O2, it can probably be squeezed further.
1634#
1635# r3 = h
1636# r4 = l
1637# r5 = d
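#
# Returns the one-word quotient of the two-word value (h,l) divided by
# d; callers arrange h < d so that the quotient fits in a single word.
# The two passes of the outer loop below produce the high and low
# half-word (BN_BITS4-bit) digits of the quotient using the usual
# schoolbook estimate-and-correct method.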
1638
1639 $UCMPI 0,r5,0 # compare r5 and 0
1640 bne Lppcasm_div1 # proceed if d!=0
1641 li r3,-1 # d=0 return -1
1642 blr
1643Lppcasm_div1:
1644 xor r0,r0,r0 #r0=0
1645 li r8,$BITS
1646 $CNTLZ. r7,r5 #r7 = num leading 0s in d.
1647 beq Lppcasm_div2 #proceed if no leading zeros
1648 subf r8,r7,r8 #r8 = BN_num_bits_word(d)
1649 $SHR. r9,r3,r8 #are there any bits above r8'th?
1650 $TR 16,r9,r0 #if there're, signal to dump core...
1651Lppcasm_div2:
1652 $UCMP 0,r3,r5 #h>=d?
1653 blt Lppcasm_div3 #goto Lppcasm_div3 if not
1654 subf r3,r5,r3 #h-=d ;
1655Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
1656 cmpi 0,0,r7,0 # is (i == 0)?
1657 beq Lppcasm_div4
1658 $SHL r3,r3,r7 # h = (h<< i)
1659 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
1660 $SHL r5,r5,r7 # d<<=i
1661 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
1662 $SHL r4,r4,r7 # l <<=i
1663Lppcasm_div4:
1664 $SHRI r9,r5,`$BITS/2` # r9 = dh
1665 # dl will be computed when needed
1666 # as it saves registers.
1667 li r6,2 #r6=2
1668 mtctr r6 #counter will be in count.
1669Lppcasm_divouterloop:
1670 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
1671 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
1672 # compute here for innerloop.
1673 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
1674 bne Lppcasm_div5 # goto Lppcasm_div5 if not
1675
1676 li r8,-1
1677 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
1678 b Lppcasm_div6
1679Lppcasm_div5:
1680 $UDIV r8,r3,r9 #q = h/dh
1681Lppcasm_div6:
1682 $UMULL r12,r9,r8 #th = q*dh
1683 $CLRU r10,r5,`$BITS/2` #r10=dl
1684 $UMULL r6,r8,r10 #tl = q*dl
1685
1686Lppcasm_divinnerloop:
1687 subf r10,r12,r3 #t = h -th
1688 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
1689 addic. r7,r7,0 #test if r7 == 0. used below.
1690 # now want to compute
1691 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1692 # the following 2 instructions do that
1693 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
1694 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
1695 $UCMP cr1,r6,r7 # compare (tl <= r7)
1696 bne Lppcasm_divinnerexit
1697 ble cr1,Lppcasm_divinnerexit
1698 addi r8,r8,-1 #q--
1699 subf r12,r9,r12 #th -=dh
1700 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
1701 subf r6,r10,r6 #tl -=dl
1702 b Lppcasm_divinnerloop
1703Lppcasm_divinnerexit:
1704 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
1705 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
1706 $UCMP cr1,r4,r11 # compare l and tl
1707 add r12,r12,r10 # th+=t
1708 bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
1709 addi r12,r12,1 # th++
1710Lppcasm_div7:
1711 subf r11,r11,r4 #r11=l-tl
1712 $UCMP cr1,r3,r12 #compare h and th
1713 bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
1714 addi r8,r8,-1 # q--
1715 add r3,r5,r3 # h+=d
1716Lppcasm_div8:
1717 subf r12,r12,r3 #r12 = h-th
1718 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
1719 # want to compute
1720 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1721 # the following 2 instructions will do this.
1722 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
1723 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
1724 bdz Lppcasm_div9 #if (count==0) break ;
1725 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
1726 b Lppcasm_divouterloop
1727Lppcasm_div9:
1728 or r3,r8,r0
1729 blr
1730 .long 0
1731 .byte 0,12,0x14,0,0,0,3,0
1732 .long 0
1733.size .bn_div_words,.-.bn_div_words
1734
1735#
1736# NOTE: The following label name should be changed to
1737# "bn_sqr_words" i.e. remove the first dot
1738# for the gcc compiler. This should be automatically
1739# done in the build
1740#
1741.align 4
1742.bn_sqr_words:
1743#
1744# Optimized version of bn_sqr_words
1745#
1746# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1747#
1748# r3 = r
1749# r4 = a
1750# r5 = n
1751#
1752# r6 = a[i].
1753# r7,r8 = product.
1754#
1755# No unrolling done here. Not performance critical.
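#
# In C terms: for each i the two-word square of a[i] is written to
# r[2*i] (low word) and r[2*i+1] (high word), so r must have room for
# 2*n words.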
1756
1757 addic. r5,r5,0 #test r5.
1758 beq Lppcasm_sqr_adios
1759 addi r4,r4,-$BNSZ
1760 addi r3,r3,-$BNSZ
1761 mtctr r5
1762Lppcasm_sqr_mainloop:
1763 #sqr(r[0],r[1],a[0]);
1764 $LDU r6,$BNSZ(r4)
1765 $UMULL r7,r6,r6
1766 $UMULH r8,r6,r6
1767 $STU r7,$BNSZ(r3)
1768 $STU r8,$BNSZ(r3)
1769 bdnz Lppcasm_sqr_mainloop
1770Lppcasm_sqr_adios:
1771 blr
1772 .long 0
1773 .byte 0,12,0x14,0,0,0,3,0
1774 .long 0
1775.size .bn_sqr_words,.-.bn_sqr_words
1776
1777#
1778# NOTE: The following label name should be changed to
1779# "bn_mul_words" i.e. remove the first dot
1780# for the gcc compiler. This should be automatically
1781# done in the build
1782#
1783
1784.align 4
1785.bn_mul_words:
1786#
1787# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1788#
1789# r3 = rp
1790# r4 = ap
1791# r5 = num
1792# r6 = w
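#
# In C terms: for i = 0..num-1, rp[i] gets the low word of ap[i]*w plus
# the incoming carry, and the high word becomes the carry into the next
# iteration; the final carry is returned.  The main loop is unrolled
# four times and defers the addze on the high word by folding the
# pending carry into the next iteration's add with adde instead (hence
# the commented-out addze instructions below).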
1793 xor r0,r0,r0
1794 xor r12,r12,r12 # used for carry
1795 rlwinm. r7,r5,30,2,31 # num >> 2
1796 beq Lppcasm_mw_REM
1797 mtctr r7
1798Lppcasm_mw_LOOP:
1799 #mul(rp[0],ap[0],w,c1);
1800 $LD r8,`0*$BNSZ`(r4)
1801 $UMULL r9,r6,r8
1802 $UMULH r10,r6,r8
1803 addc r9,r9,r12
1804 #addze r10,r10 #carry is NOT ignored.
1805 #will be taken care of
1806 #in second spin below
1807 #using adde.
1808 $ST r9,`0*$BNSZ`(r3)
1809 #mul(rp[1],ap[1],w,c1);
1810 $LD r8,`1*$BNSZ`(r4)
1811 $UMULL r11,r6,r8
1812 $UMULH r12,r6,r8
1813 adde r11,r11,r10
1814 #addze r12,r12
1815 $ST r11,`1*$BNSZ`(r3)
1816 #mul(rp[2],ap[2],w,c1);
1817 $LD r8,`2*$BNSZ`(r4)
1818 $UMULL r9,r6,r8
1819 $UMULH r10,r6,r8
1820 adde r9,r9,r12
1821 #addze r10,r10
1822 $ST r9,`2*$BNSZ`(r3)
1823 #mul_add(rp[3],ap[3],w,c1);
1824 $LD r8,`3*$BNSZ`(r4)
1825 $UMULL r11,r6,r8
1826 $UMULH r12,r6,r8
1827 adde r11,r11,r10
1828 addze r12,r12 #this spin we collect carry into
1829 #r12
1830 $ST r11,`3*$BNSZ`(r3)
1831
1832 addi r3,r3,`4*$BNSZ`
1833 addi r4,r4,`4*$BNSZ`
1834 bdnz Lppcasm_mw_LOOP
1835
1836Lppcasm_mw_REM:
1837 andi. r5,r5,0x3
1838 beq Lppcasm_mw_OVER
1839 #mul(rp[0],ap[0],w,c1);
1840 $LD r8,`0*$BNSZ`(r4)
1841 $UMULL r9,r6,r8
1842 $UMULH r10,r6,r8
1843 addc r9,r9,r12
1844 addze r10,r10
1845 $ST r9,`0*$BNSZ`(r3)
1846 addi r12,r10,0
1847
1848 addi r5,r5,-1
1849 cmpli 0,0,r5,0
1850 beq Lppcasm_mw_OVER
1851
1852
1853 #mul(rp[1],ap[1],w,c1);
1854 $LD r8,`1*$BNSZ`(r4)
1855 $UMULL r9,r6,r8
1856 $UMULH r10,r6,r8
1857 addc r9,r9,r12
1858 addze r10,r10
1859 $ST r9,`1*$BNSZ`(r3)
1860 addi r12,r10,0
1861
1862 addi r5,r5,-1
1863 cmpli 0,0,r5,0
1864 beq Lppcasm_mw_OVER
1865
1866 #mul_add(rp[2],ap[2],w,c1);
1867 $LD r8,`2*$BNSZ`(r4)
1868 $UMULL r9,r6,r8
1869 $UMULH r10,r6,r8
1870 addc r9,r9,r12
1871 addze r10,r10
1872 $ST r9,`2*$BNSZ`(r3)
1873 addi r12,r10,0
1874
1875Lppcasm_mw_OVER:
1876 addi r3,r12,0
1877 blr
1878 .long 0
1879 .byte 0,12,0x14,0,0,0,4,0
1880 .long 0
1881.size .bn_mul_words,.-.bn_mul_words
1882
1883#
1884# NOTE: The following label name should be changed to
1885# "bn_mul_add_words" i.e. remove the first dot
1886# for the gcc compiler. This should be automatically
1887# done in the build
1888#
1889
1890.align 4
1891.bn_mul_add_words:
1892#
1893# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1894#
1895# r3 = rp
1896# r4 = ap
1897# r5 = num
1898# r6 = w
1899#
1900# empirical evidence suggests that the unrolled version performs best!!
1901#
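# In C terms: for i = 0..num-1, rp[i] gets the low word of
# rp[i] + ap[i]*w + carry, and the high word plus the carries from the
# two additions becomes the carry into the next iteration; the final
# carry is returned.  As in bn_mul_words above, the four-way unrolled
# main loop defers one addze per step by collecting the pending carry
# with adde in the following step.
#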
1902 xor r0,r0,r0 #r0 = 0
1903 xor r12,r12,r12 #r12 = 0 . used for carry
1904 rlwinm. r7,r5,30,2,31 # num >> 2
1905 beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
1906 mtctr r7
1907Lppcasm_maw_mainloop:
1908 #mul_add(rp[0],ap[0],w,c1);
1909 $LD r8,`0*$BNSZ`(r4)
1910 $LD r11,`0*$BNSZ`(r3)
1911 $UMULL r9,r6,r8
1912 $UMULH r10,r6,r8
1913 addc r9,r9,r12 #r12 is carry.
1914 addze r10,r10
1915 addc r9,r9,r11
1916 #addze r10,r10
1917 #the above instruction addze
1918 #is NOT needed. Carry will NOT
1919 #be ignored. It's not affected
1920 #by multiply and will be collected
1921 #in the next spin
1922 $ST r9,`0*$BNSZ`(r3)
1923
1924 #mul_add(rp[1],ap[1],w,c1);
1925 $LD r8,`1*$BNSZ`(r4)
1926 $LD r9,`1*$BNSZ`(r3)
1927 $UMULL r11,r6,r8
1928 $UMULH r12,r6,r8
1929 adde r11,r11,r10 #r10 is carry.
1930 addze r12,r12
1931 addc r11,r11,r9
1932 #addze r12,r12
1933 $ST r11,`1*$BNSZ`(r3)
1934
1935 #mul_add(rp[2],ap[2],w,c1);
1936 $LD r8,`2*$BNSZ`(r4)
1937 $UMULL r9,r6,r8
1938 $LD r11,`2*$BNSZ`(r3)
1939 $UMULH r10,r6,r8
1940 adde r9,r9,r12
1941 addze r10,r10
1942 addc r9,r9,r11
1943 #addze r10,r10
1944 $ST r9,`2*$BNSZ`(r3)
1945
1946 #mul_add(rp[3],ap[3],w,c1);
1947 $LD r8,`3*$BNSZ`(r4)
1948 $UMULL r11,r6,r8
1949 $LD r9,`3*$BNSZ`(r3)
1950 $UMULH r12,r6,r8
1951 adde r11,r11,r10
1952 addze r12,r12
1953 addc r11,r11,r9
1954 addze r12,r12
1955 $ST r11,`3*$BNSZ`(r3)
1956 addi r3,r3,`4*$BNSZ`
1957 addi r4,r4,`4*$BNSZ`
1958 bdnz Lppcasm_maw_mainloop
1959
1960Lppcasm_maw_leftover:
1961 andi. r5,r5,0x3
1962 beq Lppcasm_maw_adios
1963 addi r3,r3,-$BNSZ
1964 addi r4,r4,-$BNSZ
1965 #mul_add(rp[0],ap[0],w,c1);
1966 mtctr r5
1967 $LDU r8,$BNSZ(r4)
1968 $UMULL r9,r6,r8
1969 $UMULH r10,r6,r8
1970 $LDU r11,$BNSZ(r3)
1971 addc r9,r9,r11
1972 addze r10,r10
1973 addc r9,r9,r12
1974 addze r12,r10
1975 $ST r9,0(r3)
1976
1977 bdz Lppcasm_maw_adios
1978 #mul_add(rp[1],ap[1],w,c1);
1979 $LDU r8,$BNSZ(r4)
1980 $UMULL r9,r6,r8
1981 $UMULH r10,r6,r8
1982 $LDU r11,$BNSZ(r3)
1983 addc r9,r9,r11
1984 addze r10,r10
1985 addc r9,r9,r12
1986 addze r12,r10
1987 $ST r9,0(r3)
1988
1989 bdz Lppcasm_maw_adios
1990 #mul_add(rp[2],ap[2],w,c1);
1991 $LDU r8,$BNSZ(r4)
1992 $UMULL r9,r6,r8
1993 $UMULH r10,r6,r8
1994 $LDU r11,$BNSZ(r3)
1995 addc r9,r9,r11
1996 addze r10,r10
1997 addc r9,r9,r12
1998 addze r12,r10
1999 $ST r9,0(r3)
2000
2001Lppcasm_maw_adios:
2002 addi r3,r12,0
2003 blr
2004 .long 0
2005 .byte 0,12,0x14,0,0,0,4,0
2006 .long 0
2007.size .bn_mul_add_words,.-.bn_mul_add_words
2008 .align 4
2009EOF
2010$data =~ s/\`([^\`]*)\`/eval $1/gem;
2011print $data;
2012close STDOUT or die "error closing STDOUT: $!";