ppc.pl@ 94082

Last change on this file since 94082 was 94082, checked in by vboxsync, 3 years ago
libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128
File size: 44.7 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the Apache License 2.0 (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9	# Implemented as a Perl wrapper as we want to support several different
10	# architectures with single file. We pick up the target based on the
11	# file name we are asked to generate.
12	#
13	# It should be noted though that this perl code is nothing like
14	# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
15	# as pre-processor to cover for platform differences in name decoration,
16	# linker tables, 32-/64-bit instruction sets...
17	#
18	# As you might know, there are several PowerPC ABIs in use. Most notably,
19	# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
20	# are similar enough to implement leaf(!) functions that are ABI
21	# neutral. And that's what you find here: ABI-neutral leaf functions.
22	# In case you wonder what that is...
23	#
24	# AIX performance
25	#
26	# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
27	#
28	# The following is the performance of 32-bit compiler
29	# generated code:
30	#
31	# OpenSSL 0.9.6c 21 dec 2001
32	# built on: Tue Jun 11 11:06:51 EDT 2002
33	# options:bn(64,32) ...
34	#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
35	# sign verify sign/s verify/s
36	#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
37	#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
38	#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
39	#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
40	#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
41	#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
42	#
43	# Same benchmark with this assembler code:
44	#
45	#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
46	#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
47	#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
48	#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
49	#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
50	#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
51	#
52	# Number of operations increases by almost 75%
53	#
54	# Here are performance numbers for 64-bit compiler
55	# generated code:
56	#
57	# OpenSSL 0.9.6g [engine] 9 Aug 2002
58	# built on: Fri Apr 18 16:59:20 EDT 2003
59	# options:bn(64,64) ...
60	# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
61	# sign verify sign/s verify/s
62	#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
63	#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
64	#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
65	#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
66	#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
67	#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
68	#
69	# Same benchmark with this assembler code:
70	#
71	#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
72	#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
73	#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
74	#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
75	#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
76	#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
77	#
78	# Again, performance increases by about 75%
79	#
80	# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
81	# OpenSSL 0.9.7c 30 Sep 2003
82	#
83	# Original code.
84	#
85	#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
86	#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
87	#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
88	#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
89	#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
90	#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
91	#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
92	#
93	# Same benchmark with this assembler code:
94	#
95	#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
96	#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
97	#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
98	#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
99	#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
100	#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
101	#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
102	#
103	# Performance increase of ~60%
104	# Based on submission from Suresh N. Chari of IBM
105
106	# $output is the last argument if it looks like a file (it has an extension)
107	# $flavour is the first argument if it doesn't look like a file
108	$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
109	$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
110	# Pick the word size, ISA name and mnemonics for the requested flavour.
111	if ($flavour =~ /32/) {
112	$BITS= 32;
113	$BNSZ= $BITS/8; # bytes per BN_ULONG word; used below to index a[], b[] and r[]
114	$ISA= "\"ppc\"";
115
116	$LD= "lwz"; # load
117	$LDU= "lwzu"; # load and update
118	$ST= "stw"; # store
119	$STU= "stwu"; # store and update
120	$UMULL= "mullw"; # unsigned multiply low
121	$UMULH= "mulhwu"; # unsigned multiply high
122	$UDIV= "divwu"; # unsigned divide
123	$UCMPI= "cmplwi"; # unsigned compare with immediate
124	$UCMP= "cmplw"; # unsigned compare
125	$CNTLZ= "cntlzw"; # count leading zeros
126	$SHL= "slw"; # shift left
127	$SHR= "srw"; # unsigned shift right
128	$SHRI= "srwi"; # unsigned shift right by immediate
129	$SHLI= "slwi"; # shift left by immediate
130	$CLRU= "clrlwi"; # clear upper bits
131	$INSR= "insrwi"; # insert right
132	$ROTL= "rotlwi"; # rotate left by immediate
133	$TR= "tw"; # conditional trap
134	} elsif ($flavour =~ /64/) {
135	$BITS= 64;
136	$BNSZ= $BITS/8; # bytes per BN_ULONG word; used below to index a[], b[] and r[]
137	$ISA= "\"ppc64\"";
138
139	# same as above, but 64-bit mnemonics...
140	$LD= "ld"; # load
141	$LDU= "ldu"; # load and update
142	$ST= "std"; # store
143	$STU= "stdu"; # store and update
144	$UMULL= "mulld"; # unsigned multiply low
145	$UMULH= "mulhdu"; # unsigned multiply high
146	$UDIV= "divdu"; # unsigned divide
147	$UCMPI= "cmpldi"; # unsigned compare with immediate
148	$UCMP= "cmpld"; # unsigned compare
149	$CNTLZ= "cntlzd"; # count leading zeros
150	$SHL= "sld"; # shift left
151	$SHR= "srd"; # unsigned shift right
152	$SHRI= "srdi"; # unsigned shift right by immediate
153	$SHLI= "sldi"; # shift left by immediate
154	$CLRU= "clrldi"; # clear upper bits
155	$INSR= "insrdi"; # insert right
156	$ROTL= "rotldi"; # rotate left by immediate
157	$TR= "td"; # conditional trap
158	} else { die "nonsense $flavour"; } # flavour names neither 32 nor 64: give up
159	# Look for ppc-xlate.pl next to this script, then under ../../perlasm/.
160	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
161	( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
162	( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
163	die "can't locate ppc-xlate.pl";
164	# From here on STDOUT is a pipe into the translator, which is given the flavour and output name.
165	open STDOUT,"\| $^X $xlate $flavour \"$output\""
166	or die "can't call $xlate: $!";
167
168	$data=<<EOF;
169	#--------------------------------------------------------------------
170	#
171	#
172	#
173	#
174	# File: ppc32.s
175	#
176	# Created by: Suresh Chari
177	# IBM Thomas J. Watson Research Library
178	# Hawthorne, NY
179	#
180	#
181	# Description: Optimized assembly routines for OpenSSL crypto
182	# on the 32-bit PowerPC platform.
183	#
184	#
185	# Version History
186	#
187	# 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
188	# cleaned up code. Also made a single version which can
189	# be used for both the AIX and Linux compilers. See NOTE
190	# below.
191	# 12/05/03 Suresh Chari
192	# (with lots of help from) Andy Polyakov
193	##
194	# 1. Initial version 10/20/02 Suresh Chari
195	#
196	#
197	# The following file works for the xlc,cc
198	# and gcc compilers.
199	#
200	# NOTE: To get the file to link correctly with the gcc compiler
201	# you have to change the names of the routines and remove
202	# the first .(dot) character. This should automatically
203	# be done in the build process.
204	#
205	# Hand optimized assembly code for the following routines
206	#
207	# bn_sqr_comba4
208	# bn_sqr_comba8
209	# bn_mul_comba4
210	# bn_mul_comba8
211	# bn_sub_words
212	# bn_add_words
213	# bn_div_words
214	# bn_sqr_words
215	# bn_mul_words
216	# bn_mul_add_words
217	#
218	# NOTE: It is possible to optimize this code more for
219	# specific PowerPC or Power architectures. On the Northstar
220	# architecture the optimizations in this file do
221	# NOT provide much improvement.
222	#
223	# If you have comments or suggestions to improve code send
224	# me a note at schari\@us.ibm.com
225	#
226	#--------------------------------------------------------------------------
227	#
228	# Defines to be used in the assembly code.
229	#
230	#.set r0,0 # we use it as storage for value of 0
231	#.set SP,1 # preserved
232	#.set RTOC,2 # preserved
233	#.set r3,3 # 1st argument/return value
234	#.set r4,4 # 2nd argument/volatile register
235	#.set r5,5 # 3rd argument/volatile register
236	#.set r6,6 # ...
237	#.set r7,7
238	#.set r8,8
239	#.set r9,9
240	#.set r10,10
241	#.set r11,11
242	#.set r12,12
243	#.set r13,13 # not used, nor any other "below" it...
244
245	# Declare the entry points of the routines listed in the header as global.
246	# NOTE: For gcc these names MUST be changed to remove
247	# the leading dot, e.g. change ".bn_sqr_comba4"
248	# to "bn_sqr_comba4". This should be done automatically
249	# by the build.
250
251	.globl .bn_sqr_comba4
252	.globl .bn_sqr_comba8
253	.globl .bn_mul_comba4
254	.globl .bn_mul_comba8
255	.globl .bn_sub_words
256	.globl .bn_add_words
257	.globl .bn_div_words
258	.globl .bn_sqr_words
259	.globl .bn_mul_words
260	.globl .bn_mul_add_words
261
262	# Start of the .text section
263
264	.machine "any"
265	.text
266
267	#
268	# NOTE: The following label name should be changed to
269	# "bn_sqr_comba4" i.e. remove the first dot
270	# for the gcc compiler. This should be automatically
271	# done in the build
272	#
273
274	.align 4
275	.bn_sqr_comba4:
276	#
277	# Optimized version of bn_sqr_comba4.
278	# Squares the four-word number a and stores the eight result words in r.
279	# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
280	# r3 contains r
281	# r4 contains a
282	#
283	# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
284	#
285	# r5,r6 are the two BN_ULONGs being multiplied.
286	# r7,r8 are the low and high words of the double-width product.
287	# r9,r10, r11 are the equivalents of c1,c2, c3.
288	# r0 is kept at 0 so that addze can pick up carries.
289	#
290	#
291	xor r0,r0,r0 # set r0 = 0. Used in the addze
292	# instructions below
293
294	#sqr_add_c(a,0,c1,c2,c3)
295	$LD r5,`0*$BNSZ`(r4)
296	$UMULL r9,r5,r5
297	$UMULH r10,r5,r5 #in first iteration. No need
298	#to add since c1=c2=c3=0.
299	# Note c3(r11) is NOT set to 0
300	# but will be.
301
302	$ST r9,`0*$BNSZ`(r3) # r[0]=c1;
303	# sqr_add_c2(a,1,0,c2,c3,c1);
304	$LD r6,`1*$BNSZ`(r4)
305	$UMULL r7,r5,r6
306	$UMULH r8,r5,r6
307
308	addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
309	adde r8,r8,r8
310	addze r9,r0 # catch carry if any.
311	# r9= r0(=0) and carry
312
313	addc r10,r7,r10 # now add to temp result.
314	addze r11,r8 # r8 added to r11 which is 0
315	addze r9,r9
316
317	$ST r10,`1*$BNSZ`(r3) #r[1]=c2;
318	#sqr_add_c(a,1,c3,c1,c2)
319	$UMULL r7,r6,r6 # r6 still holds a[1]
320	$UMULH r8,r6,r6
321	addc r11,r7,r11
322	adde r9,r8,r9
323	addze r10,r0
324	#sqr_add_c2(a,2,0,c3,c1,c2)
325	$LD r6,`2*$BNSZ`(r4) # r5 still holds a[0]
326	$UMULL r7,r5,r6
327	$UMULH r8,r5,r6
328	# double the product; the carry out of the doubling goes to c2 (r10)
329	addc r7,r7,r7
330	adde r8,r8,r8
331	addze r10,r10
332	# then add the doubled product into (c3,c1,c2)
333	addc r11,r7,r11
334	adde r9,r8,r9
335	addze r10,r10
336	$ST r11,`2*$BNSZ`(r3) #r[2]=c3
337	#sqr_add_c2(a,3,0,c1,c2,c3);
338	$LD r6,`3*$BNSZ`(r4) # r5 still holds a[0]
339	$UMULL r7,r5,r6
340	$UMULH r8,r5,r6
341	addc r7,r7,r7
342	adde r8,r8,r8
343	addze r11,r0 # c3 starts over as just the carry
344
345	addc r9,r7,r9
346	adde r10,r8,r10
347	addze r11,r11
348	#sqr_add_c2(a,2,1,c1,c2,c3);
349	$LD r5,`1*$BNSZ`(r4)
350	$LD r6,`2*$BNSZ`(r4)
351	$UMULL r7,r5,r6
352	$UMULH r8,r5,r6
353
354	addc r7,r7,r7
355	adde r8,r8,r8
356	addze r11,r11
357	addc r9,r7,r9
358	adde r10,r8,r10
359	addze r11,r11
360	$ST r9,`3*$BNSZ`(r3) #r[3]=c1
361	#sqr_add_c(a,2,c2,c3,c1);
362	$UMULL r7,r6,r6 # r6 still holds a[2]
363	$UMULH r8,r6,r6
364	addc r10,r7,r10
365	adde r11,r8,r11
366	addze r9,r0
367	#sqr_add_c2(a,3,1,c2,c3,c1);
368	$LD r6,`3*$BNSZ`(r4) # r5 still holds a[1]
369	$UMULL r7,r5,r6
370	$UMULH r8,r5,r6
371	addc r7,r7,r7
372	adde r8,r8,r8
373	addze r9,r9
374
375	addc r10,r7,r10
376	adde r11,r8,r11
377	addze r9,r9
378	$ST r10,`4*$BNSZ`(r3) #r[4]=c2
379	#sqr_add_c2(a,3,2,c3,c1,c2);
380	$LD r5,`2*$BNSZ`(r4) # r6 still holds a[3]
381	$UMULL r7,r5,r6
382	$UMULH r8,r5,r6
383	addc r7,r7,r7
384	adde r8,r8,r8
385	addze r10,r0
386
387	addc r11,r7,r11
388	adde r9,r8,r9
389	addze r10,r10
390	$ST r11,`5*$BNSZ`(r3) #r[5] = c3
391	#sqr_add_c(a,3,c1,c2,c3);
392	$UMULL r7,r6,r6 # r6 still holds a[3]
393	$UMULH r8,r6,r6
394	addc r9,r7,r9
395	adde r10,r8,r10
396	# last column: no third accumulator word is collected, only c1 and c2 are stored
397	$ST r9,`6*$BNSZ`(r3) #r[6]=c1
398	$ST r10,`7*$BNSZ`(r3) #r[7]=c2
399	blr
400	.long 0 # NOTE(review): this and the next two data lines look like a traceback table - confirm
401	.byte 0,12,0x14,0,0,0,2,0
402	.long 0
403	.size .bn_sqr_comba4,.-.bn_sqr_comba4
404
405	#
406	# NOTE: The following label name should be changed to
407	# "bn_sqr_comba8" i.e. remove the first dot
408	# for the gcc compiler. This should be automatically
409	# done in the build
410	#
411
412	.align 4
413	.bn_sqr_comba8:
414	#
415	# This is an optimized version of the bn_sqr_comba8 routine.
416	# Tightly uses the adde instruction
417	# Each sqr_add_c2 step adds the product (r8,r7) in twice rather than doubling it first.
418	#
419	# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
420	# r3 contains r
421	# r4 contains a
422	#
423	# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
424	#
425	# r5,r6 are the two BN_ULONGs being multiplied.
426	# r7,r8 are the low and high words of the double-width product.
427	# r9,r10, r11 are the equivalents of c1,c2, c3.
428	#
429	# Possible optimization of loading all 8 longs of a into registers
430	# doesn't provide any speedup
431	#
432
433	xor r0,r0,r0 #set r0 = 0.Used in addze
434	#instructions below.
435
436	#sqr_add_c(a,0,c1,c2,c3);
437	$LD r5,`0*$BNSZ`(r4)
438	$UMULL r9,r5,r5 #1st iteration: no carries.
439	$UMULH r10,r5,r5
440	$ST r9,`0*$BNSZ`(r3) # r[0]=c1;
441	#sqr_add_c2(a,1,0,c2,c3,c1);
442	$LD r6,`1*$BNSZ`(r4)
443	$UMULL r7,r5,r6
444	$UMULH r8,r5,r6
445
446	addc r10,r7,r10 #add the two register number
447	adde r11,r8,r0 # (r8,r7) to the three register
448	addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
449
450	addc r10,r7,r10 #add the two register number
451	adde r11,r8,r11 # (r8,r7) to the three register
452	addze r9,r9 # number (r9,r11,r10).
453
454	$ST r10,`1*$BNSZ`(r3) # r[1]=c2
455
456	#sqr_add_c(a,1,c3,c1,c2);
457	$UMULL r7,r6,r6 # r6 still holds a[1]
458	$UMULH r8,r6,r6
459	addc r11,r7,r11
460	adde r9,r8,r9
461	addze r10,r0
462	#sqr_add_c2(a,2,0,c3,c1,c2);
463	$LD r6,`2*$BNSZ`(r4) # r5 still holds a[0]
464	$UMULL r7,r5,r6
465	$UMULH r8,r5,r6
466
467	addc r11,r7,r11
468	adde r9,r8,r9
469	addze r10,r10
470
471	addc r11,r7,r11
472	adde r9,r8,r9
473	addze r10,r10
474
475	$ST r11,`2*$BNSZ`(r3) #r[2]=c3
476	#sqr_add_c2(a,3,0,c1,c2,c3);
477	$LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
478	$UMULL r7,r5,r6
479	$UMULH r8,r5,r6
480
481	addc r9,r7,r9
482	adde r10,r8,r10
483	addze r11,r0
484
485	addc r9,r7,r9
486	adde r10,r8,r10
487	addze r11,r11
488	#sqr_add_c2(a,2,1,c1,c2,c3);
489	$LD r5,`1*$BNSZ`(r4)
490	$LD r6,`2*$BNSZ`(r4)
491	$UMULL r7,r5,r6
492	$UMULH r8,r5,r6
493
494	addc r9,r7,r9
495	adde r10,r8,r10
496	addze r11,r11
497
498	addc r9,r7,r9
499	adde r10,r8,r10
500	addze r11,r11
501
502	$ST r9,`3*$BNSZ`(r3) #r[3]=c1;
503	#sqr_add_c(a,2,c2,c3,c1);
504	$UMULL r7,r6,r6 # r6 still holds a[2]
505	$UMULH r8,r6,r6
506
507	addc r10,r7,r10
508	adde r11,r8,r11
509	addze r9,r0
510	#sqr_add_c2(a,3,1,c2,c3,c1);
511	$LD r6,`3*$BNSZ`(r4) # r5 still holds a[1]
512	$UMULL r7,r5,r6
513	$UMULH r8,r5,r6
514
515	addc r10,r7,r10
516	adde r11,r8,r11
517	addze r9,r9
518
519	addc r10,r7,r10
520	adde r11,r8,r11
521	addze r9,r9
522	#sqr_add_c2(a,4,0,c2,c3,c1);
523	$LD r5,`0*$BNSZ`(r4)
524	$LD r6,`4*$BNSZ`(r4)
525	$UMULL r7,r5,r6
526	$UMULH r8,r5,r6
527
528	addc r10,r7,r10
529	adde r11,r8,r11
530	addze r9,r9
531
532	addc r10,r7,r10
533	adde r11,r8,r11
534	addze r9,r9
535	$ST r10,`4*$BNSZ`(r3) #r[4]=c2;
536	#sqr_add_c2(a,5,0,c3,c1,c2);
537	$LD r6,`5*$BNSZ`(r4) # r5 still holds a[0]
538	$UMULL r7,r5,r6
539	$UMULH r8,r5,r6
540
541	addc r11,r7,r11
542	adde r9,r8,r9
543	addze r10,r0
544
545	addc r11,r7,r11
546	adde r9,r8,r9
547	addze r10,r10
548	#sqr_add_c2(a,4,1,c3,c1,c2);
549	$LD r5,`1*$BNSZ`(r4)
550	$LD r6,`4*$BNSZ`(r4)
551	$UMULL r7,r5,r6
552	$UMULH r8,r5,r6
553
554	addc r11,r7,r11
555	adde r9,r8,r9
556	addze r10,r10
557
558	addc r11,r7,r11
559	adde r9,r8,r9
560	addze r10,r10
561	#sqr_add_c2(a,3,2,c3,c1,c2);
562	$LD r5,`2*$BNSZ`(r4)
563	$LD r6,`3*$BNSZ`(r4)
564	$UMULL r7,r5,r6
565	$UMULH r8,r5,r6
566
567	addc r11,r7,r11
568	adde r9,r8,r9
569	addze r10,r10
570
571	addc r11,r7,r11
572	adde r9,r8,r9
573	addze r10,r10
574	$ST r11,`5*$BNSZ`(r3) #r[5]=c3;
575	#sqr_add_c(a,3,c1,c2,c3);
576	$UMULL r7,r6,r6 # r6 still holds a[3]
577	$UMULH r8,r6,r6
578	addc r9,r7,r9
579	adde r10,r8,r10
580	addze r11,r0
581	#sqr_add_c2(a,4,2,c1,c2,c3);
582	$LD r6,`4*$BNSZ`(r4) # r5 still holds a[2]
583	$UMULL r7,r5,r6
584	$UMULH r8,r5,r6
585
586	addc r9,r7,r9
587	adde r10,r8,r10
588	addze r11,r11
589
590	addc r9,r7,r9
591	adde r10,r8,r10
592	addze r11,r11
593	#sqr_add_c2(a,5,1,c1,c2,c3);
594	$LD r5,`1*$BNSZ`(r4)
595	$LD r6,`5*$BNSZ`(r4)
596	$UMULL r7,r5,r6
597	$UMULH r8,r5,r6
598
599	addc r9,r7,r9
600	adde r10,r8,r10
601	addze r11,r11
602
603	addc r9,r7,r9
604	adde r10,r8,r10
605	addze r11,r11
606	#sqr_add_c2(a,6,0,c1,c2,c3);
607	$LD r5,`0*$BNSZ`(r4)
608	$LD r6,`6*$BNSZ`(r4)
609	$UMULL r7,r5,r6
610	$UMULH r8,r5,r6
611	addc r9,r7,r9
612	adde r10,r8,r10
613	addze r11,r11
614	addc r9,r7,r9
615	adde r10,r8,r10
616	addze r11,r11
617	$ST r9,`6*$BNSZ`(r3) #r[6]=c1;
618	#sqr_add_c2(a,7,0,c2,c3,c1);
619	$LD r6,`7*$BNSZ`(r4) # r5 still holds a[0]
620	$UMULL r7,r5,r6
621	$UMULH r8,r5,r6
622
623	addc r10,r7,r10
624	adde r11,r8,r11
625	addze r9,r0
626	addc r10,r7,r10
627	adde r11,r8,r11
628	addze r9,r9
629	#sqr_add_c2(a,6,1,c2,c3,c1);
630	$LD r5,`1*$BNSZ`(r4)
631	$LD r6,`6*$BNSZ`(r4)
632	$UMULL r7,r5,r6
633	$UMULH r8,r5,r6
634
635	addc r10,r7,r10
636	adde r11,r8,r11
637	addze r9,r9
638	addc r10,r7,r10
639	adde r11,r8,r11
640	addze r9,r9
641	#sqr_add_c2(a,5,2,c2,c3,c1);
642	$LD r5,`2*$BNSZ`(r4)
643	$LD r6,`5*$BNSZ`(r4)
644	$UMULL r7,r5,r6
645	$UMULH r8,r5,r6
646	addc r10,r7,r10
647	adde r11,r8,r11
648	addze r9,r9
649	addc r10,r7,r10
650	adde r11,r8,r11
651	addze r9,r9
652	#sqr_add_c2(a,4,3,c2,c3,c1);
653	$LD r5,`3*$BNSZ`(r4)
654	$LD r6,`4*$BNSZ`(r4)
655	$UMULL r7,r5,r6
656	$UMULH r8,r5,r6
657
658	addc r10,r7,r10
659	adde r11,r8,r11
660	addze r9,r9
661	addc r10,r7,r10
662	adde r11,r8,r11
663	addze r9,r9
664	$ST r10,`7*$BNSZ`(r3) #r[7]=c2;
665	#sqr_add_c(a,4,c3,c1,c2);
666	$UMULL r7,r6,r6 # r6 still holds a[4]
667	$UMULH r8,r6,r6
668	addc r11,r7,r11
669	adde r9,r8,r9
670	addze r10,r0
671	#sqr_add_c2(a,5,3,c3,c1,c2);
672	$LD r6,`5*$BNSZ`(r4) # r5 still holds a[3]
673	$UMULL r7,r5,r6
674	$UMULH r8,r5,r6
675	addc r11,r7,r11
676	adde r9,r8,r9
677	addze r10,r10
678	addc r11,r7,r11
679	adde r9,r8,r9
680	addze r10,r10
681	#sqr_add_c2(a,6,2,c3,c1,c2);
682	$LD r5,`2*$BNSZ`(r4)
683	$LD r6,`6*$BNSZ`(r4)
684	$UMULL r7,r5,r6
685	$UMULH r8,r5,r6
686	addc r11,r7,r11
687	adde r9,r8,r9
688	addze r10,r10
689
690	addc r11,r7,r11
691	adde r9,r8,r9
692	addze r10,r10
693	#sqr_add_c2(a,7,1,c3,c1,c2);
694	$LD r5,`1*$BNSZ`(r4)
695	$LD r6,`7*$BNSZ`(r4)
696	$UMULL r7,r5,r6
697	$UMULH r8,r5,r6
698	addc r11,r7,r11
699	adde r9,r8,r9
700	addze r10,r10
701	addc r11,r7,r11
702	adde r9,r8,r9
703	addze r10,r10
704	$ST r11,`8*$BNSZ`(r3) #r[8]=c3;
705	#sqr_add_c2(a,7,2,c1,c2,c3);
706	$LD r5,`2*$BNSZ`(r4) # r6 still holds a[7]
707	$UMULL r7,r5,r6
708	$UMULH r8,r5,r6
709
710	addc r9,r7,r9
711	adde r10,r8,r10
712	addze r11,r0
713	addc r9,r7,r9
714	adde r10,r8,r10
715	addze r11,r11
716	#sqr_add_c2(a,6,3,c1,c2,c3);
717	$LD r5,`3*$BNSZ`(r4)
718	$LD r6,`6*$BNSZ`(r4)
719	$UMULL r7,r5,r6
720	$UMULH r8,r5,r6
721	addc r9,r7,r9
722	adde r10,r8,r10
723	addze r11,r11
724	addc r9,r7,r9
725	adde r10,r8,r10
726	addze r11,r11
727	#sqr_add_c2(a,5,4,c1,c2,c3);
728	$LD r5,`4*$BNSZ`(r4)
729	$LD r6,`5*$BNSZ`(r4)
730	$UMULL r7,r5,r6
731	$UMULH r8,r5,r6
732	addc r9,r7,r9
733	adde r10,r8,r10
734	addze r11,r11
735	addc r9,r7,r9
736	adde r10,r8,r10
737	addze r11,r11
738	$ST r9,`9*$BNSZ`(r3) #r[9]=c1;
739	#sqr_add_c(a,5,c2,c3,c1);
740	$UMULL r7,r6,r6 # r6 still holds a[5]
741	$UMULH r8,r6,r6
742	addc r10,r7,r10
743	adde r11,r8,r11
744	addze r9,r0
745	#sqr_add_c2(a,6,4,c2,c3,c1);
746	$LD r6,`6*$BNSZ`(r4) # r5 still holds a[4]
747	$UMULL r7,r5,r6
748	$UMULH r8,r5,r6
749	addc r10,r7,r10
750	adde r11,r8,r11
751	addze r9,r9
752	addc r10,r7,r10
753	adde r11,r8,r11
754	addze r9,r9
755	#sqr_add_c2(a,7,3,c2,c3,c1);
756	$LD r5,`3*$BNSZ`(r4)
757	$LD r6,`7*$BNSZ`(r4)
758	$UMULL r7,r5,r6
759	$UMULH r8,r5,r6
760	addc r10,r7,r10
761	adde r11,r8,r11
762	addze r9,r9
763	addc r10,r7,r10
764	adde r11,r8,r11
765	addze r9,r9
766	$ST r10,`10*$BNSZ`(r3) #r[10]=c2;
767	#sqr_add_c2(a,7,4,c3,c1,c2);
768	$LD r5,`4*$BNSZ`(r4) # r6 still holds a[7]
769	$UMULL r7,r5,r6
770	$UMULH r8,r5,r6
771	addc r11,r7,r11
772	adde r9,r8,r9
773	addze r10,r0
774	addc r11,r7,r11
775	adde r9,r8,r9
776	addze r10,r10
777	#sqr_add_c2(a,6,5,c3,c1,c2);
778	$LD r5,`5*$BNSZ`(r4)
779	$LD r6,`6*$BNSZ`(r4)
780	$UMULL r7,r5,r6
781	$UMULH r8,r5,r6
782	addc r11,r7,r11
783	adde r9,r8,r9
784	addze r10,r10
785	addc r11,r7,r11
786	adde r9,r8,r9
787	addze r10,r10
788	$ST r11,`11*$BNSZ`(r3) #r[11]=c3;
789	#sqr_add_c(a,6,c1,c2,c3);
790	$UMULL r7,r6,r6 # r6 still holds a[6]
791	$UMULH r8,r6,r6
792	addc r9,r7,r9
793	adde r10,r8,r10
794	addze r11,r0
795	#sqr_add_c2(a,7,5,c1,c2,c3)
796	$LD r6,`7*$BNSZ`(r4) # r5 still holds a[5]
797	$UMULL r7,r5,r6
798	$UMULH r8,r5,r6
799	addc r9,r7,r9
800	adde r10,r8,r10
801	addze r11,r11
802	addc r9,r7,r9
803	adde r10,r8,r10
804	addze r11,r11
805	$ST r9,`12*$BNSZ`(r3) #r[12]=c1;
806
807	#sqr_add_c2(a,7,6,c2,c3,c1)
808	$LD r5,`6*$BNSZ`(r4) # r6 still holds a[7]
809	$UMULL r7,r5,r6
810	$UMULH r8,r5,r6
811	addc r10,r7,r10
812	adde r11,r8,r11
813	addze r9,r0
814	addc r10,r7,r10
815	adde r11,r8,r11
816	addze r9,r9
817	$ST r10,`13*$BNSZ`(r3) #r[13]=c2;
818	#sqr_add_c(a,7,c3,c1,c2);
819	$UMULL r7,r6,r6 # r6 still holds a[7]
820	$UMULH r8,r6,r6
821	addc r11,r7,r11
822	adde r9,r8,r9
823	$ST r11,`14*$BNSZ`(r3) #r[14]=c3;
824	$ST r9, `15*$BNSZ`(r3) #r[15]=c1;
825
826
827	blr
828	.long 0 # NOTE(review): this and the next two data lines look like a traceback table - confirm
829	.byte 0,12,0x14,0,0,0,2,0
830	.long 0
831	.size .bn_sqr_comba8,.-.bn_sqr_comba8
832
833	#
834	# NOTE: The following label name should be changed to
835	# "bn_mul_comba4" i.e. remove the first dot
836	# for the gcc compiler. This should be automatically
837	# done in the build
838	#
839
840	.align 4
841	.bn_mul_comba4:
842	#
843	# This is an optimized version of the bn_mul_comba4 routine.
844	# Multiplies the four-word numbers a and b and stores the eight result words in r.
845	# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
846	# r3 contains r
847	# r4 contains a
848	# r5 contains b
849	# r6, r7 are the 2 BN_ULONGs being multiplied.
850	# r8, r9 are the low and high words of the double-width product.
851	# r10, r11, r12 are the equivalents of c1, c2, and c3.
852	#
853	xor r0,r0,r0 #r0=0. Used in addze below.
854	#mul_add_c(a[0],b[0],c1,c2,c3);
855	$LD r6,`0*$BNSZ`(r4)
856	$LD r7,`0*$BNSZ`(r5)
857	$UMULL r10,r6,r7
858	$UMULH r11,r6,r7
859	$ST r10,`0*$BNSZ`(r3) #r[0]=c1
860	#mul_add_c(a[0],b[1],c2,c3,c1);
861	$LD r7,`1*$BNSZ`(r5) # r6 still holds a[0]
862	$UMULL r8,r6,r7
863	$UMULH r9,r6,r7
864	addc r11,r8,r11
865	adde r12,r9,r0 # c3 = high word + carry; r12 was not cleared beforehand
866	addze r10,r0
867	#mul_add_c(a[1],b[0],c2,c3,c1);
868	$LD r6, `1*$BNSZ`(r4)
869	$LD r7, `0*$BNSZ`(r5)
870	$UMULL r8,r6,r7
871	$UMULH r9,r6,r7
872	addc r11,r8,r11
873	adde r12,r9,r12
874	addze r10,r10
875	$ST r11,`1*$BNSZ`(r3) #r[1]=c2
876	#mul_add_c(a[2],b[0],c3,c1,c2);
877	$LD r6,`2*$BNSZ`(r4) # r7 still holds b[0]
878	$UMULL r8,r6,r7
879	$UMULH r9,r6,r7
880	addc r12,r8,r12
881	adde r10,r9,r10
882	addze r11,r0
883	#mul_add_c(a[1],b[1],c3,c1,c2);
884	$LD r6,`1*$BNSZ`(r4)
885	$LD r7,`1*$BNSZ`(r5)
886	$UMULL r8,r6,r7
887	$UMULH r9,r6,r7
888	addc r12,r8,r12
889	adde r10,r9,r10
890	addze r11,r11
891	#mul_add_c(a[0],b[2],c3,c1,c2);
892	$LD r6,`0*$BNSZ`(r4)
893	$LD r7,`2*$BNSZ`(r5)
894	$UMULL r8,r6,r7
895	$UMULH r9,r6,r7
896	addc r12,r8,r12
897	adde r10,r9,r10
898	addze r11,r11
899	$ST r12,`2*$BNSZ`(r3) #r[2]=c3
900	#mul_add_c(a[0],b[3],c1,c2,c3);
901	$LD r7,`3*$BNSZ`(r5) # r6 still holds a[0]
902	$UMULL r8,r6,r7
903	$UMULH r9,r6,r7
904	addc r10,r8,r10
905	adde r11,r9,r11
906	addze r12,r0
907	#mul_add_c(a[1],b[2],c1,c2,c3);
908	$LD r6,`1*$BNSZ`(r4)
909	$LD r7,`2*$BNSZ`(r5)
910	$UMULL r8,r6,r7
911	$UMULH r9,r6,r7
912	addc r10,r8,r10
913	adde r11,r9,r11
914	addze r12,r12
915	#mul_add_c(a[2],b[1],c1,c2,c3);
916	$LD r6,`2*$BNSZ`(r4)
917	$LD r7,`1*$BNSZ`(r5)
918	$UMULL r8,r6,r7
919	$UMULH r9,r6,r7
920	addc r10,r8,r10
921	adde r11,r9,r11
922	addze r12,r12
923	#mul_add_c(a[3],b[0],c1,c2,c3);
924	$LD r6,`3*$BNSZ`(r4)
925	$LD r7,`0*$BNSZ`(r5)
926	$UMULL r8,r6,r7
927	$UMULH r9,r6,r7
928	addc r10,r8,r10
929	adde r11,r9,r11
930	addze r12,r12
931	$ST r10,`3*$BNSZ`(r3) #r[3]=c1
932	#mul_add_c(a[3],b[1],c2,c3,c1);
933	$LD r7,`1*$BNSZ`(r5) # r6 still holds a[3]
934	$UMULL r8,r6,r7
935	$UMULH r9,r6,r7
936	addc r11,r8,r11
937	adde r12,r9,r12
938	addze r10,r0
939	#mul_add_c(a[2],b[2],c2,c3,c1);
940	$LD r6,`2*$BNSZ`(r4)
941	$LD r7,`2*$BNSZ`(r5)
942	$UMULL r8,r6,r7
943	$UMULH r9,r6,r7
944	addc r11,r8,r11
945	adde r12,r9,r12
946	addze r10,r10
947	#mul_add_c(a[1],b[3],c2,c3,c1);
948	$LD r6,`1*$BNSZ`(r4)
949	$LD r7,`3*$BNSZ`(r5)
950	$UMULL r8,r6,r7
951	$UMULH r9,r6,r7
952	addc r11,r8,r11
953	adde r12,r9,r12
954	addze r10,r10
955	$ST r11,`4*$BNSZ`(r3) #r[4]=c2
956	#mul_add_c(a[2],b[3],c3,c1,c2);
957	$LD r6,`2*$BNSZ`(r4) # r7 still holds b[3]
958	$UMULL r8,r6,r7
959	$UMULH r9,r6,r7
960	addc r12,r8,r12
961	adde r10,r9,r10
962	addze r11,r0
963	#mul_add_c(a[3],b[2],c3,c1,c2);
964	$LD r6,`3*$BNSZ`(r4)
965	$LD r7,`2*$BNSZ`(r5)
966	$UMULL r8,r6,r7
967	$UMULH r9,r6,r7
968	addc r12,r8,r12
969	adde r10,r9,r10
970	addze r11,r11
971	$ST r12,`5*$BNSZ`(r3) #r[5]=c3
972	#mul_add_c(a[3],b[3],c1,c2,c3);
973	$LD r7,`3*$BNSZ`(r5) # r6 still holds a[3]
974	$UMULL r8,r6,r7
975	$UMULH r9,r6,r7
976	addc r10,r8,r10
977	adde r11,r9,r11
978	# last column: no third accumulator word is collected, only c1 and c2 are stored
979	$ST r10,`6*$BNSZ`(r3) #r[6]=c1
980	$ST r11,`7*$BNSZ`(r3) #r[7]=c2
981	blr
982	.long 0 # NOTE(review): this and the next two data lines look like a traceback table - confirm
983	.byte 0,12,0x14,0,0,0,3,0
984	.long 0
985	.size .bn_mul_comba4,.-.bn_mul_comba4
986
987	#
988	# NOTE: The following label name should be changed to
989	# "bn_mul_comba8" i.e. remove the first dot
990	# for the gcc compiler. This should be automatically
991	# done in the build
992	#
993
994	.align 4
995	.bn_mul_comba8:
996	#
997	# Optimized version of the bn_mul_comba8 routine.
998	#
999	# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1000	# r3 contains r
1001	# r4 contains a
1002	# r5 contains b
1003	# r6, r7 are the 2 BN_ULONGs being multiplied.
1004	# r8, r9 are the results of the 32x32 giving 64 multiply.
1005	# r10, r11, r12 are the equivalents of c1, c2, and c3.
1006	#
1007	xor r0,r0,r0 #r0=0. Used in addze below.
1008
1009	#mul_add_c(a[0],b[0],c1,c2,c3);
1010	$LD r6,`0*$BNSZ`(r4) #a[0]
1011	$LD r7,`0*$BNSZ`(r5) #b[0]
1012	$UMULL r10,r6,r7
1013	$UMULH r11,r6,r7
1014	$ST r10,`0*$BNSZ`(r3) #r[0]=c1;
1015	#mul_add_c(a[0],b[1],c2,c3,c1);
1016	$LD r7,`1*$BNSZ`(r5)
1017	$UMULL r8,r6,r7
1018	$UMULH r9,r6,r7
1019	addc r11,r11,r8
1020	addze r12,r9 # since we didn't set r12 to zero before.
1021	addze r10,r0
1022	#mul_add_c(a[1],b[0],c2,c3,c1);
1023	$LD r6,`1*$BNSZ`(r4)
1024	$LD r7,`0*$BNSZ`(r5)
1025	$UMULL r8,r6,r7
1026	$UMULH r9,r6,r7
1027	addc r11,r11,r8
1028	adde r12,r12,r9
1029	addze r10,r10
1030	$ST r11,`1*$BNSZ`(r3) #r[1]=c2;
1031	#mul_add_c(a[2],b[0],c3,c1,c2);
1032	$LD r6,`2*$BNSZ`(r4)
1033	$UMULL r8,r6,r7
1034	$UMULH r9,r6,r7
1035	addc r12,r12,r8
1036	adde r10,r10,r9
1037	addze r11,r0
1038	#mul_add_c(a[1],b[1],c3,c1,c2);
1039	$LD r6,`1*$BNSZ`(r4)
1040	$LD r7,`1*$BNSZ`(r5)
1041	$UMULL r8,r6,r7
1042	$UMULH r9,r6,r7
1043	addc r12,r12,r8
1044	adde r10,r10,r9
1045	addze r11,r11
1046	#mul_add_c(a[0],b[2],c3,c1,c2);
1047	$LD r6,`0*$BNSZ`(r4)
1048	$LD r7,`2*$BNSZ`(r5)
1049	$UMULL r8,r6,r7
1050	$UMULH r9,r6,r7
1051	addc r12,r12,r8
1052	adde r10,r10,r9
1053	addze r11,r11
1054	$ST r12,`2*$BNSZ`(r3) #r[2]=c3;
1055	#mul_add_c(a[0],b[3],c1,c2,c3);
1056	$LD r7,`3*$BNSZ`(r5)
1057	$UMULL r8,r6,r7
1058	$UMULH r9,r6,r7
1059	addc r10,r10,r8
1060	adde r11,r11,r9
1061	addze r12,r0
1062	#mul_add_c(a[1],b[2],c1,c2,c3);
1063	$LD r6,`1*$BNSZ`(r4)
1064	$LD r7,`2*$BNSZ`(r5)
1065	$UMULL r8,r6,r7
1066	$UMULH r9,r6,r7
1067	addc r10,r10,r8
1068	adde r11,r11,r9
1069	addze r12,r12
1070
1071	#mul_add_c(a[2],b[1],c1,c2,c3);
1072	$LD r6,`2*$BNSZ`(r4)
1073	$LD r7,`1*$BNSZ`(r5)
1074	$UMULL r8,r6,r7
1075	$UMULH r9,r6,r7
1076	addc r10,r10,r8
1077	adde r11,r11,r9
1078	addze r12,r12
1079	#mul_add_c(a[3],b[0],c1,c2,c3);
1080	$LD r6,`3*$BNSZ`(r4)
1081	$LD r7,`0*$BNSZ`(r5)
1082	$UMULL r8,r6,r7
1083	$UMULH r9,r6,r7
1084	addc r10,r10,r8
1085	adde r11,r11,r9
1086	addze r12,r12
1087	$ST r10,`3*$BNSZ`(r3) #r[3]=c1;
1088	#mul_add_c(a[4],b[0],c2,c3,c1);
1089	$LD r6,`4*$BNSZ`(r4)
1090	$UMULL r8,r6,r7
1091	$UMULH r9,r6,r7
1092	addc r11,r11,r8
1093	adde r12,r12,r9
1094	addze r10,r0
1095	#mul_add_c(a[3],b[1],c2,c3,c1);
1096	$LD r6,`3*$BNSZ`(r4)
1097	$LD r7,`1*$BNSZ`(r5)
1098	$UMULL r8,r6,r7
1099	$UMULH r9,r6,r7
1100	addc r11,r11,r8
1101	adde r12,r12,r9
1102	addze r10,r10
1103	#mul_add_c(a[2],b[2],c2,c3,c1);
1104	$LD r6,`2*$BNSZ`(r4)
1105	$LD r7,`2*$BNSZ`(r5)
1106	$UMULL r8,r6,r7
1107	$UMULH r9,r6,r7
1108	addc r11,r11,r8
1109	adde r12,r12,r9
1110	addze r10,r10
1111	#mul_add_c(a[1],b[3],c2,c3,c1);
1112	$LD r6,`1*$BNSZ`(r4)
1113	$LD r7,`3*$BNSZ`(r5)
1114	$UMULL r8,r6,r7
1115	$UMULH r9,r6,r7
1116	addc r11,r11,r8
1117	adde r12,r12,r9
1118	addze r10,r10
1119	#mul_add_c(a[0],b[4],c2,c3,c1);
1120	$LD r6,`0*$BNSZ`(r4)
1121	$LD r7,`4*$BNSZ`(r5)
1122	$UMULL r8,r6,r7
1123	$UMULH r9,r6,r7
1124	addc r11,r11,r8
1125	adde r12,r12,r9
1126	addze r10,r10
1127	$ST r11,`4*$BNSZ`(r3) #r[4]=c2;
1128	#mul_add_c(a[0],b[5],c3,c1,c2);
1129	$LD r7,`5*$BNSZ`(r5)
1130	$UMULL r8,r6,r7
1131	$UMULH r9,r6,r7
1132	addc r12,r12,r8
1133	adde r10,r10,r9
1134	addze r11,r0
1135	#mul_add_c(a[1],b[4],c3,c1,c2);
1136	$LD r6,`1*$BNSZ`(r4)
1137	$LD r7,`4*$BNSZ`(r5)
1138	$UMULL r8,r6,r7
1139	$UMULH r9,r6,r7
1140	addc r12,r12,r8
1141	adde r10,r10,r9
1142	addze r11,r11
1143	#mul_add_c(a[2],b[3],c3,c1,c2);
1144	$LD r6,`2*$BNSZ`(r4)
1145	$LD r7,`3*$BNSZ`(r5)
1146	$UMULL r8,r6,r7
1147	$UMULH r9,r6,r7
1148	addc r12,r12,r8
1149	adde r10,r10,r9
1150	addze r11,r11
1151	#mul_add_c(a[3],b[2],c3,c1,c2);
1152	$LD r6,`3*$BNSZ`(r4)
1153	$LD r7,`2*$BNSZ`(r5)
1154	$UMULL r8,r6,r7
1155	$UMULH r9,r6,r7
1156	addc r12,r12,r8
1157	adde r10,r10,r9
1158	addze r11,r11
1159	#mul_add_c(a[4],b[1],c3,c1,c2);
1160	$LD r6,`4*$BNSZ`(r4)
1161	$LD r7,`1*$BNSZ`(r5)
1162	$UMULL r8,r6,r7
1163	$UMULH r9,r6,r7
1164	addc r12,r12,r8
1165	adde r10,r10,r9
1166	addze r11,r11
1167	#mul_add_c(a[5],b[0],c3,c1,c2);
1168	$LD r6,`5*$BNSZ`(r4)
1169	$LD r7,`0*$BNSZ`(r5)
1170	$UMULL r8,r6,r7
1171	$UMULH r9,r6,r7
1172	addc r12,r12,r8
1173	adde r10,r10,r9
1174	addze r11,r11
1175	$ST r12,`5*$BNSZ`(r3) #r[5]=c3;
1176	#mul_add_c(a[6],b[0],c1,c2,c3);
1177	$LD r6,`6*$BNSZ`(r4)
1178	$UMULL r8,r6,r7
1179	$UMULH r9,r6,r7
1180	addc r10,r10,r8
1181	adde r11,r11,r9
1182	addze r12,r0
1183	#mul_add_c(a[5],b[1],c1,c2,c3);
1184	$LD r6,`5*$BNSZ`(r4)
1185	$LD r7,`1*$BNSZ`(r5)
1186	$UMULL r8,r6,r7
1187	$UMULH r9,r6,r7
1188	addc r10,r10,r8
1189	adde r11,r11,r9
1190	addze r12,r12
1191	#mul_add_c(a[4],b[2],c1,c2,c3);
1192	$LD r6,`4*$BNSZ`(r4)
1193	$LD r7,`2*$BNSZ`(r5)
1194	$UMULL r8,r6,r7
1195	$UMULH r9,r6,r7
1196	addc r10,r10,r8
1197	adde r11,r11,r9
1198	addze r12,r12
1199	#mul_add_c(a[3],b[3],c1,c2,c3);
1200	$LD r6,`3*$BNSZ`(r4)
1201	$LD r7,`3*$BNSZ`(r5)
1202	$UMULL r8,r6,r7
1203	$UMULH r9,r6,r7
1204	addc r10,r10,r8
1205	adde r11,r11,r9
1206	addze r12,r12
1207	#mul_add_c(a[2],b[4],c1,c2,c3);
1208	$LD r6,`2*$BNSZ`(r4)
1209	$LD r7,`4*$BNSZ`(r5)
1210	$UMULL r8,r6,r7
1211	$UMULH r9,r6,r7
1212	addc r10,r10,r8
1213	adde r11,r11,r9
1214	addze r12,r12
1215	#mul_add_c(a[1],b[5],c1,c2,c3);
1216	$LD r6,`1*$BNSZ`(r4)
1217	$LD r7,`5*$BNSZ`(r5)
1218	$UMULL r8,r6,r7
1219	$UMULH r9,r6,r7
1220	addc r10,r10,r8
1221	adde r11,r11,r9
1222	addze r12,r12
1223	#mul_add_c(a[0],b[6],c1,c2,c3);
1224	$LD r6,`0*$BNSZ`(r4)
1225	$LD r7,`6*$BNSZ`(r5)
1226	$UMULL r8,r6,r7
1227	$UMULH r9,r6,r7
1228	addc r10,r10,r8
1229	adde r11,r11,r9
1230	addze r12,r12
1231	$ST r10,`6*$BNSZ`(r3) #r[6]=c1;
1232	#mul_add_c(a[0],b[7],c2,c3,c1);
1233	$LD r7,`7*$BNSZ`(r5)
1234	$UMULL r8,r6,r7
1235	$UMULH r9,r6,r7
1236	addc r11,r11,r8
1237	adde r12,r12,r9
1238	addze r10,r0
1239	#mul_add_c(a[1],b[6],c2,c3,c1);
1240	$LD r6,`1*$BNSZ`(r4)
1241	$LD r7,`6*$BNSZ`(r5)
1242	$UMULL r8,r6,r7
1243	$UMULH r9,r6,r7
1244	addc r11,r11,r8
1245	adde r12,r12,r9
1246	addze r10,r10
1247	#mul_add_c(a[2],b[5],c2,c3,c1);
1248	$LD r6,`2*$BNSZ`(r4)
1249	$LD r7,`5*$BNSZ`(r5)
1250	$UMULL r8,r6,r7
1251	$UMULH r9,r6,r7
1252	addc r11,r11,r8
1253	adde r12,r12,r9
1254	addze r10,r10
1255	#mul_add_c(a[3],b[4],c2,c3,c1);
1256	$LD r6,`3*$BNSZ`(r4)
1257	$LD r7,`4*$BNSZ`(r5)
1258	$UMULL r8,r6,r7
1259	$UMULH r9,r6,r7
1260	addc r11,r11,r8
1261	adde r12,r12,r9
1262	addze r10,r10
1263	#mul_add_c(a[4],b[3],c2,c3,c1);
1264	$LD r6,`4*$BNSZ`(r4)
1265	$LD r7,`3*$BNSZ`(r5)
1266	$UMULL r8,r6,r7
1267	$UMULH r9,r6,r7
1268	addc r11,r11,r8
1269	adde r12,r12,r9
1270	addze r10,r10
1271	#mul_add_c(a[5],b[2],c2,c3,c1);
1272	$LD r6,`5*$BNSZ`(r4)
1273	$LD r7,`2*$BNSZ`(r5)
1274	$UMULL r8,r6,r7
1275	$UMULH r9,r6,r7
1276	addc r11,r11,r8
1277	adde r12,r12,r9
1278	addze r10,r10
1279	#mul_add_c(a[6],b[1],c2,c3,c1);
1280	$LD r6,`6*$BNSZ`(r4)
1281	$LD r7,`1*$BNSZ`(r5)
1282	$UMULL r8,r6,r7
1283	$UMULH r9,r6,r7
1284	addc r11,r11,r8
1285	adde r12,r12,r9
1286	addze r10,r10
1287	#mul_add_c(a[7],b[0],c2,c3,c1);
1288	$LD r6,`7*$BNSZ`(r4)
1289	$LD r7,`0*$BNSZ`(r5)
1290	$UMULL r8,r6,r7
1291	$UMULH r9,r6,r7
1292	addc r11,r11,r8
1293	adde r12,r12,r9
1294	addze r10,r10
1295	$ST r11,`7*$BNSZ`(r3) #r[7]=c2;
1296	#mul_add_c(a[7],b[1],c3,c1,c2);
1297	$LD r7,`1*$BNSZ`(r5)
1298	$UMULL r8,r6,r7
1299	$UMULH r9,r6,r7
1300	addc r12,r12,r8
1301	adde r10,r10,r9
1302	addze r11,r0
1303	#mul_add_c(a[6],b[2],c3,c1,c2);
1304	$LD r6,`6*$BNSZ`(r4)
1305	$LD r7,`2*$BNSZ`(r5)
1306	$UMULL r8,r6,r7
1307	$UMULH r9,r6,r7
1308	addc r12,r12,r8
1309	adde r10,r10,r9
1310	addze r11,r11
1311	#mul_add_c(a[5],b[3],c3,c1,c2);
1312	$LD r6,`5*$BNSZ`(r4)
1313	$LD r7,`3*$BNSZ`(r5)
1314	$UMULL r8,r6,r7
1315	$UMULH r9,r6,r7
1316	addc r12,r12,r8
1317	adde r10,r10,r9
1318	addze r11,r11
1319	#mul_add_c(a[4],b[4],c3,c1,c2);
1320	$LD r6,`4*$BNSZ`(r4)
1321	$LD r7,`4*$BNSZ`(r5)
1322	$UMULL r8,r6,r7
1323	$UMULH r9,r6,r7
1324	addc r12,r12,r8
1325	adde r10,r10,r9
1326	addze r11,r11
1327	#mul_add_c(a[3],b[5],c3,c1,c2);
1328	$LD r6,`3*$BNSZ`(r4)
1329	$LD r7,`5*$BNSZ`(r5)
1330	$UMULL r8,r6,r7
1331	$UMULH r9,r6,r7
1332	addc r12,r12,r8
1333	adde r10,r10,r9
1334	addze r11,r11
1335	#mul_add_c(a[2],b[6],c3,c1,c2);
1336	$LD r6,`2*$BNSZ`(r4)
1337	$LD r7,`6*$BNSZ`(r5)
1338	$UMULL r8,r6,r7
1339	$UMULH r9,r6,r7
1340	addc r12,r12,r8
1341	adde r10,r10,r9
1342	addze r11,r11
1343	#mul_add_c(a[1],b[7],c3,c1,c2);
1344	$LD r6,`1*$BNSZ`(r4)
1345	$LD r7,`7*$BNSZ`(r5)
1346	$UMULL r8,r6,r7
1347	$UMULH r9,r6,r7
1348	addc r12,r12,r8
1349	adde r10,r10,r9
1350	addze r11,r11
1351	$ST r12,`8*$BNSZ`(r3) #r[8]=c3;
1352	#mul_add_c(a[2],b[7],c1,c2,c3);
1353	$LD r6,`2*$BNSZ`(r4)
1354	$UMULL r8,r6,r7
1355	$UMULH r9,r6,r7
1356	addc r10,r10,r8
1357	adde r11,r11,r9
1358	addze r12,r0
1359	#mul_add_c(a[3],b[6],c1,c2,c3);
1360	$LD r6,`3*$BNSZ`(r4)
1361	$LD r7,`6*$BNSZ`(r5)
1362	$UMULL r8,r6,r7
1363	$UMULH r9,r6,r7
1364	addc r10,r10,r8
1365	adde r11,r11,r9
1366	addze r12,r12
1367	#mul_add_c(a[4],b[5],c1,c2,c3);
1368	$LD r6,`4*$BNSZ`(r4)
1369	$LD r7,`5*$BNSZ`(r5)
1370	$UMULL r8,r6,r7
1371	$UMULH r9,r6,r7
1372	addc r10,r10,r8
1373	adde r11,r11,r9
1374	addze r12,r12
1375	#mul_add_c(a[5],b[4],c1,c2,c3);
1376	$LD r6,`5*$BNSZ`(r4)
1377	$LD r7,`4*$BNSZ`(r5)
1378	$UMULL r8,r6,r7
1379	$UMULH r9,r6,r7
1380	addc r10,r10,r8
1381	adde r11,r11,r9
1382	addze r12,r12
1383	#mul_add_c(a[6],b[3],c1,c2,c3);
1384	$LD r6,`6*$BNSZ`(r4)
1385	$LD r7,`3*$BNSZ`(r5)
1386	$UMULL r8,r6,r7
1387	$UMULH r9,r6,r7
1388	addc r10,r10,r8
1389	adde r11,r11,r9
1390	addze r12,r12
1391	#mul_add_c(a[7],b[2],c1,c2,c3);
1392	$LD r6,`7*$BNSZ`(r4)
1393	$LD r7,`2*$BNSZ`(r5)
1394	$UMULL r8,r6,r7
1395	$UMULH r9,r6,r7
1396	addc r10,r10,r8
1397	adde r11,r11,r9
1398	addze r12,r12
1399	$ST r10,`9*$BNSZ`(r3) #r[9]=c1;
1400	#mul_add_c(a[7],b[3],c2,c3,c1);
1401	$LD r7,`3*$BNSZ`(r5)
1402	$UMULL r8,r6,r7
1403	$UMULH r9,r6,r7
1404	addc r11,r11,r8
1405	adde r12,r12,r9
1406	addze r10,r0
1407	#mul_add_c(a[6],b[4],c2,c3,c1);
1408	$LD r6,`6*$BNSZ`(r4)
1409	$LD r7,`4*$BNSZ`(r5)
1410	$UMULL r8,r6,r7
1411	$UMULH r9,r6,r7
1412	addc r11,r11,r8
1413	adde r12,r12,r9
1414	addze r10,r10
1415	#mul_add_c(a[5],b[5],c2,c3,c1);
1416	$LD r6,`5*$BNSZ`(r4)
1417	$LD r7,`5*$BNSZ`(r5)
1418	$UMULL r8,r6,r7
1419	$UMULH r9,r6,r7
1420	addc r11,r11,r8
1421	adde r12,r12,r9
1422	addze r10,r10
1423	#mul_add_c(a[4],b[6],c2,c3,c1);
1424	$LD r6,`4*$BNSZ`(r4)
1425	$LD r7,`6*$BNSZ`(r5)
1426	$UMULL r8,r6,r7
1427	$UMULH r9,r6,r7
1428	addc r11,r11,r8
1429	adde r12,r12,r9
1430	addze r10,r10
1431	#mul_add_c(a[3],b[7],c2,c3,c1);
1432	$LD r6,`3*$BNSZ`(r4)
1433	$LD r7,`7*$BNSZ`(r5)
1434	$UMULL r8,r6,r7
1435	$UMULH r9,r6,r7
1436	addc r11,r11,r8
1437	adde r12,r12,r9
1438	addze r10,r10
1439	$ST r11,`10*$BNSZ`(r3) #r[10]=c2;
1440	#mul_add_c(a[4],b[7],c3,c1,c2);
1441	$LD r6,`4*$BNSZ`(r4)
1442	$UMULL r8,r6,r7
1443	$UMULH r9,r6,r7
1444	addc r12,r12,r8
1445	adde r10,r10,r9
1446	addze r11,r0
1447	#mul_add_c(a[5],b[6],c3,c1,c2);
1448	$LD r6,`5*$BNSZ`(r4)
1449	$LD r7,`6*$BNSZ`(r5)
1450	$UMULL r8,r6,r7
1451	$UMULH r9,r6,r7
1452	addc r12,r12,r8
1453	adde r10,r10,r9
1454	addze r11,r11
1455	#mul_add_c(a[6],b[5],c3,c1,c2);
1456	$LD r6,`6*$BNSZ`(r4)
1457	$LD r7,`5*$BNSZ`(r5)
1458	$UMULL r8,r6,r7
1459	$UMULH r9,r6,r7
1460	addc r12,r12,r8
1461	adde r10,r10,r9
1462	addze r11,r11
1463	#mul_add_c(a[7],b[4],c3,c1,c2);
1464	$LD r6,`7*$BNSZ`(r4)
1465	$LD r7,`4*$BNSZ`(r5)
1466	$UMULL r8,r6,r7
1467	$UMULH r9,r6,r7
1468	addc r12,r12,r8
1469	adde r10,r10,r9
1470	addze r11,r11
1471	$ST r12,`11*$BNSZ`(r3) #r[11]=c3;
1472	#mul_add_c(a[7],b[5],c1,c2,c3);
1473	$LD r7,`5*$BNSZ`(r5)
1474	$UMULL r8,r6,r7
1475	$UMULH r9,r6,r7
1476	addc r10,r10,r8
1477	adde r11,r11,r9
1478	addze r12,r0
1479	#mul_add_c(a[6],b[6],c1,c2,c3);
1480	$LD r6,`6*$BNSZ`(r4)
1481	$LD r7,`6*$BNSZ`(r5)
1482	$UMULL r8,r6,r7
1483	$UMULH r9,r6,r7
1484	addc r10,r10,r8
1485	adde r11,r11,r9
1486	addze r12,r12
1487	#mul_add_c(a[5],b[7],c1,c2,c3);
1488	$LD r6,`5*$BNSZ`(r4)
1489	$LD r7,`7*$BNSZ`(r5)
1490	$UMULL r8,r6,r7
1491	$UMULH r9,r6,r7
1492	addc r10,r10,r8
1493	adde r11,r11,r9
1494	addze r12,r12
1495	$ST r10,`12*$BNSZ`(r3) #r[12]=c1;
1496	#mul_add_c(a[6],b[7],c2,c3,c1);
1497	$LD r6,`6*$BNSZ`(r4)
1498	$UMULL r8,r6,r7
1499	$UMULH r9,r6,r7
1500	addc r11,r11,r8
1501	adde r12,r12,r9
1502	addze r10,r0
1503	#mul_add_c(a[7],b[6],c2,c3,c1);
1504	$LD r6,`7*$BNSZ`(r4)
1505	$LD r7,`6*$BNSZ`(r5)
1506	$UMULL r8,r6,r7
1507	$UMULH r9,r6,r7
1508	addc r11,r11,r8
1509	adde r12,r12,r9
1510	addze r10,r10
1511	$ST r11,`13*$BNSZ`(r3) #r[13]=c2;
1512	#mul_add_c(a[7],b[7],c3,c1,c2);
1513	$LD r7,`7*$BNSZ`(r5)
1514	$UMULL r8,r6,r7
1515	$UMULH r9,r6,r7
1516	addc r12,r12,r8
1517	adde r10,r10,r9
1518	$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
1519	$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
1520	blr
1521	.long 0
1522	.byte 0,12,0x14,0,0,0,3,0
1523	.long 0
1524	.size .bn_mul_comba8,.-.bn_mul_comba8
1525
1526	#
1527	# NOTE: The following label name should be changed to
1528	# "bn_sub_words" i.e. remove the first dot
1529	# for the gcc compiler. This should be automatically
1530	# done in the build
1531	#
1532	#
1533	.align 4
1534	.bn_sub_words:
1535	#
1536	# Handcoded version of bn_sub_words: r[i] = a[i] - b[i] with borrow, for i in 0..n-1
1537	#
1538	#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1539	#
1540	# r3 = r (result array on entry; on return holds the final borrow, 0 or 1)
1541	# r4 = a (minuend array)
1542	# r5 = b (subtrahend array)
1543	# r6 = n (word count; moved to CTR, after which r6 is reused as scratch)
1544	#
1545	# Note: No loop unrolling done since this is not a performance
1546	# critical loop.
1547
1548	xor r0,r0,r0 #set r0 = 0
1549	#
1550	# check for r6 = 0 AND set carry bit (carry set means no borrow pending).
1551	#
1552	subfc. r7,r0,r6 # If r6 is 0 then result is 0.
1553	# if r6 > 0 then result !=0
1554	# In either case carry bit is set.
1555	beq Lppcasm_sub_adios # n == 0: skip the loop, carry is still set
1556	addi r4,r4,-$BNSZ # step a, r and b back one word: the loop below
1557	addi r3,r3,-$BNSZ # addresses every element at +BNSZ from the pointer
1558	addi r5,r5,-$BNSZ # (LDU/STU are expected to be update-form accesses)
1559	mtctr r6 # CTR = n, the loop count
1560	Lppcasm_sub_mainloop:
1561	$LDU r7,$BNSZ(r4) # r7 = next word of a
1562	$LDU r8,$BNSZ(r5) # r8 = next word of b
1563	subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
1564	# if carry = 1 this is r7-r8. Else it
1565	# is r7-r8 -1 as we need.
1566	$STU r6,$BNSZ(r3) # store the difference word into r
1567	bdnz Lppcasm_sub_mainloop # decrement CTR, loop while it is nonzero
1568	Lppcasm_sub_adios:
1569	subfze r3,r0 # if carry bit is set then r3 = 0 else -1
1570	andi. r3,r3,1 # keep only last bit: r3 = 1 on borrow, 0 otherwise
1571	blr # return the borrow in r3
1572	.long 0
1573	.byte 0,12,0x14,0,0,0,4,0
1574	.long 0
1575	.size .bn_sub_words,.-.bn_sub_words
1576
1577	#
1578	# NOTE: The following label name should be changed to
1579	# "bn_add_words" i.e. remove the first dot
1580	# for the gcc compiler. This should be automatically
1581	# done in the build
1582	#
1583
1584	.align 4
1585	.bn_add_words:
1586	#
1587	# Handcoded version of bn_add_words: r[i] = a[i] + b[i] with carry, for i in 0..n-1
1588	#
1589	#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1590	#
1591	# r3 = r (result array on entry; on return holds the final carry, 0 or 1)
1592	# r4 = a (first addend array)
1593	# r5 = b (second addend array)
1594	# r6 = n (word count; moved to CTR)
1595	#
1596	# Note: No loop unrolling done since this is not a performance
1597	# critical loop.
1598
1599	xor r0,r0,r0 # r0 = 0, used by the final addze
1600	#
1601	# Test for r6 = 0 so that an empty input skips the loop entirely.
1602	#
1603	addic. r6,r6,0 #test r6 and clear carry bit.
1604	beq Lppcasm_add_adios # n == 0: carry is clear, so 0 is returned
1605	addi r4,r4,-$BNSZ # step a, r and b back one word: the loop below
1606	addi r3,r3,-$BNSZ # addresses every element at +BNSZ from the pointer
1607	addi r5,r5,-$BNSZ # (LDU/STU are expected to be update-form accesses)
1608	mtctr r6 # CTR = n, the loop count
1609	Lppcasm_add_mainloop:
1610	$LDU r7,$BNSZ(r4) # r7 = next word of a
1611	$LDU r8,$BNSZ(r5) # r8 = next word of b
1612	adde r8,r7,r8 # r8 = r7 + r8 + carry in; carry out feeds the next word
1613	$STU r8,$BNSZ(r3) # store the sum word into r
1614	bdnz Lppcasm_add_mainloop # decrement CTR, loop while it is nonzero
1615	Lppcasm_add_adios:
1616	addze r3,r0 #return carry bit.
1617	blr
1618	.long 0
1619	.byte 0,12,0x14,0,0,0,4,0
1620	.long 0
1621	.size .bn_add_words,.-.bn_add_words
1622
1623	#
1624	# NOTE: The following label name should be changed to
1625	# "bn_div_words" i.e. remove the first dot
1626	# for the gcc compiler. This should be automatically
1627	# done in the build
1628	#
1629
1630	.align 4
1631	.bn_div_words:
1632	#
1633	# This is a cleaned up version of code generated by
1634	# the AIX compiler. The only optimization is to use
1635	# the PPC instruction to count leading zeros instead
1636	# of call to num_bits_word. Since this was compiled
1637	# only at level -O2 we can possibly squeeze it more?
1638	# The quotient is built as two half-word digits and returned in r3; -1 is returned when d is 0.
1639	# r3 = h (high word of the dividend)
1640	# r4 = l (low word of the dividend)
1641	# r5 = d (divisor)
1642
1643	$UCMPI 0,r5,0 # compare r5 and 0
1644	bne Lppcasm_div1 # proceed if d!=0
1645	li r3,-1 # d=0 return -1
1646	blr
1647	Lppcasm_div1:
1648	xor r0,r0,r0 #r0=0
1649	li r8,$BITS
1650	$CNTLZ. r7,r5 #r7 = num leading 0s in d.
1651	beq Lppcasm_div2 #proceed if no leading zeros
1652	subf r8,r7,r8 #r8 = BN_num_bits_word(d)
1653	$SHR. r9,r3,r8 #are there any bits above r8'th?
1654	$TR 16,r9,r0 #if there're, signal to dump core...
1655	Lppcasm_div2:
1656	$UCMP 0,r3,r5 #h>=d?
1657	blt Lppcasm_div3 #goto Lppcasm_div3 if not
1658	subf r3,r5,r3 #h-=d ;
1659	Lppcasm_div3: #r7 = leading zeros of d, used below as the shift count i
1660	cmpi 0,0,r7,0 # is (i == 0)?
1661	beq Lppcasm_div4
1662	$SHL r3,r3,r7 # h = (h<< i)
1663	$SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
1664	$SHL r5,r5,r7 # d<<=i
1665	or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
1666	$SHL r4,r4,r7 # l <<=i
1667	Lppcasm_div4:
1668	$SHRI r9,r5,`$BITS/2` # r9 = dh
1669	# dl will be computed when needed
1670	# as it saves registers.
1671	li r6,2 #r6=2
1672	mtctr r6 #CTR = 2: one outer pass per half-word quotient digit.
1673	Lppcasm_divouterloop:
1674	$SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
1675	$SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
1676	# compute here for innerloop.
1677	$UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
1678	bne Lppcasm_div5 # goto Lppcasm_div5 if not
1679
1680	li r8,-1
1681	$CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
1682	b Lppcasm_div6
1683	Lppcasm_div5:
1684	$UDIV r8,r3,r9 #q = h/dh
1685	Lppcasm_div6:
1686	$UMULL r12,r9,r8 #th = q*dh
1687	$CLRU r10,r5,`$BITS/2` #r10=dl
1688	$UMULL r6,r8,r10 #tl = q*dl
1689
1690	Lppcasm_divinnerloop:
1691	subf r10,r12,r3 #t = h -th
1692	$SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
1693	addic. r7,r7,0 #test if r7 == 0. used below.
1694	# now want to compute
1695	# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1696	# the following 2 instructions do that
1697	$SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
1698	or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
1699	$UCMP cr1,r6,r7 # compare (tl <= r7)
1700	bne Lppcasm_divinnerexit # reads cr0 from the addic. above: exit if the high half of t is nonzero
1701	ble cr1,Lppcasm_divinnerexit # exit if tl <= r7, i.e. q needs no further correction
1702	addi r8,r8,-1 #q--
1703	subf r12,r9,r12 #th -=dh
1704	$CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
1705	subf r6,r10,r6 #tl -=dl
1706	b Lppcasm_divinnerloop
1707	Lppcasm_divinnerexit:
1708	$SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
1709	$SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
1710	$UCMP cr1,r4,r11 # compare l and tl
1711	add r12,r12,r10 # th+=t
1712	bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
1713	addi r12,r12,1 # th++
1714	Lppcasm_div7:
1715	subf r11,r11,r4 #r11=l-tl
1716	$UCMP cr1,r3,r12 #compare h and th
1717	bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
1718	addi r8,r8,-1 # q--
1719	add r3,r5,r3 # h+=d
1720	Lppcasm_div8:
1721	subf r12,r12,r3 #r12 = h-th
1722	$SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
1723	# want to compute
1724	# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1725	# the following 2 instructions will do this.
1726	$INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
1727	$ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
1728	bdz Lppcasm_div9 #if (count==0) break ;
1729	$SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
1730	b Lppcasm_divouterloop
1731	Lppcasm_div9:
1732	or r3,r8,r0 # return value = second digit q combined with the shifted first digit in r0
1733	blr
1734	.long 0
1735	.byte 0,12,0x14,0,0,0,3,0
1736	.long 0
1737	.size .bn_div_words,.-.bn_div_words
1738
1739	#
1740	# NOTE: The following label name should be changed to
1741	# "bn_sqr_words" i.e. remove the first dot
1742	# for the gcc compiler. This should be automatically
1743	# done in the build
1744	#
1745	.align 4
1746	.bn_sqr_words:
1747	#
1748	# Optimized version of bn_sqr_words: squares each of the n input words
1749	#
1750	# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1751	#
1752	# r3 = r (output array; two words are written for every input word)
1753	# r4 = a (input array)
1754	# r5 = n (number of input words; moved to CTR)
1755	#
1756	# r6 = a[i].
1757	# r7,r8 = product (r7 from UMULL is stored first, r8 from UMULH second).
1758	#
1759	# No unrolling done here. Not performance critical.
1760
1761	addic. r5,r5,0 #test r5.
1762	beq Lppcasm_sqr_adios # n == 0: nothing to do
1763	addi r4,r4,-$BNSZ # step a and r back one word: the loop below addresses
1764	addi r3,r3,-$BNSZ # each element at +BNSZ (LDU/STU expected to be update-form)
1765	mtctr r5 # CTR = n, the loop count
1766	Lppcasm_sqr_mainloop:
1767	#sqr(r[0],r[1],a[0]);
1768	$LDU r6,$BNSZ(r4) # r6 = next input word
1769	$UMULL r7,r6,r6 # r7 = one half of r6*r6
1770	$UMULH r8,r6,r6 # r8 = the other half of r6*r6
1771	$STU r7,$BNSZ(r3) # first output word of the pair
1772	$STU r8,$BNSZ(r3) # second output word of the pair
1773	bdnz Lppcasm_sqr_mainloop # decrement CTR, loop while it is nonzero
1774	Lppcasm_sqr_adios:
1775	blr
1776	.long 0
1777	.byte 0,12,0x14,0,0,0,3,0
1778	.long 0
1779	.size .bn_sqr_words,.-.bn_sqr_words
1780
1781	#
1782	# NOTE: The following label name should be changed to
1783	# "bn_mul_words" i.e. remove the first dot
1784	# for the gcc compiler. This should be automatically
1785	# done in the build
1786	#
1787
1788	.align 4
1789	.bn_mul_words:
1790	#
1791	# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1792	#
1793	# r3 = rp (output array on entry; on return holds the final carry word)
1794	# r4 = ap (input array)
1795	# r5 = num (word count)
1796	# r6 = w (multiplier word)
1797	xor r0,r0,r0
1798	xor r12,r12,r12 # used for carry
1799	rlwinm. r7,r5,30,2,31 # num >> 2
1800	beq Lppcasm_mw_REM # fewer than 4 words: only the remainder code runs
1801	mtctr r7 # CTR = number of 4-word groups
1802	Lppcasm_mw_LOOP:
1803	#mul(rp[0],ap[0],w,c1);
1804	$LD r8,`0*$BNSZ`(r4)
1805	$UMULL r9,r6,r8
1806	$UMULH r10,r6,r8
1807	addc r9,r9,r12
1808	#addze r10,r10 #carry is NOT ignored.
1809	#will be taken care of
1810	#in second spin below
1811	#using adde.
1812	$ST r9,`0*$BNSZ`(r3)
1813	#mul(rp[1],ap[1],w,c1);
1814	$LD r8,`1*$BNSZ`(r4)
1815	$UMULL r11,r6,r8
1816	$UMULH r12,r6,r8
1817	adde r11,r11,r10
1818	#addze r12,r12
1819	$ST r11,`1*$BNSZ`(r3)
1820	#mul(rp[2],ap[2],w,c1);
1821	$LD r8,`2*$BNSZ`(r4)
1822	$UMULL r9,r6,r8
1823	$UMULH r10,r6,r8
1824	adde r9,r9,r12
1825	#addze r10,r10
1826	$ST r9,`2*$BNSZ`(r3)
1827	#mul(rp[3],ap[3],w,c1);
1828	$LD r8,`3*$BNSZ`(r4)
1829	$UMULL r11,r6,r8
1830	$UMULH r12,r6,r8
1831	adde r11,r11,r10
1832	addze r12,r12 #this spin we collect carry into
1833	#r12
1834	$ST r11,`3*$BNSZ`(r3)
1835
1836	addi r3,r3,`4*$BNSZ` # advance rp past the four words just written
1837	addi r4,r4,`4*$BNSZ` # advance ap past the four words just read
1838	bdnz Lppcasm_mw_LOOP # decrement CTR, loop while it is nonzero
1839
1840	Lppcasm_mw_REM:
1841	andi. r5,r5,0x3 # r5 = num mod 4, the leftover word count
1842	beq Lppcasm_mw_OVER # no leftover words
1843	#mul(rp[0],ap[0],w,c1);
1844	$LD r8,`0*$BNSZ`(r4)
1845	$UMULL r9,r6,r8
1846	$UMULH r10,r6,r8
1847	addc r9,r9,r12
1848	addze r10,r10
1849	$ST r9,`0*$BNSZ`(r3)
1850	addi r12,r10,0 # r12 = r10: carry word for the next step
1851
1852	addi r5,r5,-1
1853	cmpli 0,0,r5,0 # any leftover words remaining?
1854	beq Lppcasm_mw_OVER
1855
1856
1857	#mul(rp[1],ap[1],w,c1);
1858	$LD r8,`1*$BNSZ`(r4)
1859	$UMULL r9,r6,r8
1860	$UMULH r10,r6,r8
1861	addc r9,r9,r12
1862	addze r10,r10
1863	$ST r9,`1*$BNSZ`(r3)
1864	addi r12,r10,0 # r12 = r10: carry word for the next step
1865
1866	addi r5,r5,-1
1867	cmpli 0,0,r5,0 # any leftover words remaining?
1868	beq Lppcasm_mw_OVER
1869
1870	#mul(rp[2],ap[2],w,c1);
1871	$LD r8,`2*$BNSZ`(r4)
1872	$UMULL r9,r6,r8
1873	$UMULH r10,r6,r8
1874	addc r9,r9,r12
1875	addze r10,r10
1876	$ST r9,`2*$BNSZ`(r3)
1877	addi r12,r10,0 # r12 = r10: final carry word
1878
1879	Lppcasm_mw_OVER:
1880	addi r3,r12,0 # return the carry word in r3
1881	blr
1882	.long 0
1883	.byte 0,12,0x14,0,0,0,4,0
1884	.long 0
1885	.size .bn_mul_words,.-.bn_mul_words
1886
1887	#
1888	# NOTE: The following label name should be changed to
1889	# "bn_mul_add_words" i.e. remove the first dot
1890	# for the gcc compiler. This should be automatically
1891	# done in the build
1892	#
1893
1894	.align 4
1895	.bn_mul_add_words:
1896	#
1897	# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1898	#
1899	# r3 = rp (array that is read, accumulated into and written back; on return holds the final carry word)
1900	# r4 = ap (input array)
1901	# r5 = num (word count)
1902	# r6 = w (multiplier word)
1903	#
1904	# empirical evidence suggests that unrolled version performs best!!
1905	#
1906	xor r0,r0,r0 #r0 = 0
1907	xor r12,r12,r12 #r12 = 0 . used for carry
1908	rlwinm. r7,r5,30,2,31 # num >> 2
1909	beq Lppcasm_maw_leftover # if (num < 4) go Lppcasm_maw_leftover
1910	mtctr r7 # CTR = number of 4-word groups
1911	Lppcasm_maw_mainloop:
1912	#mul_add(rp[0],ap[0],w,c1);
1913	$LD r8,`0*$BNSZ`(r4)
1914	$LD r11,`0*$BNSZ`(r3)
1915	$UMULL r9,r6,r8
1916	$UMULH r10,r6,r8
1917	addc r9,r9,r12 #r12 is carry.
1918	addze r10,r10
1919	addc r9,r9,r11
1920	#addze r10,r10
1921	#the above instruction addze
1922	#is NOT needed. Carry will NOT
1923	#be ignored. It's not affected
1924	#by multiply and will be collected
1925	#in the next spin
1926	$ST r9,`0*$BNSZ`(r3)
1927
1928	#mul_add(rp[1],ap[1],w,c1);
1929	$LD r8,`1*$BNSZ`(r4)
1930	$LD r9,`1*$BNSZ`(r3)
1931	$UMULL r11,r6,r8
1932	$UMULH r12,r6,r8
1933	adde r11,r11,r10 #r10 is carry.
1934	addze r12,r12
1935	addc r11,r11,r9
1936	#addze r12,r12
1937	$ST r11,`1*$BNSZ`(r3)
1938
1939	#mul_add(rp[2],ap[2],w,c1);
1940	$LD r8,`2*$BNSZ`(r4)
1941	$UMULL r9,r6,r8
1942	$LD r11,`2*$BNSZ`(r3)
1943	$UMULH r10,r6,r8
1944	adde r9,r9,r12
1945	addze r10,r10
1946	addc r9,r9,r11
1947	#addze r10,r10
1948	$ST r9,`2*$BNSZ`(r3)
1949
1950	#mul_add(rp[3],ap[3],w,c1);
1951	$LD r8,`3*$BNSZ`(r4)
1952	$UMULL r11,r6,r8
1953	$LD r9,`3*$BNSZ`(r3)
1954	$UMULH r12,r6,r8
1955	adde r11,r11,r10
1956	addze r12,r12
1957	addc r11,r11,r9
1958	addze r12,r12 # last step of the group: fold the pending carry bit into r12
1959	$ST r11,`3*$BNSZ`(r3)
1960	addi r3,r3,`4*$BNSZ` # advance rp past the four words just updated
1961	addi r4,r4,`4*$BNSZ` # advance ap past the four words just read
1962	bdnz Lppcasm_maw_mainloop # decrement CTR, loop while it is nonzero
1963
1964	Lppcasm_maw_leftover:
1965	andi. r5,r5,0x3 # r5 = num mod 4, the leftover word count
1966	beq Lppcasm_maw_adios # no leftover words
1967	addi r3,r3,-$BNSZ # step rp and ap back one word: the code below addresses
1968	addi r4,r4,-$BNSZ # each element at +BNSZ (LDU expected to be update-form)
1969	#mul_add(rp[0],ap[0],w,c1);
1970	mtctr r5 # CTR = leftover word count, tested by the bdz instructions below
1971	$LDU r8,$BNSZ(r4)
1972	$UMULL r9,r6,r8
1973	$UMULH r10,r6,r8
1974	$LDU r11,$BNSZ(r3)
1975	addc r9,r9,r11
1976	addze r10,r10
1977	addc r9,r9,r12
1978	addze r12,r10 # r12 = r10 + carry bit: carry word for the next step
1979	$ST r9,0(r3) # offset 0: r3 was already moved to this word by the LDU above
1980
1981	bdz Lppcasm_maw_adios # decrement CTR, finish if no leftover words remain
1982	#mul_add(rp[1],ap[1],w,c1);
1983	$LDU r8,$BNSZ(r4)
1984	$UMULL r9,r6,r8
1985	$UMULH r10,r6,r8
1986	$LDU r11,$BNSZ(r3)
1987	addc r9,r9,r11
1988	addze r10,r10
1989	addc r9,r9,r12
1990	addze r12,r10 # r12 = r10 + carry bit: carry word for the next step
1991	$ST r9,0(r3)
1992
1993	bdz Lppcasm_maw_adios # decrement CTR, finish if no leftover words remain
1994	#mul_add(rp[2],ap[2],w,c1);
1995	$LDU r8,$BNSZ(r4)
1996	$UMULL r9,r6,r8
1997	$UMULH r10,r6,r8
1998	$LDU r11,$BNSZ(r3)
1999	addc r9,r9,r11
2000	addze r10,r10
2001	addc r9,r9,r12
2002	addze r12,r10 # r12 = r10 + carry bit: final carry word
2003	$ST r9,0(r3)
2004
2005	Lppcasm_maw_adios:
2006	addi r3,r12,0 # return the carry word in r3
2007	blr
2008	.long 0
2009	.byte 0,12,0x14,0,0,0,4,0
2010	.long 0
2011	.size .bn_mul_add_words,.-.bn_mul_add_words
2012	.align 4
2013	EOF
2014	$data =~ s/\`([^\`]*)\`/eval $1/gem; # replace every backquoted expression in $data with the result of evaluating it as Perl
2015	print $data; # write the processed text to the current output handle
2016	close STDOUT or die "error closing STDOUT: $!"; # abort with the system error if STDOUT cannot be closed cleanly

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/bn/asm/ppc.pl@ 94082

Download in other formats: