VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/bn/asm/ppc.pl @ 94081

Last change on this file since 94081 was 91772, checked in by vboxsync, 3 years ago:
openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

File size: 44.4 KB
1#! /usr/bin/env perl
2# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# Implemented as a Perl wrapper as we want to support several different
10# architectures with a single file. We pick up the target based on the
11# flavour argument we are invoked with.
12#
13# It should be noted though that this perl code is nothing like
14# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
15# as a pre-processor to cover for platform differences in name decoration,
16# linker tables, 32-/64-bit instruction sets...
17#
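# For illustration, a typical invocation looks like this (the flavour
# strings shown are assumptions based on common perlasm usage, not
# something defined in this file, which only checks the argument for
# /32/ or /64/):
#
#   perl ppc.pl linux32 bn-ppc.s
#   perl ppc.pl linux64 bn-ppc64.s
#
# The first argument selects the 32- or 64-bit mnemonic set defined
# below; the second argument is handed to ppc-xlate.pl together with
# the flavour, and that script post-processes everything we print.
#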
18# As you might know, there are several PowerPC ABIs in use. Most notably,
19# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
20# are similar enough to implement leaf(!) functions, which would be ABI
21# neutral. And that's what you find here: ABI-neutral leaf functions.
22# In case you wonder what that is...
23#
24# AIX performance
25#
26# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
27#
28# The following is the performance of 32-bit compiler
29# generated code:
30#
31# OpenSSL 0.9.6c 21 dec 2001
32# built on: Tue Jun 11 11:06:51 EDT 2002
33# options:bn(64,32) ...
34#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
35# sign verify sign/s verify/s
36#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
37#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
38#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
39#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
40#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
41#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
42#
43# Same benchmark with this assembler code:
44#
45#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
46#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
47#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
48#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
49#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
50#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
51#
52# Number of operations increases by almost 75%
53#
54# Here are performance numbers for 64-bit compiler
55# generated code:
56#
57# OpenSSL 0.9.6g [engine] 9 Aug 2002
58# built on: Fri Apr 18 16:59:20 EDT 2003
59# options:bn(64,64) ...
60# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
61# sign verify sign/s verify/s
62#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
63#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
64#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
65#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
66#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
67#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
68#
69# Same benchmark with this assembler code:
70#
71#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
72#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
73#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
74#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
75#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
76#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
77#
78# Again, performance increases by about 75%
79#
80# Mac OS X, Apple G5 1.8GHz (note: this is 32-bit code)
81# OpenSSL 0.9.7c 30 Sep 2003
82#
83# Original code.
84#
85#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
86#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
87#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
88#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
89#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
90#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
91#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
92#
93# Same benchmark with this assembler code:
94#
95#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
96#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
97#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
98#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
99#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
100#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
101#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
102#
103# Performance increase of ~60%
104# Based on a submission from Suresh N. Chari of IBM
105
106$flavour = shift;
107
108if ($flavour =~ /32/) {
109 $BITS= 32;
110 $BNSZ= $BITS/8;
111 $ISA= "\"ppc\"";
112
113 $LD= "lwz"; # load
114 $LDU= "lwzu"; # load and update
115 $ST= "stw"; # store
116 $STU= "stwu"; # store and update
117 $UMULL= "mullw"; # unsigned multiply low
118 $UMULH= "mulhwu"; # unsigned multiply high
119 $UDIV= "divwu"; # unsigned divide
120 $UCMPI= "cmplwi"; # unsigned compare with immediate
121 $UCMP= "cmplw"; # unsigned compare
122 $CNTLZ= "cntlzw"; # count leading zeros
123 $SHL= "slw"; # shift left
124 $SHR= "srw"; # unsigned shift right
125 $SHRI= "srwi"; # unsigned shift right by immediate
126 $SHLI= "slwi"; # shift left by immediate
127 $CLRU= "clrlwi"; # clear upper bits
128 $INSR= "insrwi"; # insert right
129 $ROTL= "rotlwi"; # rotate left by immediate
130 $TR= "tw"; # conditional trap
131} elsif ($flavour =~ /64/) {
132 $BITS= 64;
133 $BNSZ= $BITS/8;
134 $ISA= "\"ppc64\"";
135
136 # same as above, but 64-bit mnemonics...
137 $LD= "ld"; # load
138 $LDU= "ldu"; # load and update
139 $ST= "std"; # store
140 $STU= "stdu"; # store and update
141 $UMULL= "mulld"; # unsigned multiply low
142 $UMULH= "mulhdu"; # unsigned multiply high
143 $UDIV= "divdu"; # unsigned divide
144 $UCMPI= "cmpldi"; # unsigned compare with immediate
145 $UCMP= "cmpld"; # unsigned compare
146 $CNTLZ= "cntlzd"; # count leading zeros
147 $SHL= "sld"; # shift left
148 $SHR= "srd"; # unsigned shift right
149 $SHRI= "srdi"; # unsigned shift right by immediate
150 $SHLI= "sldi"; # shift left by immediate
151 $CLRU= "clrldi"; # clear upper bits
152 $INSR= "insrdi"; # insert right
153 $ROTL= "rotldi"; # rotate left by immediate
154 $TR= "td"; # conditional trap
155} else { die "nonsense $flavour"; }
156
157$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
158( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
159( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
160die "can't locate ppc-xlate.pl";
161
162open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
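# From this point on everything printed to STDOUT is piped through
# ppc-xlate.pl, which (as used here) expands the ABI-neutral mnemonics
# and directives and applies the flavour-specific symbol decoration
# before the final assembly file is written.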
163
164$data=<<EOF;
165#--------------------------------------------------------------------
166#
167#
168#
169#
170# File: ppc32.s
171#
172# Created by: Suresh Chari
173# IBM Thomas J. Watson Research Library
174# Hawthorne, NY
175#
176#
177# Description: Optimized assembly routines for OpenSSL crypto
178# on the 32-bit PowerPC platform.
179#
180#
181# Version History
182#
183# 2. Fixed bn_add, bn_sub and bn_div_words, added comments,
184# cleaned up code. Also made a single version which can
185# be used for both the AIX and Linux compilers. See NOTE
186# below.
187# 12/05/03 Suresh Chari
188# (with lots of help from) Andy Polyakov
189##
190# 1. Initial version 10/20/02 Suresh Chari
191#
192#
193# The following file works for the xlc, cc,
194# and gcc compilers.
195#
196# NOTE: To get the file to link correctly with the gcc compiler
197# you have to change the names of the routines and remove
198# the first .(dot) character. This should automatically
199# be done in the build process.
200#
201# Hand optimized assembly code for the following routines
202#
203# bn_sqr_comba4
204# bn_sqr_comba8
205# bn_mul_comba4
206# bn_mul_comba8
207# bn_sub_words
208# bn_add_words
209# bn_div_words
210# bn_sqr_words
211# bn_mul_words
212# bn_mul_add_words
213#
214# NOTE: It is possible to optimize this code more for
215# specific PowerPC or Power architectures. On the Northstar
216# architecture the optimizations in this file do
217# NOT provide much improvement.
218#
219# If you have comments or suggestions to improve code send
220# me a note at schari\@us.ibm.com
221#
222#--------------------------------------------------------------------------
223#
224# Defines to be used in the assembly code.
225#
226#.set r0,0 # we use it as storage for value of 0
227#.set SP,1 # preserved
228#.set RTOC,2 # preserved
229#.set r3,3 # 1st argument/return value
230#.set r4,4 # 2nd argument/volatile register
231#.set r5,5 # 3rd argument/volatile register
232#.set r6,6 # ...
233#.set r7,7
234#.set r8,8
235#.set r9,9
236#.set r10,10
237#.set r11,11
238#.set r12,12
239#.set r13,13 # not used, nor any other "below" it...
240
241# Declare function names to be global
242# NOTE: For gcc these names MUST be changed to remove
243# the first . i.e. for example change ".bn_sqr_comba4"
244# to "bn_sqr_comba4". This should be automatically done
245# in the build.
246
247 .globl .bn_sqr_comba4
248 .globl .bn_sqr_comba8
249 .globl .bn_mul_comba4
250 .globl .bn_mul_comba8
251 .globl .bn_sub_words
252 .globl .bn_add_words
253 .globl .bn_div_words
254 .globl .bn_sqr_words
255 .globl .bn_mul_words
256 .globl .bn_mul_add_words
257
258# .text section
259
260 .machine "any"
261 .text
262
263#
264# NOTE: The following label name should be changed to
265# "bn_sqr_comba4" i.e. remove the first dot
266# for the gcc compiler. This should be automatically
267# done in the build
268#
269
270.align 4
271.bn_sqr_comba4:
272#
273# Optimized version of bn_sqr_comba4.
274#
275# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
276# r3 contains r
277# r4 contains a
278#
279# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
280#
281# r5,r6 are the two BN_ULONGs being multiplied.
282# r7,r8 are the results of the 32x32 giving 64 bit multiply.
283# r9,r10, r11 are the equivalents of c1,c2, c3.
284# Here's the assembly
285#
286#
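# The sqr_add_c(a,i,c1,c2,c3) and sqr_add_c2(a,i,j,c1,c2,c3) names in
# the comments below are the column operations of the portable C comba
# code: the former accumulates the two-word product a[i]*a[i] into the
# three-word column accumulator (c1,c2,c3), the latter accumulates
# 2*a[i]*a[j].  In this routine the doubling is done on the product
# itself (the addc/adde pair on r7,r8) before it is folded into the
# accumulator.
#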
287 xor r0,r0,r0 # set r0 = 0. Used in the addze
288 # instructions below
289
290 #sqr_add_c(a,0,c1,c2,c3)
291 $LD r5,`0*$BNSZ`(r4)
292 $UMULL r9,r5,r5
293 $UMULH r10,r5,r5 #in first iteration. No need
294 #to add since c1=c2=c3=0.
295 # Note c3(r11) is NOT set to 0
296 # but will be.
297
298 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
299 # sqr_add_c2(a,1,0,c2,c3,c1);
300 $LD r6,`1*$BNSZ`(r4)
301 $UMULL r7,r5,r6
302 $UMULH r8,r5,r6
303
304 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
305 adde r8,r8,r8
306 addze r9,r0 # catch carry if any.
307 # r9= r0(=0) and carry
308
309 addc r10,r7,r10 # now add to temp result.
310 addze r11,r8 # r8 added to r11 which is 0
311 addze r9,r9
312
313 $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
314 #sqr_add_c(a,1,c3,c1,c2)
315 $UMULL r7,r6,r6
316 $UMULH r8,r6,r6
317 addc r11,r7,r11
318 adde r9,r8,r9
319 addze r10,r0
320 #sqr_add_c2(a,2,0,c3,c1,c2)
321 $LD r6,`2*$BNSZ`(r4)
322 $UMULL r7,r5,r6
323 $UMULH r8,r5,r6
324
325 addc r7,r7,r7
326 adde r8,r8,r8
327 addze r10,r10
328
329 addc r11,r7,r11
330 adde r9,r8,r9
331 addze r10,r10
332 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
333 #sqr_add_c2(a,3,0,c1,c2,c3);
334 $LD r6,`3*$BNSZ`(r4)
335 $UMULL r7,r5,r6
336 $UMULH r8,r5,r6
337 addc r7,r7,r7
338 adde r8,r8,r8
339 addze r11,r0
340
341 addc r9,r7,r9
342 adde r10,r8,r10
343 addze r11,r11
344 #sqr_add_c2(a,2,1,c1,c2,c3);
345 $LD r5,`1*$BNSZ`(r4)
346 $LD r6,`2*$BNSZ`(r4)
347 $UMULL r7,r5,r6
348 $UMULH r8,r5,r6
349
350 addc r7,r7,r7
351 adde r8,r8,r8
352 addze r11,r11
353 addc r9,r7,r9
354 adde r10,r8,r10
355 addze r11,r11
356 $ST r9,`3*$BNSZ`(r3) #r[3]=c1
357 #sqr_add_c(a,2,c2,c3,c1);
358 $UMULL r7,r6,r6
359 $UMULH r8,r6,r6
360 addc r10,r7,r10
361 adde r11,r8,r11
362 addze r9,r0
363 #sqr_add_c2(a,3,1,c2,c3,c1);
364 $LD r6,`3*$BNSZ`(r4)
365 $UMULL r7,r5,r6
366 $UMULH r8,r5,r6
367 addc r7,r7,r7
368 adde r8,r8,r8
369 addze r9,r9
370
371 addc r10,r7,r10
372 adde r11,r8,r11
373 addze r9,r9
374 $ST r10,`4*$BNSZ`(r3) #r[4]=c2
375 #sqr_add_c2(a,3,2,c3,c1,c2);
376 $LD r5,`2*$BNSZ`(r4)
377 $UMULL r7,r5,r6
378 $UMULH r8,r5,r6
379 addc r7,r7,r7
380 adde r8,r8,r8
381 addze r10,r0
382
383 addc r11,r7,r11
384 adde r9,r8,r9
385 addze r10,r10
386 $ST r11,`5*$BNSZ`(r3) #r[5] = c3
387 #sqr_add_c(a,3,c1,c2,c3);
388 $UMULL r7,r6,r6
389 $UMULH r8,r6,r6
390 addc r9,r7,r9
391 adde r10,r8,r10
392
393 $ST r9,`6*$BNSZ`(r3) #r[6]=c1
394 $ST r10,`7*$BNSZ`(r3) #r[7]=c2
395 blr
396 .long 0
397 .byte 0,12,0x14,0,0,0,2,0
398 .long 0
399.size .bn_sqr_comba4,.-.bn_sqr_comba4
400
401#
402# NOTE: The following label name should be changed to
403# "bn_sqr_comba8" i.e. remove the first dot
404# for the gcc compiler. This should be automatically
405# done in the build
406#
407
408.align 4
409.bn_sqr_comba8:
410#
411# This is an optimized version of the bn_sqr_comba8 routine.
412# Tightly uses the adde instruction
413#
414#
415# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
416# r3 contains r
417# r4 contains a
418#
419# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
420#
421# r5,r6 are the two BN_ULONGs being multiplied.
422# r7,r8 are the results of the 32x32 giving 64 bit multiply.
423# r9,r10, r11 are the equivalents of c1,c2, c3.
424#
425# Possible optimization of loading all 8 longs of a into registers
426# doesn't provide any speedup
427#
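# Unlike bn_sqr_comba4 above, the sqr_add_c2 terms here are doubled by
# adding the (r7,r8) product into the column accumulator twice rather
# than by doubling the product first; the resulting column sums are
# the same.
#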
428
429 xor r0,r0,r0 #set r0 = 0.Used in addze
430 #instructions below.
431
432 #sqr_add_c(a,0,c1,c2,c3);
433 $LD r5,`0*$BNSZ`(r4)
434 $UMULL r9,r5,r5 #1st iteration: no carries.
435 $UMULH r10,r5,r5
436 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
437 #sqr_add_c2(a,1,0,c2,c3,c1);
438 $LD r6,`1*$BNSZ`(r4)
439 $UMULL r7,r5,r6
440 $UMULH r8,r5,r6
441
442 addc r10,r7,r10 #add the two register number
443 adde r11,r8,r0 # (r8,r7) to the three register
444 addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
445
446 addc r10,r7,r10 #add the two register number
447 adde r11,r8,r11 # (r8,r7) to the three register
448 addze r9,r9 # number (r9,r11,r10).
449
450 $ST r10,`1*$BNSZ`(r3) # r[1]=c2
451
452 #sqr_add_c(a,1,c3,c1,c2);
453 $UMULL r7,r6,r6
454 $UMULH r8,r6,r6
455 addc r11,r7,r11
456 adde r9,r8,r9
457 addze r10,r0
458 #sqr_add_c2(a,2,0,c3,c1,c2);
459 $LD r6,`2*$BNSZ`(r4)
460 $UMULL r7,r5,r6
461 $UMULH r8,r5,r6
462
463 addc r11,r7,r11
464 adde r9,r8,r9
465 addze r10,r10
466
467 addc r11,r7,r11
468 adde r9,r8,r9
469 addze r10,r10
470
471 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
472 #sqr_add_c2(a,3,0,c1,c2,c3);
473 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
474 $UMULL r7,r5,r6
475 $UMULH r8,r5,r6
476
477 addc r9,r7,r9
478 adde r10,r8,r10
479 addze r11,r0
480
481 addc r9,r7,r9
482 adde r10,r8,r10
483 addze r11,r11
484 #sqr_add_c2(a,2,1,c1,c2,c3);
485 $LD r5,`1*$BNSZ`(r4)
486 $LD r6,`2*$BNSZ`(r4)
487 $UMULL r7,r5,r6
488 $UMULH r8,r5,r6
489
490 addc r9,r7,r9
491 adde r10,r8,r10
492 addze r11,r11
493
494 addc r9,r7,r9
495 adde r10,r8,r10
496 addze r11,r11
497
498 $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
499 #sqr_add_c(a,2,c2,c3,c1);
500 $UMULL r7,r6,r6
501 $UMULH r8,r6,r6
502
503 addc r10,r7,r10
504 adde r11,r8,r11
505 addze r9,r0
506 #sqr_add_c2(a,3,1,c2,c3,c1);
507 $LD r6,`3*$BNSZ`(r4)
508 $UMULL r7,r5,r6
509 $UMULH r8,r5,r6
510
511 addc r10,r7,r10
512 adde r11,r8,r11
513 addze r9,r9
514
515 addc r10,r7,r10
516 adde r11,r8,r11
517 addze r9,r9
518 #sqr_add_c2(a,4,0,c2,c3,c1);
519 $LD r5,`0*$BNSZ`(r4)
520 $LD r6,`4*$BNSZ`(r4)
521 $UMULL r7,r5,r6
522 $UMULH r8,r5,r6
523
524 addc r10,r7,r10
525 adde r11,r8,r11
526 addze r9,r9
527
528 addc r10,r7,r10
529 adde r11,r8,r11
530 addze r9,r9
531 $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
532 #sqr_add_c2(a,5,0,c3,c1,c2);
533 $LD r6,`5*$BNSZ`(r4)
534 $UMULL r7,r5,r6
535 $UMULH r8,r5,r6
536
537 addc r11,r7,r11
538 adde r9,r8,r9
539 addze r10,r0
540
541 addc r11,r7,r11
542 adde r9,r8,r9
543 addze r10,r10
544 #sqr_add_c2(a,4,1,c3,c1,c2);
545 $LD r5,`1*$BNSZ`(r4)
546 $LD r6,`4*$BNSZ`(r4)
547 $UMULL r7,r5,r6
548 $UMULH r8,r5,r6
549
550 addc r11,r7,r11
551 adde r9,r8,r9
552 addze r10,r10
553
554 addc r11,r7,r11
555 adde r9,r8,r9
556 addze r10,r10
557 #sqr_add_c2(a,3,2,c3,c1,c2);
558 $LD r5,`2*$BNSZ`(r4)
559 $LD r6,`3*$BNSZ`(r4)
560 $UMULL r7,r5,r6
561 $UMULH r8,r5,r6
562
563 addc r11,r7,r11
564 adde r9,r8,r9
565 addze r10,r10
566
567 addc r11,r7,r11
568 adde r9,r8,r9
569 addze r10,r10
570 $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
571 #sqr_add_c(a,3,c1,c2,c3);
572 $UMULL r7,r6,r6
573 $UMULH r8,r6,r6
574 addc r9,r7,r9
575 adde r10,r8,r10
576 addze r11,r0
577 #sqr_add_c2(a,4,2,c1,c2,c3);
578 $LD r6,`4*$BNSZ`(r4)
579 $UMULL r7,r5,r6
580 $UMULH r8,r5,r6
581
582 addc r9,r7,r9
583 adde r10,r8,r10
584 addze r11,r11
585
586 addc r9,r7,r9
587 adde r10,r8,r10
588 addze r11,r11
589 #sqr_add_c2(a,5,1,c1,c2,c3);
590 $LD r5,`1*$BNSZ`(r4)
591 $LD r6,`5*$BNSZ`(r4)
592 $UMULL r7,r5,r6
593 $UMULH r8,r5,r6
594
595 addc r9,r7,r9
596 adde r10,r8,r10
597 addze r11,r11
598
599 addc r9,r7,r9
600 adde r10,r8,r10
601 addze r11,r11
602 #sqr_add_c2(a,6,0,c1,c2,c3);
603 $LD r5,`0*$BNSZ`(r4)
604 $LD r6,`6*$BNSZ`(r4)
605 $UMULL r7,r5,r6
606 $UMULH r8,r5,r6
607 addc r9,r7,r9
608 adde r10,r8,r10
609 addze r11,r11
610 addc r9,r7,r9
611 adde r10,r8,r10
612 addze r11,r11
613 $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
614 #sqr_add_c2(a,7,0,c2,c3,c1);
615 $LD r6,`7*$BNSZ`(r4)
616 $UMULL r7,r5,r6
617 $UMULH r8,r5,r6
618
619 addc r10,r7,r10
620 adde r11,r8,r11
621 addze r9,r0
622 addc r10,r7,r10
623 adde r11,r8,r11
624 addze r9,r9
625 #sqr_add_c2(a,6,1,c2,c3,c1);
626 $LD r5,`1*$BNSZ`(r4)
627 $LD r6,`6*$BNSZ`(r4)
628 $UMULL r7,r5,r6
629 $UMULH r8,r5,r6
630
631 addc r10,r7,r10
632 adde r11,r8,r11
633 addze r9,r9
634 addc r10,r7,r10
635 adde r11,r8,r11
636 addze r9,r9
637 #sqr_add_c2(a,5,2,c2,c3,c1);
638 $LD r5,`2*$BNSZ`(r4)
639 $LD r6,`5*$BNSZ`(r4)
640 $UMULL r7,r5,r6
641 $UMULH r8,r5,r6
642 addc r10,r7,r10
643 adde r11,r8,r11
644 addze r9,r9
645 addc r10,r7,r10
646 adde r11,r8,r11
647 addze r9,r9
648 #sqr_add_c2(a,4,3,c2,c3,c1);
649 $LD r5,`3*$BNSZ`(r4)
650 $LD r6,`4*$BNSZ`(r4)
651 $UMULL r7,r5,r6
652 $UMULH r8,r5,r6
653
654 addc r10,r7,r10
655 adde r11,r8,r11
656 addze r9,r9
657 addc r10,r7,r10
658 adde r11,r8,r11
659 addze r9,r9
660 $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
661 #sqr_add_c(a,4,c3,c1,c2);
662 $UMULL r7,r6,r6
663 $UMULH r8,r6,r6
664 addc r11,r7,r11
665 adde r9,r8,r9
666 addze r10,r0
667 #sqr_add_c2(a,5,3,c3,c1,c2);
668 $LD r6,`5*$BNSZ`(r4)
669 $UMULL r7,r5,r6
670 $UMULH r8,r5,r6
671 addc r11,r7,r11
672 adde r9,r8,r9
673 addze r10,r10
674 addc r11,r7,r11
675 adde r9,r8,r9
676 addze r10,r10
677 #sqr_add_c2(a,6,2,c3,c1,c2);
678 $LD r5,`2*$BNSZ`(r4)
679 $LD r6,`6*$BNSZ`(r4)
680 $UMULL r7,r5,r6
681 $UMULH r8,r5,r6
682 addc r11,r7,r11
683 adde r9,r8,r9
684 addze r10,r10
685
686 addc r11,r7,r11
687 adde r9,r8,r9
688 addze r10,r10
689 #sqr_add_c2(a,7,1,c3,c1,c2);
690 $LD r5,`1*$BNSZ`(r4)
691 $LD r6,`7*$BNSZ`(r4)
692 $UMULL r7,r5,r6
693 $UMULH r8,r5,r6
694 addc r11,r7,r11
695 adde r9,r8,r9
696 addze r10,r10
697 addc r11,r7,r11
698 adde r9,r8,r9
699 addze r10,r10
700 $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
701 #sqr_add_c2(a,7,2,c1,c2,c3);
702 $LD r5,`2*$BNSZ`(r4)
703 $UMULL r7,r5,r6
704 $UMULH r8,r5,r6
705
706 addc r9,r7,r9
707 adde r10,r8,r10
708 addze r11,r0
709 addc r9,r7,r9
710 adde r10,r8,r10
711 addze r11,r11
712 #sqr_add_c2(a,6,3,c1,c2,c3);
713 $LD r5,`3*$BNSZ`(r4)
714 $LD r6,`6*$BNSZ`(r4)
715 $UMULL r7,r5,r6
716 $UMULH r8,r5,r6
717 addc r9,r7,r9
718 adde r10,r8,r10
719 addze r11,r11
720 addc r9,r7,r9
721 adde r10,r8,r10
722 addze r11,r11
723 #sqr_add_c2(a,5,4,c1,c2,c3);
724 $LD r5,`4*$BNSZ`(r4)
725 $LD r6,`5*$BNSZ`(r4)
726 $UMULL r7,r5,r6
727 $UMULH r8,r5,r6
728 addc r9,r7,r9
729 adde r10,r8,r10
730 addze r11,r11
731 addc r9,r7,r9
732 adde r10,r8,r10
733 addze r11,r11
734 $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
735 #sqr_add_c(a,5,c2,c3,c1);
736 $UMULL r7,r6,r6
737 $UMULH r8,r6,r6
738 addc r10,r7,r10
739 adde r11,r8,r11
740 addze r9,r0
741 #sqr_add_c2(a,6,4,c2,c3,c1);
742 $LD r6,`6*$BNSZ`(r4)
743 $UMULL r7,r5,r6
744 $UMULH r8,r5,r6
745 addc r10,r7,r10
746 adde r11,r8,r11
747 addze r9,r9
748 addc r10,r7,r10
749 adde r11,r8,r11
750 addze r9,r9
751 #sqr_add_c2(a,7,3,c2,c3,c1);
752 $LD r5,`3*$BNSZ`(r4)
753 $LD r6,`7*$BNSZ`(r4)
754 $UMULL r7,r5,r6
755 $UMULH r8,r5,r6
756 addc r10,r7,r10
757 adde r11,r8,r11
758 addze r9,r9
759 addc r10,r7,r10
760 adde r11,r8,r11
761 addze r9,r9
762 $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
763 #sqr_add_c2(a,7,4,c3,c1,c2);
764 $LD r5,`4*$BNSZ`(r4)
765 $UMULL r7,r5,r6
766 $UMULH r8,r5,r6
767 addc r11,r7,r11
768 adde r9,r8,r9
769 addze r10,r0
770 addc r11,r7,r11
771 adde r9,r8,r9
772 addze r10,r10
773 #sqr_add_c2(a,6,5,c3,c1,c2);
774 $LD r5,`5*$BNSZ`(r4)
775 $LD r6,`6*$BNSZ`(r4)
776 $UMULL r7,r5,r6
777 $UMULH r8,r5,r6
778 addc r11,r7,r11
779 adde r9,r8,r9
780 addze r10,r10
781 addc r11,r7,r11
782 adde r9,r8,r9
783 addze r10,r10
784 $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
785 #sqr_add_c(a,6,c1,c2,c3);
786 $UMULL r7,r6,r6
787 $UMULH r8,r6,r6
788 addc r9,r7,r9
789 adde r10,r8,r10
790 addze r11,r0
791 #sqr_add_c2(a,7,5,c1,c2,c3)
792 $LD r6,`7*$BNSZ`(r4)
793 $UMULL r7,r5,r6
794 $UMULH r8,r5,r6
795 addc r9,r7,r9
796 adde r10,r8,r10
797 addze r11,r11
798 addc r9,r7,r9
799 adde r10,r8,r10
800 addze r11,r11
801 $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
802
803 #sqr_add_c2(a,7,6,c2,c3,c1)
804 $LD r5,`6*$BNSZ`(r4)
805 $UMULL r7,r5,r6
806 $UMULH r8,r5,r6
807 addc r10,r7,r10
808 adde r11,r8,r11
809 addze r9,r0
810 addc r10,r7,r10
811 adde r11,r8,r11
812 addze r9,r9
813 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
814 #sqr_add_c(a,7,c3,c1,c2);
815 $UMULL r7,r6,r6
816 $UMULH r8,r6,r6
817 addc r11,r7,r11
818 adde r9,r8,r9
819 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
820 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
821
822
823 blr
824 .long 0
825 .byte 0,12,0x14,0,0,0,2,0
826 .long 0
827.size .bn_sqr_comba8,.-.bn_sqr_comba8
828
829#
830# NOTE: The following label name should be changed to
831# "bn_mul_comba4" i.e. remove the first dot
832# for the gcc compiler. This should be automatically
833# done in the build
834#
835
836.align 4
837.bn_mul_comba4:
838#
839# This is an optimized version of the bn_mul_comba4 routine.
840#
841# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
842# r3 contains r
843# r4 contains a
844# r5 contains b
845# r6, r7 are the 2 BN_ULONGs being multiplied.
846# r8, r9 are the results of the 32x32 giving 64 multiply.
847# r10, r11, r12 are the equivalents of c1, c2, and c3.
848#
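# mul_add_c(a[i],b[j],c1,c2,c3) in the comments below is the column
# operation of the portable C comba multiply: it accumulates the
# two-word product a[i]*b[j] into the three-word accumulator
# (c1,c2,c3).  Result word r[k] collects every product with i+j == k.
#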
849 xor r0,r0,r0 #r0=0. Used in addze below.
850 #mul_add_c(a[0],b[0],c1,c2,c3);
851 $LD r6,`0*$BNSZ`(r4)
852 $LD r7,`0*$BNSZ`(r5)
853 $UMULL r10,r6,r7
854 $UMULH r11,r6,r7
855 $ST r10,`0*$BNSZ`(r3) #r[0]=c1
856 #mul_add_c(a[0],b[1],c2,c3,c1);
857 $LD r7,`1*$BNSZ`(r5)
858 $UMULL r8,r6,r7
859 $UMULH r9,r6,r7
860 addc r11,r8,r11
861 adde r12,r9,r0
862 addze r10,r0
863 #mul_add_c(a[1],b[0],c2,c3,c1);
864 $LD r6, `1*$BNSZ`(r4)
865 $LD r7, `0*$BNSZ`(r5)
866 $UMULL r8,r6,r7
867 $UMULH r9,r6,r7
868 addc r11,r8,r11
869 adde r12,r9,r12
870 addze r10,r10
871 $ST r11,`1*$BNSZ`(r3) #r[1]=c2
872 #mul_add_c(a[2],b[0],c3,c1,c2);
873 $LD r6,`2*$BNSZ`(r4)
874 $UMULL r8,r6,r7
875 $UMULH r9,r6,r7
876 addc r12,r8,r12
877 adde r10,r9,r10
878 addze r11,r0
879 #mul_add_c(a[1],b[1],c3,c1,c2);
880 $LD r6,`1*$BNSZ`(r4)
881 $LD r7,`1*$BNSZ`(r5)
882 $UMULL r8,r6,r7
883 $UMULH r9,r6,r7
884 addc r12,r8,r12
885 adde r10,r9,r10
886 addze r11,r11
887 #mul_add_c(a[0],b[2],c3,c1,c2);
888 $LD r6,`0*$BNSZ`(r4)
889 $LD r7,`2*$BNSZ`(r5)
890 $UMULL r8,r6,r7
891 $UMULH r9,r6,r7
892 addc r12,r8,r12
893 adde r10,r9,r10
894 addze r11,r11
895 $ST r12,`2*$BNSZ`(r3) #r[2]=c3
896 #mul_add_c(a[0],b[3],c1,c2,c3);
897 $LD r7,`3*$BNSZ`(r5)
898 $UMULL r8,r6,r7
899 $UMULH r9,r6,r7
900 addc r10,r8,r10
901 adde r11,r9,r11
902 addze r12,r0
903 #mul_add_c(a[1],b[2],c1,c2,c3);
904 $LD r6,`1*$BNSZ`(r4)
905 $LD r7,`2*$BNSZ`(r5)
906 $UMULL r8,r6,r7
907 $UMULH r9,r6,r7
908 addc r10,r8,r10
909 adde r11,r9,r11
910 addze r12,r12
911 #mul_add_c(a[2],b[1],c1,c2,c3);
912 $LD r6,`2*$BNSZ`(r4)
913 $LD r7,`1*$BNSZ`(r5)
914 $UMULL r8,r6,r7
915 $UMULH r9,r6,r7
916 addc r10,r8,r10
917 adde r11,r9,r11
918 addze r12,r12
919 #mul_add_c(a[3],b[0],c1,c2,c3);
920 $LD r6,`3*$BNSZ`(r4)
921 $LD r7,`0*$BNSZ`(r5)
922 $UMULL r8,r6,r7
923 $UMULH r9,r6,r7
924 addc r10,r8,r10
925 adde r11,r9,r11
926 addze r12,r12
927 $ST r10,`3*$BNSZ`(r3) #r[3]=c1
928 #mul_add_c(a[3],b[1],c2,c3,c1);
929 $LD r7,`1*$BNSZ`(r5)
930 $UMULL r8,r6,r7
931 $UMULH r9,r6,r7
932 addc r11,r8,r11
933 adde r12,r9,r12
934 addze r10,r0
935 #mul_add_c(a[2],b[2],c2,c3,c1);
936 $LD r6,`2*$BNSZ`(r4)
937 $LD r7,`2*$BNSZ`(r5)
938 $UMULL r8,r6,r7
939 $UMULH r9,r6,r7
940 addc r11,r8,r11
941 adde r12,r9,r12
942 addze r10,r10
943 #mul_add_c(a[1],b[3],c2,c3,c1);
944 $LD r6,`1*$BNSZ`(r4)
945 $LD r7,`3*$BNSZ`(r5)
946 $UMULL r8,r6,r7
947 $UMULH r9,r6,r7
948 addc r11,r8,r11
949 adde r12,r9,r12
950 addze r10,r10
951 $ST r11,`4*$BNSZ`(r3) #r[4]=c2
952 #mul_add_c(a[2],b[3],c3,c1,c2);
953 $LD r6,`2*$BNSZ`(r4)
954 $UMULL r8,r6,r7
955 $UMULH r9,r6,r7
956 addc r12,r8,r12
957 adde r10,r9,r10
958 addze r11,r0
959 #mul_add_c(a[3],b[2],c3,c1,c2);
960 $LD r6,`3*$BNSZ`(r4)
961 $LD r7,`2*$BNSZ`(r5)
962 $UMULL r8,r6,r7
963 $UMULH r9,r6,r7
964 addc r12,r8,r12
965 adde r10,r9,r10
966 addze r11,r11
967 $ST r12,`5*$BNSZ`(r3) #r[5]=c3
968 #mul_add_c(a[3],b[3],c1,c2,c3);
969 $LD r7,`3*$BNSZ`(r5)
970 $UMULL r8,r6,r7
971 $UMULH r9,r6,r7
972 addc r10,r8,r10
973 adde r11,r9,r11
974
975 $ST r10,`6*$BNSZ`(r3) #r[6]=c1
976 $ST r11,`7*$BNSZ`(r3) #r[7]=c2
977 blr
978 .long 0
979 .byte 0,12,0x14,0,0,0,3,0
980 .long 0
981.size .bn_mul_comba4,.-.bn_mul_comba4
982
983#
984# NOTE: The following label name should be changed to
985# "bn_mul_comba8" i.e. remove the first dot
986# for the gcc compiler. This should be automatically
987# done in the build
988#
989
990.align 4
991.bn_mul_comba8:
992#
993# Optimized version of the bn_mul_comba8 routine.
994#
995# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
996# r3 contains r
997# r4 contains a
998# r5 contains b
999# r6, r7 are the 2 BN_ULONGs being multiplied.
1000# r8, r9 are the results of the 32x32 giving 64 multiply.
1001# r10, r11, r12 are the equivalents of c1, c2, and c3.
1002#
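# Same column-by-column (comba) scheme as bn_mul_comba4 above, extended
# to 8x8 words: columns k = 0..14 are accumulated into r[0]..r[14] and
# the final carry word is stored as r[15].
#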
1003 xor r0,r0,r0 #r0=0. Used in addze below.
1004
1005 #mul_add_c(a[0],b[0],c1,c2,c3);
1006 $LD r6,`0*$BNSZ`(r4) #a[0]
1007 $LD r7,`0*$BNSZ`(r5) #b[0]
1008 $UMULL r10,r6,r7
1009 $UMULH r11,r6,r7
1010 $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
1011 #mul_add_c(a[0],b[1],c2,c3,c1);
1012 $LD r7,`1*$BNSZ`(r5)
1013 $UMULL r8,r6,r7
1014 $UMULH r9,r6,r7
1015 addc r11,r11,r8
1016 addze r12,r9 # since we didn't set r12 to zero before.
1017 addze r10,r0
1018 #mul_add_c(a[1],b[0],c2,c3,c1);
1019 $LD r6,`1*$BNSZ`(r4)
1020 $LD r7,`0*$BNSZ`(r5)
1021 $UMULL r8,r6,r7
1022 $UMULH r9,r6,r7
1023 addc r11,r11,r8
1024 adde r12,r12,r9
1025 addze r10,r10
1026 $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
1027 #mul_add_c(a[2],b[0],c3,c1,c2);
1028 $LD r6,`2*$BNSZ`(r4)
1029 $UMULL r8,r6,r7
1030 $UMULH r9,r6,r7
1031 addc r12,r12,r8
1032 adde r10,r10,r9
1033 addze r11,r0
1034 #mul_add_c(a[1],b[1],c3,c1,c2);
1035 $LD r6,`1*$BNSZ`(r4)
1036 $LD r7,`1*$BNSZ`(r5)
1037 $UMULL r8,r6,r7
1038 $UMULH r9,r6,r7
1039 addc r12,r12,r8
1040 adde r10,r10,r9
1041 addze r11,r11
1042 #mul_add_c(a[0],b[2],c3,c1,c2);
1043 $LD r6,`0*$BNSZ`(r4)
1044 $LD r7,`2*$BNSZ`(r5)
1045 $UMULL r8,r6,r7
1046 $UMULH r9,r6,r7
1047 addc r12,r12,r8
1048 adde r10,r10,r9
1049 addze r11,r11
1050 $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
1051 #mul_add_c(a[0],b[3],c1,c2,c3);
1052 $LD r7,`3*$BNSZ`(r5)
1053 $UMULL r8,r6,r7
1054 $UMULH r9,r6,r7
1055 addc r10,r10,r8
1056 adde r11,r11,r9
1057 addze r12,r0
1058 #mul_add_c(a[1],b[2],c1,c2,c3);
1059 $LD r6,`1*$BNSZ`(r4)
1060 $LD r7,`2*$BNSZ`(r5)
1061 $UMULL r8,r6,r7
1062 $UMULH r9,r6,r7
1063 addc r10,r10,r8
1064 adde r11,r11,r9
1065 addze r12,r12
1066
1067 #mul_add_c(a[2],b[1],c1,c2,c3);
1068 $LD r6,`2*$BNSZ`(r4)
1069 $LD r7,`1*$BNSZ`(r5)
1070 $UMULL r8,r6,r7
1071 $UMULH r9,r6,r7
1072 addc r10,r10,r8
1073 adde r11,r11,r9
1074 addze r12,r12
1075 #mul_add_c(a[3],b[0],c1,c2,c3);
1076 $LD r6,`3*$BNSZ`(r4)
1077 $LD r7,`0*$BNSZ`(r5)
1078 $UMULL r8,r6,r7
1079 $UMULH r9,r6,r7
1080 addc r10,r10,r8
1081 adde r11,r11,r9
1082 addze r12,r12
1083 $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
1084 #mul_add_c(a[4],b[0],c2,c3,c1);
1085 $LD r6,`4*$BNSZ`(r4)
1086 $UMULL r8,r6,r7
1087 $UMULH r9,r6,r7
1088 addc r11,r11,r8
1089 adde r12,r12,r9
1090 addze r10,r0
1091 #mul_add_c(a[3],b[1],c2,c3,c1);
1092 $LD r6,`3*$BNSZ`(r4)
1093 $LD r7,`1*$BNSZ`(r5)
1094 $UMULL r8,r6,r7
1095 $UMULH r9,r6,r7
1096 addc r11,r11,r8
1097 adde r12,r12,r9
1098 addze r10,r10
1099 #mul_add_c(a[2],b[2],c2,c3,c1);
1100 $LD r6,`2*$BNSZ`(r4)
1101 $LD r7,`2*$BNSZ`(r5)
1102 $UMULL r8,r6,r7
1103 $UMULH r9,r6,r7
1104 addc r11,r11,r8
1105 adde r12,r12,r9
1106 addze r10,r10
1107 #mul_add_c(a[1],b[3],c2,c3,c1);
1108 $LD r6,`1*$BNSZ`(r4)
1109 $LD r7,`3*$BNSZ`(r5)
1110 $UMULL r8,r6,r7
1111 $UMULH r9,r6,r7
1112 addc r11,r11,r8
1113 adde r12,r12,r9
1114 addze r10,r10
1115 #mul_add_c(a[0],b[4],c2,c3,c1);
1116 $LD r6,`0*$BNSZ`(r4)
1117 $LD r7,`4*$BNSZ`(r5)
1118 $UMULL r8,r6,r7
1119 $UMULH r9,r6,r7
1120 addc r11,r11,r8
1121 adde r12,r12,r9
1122 addze r10,r10
1123 $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
1124 #mul_add_c(a[0],b[5],c3,c1,c2);
1125 $LD r7,`5*$BNSZ`(r5)
1126 $UMULL r8,r6,r7
1127 $UMULH r9,r6,r7
1128 addc r12,r12,r8
1129 adde r10,r10,r9
1130 addze r11,r0
1131 #mul_add_c(a[1],b[4],c3,c1,c2);
1132 $LD r6,`1*$BNSZ`(r4)
1133 $LD r7,`4*$BNSZ`(r5)
1134 $UMULL r8,r6,r7
1135 $UMULH r9,r6,r7
1136 addc r12,r12,r8
1137 adde r10,r10,r9
1138 addze r11,r11
1139 #mul_add_c(a[2],b[3],c3,c1,c2);
1140 $LD r6,`2*$BNSZ`(r4)
1141 $LD r7,`3*$BNSZ`(r5)
1142 $UMULL r8,r6,r7
1143 $UMULH r9,r6,r7
1144 addc r12,r12,r8
1145 adde r10,r10,r9
1146 addze r11,r11
1147 #mul_add_c(a[3],b[2],c3,c1,c2);
1148 $LD r6,`3*$BNSZ`(r4)
1149 $LD r7,`2*$BNSZ`(r5)
1150 $UMULL r8,r6,r7
1151 $UMULH r9,r6,r7
1152 addc r12,r12,r8
1153 adde r10,r10,r9
1154 addze r11,r11
1155 #mul_add_c(a[4],b[1],c3,c1,c2);
1156 $LD r6,`4*$BNSZ`(r4)
1157 $LD r7,`1*$BNSZ`(r5)
1158 $UMULL r8,r6,r7
1159 $UMULH r9,r6,r7
1160 addc r12,r12,r8
1161 adde r10,r10,r9
1162 addze r11,r11
1163 #mul_add_c(a[5],b[0],c3,c1,c2);
1164 $LD r6,`5*$BNSZ`(r4)
1165 $LD r7,`0*$BNSZ`(r5)
1166 $UMULL r8,r6,r7
1167 $UMULH r9,r6,r7
1168 addc r12,r12,r8
1169 adde r10,r10,r9
1170 addze r11,r11
1171 $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
1172 #mul_add_c(a[6],b[0],c1,c2,c3);
1173 $LD r6,`6*$BNSZ`(r4)
1174 $UMULL r8,r6,r7
1175 $UMULH r9,r6,r7
1176 addc r10,r10,r8
1177 adde r11,r11,r9
1178 addze r12,r0
1179 #mul_add_c(a[5],b[1],c1,c2,c3);
1180 $LD r6,`5*$BNSZ`(r4)
1181 $LD r7,`1*$BNSZ`(r5)
1182 $UMULL r8,r6,r7
1183 $UMULH r9,r6,r7
1184 addc r10,r10,r8
1185 adde r11,r11,r9
1186 addze r12,r12
1187 #mul_add_c(a[4],b[2],c1,c2,c3);
1188 $LD r6,`4*$BNSZ`(r4)
1189 $LD r7,`2*$BNSZ`(r5)
1190 $UMULL r8,r6,r7
1191 $UMULH r9,r6,r7
1192 addc r10,r10,r8
1193 adde r11,r11,r9
1194 addze r12,r12
1195 #mul_add_c(a[3],b[3],c1,c2,c3);
1196 $LD r6,`3*$BNSZ`(r4)
1197 $LD r7,`3*$BNSZ`(r5)
1198 $UMULL r8,r6,r7
1199 $UMULH r9,r6,r7
1200 addc r10,r10,r8
1201 adde r11,r11,r9
1202 addze r12,r12
1203 #mul_add_c(a[2],b[4],c1,c2,c3);
1204 $LD r6,`2*$BNSZ`(r4)
1205 $LD r7,`4*$BNSZ`(r5)
1206 $UMULL r8,r6,r7
1207 $UMULH r9,r6,r7
1208 addc r10,r10,r8
1209 adde r11,r11,r9
1210 addze r12,r12
1211 #mul_add_c(a[1],b[5],c1,c2,c3);
1212 $LD r6,`1*$BNSZ`(r4)
1213 $LD r7,`5*$BNSZ`(r5)
1214 $UMULL r8,r6,r7
1215 $UMULH r9,r6,r7
1216 addc r10,r10,r8
1217 adde r11,r11,r9
1218 addze r12,r12
1219 #mul_add_c(a[0],b[6],c1,c2,c3);
1220 $LD r6,`0*$BNSZ`(r4)
1221 $LD r7,`6*$BNSZ`(r5)
1222 $UMULL r8,r6,r7
1223 $UMULH r9,r6,r7
1224 addc r10,r10,r8
1225 adde r11,r11,r9
1226 addze r12,r12
1227 $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
1228 #mul_add_c(a[0],b[7],c2,c3,c1);
1229 $LD r7,`7*$BNSZ`(r5)
1230 $UMULL r8,r6,r7
1231 $UMULH r9,r6,r7
1232 addc r11,r11,r8
1233 adde r12,r12,r9
1234 addze r10,r0
1235 #mul_add_c(a[1],b[6],c2,c3,c1);
1236 $LD r6,`1*$BNSZ`(r4)
1237 $LD r7,`6*$BNSZ`(r5)
1238 $UMULL r8,r6,r7
1239 $UMULH r9,r6,r7
1240 addc r11,r11,r8
1241 adde r12,r12,r9
1242 addze r10,r10
1243 #mul_add_c(a[2],b[5],c2,c3,c1);
1244 $LD r6,`2*$BNSZ`(r4)
1245 $LD r7,`5*$BNSZ`(r5)
1246 $UMULL r8,r6,r7
1247 $UMULH r9,r6,r7
1248 addc r11,r11,r8
1249 adde r12,r12,r9
1250 addze r10,r10
1251 #mul_add_c(a[3],b[4],c2,c3,c1);
1252 $LD r6,`3*$BNSZ`(r4)
1253 $LD r7,`4*$BNSZ`(r5)
1254 $UMULL r8,r6,r7
1255 $UMULH r9,r6,r7
1256 addc r11,r11,r8
1257 adde r12,r12,r9
1258 addze r10,r10
1259 #mul_add_c(a[4],b[3],c2,c3,c1);
1260 $LD r6,`4*$BNSZ`(r4)
1261 $LD r7,`3*$BNSZ`(r5)
1262 $UMULL r8,r6,r7
1263 $UMULH r9,r6,r7
1264 addc r11,r11,r8
1265 adde r12,r12,r9
1266 addze r10,r10
1267 #mul_add_c(a[5],b[2],c2,c3,c1);
1268 $LD r6,`5*$BNSZ`(r4)
1269 $LD r7,`2*$BNSZ`(r5)
1270 $UMULL r8,r6,r7
1271 $UMULH r9,r6,r7
1272 addc r11,r11,r8
1273 adde r12,r12,r9
1274 addze r10,r10
1275 #mul_add_c(a[6],b[1],c2,c3,c1);
1276 $LD r6,`6*$BNSZ`(r4)
1277 $LD r7,`1*$BNSZ`(r5)
1278 $UMULL r8,r6,r7
1279 $UMULH r9,r6,r7
1280 addc r11,r11,r8
1281 adde r12,r12,r9
1282 addze r10,r10
1283 #mul_add_c(a[7],b[0],c2,c3,c1);
1284 $LD r6,`7*$BNSZ`(r4)
1285 $LD r7,`0*$BNSZ`(r5)
1286 $UMULL r8,r6,r7
1287 $UMULH r9,r6,r7
1288 addc r11,r11,r8
1289 adde r12,r12,r9
1290 addze r10,r10
1291 $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
1292 #mul_add_c(a[7],b[1],c3,c1,c2);
1293 $LD r7,`1*$BNSZ`(r5)
1294 $UMULL r8,r6,r7
1295 $UMULH r9,r6,r7
1296 addc r12,r12,r8
1297 adde r10,r10,r9
1298 addze r11,r0
1299 #mul_add_c(a[6],b[2],c3,c1,c2);
1300 $LD r6,`6*$BNSZ`(r4)
1301 $LD r7,`2*$BNSZ`(r5)
1302 $UMULL r8,r6,r7
1303 $UMULH r9,r6,r7
1304 addc r12,r12,r8
1305 adde r10,r10,r9
1306 addze r11,r11
1307 #mul_add_c(a[5],b[3],c3,c1,c2);
1308 $LD r6,`5*$BNSZ`(r4)
1309 $LD r7,`3*$BNSZ`(r5)
1310 $UMULL r8,r6,r7
1311 $UMULH r9,r6,r7
1312 addc r12,r12,r8
1313 adde r10,r10,r9
1314 addze r11,r11
1315 #mul_add_c(a[4],b[4],c3,c1,c2);
1316 $LD r6,`4*$BNSZ`(r4)
1317 $LD r7,`4*$BNSZ`(r5)
1318 $UMULL r8,r6,r7
1319 $UMULH r9,r6,r7
1320 addc r12,r12,r8
1321 adde r10,r10,r9
1322 addze r11,r11
1323 #mul_add_c(a[3],b[5],c3,c1,c2);
1324 $LD r6,`3*$BNSZ`(r4)
1325 $LD r7,`5*$BNSZ`(r5)
1326 $UMULL r8,r6,r7
1327 $UMULH r9,r6,r7
1328 addc r12,r12,r8
1329 adde r10,r10,r9
1330 addze r11,r11
1331 #mul_add_c(a[2],b[6],c3,c1,c2);
1332 $LD r6,`2*$BNSZ`(r4)
1333 $LD r7,`6*$BNSZ`(r5)
1334 $UMULL r8,r6,r7
1335 $UMULH r9,r6,r7
1336 addc r12,r12,r8
1337 adde r10,r10,r9
1338 addze r11,r11
1339 #mul_add_c(a[1],b[7],c3,c1,c2);
1340 $LD r6,`1*$BNSZ`(r4)
1341 $LD r7,`7*$BNSZ`(r5)
1342 $UMULL r8,r6,r7
1343 $UMULH r9,r6,r7
1344 addc r12,r12,r8
1345 adde r10,r10,r9
1346 addze r11,r11
1347 $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
1348 #mul_add_c(a[2],b[7],c1,c2,c3);
1349 $LD r6,`2*$BNSZ`(r4)
1350 $UMULL r8,r6,r7
1351 $UMULH r9,r6,r7
1352 addc r10,r10,r8
1353 adde r11,r11,r9
1354 addze r12,r0
1355 #mul_add_c(a[3],b[6],c1,c2,c3);
1356 $LD r6,`3*$BNSZ`(r4)
1357 $LD r7,`6*$BNSZ`(r5)
1358 $UMULL r8,r6,r7
1359 $UMULH r9,r6,r7
1360 addc r10,r10,r8
1361 adde r11,r11,r9
1362 addze r12,r12
1363 #mul_add_c(a[4],b[5],c1,c2,c3);
1364 $LD r6,`4*$BNSZ`(r4)
1365 $LD r7,`5*$BNSZ`(r5)
1366 $UMULL r8,r6,r7
1367 $UMULH r9,r6,r7
1368 addc r10,r10,r8
1369 adde r11,r11,r9
1370 addze r12,r12
1371 #mul_add_c(a[5],b[4],c1,c2,c3);
1372 $LD r6,`5*$BNSZ`(r4)
1373 $LD r7,`4*$BNSZ`(r5)
1374 $UMULL r8,r6,r7
1375 $UMULH r9,r6,r7
1376 addc r10,r10,r8
1377 adde r11,r11,r9
1378 addze r12,r12
1379 #mul_add_c(a[6],b[3],c1,c2,c3);
1380 $LD r6,`6*$BNSZ`(r4)
1381 $LD r7,`3*$BNSZ`(r5)
1382 $UMULL r8,r6,r7
1383 $UMULH r9,r6,r7
1384 addc r10,r10,r8
1385 adde r11,r11,r9
1386 addze r12,r12
1387 #mul_add_c(a[7],b[2],c1,c2,c3);
1388 $LD r6,`7*$BNSZ`(r4)
1389 $LD r7,`2*$BNSZ`(r5)
1390 $UMULL r8,r6,r7
1391 $UMULH r9,r6,r7
1392 addc r10,r10,r8
1393 adde r11,r11,r9
1394 addze r12,r12
1395 $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
1396 #mul_add_c(a[7],b[3],c2,c3,c1);
1397 $LD r7,`3*$BNSZ`(r5)
1398 $UMULL r8,r6,r7
1399 $UMULH r9,r6,r7
1400 addc r11,r11,r8
1401 adde r12,r12,r9
1402 addze r10,r0
1403 #mul_add_c(a[6],b[4],c2,c3,c1);
1404 $LD r6,`6*$BNSZ`(r4)
1405 $LD r7,`4*$BNSZ`(r5)
1406 $UMULL r8,r6,r7
1407 $UMULH r9,r6,r7
1408 addc r11,r11,r8
1409 adde r12,r12,r9
1410 addze r10,r10
1411 #mul_add_c(a[5],b[5],c2,c3,c1);
1412 $LD r6,`5*$BNSZ`(r4)
1413 $LD r7,`5*$BNSZ`(r5)
1414 $UMULL r8,r6,r7
1415 $UMULH r9,r6,r7
1416 addc r11,r11,r8
1417 adde r12,r12,r9
1418 addze r10,r10
1419 #mul_add_c(a[4],b[6],c2,c3,c1);
1420 $LD r6,`4*$BNSZ`(r4)
1421 $LD r7,`6*$BNSZ`(r5)
1422 $UMULL r8,r6,r7
1423 $UMULH r9,r6,r7
1424 addc r11,r11,r8
1425 adde r12,r12,r9
1426 addze r10,r10
1427 #mul_add_c(a[3],b[7],c2,c3,c1);
1428 $LD r6,`3*$BNSZ`(r4)
1429 $LD r7,`7*$BNSZ`(r5)
1430 $UMULL r8,r6,r7
1431 $UMULH r9,r6,r7
1432 addc r11,r11,r8
1433 adde r12,r12,r9
1434 addze r10,r10
1435 $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
1436 #mul_add_c(a[4],b[7],c3,c1,c2);
1437 $LD r6,`4*$BNSZ`(r4)
1438 $UMULL r8,r6,r7
1439 $UMULH r9,r6,r7
1440 addc r12,r12,r8
1441 adde r10,r10,r9
1442 addze r11,r0
1443 #mul_add_c(a[5],b[6],c3,c1,c2);
1444 $LD r6,`5*$BNSZ`(r4)
1445 $LD r7,`6*$BNSZ`(r5)
1446 $UMULL r8,r6,r7
1447 $UMULH r9,r6,r7
1448 addc r12,r12,r8
1449 adde r10,r10,r9
1450 addze r11,r11
1451 #mul_add_c(a[6],b[5],c3,c1,c2);
1452 $LD r6,`6*$BNSZ`(r4)
1453 $LD r7,`5*$BNSZ`(r5)
1454 $UMULL r8,r6,r7
1455 $UMULH r9,r6,r7
1456 addc r12,r12,r8
1457 adde r10,r10,r9
1458 addze r11,r11
1459 #mul_add_c(a[7],b[4],c3,c1,c2);
1460 $LD r6,`7*$BNSZ`(r4)
1461 $LD r7,`4*$BNSZ`(r5)
1462 $UMULL r8,r6,r7
1463 $UMULH r9,r6,r7
1464 addc r12,r12,r8
1465 adde r10,r10,r9
1466 addze r11,r11
1467 $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
1468 #mul_add_c(a[7],b[5],c1,c2,c3);
1469 $LD r7,`5*$BNSZ`(r5)
1470 $UMULL r8,r6,r7
1471 $UMULH r9,r6,r7
1472 addc r10,r10,r8
1473 adde r11,r11,r9
1474 addze r12,r0
1475 #mul_add_c(a[6],b[6],c1,c2,c3);
1476 $LD r6,`6*$BNSZ`(r4)
1477 $LD r7,`6*$BNSZ`(r5)
1478 $UMULL r8,r6,r7
1479 $UMULH r9,r6,r7
1480 addc r10,r10,r8
1481 adde r11,r11,r9
1482 addze r12,r12
1483 #mul_add_c(a[5],b[7],c1,c2,c3);
1484 $LD r6,`5*$BNSZ`(r4)
1485 $LD r7,`7*$BNSZ`(r5)
1486 $UMULL r8,r6,r7
1487 $UMULH r9,r6,r7
1488 addc r10,r10,r8
1489 adde r11,r11,r9
1490 addze r12,r12
1491 $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
1492 #mul_add_c(a[6],b[7],c2,c3,c1);
1493 $LD r6,`6*$BNSZ`(r4)
1494 $UMULL r8,r6,r7
1495 $UMULH r9,r6,r7
1496 addc r11,r11,r8
1497 adde r12,r12,r9
1498 addze r10,r0
1499 #mul_add_c(a[7],b[6],c2,c3,c1);
1500 $LD r6,`7*$BNSZ`(r4)
1501 $LD r7,`6*$BNSZ`(r5)
1502 $UMULL r8,r6,r7
1503 $UMULH r9,r6,r7
1504 addc r11,r11,r8
1505 adde r12,r12,r9
1506 addze r10,r10
1507 $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
1508 #mul_add_c(a[7],b[7],c3,c1,c2);
1509 $LD r7,`7*$BNSZ`(r5)
1510 $UMULL r8,r6,r7
1511 $UMULH r9,r6,r7
1512 addc r12,r12,r8
1513 adde r10,r10,r9
1514 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
1515 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
1516 blr
1517 .long 0
1518 .byte 0,12,0x14,0,0,0,3,0
1519 .long 0
1520.size .bn_mul_comba8,.-.bn_mul_comba8
1521
1522#
1523# NOTE: The following label name should be changed to
1524# "bn_sub_words" i.e. remove the first dot
1525# for the gcc compiler. This should be automatically
1526# done in the build
1527#
1528#
1529.align 4
1530.bn_sub_words:
1531#
1532# Handcoded version of bn_sub_words
1533#
1534#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1535#
1536# r3 = r
1537# r4 = a
1538# r5 = b
1539# r6 = n
1540#
1541# Note: No loop unrolling done since this is not a performance
1542# critical loop.
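#
# In C terms this computes r[i] = a[i] - b[i] - borrow for i = 0..n-1
# and returns the final borrow (0 or 1).  The PowerPC carry bit is the
# complement of a borrow, so the code seeds CA with subfc., chains the
# borrow through subfe, and converts CA back into a borrow at the end
# with subfze/andi.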
1543
1544 xor r0,r0,r0 #set r0 = 0
1545#
1546# check for r6 = 0 AND set carry bit.
1547#
1548 subfc. r7,r0,r6 # If r6 is 0 then result is 0.
1549 # if r6 > 0 then result !=0
1550 # In either case carry bit is set.
1551 beq Lppcasm_sub_adios
1552 addi r4,r4,-$BNSZ
1553 addi r3,r3,-$BNSZ
1554 addi r5,r5,-$BNSZ
1555 mtctr r6
1556Lppcasm_sub_mainloop:
1557 $LDU r7,$BNSZ(r4)
1558 $LDU r8,$BNSZ(r5)
1559 subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
1560 # if carry = 1 this is r7-r8. Else it
1561 # is r7-r8 -1 as we need.
1562 $STU r6,$BNSZ(r3)
1563 bdnz Lppcasm_sub_mainloop
1564Lppcasm_sub_adios:
1565 subfze r3,r0 # if carry bit is set then r3 = 0 else -1
1566 andi. r3,r3,1 # keep only last bit.
1567 blr
1568 .long 0
1569 .byte 0,12,0x14,0,0,0,4,0
1570 .long 0
1571.size .bn_sub_words,.-.bn_sub_words
1572
1573#
1574# NOTE: The following label name should be changed to
1575# "bn_add_words" i.e. remove the first dot
1576# for the gcc compiler. This should be automatically
1577# done in the build
1578#
1579
1580.align 4
1581.bn_add_words:
1582#
1583# Handcoded version of bn_add_words
1584#
1585#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1586#
1587# r3 = r
1588# r4 = a
1589# r5 = b
1590# r6 = n
1591#
1592# Note: No loop unrolling done since this is not a performance
1593# critical loop.
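#
# In C terms this computes r[i] = a[i] + b[i] + carry for i = 0..n-1
# and returns the final carry (0 or 1), produced by the addze at
# Lppcasm_add_adios.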
1594
1595 xor r0,r0,r0
1596#
1597# check for r6 = 0. Is this needed?
1598#
1599 addic. r6,r6,0 #test r6 and clear carry bit.
1600 beq Lppcasm_add_adios
1601 addi r4,r4,-$BNSZ
1602 addi r3,r3,-$BNSZ
1603 addi r5,r5,-$BNSZ
1604 mtctr r6
1605Lppcasm_add_mainloop:
1606 $LDU r7,$BNSZ(r4)
1607 $LDU r8,$BNSZ(r5)
1608 adde r8,r7,r8
1609 $STU r8,$BNSZ(r3)
1610 bdnz Lppcasm_add_mainloop
1611Lppcasm_add_adios:
1612 addze r3,r0 #return carry bit.
1613 blr
1614 .long 0
1615 .byte 0,12,0x14,0,0,0,4,0
1616 .long 0
1617.size .bn_add_words,.-.bn_add_words
1618
1619#
1620# NOTE: The following label name should be changed to
1621# "bn_div_words" i.e. remove the first dot
1622# for the gcc compiler. This should be automatically
1623# done in the build
1624#
1625
1626.align 4
1627.bn_div_words:
1628#
1629# This is a cleaned-up version of code generated by
1630# the AIX compiler. The only optimization is to use
1631# the PPC instruction to count leading zeros instead
1632# of a call to num_bits_word. Since this was compiled
1633# only at level -O2, it can probably be squeezed further.
1634#
1635# r3 = h
1636# r4 = l
1637# r5 = d
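#
# Returns the one-word quotient of the two-word value (h,l) divided by
# d; callers arrange h < d so that the quotient fits in a single word.
# The two passes of the outer loop below produce the high and low
# half-word (BN_BITS4-bit) digits of the quotient using the usual
# schoolbook estimate-and-correct method.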
1638
1639 $UCMPI 0,r5,0 # compare r5 and 0
1640 bne Lppcasm_div1 # proceed if d!=0
1641 li r3,-1 # d=0 return -1
1642 blr
1643Lppcasm_div1:
1644 xor r0,r0,r0 #r0=0
1645 li r8,$BITS
1646 $CNTLZ. r7,r5 #r7 = num leading 0s in d.
1647 beq Lppcasm_div2 #proceed if no leading zeros
1648 subf r8,r7,r8 #r8 = BN_num_bits_word(d)
1649 $SHR. r9,r3,r8 #are there any bits above r8'th?
1650 $TR 16,r9,r0 #if there're, signal to dump core...
1651Lppcasm_div2:
1652 $UCMP 0,r3,r5 #h>=d?
1653 blt Lppcasm_div3 #goto Lppcasm_div3 if not
1654 subf r3,r5,r3 #h-=d ;
1655Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
1656 cmpi 0,0,r7,0 # is (i == 0)?
1657 beq Lppcasm_div4
1658 $SHL r3,r3,r7 # h = (h<< i)
1659 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
1660 $SHL r5,r5,r7 # d<<=i
1661 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
1662 $SHL r4,r4,r7 # l <<=i
1663Lppcasm_div4:
1664 $SHRI r9,r5,`$BITS/2` # r9 = dh
1665 # dl will be computed when needed
1666 # as it saves registers.
1667 li r6,2 #r6=2
1668 mtctr r6 #counter will be in count.
1669Lppcasm_divouterloop:
1670 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
1671 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
1672 # compute here for innerloop.
1673 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
1674 bne Lppcasm_div5 # goto Lppcasm_div5 if not
1675
1676 li r8,-1
1677 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
1678 b Lppcasm_div6
1679Lppcasm_div5:
1680 $UDIV r8,r3,r9 #q = h/dh
1681Lppcasm_div6:
1682 $UMULL r12,r9,r8 #th = q*dh
1683 $CLRU r10,r5,`$BITS/2` #r10=dl
1684 $UMULL r6,r8,r10 #tl = q*dl
1685
1686Lppcasm_divinnerloop:
1687 subf r10,r12,r3 #t = h -th
1688 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
1689 addic. r7,r7,0 #test if r7 == 0. used below.
1690 # now want to compute
1691 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1692 # the following 2 instructions do that
1693 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
1694 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
1695 $UCMP cr1,r6,r7 # compare (tl <= r7)
1696 bne Lppcasm_divinnerexit
1697 ble cr1,Lppcasm_divinnerexit
1698 addi r8,r8,-1 #q--
1699 subf r12,r9,r12 #th -=dh
1700 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
1701 subf r6,r10,r6 #tl -=dl
1702 b Lppcasm_divinnerloop
1703Lppcasm_divinnerexit:
1704 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
1705 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
1706 $UCMP cr1,r4,r11 # compare l and tl
1707 add r12,r12,r10 # th+=t
1708 bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
1709 addi r12,r12,1 # th++
1710Lppcasm_div7:
1711 subf r11,r11,r4 #r11=l-tl
1712 $UCMP cr1,r3,r12 #compare h and th
1713 bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
1714 addi r8,r8,-1 # q--
1715 add r3,r5,r3 # h+=d
1716Lppcasm_div8:
1717 subf r12,r12,r3 #r12 = h-th
1718 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
1719 # want to compute
1720 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1721 # the following 2 instructions will do this.
1722 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
1723 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
1724 bdz Lppcasm_div9 #if (count==0) break ;
1725 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
1726 b Lppcasm_divouterloop
1727Lppcasm_div9:
1728 or r3,r8,r0
1729 blr
1730 .long 0
1731 .byte 0,12,0x14,0,0,0,3,0
1732 .long 0
1733.size .bn_div_words,.-.bn_div_words
1734
1735#
1736# NOTE: The following label name should be changed to
1737# "bn_sqr_words" i.e. remove the first dot
1738# for the gcc compiler. This should be automatically
1739# done in the build
1740#
1741.align 4
1742.bn_sqr_words:
1743#
1744# Optimized version of bn_sqr_words
1745#
1746# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1747#
1748# r3 = r
1749# r4 = a
1750# r5 = n
1751#
1752# r6 = a[i].
1753# r7,r8 = product.
1754#
1755# No unrolling done here. Not performance critical.
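#
# In C terms: for each i the two-word square of a[i] is written to
# r[2*i] (low word) and r[2*i+1] (high word), so r must have room for
# 2*n words.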
1756
1757 addic. r5,r5,0 #test r5.
1758 beq Lppcasm_sqr_adios
1759 addi r4,r4,-$BNSZ
1760 addi r3,r3,-$BNSZ
1761 mtctr r5
1762Lppcasm_sqr_mainloop:
1763 #sqr(r[0],r[1],a[0]);
1764 $LDU r6,$BNSZ(r4)
1765 $UMULL r7,r6,r6
1766 $UMULH r8,r6,r6
1767 $STU r7,$BNSZ(r3)
1768 $STU r8,$BNSZ(r3)
1769 bdnz Lppcasm_sqr_mainloop
1770Lppcasm_sqr_adios:
1771 blr
1772 .long 0
1773 .byte 0,12,0x14,0,0,0,3,0
1774 .long 0
1775.size .bn_sqr_words,.-.bn_sqr_words
1776
1777#
1778# NOTE: The following label name should be changed to
1779# "bn_mul_words" i.e. remove the first dot
1780# for the gcc compiler. This should be automatically
1781# done in the build
1782#
1783
1784.align 4
1785.bn_mul_words:
1786#
1787# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1788#
1789# r3 = rp
1790# r4 = ap
1791# r5 = num
1792# r6 = w
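#
# In C terms: for i = 0..num-1, rp[i] gets the low word of ap[i]*w plus
# the incoming carry, and the high word becomes the carry into the next
# iteration; the final carry is returned.  The main loop is unrolled
# four times and defers the addze on the high word by folding the
# pending carry into the next iteration's add with adde instead (hence
# the commented-out addze instructions below).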
1793 xor r0,r0,r0
1794 xor r12,r12,r12 # used for carry
1795 rlwinm. r7,r5,30,2,31 # num >> 2
1796 beq Lppcasm_mw_REM
1797 mtctr r7
1798Lppcasm_mw_LOOP:
1799 #mul(rp[0],ap[0],w,c1);
1800 $LD r8,`0*$BNSZ`(r4)
1801 $UMULL r9,r6,r8
1802 $UMULH r10,r6,r8
1803 addc r9,r9,r12
1804 #addze r10,r10 #carry is NOT ignored.
1805 #will be taken care of
1806 #in second spin below
1807 #using adde.
1808 $ST r9,`0*$BNSZ`(r3)
1809 #mul(rp[1],ap[1],w,c1);
1810 $LD r8,`1*$BNSZ`(r4)
1811 $UMULL r11,r6,r8
1812 $UMULH r12,r6,r8
1813 adde r11,r11,r10
1814 #addze r12,r12
1815 $ST r11,`1*$BNSZ`(r3)
1816 #mul(rp[2],ap[2],w,c1);
1817 $LD r8,`2*$BNSZ`(r4)
1818 $UMULL r9,r6,r8
1819 $UMULH r10,r6,r8
1820 adde r9,r9,r12
1821 #addze r10,r10
1822 $ST r9,`2*$BNSZ`(r3)
1823 #mul_add(rp[3],ap[3],w,c1);
1824 $LD r8,`3*$BNSZ`(r4)
1825 $UMULL r11,r6,r8
1826 $UMULH r12,r6,r8
1827 adde r11,r11,r10
1828 addze r12,r12 #this spin we collect carry into
1829 #r12
1830 $ST r11,`3*$BNSZ`(r3)
1831
1832 addi r3,r3,`4*$BNSZ`
1833 addi r4,r4,`4*$BNSZ`
1834 bdnz Lppcasm_mw_LOOP
1835
1836Lppcasm_mw_REM:
1837 andi. r5,r5,0x3
1838 beq Lppcasm_mw_OVER
1839 #mul(rp[0],ap[0],w,c1);
1840 $LD r8,`0*$BNSZ`(r4)
1841 $UMULL r9,r6,r8
1842 $UMULH r10,r6,r8
1843 addc r9,r9,r12
1844 addze r10,r10
1845 $ST r9,`0*$BNSZ`(r3)
1846 addi r12,r10,0
1847
1848 addi r5,r5,-1
1849 cmpli 0,0,r5,0
1850 beq Lppcasm_mw_OVER
1851
1852
1853 #mul(rp[1],ap[1],w,c1);
1854 $LD r8,`1*$BNSZ`(r4)
1855 $UMULL r9,r6,r8
1856 $UMULH r10,r6,r8
1857 addc r9,r9,r12
1858 addze r10,r10
1859 $ST r9,`1*$BNSZ`(r3)
1860 addi r12,r10,0
1861
1862 addi r5,r5,-1
1863 cmpli 0,0,r5,0
1864 beq Lppcasm_mw_OVER
1865
1866 #mul_add(rp[2],ap[2],w,c1);
1867 $LD r8,`2*$BNSZ`(r4)
1868 $UMULL r9,r6,r8
1869 $UMULH r10,r6,r8
1870 addc r9,r9,r12
1871 addze r10,r10
1872 $ST r9,`2*$BNSZ`(r3)
1873 addi r12,r10,0
1874
1875Lppcasm_mw_OVER:
1876 addi r3,r12,0
1877 blr
1878 .long 0
1879 .byte 0,12,0x14,0,0,0,4,0
1880 .long 0
1881.size .bn_mul_words,.-.bn_mul_words
1882
1883#
1884# NOTE: The following label name should be changed to
1885# "bn_mul_add_words" i.e. remove the first dot
1886# for the gcc compiler. This should be automatically
1887# done in the build
1888#
1889
1890.align 4
1891.bn_mul_add_words:
1892#
1893# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1894#
1895# r3 = rp
1896# r4 = ap
1897# r5 = num
1898# r6 = w
1899#
1900# empirical evidence suggests that the unrolled version performs best!!
1901#
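# In C terms: for i = 0..num-1, rp[i] gets the low word of
# rp[i] + ap[i]*w + carry, and the high word plus the carries from the
# two additions becomes the carry into the next iteration; the final
# carry is returned.  As in bn_mul_words above, the four-way unrolled
# main loop defers one addze per step by collecting the pending carry
# with adde in the following step.
#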
1902 xor r0,r0,r0 #r0 = 0
1903 xor r12,r12,r12 #r12 = 0 . used for carry
1904 rlwinm. r7,r5,30,2,31 # num >> 2
1905 beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
1906 mtctr r7
1907Lppcasm_maw_mainloop:
1908 #mul_add(rp[0],ap[0],w,c1);
1909 $LD r8,`0*$BNSZ`(r4)
1910 $LD r11,`0*$BNSZ`(r3)
1911 $UMULL r9,r6,r8
1912 $UMULH r10,r6,r8
1913 addc r9,r9,r12 #r12 is carry.
1914 addze r10,r10
1915 addc r9,r9,r11
1916 #addze r10,r10
1917 #the above instruction addze
1918 #is NOT needed. Carry will NOT
1919 #be ignored. It's not affected
1920 #by multiply and will be collected
1921 #in the next spin
1922 $ST r9,`0*$BNSZ`(r3)
1923
1924 #mul_add(rp[1],ap[1],w,c1);
1925 $LD r8,`1*$BNSZ`(r4)
1926 $LD r9,`1*$BNSZ`(r3)
1927 $UMULL r11,r6,r8
1928 $UMULH r12,r6,r8
1929 adde r11,r11,r10 #r10 is carry.
1930 addze r12,r12
1931 addc r11,r11,r9
1932 #addze r12,r12
1933 $ST r11,`1*$BNSZ`(r3)
1934
1935 #mul_add(rp[2],ap[2],w,c1);
1936 $LD r8,`2*$BNSZ`(r4)
1937 $UMULL r9,r6,r8
1938 $LD r11,`2*$BNSZ`(r3)
1939 $UMULH r10,r6,r8
1940 adde r9,r9,r12
1941 addze r10,r10
1942 addc r9,r9,r11
1943 #addze r10,r10
1944 $ST r9,`2*$BNSZ`(r3)
1945
1946 #mul_add(rp[3],ap[3],w,c1);
1947 $LD r8,`3*$BNSZ`(r4)
1948 $UMULL r11,r6,r8
1949 $LD r9,`3*$BNSZ`(r3)
1950 $UMULH r12,r6,r8
1951 adde r11,r11,r10
1952 addze r12,r12
1953 addc r11,r11,r9
1954 addze r12,r12
1955 $ST r11,`3*$BNSZ`(r3)
1956 addi r3,r3,`4*$BNSZ`
1957 addi r4,r4,`4*$BNSZ`
1958 bdnz Lppcasm_maw_mainloop
1959
1960Lppcasm_maw_leftover:
1961 andi. r5,r5,0x3
1962 beq Lppcasm_maw_adios
1963 addi r3,r3,-$BNSZ
1964 addi r4,r4,-$BNSZ
1965 #mul_add(rp[0],ap[0],w,c1);
1966 mtctr r5
1967 $LDU r8,$BNSZ(r4)
1968 $UMULL r9,r6,r8
1969 $UMULH r10,r6,r8
1970 $LDU r11,$BNSZ(r3)
1971 addc r9,r9,r11
1972 addze r10,r10
1973 addc r9,r9,r12
1974 addze r12,r10
1975 $ST r9,0(r3)
1976
1977 bdz Lppcasm_maw_adios
1978 #mul_add(rp[1],ap[1],w,c1);
1979 $LDU r8,$BNSZ(r4)
1980 $UMULL r9,r6,r8
1981 $UMULH r10,r6,r8
1982 $LDU r11,$BNSZ(r3)
1983 addc r9,r9,r11
1984 addze r10,r10
1985 addc r9,r9,r12
1986 addze r12,r10
1987 $ST r9,0(r3)
1988
1989 bdz Lppcasm_maw_adios
1990 #mul_add(rp[2],ap[2],w,c1);
1991 $LDU r8,$BNSZ(r4)
1992 $UMULL r9,r6,r8
1993 $UMULH r10,r6,r8
1994 $LDU r11,$BNSZ(r3)
1995 addc r9,r9,r11
1996 addze r10,r10
1997 addc r9,r9,r12
1998 addze r12,r10
1999 $ST r9,0(r3)
2000
2001Lppcasm_maw_adios:
2002 addi r3,r12,0
2003 blr
2004 .long 0
2005 .byte 0,12,0x14,0,0,0,4,0
2006 .long 0
2007.size .bn_mul_add_words,.-.bn_mul_add_words
2008 .align 4
2009EOF
2010$data =~ s/\`([^\`]*)\`/eval $1/gem;
2011print $data;
2012close STDOUT or die "error closing STDOUT: $!";