source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/bn/asm/ppc64-mont.pl@ 92014

Last change on this file since 92014 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

File size: 39.8 KB
#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2007

# The reason for undertaking this effort is basically the following.
# Even though the Power 6 CPU operates at an incredible 4.7GHz clock
# frequency, its PKI performance was observed to be less than
# impressive, essentially as fast as a 1.8GHz PPC970, or 2.6 times(!)
# slower than one would hope. Well, it's not surprising that IBM had
# to make some sacrifices to boost the clock frequency that much, but
# no overall improvement? Having observed how much difference
# switching to the FPU made on UltraSPARC, playing the same stunt on
# Power 6 appeared appropriate... Unfortunately the resulting
# performance improvement is not as impressive, ~30%, and in absolute
# terms is still very far from what one would expect from a 4.7GHz
# CPU. There is a chance that I'm doing something wrong, but in the
# absence of assembler-level micro-profiling data or at least a
# decent platform guide I can't tell... Or better results might be
# achieved with VMX... Anyway, this module provides *worse*
# performance on other PowerPC implementations, ~15-40% slower on
# PPC970 depending on key length and ~40% slower on Power 5 for all
# key lengths. As it's obviously inappropriate as a "best all-round"
# alternative, it has to be complemented with run-time CPU family
# detection. Oh! It should also be noted that, unlike on other
# PowerPC implementations, the IALU ppc-mont.pl module performs
# *suboptimally* on >=1024-bit key lengths on Power 6. It should also
# be noted that *everything* said so far applies to 64-bit builds!
# As far as a 32-bit application executed on a 64-bit CPU goes, this
# module is likely to become the preferred choice, because it's easy
# to adapt for such a case and *is* faster than 32-bit ppc-mont.pl
# on *all* processors.

# February 2008

# Micro-profiling assisted optimization results in a ~15% improvement
# over the original ppc64-mont.pl version, or an overall ~50%
# improvement over the ppc.pl module on Power 6. Compared to
# ppc-mont.pl on the same Power 6 CPU, this module is 5-150% faster
# depending on key length ([hereafter] the higher figure applies to
# longer keys). But compared to ppc-mont.pl on a 1.8GHz PPC970, it's
# only 5-55% faster. Still far from impressive in absolute terms, but
# it's apparently the way Power 6 is...

# December 2009

# Adapted for a 32-bit build this module delivers a 25-120% (yes,
# more than *twice* for longer keys) performance improvement over
# 32-bit ppc-mont.pl on a 1.8GHz PPC970. However! This implementation
# utilizes 64-bit integer operations even in the 32-bit build, and
# the trouble is that most PPC operating systems don't preserve the
# upper halves of general purpose registers upon 32-bit signal
# delivery. They do preserve them upon context switch, but not upon
# signalling:-( This means that asynchronous signals have to be
# blocked upon entry to this subroutine. Signal masking (and of
# course the complementary unmasking) has quite an impact on
# performance, naturally larger for shorter keys. It's so severe that
# 512-bit key performance can be as low as 1/3 of the expected one.
# This is why the routine is engaged for longer key operations only
# on such OSes; see crypto/ppccap.c for further details. MacOS X is
# an exception to this and doesn't require signal masking, and that's
# where the above improvement coefficients were collected. For the
# others, the alternative would be to break the dependence on the
# upper halves of GPRs by sticking to 32-bit integer operations...
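#
# A minimal sketch of the masking idea, as Perl/POSIX pseudo-code
# with a hypothetical bn_mul_mont_fpu64() binding (the real logic is
# C and lives in crypto/ppccap.c):
#
#   use POSIX qw(sigprocmask SIG_BLOCK SIG_SETMASK);
#   my $all = POSIX::SigSet->new(); $all->fillset();
#   my $old = POSIX::SigSet->new();
#   sigprocmask(SIG_BLOCK, $all, $old);   # park async signals
#   bn_mul_mont_fpu64($rp,$ap,$bp,$np,$n0,$num); # GPR halves now safe
#   sigprocmask(SIG_SETMASK, $old);       # restore previous mask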

# December 2012

# Remove the above-mentioned dependence on GPRs' upper halves in the
# 32-bit build. No signal masking overhead, but the integer
# instructions are *more* numerous... It's still "universally" faster
# than 32-bit ppc-mont.pl, but the improvement coefficient is not as
# impressive for longer keys...

$flavour = shift;

if ($flavour =~ /32/) {
 $SIZE_T=4;
 $RZONE= 224;
 $fname= "bn_mul_mont_fpu64";

 $STUX= "stwux"; # store indexed and update
 $PUSH= "stw";
 $POP= "lwz";
} elsif ($flavour =~ /64/) {
 $SIZE_T=8;
 $RZONE= 288;
 $fname= "bn_mul_mont_fpu64";

 # same as above, but 64-bit mnemonics...
 $STUX= "stdux"; # store indexed and update
 $PUSH= "std";
 $POP= "ld";
} else { die "nonsense $flavour"; }
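
# A typical invocation, assuming the usual perlasm flavour names (the
# trailing argument is handed to the ppc-xlate.pl driver, which opens
# it as the output file):
#
#   perl ppc64-mont.pl linux64 ppc64-mont.s
#   perl ppc64-mont.pl linux32 ppc64-mont.s
#   perl ppc64-mont.pl osx64   ppc64-mont.s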

$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
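# In the 64-bit build, 64-bit words are frequently accessed as 32-bit
# halves, and XORing the offset with $LITTLE_ENDIAN (4 on
# little-endian flavours, 0 otherwise) makes e.g.
# lwz reg,`4^$LITTLE_ENDIAN`(ptr) fetch the same logical half of the
# word regardless of byte order.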

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";

$FRAME=64; # padded frame header
$TRANSFER=16*8;

$carry="r0";
$sp="r1";
$toc="r2";
$rp="r3"; $ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9"; # $rp is reassigned
$tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$c1="r19";
$n1="r20";
$a1="r21";
$nap_d="r22"; # interleaved ap and np in double format
$a0="r23"; # ap[0]
$t0="r24"; # temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";

# PPC offers enough register bank capacity to unroll inner loops twice
#
#     ..A3A2A1A0
#           dcba
#    -----------
#            A0a
#           A0b
#          A0c
#         A0d
#           A1a
#          A1b
#         A1c
#        A1d
#          A2a
#         A2b
#        A2c
#       A2d
#         A3a
#        A3b
#       A3c
#      A3d
#        ..a
#       ..b
#
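# Why the FPU path is exact, sketched in plain Perl (variable names
# here are illustrative, not from the code below): every 32x16-bit
# partial product is below 2^48, and an IEEE double carries a 53-bit
# mantissa, so per-column sums of a few such products are formed
# without rounding; carries are resolved in integer registers only
# after fctid converts the columns back.
#
#   my @b16 = map { ($b >> 16*$_) & 0xffff }     0..3;  # dcba digits
#   my @a32 = map { ($a >> 32*$_) & 0xffffffff } 0..1;  # limb halves
#   my @col;                       # one FP accumulator per column
#   for my $i (0..1) {
#       for my $j (0..3) { $col[2*$i+$j] += $a32[$i]*$b16[$j]; }
#   }
#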
$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
$dota="f8"; $dotb="f9";
$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
$T0a="f24"; $T0b="f25";
$T1a="f26"; $T1b="f27";
$T2a="f28"; $T2b="f29";
$T3a="f30"; $T3b="f31";


# sp----------->+-------------------------------+
#               | saved sp                      |
#               +-------------------------------+
#               .                               .
# +64           +-------------------------------+
#               | 16 gpr<->fpr transfer zone    |
#               .                               .
#               .                               .
# +16*8         +-------------------------------+
#               | __int64 tmp[-1]               |
#               +-------------------------------+
#               | __int64 tmp[num]              |
#               .                               .
#               .                               .
#               .                               .
# +(num+1)*8    +-------------------------------+
#               | padding to 64 byte boundary   |
#               .                               .
# +X            +-------------------------------+
#               | double nap_d[4*num]           |
#               .                               .
#               .                               .
#               .                               .
#               +-------------------------------+
#               .                               .
# -13*size_t    +-------------------------------+
#               | 13 saved gpr, r19-r31         |
#               .                               .
#               .                               .
# -12*8         +-------------------------------+
#               | 12 saved fpr, f20-f31         |
#               .                               .
#               .                               .
#               +-------------------------------+

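# Frame sizing, concretely (64-bit build): a 2048-bit modulus gives
# num=32 words, i.e. 256 bytes, so nap_d takes 4*256 = 1024 bytes and
# tp[num+1] takes 264; the prologue below adds
# $FRAME+$TRANSFER+8+64+$RZONE on top of that and rounds the new
# stack pointer down to a 4KB boundary before the $STUX alloca
# ("minimize TLB usage").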

$code=<<___;
.machine "any"
.text

.globl .$fname
.align 5
.$fname:
 cmpwi $num,`3*8/$SIZE_T`
 mr $rp,r3 ; $rp is reassigned
 li r3,0 ; possible "not handled" return code
 bltlr-
 andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
 bnelr-

 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
 li $i,-4096
 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
 add $tp,$tp,$num ; place for tp[num+1]
 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
 subf $tp,$tp,$sp ; $sp-$tp
 and $tp,$tp,$i ; minimize TLB usage
 subf $tp,$sp,$tp ; $tp-$sp
 mr $i,$sp
 $STUX $sp,$sp,$tp ; alloca

 $PUSH r19,`-12*8-13*$SIZE_T`($i)
 $PUSH r20,`-12*8-12*$SIZE_T`($i)
 $PUSH r21,`-12*8-11*$SIZE_T`($i)
 $PUSH r22,`-12*8-10*$SIZE_T`($i)
 $PUSH r23,`-12*8-9*$SIZE_T`($i)
 $PUSH r24,`-12*8-8*$SIZE_T`($i)
 $PUSH r25,`-12*8-7*$SIZE_T`($i)
 $PUSH r26,`-12*8-6*$SIZE_T`($i)
 $PUSH r27,`-12*8-5*$SIZE_T`($i)
 $PUSH r28,`-12*8-4*$SIZE_T`($i)
 $PUSH r29,`-12*8-3*$SIZE_T`($i)
 $PUSH r30,`-12*8-2*$SIZE_T`($i)
 $PUSH r31,`-12*8-1*$SIZE_T`($i)
 stfd f20,`-12*8`($i)
 stfd f21,`-11*8`($i)
 stfd f22,`-10*8`($i)
 stfd f23,`-9*8`($i)
 stfd f24,`-8*8`($i)
 stfd f25,`-7*8`($i)
 stfd f26,`-6*8`($i)
 stfd f27,`-5*8`($i)
 stfd f28,`-4*8`($i)
 stfd f29,`-3*8`($i)
 stfd f30,`-2*8`($i)
 stfd f31,`-1*8`($i)

 addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
 li $i,-64
 add $nap_d,$tp,$num
 and $nap_d,$nap_d,$i ; align to 64 bytes
 ; nap_d is off by 1, because it's used with stfdu/lfdu
 addi $nap_d,$nap_d,-8
 srwi $j,$num,`3+1` ; counter register, num/2
 addi $j,$j,-1
 addi $tp,$sp,`$FRAME+$TRANSFER-8`
 li $carry,0
 mtctr $j
___


$code.=<<___ if ($SIZE_T==8);
 ld $a0,0($ap) ; pull ap[0] value
 ld $t3,0($bp) ; bp[0]
 ld $n0,0($n0) ; pull n0[0] value

 mulld $t7,$a0,$t3 ; ap[0]*bp[0]
 ; transfer bp[0] to FPU as 4x16-bit values
 extrdi $t0,$t3,16,48
 extrdi $t1,$t3,16,32
 extrdi $t2,$t3,16,16
 extrdi $t3,$t3,16,0
 std $t0,`$FRAME+0`($sp)
 std $t1,`$FRAME+8`($sp)
 std $t2,`$FRAME+16`($sp)
 std $t3,`$FRAME+24`($sp)

 mulld $t7,$t7,$n0 ; tp[0]*n0
 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
 extrdi $t4,$t7,16,48
 extrdi $t5,$t7,16,32
 extrdi $t6,$t7,16,16
 extrdi $t7,$t7,16,0
 std $t4,`$FRAME+32`($sp)
 std $t5,`$FRAME+40`($sp)
 std $t6,`$FRAME+48`($sp)
 std $t7,`$FRAME+56`($sp)

 extrdi $t0,$a0,32,32 ; lwz $t0,4($ap)
 extrdi $t1,$a0,32,0 ; lwz $t1,0($ap)
 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair
 lwz $t3,`8^$LITTLE_ENDIAN`($ap)
 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair
 lwz $t5,`0^$LITTLE_ENDIAN`($np)
 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair
 lwz $t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
 lwz $a0,0($ap) ; pull ap[0,1] value
 mr $n1,$n0
 lwz $a1,4($ap)
 li $c1,0
 lwz $t1,0($bp) ; bp[0,1]
 lwz $t3,4($bp)
 lwz $n0,0($n1) ; pull n0[0,1] value
 lwz $n1,4($n1)

 mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0]
 mulhwu $t5,$a0,$t1
 mullw $t6,$a1,$t1
 mullw $t7,$a0,$t3
 add $t5,$t5,$t6
 add $t5,$t5,$t7
 ; transfer bp[0] to FPU as 4x16-bit values
 extrwi $t0,$t1,16,16
 extrwi $t1,$t1,16,0
 extrwi $t2,$t3,16,16
 extrwi $t3,$t3,16,0
 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
 std $t1,`$FRAME+8`($sp)
 std $t2,`$FRAME+16`($sp)
 std $t3,`$FRAME+24`($sp)

 mullw $t0,$t4,$n0 ; mulld tp[0]*n0
 mulhwu $t1,$t4,$n0
 mullw $t2,$t5,$n0
 mullw $t3,$t4,$n1
 add $t1,$t1,$t2
 add $t1,$t1,$t3
 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
 extrwi $t4,$t0,16,16
 extrwi $t5,$t0,16,0
 extrwi $t6,$t1,16,16
 extrwi $t7,$t1,16,0
 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
 std $t5,`$FRAME+40`($sp)
 std $t6,`$FRAME+48`($sp)
 std $t7,`$FRAME+56`($sp)

 mr $t0,$a0 ; lwz $t0,0($ap)
 mr $t1,$a1 ; lwz $t1,4($ap)
 lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs
 lwz $t3,12($ap)
 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
 lwz $t5,4($np)
 lwz $t6,8($np)
 lwz $t7,12($np)
___
$code.=<<___;
 lfd $ba,`$FRAME+0`($sp)
 lfd $bb,`$FRAME+8`($sp)
 lfd $bc,`$FRAME+16`($sp)
 lfd $bd,`$FRAME+24`($sp)
 lfd $na,`$FRAME+32`($sp)
 lfd $nb,`$FRAME+40`($sp)
 lfd $nc,`$FRAME+48`($sp)
 lfd $nd,`$FRAME+56`($sp)
 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
 std $t1,`$FRAME+72`($sp)
 std $t2,`$FRAME+80`($sp)
 std $t3,`$FRAME+88`($sp)
 std $t4,`$FRAME+96`($sp)
 std $t5,`$FRAME+104`($sp)
 std $t6,`$FRAME+112`($sp)
 std $t7,`$FRAME+120`($sp)
 fcfid $ba,$ba
 fcfid $bb,$bb
 fcfid $bc,$bc
 fcfid $bd,$bd
 fcfid $na,$na
 fcfid $nb,$nb
 fcfid $nc,$nc
 fcfid $nd,$nd

 lfd $A0,`$FRAME+64`($sp)
 lfd $A1,`$FRAME+72`($sp)
 lfd $A2,`$FRAME+80`($sp)
 lfd $A3,`$FRAME+88`($sp)
 lfd $N0,`$FRAME+96`($sp)
 lfd $N1,`$FRAME+104`($sp)
 lfd $N2,`$FRAME+112`($sp)
 lfd $N3,`$FRAME+120`($sp)
 fcfid $A0,$A0
 fcfid $A1,$A1
 fcfid $A2,$A2
 fcfid $A3,$A3
 fcfid $N0,$N0
 fcfid $N1,$N1
 fcfid $N2,$N2
 fcfid $N3,$N3
 addi $ap,$ap,16
 addi $np,$np,16

 fmul $T1a,$A1,$ba
 fmul $T1b,$A1,$bb
 stfd $A0,8($nap_d) ; save a[j] in double format
 stfd $A1,16($nap_d)
 fmul $T2a,$A2,$ba
 fmul $T2b,$A2,$bb
 stfd $A2,24($nap_d) ; save a[j+1] in double format
 stfd $A3,32($nap_d)
 fmul $T3a,$A3,$ba
 fmul $T3b,$A3,$bb
 stfd $N0,40($nap_d) ; save n[j] in double format
 stfd $N1,48($nap_d)
 fmul $T0a,$A0,$ba
 fmul $T0b,$A0,$bb
 stfd $N2,56($nap_d) ; save n[j+1] in double format
 stfdu $N3,64($nap_d)

 fmadd $T1a,$A0,$bc,$T1a
 fmadd $T1b,$A0,$bd,$T1b
 fmadd $T2a,$A1,$bc,$T2a
 fmadd $T2b,$A1,$bd,$T2b
 fmadd $T3a,$A2,$bc,$T3a
 fmadd $T3b,$A2,$bd,$T3b
 fmul $dota,$A3,$bc
 fmul $dotb,$A3,$bd

 fmadd $T1a,$N1,$na,$T1a
 fmadd $T1b,$N1,$nb,$T1b
 fmadd $T2a,$N2,$na,$T2a
 fmadd $T2b,$N2,$nb,$T2b
 fmadd $T3a,$N3,$na,$T3a
 fmadd $T3b,$N3,$nb,$T3b
 fmadd $T0a,$N0,$na,$T0a
 fmadd $T0b,$N0,$nb,$T0b

 fmadd $T1a,$N0,$nc,$T1a
 fmadd $T1b,$N0,$nd,$T1b
 fmadd $T2a,$N1,$nc,$T2a
 fmadd $T2b,$N1,$nd,$T2b
 fmadd $T3a,$N2,$nc,$T3a
 fmadd $T3b,$N2,$nd,$T3b
 fmadd $dota,$N3,$nc,$dota
 fmadd $dotb,$N3,$nd,$dotb

 fctid $T0a,$T0a
 fctid $T0b,$T0b
 fctid $T1a,$T1a
 fctid $T1b,$T1b
 fctid $T2a,$T2a
 fctid $T2b,$T2b
 fctid $T3a,$T3a
 fctid $T3b,$T3b

 stfd $T0a,`$FRAME+0`($sp)
 stfd $T0b,`$FRAME+8`($sp)
 stfd $T1a,`$FRAME+16`($sp)
 stfd $T1b,`$FRAME+24`($sp)
 stfd $T2a,`$FRAME+32`($sp)
 stfd $T2b,`$FRAME+40`($sp)
 stfd $T3a,`$FRAME+48`($sp)
 stfd $T3b,`$FRAME+56`($sp)


.align 5
L1st:
___
$code.=<<___ if ($SIZE_T==8);
 lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair
 lwz $t1,`0^$LITTLE_ENDIAN`($ap)
 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair
 lwz $t3,`8^$LITTLE_ENDIAN`($ap)
 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair
 lwz $t5,`0^$LITTLE_ENDIAN`($np)
 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair
 lwz $t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
 lwz $t1,4($ap)
 lwz $t2,8($ap)
 lwz $t3,12($ap)
 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
 lwz $t5,4($np)
 lwz $t6,8($np)
 lwz $t7,12($np)
___
$code.=<<___;
 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
 std $t1,`$FRAME+72`($sp)
 std $t2,`$FRAME+80`($sp)
 std $t3,`$FRAME+88`($sp)
 std $t4,`$FRAME+96`($sp)
 std $t5,`$FRAME+104`($sp)
 std $t6,`$FRAME+112`($sp)
 std $t7,`$FRAME+120`($sp)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
 ld $t0,`$FRAME+0`($sp)
 ld $t1,`$FRAME+8`($sp)
 ld $t2,`$FRAME+16`($sp)
 ld $t3,`$FRAME+24`($sp)
 ld $t4,`$FRAME+32`($sp)
 ld $t5,`$FRAME+40`($sp)
 ld $t6,`$FRAME+48`($sp)
 ld $t7,`$FRAME+56`($sp)
___
} else {
$code.=<<___;
 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
___
}
$code.=<<___;
 lfd $A0,`$FRAME+64`($sp)
 lfd $A1,`$FRAME+72`($sp)
 lfd $A2,`$FRAME+80`($sp)
 lfd $A3,`$FRAME+88`($sp)
 lfd $N0,`$FRAME+96`($sp)
 lfd $N1,`$FRAME+104`($sp)
 lfd $N2,`$FRAME+112`($sp)
 lfd $N3,`$FRAME+120`($sp)
 fcfid $A0,$A0
 fcfid $A1,$A1
 fcfid $A2,$A2
 fcfid $A3,$A3
 fcfid $N0,$N0
 fcfid $N1,$N1
 fcfid $N2,$N2
 fcfid $N3,$N3
 addi $ap,$ap,16
 addi $np,$np,16

 fmul $T1a,$A1,$ba
 fmul $T1b,$A1,$bb
 fmul $T2a,$A2,$ba
 fmul $T2b,$A2,$bb
 stfd $A0,8($nap_d) ; save a[j] in double format
 stfd $A1,16($nap_d)
 fmul $T3a,$A3,$ba
 fmul $T3b,$A3,$bb
 fmadd $T0a,$A0,$ba,$dota
 fmadd $T0b,$A0,$bb,$dotb
 stfd $A2,24($nap_d) ; save a[j+1] in double format
 stfd $A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
 fmadd $T1a,$A0,$bc,$T1a
 fmadd $T1b,$A0,$bd,$T1b
 fmadd $T2a,$A1,$bc,$T2a
 fmadd $T2b,$A1,$bd,$T2b
 stfd $N0,40($nap_d) ; save n[j] in double format
 stfd $N1,48($nap_d)
 fmadd $T3a,$A2,$bc,$T3a
 fmadd $T3b,$A2,$bd,$T3b
 add $t0,$t0,$carry ; can not overflow
 fmul $dota,$A3,$bc
 fmul $dotb,$A3,$bd
 stfd $N2,56($nap_d) ; save n[j+1] in double format
 stfdu $N3,64($nap_d)
 srdi $carry,$t0,16
 add $t1,$t1,$carry
 srdi $carry,$t1,16

 fmadd $T1a,$N1,$na,$T1a
 fmadd $T1b,$N1,$nb,$T1b
 insrdi $t0,$t1,16,32
 fmadd $T2a,$N2,$na,$T2a
 fmadd $T2b,$N2,$nb,$T2b
 add $t2,$t2,$carry
 fmadd $T3a,$N3,$na,$T3a
 fmadd $T3b,$N3,$nb,$T3b
 srdi $carry,$t2,16
 fmadd $T0a,$N0,$na,$T0a
 fmadd $T0b,$N0,$nb,$T0b
 insrdi $t0,$t2,16,16
 add $t3,$t3,$carry
 srdi $carry,$t3,16

 fmadd $T1a,$N0,$nc,$T1a
 fmadd $T1b,$N0,$nd,$T1b
 insrdi $t0,$t3,16,0 ; 0..63 bits
 fmadd $T2a,$N1,$nc,$T2a
 fmadd $T2b,$N1,$nd,$T2b
 add $t4,$t4,$carry
 fmadd $T3a,$N2,$nc,$T3a
 fmadd $T3b,$N2,$nd,$T3b
 srdi $carry,$t4,16
 fmadd $dota,$N3,$nc,$dota
 fmadd $dotb,$N3,$nd,$dotb
 add $t5,$t5,$carry
 srdi $carry,$t5,16
 insrdi $t4,$t5,16,32

 fctid $T0a,$T0a
 fctid $T0b,$T0b
 add $t6,$t6,$carry
 fctid $T1a,$T1a
 fctid $T1b,$T1b
 srdi $carry,$t6,16
 fctid $T2a,$T2a
 fctid $T2b,$T2b
 insrdi $t4,$t6,16,16
 fctid $T3a,$T3a
 fctid $T3b,$T3b
 add $t7,$t7,$carry
 insrdi $t4,$t7,16,0 ; 64..127 bits
 srdi $carry,$t7,16 ; upper 33 bits

 stfd $T0a,`$FRAME+0`($sp)
 stfd $T0b,`$FRAME+8`($sp)
 stfd $T1a,`$FRAME+16`($sp)
 stfd $T1b,`$FRAME+24`($sp)
 stfd $T2a,`$FRAME+32`($sp)
 stfd $T2b,`$FRAME+40`($sp)
 stfd $T3a,`$FRAME+48`($sp)
 stfd $T3b,`$FRAME+56`($sp)
 std $t0,8($tp) ; tp[j-1]
 stdu $t4,16($tp) ; tp[j]
___
} else {
$code.=<<___;
 fmadd $T1a,$A0,$bc,$T1a
 fmadd $T1b,$A0,$bd,$T1b
 addc $t0,$t0,$carry
 adde $t1,$t1,$c1
 srwi $carry,$t0,16
 fmadd $T2a,$A1,$bc,$T2a
 fmadd $T2b,$A1,$bd,$T2b
 stfd $N0,40($nap_d) ; save n[j] in double format
 stfd $N1,48($nap_d)
 srwi $c1,$t1,16
 insrwi $carry,$t1,16,0
 fmadd $T3a,$A2,$bc,$T3a
 fmadd $T3b,$A2,$bd,$T3b
 addc $t2,$t2,$carry
 adde $t3,$t3,$c1
 srwi $carry,$t2,16
 fmul $dota,$A3,$bc
 fmul $dotb,$A3,$bd
 stfd $N2,56($nap_d) ; save n[j+1] in double format
 stfdu $N3,64($nap_d)
 insrwi $t0,$t2,16,0 ; 0..31 bits
 srwi $c1,$t3,16
 insrwi $carry,$t3,16,0

 fmadd $T1a,$N1,$na,$T1a
 fmadd $T1b,$N1,$nb,$T1b
 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
 addc $t4,$t4,$carry
 adde $t5,$t5,$c1
 srwi $carry,$t4,16
 fmadd $T2a,$N2,$na,$T2a
 fmadd $T2b,$N2,$nb,$T2b
 srwi $c1,$t5,16
 insrwi $carry,$t5,16,0
 fmadd $T3a,$N3,$na,$T3a
 fmadd $T3b,$N3,$nb,$T3b
 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 srwi $carry,$t6,16
 fmadd $T0a,$N0,$na,$T0a
 fmadd $T0b,$N0,$nb,$T0b
 insrwi $t4,$t6,16,0 ; 32..63 bits
 srwi $c1,$t7,16
 insrwi $carry,$t7,16,0

 fmadd $T1a,$N0,$nc,$T1a
 fmadd $T1b,$N0,$nd,$T1b
 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
 addc $t2,$t2,$carry
 adde $t3,$t3,$c1
 srwi $carry,$t2,16
 fmadd $T2a,$N1,$nc,$T2a
 fmadd $T2b,$N1,$nd,$T2b
 stw $t0,12($tp) ; tp[j-1]
 stw $t4,8($tp)
 srwi $c1,$t3,16
 insrwi $carry,$t3,16,0
 fmadd $T3a,$N2,$nc,$T3a
 fmadd $T3b,$N2,$nd,$T3b
 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 srwi $carry,$t6,16
 fmadd $dota,$N3,$nc,$dota
 fmadd $dotb,$N3,$nd,$dotb
 insrwi $t2,$t6,16,0 ; 64..95 bits
 srwi $c1,$t7,16
 insrwi $carry,$t7,16,0

 fctid $T0a,$T0a
 fctid $T0b,$T0b
 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
 addc $t0,$t0,$carry
 adde $t1,$t1,$c1
 srwi $carry,$t0,16
 fctid $T1a,$T1a
 fctid $T1b,$T1b
 srwi $c1,$t1,16
 insrwi $carry,$t1,16,0
 fctid $T2a,$T2a
 fctid $T2b,$T2b
 addc $t4,$t4,$carry
 adde $t5,$t5,$c1
 srwi $carry,$t4,16
 fctid $T3a,$T3a
 fctid $T3b,$T3b
 insrwi $t0,$t4,16,0 ; 96..127 bits
 srwi $c1,$t5,16
 insrwi $carry,$t5,16,0

 stfd $T0a,`$FRAME+0`($sp)
 stfd $T0b,`$FRAME+8`($sp)
 stfd $T1a,`$FRAME+16`($sp)
 stfd $T1b,`$FRAME+24`($sp)
 stfd $T2a,`$FRAME+32`($sp)
 stfd $T2b,`$FRAME+40`($sp)
 stfd $T3a,`$FRAME+48`($sp)
 stfd $T3b,`$FRAME+56`($sp)
 stw $t2,20($tp) ; tp[j]
 stwu $t0,16($tp)
___
}
$code.=<<___;
 bdnz L1st


 fctid $dota,$dota
 fctid $dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
 ld $t0,`$FRAME+0`($sp)
 ld $t1,`$FRAME+8`($sp)
 ld $t2,`$FRAME+16`($sp)
 ld $t3,`$FRAME+24`($sp)
 ld $t4,`$FRAME+32`($sp)
 ld $t5,`$FRAME+40`($sp)
 ld $t6,`$FRAME+48`($sp)
 ld $t7,`$FRAME+56`($sp)
 stfd $dota,`$FRAME+64`($sp)
 stfd $dotb,`$FRAME+72`($sp)

 add $t0,$t0,$carry ; can not overflow
 srdi $carry,$t0,16
 add $t1,$t1,$carry
 srdi $carry,$t1,16
 insrdi $t0,$t1,16,32
 add $t2,$t2,$carry
 srdi $carry,$t2,16
 insrdi $t0,$t2,16,16
 add $t3,$t3,$carry
 srdi $carry,$t3,16
 insrdi $t0,$t3,16,0 ; 0..63 bits
 add $t4,$t4,$carry
 srdi $carry,$t4,16
 add $t5,$t5,$carry
 srdi $carry,$t5,16
 insrdi $t4,$t5,16,32
 add $t6,$t6,$carry
 srdi $carry,$t6,16
 insrdi $t4,$t6,16,16
 add $t7,$t7,$carry
 insrdi $t4,$t7,16,0 ; 64..127 bits
 srdi $carry,$t7,16 ; upper 33 bits
 ld $t6,`$FRAME+64`($sp)
 ld $t7,`$FRAME+72`($sp)

 std $t0,8($tp) ; tp[j-1]
 stdu $t4,16($tp) ; tp[j]

 add $t6,$t6,$carry ; can not overflow
 srdi $carry,$t6,16
 add $t7,$t7,$carry
 insrdi $t6,$t7,48,0
 srdi $ovf,$t7,48
 std $t6,8($tp) ; tp[num-1]
___
} else {
$code.=<<___;
 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 stfd $dota,`$FRAME+64`($sp)
 stfd $dotb,`$FRAME+72`($sp)

 addc $t0,$t0,$carry
 adde $t1,$t1,$c1
 srwi $carry,$t0,16
 insrwi $carry,$t1,16,0
 srwi $c1,$t1,16
 addc $t2,$t2,$carry
 adde $t3,$t3,$c1
 srwi $carry,$t2,16
 insrwi $t0,$t2,16,0 ; 0..31 bits
 insrwi $carry,$t3,16,0
 srwi $c1,$t3,16
 addc $t4,$t4,$carry
 adde $t5,$t5,$c1
 srwi $carry,$t4,16
 insrwi $carry,$t5,16,0
 srwi $c1,$t5,16
 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 srwi $carry,$t6,16
 insrwi $t4,$t6,16,0 ; 32..63 bits
 insrwi $carry,$t7,16,0
 srwi $c1,$t7,16
 stw $t0,12($tp) ; tp[j-1]
 stw $t4,8($tp)

 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6

 addc $t2,$t2,$carry
 adde $t3,$t3,$c1
 srwi $carry,$t2,16
 insrwi $carry,$t3,16,0
 srwi $c1,$t3,16
 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 srwi $carry,$t6,16
 insrwi $t2,$t6,16,0 ; 64..95 bits
 insrwi $carry,$t7,16,0
 srwi $c1,$t7,16
 addc $t0,$t0,$carry
 adde $t1,$t1,$c1
 srwi $carry,$t0,16
 insrwi $carry,$t1,16,0
 srwi $c1,$t1,16
 addc $t4,$t4,$carry
 adde $t5,$t5,$c1
 srwi $carry,$t4,16
 insrwi $t0,$t4,16,0 ; 96..127 bits
 insrwi $carry,$t5,16,0
 srwi $c1,$t5,16
 stw $t2,20($tp) ; tp[j]
 stwu $t0,16($tp)

 lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
 lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
 lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
 lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 srwi $carry,$t6,16
 insrwi $carry,$t7,16,0
 srwi $c1,$t7,16
 addc $t4,$t4,$carry
 adde $t5,$t5,$c1

 insrwi $t6,$t4,16,0
 srwi $t4,$t4,16
 insrwi $t4,$t5,16,0
 srwi $ovf,$t5,16
 stw $t6,12($tp) ; tp[num-1]
 stw $t4,8($tp)
___
}
$code.=<<___;
 slwi $t7,$num,2
 subf $nap_d,$t7,$nap_d ; rewind pointer


 li $i,8 ; i=1
.align 5
Louter:
 addi $tp,$sp,`$FRAME+$TRANSFER`
 li $carry,0
 mtctr $j
___
$code.=<<___ if ($SIZE_T==8);
 ldx $t3,$bp,$i ; bp[i]

 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
 mulld $t7,$a0,$t3 ; ap[0]*bp[i]
 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
 ; transfer bp[i] to FPU as 4x16-bit values
 extrdi $t0,$t3,16,48
 extrdi $t1,$t3,16,32
 extrdi $t2,$t3,16,16
 extrdi $t3,$t3,16,0
 std $t0,`$FRAME+0`($sp)
 std $t1,`$FRAME+8`($sp)
 std $t2,`$FRAME+16`($sp)
 std $t3,`$FRAME+24`($sp)

 mulld $t7,$t7,$n0 ; tp[0]*n0
 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
 extrdi $t4,$t7,16,48
 extrdi $t5,$t7,16,32
 extrdi $t6,$t7,16,16
 extrdi $t7,$t7,16,0
 std $t4,`$FRAME+32`($sp)
 std $t5,`$FRAME+40`($sp)
 std $t6,`$FRAME+48`($sp)
 std $t7,`$FRAME+56`($sp)
___
$code.=<<___ if ($SIZE_T==4);
 add $t0,$bp,$i
 li $c1,0
 lwz $t1,0($t0) ; bp[i,i+1]
 lwz $t3,4($t0)

 mullw $t4,$a0,$t1 ; ap[0]*bp[i]
 lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0]
 mulhwu $t5,$a0,$t1
 lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
 mullw $t6,$a1,$t1
 mullw $t7,$a0,$t3
 add $t5,$t5,$t6
 add $t5,$t5,$t7
 addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0]
 adde $t5,$t5,$t2
 ; transfer bp[i] to FPU as 4x16-bit values
 extrwi $t0,$t1,16,16
 extrwi $t1,$t1,16,0
 extrwi $t2,$t3,16,16
 extrwi $t3,$t3,16,0
 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
 std $t1,`$FRAME+8`($sp)
 std $t2,`$FRAME+16`($sp)
 std $t3,`$FRAME+24`($sp)

 mullw $t0,$t4,$n0 ; mulld tp[0]*n0
 mulhwu $t1,$t4,$n0
 mullw $t2,$t5,$n0
 mullw $t3,$t4,$n1
 add $t1,$t1,$t2
 add $t1,$t1,$t3
 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
 extrwi $t4,$t0,16,16
 extrwi $t5,$t0,16,0
 extrwi $t6,$t1,16,16
 extrwi $t7,$t1,16,0
 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
 std $t5,`$FRAME+40`($sp)
 std $t6,`$FRAME+48`($sp)
 std $t7,`$FRAME+56`($sp)
___
$code.=<<___;
 lfd $A0,8($nap_d) ; load a[j] in double format
 lfd $A1,16($nap_d)
 lfd $A2,24($nap_d) ; load a[j+1] in double format
 lfd $A3,32($nap_d)
 lfd $N0,40($nap_d) ; load n[j] in double format
 lfd $N1,48($nap_d)
 lfd $N2,56($nap_d) ; load n[j+1] in double format
 lfdu $N3,64($nap_d)

 lfd $ba,`$FRAME+0`($sp)
 lfd $bb,`$FRAME+8`($sp)
 lfd $bc,`$FRAME+16`($sp)
 lfd $bd,`$FRAME+24`($sp)
 lfd $na,`$FRAME+32`($sp)
 lfd $nb,`$FRAME+40`($sp)
 lfd $nc,`$FRAME+48`($sp)
 lfd $nd,`$FRAME+56`($sp)

 fcfid $ba,$ba
 fcfid $bb,$bb
 fcfid $bc,$bc
 fcfid $bd,$bd
 fcfid $na,$na
 fcfid $nb,$nb
 fcfid $nc,$nc
 fcfid $nd,$nd

 fmul $T1a,$A1,$ba
 fmul $T1b,$A1,$bb
 fmul $T2a,$A2,$ba
 fmul $T2b,$A2,$bb
 fmul $T3a,$A3,$ba
 fmul $T3b,$A3,$bb
 fmul $T0a,$A0,$ba
 fmul $T0b,$A0,$bb

 fmadd $T1a,$A0,$bc,$T1a
 fmadd $T1b,$A0,$bd,$T1b
 fmadd $T2a,$A1,$bc,$T2a
 fmadd $T2b,$A1,$bd,$T2b
 fmadd $T3a,$A2,$bc,$T3a
 fmadd $T3b,$A2,$bd,$T3b
 fmul $dota,$A3,$bc
 fmul $dotb,$A3,$bd

 fmadd $T1a,$N1,$na,$T1a
 fmadd $T1b,$N1,$nb,$T1b
 lfd $A0,8($nap_d) ; load a[j] in double format
 lfd $A1,16($nap_d)
 fmadd $T2a,$N2,$na,$T2a
 fmadd $T2b,$N2,$nb,$T2b
 lfd $A2,24($nap_d) ; load a[j+1] in double format
 lfd $A3,32($nap_d)
 fmadd $T3a,$N3,$na,$T3a
 fmadd $T3b,$N3,$nb,$T3b
 fmadd $T0a,$N0,$na,$T0a
 fmadd $T0b,$N0,$nb,$T0b

 fmadd $T1a,$N0,$nc,$T1a
 fmadd $T1b,$N0,$nd,$T1b
 fmadd $T2a,$N1,$nc,$T2a
 fmadd $T2b,$N1,$nd,$T2b
 fmadd $T3a,$N2,$nc,$T3a
 fmadd $T3b,$N2,$nd,$T3b
 fmadd $dota,$N3,$nc,$dota
 fmadd $dotb,$N3,$nd,$dotb

 fctid $T0a,$T0a
 fctid $T0b,$T0b
 fctid $T1a,$T1a
 fctid $T1b,$T1b
 fctid $T2a,$T2a
 fctid $T2b,$T2b
 fctid $T3a,$T3a
 fctid $T3b,$T3b

 stfd $T0a,`$FRAME+0`($sp)
 stfd $T0b,`$FRAME+8`($sp)
 stfd $T1a,`$FRAME+16`($sp)
 stfd $T1b,`$FRAME+24`($sp)
 stfd $T2a,`$FRAME+32`($sp)
 stfd $T2b,`$FRAME+40`($sp)
 stfd $T3a,`$FRAME+48`($sp)
 stfd $T3b,`$FRAME+56`($sp)


.align 5
Linner:
 fmul $T1a,$A1,$ba
 fmul $T1b,$A1,$bb
 fmul $T2a,$A2,$ba
 fmul $T2b,$A2,$bb
 lfd $N0,40($nap_d) ; load n[j] in double format
 lfd $N1,48($nap_d)
 fmul $T3a,$A3,$ba
 fmul $T3b,$A3,$bb
 fmadd $T0a,$A0,$ba,$dota
 fmadd $T0b,$A0,$bb,$dotb
 lfd $N2,56($nap_d) ; load n[j+1] in double format
 lfdu $N3,64($nap_d)

 fmadd $T1a,$A0,$bc,$T1a
 fmadd $T1b,$A0,$bd,$T1b
 fmadd $T2a,$A1,$bc,$T2a
 fmadd $T2b,$A1,$bd,$T2b
 lfd $A0,8($nap_d) ; load a[j] in double format
 lfd $A1,16($nap_d)
 fmadd $T3a,$A2,$bc,$T3a
 fmadd $T3b,$A2,$bd,$T3b
 fmul $dota,$A3,$bc
 fmul $dotb,$A3,$bd
 lfd $A2,24($nap_d) ; load a[j+1] in double format
 lfd $A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
 fmadd $T1a,$N1,$na,$T1a
 fmadd $T1b,$N1,$nb,$T1b
 ld $t0,`$FRAME+0`($sp)
 ld $t1,`$FRAME+8`($sp)
 fmadd $T2a,$N2,$na,$T2a
 fmadd $T2b,$N2,$nb,$T2b
 ld $t2,`$FRAME+16`($sp)
 ld $t3,`$FRAME+24`($sp)
 fmadd $T3a,$N3,$na,$T3a
 fmadd $T3b,$N3,$nb,$T3b
 add $t0,$t0,$carry ; can not overflow
 ld $t4,`$FRAME+32`($sp)
 ld $t5,`$FRAME+40`($sp)
 fmadd $T0a,$N0,$na,$T0a
 fmadd $T0b,$N0,$nb,$T0b
 srdi $carry,$t0,16
 add $t1,$t1,$carry
 srdi $carry,$t1,16
 ld $t6,`$FRAME+48`($sp)
 ld $t7,`$FRAME+56`($sp)

 fmadd $T1a,$N0,$nc,$T1a
 fmadd $T1b,$N0,$nd,$T1b
 insrdi $t0,$t1,16,32
 ld $t1,8($tp) ; tp[j]
 fmadd $T2a,$N1,$nc,$T2a
 fmadd $T2b,$N1,$nd,$T2b
 add $t2,$t2,$carry
 fmadd $T3a,$N2,$nc,$T3a
 fmadd $T3b,$N2,$nd,$T3b
 srdi $carry,$t2,16
 insrdi $t0,$t2,16,16
 fmadd $dota,$N3,$nc,$dota
 fmadd $dotb,$N3,$nd,$dotb
 add $t3,$t3,$carry
 ldu $t2,16($tp) ; tp[j+1]
 srdi $carry,$t3,16
 insrdi $t0,$t3,16,0 ; 0..63 bits
 add $t4,$t4,$carry

 fctid $T0a,$T0a
 fctid $T0b,$T0b
 srdi $carry,$t4,16
 fctid $T1a,$T1a
 fctid $T1b,$T1b
 add $t5,$t5,$carry
 fctid $T2a,$T2a
 fctid $T2b,$T2b
 srdi $carry,$t5,16
 insrdi $t4,$t5,16,32
 fctid $T3a,$T3a
 fctid $T3b,$T3b
 add $t6,$t6,$carry
 srdi $carry,$t6,16
 insrdi $t4,$t6,16,16

 stfd $T0a,`$FRAME+0`($sp)
 stfd $T0b,`$FRAME+8`($sp)
 add $t7,$t7,$carry
 addc $t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
 extrdi $t0,$t0,32,0
 extrdi $t1,$t1,32,0
 adde $t0,$t0,$t1
___
$code.=<<___;
 stfd $T1a,`$FRAME+16`($sp)
 stfd $T1b,`$FRAME+24`($sp)
 insrdi $t4,$t7,16,0 ; 64..127 bits
 srdi $carry,$t7,16 ; upper 33 bits
 stfd $T2a,`$FRAME+32`($sp)
 stfd $T2b,`$FRAME+40`($sp)
 adde $t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
 extrdi $t4,$t4,32,0
 extrdi $t2,$t2,32,0
 adde $t4,$t4,$t2
___
$code.=<<___;
 stfd $T3a,`$FRAME+48`($sp)
 stfd $T3b,`$FRAME+56`($sp)
 addze $carry,$carry
 std $t3,-16($tp) ; tp[j-1]
 std $t5,-8($tp) ; tp[j]
___
} else {
$code.=<<___;
 fmadd $T1a,$N1,$na,$T1a
 fmadd $T1b,$N1,$nb,$T1b
 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
 fmadd $T2a,$N2,$na,$T2a
 fmadd $T2b,$N2,$nb,$T2b
 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
 fmadd $T3a,$N3,$na,$T3a
 fmadd $T3b,$N3,$nb,$T3b
 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
 addc $t0,$t0,$carry
 adde $t1,$t1,$c1
 srwi $carry,$t0,16
 fmadd $T0a,$N0,$na,$T0a
 fmadd $T0b,$N0,$nb,$T0b
 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 srwi $c1,$t1,16
 insrwi $carry,$t1,16,0

 fmadd $T1a,$N0,$nc,$T1a
 fmadd $T1b,$N0,$nd,$T1b
 addc $t2,$t2,$carry
 adde $t3,$t3,$c1
 srwi $carry,$t2,16
 fmadd $T2a,$N1,$nc,$T2a
 fmadd $T2b,$N1,$nd,$T2b
 insrwi $t0,$t2,16,0 ; 0..31 bits
 srwi $c1,$t3,16
 insrwi $carry,$t3,16,0
 fmadd $T3a,$N2,$nc,$T3a
 fmadd $T3b,$N2,$nd,$T3b
 lwz $t2,12($tp) ; tp[j]
 lwz $t3,8($tp)
 addc $t4,$t4,$carry
 adde $t5,$t5,$c1
 srwi $carry,$t4,16
 fmadd $dota,$N3,$nc,$dota
 fmadd $dotb,$N3,$nd,$dotb
 srwi $c1,$t5,16
 insrwi $carry,$t5,16,0

 fctid $T0a,$T0a
 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 srwi $carry,$t6,16
 fctid $T0b,$T0b
 insrwi $t4,$t6,16,0 ; 32..63 bits
 srwi $c1,$t7,16
 insrwi $carry,$t7,16,0
 fctid $T1a,$T1a
 addc $t0,$t0,$t2
 adde $t4,$t4,$t3
 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
 fctid $T1b,$T1b
 addze $carry,$carry
 addze $c1,$c1
 stw $t0,4($tp) ; tp[j-1]
 stw $t4,0($tp)
 fctid $T2a,$T2a
 addc $t2,$t2,$carry
 adde $t3,$t3,$c1
 srwi $carry,$t2,16
 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
 fctid $T2b,$T2b
 srwi $c1,$t3,16
 insrwi $carry,$t3,16,0
 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
 fctid $T3a,$T3a
 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 srwi $carry,$t6,16
 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
 fctid $T3b,$T3b

 insrwi $t2,$t6,16,0 ; 64..95 bits
 insrwi $carry,$t7,16,0
 srwi $c1,$t7,16
 lwz $t6,20($tp)
 lwzu $t7,16($tp)
 addc $t0,$t0,$carry
 stfd $T0a,`$FRAME+0`($sp)
 adde $t1,$t1,$c1
 srwi $carry,$t0,16
 stfd $T0b,`$FRAME+8`($sp)
 insrwi $carry,$t1,16,0
 srwi $c1,$t1,16
 addc $t4,$t4,$carry
 stfd $T1a,`$FRAME+16`($sp)
 adde $t5,$t5,$c1
 srwi $carry,$t4,16
 insrwi $t0,$t4,16,0 ; 96..127 bits
 stfd $T1b,`$FRAME+24`($sp)
 insrwi $carry,$t5,16,0
 srwi $c1,$t5,16

 addc $t2,$t2,$t6
 stfd $T2a,`$FRAME+32`($sp)
 adde $t0,$t0,$t7
 stfd $T2b,`$FRAME+40`($sp)
 addze $carry,$carry
 stfd $T3a,`$FRAME+48`($sp)
 addze $c1,$c1
 stfd $T3b,`$FRAME+56`($sp)
 stw $t2,-4($tp) ; tp[j]
 stw $t0,-8($tp)
___
}
$code.=<<___;
 bdnz Linner


 fctid $dota,$dota
 fctid $dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
 ld $t0,`$FRAME+0`($sp)
 ld $t1,`$FRAME+8`($sp)
 ld $t2,`$FRAME+16`($sp)
 ld $t3,`$FRAME+24`($sp)
 ld $t4,`$FRAME+32`($sp)
 ld $t5,`$FRAME+40`($sp)
 ld $t6,`$FRAME+48`($sp)
 ld $t7,`$FRAME+56`($sp)
 stfd $dota,`$FRAME+64`($sp)
 stfd $dotb,`$FRAME+72`($sp)

 add $t0,$t0,$carry ; can not overflow
 srdi $carry,$t0,16
 add $t1,$t1,$carry
 srdi $carry,$t1,16
 insrdi $t0,$t1,16,32
 add $t2,$t2,$carry
 ld $t1,8($tp) ; tp[j]
 srdi $carry,$t2,16
 insrdi $t0,$t2,16,16
 add $t3,$t3,$carry
 ldu $t2,16($tp) ; tp[j+1]
 srdi $carry,$t3,16
 insrdi $t0,$t3,16,0 ; 0..63 bits
 add $t4,$t4,$carry
 srdi $carry,$t4,16
 add $t5,$t5,$carry
 srdi $carry,$t5,16
 insrdi $t4,$t5,16,32
 add $t6,$t6,$carry
 srdi $carry,$t6,16
 insrdi $t4,$t6,16,16
 add $t7,$t7,$carry
 insrdi $t4,$t7,16,0 ; 64..127 bits
 srdi $carry,$t7,16 ; upper 33 bits
 ld $t6,`$FRAME+64`($sp)
 ld $t7,`$FRAME+72`($sp)

 addc $t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
 extrdi $t0,$t0,32,0
 extrdi $t1,$t1,32,0
 adde $t0,$t0,$t1
___
$code.=<<___;
 adde $t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
 extrdi $t4,$t4,32,0
 extrdi $t2,$t2,32,0
 adde $t4,$t4,$t2
___
$code.=<<___;
 addze $carry,$carry

 std $t3,-16($tp) ; tp[j-1]
 std $t5,-8($tp) ; tp[j]

 add $carry,$carry,$ovf ; consume upmost overflow
 add $t6,$t6,$carry ; can not overflow
 srdi $carry,$t6,16
 add $t7,$t7,$carry
 insrdi $t6,$t7,48,0
 srdi $ovf,$t7,48
 std $t6,0($tp) ; tp[num-1]
___
} else {
$code.=<<___;
 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
 stfd $dota,`$FRAME+64`($sp)
 stfd $dotb,`$FRAME+72`($sp)

 addc $t0,$t0,$carry
 adde $t1,$t1,$c1
 srwi $carry,$t0,16
 insrwi $carry,$t1,16,0
 srwi $c1,$t1,16
 addc $t2,$t2,$carry
 adde $t3,$t3,$c1
 srwi $carry,$t2,16
 insrwi $t0,$t2,16,0 ; 0..31 bits
 lwz $t2,12($tp) ; tp[j]
 insrwi $carry,$t3,16,0
 srwi $c1,$t3,16
 lwz $t3,8($tp)
 addc $t4,$t4,$carry
 adde $t5,$t5,$c1
 srwi $carry,$t4,16
 insrwi $carry,$t5,16,0
 srwi $c1,$t5,16
 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 srwi $carry,$t6,16
 insrwi $t4,$t6,16,0 ; 32..63 bits
 insrwi $carry,$t7,16,0
 srwi $c1,$t7,16

 addc $t0,$t0,$t2
 adde $t4,$t4,$t3
 addze $carry,$carry
 addze $c1,$c1
 stw $t0,4($tp) ; tp[j-1]
 stw $t4,0($tp)

 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6

 addc $t2,$t2,$carry
 adde $t3,$t3,$c1
 srwi $carry,$t2,16
 insrwi $carry,$t3,16,0
 srwi $c1,$t3,16
 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 srwi $carry,$t6,16
 insrwi $t2,$t6,16,0 ; 64..95 bits
 lwz $t6,20($tp)
 insrwi $carry,$t7,16,0
 srwi $c1,$t7,16
 lwzu $t7,16($tp)
 addc $t0,$t0,$carry
 adde $t1,$t1,$c1
 srwi $carry,$t0,16
 insrwi $carry,$t1,16,0
 srwi $c1,$t1,16
 addc $t4,$t4,$carry
 adde $t5,$t5,$c1
 srwi $carry,$t4,16
 insrwi $t0,$t4,16,0 ; 96..127 bits
 insrwi $carry,$t5,16,0
 srwi $c1,$t5,16

 addc $t2,$t2,$t6
 adde $t0,$t0,$t7
 lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
 lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
 addze $carry,$carry
 addze $c1,$c1
 lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
 lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

 addc $t6,$t6,$carry
 adde $t7,$t7,$c1
 stw $t2,-4($tp) ; tp[j]
 stw $t0,-8($tp)
 addc $t6,$t6,$ovf
 addze $t7,$t7
 srwi $carry,$t6,16
 insrwi $carry,$t7,16,0
 srwi $c1,$t7,16
 addc $t4,$t4,$carry
 adde $t5,$t5,$c1

 insrwi $t6,$t4,16,0
 srwi $t4,$t4,16
 insrwi $t4,$t5,16,0
 srwi $ovf,$t5,16
 stw $t6,4($tp) ; tp[num-1]
 stw $t4,0($tp)
___
}
$code.=<<___;
 slwi $t7,$num,2
 addi $i,$i,8
 subf $nap_d,$t7,$nap_d ; rewind pointer
 cmpw $i,$num
 blt- Louter
___


$code.=<<___ if ($SIZE_T==8);
 subf $np,$num,$np ; rewind np
 addi $j,$j,1 ; restore counter
 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
 addi $tp,$sp,`$FRAME+$TRANSFER+8`
 addi $t4,$sp,`$FRAME+$TRANSFER+16`
 addi $t5,$np,8
 addi $t6,$rp,8
 mtctr $j

.align 4
Lsub: ldx $t0,$tp,$i
 ldx $t1,$np,$i
 ldx $t2,$t4,$i
 ldx $t3,$t5,$i
 subfe $t0,$t1,$t0 ; tp[j]-np[j]
 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
 stdx $t0,$rp,$i
 stdx $t2,$t6,$i
 addi $i,$i,16
 bdnz Lsub

 li $i,0
 subfe $ovf,$i,$ovf ; handle upmost overflow bit
 mtctr $j

.align 4
Lcopy: ; conditional copy
 ldx $t0,$tp,$i
 ldx $t1,$t4,$i
 ldx $t2,$rp,$i
 ldx $t3,$t6,$i
 std $i,8($nap_d) ; zap nap_d
 std $i,16($nap_d)
 std $i,24($nap_d)
 std $i,32($nap_d)
 std $i,40($nap_d)
 std $i,48($nap_d)
 std $i,56($nap_d)
 stdu $i,64($nap_d)
 and $t0,$t0,$ovf
 and $t1,$t1,$ovf
 andc $t2,$t2,$ovf
 andc $t3,$t3,$ovf
 or $t0,$t0,$t2
 or $t1,$t1,$t3
 stdx $t0,$rp,$i
 stdx $t1,$t6,$i
 stdx $i,$tp,$i ; zap tp at once
 stdx $i,$t4,$i
 addi $i,$i,16
 bdnz Lcopy
___
$code.=<<___ if ($SIZE_T==4);
 subf $np,$num,$np ; rewind np
 addi $j,$j,1 ; restore counter
 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
 addi $tp,$sp,`$FRAME+$TRANSFER`
 addi $np,$np,-4
 addi $rp,$rp,-4
 addi $ap,$sp,`$FRAME+$TRANSFER+4`
 mtctr $j

.align 4
Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order
 lwz $t1,8($tp)
 lwz $t2,20($tp)
 lwzu $t3,16($tp)
 lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
 lwz $t5,8($np)
 lwz $t6,12($np)
 lwzu $t7,16($np)
 subfe $t4,$t4,$t0 ; tp[j]-np[j]
 stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
 subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
 stw $t1,8($ap)
 subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
 stw $t2,12($ap)
 subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
 stwu $t3,16($ap)
 stw $t4,4($rp)
 stw $t5,8($rp)
 stw $t6,12($rp)
 stwu $t7,16($rp)
 bdnz Lsub

 li $i,0
 subfe $ovf,$i,$ovf ; handle upmost overflow bit
 addi $ap,$sp,`$FRAME+$TRANSFER+4`
 subf $rp,$num,$rp ; rewind rp
 addi $tp,$sp,`$FRAME+$TRANSFER`
 mtctr $j

.align 4
Lcopy: ; conditional copy
 lwz $t0,4($ap)
 lwz $t1,8($ap)
 lwz $t2,12($ap)
 lwzu $t3,16($ap)
 lwz $t4,4($rp)
 lwz $t5,8($rp)
 lwz $t6,12($rp)
 lwz $t7,16($rp)
 std $i,8($nap_d) ; zap nap_d
 std $i,16($nap_d)
 std $i,24($nap_d)
 std $i,32($nap_d)
 std $i,40($nap_d)
 std $i,48($nap_d)
 std $i,56($nap_d)
 stdu $i,64($nap_d)
 and $t0,$t0,$ovf
 and $t1,$t1,$ovf
 and $t2,$t2,$ovf
 and $t3,$t3,$ovf
 andc $t4,$t4,$ovf
 andc $t5,$t5,$ovf
 andc $t6,$t6,$ovf
 andc $t7,$t7,$ovf
 or $t0,$t0,$t4
 or $t1,$t1,$t5
 or $t2,$t2,$t6
 or $t3,$t3,$t7
 stw $t0,4($rp)
 stw $t1,8($rp)
 stw $t2,12($rp)
 stwu $t3,16($rp)
 std $i,8($tp) ; zap tp at once
 stdu $i,16($tp)
 bdnz Lcopy
___


$code.=<<___;
 $POP $i,0($sp)
 li r3,1 ; signal "handled"
 $POP r19,`-12*8-13*$SIZE_T`($i)
 $POP r20,`-12*8-12*$SIZE_T`($i)
 $POP r21,`-12*8-11*$SIZE_T`($i)
 $POP r22,`-12*8-10*$SIZE_T`($i)
 $POP r23,`-12*8-9*$SIZE_T`($i)
 $POP r24,`-12*8-8*$SIZE_T`($i)
 $POP r25,`-12*8-7*$SIZE_T`($i)
 $POP r26,`-12*8-6*$SIZE_T`($i)
 $POP r27,`-12*8-5*$SIZE_T`($i)
 $POP r28,`-12*8-4*$SIZE_T`($i)
 $POP r29,`-12*8-3*$SIZE_T`($i)
 $POP r30,`-12*8-2*$SIZE_T`($i)
 $POP r31,`-12*8-1*$SIZE_T`($i)
 lfd f20,`-12*8`($i)
 lfd f21,`-11*8`($i)
 lfd f22,`-10*8`($i)
 lfd f23,`-9*8`($i)
 lfd f24,`-8*8`($i)
 lfd f25,`-7*8`($i)
 lfd f26,`-6*8`($i)
 lfd f27,`-5*8`($i)
 lfd f28,`-4*8`($i)
 lfd f29,`-3*8`($i)
 lfd f30,`-2*8`($i)
 lfd f31,`-1*8`($i)
 mr $sp,$i
 blr
 .long 0
 .byte 0,12,4,0,0x8c,13,6,0
 .long 0
.size .$fname,.-.$fname

.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";