VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.3/crypto/bn/asm/mips.pl@ 96662

Last change on this file since 96662 was 94082, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

File size: 48.5 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project.
13#
14# Rights for redistribution and usage in source and binary forms are
15# granted according to the License. Warranty of any kind is disclaimed.
16# ====================================================================
17
18
19# July 1999
20#
21# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
22#
23# The module is designed to work with either of the "new" MIPS ABI(5),
24# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
25# IRIX 5.x not only because it doesn't support new ABIs but also
26# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
27# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
28# cause illegal instruction exception:-(
29#
30# In addition the code depends on preprocessor flags set up by MIPSpro
31# compiler driver (either as or cc) and therefore (probably?) can't be
32# compiled by the GNU assembler. GNU C driver manages fine though...
33# I mean as long as -mmips-as is specified or is the default option,
34# because then it simply invokes /usr/bin/as which in turn takes
35# perfect care of the preprocessor definitions. Another neat feature
36# offered by the MIPSpro assembler is an optimization pass. This gave
37# me the opportunity to have the code looking more regular as all those
38# architecture dependent instruction rescheduling details were left to
39# the assembler. Cool, huh?
40#
41# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
42# goes way over 3 times faster!
43#
44# <[email protected]>
45
46# October 2010
47#
48# Adapt the module even for 32-bit ABIs and other OSes. The former was
49# achieved by mechanical replacement of 64-bit arithmetic instructions
50# such as dmultu, daddu, etc. with their 32-bit counterparts and
51# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
52# >3x performance improvement naturally does not apply to 32-bit code
53# [because there is no instruction 32-bit compiler can't use], one
54# has to content with 40-85% improvement depending on benchmark and
55# key length, more for longer keys.
56
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";

# Pick instruction mnemonics and word size for the target ABI: 64-bit
# flavours (n32/n64) use the doubleword forms and an 8-byte BN_ULONG,
# everything else uses the 32-bit forms and a 4-byte BN_ULONG.
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;		# sizeof(BN_ULONG)
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;		# size of one register save slot
	$REG_S="sd";
	$REG_L="ld";
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	$code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set mips2\n#endif\n";
}

$output and open STDOUT,">$output";

# Below is N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).

$gp=$v1 if ($flavour =~ /nubi/i);

# Register holding the constant -4, used to mask loop counters down to a
# multiple of four for the unrolled loops below.
$minus4=$v1;
112
# Preamble (arch macros, ident strings) plus bn_mul_add_words(rp,ap,num,w):
# rp[i] += ap[i]*w for i in [0,num), final carry word returned in $v0.
# The mfqt/mfrm macros paper over the R6 removal of HI/LO registers.
$code.=<<___;
#include "mips_arch.h"

#if defined(_MIPS_ARCH_MIPS64R6)
# define ddivu(rs,rt)
# define mfqt(rd,rs,rt)	ddivu	rd,rs,rt
# define mfrm(rd,rs,rt)	dmodu	rd,rs,rt
#elif defined(_MIPS_ARCH_MIPS32R6)
# define divu(rs,rt)
# define mfqt(rd,rs,rt)	divu	rd,rs,rt
# define mfrm(rd,rs,rt)	modu	rd,rs,rt
#else
# define $DIVU(rs,rt)	$DIVU	$zero,rs,rt
# define mfqt(rd,rs,rt)	mflo	rd
# define mfrm(rd,rs,rt)	mfhi	rd
#endif

.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
# NUBI flavours must preserve $v1 and $t0..$t3: allocate a small frame and
# save them (plus $ra and $gp) on entry.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: four words per iteration with the multiplier overlapped
# against carry propagation, then a <4-word tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	$MULTU	($t2,$a3)
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	$MULTU	($ta0,$a3)
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	$MULTU	($ta2,$a3)
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at


	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_add_words_loop
	$ADDU	$v0,$at

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the saved registers and pop the frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return (carry in $v0, mirrored into $a0 for old IRIX o32 conventions),
# then the bn_mul_words entry point.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
# NUBI prologue for bn_mul_words_internal: save $ra, $t0..$t3, $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_mul_words(rp,ap,num,w): rp[i] = ap[i]*w + carry, carry returned in
# $v0. Same unrolled-by-4 structure as bn_mul_add_words but without the
# read-modify-write of rp[].
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$MULTU	($t2,$a3)
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	$MULTU	($ta0,$a3)
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	$MULTU	($ta2,$a3)
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_words_loop
	$ADDU	$v0,$ta3,$ta2

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return, then the bn_sqr_words entry point.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
# NUBI prologue for bn_sqr_words_internal.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sqr_words(rp,ap,num): rp[2i] = lo(ap[i]^2), rp[2i+1] = hi(ap[i]^2).
# Note the output pointer advances twice as fast as the input pointer
# (8*$BNSZ vs 4*$BNSZ per unrolled iteration). No carries are involved.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	($t2,$t2)
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($t3,$t2,$t2)
	mfhi	($t2,$t2,$t2)
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

	$MULTU	($ta0,$ta0)
	mflo	($ta1,$ta0,$ta0)
	mfhi	($ta0,$ta0,$ta0)
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)


	$MULTU	($ta2,$ta2)
	and	$ta0,$a2,$minus4
	mflo	($ta3,$ta2,$ta2)
	mfhi	($ta2,$ta2,$ta2)
	$ST	$ta3,-2*$BNSZ($a0)

	.set	noreorder
	bgtz	$ta0,.L_bn_sqr_words_loop
	$ST	$ta2,-$BNSZ($a0)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$t0)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return, then the bn_add_words entry point (note: word count is in $a3
# here, since $a2 is the second source array).
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
# NUBI prologue for bn_add_words_internal.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_add_words(rp,ap,bp,num): rp[i] = ap[i]+bp[i]+carry; the final carry
# (0 or 1) is returned in $v0. Carry is detected via two sltu compares
# per word since MIPS has no carry flag.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_add_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_add_words_return:
	.set	noreorder
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return, then the bn_sub_words entry point.
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
# NUBI prologue for bn_sub_words_internal.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sub_words(rp,ap,bp,num): rp[i] = ap[i]-bp[i]-borrow; the final
# borrow (0 or 1) is returned in $v0. Borrow-out is computed with
# sltu/sgtu pairs, mirroring the carry handling in bn_add_words.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9


	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_sub_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return from bn_sub_words, followed by the historical bn_div_3_words,
# which is compiled out with #if 0 (a constant-time interface replaced it).
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

#if 0
/*
 * The bn_div_3_words entry point is re-used for constant-time interface.
 * Implementation is retained as historical reference.
 */
.align	5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
# NUBI prologue for bn_div_3_words_internal (dead code under #if 0 above).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body of the historical bn_div_3_words: one call into
# bn_div_words_internal to get an initial quotient estimate, then an
# inner loop that decrements the estimate until it fits.
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words_internal
	move	$ra,$ta3
	$MULTU	($ta2,$v0)
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	($t1,$ta2,$v0)
	mflo	($t0,$ta2,$v0)
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqz	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	$ADDU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# End of the #if 0 block; live code resumes with bn_div_words(h,l,d),
# which divides the double word h:l by d and returns the quotient in $v0.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal
#endif

.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
# NUBI prologue for bn_div_words_internal.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Normalize the divisor: count its leading zero bits into $t9 and shift
# both the divisor ($a2) and the dividend ($a0:$a1) left by that amount;
# 'break 6' traps if the quotient would overflow one word.
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	beqz	$t2,.+12
	nop
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
# Perl-level register aliases used only inside the division body:
# $QT = current quotient estimate, $HH = high half of the running
# remainder, $DH = high half of the normalized divisor.
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
# Two-step schoolbook division: each half of the quotient is estimated
# with one hardware divide and then corrected downward in an inner loop.
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	beqz	$at,.+12
	nop
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div1:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div2:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9	# restore $a2

	.set	noreorder
	move	$a1,$v1
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
undef $HH; undef $QT; undef $DH;

# Register assignments for the comba (column-oriented) multiplication and
# squaring routines that follow: a[0..7]/b[0..7] operand words, two
# product temporaries and the rotating three-word accumulator c1..c3.
($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1);	# once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2);	# once we load b[7], no use for $a2

($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1045
# bn_mul_comba8 entry: the comba code needs callee-saved $s0..$s5 in
# addition to the temporaries, so both ABI variants set up a frame here.
$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
# NUBI: save $s0..$s5 plus the usual $ra, $t0..$t3 and $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Other ABIs only need to preserve the callee-saved $s0..$s5.
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
1082$code.=<<___;
1083
1084 .set reorder
1085 $LD $a_0,0($a1) # If compiled with -mips3 option on
1086 # R5000 box assembler barks on this
1087 # 1ine with "should not have mult/div
1088 # as last instruction in bb (R10K
1089 # bug)" warning. If anybody out there
1090 # has a clue about how to circumvent
1091 # this do send me a note.
1092 # <appro\@fy.chalmers.se>
1093
1094 $LD $b_0,0($a2)
1095 $LD $a_1,$BNSZ($a1)
1096 $LD $a_2,2*$BNSZ($a1)
1097 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1098 $LD $a_3,3*$BNSZ($a1)
1099 $LD $b_1,$BNSZ($a2)
1100 $LD $b_2,2*$BNSZ($a2)
1101 $LD $b_3,3*$BNSZ($a2)
1102 mflo ($c_1,$a_0,$b_0)
1103 mfhi ($c_2,$a_0,$b_0)
1104
1105 $LD $a_4,4*$BNSZ($a1)
1106 $LD $a_5,5*$BNSZ($a1)
1107 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1108 $LD $a_6,6*$BNSZ($a1)
1109 $LD $a_7,7*$BNSZ($a1)
1110 $LD $b_4,4*$BNSZ($a2)
1111 $LD $b_5,5*$BNSZ($a2)
1112 mflo ($t_1,$a_0,$b_1)
1113 mfhi ($t_2,$a_0,$b_1)
1114 $ADDU $c_2,$t_1
1115 sltu $at,$c_2,$t_1
1116 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1117 $ADDU $c_3,$t_2,$at
1118 $LD $b_6,6*$BNSZ($a2)
1119 $LD $b_7,7*$BNSZ($a2)
1120 $ST $c_1,0($a0) # r[0]=c1;
1121 mflo ($t_1,$a_1,$b_0)
1122 mfhi ($t_2,$a_1,$b_0)
1123 $ADDU $c_2,$t_1
1124 sltu $at,$c_2,$t_1
1125 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1126 $ADDU $t_2,$at
1127 $ADDU $c_3,$t_2
1128 sltu $c_1,$c_3,$t_2
1129 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1130
1131 mflo ($t_1,$a_2,$b_0)
1132 mfhi ($t_2,$a_2,$b_0)
1133 $ADDU $c_3,$t_1
1134 sltu $at,$c_3,$t_1
1135 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1136 $ADDU $t_2,$at
1137 $ADDU $c_1,$t_2
1138 mflo ($t_1,$a_1,$b_1)
1139 mfhi ($t_2,$a_1,$b_1)
1140 $ADDU $c_3,$t_1
1141 sltu $at,$c_3,$t_1
1142 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1143 $ADDU $t_2,$at
1144 $ADDU $c_1,$t_2
1145 sltu $c_2,$c_1,$t_2
1146 mflo ($t_1,$a_0,$b_2)
1147 mfhi ($t_2,$a_0,$b_2)
1148 $ADDU $c_3,$t_1
1149 sltu $at,$c_3,$t_1
1150 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1151 $ADDU $t_2,$at
1152 $ADDU $c_1,$t_2
1153 sltu $at,$c_1,$t_2
1154 $ADDU $c_2,$at
1155 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1156
1157 mflo ($t_1,$a_0,$b_3)
1158 mfhi ($t_2,$a_0,$b_3)
1159 $ADDU $c_1,$t_1
1160 sltu $at,$c_1,$t_1
1161 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1162 $ADDU $t_2,$at
1163 $ADDU $c_2,$t_2
1164 sltu $c_3,$c_2,$t_2
1165 mflo ($t_1,$a_1,$b_2)
1166 mfhi ($t_2,$a_1,$b_2)
1167 $ADDU $c_1,$t_1
1168 sltu $at,$c_1,$t_1
1169 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1170 $ADDU $t_2,$at
1171 $ADDU $c_2,$t_2
1172 sltu $at,$c_2,$t_2
1173 $ADDU $c_3,$at
1174 mflo ($t_1,$a_2,$b_1)
1175 mfhi ($t_2,$a_2,$b_1)
1176 $ADDU $c_1,$t_1
1177 sltu $at,$c_1,$t_1
1178 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1179 $ADDU $t_2,$at
1180 $ADDU $c_2,$t_2
1181 sltu $at,$c_2,$t_2
1182 $ADDU $c_3,$at
1183 mflo ($t_1,$a_3,$b_0)
1184 mfhi ($t_2,$a_3,$b_0)
1185 $ADDU $c_1,$t_1
1186 sltu $at,$c_1,$t_1
1187 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
1188 $ADDU $t_2,$at
1189 $ADDU $c_2,$t_2
1190 sltu $at,$c_2,$t_2
1191 $ADDU $c_3,$at
1192 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1193
1194 mflo ($t_1,$a_4,$b_0)
1195 mfhi ($t_2,$a_4,$b_0)
1196 $ADDU $c_2,$t_1
1197 sltu $at,$c_2,$t_1
1198 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1199 $ADDU $t_2,$at
1200 $ADDU $c_3,$t_2
1201 sltu $c_1,$c_3,$t_2
1202 mflo ($t_1,$a_3,$b_1)
1203 mfhi ($t_2,$a_3,$b_1)
1204 $ADDU $c_2,$t_1
1205 sltu $at,$c_2,$t_1
1206 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1207 $ADDU $t_2,$at
1208 $ADDU $c_3,$t_2
1209 sltu $at,$c_3,$t_2
1210 $ADDU $c_1,$at
1211 mflo ($t_1,$a_2,$b_2)
1212 mfhi ($t_2,$a_2,$b_2)
1213 $ADDU $c_2,$t_1
1214 sltu $at,$c_2,$t_1
1215 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1216 $ADDU $t_2,$at
1217 $ADDU $c_3,$t_2
1218 sltu $at,$c_3,$t_2
1219 $ADDU $c_1,$at
1220 mflo ($t_1,$a_1,$b_3)
1221 mfhi ($t_2,$a_1,$b_3)
1222 $ADDU $c_2,$t_1
1223 sltu $at,$c_2,$t_1
1224 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
1225 $ADDU $t_2,$at
1226 $ADDU $c_3,$t_2
1227 sltu $at,$c_3,$t_2
1228 $ADDU $c_1,$at
1229 mflo ($t_1,$a_0,$b_4)
1230 mfhi ($t_2,$a_0,$b_4)
1231 $ADDU $c_2,$t_1
1232 sltu $at,$c_2,$t_1
1233 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
1234 $ADDU $t_2,$at
1235 $ADDU $c_3,$t_2
1236 sltu $at,$c_3,$t_2
1237 $ADDU $c_1,$at
1238 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1239
1240 mflo ($t_1,$a_0,$b_5)
1241 mfhi ($t_2,$a_0,$b_5)
1242 $ADDU $c_3,$t_1
1243 sltu $at,$c_3,$t_1
1244 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
1245 $ADDU $t_2,$at
1246 $ADDU $c_1,$t_2
1247 sltu $c_2,$c_1,$t_2
1248 mflo ($t_1,$a_1,$b_4)
1249 mfhi ($t_2,$a_1,$b_4)
1250 $ADDU $c_3,$t_1
1251 sltu $at,$c_3,$t_1
1252 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1253 $ADDU $t_2,$at
1254 $ADDU $c_1,$t_2
1255 sltu $at,$c_1,$t_2
1256 $ADDU $c_2,$at
1257 mflo ($t_1,$a_2,$b_3)
1258 mfhi ($t_2,$a_2,$b_3)
1259 $ADDU $c_3,$t_1
1260 sltu $at,$c_3,$t_1
1261 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1262 $ADDU $t_2,$at
1263 $ADDU $c_1,$t_2
1264 sltu $at,$c_1,$t_2
1265 $ADDU $c_2,$at
1266 mflo ($t_1,$a_3,$b_2)
1267 mfhi ($t_2,$a_3,$b_2)
1268 $ADDU $c_3,$t_1
1269 sltu $at,$c_3,$t_1
1270 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
1271 $ADDU $t_2,$at
1272 $ADDU $c_1,$t_2
1273 sltu $at,$c_1,$t_2
1274 $ADDU $c_2,$at
1275 mflo ($t_1,$a_4,$b_1)
1276 mfhi ($t_2,$a_4,$b_1)
1277 $ADDU $c_3,$t_1
1278 sltu $at,$c_3,$t_1
1279 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
1280 $ADDU $t_2,$at
1281 $ADDU $c_1,$t_2
1282 sltu $at,$c_1,$t_2
1283 $ADDU $c_2,$at
1284 mflo ($t_1,$a_5,$b_0)
1285 mfhi ($t_2,$a_5,$b_0)
1286 $ADDU $c_3,$t_1
1287 sltu $at,$c_3,$t_1
1288 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
1289 $ADDU $t_2,$at
1290 $ADDU $c_1,$t_2
1291 sltu $at,$c_1,$t_2
1292 $ADDU $c_2,$at
1293 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1294
1295 mflo ($t_1,$a_6,$b_0)
1296 mfhi ($t_2,$a_6,$b_0)
1297 $ADDU $c_1,$t_1
1298 sltu $at,$c_1,$t_1
1299 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
1300 $ADDU $t_2,$at
1301 $ADDU $c_2,$t_2
1302 sltu $c_3,$c_2,$t_2
1303 mflo ($t_1,$a_5,$b_1)
1304 mfhi ($t_2,$a_5,$b_1)
1305 $ADDU $c_1,$t_1
1306 sltu $at,$c_1,$t_1
1307 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
1308 $ADDU $t_2,$at
1309 $ADDU $c_2,$t_2
1310 sltu $at,$c_2,$t_2
1311 $ADDU $c_3,$at
1312 mflo ($t_1,$a_4,$b_2)
1313 mfhi ($t_2,$a_4,$b_2)
1314 $ADDU $c_1,$t_1
1315 sltu $at,$c_1,$t_1
1316 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1317 $ADDU $t_2,$at
1318 $ADDU $c_2,$t_2
1319 sltu $at,$c_2,$t_2
1320 $ADDU $c_3,$at
1321 mflo ($t_1,$a_3,$b_3)
1322 mfhi ($t_2,$a_3,$b_3)
1323 $ADDU $c_1,$t_1
1324 sltu $at,$c_1,$t_1
1325 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
1326 $ADDU $t_2,$at
1327 $ADDU $c_2,$t_2
1328 sltu $at,$c_2,$t_2
1329 $ADDU $c_3,$at
1330 mflo ($t_1,$a_2,$b_4)
1331 mfhi ($t_2,$a_2,$b_4)
1332 $ADDU $c_1,$t_1
1333 sltu $at,$c_1,$t_1
1334 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
1335 $ADDU $t_2,$at
1336 $ADDU $c_2,$t_2
1337 sltu $at,$c_2,$t_2
1338 $ADDU $c_3,$at
1339 mflo ($t_1,$a_1,$b_5)
1340 mfhi ($t_2,$a_1,$b_5)
1341 $ADDU $c_1,$t_1
1342 sltu $at,$c_1,$t_1
1343 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
1344 $ADDU $t_2,$at
1345 $ADDU $c_2,$t_2
1346 sltu $at,$c_2,$t_2
1347 $ADDU $c_3,$at
1348 mflo ($t_1,$a_0,$b_6)
1349 mfhi ($t_2,$a_0,$b_6)
1350 $ADDU $c_1,$t_1
1351 sltu $at,$c_1,$t_1
1352 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
1353 $ADDU $t_2,$at
1354 $ADDU $c_2,$t_2
1355 sltu $at,$c_2,$t_2
1356 $ADDU $c_3,$at
1357 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1358
1359 mflo ($t_1,$a_0,$b_7)
1360 mfhi ($t_2,$a_0,$b_7)
1361 $ADDU $c_2,$t_1
1362 sltu $at,$c_2,$t_1
1363 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
1364 $ADDU $t_2,$at
1365 $ADDU $c_3,$t_2
1366 sltu $c_1,$c_3,$t_2
1367 mflo ($t_1,$a_1,$b_6)
1368 mfhi ($t_2,$a_1,$b_6)
1369 $ADDU $c_2,$t_1
1370 sltu $at,$c_2,$t_1
1371 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
1372 $ADDU $t_2,$at
1373 $ADDU $c_3,$t_2
1374 sltu $at,$c_3,$t_2
1375 $ADDU $c_1,$at
1376 mflo ($t_1,$a_2,$b_5)
1377 mfhi ($t_2,$a_2,$b_5)
1378 $ADDU $c_2,$t_1
1379 sltu $at,$c_2,$t_1
1380 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
1381 $ADDU $t_2,$at
1382 $ADDU $c_3,$t_2
1383 sltu $at,$c_3,$t_2
1384 $ADDU $c_1,$at
1385 mflo ($t_1,$a_3,$b_4)
1386 mfhi ($t_2,$a_3,$b_4)
1387 $ADDU $c_2,$t_1
1388 sltu $at,$c_2,$t_1
1389 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
1390 $ADDU $t_2,$at
1391 $ADDU $c_3,$t_2
1392 sltu $at,$c_3,$t_2
1393 $ADDU $c_1,$at
1394 mflo ($t_1,$a_4,$b_3)
1395 mfhi ($t_2,$a_4,$b_3)
1396 $ADDU $c_2,$t_1
1397 sltu $at,$c_2,$t_1
1398 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
1399 $ADDU $t_2,$at
1400 $ADDU $c_3,$t_2
1401 sltu $at,$c_3,$t_2
1402 $ADDU $c_1,$at
1403 mflo ($t_1,$a_5,$b_2)
1404 mfhi ($t_2,$a_5,$b_2)
1405 $ADDU $c_2,$t_1
1406 sltu $at,$c_2,$t_1
1407 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
1408 $ADDU $t_2,$at
1409 $ADDU $c_3,$t_2
1410 sltu $at,$c_3,$t_2
1411 $ADDU $c_1,$at
1412 mflo ($t_1,$a_6,$b_1)
1413 mfhi ($t_2,$a_6,$b_1)
1414 $ADDU $c_2,$t_1
1415 sltu $at,$c_2,$t_1
1416 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
1417 $ADDU $t_2,$at
1418 $ADDU $c_3,$t_2
1419 sltu $at,$c_3,$t_2
1420 $ADDU $c_1,$at
1421 mflo ($t_1,$a_7,$b_0)
1422 mfhi ($t_2,$a_7,$b_0)
1423 $ADDU $c_2,$t_1
1424 sltu $at,$c_2,$t_1
1425 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
1426 $ADDU $t_2,$at
1427 $ADDU $c_3,$t_2
1428 sltu $at,$c_3,$t_2
1429 $ADDU $c_1,$at
1430 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1431
1432 mflo ($t_1,$a_7,$b_1)
1433 mfhi ($t_2,$a_7,$b_1)
1434 $ADDU $c_3,$t_1
1435 sltu $at,$c_3,$t_1
1436 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
1437 $ADDU $t_2,$at
1438 $ADDU $c_1,$t_2
1439 sltu $c_2,$c_1,$t_2
1440 mflo ($t_1,$a_6,$b_2)
1441 mfhi ($t_2,$a_6,$b_2)
1442 $ADDU $c_3,$t_1
1443 sltu $at,$c_3,$t_1
1444 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
1445 $ADDU $t_2,$at
1446 $ADDU $c_1,$t_2
1447 sltu $at,$c_1,$t_2
1448 $ADDU $c_2,$at
1449 mflo ($t_1,$a_5,$b_3)
1450 mfhi ($t_2,$a_5,$b_3)
1451 $ADDU $c_3,$t_1
1452 sltu $at,$c_3,$t_1
1453 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
1454 $ADDU $t_2,$at
1455 $ADDU $c_1,$t_2
1456 sltu $at,$c_1,$t_2
1457 $ADDU $c_2,$at
1458 mflo ($t_1,$a_4,$b_4)
1459 mfhi ($t_2,$a_4,$b_4)
1460 $ADDU $c_3,$t_1
1461 sltu $at,$c_3,$t_1
1462 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
1463 $ADDU $t_2,$at
1464 $ADDU $c_1,$t_2
1465 sltu $at,$c_1,$t_2
1466 $ADDU $c_2,$at
1467 mflo ($t_1,$a_3,$b_5)
1468 mfhi ($t_2,$a_3,$b_5)
1469 $ADDU $c_3,$t_1
1470 sltu $at,$c_3,$t_1
1471 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
1472 $ADDU $t_2,$at
1473 $ADDU $c_1,$t_2
1474 sltu $at,$c_1,$t_2
1475 $ADDU $c_2,$at
1476 mflo ($t_1,$a_2,$b_6)
1477 mfhi ($t_2,$a_2,$b_6)
1478 $ADDU $c_3,$t_1
1479 sltu $at,$c_3,$t_1
1480 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
1481 $ADDU $t_2,$at
1482 $ADDU $c_1,$t_2
1483 sltu $at,$c_1,$t_2
1484 $ADDU $c_2,$at
1485 mflo ($t_1,$a_1,$b_7)
1486 mfhi ($t_2,$a_1,$b_7)
1487 $ADDU $c_3,$t_1
1488 sltu $at,$c_3,$t_1
1489 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
1490 $ADDU $t_2,$at
1491 $ADDU $c_1,$t_2
1492 sltu $at,$c_1,$t_2
1493 $ADDU $c_2,$at
1494 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1495
1496 mflo ($t_1,$a_2,$b_7)
1497 mfhi ($t_2,$a_2,$b_7)
1498 $ADDU $c_1,$t_1
1499 sltu $at,$c_1,$t_1
1500 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
1501 $ADDU $t_2,$at
1502 $ADDU $c_2,$t_2
1503 sltu $c_3,$c_2,$t_2
1504 mflo ($t_1,$a_3,$b_6)
1505 mfhi ($t_2,$a_3,$b_6)
1506 $ADDU $c_1,$t_1
1507 sltu $at,$c_1,$t_1
1508 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
1509 $ADDU $t_2,$at
1510 $ADDU $c_2,$t_2
1511 sltu $at,$c_2,$t_2
1512 $ADDU $c_3,$at
1513 mflo ($t_1,$a_4,$b_5)
1514 mfhi ($t_2,$a_4,$b_5)
1515 $ADDU $c_1,$t_1
1516 sltu $at,$c_1,$t_1
1517 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
1518 $ADDU $t_2,$at
1519 $ADDU $c_2,$t_2
1520 sltu $at,$c_2,$t_2
1521 $ADDU $c_3,$at
1522 mflo ($t_1,$a_5,$b_4)
1523 mfhi ($t_2,$a_5,$b_4)
1524 $ADDU $c_1,$t_1
1525 sltu $at,$c_1,$t_1
1526 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
1527 $ADDU $t_2,$at
1528 $ADDU $c_2,$t_2
1529 sltu $at,$c_2,$t_2
1530 $ADDU $c_3,$at
1531 mflo ($t_1,$a_6,$b_3)
1532 mfhi ($t_2,$a_6,$b_3)
1533 $ADDU $c_1,$t_1
1534 sltu $at,$c_1,$t_1
1535 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
1536 $ADDU $t_2,$at
1537 $ADDU $c_2,$t_2
1538 sltu $at,$c_2,$t_2
1539 $ADDU $c_3,$at
1540 mflo ($t_1,$a_7,$b_2)
1541 mfhi ($t_2,$a_7,$b_2)
1542 $ADDU $c_1,$t_1
1543 sltu $at,$c_1,$t_1
1544 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
1545 $ADDU $t_2,$at
1546 $ADDU $c_2,$t_2
1547 sltu $at,$c_2,$t_2
1548 $ADDU $c_3,$at
1549 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1550
1551 mflo ($t_1,$a_7,$b_3)
1552 mfhi ($t_2,$a_7,$b_3)
1553 $ADDU $c_2,$t_1
1554 sltu $at,$c_2,$t_1
1555 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
1556 $ADDU $t_2,$at
1557 $ADDU $c_3,$t_2
1558 sltu $c_1,$c_3,$t_2
1559 mflo ($t_1,$a_6,$b_4)
1560 mfhi ($t_2,$a_6,$b_4)
1561 $ADDU $c_2,$t_1
1562 sltu $at,$c_2,$t_1
1563 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
1564 $ADDU $t_2,$at
1565 $ADDU $c_3,$t_2
1566 sltu $at,$c_3,$t_2
1567 $ADDU $c_1,$at
1568 mflo ($t_1,$a_5,$b_5)
1569 mfhi ($t_2,$a_5,$b_5)
1570 $ADDU $c_2,$t_1
1571 sltu $at,$c_2,$t_1
1572 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
1573 $ADDU $t_2,$at
1574 $ADDU $c_3,$t_2
1575 sltu $at,$c_3,$t_2
1576 $ADDU $c_1,$at
1577 mflo ($t_1,$a_4,$b_6)
1578 mfhi ($t_2,$a_4,$b_6)
1579 $ADDU $c_2,$t_1
1580 sltu $at,$c_2,$t_1
1581 $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
1582 $ADDU $t_2,$at
1583 $ADDU $c_3,$t_2
1584 sltu $at,$c_3,$t_2
1585 $ADDU $c_1,$at
1586 mflo ($t_1,$a_3,$b_7)
1587 mfhi ($t_2,$a_3,$b_7)
1588 $ADDU $c_2,$t_1
1589 sltu $at,$c_2,$t_1
1590 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
1591 $ADDU $t_2,$at
1592 $ADDU $c_3,$t_2
1593 sltu $at,$c_3,$t_2
1594 $ADDU $c_1,$at
1595 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1596
1597 mflo ($t_1,$a_4,$b_7)
1598 mfhi ($t_2,$a_4,$b_7)
1599 $ADDU $c_3,$t_1
1600 sltu $at,$c_3,$t_1
1601 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
1602 $ADDU $t_2,$at
1603 $ADDU $c_1,$t_2
1604 sltu $c_2,$c_1,$t_2
1605 mflo ($t_1,$a_5,$b_6)
1606 mfhi ($t_2,$a_5,$b_6)
1607 $ADDU $c_3,$t_1
1608 sltu $at,$c_3,$t_1
1609 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
1610 $ADDU $t_2,$at
1611 $ADDU $c_1,$t_2
1612 sltu $at,$c_1,$t_2
1613 $ADDU $c_2,$at
1614 mflo ($t_1,$a_6,$b_5)
1615 mfhi ($t_2,$a_6,$b_5)
1616 $ADDU $c_3,$t_1
1617 sltu $at,$c_3,$t_1
1618 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
1619 $ADDU $t_2,$at
1620 $ADDU $c_1,$t_2
1621 sltu $at,$c_1,$t_2
1622 $ADDU $c_2,$at
1623 mflo ($t_1,$a_7,$b_4)
1624 mfhi ($t_2,$a_7,$b_4)
1625 $ADDU $c_3,$t_1
1626 sltu $at,$c_3,$t_1
1627 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
1628 $ADDU $t_2,$at
1629 $ADDU $c_1,$t_2
1630 sltu $at,$c_1,$t_2
1631 $ADDU $c_2,$at
1632 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1633
1634 mflo ($t_1,$a_7,$b_5)
1635 mfhi ($t_2,$a_7,$b_5)
1636 $ADDU $c_1,$t_1
1637 sltu $at,$c_1,$t_1
1638 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
1639 $ADDU $t_2,$at
1640 $ADDU $c_2,$t_2
1641 sltu $c_3,$c_2,$t_2
1642 mflo ($t_1,$a_6,$b_6)
1643 mfhi ($t_2,$a_6,$b_6)
1644 $ADDU $c_1,$t_1
1645 sltu $at,$c_1,$t_1
1646 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
1647 $ADDU $t_2,$at
1648 $ADDU $c_2,$t_2
1649 sltu $at,$c_2,$t_2
1650 $ADDU $c_3,$at
1651 mflo ($t_1,$a_5,$b_7)
1652 mfhi ($t_2,$a_5,$b_7)
1653 $ADDU $c_1,$t_1
1654 sltu $at,$c_1,$t_1
1655 $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
1656 $ADDU $t_2,$at
1657 $ADDU $c_2,$t_2
1658 sltu $at,$c_2,$t_2
1659 $ADDU $c_3,$at
1660 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1661
1662 mflo ($t_1,$a_6,$b_7)
1663 mfhi ($t_2,$a_6,$b_7)
1664 $ADDU $c_2,$t_1
1665 sltu $at,$c_2,$t_1
1666 $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
1667 $ADDU $t_2,$at
1668 $ADDU $c_3,$t_2
1669 sltu $c_1,$c_3,$t_2
1670 mflo ($t_1,$a_7,$b_6)
1671 mfhi ($t_2,$a_7,$b_6)
1672 $ADDU $c_2,$t_1
1673 sltu $at,$c_2,$t_1
1674 $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
1675 $ADDU $t_2,$at
1676 $ADDU $c_3,$t_2
1677 sltu $at,$c_3,$t_2
1678 $ADDU $c_1,$at
1679 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1680
1681 mflo ($t_1,$a_7,$b_7)
1682 mfhi ($t_2,$a_7,$b_7)
1683 $ADDU $c_3,$t_1
1684 sltu $at,$c_3,$t_1
1685 $ADDU $t_2,$at
1686 $ADDU $c_1,$t_2
1687 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1688 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1689
1690 .set noreorder
1691___
# --- bn_mul_comba8 epilogue ---
# NUBI flavour: restore the full callee-saved set ($s0-$s5 plus $t0-$t3,$gp,
# which NUBI treats as saved) and return, popping the 12-register frame in
# the jr delay slot.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
# Non-NUBI ABIs saved only $s0-$s5: pop the 6-register frame in the delay slot.
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# Close bn_mul_comba8 and open bn_mul_comba4:
# r[0..7] = a[0..3] * b[0..3], comba (column-wise) multiplication.
$code.=<<___;
.end bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
# NUBI prologue: allocate a 6-register frame and save $ra,$t0-$t3,$gp
# (registers the NUBI calling convention requires callees to preserve).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body of bn_mul_comba4: fully unrolled 4x4 comba multiplication.
# ($c_1,$c_2,$c_3) is a rotating 3-word column accumulator; each
# product hi:lo from mflo/mfhi is folded in with explicit sltu carry
# collection (MIPS has no add-with-carry), and the next $MULTU is
# issued early so the multiplier latency is overlapped with the adds.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	($c_1,$a_0,$b_0)
	mfhi	($c_2,$a_0,$b_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$b_1)
	mfhi	($t_2,$a_0,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	($t_1,$a_1,$b_0)
	mfhi	($t_2,$a_1,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	($t_1,$a_2,$b_0)
	mfhi	($t_2,$a_2,$b_0)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	($t_1,$a_1,$b_1)
	mfhi	($t_2,$a_1,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_0,$b_2)
	mfhi	($t_2,$a_0,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	($t_1,$a_0,$b_3)
	mfhi	($t_2,$a_0,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_1,$b_2)
	mfhi	($t_2,$a_1,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_2,$b_1)
	mfhi	($t_2,$a_2,$b_1)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_3,$b_0)
	mfhi	($t_2,$a_3,$b_0)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_1)
	mfhi	($t_2,$a_3,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_2,$b_2)
	mfhi	($t_2,$a_2,$b_2)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_1,$b_3)
	mfhi	($t_2,$a_1,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	($t_1,$a_2,$b_3)
	mfhi	($t_2,$a_2,$b_3)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_3,$b_2)
	mfhi	($t_2,$a_3,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_3)
	mfhi	($t_2,$a_3,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# NUBI epilogue: restore saved temporaries and pop the frame ($ra itself
# is not reloaded here; it was not clobbered by the straight-line body).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end bn_mul_comba4
___
1903
1904($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1905
# add_c2: emit the assembly for one mul_add_c2() step of the squaring
# routines, i.e. accumulate 2*($hi:$lo) -- the doubled cross product
# a[i]*a[j], i!=j -- into the rotating three-word column accumulator
# ($c0,$c1,$c2).  The doubling is done by adding $lo (and $hi) twice,
# with every addition followed by an explicit sltu carry collection,
# since MIPS has no add-with-carry instruction.  While the carries are
# being folded, the *next* product $an*$bn is started on the multiplier
# ($MULTU) so its latency is hidden; it is fetched at the end with
# mflo/mfhi back into ($lo,$hi) for the following call.
# Note: $at, $lo and $hi are clobbered as scratch during the carry chain.
sub add_c2 () {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,      # !$warm denotes first call with specific sequence of
                # $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn     # these two are arguments for multiplication which
                # result is used in *next* step [which is why it's
                # commented as "forward multiplication" below];
    )=@_;
$code.=<<___;
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	$MULTU	($an,$bn)		# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
$code.=<<___ if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
___
$code.=<<___ if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
___
$code.=<<___;
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
	mflo	($lo,$an,$bn)
	mfhi	($hi,$an,$bn)
___
}
1940
# bn_sqr_comba8: r[0..15] = a[0..7]^2, fully unrolled comba squaring.
# Diagonal products a[i]^2 are added once (open-coded below); off-diagonal
# products are added twice via add_c2(), which also pipelines the next
# $MULTU behind the carry arithmetic.
$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
# NUBI prologue: save $ra,$t0-$t3,$gp in a 6-register frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# r[0] and r[1]: a[0]^2, then 2*a[0]*a[1] doubled in-line with $SLL
# (the slt against $zero captures each top bit shifted out).
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_0,$a_5)		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_5)
	mfhi	($t_2,$a_0,$a_5)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_0,$a_7)		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_7)
	mfhi	($t_2,$a_0,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_2,$a_7)		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_7)
	mfhi	($t_2,$a_2,$a_7)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_4,$a_7)		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)
	mflo	($t_1,$a_4,$a_7)
	mfhi	($t_2,$a_4,$a_7)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_6,$a_7)		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)
	mflo	($t_1,$a_6,$a_7)
	mfhi	($t_2,$a_6,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
# Final columns r[13..15]; a[7]^2 needs no forward multiplication.
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
# NUBI epilogue: restore saved temporaries and pop the frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Close bn_sqr_comba8 and open bn_sqr_comba4: r[0..7] = a[0..3]^2.
$code.=<<___;
	jr	$ra
	nop
.end bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
# NUBI prologue for bn_sqr_comba4: save $ra,$t0-$t3,$gp in a 6-register frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body of bn_sqr_comba4: same scheme as bn_sqr_comba8, 4-word operand.
# Diagonal a[i]^2 terms are added once, cross products doubled via
# add_c2() or the in-line $SLL/slt sequence for the first column.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$a_3)		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_3)
	mfhi	($t_2,$a_2,$a_3)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
# Final columns r[5..7]; a[3]^2 is the last product, no forward multiply.
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# NUBI epilogue: restore saved temporaries and pop the frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end bn_sqr_comba4
___
# Emit the accumulated assembly text and verify the write reached its
# destination: a failed close (e.g. full disk, broken pipe) must abort
# the build rather than silently produce a truncated .s file.
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette