VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/bn/asm/mips.pl@ 94081

Last change on this file since 94081 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

File size: 48.2 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project.
13#
14# Rights for redistribution and usage in source and binary forms are
15# granted according to the OpenSSL license. Warranty of any kind is
16# disclaimed.
17# ====================================================================
18
19
20# July 1999
21#
22# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
23#
24# The module is designed to work with either of the "new" MIPS ABI(5),
25# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
26# IRIX 5.x not only because it doesn't support new ABIs but also
27# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
28# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
29# cause illegal instruction exception:-(
30#
31# In addition the code depends on preprocessor flags set up by MIPSpro
32# compiler driver (either as or cc) and therefore (probably?) can't be
33# compiled by the GNU assembler. GNU C driver manages fine though...
34# I mean as long as -mmips-as is specified or is the default option,
35# because then it simply invokes /usr/bin/as which in turn takes
36# perfect care of the preprocessor definitions. Another neat feature
37# offered by the MIPSpro assembler is an optimization pass. This gave
38# me the opportunity to have the code looking more regular as all those
39# architecture dependent instruction rescheduling details were left to
40# the assembler. Cool, huh?
41#
42# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
43# goes way over 3 times faster!
44#
45# <[email protected]>
46
47# October 2010
48#
49# Adapt the module even for 32-bit ABIs and other OSes. The former was
50# achieved by mechanical replacement of 64-bit arithmetic instructions
51# such as dmultu, daddu, etc. with their 32-bit counterparts and
52# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
53# >3x performance improvement naturally does not apply to 32-bit code
54# [because there is no instruction 32-bit compiler can't use], one
55# has to content with 40-85% improvement depending on benchmark and
56# key length, more for longer keys.
57
# Pick the ABI flavour (o32 by default) and scan the remaining arguments
# for the first one that looks like an output file name.
$flavour = shift || "o32";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the output file only when one was actually given;
# die on failure instead of silently generating no output (the original
# 2-arg unchecked open would continue with a broken STDOUT).
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

if ($flavour =~ /64|n32/i) {
	# 64-bit ABIs (N32/N64): doubleword memory ops and 64-bit ALU ops.
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;		# sizeof(BN_ULONG) in bytes
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;		# size of one register save slot
	$REG_S="sd";
	$REG_L="ld";
} else {
	# 32-bit ABIs: word-sized counterparts of the above.
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	# Pre-R6 32-bit assembly needs at least the MIPS II ISA for the
	# multiply/divide forms used below.
	$code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set mips2\n#endif\n";
}

# Below is N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).

$gp=$v1 if ($flavour =~ /nubi/i);

$minus4=$v1;		# scratch register holding the constant -4 mask
# Common prologue: mips_arch.h plus division helper macros.  On MIPS R6
# the hi/lo registers are gone, so mfqt/mfrm expand to the R6 three-
# operand divide/modulo forms; pre-R6 they expand to mflo/mfhi.
# Then bn_mul_add_words entry: returns 0 immediately unless num > 0.
$code.=<<___;
#include "mips_arch.h"

#if defined(_MIPS_ARCH_MIPS64R6)
# define ddivu(rs,rt)
# define mfqt(rd,rs,rt)	ddivu	rd,rs,rt
# define mfrm(rd,rs,rt)	dmodu	rd,rs,rt
#elif defined(_MIPS_ARCH_MIPS32R6)
# define divu(rs,rt)
# define mfqt(rd,rs,rt)	divu	rd,rs,rt
# define mfrm(rd,rs,rt)	modu	rd,rs,rt
#else
# define $DIVU(rs,rt)	$DIVU	$zero,rs,rt
# define mfqt(rd,rs,rt)	mflo	rd
# define mfrm(rd,rs,rt)	mfhi	rd
#endif

.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
# NUBI flavour must preserve $v1 (aliased to $gp) and $t0..$t3: save
# them together with $ra on a 6-slot stack frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: rp[i] += ap[i]*w, four words per iteration with the carry
# chained through $v0, followed by a 1..3-word tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	$MULTU	($t2,$a3)
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	$MULTU	($ta0,$a3)
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	$MULTU	($ta2,$a3)
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at


	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_add_words_loop
	$ADDU	$v0,$at

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the saved registers and pop the frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return the final carry in $v0; then emit the bn_mul_words entry stub
# (returns 0 immediately unless num > 0).
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
# NUBI prologue: save $ra, $t0..$t3 and $gp (=$v1); other flavours need
# no stack frame here.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# rp[i] = ap[i]*w: four words per iteration, carry chained through $v0,
# then a 1..3-word tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$MULTU	($t2,$a3)
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	$MULTU	($ta0,$a3)
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	$MULTU	($ta2,$a3)
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_words_loop
	$ADDU	$v0,$ta3,$ta2

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
# NUBI epilogue: restore saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return carry in $v0; then the bn_sqr_words entry stub.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
# NUBI prologue: save $ra, $t0..$t3 and $gp (=$v1).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# r[2i],r[2i+1] = a[i]^2: each input word yields a lo/hi pair, so the
# output pointer advances twice as fast as the input pointer.  No carry
# propagation is needed between words.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	($t2,$t2)
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($t3,$t2,$t2)
	mfhi	($t2,$t2,$t2)
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

	$MULTU	($ta0,$ta0)
	mflo	($ta1,$ta0,$ta0)
	mfhi	($ta0,$ta0,$ta0)
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)


	$MULTU	($ta2,$ta2)
	and	$ta0,$a2,$minus4
	mflo	($ta3,$ta2,$ta2)
	mfhi	($ta2,$ta2,$ta2)
	$ST	$ta3,-2*$BNSZ($a0)

	.set	noreorder
	bgtz	$ta0,.L_bn_sqr_words_loop
	$ST	$ta2,-$BNSZ($a0)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$t0)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
# NUBI epilogue: restore saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return; then the bn_add_words entry stub (note: word count is in $a3
# here, since add/sub take two input arrays).
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
# NUBI prologue: save $ra, $t0..$t3 and $gp (=$v1).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# r[i] = a[i]+b[i] with carry in/out in $v0: per word, carry-out is the
# OR of the two sltu results ($t8/$t9 and the sum-with-carry compare).
# Four words per iteration, then a 1..3-word tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_add_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_add_words_return:
	.set	noreorder
___
# NUBI epilogue: restore saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return carry in $v0; then the bn_sub_words entry stub.
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
# NUBI prologue: save $ra, $t0..$t3 and $gp (=$v1).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# r[i] = a[i]-b[i] with borrow in/out in $v0: per word, borrow-out is
# the sum of the raw-subtract compare ($t8/$t9) and the sgtu check on
# the borrow-propagating subtract.  Four words per iteration plus tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9


	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_sub_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
# NUBI epilogue: restore saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return borrow in $v0.  The legacy bn_div_3_words implementation below
# is compiled out via "#if 0" (superseded by the constant-time C
# interface) but retained as reference.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

#if 0
/*
 * The bn_div_3_words entry point is re-used for constant-time interface.
 * Implementation is retained as historical reference.
 */
.align	5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
# NUBI prologue for the (compiled-out) bn_div_3_words_internal.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body: call bn_div_words_internal for an initial quotient estimate
# (return address stashed in $ta3, no stack needed), then correct the
# estimate downwards in the inner loop.  Still inside "#if 0".
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words_internal
	move	$ra,$ta3
	$MULTU	($ta2,$v0)
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	($t1,$ta2,$v0)
	mflo	($t0,$ta2,$v0)
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqz	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	$ADDU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
# NUBI epilogue: restore saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Close the "#if 0" region; then the bn_div_words entry stub, which
# returns -1 on a zero divisor instead of trapping.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal
#endif

.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
# NUBI prologue: save $ra, $t0..$t3 and $gp (=$v1).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Normalization: count leading zeros of the divisor into $t9 by
# shifting left until the sign bit is set, shift divisor and dividend
# accordingly, and "break 6" if high dividend bits would be lost.
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	beqz	$t2,.+12
	nop
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
# Register aliases for the main division body: quotient digit, high
# half of the partial remainder, high half of the divisor.
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
# Schoolbook two-digit division in half-words: estimate each quotient
# half with a hardware divide of the high halves, then correct the
# estimate downwards in inner_loop1/inner_loop2.
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	beqz	$at,.+12
	nop
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div1:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div2:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9	# restore $a2

	.set	noreorder
	move	$a1,$v1
___
# NUBI epilogue: restore saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return quotient in $v0 (remainder left in $a1/$v1).
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
undef $HH; undef $QT; undef $DH;

# Register assignments for the comba (column-wise) multiplication and
# squaring routines that follow.
($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1);	# once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2);	# once we load b[7], no use for $a2

($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1043
1044$code.=<<___;
1045
1046.align 5
1047.globl bn_mul_comba8
1048.ent bn_mul_comba8
1049bn_mul_comba8:
1050 .set noreorder
1051___
1052$code.=<<___ if ($flavour =~ /nubi/i);
1053 .frame $sp,12*$SZREG,$ra
1054 .mask 0x803ff008,-$SZREG
1055 $PTR_SUB $sp,12*$SZREG
1056 $REG_S $ra,11*$SZREG($sp)
1057 $REG_S $s5,10*$SZREG($sp)
1058 $REG_S $s4,9*$SZREG($sp)
1059 $REG_S $s3,8*$SZREG($sp)
1060 $REG_S $s2,7*$SZREG($sp)
1061 $REG_S $s1,6*$SZREG($sp)
1062 $REG_S $s0,5*$SZREG($sp)
1063 $REG_S $t3,4*$SZREG($sp)
1064 $REG_S $t2,3*$SZREG($sp)
1065 $REG_S $t1,2*$SZREG($sp)
1066 $REG_S $t0,1*$SZREG($sp)
1067 $REG_S $gp,0*$SZREG($sp)
1068___
1069$code.=<<___ if ($flavour !~ /nubi/i);
1070 .frame $sp,6*$SZREG,$ra
1071 .mask 0x003f0000,-$SZREG
1072 $PTR_SUB $sp,6*$SZREG
1073 $REG_S $s5,5*$SZREG($sp)
1074 $REG_S $s4,4*$SZREG($sp)
1075 $REG_S $s3,3*$SZREG($sp)
1076 $REG_S $s2,2*$SZREG($sp)
1077 $REG_S $s1,1*$SZREG($sp)
1078 $REG_S $s0,0*$SZREG($sp)
1079___
1080$code.=<<___;
1081
1082 .set reorder
1083 $LD $a_0,0($a1) # If compiled with -mips3 option on
1084 # R5000 box assembler barks on this
1085 # 1ine with "should not have mult/div
1086 # as last instruction in bb (R10K
1087 # bug)" warning. If anybody out there
1088 # has a clue about how to circumvent
1089 # this do send me a note.
1090 # <appro\@fy.chalmers.se>
1091
1092 $LD $b_0,0($a2)
1093 $LD $a_1,$BNSZ($a1)
1094 $LD $a_2,2*$BNSZ($a1)
1095 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1096 $LD $a_3,3*$BNSZ($a1)
1097 $LD $b_1,$BNSZ($a2)
1098 $LD $b_2,2*$BNSZ($a2)
1099 $LD $b_3,3*$BNSZ($a2)
1100 mflo ($c_1,$a_0,$b_0)
1101 mfhi ($c_2,$a_0,$b_0)
1102
1103 $LD $a_4,4*$BNSZ($a1)
1104 $LD $a_5,5*$BNSZ($a1)
1105 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1106 $LD $a_6,6*$BNSZ($a1)
1107 $LD $a_7,7*$BNSZ($a1)
1108 $LD $b_4,4*$BNSZ($a2)
1109 $LD $b_5,5*$BNSZ($a2)
1110 mflo ($t_1,$a_0,$b_1)
1111 mfhi ($t_2,$a_0,$b_1)
1112 $ADDU $c_2,$t_1
1113 sltu $at,$c_2,$t_1
1114 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1115 $ADDU $c_3,$t_2,$at
1116 $LD $b_6,6*$BNSZ($a2)
1117 $LD $b_7,7*$BNSZ($a2)
1118 $ST $c_1,0($a0) # r[0]=c1;
1119 mflo ($t_1,$a_1,$b_0)
1120 mfhi ($t_2,$a_1,$b_0)
1121 $ADDU $c_2,$t_1
1122 sltu $at,$c_2,$t_1
1123 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1124 $ADDU $t_2,$at
1125 $ADDU $c_3,$t_2
1126 sltu $c_1,$c_3,$t_2
1127 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1128
1129 mflo ($t_1,$a_2,$b_0)
1130 mfhi ($t_2,$a_2,$b_0)
1131 $ADDU $c_3,$t_1
1132 sltu $at,$c_3,$t_1
1133 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1134 $ADDU $t_2,$at
1135 $ADDU $c_1,$t_2
1136 mflo ($t_1,$a_1,$b_1)
1137 mfhi ($t_2,$a_1,$b_1)
1138 $ADDU $c_3,$t_1
1139 sltu $at,$c_3,$t_1
1140 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1141 $ADDU $t_2,$at
1142 $ADDU $c_1,$t_2
1143 sltu $c_2,$c_1,$t_2
1144 mflo ($t_1,$a_0,$b_2)
1145 mfhi ($t_2,$a_0,$b_2)
1146 $ADDU $c_3,$t_1
1147 sltu $at,$c_3,$t_1
1148 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1149 $ADDU $t_2,$at
1150 $ADDU $c_1,$t_2
1151 sltu $at,$c_1,$t_2
1152 $ADDU $c_2,$at
1153 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1154
1155 mflo ($t_1,$a_0,$b_3)
1156 mfhi ($t_2,$a_0,$b_3)
1157 $ADDU $c_1,$t_1
1158 sltu $at,$c_1,$t_1
1159 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1160 $ADDU $t_2,$at
1161 $ADDU $c_2,$t_2
1162 sltu $c_3,$c_2,$t_2
1163 mflo ($t_1,$a_1,$b_2)
1164 mfhi ($t_2,$a_1,$b_2)
1165 $ADDU $c_1,$t_1
1166 sltu $at,$c_1,$t_1
1167 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1168 $ADDU $t_2,$at
1169 $ADDU $c_2,$t_2
1170 sltu $at,$c_2,$t_2
1171 $ADDU $c_3,$at
1172 mflo ($t_1,$a_2,$b_1)
1173 mfhi ($t_2,$a_2,$b_1)
1174 $ADDU $c_1,$t_1
1175 sltu $at,$c_1,$t_1
1176 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1177 $ADDU $t_2,$at
1178 $ADDU $c_2,$t_2
1179 sltu $at,$c_2,$t_2
1180 $ADDU $c_3,$at
1181 mflo ($t_1,$a_3,$b_0)
1182 mfhi ($t_2,$a_3,$b_0)
1183 $ADDU $c_1,$t_1
1184 sltu $at,$c_1,$t_1
1185 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
1186 $ADDU $t_2,$at
1187 $ADDU $c_2,$t_2
1188 sltu $at,$c_2,$t_2
1189 $ADDU $c_3,$at
1190 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1191
1192 mflo ($t_1,$a_4,$b_0)
1193 mfhi ($t_2,$a_4,$b_0)
1194 $ADDU $c_2,$t_1
1195 sltu $at,$c_2,$t_1
1196 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1197 $ADDU $t_2,$at
1198 $ADDU $c_3,$t_2
1199 sltu $c_1,$c_3,$t_2
1200 mflo ($t_1,$a_3,$b_1)
1201 mfhi ($t_2,$a_3,$b_1)
1202 $ADDU $c_2,$t_1
1203 sltu $at,$c_2,$t_1
1204 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1205 $ADDU $t_2,$at
1206 $ADDU $c_3,$t_2
1207 sltu $at,$c_3,$t_2
1208 $ADDU $c_1,$at
1209 mflo ($t_1,$a_2,$b_2)
1210 mfhi ($t_2,$a_2,$b_2)
1211 $ADDU $c_2,$t_1
1212 sltu $at,$c_2,$t_1
1213 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1214 $ADDU $t_2,$at
1215 $ADDU $c_3,$t_2
1216 sltu $at,$c_3,$t_2
1217 $ADDU $c_1,$at
1218 mflo ($t_1,$a_1,$b_3)
1219 mfhi ($t_2,$a_1,$b_3)
1220 $ADDU $c_2,$t_1
1221 sltu $at,$c_2,$t_1
1222 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
1223 $ADDU $t_2,$at
1224 $ADDU $c_3,$t_2
1225 sltu $at,$c_3,$t_2
1226 $ADDU $c_1,$at
1227 mflo ($t_1,$a_0,$b_4)
1228 mfhi ($t_2,$a_0,$b_4)
1229 $ADDU $c_2,$t_1
1230 sltu $at,$c_2,$t_1
1231 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
1232 $ADDU $t_2,$at
1233 $ADDU $c_3,$t_2
1234 sltu $at,$c_3,$t_2
1235 $ADDU $c_1,$at
1236 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1237
1238 mflo ($t_1,$a_0,$b_5)
1239 mfhi ($t_2,$a_0,$b_5)
1240 $ADDU $c_3,$t_1
1241 sltu $at,$c_3,$t_1
1242 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
1243 $ADDU $t_2,$at
1244 $ADDU $c_1,$t_2
1245 sltu $c_2,$c_1,$t_2
1246 mflo ($t_1,$a_1,$b_4)
1247 mfhi ($t_2,$a_1,$b_4)
1248 $ADDU $c_3,$t_1
1249 sltu $at,$c_3,$t_1
1250 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1251 $ADDU $t_2,$at
1252 $ADDU $c_1,$t_2
1253 sltu $at,$c_1,$t_2
1254 $ADDU $c_2,$at
1255 mflo ($t_1,$a_2,$b_3)
1256 mfhi ($t_2,$a_2,$b_3)
1257 $ADDU $c_3,$t_1
1258 sltu $at,$c_3,$t_1
1259 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1260 $ADDU $t_2,$at
1261 $ADDU $c_1,$t_2
1262 sltu $at,$c_1,$t_2
1263 $ADDU $c_2,$at
1264 mflo ($t_1,$a_3,$b_2)
1265 mfhi ($t_2,$a_3,$b_2)
1266 $ADDU $c_3,$t_1
1267 sltu $at,$c_3,$t_1
1268 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
1269 $ADDU $t_2,$at
1270 $ADDU $c_1,$t_2
1271 sltu $at,$c_1,$t_2
1272 $ADDU $c_2,$at
1273 mflo ($t_1,$a_4,$b_1)
1274 mfhi ($t_2,$a_4,$b_1)
1275 $ADDU $c_3,$t_1
1276 sltu $at,$c_3,$t_1
1277 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
1278 $ADDU $t_2,$at
1279 $ADDU $c_1,$t_2
1280 sltu $at,$c_1,$t_2
1281 $ADDU $c_2,$at
1282 mflo ($t_1,$a_5,$b_0)
1283 mfhi ($t_2,$a_5,$b_0)
1284 $ADDU $c_3,$t_1
1285 sltu $at,$c_3,$t_1
1286 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
1287 $ADDU $t_2,$at
1288 $ADDU $c_1,$t_2
1289 sltu $at,$c_1,$t_2
1290 $ADDU $c_2,$at
1291 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1292
1293 mflo ($t_1,$a_6,$b_0)
1294 mfhi ($t_2,$a_6,$b_0)
1295 $ADDU $c_1,$t_1
1296 sltu $at,$c_1,$t_1
1297 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
1298 $ADDU $t_2,$at
1299 $ADDU $c_2,$t_2
1300 sltu $c_3,$c_2,$t_2
1301 mflo ($t_1,$a_5,$b_1)
1302 mfhi ($t_2,$a_5,$b_1)
1303 $ADDU $c_1,$t_1
1304 sltu $at,$c_1,$t_1
1305 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
1306 $ADDU $t_2,$at
1307 $ADDU $c_2,$t_2
1308 sltu $at,$c_2,$t_2
1309 $ADDU $c_3,$at
1310 mflo ($t_1,$a_4,$b_2)
1311 mfhi ($t_2,$a_4,$b_2)
1312 $ADDU $c_1,$t_1
1313 sltu $at,$c_1,$t_1
1314 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1315 $ADDU $t_2,$at
1316 $ADDU $c_2,$t_2
1317 sltu $at,$c_2,$t_2
1318 $ADDU $c_3,$at
1319 mflo ($t_1,$a_3,$b_3)
1320 mfhi ($t_2,$a_3,$b_3)
1321 $ADDU $c_1,$t_1
1322 sltu $at,$c_1,$t_1
1323 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
1324 $ADDU $t_2,$at
1325 $ADDU $c_2,$t_2
1326 sltu $at,$c_2,$t_2
1327 $ADDU $c_3,$at
1328 mflo ($t_1,$a_2,$b_4)
1329 mfhi ($t_2,$a_2,$b_4)
1330 $ADDU $c_1,$t_1
1331 sltu $at,$c_1,$t_1
1332 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
1333 $ADDU $t_2,$at
1334 $ADDU $c_2,$t_2
1335 sltu $at,$c_2,$t_2
1336 $ADDU $c_3,$at
1337 mflo ($t_1,$a_1,$b_5)
1338 mfhi ($t_2,$a_1,$b_5)
1339 $ADDU $c_1,$t_1
1340 sltu $at,$c_1,$t_1
1341 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
1342 $ADDU $t_2,$at
1343 $ADDU $c_2,$t_2
1344 sltu $at,$c_2,$t_2
1345 $ADDU $c_3,$at
1346 mflo ($t_1,$a_0,$b_6)
1347 mfhi ($t_2,$a_0,$b_6)
1348 $ADDU $c_1,$t_1
1349 sltu $at,$c_1,$t_1
1350 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
1351 $ADDU $t_2,$at
1352 $ADDU $c_2,$t_2
1353 sltu $at,$c_2,$t_2
1354 $ADDU $c_3,$at
1355 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1356
1357 mflo ($t_1,$a_0,$b_7)
1358 mfhi ($t_2,$a_0,$b_7)
1359 $ADDU $c_2,$t_1
1360 sltu $at,$c_2,$t_1
1361 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
1362 $ADDU $t_2,$at
1363 $ADDU $c_3,$t_2
1364 sltu $c_1,$c_3,$t_2
1365 mflo ($t_1,$a_1,$b_6)
1366 mfhi ($t_2,$a_1,$b_6)
1367 $ADDU $c_2,$t_1
1368 sltu $at,$c_2,$t_1
1369 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
1370 $ADDU $t_2,$at
1371 $ADDU $c_3,$t_2
1372 sltu $at,$c_3,$t_2
1373 $ADDU $c_1,$at
1374 mflo ($t_1,$a_2,$b_5)
1375 mfhi ($t_2,$a_2,$b_5)
1376 $ADDU $c_2,$t_1
1377 sltu $at,$c_2,$t_1
1378 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
1379 $ADDU $t_2,$at
1380 $ADDU $c_3,$t_2
1381 sltu $at,$c_3,$t_2
1382 $ADDU $c_1,$at
1383 mflo ($t_1,$a_3,$b_4)
1384 mfhi ($t_2,$a_3,$b_4)
1385 $ADDU $c_2,$t_1
1386 sltu $at,$c_2,$t_1
1387 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
1388 $ADDU $t_2,$at
1389 $ADDU $c_3,$t_2
1390 sltu $at,$c_3,$t_2
1391 $ADDU $c_1,$at
1392 mflo ($t_1,$a_4,$b_3)
1393 mfhi ($t_2,$a_4,$b_3)
1394 $ADDU $c_2,$t_1
1395 sltu $at,$c_2,$t_1
1396 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
1397 $ADDU $t_2,$at
1398 $ADDU $c_3,$t_2
1399 sltu $at,$c_3,$t_2
1400 $ADDU $c_1,$at
1401 mflo ($t_1,$a_5,$b_2)
1402 mfhi ($t_2,$a_5,$b_2)
1403 $ADDU $c_2,$t_1
1404 sltu $at,$c_2,$t_1
1405 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
1406 $ADDU $t_2,$at
1407 $ADDU $c_3,$t_2
1408 sltu $at,$c_3,$t_2
1409 $ADDU $c_1,$at
1410 mflo ($t_1,$a_6,$b_1)
1411 mfhi ($t_2,$a_6,$b_1)
1412 $ADDU $c_2,$t_1
1413 sltu $at,$c_2,$t_1
1414 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
1415 $ADDU $t_2,$at
1416 $ADDU $c_3,$t_2
1417 sltu $at,$c_3,$t_2
1418 $ADDU $c_1,$at
1419 mflo ($t_1,$a_7,$b_0)
1420 mfhi ($t_2,$a_7,$b_0)
1421 $ADDU $c_2,$t_1
1422 sltu $at,$c_2,$t_1
1423 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
1424 $ADDU $t_2,$at
1425 $ADDU $c_3,$t_2
1426 sltu $at,$c_3,$t_2
1427 $ADDU $c_1,$at
1428 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1429
1430 mflo ($t_1,$a_7,$b_1)
1431 mfhi ($t_2,$a_7,$b_1)
1432 $ADDU $c_3,$t_1
1433 sltu $at,$c_3,$t_1
1434 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
1435 $ADDU $t_2,$at
1436 $ADDU $c_1,$t_2
1437 sltu $c_2,$c_1,$t_2
1438 mflo ($t_1,$a_6,$b_2)
1439 mfhi ($t_2,$a_6,$b_2)
1440 $ADDU $c_3,$t_1
1441 sltu $at,$c_3,$t_1
1442 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
1443 $ADDU $t_2,$at
1444 $ADDU $c_1,$t_2
1445 sltu $at,$c_1,$t_2
1446 $ADDU $c_2,$at
1447 mflo ($t_1,$a_5,$b_3)
1448 mfhi ($t_2,$a_5,$b_3)
1449 $ADDU $c_3,$t_1
1450 sltu $at,$c_3,$t_1
1451 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
1452 $ADDU $t_2,$at
1453 $ADDU $c_1,$t_2
1454 sltu $at,$c_1,$t_2
1455 $ADDU $c_2,$at
1456 mflo ($t_1,$a_4,$b_4)
1457 mfhi ($t_2,$a_4,$b_4)
1458 $ADDU $c_3,$t_1
1459 sltu $at,$c_3,$t_1
1460 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
1461 $ADDU $t_2,$at
1462 $ADDU $c_1,$t_2
1463 sltu $at,$c_1,$t_2
1464 $ADDU $c_2,$at
1465 mflo ($t_1,$a_3,$b_5)
1466 mfhi ($t_2,$a_3,$b_5)
1467 $ADDU $c_3,$t_1
1468 sltu $at,$c_3,$t_1
1469 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
1470 $ADDU $t_2,$at
1471 $ADDU $c_1,$t_2
1472 sltu $at,$c_1,$t_2
1473 $ADDU $c_2,$at
1474 mflo ($t_1,$a_2,$b_6)
1475 mfhi ($t_2,$a_2,$b_6)
1476 $ADDU $c_3,$t_1
1477 sltu $at,$c_3,$t_1
1478 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
1479 $ADDU $t_2,$at
1480 $ADDU $c_1,$t_2
1481 sltu $at,$c_1,$t_2
1482 $ADDU $c_2,$at
1483 mflo ($t_1,$a_1,$b_7)
1484 mfhi ($t_2,$a_1,$b_7)
1485 $ADDU $c_3,$t_1
1486 sltu $at,$c_3,$t_1
1487 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
1488 $ADDU $t_2,$at
1489 $ADDU $c_1,$t_2
1490 sltu $at,$c_1,$t_2
1491 $ADDU $c_2,$at
1492 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1493
1494 mflo ($t_1,$a_2,$b_7)
1495 mfhi ($t_2,$a_2,$b_7)
1496 $ADDU $c_1,$t_1
1497 sltu $at,$c_1,$t_1
1498 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
1499 $ADDU $t_2,$at
1500 $ADDU $c_2,$t_2
1501 sltu $c_3,$c_2,$t_2
1502 mflo ($t_1,$a_3,$b_6)
1503 mfhi ($t_2,$a_3,$b_6)
1504 $ADDU $c_1,$t_1
1505 sltu $at,$c_1,$t_1
1506 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
1507 $ADDU $t_2,$at
1508 $ADDU $c_2,$t_2
1509 sltu $at,$c_2,$t_2
1510 $ADDU $c_3,$at
1511 mflo ($t_1,$a_4,$b_5)
1512 mfhi ($t_2,$a_4,$b_5)
1513 $ADDU $c_1,$t_1
1514 sltu $at,$c_1,$t_1
1515 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
1516 $ADDU $t_2,$at
1517 $ADDU $c_2,$t_2
1518 sltu $at,$c_2,$t_2
1519 $ADDU $c_3,$at
1520 mflo ($t_1,$a_5,$b_4)
1521 mfhi ($t_2,$a_5,$b_4)
1522 $ADDU $c_1,$t_1
1523 sltu $at,$c_1,$t_1
1524 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
1525 $ADDU $t_2,$at
1526 $ADDU $c_2,$t_2
1527 sltu $at,$c_2,$t_2
1528 $ADDU $c_3,$at
1529 mflo ($t_1,$a_6,$b_3)
1530 mfhi ($t_2,$a_6,$b_3)
1531 $ADDU $c_1,$t_1
1532 sltu $at,$c_1,$t_1
1533 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
1534 $ADDU $t_2,$at
1535 $ADDU $c_2,$t_2
1536 sltu $at,$c_2,$t_2
1537 $ADDU $c_3,$at
1538 mflo ($t_1,$a_7,$b_2)
1539 mfhi ($t_2,$a_7,$b_2)
1540 $ADDU $c_1,$t_1
1541 sltu $at,$c_1,$t_1
1542 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
1543 $ADDU $t_2,$at
1544 $ADDU $c_2,$t_2
1545 sltu $at,$c_2,$t_2
1546 $ADDU $c_3,$at
1547 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1548
1549 mflo ($t_1,$a_7,$b_3)
1550 mfhi ($t_2,$a_7,$b_3)
1551 $ADDU $c_2,$t_1
1552 sltu $at,$c_2,$t_1
1553 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
1554 $ADDU $t_2,$at
1555 $ADDU $c_3,$t_2
1556 sltu $c_1,$c_3,$t_2
1557 mflo ($t_1,$a_6,$b_4)
1558 mfhi ($t_2,$a_6,$b_4)
1559 $ADDU $c_2,$t_1
1560 sltu $at,$c_2,$t_1
1561 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
1562 $ADDU $t_2,$at
1563 $ADDU $c_3,$t_2
1564 sltu $at,$c_3,$t_2
1565 $ADDU $c_1,$at
1566 mflo ($t_1,$a_5,$b_5)
1567 mfhi ($t_2,$a_5,$b_5)
1568 $ADDU $c_2,$t_1
1569 sltu $at,$c_2,$t_1
1570 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
1571 $ADDU $t_2,$at
1572 $ADDU $c_3,$t_2
1573 sltu $at,$c_3,$t_2
1574 $ADDU $c_1,$at
1575 mflo ($t_1,$a_4,$b_6)
1576 mfhi ($t_2,$a_4,$b_6)
1577 $ADDU $c_2,$t_1
1578 sltu $at,$c_2,$t_1
1579 $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
1580 $ADDU $t_2,$at
1581 $ADDU $c_3,$t_2
1582 sltu $at,$c_3,$t_2
1583 $ADDU $c_1,$at
1584 mflo ($t_1,$a_3,$b_7)
1585 mfhi ($t_2,$a_3,$b_7)
1586 $ADDU $c_2,$t_1
1587 sltu $at,$c_2,$t_1
1588 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
1589 $ADDU $t_2,$at
1590 $ADDU $c_3,$t_2
1591 sltu $at,$c_3,$t_2
1592 $ADDU $c_1,$at
1593 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1594
1595 mflo ($t_1,$a_4,$b_7)
1596 mfhi ($t_2,$a_4,$b_7)
1597 $ADDU $c_3,$t_1
1598 sltu $at,$c_3,$t_1
1599 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
1600 $ADDU $t_2,$at
1601 $ADDU $c_1,$t_2
1602 sltu $c_2,$c_1,$t_2
1603 mflo ($t_1,$a_5,$b_6)
1604 mfhi ($t_2,$a_5,$b_6)
1605 $ADDU $c_3,$t_1
1606 sltu $at,$c_3,$t_1
1607 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
1608 $ADDU $t_2,$at
1609 $ADDU $c_1,$t_2
1610 sltu $at,$c_1,$t_2
1611 $ADDU $c_2,$at
1612 mflo ($t_1,$a_6,$b_5)
1613 mfhi ($t_2,$a_6,$b_5)
1614 $ADDU $c_3,$t_1
1615 sltu $at,$c_3,$t_1
1616 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
1617 $ADDU $t_2,$at
1618 $ADDU $c_1,$t_2
1619 sltu $at,$c_1,$t_2
1620 $ADDU $c_2,$at
1621 mflo ($t_1,$a_7,$b_4)
1622 mfhi ($t_2,$a_7,$b_4)
1623 $ADDU $c_3,$t_1
1624 sltu $at,$c_3,$t_1
1625 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
1626 $ADDU $t_2,$at
1627 $ADDU $c_1,$t_2
1628 sltu $at,$c_1,$t_2
1629 $ADDU $c_2,$at
1630 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1631
1632 mflo ($t_1,$a_7,$b_5)
1633 mfhi ($t_2,$a_7,$b_5)
1634 $ADDU $c_1,$t_1
1635 sltu $at,$c_1,$t_1
1636 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
1637 $ADDU $t_2,$at
1638 $ADDU $c_2,$t_2
1639 sltu $c_3,$c_2,$t_2
1640 mflo ($t_1,$a_6,$b_6)
1641 mfhi ($t_2,$a_6,$b_6)
1642 $ADDU $c_1,$t_1
1643 sltu $at,$c_1,$t_1
1644 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
1645 $ADDU $t_2,$at
1646 $ADDU $c_2,$t_2
1647 sltu $at,$c_2,$t_2
1648 $ADDU $c_3,$at
1649 mflo ($t_1,$a_5,$b_7)
1650 mfhi ($t_2,$a_5,$b_7)
1651 $ADDU $c_1,$t_1
1652 sltu $at,$c_1,$t_1
1653 $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
1654 $ADDU $t_2,$at
1655 $ADDU $c_2,$t_2
1656 sltu $at,$c_2,$t_2
1657 $ADDU $c_3,$at
1658 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1659
1660 mflo ($t_1,$a_6,$b_7)
1661 mfhi ($t_2,$a_6,$b_7)
1662 $ADDU $c_2,$t_1
1663 sltu $at,$c_2,$t_1
1664 $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
1665 $ADDU $t_2,$at
1666 $ADDU $c_3,$t_2
1667 sltu $c_1,$c_3,$t_2
1668 mflo ($t_1,$a_7,$b_6)
1669 mfhi ($t_2,$a_7,$b_6)
1670 $ADDU $c_2,$t_1
1671 sltu $at,$c_2,$t_1
1672 $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
1673 $ADDU $t_2,$at
1674 $ADDU $c_3,$t_2
1675 sltu $at,$c_3,$t_2
1676 $ADDU $c_1,$at
1677 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1678
1679 mflo ($t_1,$a_7,$b_7)
1680 mfhi ($t_2,$a_7,$b_7)
1681 $ADDU $c_3,$t_1
1682 sltu $at,$c_3,$t_1
1683 $ADDU $t_2,$at
1684 $ADDU $c_1,$t_2
1685 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1686 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1687
1688 .set noreorder
1689___
# bn_mul_comba8 epilogue: restore callee-saved registers and return.
# NUBI flavours additionally saved $t0-$t3 and $gp, hence the larger frame
# (12 register slots vs. 6).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
# Non-NUBI epilogue: only $s0-$s5 were spilled in the prologue.
# Note the stack adjustment sits in the jr delay slot (.set noreorder).
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# void bn_mul_comba4(BN_ULONG *r /*$a0*/, BN_ULONG *a /*$a1*/,
#                    BN_ULONG *b /*$a2*/);
# Fully unrolled 4x4->8 word Comba multiplication.
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body: one column of the product per paragraph.  Accumulator triplet
# rotates through c1/c2/c3; each $MULTU is issued as early as possible so
# the multiplier latency is hidden behind the carry-propagation adds, and
# 'sltu' recovers the carry-out of each 32/64-bit addition.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	($c_1,$a_0,$b_0)
	mfhi	($c_2,$a_0,$b_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$b_1)
	mfhi	($t_2,$a_0,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	($t_1,$a_1,$b_0)
	mfhi	($t_2,$a_1,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	($t_1,$a_2,$b_0)
	mfhi	($t_2,$a_2,$b_0)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	($t_1,$a_1,$b_1)
	mfhi	($t_2,$a_1,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_0,$b_2)
	mfhi	($t_2,$a_0,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	($t_1,$a_0,$b_3)
	mfhi	($t_2,$a_0,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_1,$b_2)
	mfhi	($t_2,$a_1,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_2,$b_1)
	mfhi	($t_2,$a_2,$b_1)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_3,$b_0)
	mfhi	($t_2,$a_3,$b_0)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_1)
	mfhi	($t_2,$a_3,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_2,$b_2)
	mfhi	($t_2,$a_2,$b_2)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_1,$b_3)
	mfhi	($t_2,$a_1,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	($t_1,$a_2,$b_3)
	mfhi	($t_2,$a_2,$b_3)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_3,$b_2)
	mfhi	($t_2,$a_3,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_3)
	mfhi	($t_2,$a_3,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# bn_mul_comba4 epilogue (NUBI only restores the extra temporaries).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___
1901
# The squaring routines take a single input vector, so the registers that
# held b[0..3] during multiplication are re-purposed to hold a[4..7].
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1903
# add_c2 - emit one step of the squaring carry chain (mul_add_c2): the
# previously fetched double-width product ($hi:$lo) is added TWICE into
# the rotating accumulator triplet ($c0,$c1,$c2), while the *next*
# multiplication ($an x $bn) is dispatched early so the multiplier
# latency overlaps the carry arithmetic; its result is then fetched into
# ($hi,$lo) at the end, ready for the following call.
#
# Appends to the global $code buffer; returns nothing meaningful.
#
# Fix: dropped the misleading empty prototype "()" — this sub is always
# called with eight arguments (call sites use &add_c2(...), which bypassed
# the prototype anyway, masking the mismatch).
sub add_c2 {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,	# !$warm denotes first call with specific sequence of
		# $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn	# these two are arguments for multiplication which
		# result is used in *next* step [which is why it's
		# commented as "forward multiplication" below];
   )=@_;
$code.=<<___;
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	$MULTU	($an,$bn)		# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
$code.=<<___ if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
___
$code.=<<___ if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
___
$code.=<<___;
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
	mflo	($lo,$an,$bn)
	mfhi	($hi,$an,$bn)
___
}
1938
# void bn_sqr_comba8(BN_ULONG *r /*$a0*/, BN_ULONG *a /*$a1*/);
# Fully unrolled 8-word Comba squaring.  Off-diagonal products a[i]*a[j]
# (i!=j) occur twice in the square, hence the doubled accumulation done
# either inline (shift-left-by-1 with sign-bit recovery) or via add_c2();
# diagonal products a[i]*a[i] are added once.
$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# First two columns are emitted inline: the doubling of a[0]*a[1] is done
# with 'slt' (capture the bit shifted out) + shift-left-by-1.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_0,$a_5)		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_5)
	mfhi	($t_2,$a_0,$a_5)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_0,$a_7)		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_7)
	mfhi	($t_2,$a_0,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_2,$a_7)		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_7)
	mfhi	($t_2,$a_2,$a_7)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_4,$a_7)		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)
	mflo	($t_1,$a_4,$a_7)
	mfhi	($t_2,$a_4,$a_7)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_6,$a_7)		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)
	mflo	($t_1,$a_6,$a_7)
	mfhi	($t_2,$a_6,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
# Final two result words need no further forward multiplication.
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
# bn_sqr_comba8 epilogue; then the bn_sqr_comba4 entry point follows.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
# void bn_sqr_comba4(BN_ULONG *r /*$a0*/, BN_ULONG *a /*$a1*/);
# Fully unrolled 4-word Comba squaring; same doubling scheme as
# bn_sqr_comba8 above, just fewer columns.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$a_3)		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_3)
	mfhi	($t_2,$a_2,$a_3)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
# Emit the assembled text to stdout; fail loudly if the final flush fails
# (important when output is piped to the assembler).
print $code;
close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette