1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 |
|
---|
17 | # This module doesn't present direct interest for OpenSSL, because it
|
---|
18 | # doesn't provide better performance for longer keys, at least not on
|
---|
19 | # in-order-execution cores. While 512-bit RSA sign operations can be
|
---|
20 | # 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
|
---|
21 | # 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
|
---|
22 | # 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
|
---|
23 | # verify:-( All comparisons are against bn_mul_mont-free assembler.
|
---|
24 | # The module might be of interest to embedded system developers, as
|
---|
25 | # the code is smaller than 1KB, yet offers >3x improvement on MIPS64
|
---|
26 | # and 75-30% [less for longer keys] on MIPS32 over compiler-generated
|
---|
27 | # code.
|
---|
28 |
|
---|
29 | ######################################################################
|
---|
# There are a number of MIPS ABIs in use, O32 and N32/64 being the most
31 | # widely used. Then there is a new contender: NUBI. It appears that if
|
---|
32 | # one picks the latter, it's possible to arrange code in ABI neutral
|
---|
33 | # manner. Therefore let's stick to NUBI register layout:
|
---|
34 | #
|
---|
# NUBI register layout: every symbolic register name is bound to the
# assembler spelling "$<number>" of the hardware register it denotes.
($zero,$at,$t0,$t1,$t2) = map { sprintf('$%d', $_) } (0..2, 24, 25);
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = map { sprintf('$%d', $_) } (4..11);
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11) =
	map { sprintf('$%d', $_) } (12..23);
($gp,$tp,$sp,$fp,$ra) = map { sprintf('$%d', $_) } (3, 28..31);
39 | #
|
---|
40 | # The return value is placed in $a0. Following coding rules facilitate
|
---|
41 | # interoperability:
|
---|
42 | #
|
---|
43 | # - never ever touch $tp, "thread pointer", former $gp;
|
---|
44 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
|
---|
45 | # old code];
|
---|
46 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
|
---|
47 | #
|
---|
48 | # For reference here is register layout for N32/64 MIPS ABIs:
|
---|
49 | #
|
---|
50 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
|
---|
51 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
|
---|
52 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
|
---|
53 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
|
---|
54 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
|
---|
55 | #
|
---|
# Target ABI flavour: first command-line argument, "o32" when absent.
$flavour = shift || "o32";	# supported flavours are o32,n32,64,nubi32,nubi64

# Pointer arithmetic, register save/restore instructions and register
# size in bytes.  The 64-bit forms are used whenever the flavour name
# contains "64" or "n32" (daddu/dsubu incidentally work even on n32).
($PTR_ADD,$PTR_SUB,$REG_S,$REG_L,$SZREG) =
	($flavour =~ /64|n32/i) ? ("daddu","dsubu","sd","ld",8)
				: ("addu", "subu", "sw","lw",4);

# Bit mask of the registers saved in the prologue, as used by the .mask
# directive; NUBI flavours use the wider mask.
$SAVED_REGS_MASK = 0x00ff0000;
$SAVED_REGS_MASK = 0x00fff000 if ($flavour =~ /nubi/i);
72 | #
|
---|
# <appro@openssl.org>
74 | #
|
---|
75 | ######################################################################
|
---|
76 |
|
---|
# Pick the output file name: consume arguments until one looks like
# "name.ext" or the argument list is exhausted (in which case $output
# ends up false).
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found, and fail
# loudly if it cannot be opened.  The three-argument form keeps the file
# name from being interpreted as part of the open mode.
if ($output) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}
79 |
|
---|
# Instructions operating on BN_ULONG words and the word size in bytes.
# Start from the 32-bit set and switch to the 64-bit set when the
# flavour name contains "64" or "n32".
($LD,$ST,$MULTU,$ADDU,$SUBU,$BNSZ) = ("lw","sw","multu","addu","subu",4);
($LD,$ST,$MULTU,$ADDU,$SUBU,$BNSZ) = ("ld","sd","dmultu","daddu","dsubu",8)
	if ($flavour =~ /64|n32/i);
95 |
|
---|
# Arguments of the generated routine, bound in declaration order to the
# argument registers:
#
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
($rp,$ap,$bp,$np,$n0,$num) = ($a0,$a1,$a2,$a3,$a4,$a5);

# Working registers used by the assembly template.  Note that $tp is
# (re)bound to $s3 here, so from this point on it names the pointer used
# to walk the temporary vector in the template.
($lo0,$hi0,$lo1,$hi1) = ($a6,$a7,$t1,$t2);
($aj,$bi,$nj,$tp,$alo,$ahi,$nlo,$nhi,$tj,$i,$j,$m1) =
	($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11);

# Stack frame size, counted in $SZREG-sized slots.
$FRAMESIZE=14;
122 |
|
---|
# Assembly template.  $code accumulates the text of two routines:
# bn_mul_mont (argument check / dispatch stub) and bn_mul_mont_internal
# (the multiplication itself).  Perl variables are interpolated into the
# text; `...` expressions are left in place here and evaluated later.

# File header and entry label of the public routine.
$code=<<___;
#include "mips_arch.h"

.text

.set noat
.set noreorder

.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
___

# o32 only: fetch n0 and num from the stack.
if ($flavour =~ /o32/i) {
	$code.=<<___;
lw $n0,16($sp)
lw $num,20($sp)
___
}

# Dispatch stub: branch to bn_mul_mont_internal when 4 <= num < 17,
# otherwise return 0 (in both $t0 and $a0).
$code.=<<___;
slt $at,$num,4
bnez $at,1f
li $t0,0
slt $at,$num,17 # on in-order CPU
bnez $at,bn_mul_mont_internal
nop
1: jr $ra
li $a0,0
.end bn_mul_mont

___

# Prologue of the internal routine: allocate the frame and save $fp and
# $s11..$s4.
$code.=<<___;
.align 5
.ent bn_mul_mont_internal
bn_mul_mont_internal:
.frame $fp,$FRAMESIZE*$SZREG,$ra
.mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
___

# NUBI flavours additionally save $s3..$s0.
if ($flavour =~ /nubi/i) {
	$code.=<<___;
$REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
___
}

# Set up $fp, load n0 and the first words of bp/ap/np, then reserve the
# temporary vector on the stack: num words (num is scaled to a byte count
# here) plus two extra words, with $sp rounded down to a multiple of 4096.
$code.=<<___;
move $fp,$sp

.set reorder
$LD $n0,0($n0)
$LD $bi,0($bp) # bp[0]
$LD $aj,0($ap) # ap[0]
$LD $nj,0($np) # np[0]

$PTR_SUB $sp,2*$BNSZ # place for two extra words
sll $num,`log($BNSZ)/log(2)`
li $at,-4096
$PTR_SUB $sp,$num
and $sp,$at

___

# First pass, using bp[0]: products of the two lowest words of ap and np,
# and the multiplier m1 derived from the lowest product word and n0.
$code.=<<___;
$MULTU ($aj,$bi)
$LD $ahi,$BNSZ($ap)
$LD $nhi,$BNSZ($np)
mflo ($lo0,$aj,$bi)
mfhi ($hi0,$aj,$bi)
$MULTU ($lo0,$n0)
mflo ($m1,$lo0,$n0)

$MULTU ($ahi,$bi)
mflo ($alo,$ahi,$bi)
mfhi ($ahi,$ahi,$bi)

$MULTU ($nj,$m1)
mflo ($lo1,$nj,$m1)
mfhi ($hi1,$nj,$m1)
$MULTU ($nhi,$m1)
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo ($nlo,$nhi,$m1)
mfhi ($nhi,$nhi,$m1)

___

# .L1st: first-pass loop over the remaining words; j is a byte offset
# starting at 2*BNSZ and the loop runs while j < num (in bytes).
$code.=<<___;
move $tp,$sp
li $j,2*$BNSZ
.align 4
.L1st:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)

$MULTU ($aj,$bi)
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo ($alo,$aj,$bi)
mfhi ($ahi,$aj,$bi)

$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$MULTU ($nj,$m1)
$ADDU $hi1,$at
addu $j,$BNSZ
$ST $lo1,($tp)
sltu $t0,$j,$num
mflo ($nlo,$nj,$m1)
mfhi ($nhi,$nj,$m1)

bnez $t0,.L1st
$PTR_ADD $tp,$BNSZ
.set reorder

___

# Tail of the first pass: fold in the last pending products and store
# the final words together with the carry word.
$code.=<<___;
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at

$ADDU $lo1,$nlo,$hi1
sltu $t0,$lo1,$hi1
$ADDU $hi1,$nhi,$t0
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at

$ST $lo1,($tp)

$ADDU $hi1,$hi0
sltu $at,$hi1,$hi0
$ST $hi1,$BNSZ($tp)
$ST $at,2*$BNSZ($tp)

___

# .Louter: one iteration per remaining word of bp (i is a byte offset
# starting at BNSZ); same set-up as the first pass, but the lowest word
# of the temporary vector is added in before m1 is computed.
$code.=<<___;
li $i,$BNSZ
.align 4
.Louter:
$PTR_ADD $bi,$bp,$i
$LD $bi,($bi)
$LD $aj,($ap)
$LD $ahi,$BNSZ($ap)
$LD $tj,($sp)

$MULTU ($aj,$bi)
$LD $nj,($np)
$LD $nhi,$BNSZ($np)
mflo ($lo0,$aj,$bi)
mfhi ($hi0,$aj,$bi)
$ADDU $lo0,$tj
$MULTU ($lo0,$n0)
sltu $at,$lo0,$tj
$ADDU $hi0,$at
mflo ($m1,$lo0,$n0)

$MULTU ($ahi,$bi)
mflo ($alo,$ahi,$bi)
mfhi ($ahi,$ahi,$bi)

$MULTU ($nj,$m1)
mflo ($lo1,$nj,$m1)
mfhi ($hi1,$nj,$m1)

$MULTU ($nhi,$m1)
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo ($nlo,$nhi,$m1)
mfhi ($nhi,$nhi,$m1)

___

# .Linner: inner loop over j, accumulating into the temporary vector
# while j < num (in bytes).
$code.=<<___;
move $tp,$sp
li $j,2*$BNSZ
$LD $tj,$BNSZ($tp)
.align 4
.Linner:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)

$MULTU ($aj,$bi)
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo ($alo,$aj,$bi)
mfhi ($ahi,$aj,$bi)

$ADDU $lo0,$tj
addu $j,$BNSZ
$MULTU ($nj,$m1)
sltu $at,$lo0,$tj
$ADDU $lo1,$lo0
$ADDU $hi0,$at
sltu $t0,$lo1,$lo0
$LD $tj,2*$BNSZ($tp)
$ADDU $hi1,$t0
sltu $at,$j,$num
mflo ($nlo,$nj,$m1)
mfhi ($nhi,$nj,$m1)
$ST $lo1,($tp)
bnez $at,.Linner
$PTR_ADD $tp,$BNSZ
.set reorder

___

# Tail of an outer iteration: fold in the last pending products, store
# the top words and carry, then advance i and loop while i < num.
$code.=<<___;
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo0,$tj
sltu $t0,$lo0,$tj
$ADDU $hi0,$t0

$LD $tj,2*$BNSZ($tp)
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo1,$hi1
$ADDU $hi1,$nhi,$at
$ADDU $lo1,$lo0
sltu $t0,$lo1,$lo0
$ADDU $hi1,$t0
$ST $lo1,($tp)

$ADDU $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
$ADDU $lo1,$tj
sltu $at,$lo1,$tj
$ADDU $hi1,$at
$ST $lo1,$BNSZ($tp)
$ST $hi1,2*$BNSZ($tp)

addu $i,$BNSZ
sltu $t0,$i,$num
bnez $t0,.Louter


___

# .Lsub: subtract np[] from the temporary vector word by word, writing
# the difference to rp[] and keeping the borrow in $hi0.  Afterwards
# $hi0 (top word minus borrow) and its complement $hi1 serve as the
# selection masks for the copy loop below.
$code.=<<___;
.set noreorder
$PTR_ADD $tj,$sp,$num # &tp[num]
move $tp,$sp
move $ap,$sp
li $hi0,0 # clear borrow bit

.align 4
.Lsub: $LD $lo0,($tp)
$LD $lo1,($np)
$PTR_ADD $tp,$BNSZ
$PTR_ADD $np,$BNSZ
$SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu $at,$lo1,$lo0
$SUBU $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
$ST $lo0,($rp)
or $hi0,$at
sltu $at,$tp,$tj
bnez $at,.Lsub
$PTR_ADD $rp,$BNSZ

$SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,$sp
$PTR_SUB $rp,$num # restore rp
not $hi1,$hi0

___

# .Lcopy: for every word, combine the temporary-vector word (masked with
# $hi0) and the rp[] word (masked with $hi1) into rp[], zeroing the
# temporary vector on the way.  The return value 1 is placed in both
# $a0 and $t0.
$code.=<<___;
.Lcopy: $LD $nj,($tp) # conditional move
$LD $aj,($rp)
$ST $zero,($tp)
$PTR_ADD $tp,$BNSZ
and $nj,$hi0
and $aj,$hi1
or $aj,$nj
sltu $at,$tp,$tj
$ST $aj,($rp)
bnez $at,.Lcopy
$PTR_ADD $rp,$BNSZ

li $a0,1
li $t0,1

___

# Epilogue: restore $sp from $fp and reload the saved registers.
$code.=<<___;
.set noreorder
move $sp,$fp
$REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
___

# NUBI flavours additionally reload $s3..$s0.
if ($flavour =~ /nubi/i) {
	$code.=<<___;
$REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
___
}

# Return (the frame is released in the branch delay slot) and emit the
# identification string.
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end bn_mul_mont_internal
.rdata
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
430 |
|
---|
# Replace every `...` expression embedded in the template with the result
# of evaluating it as Perl, then write the finished text to STDOUT.
$code =~ s{`([^`]*)`}{eval $1}gem;

print STDOUT $code;
close(STDOUT) || die "error closing STDOUT: $!";