VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1f/crypto/bn/asm/mips-mont.pl@ 83531

Last change on this file since 83531 was 83531, checked in by vboxsync, 5 years ago

setting svn:sync-process=export for openssl-1.1.1f, all files except tests

File size: 9.3 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# This module doesn't present direct interest for OpenSSL, because it
18# doesn't provide better performance for longer keys, at least not on
19# in-order-execution cores. While 512-bit RSA sign operations can be
20# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
21# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
22# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
23# verify:-( All comparisons are against bn_mul_mont-free assembler.
24# The module might be of interest to embedded system developers, as
25# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
26# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
27# code.
28
29######################################################################
30# There is a number of MIPS ABI in use, O32 and N32/64 are most
31# widely used. Then there is a new contender: NUBI. It appears that if
32# one picks the latter, it's possible to arrange code in ABI neutral
33# manner. Therefore let's stick to NUBI register layout:
34#
35($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); # each variable holds a register name as text, e.g. $t1 is the string '$24'
36($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); # argument registers $4..$11
37($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); # $12..$23, the range covered by the NUBI saved-register mask below
38($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); # $3 and $28..$31; note that $tp is reassigned to a scratch register further down
39#
40# The return value is placed in $a0. Following coding rules facilitate
41# interoperability:
42#
43# - never ever touch $tp, "thread pointer", former $gp;
44# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
45# old code];
46# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
47#
48# For reference here is register layout for N32/64 MIPS ABIs:
49#
50# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
51# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
52# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
53# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
54# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
55#
56$flavour = shift || "o32"; # first command-line argument selects the ABI (default o32); supported flavours are o32,n32,64,nubi32,nubi64
57
58if ($flavour =~ /64|n32/i) { # 64-bit registers: pointer arithmetic and register save/restore use doubleword instructions
59 $PTR_ADD="daddu"; # incidentally works even on n32
60 $PTR_SUB="dsubu"; # incidentally works even on n32
61 $REG_S="sd"; # store used to save a register in the frame
62 $REG_L="ld"; # load used to restore it
63 $SZREG=8; # bytes per saved register slot
64} else { # every other flavour: 32-bit instructions and 4-byte slots
65 $PTR_ADD="addu";
66 $PTR_SUB="subu";
67 $REG_S="sw";
68 $REG_L="lw";
69 $SZREG=4;
70}
71$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; # bits 12-23 for NUBI, bits 16-23 otherwise; emitted in the .mask directive
72#
73# <appro@openssl.org>
74#
75######################################################################
76
77while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} # consume arguments until one looks like a file name ("name.ext") or none are left
78$output and (open(STDOUT,">",$output) or die "can't open $output: $!"); # redirect only when a file name was found; abort instead of ignoring a failed open
79
80if ($flavour =~ /64|n32/i) { # same test as for the pointer ops: vector elements are handled with doubleword instructions
81 $LD="ld"; # load one vector element
82 $ST="sd"; # store one vector element
83 $MULTU="dmultu"; # unsigned multiply of two elements
84 $ADDU="daddu";
85 $SUBU="dsubu";
86 $BNSZ=8; # element size in bytes, used as the stride for ap/bp/np/tp
87} else { # 32-bit flavours: word instructions and 4-byte elements
88 $LD="lw";
89 $ST="sw";
90 $MULTU="multu";
91 $ADDU="addu";
92 $SUBU="subu";
93 $BNSZ=4;
94}
95
96# int bn_mul_mont( (C prototype; each argument below is bound to one argument register)
97$rp=$a0; # BN_ULONG *rp,
98$ap=$a1; # const BN_ULONG *ap,
99$bp=$a2; # const BN_ULONG *bp,
100$np=$a3; # const BN_ULONG *np,
101$n0=$a4; # const BN_ULONG *n0,
102$num=$a5; # int num);
103
104$lo0=$a6; # low word of the running ap[j]*bp[i] sum
105$hi0=$a7; # high word / carry of that sum
106$lo1=$t1; # low word of the running np[j]*m1 sum
107$hi1=$t2; # high word / carry of that sum
108$aj=$s0; # ap[j]: first its address, then the loaded word
109$bi=$s1; # bp[i]: first its address, then the loaded word
110$nj=$s2; # np[j]: first its address, then the loaded word
111$tp=$s3; # pointer into the temporary vector on the stack; reuses the Perl name that the register layout bound to $28
112$alo=$s4; # low word of the product ap[j]*bp[i]
113$ahi=$s5; # ap[1] when loaded, then the high word of that product
114$nlo=$s6; # low word of the product np[j]*m1
115$nhi=$s7; # np[1] when loaded, then the high word of that product
116$tj=$s8; # word loaded from the temporary vector; later reused as &tp[num]
117$i=$s9; # outer-loop byte offset into bp
118$j=$s10; # inner-loop byte offset into ap and np
119$m1=$s11; # low word of lo0*n0, the multiplier applied to np in each pass
120
121$FRAMESIZE=14; # frame size in register slots; the save code uses slots FRAMESIZE-1 down to FRAMESIZE-13
122
123$code=<<___; # start of the generated assembly: file header and the public entry point
124#include "mips_arch.h"
125
126.text
127
128.set noat # at is used as an explicit scratch register below
129.set noreorder # branch delay slots are filled by hand
130
131.align 5
132.globl bn_mul_mont
133.ent bn_mul_mont
134bn_mul_mont: # public entry: range-check num, then continue in bn_mul_mont_internal
135___
136$code.=<<___ if ($flavour =~ /o32/i); # O32 only: the 5th and 6th arguments are fetched from the stack
137 lw $n0,16($sp) # n0 = 5th argument
138 lw $num,20($sp) # num = 6th argument
139___
140$code.=<<___;
141 slt $at,$num,4 # at = 1 if num < 4
142 bnez $at,1f # too short: return 0
143 li $t0,0 # (delay slot) return value 0 in t0
144 slt $at,$num,17 # at = 1 if num < 17 (on in-order CPU)
145 bnez $at,bn_mul_mont_internal # 4 <= num <= 16: do the multiplication
146 nop # (delay slot)
1471: jr $ra # out of range: return 0, presumably so the caller uses another routine (TODO confirm against callers)
148 li $a0,0 # (delay slot) return value 0 in a0
149.end bn_mul_mont
150
151.align 5
152.ent bn_mul_mont_internal
153bn_mul_mont_internal: # worker: entered by a branch from bn_mul_mont, so its return goes to the original caller
154 .frame $fp,$FRAMESIZE*$SZREG,$ra # frame of FRAMESIZE register slots, addressed through fp
155 .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG # bit 30 (fp) plus the flavour-dependent s-register bits
156 $PTR_SUB $sp,$FRAMESIZE*$SZREG # allocate the register save area
157 $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) # save fp in the top slot
158 $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) # save s11 down to s4 in the slots below it
159 $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
160 $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
161 $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
162 $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
163 $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
164 $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
165 $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
166___
167$code.=<<___ if ($flavour =~ /nubi/i); # NUBI only: s3 down to s0 are saved as well, matching the wider saved-register mask
168 $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
169 $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
170 $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
171 $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
172___
173$code.=<<___;
174 move $fp,$sp # fp = base of the save area; sp is restored from it on exit
175
176 .set reorder # let the assembler fill delay slots in this straight-line section
177 $LD $n0,0($n0) # n0 = word that the n0 argument points to
178 $LD $bi,0($bp) # bp[0]
179 $LD $aj,0($ap) # ap[0]
180 $LD $nj,0($np) # np[0]
181
182 $PTR_SUB $sp,2*$BNSZ # place for two extra words
183 sll $num,`log($BNSZ)/log(2)` # num = num * BNSZ, the vector length in bytes
184 li $at,-4096 # mask with the low 12 bits clear
185 $PTR_SUB $sp,$num # reserve num bytes for the temporary vector tp
186 and $sp,$at # round sp down to a 4096-byte boundary
187
188 $MULTU ($aj,$bi) # start ap[0]*bp[0]
189 $LD $ahi,$BNSZ($ap) # ap[1]
190 $LD $nhi,$BNSZ($np) # np[1]
191 mflo ($lo0,$aj,$bi) # lo0 = low word of ap[0]*bp[0]
192 mfhi ($hi0,$aj,$bi) # hi0 = high word of ap[0]*bp[0]
193 $MULTU ($lo0,$n0) # start lo0*n0
194 mflo ($m1,$lo0,$n0) # m1 = low word of lo0*n0
195
196 $MULTU ($ahi,$bi) # start ap[1]*bp[0]
197 mflo ($alo,$ahi,$bi)
198 mfhi ($ahi,$ahi,$bi) # ahi:alo = ap[1]*bp[0]
199
200 $MULTU ($nj,$m1) # start np[0]*m1
201 mflo ($lo1,$nj,$m1)
202 mfhi ($hi1,$nj,$m1) # hi1:lo1 = np[0]*m1
203 $MULTU ($nhi,$m1) # start np[1]*m1
204 $ADDU $lo1,$lo0 # lo1 += lo0; the sum itself is never stored, only its carry is kept
205 sltu $at,$lo1,$lo0 # at = carry out of that addition
206 $ADDU $hi1,$at # fold the carry into hi1
207 mflo ($nlo,$nhi,$m1)
208 mfhi ($nhi,$nhi,$m1) # nhi:nlo = np[1]*m1
209
210 move $tp,$sp # tp = start of the temporary vector
211 li $j,2*$BNSZ # j = byte offset of element 2
212.align 4
213.L1st: # first pass: tp = (ap*bp[0] + np*m1) with the lowest word dropped
214 .set noreorder # the loop fills its own delay slot
215 $PTR_ADD $aj,$ap,$j # address of ap[j] (j is a byte offset)
216 $PTR_ADD $nj,$np,$j # address of np[j]
217 $LD $aj,($aj) # ap[j]
218 $LD $nj,($nj) # np[j]
219
220 $MULTU ($aj,$bi) # start ap[j]*bp[0]
221 $ADDU $lo0,$alo,$hi0 # lo0 = low word of previous ap product + running high word
222 $ADDU $lo1,$nlo,$hi1 # lo1 = low word of previous np product + running high word
223 sltu $at,$lo0,$hi0 # carry from the lo0 addition
224 sltu $t0,$lo1,$hi1 # carry from the lo1 addition
225 $ADDU $hi0,$ahi,$at # hi0 = high word of previous ap product + carry
226 $ADDU $hi1,$nhi,$t0 # hi1 = high word of previous np product + carry
227 mflo ($alo,$aj,$bi)
228 mfhi ($ahi,$aj,$bi) # ahi:alo = ap[j]*bp[0]
229
230 $ADDU $lo1,$lo0 # combine the two partial sums
231 sltu $at,$lo1,$lo0 # carry from combining them
232 $MULTU ($nj,$m1) # start np[j]*m1
233 $ADDU $hi1,$at
234 addu $j,$BNSZ # advance to the next element
235 $ST $lo1,($tp) # store the result word at tp
236 sltu $t0,$j,$num # t0 = 1 while j < num (both in bytes)
237 mflo ($nlo,$nj,$m1)
238 mfhi ($nhi,$nj,$m1) # nhi:nlo = np[j]*m1
239
240 bnez $t0,.L1st
241 $PTR_ADD $tp,$BNSZ # (delay slot) advance tp
242 .set reorder
243
244 $ADDU $lo0,$alo,$hi0 # tail: accumulate the last products, no new multiply
245 sltu $at,$lo0,$hi0
246 $ADDU $hi0,$ahi,$at
247
248 $ADDU $lo1,$nlo,$hi1
249 sltu $t0,$lo1,$hi1
250 $ADDU $hi1,$nhi,$t0
251 $ADDU $lo1,$lo0
252 sltu $at,$lo1,$lo0
253 $ADDU $hi1,$at
254
255 $ST $lo1,($tp) # store the last regular result word
256
257 $ADDU $hi1,$hi0 # top word = sum of the two remaining high words
258 sltu $at,$hi1,$hi0 # carry out of the top word
259 $ST $hi1,$BNSZ($tp) # store the top word
260 $ST $at,2*$BNSZ($tp) # store its carry in the word above
261
262 li $i,$BNSZ # i = byte offset of bp[1]
263.align 4
264.Louter: # one pass per remaining element of bp
265 $PTR_ADD $bi,$bp,$i # address of bp[i] (i is a byte offset)
266 $LD $bi,($bi) # bp[i]
267 $LD $aj,($ap) # ap[0]
268 $LD $ahi,$BNSZ($ap) # ap[1]
269 $LD $tj,($sp) # tp[0], the temporary vector starts at sp
270
271 $MULTU ($aj,$bi) # start ap[0]*bp[i]
272 $LD $nj,($np) # np[0]
273 $LD $nhi,$BNSZ($np) # np[1]
274 mflo ($lo0,$aj,$bi)
275 mfhi ($hi0,$aj,$bi) # hi0:lo0 = ap[0]*bp[i]
276 $ADDU $lo0,$tj # lo0 += tp[0]
277 $MULTU ($lo0,$n0) # start lo0*n0
278 sltu $at,$lo0,$tj # carry from adding tp[0]
279 $ADDU $hi0,$at
280 mflo ($m1,$lo0,$n0) # m1 = low word of lo0*n0
281
282 $MULTU ($ahi,$bi) # start ap[1]*bp[i]
283 mflo ($alo,$ahi,$bi)
284 mfhi ($ahi,$ahi,$bi) # ahi:alo = ap[1]*bp[i]
285
286 $MULTU ($nj,$m1) # start np[0]*m1
287 mflo ($lo1,$nj,$m1)
288 mfhi ($hi1,$nj,$m1) # hi1:lo1 = np[0]*m1
289
290 $MULTU ($nhi,$m1) # start np[1]*m1
291 $ADDU $lo1,$lo0 # lo1 += lo0; the sum itself is never stored, only its carry is kept
292 sltu $at,$lo1,$lo0 # carry out of that addition
293 $ADDU $hi1,$at
294 mflo ($nlo,$nhi,$m1)
295 mfhi ($nhi,$nhi,$m1) # nhi:nlo = np[1]*m1
296
297 move $tp,$sp # tp = start of the temporary vector
298 li $j,2*$BNSZ # j = byte offset of element 2
299 $LD $tj,$BNSZ($tp) # tj = tp[1], the first word added in by the inner loop
300.align 4
301.Linner: # like the first-pass loop, but also adds the previous contents of tp
302 .set noreorder # the loop fills its own delay slot
303 $PTR_ADD $aj,$ap,$j # address of ap[j] (j is a byte offset)
304 $PTR_ADD $nj,$np,$j # address of np[j]
305 $LD $aj,($aj) # ap[j]
306 $LD $nj,($nj) # np[j]
307
308 $MULTU ($aj,$bi) # start ap[j]*bp[i]
309 $ADDU $lo0,$alo,$hi0 # lo0 = low word of previous ap product + running high word
310 $ADDU $lo1,$nlo,$hi1 # lo1 = low word of previous np product + running high word
311 sltu $at,$lo0,$hi0 # carry from the lo0 addition
312 sltu $t0,$lo1,$hi1 # carry from the lo1 addition
313 $ADDU $hi0,$ahi,$at # hi0 = high word of previous ap product + carry
314 $ADDU $hi1,$nhi,$t0 # hi1 = high word of previous np product + carry
315 mflo ($alo,$aj,$bi)
316 mfhi ($ahi,$aj,$bi) # ahi:alo = ap[j]*bp[i]
317
318 $ADDU $lo0,$tj # add the word taken from tp
319 addu $j,$BNSZ # advance to the next element
320 $MULTU ($nj,$m1) # start np[j]*m1
321 sltu $at,$lo0,$tj # carry from adding the tp word
322 $ADDU $lo1,$lo0 # combine the two partial sums
323 $ADDU $hi0,$at
324 sltu $t0,$lo1,$lo0 # carry from combining them
325 $LD $tj,2*$BNSZ($tp) # preload the tp word used by the next iteration
326 $ADDU $hi1,$t0
327 sltu $at,$j,$num # at = 1 while j < num (both in bytes)
328 mflo ($nlo,$nj,$m1)
329 mfhi ($nhi,$nj,$m1) # nhi:nlo = np[j]*m1
330 $ST $lo1,($tp) # store the result one word below the tp word that was added in
331 bnez $at,.Linner
332 $PTR_ADD $tp,$BNSZ # (delay slot) advance tp
333 .set reorder
334
335 $ADDU $lo0,$alo,$hi0 # tail of the pass: accumulate the last products, no new multiply
336 sltu $at,$lo0,$hi0
337 $ADDU $hi0,$ahi,$at
338 $ADDU $lo0,$tj # add the tp word preloaded by the last loop iteration
339 sltu $t0,$lo0,$tj
340 $ADDU $hi0,$t0
341
342 $LD $tj,2*$BNSZ($tp) # tj = word two above tp, where the previous pass stored its carry
343 $ADDU $lo1,$nlo,$hi1
344 sltu $at,$lo1,$hi1
345 $ADDU $hi1,$nhi,$at
346 $ADDU $lo1,$lo0
347 sltu $t0,$lo1,$lo0
348 $ADDU $hi1,$t0
349 $ST $lo1,($tp) # store the last regular result word
350
351 $ADDU $lo1,$hi1,$hi0 # top word = sum of the two remaining high words
352 sltu $hi1,$lo1,$hi0 # hi1 = carry of that sum
353 $ADDU $lo1,$tj # plus the carry word from the previous pass
354 sltu $at,$lo1,$tj
355 $ADDU $hi1,$at # hi1 = accumulated carry out of the top word
356 $ST $lo1,$BNSZ($tp) # store the top word
357 $ST $hi1,2*$BNSZ($tp) # store its carry in the word above
358
359 addu $i,$BNSZ # advance to the next element of bp
360 sltu $t0,$i,$num # t0 = 1 while i < num (both in bytes)
361 bnez $t0,.Louter # delay slot is filled by the assembler (reorder mode)
362
363
364 .set noreorder # delay slots are filled by hand from here on
365 $PTR_ADD $tj,$sp,$num # &tp[num]
366 move $tp,$sp # tp = start of the temporary vector
367 move $ap,$sp # ap = sp as well; ap is not read again in this routine
368 li $hi0,0 # clear borrow bit
369
370.align 4
371.Lsub: $LD $lo0,($tp) # rp = tp - np, element by element with borrow in hi0
372 $LD $lo1,($np) # np[i]
373 $PTR_ADD $tp,$BNSZ
374 $PTR_ADD $np,$BNSZ
375 $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
376 sgtu $at,$lo1,$lo0 # at = 1 if that subtraction wrapped (borrow)
377 $SUBU $lo0,$lo1,$hi0 # subtract the incoming borrow
378 sgtu $hi0,$lo0,$lo1 # hi0 = 1 if this second subtraction wrapped
379 $ST $lo0,($rp) # store the difference word in rp
380 or $hi0,$at # outgoing borrow = either of the two borrows
381 sltu $at,$tp,$tj # at = 1 while tp is below &tp[num]
382 bnez $at,.Lsub
383 $PTR_ADD $rp,$BNSZ # (delay slot) advance rp
384
385 $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit: hi0 = top carry (hi1) minus the final borrow, used as select mask for tp
386 move $tp,$sp # tp = start of the temporary vector
387 $PTR_SUB $rp,$num # restore rp
388 not $hi1,$hi0 # hi1 = complement of hi0, the select mask for rp
389
390.Lcopy: $LD $nj,($tp) # conditional move: word of tp
391 $LD $aj,($rp) # word of rp, the difference written by the subtraction loop
392 $ST $zero,($tp) # overwrite the tp word with zero
393 $PTR_ADD $tp,$BNSZ
394 and $nj,$hi0 # keep the tp word where hi0 has one bits
395 and $aj,$hi1 # keep the rp word where hi0 has zero bits
396 or $aj,$nj # merge the two masked words
397 sltu $at,$tp,$tj # at = 1 while tp is below &tp[num]
398 $ST $aj,($rp) # write the selected word to rp
399 bnez $at,.Lcopy
400 $PTR_ADD $rp,$BNSZ # (delay slot) advance rp
401
402 li $a0,1 # return value 1 in a0
403 li $t0,1 # same value in t0 (register 2, the former v0 per the layout notes)
404
405 .set noreorder
406 move $sp,$fp # drop the temporary vector: sp back to the register save area
407 $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) # restore fp from the top slot
408 $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) # restore s11 down to s4
409 $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
410 $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
411 $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
412 $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
413 $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
414 $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
415 $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
416___
417$code.=<<___ if ($flavour =~ /nubi/i); # NUBI only: s3 down to s0 were saved too, so restore them
418 $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
419 $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
420 $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
421 $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
422___
423$code.=<<___;
424 jr $ra # return to the caller of bn_mul_mont
425 $PTR_ADD $sp,$FRAMESIZE*$SZREG # (delay slot) release the register save area
426.end bn_mul_mont_internal
427.rdata
428.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
429___
430
431$code =~ s/\`([^\`]*)\`/eval $1/gem; # replace every backtick-quoted span in the template with the result of evaluating it as Perl (e.g. the shift count for num)
432
433print $code; # write the finished assembly to STDOUT
434close STDOUT or die "error closing STDOUT: $!"; # catch write errors that only surface when the handle is flushed
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette