VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.3/crypto/bn/asm/alpha-mont.pl@ 96662

Last change on this file since 96662 was 94082, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

File size: 5.8 KB
Line 
#! /usr/bin/env perl
# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.

# This script prints Alpha assembly for bn_mul_mont. The last command-line
# argument, if present, names the output file; without one the text goes to
# the inherited STDOUT. Failing to open the output is fatal, in line with
# the checked close at the end of the script.
if ($output=pop) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Perl names for the argument registers of
# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,	(pointer on entry, replaced by *n0)
$num="a5";	# int num);		(sign-extended from 32 bits on entry)

# Scratch registers used by the generated code.
$lo0="t0";	# low word of the running ap[]*bp[i] sum
$hi0="t1";	# high word / carry of the running ap[]*bp[i] sum
$lo1="t2";	# low word of the running np[]*m1 sum
$hi1="t3";	# high word / carry of the running np[]*m1 sum
$aj="t4";	# address of ap[j], then the loaded ap[j]
$bi="t5";	# address of bp[i], then the loaded bp[i]
$nj="t6";	# address of np[j], then the loaded np[j]
$tp="t7";	# walking pointer into the temporary vector on the stack
$alo="t8";	# mulq  result of aj*bi
$ahi="t9";	# umulh result of aj*bi
$nlo="t10";	# mulq  result of nj*m1
$nhi="t11";	# umulh result of nj*m1
$tj="t12";	# word loaded from the temporary vector; also reused as loop flag
$i="s3";	# outer loop counter (s3 is saved/restored by the generated code)
$j="s4";	# inner loop counter (s4 is saved/restored by the generated code)
$m1="s5";	# low 64 bits of lo0*n0 (s5 is saved/restored by the generated code)

# NOTE: the here-document below is interpolated by Perl and the result is
# run through the C preprocessor (it contains #ifdef/#include), so any
# full-line comment added inside it must use /* */ rather than a leading #.
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	/* 48-byte frame: ra, s3, s4, s5 and fp are saved, fp keeps entry sp */
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	/* num below 4 is not handled here: leave with v0 = 0 */
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	/* reserve num*8+16 bytes below the frame and round sp down to a
	   4096-byte boundary; sp is the base of the temporary vector */
	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
/* first pass (bp[0]): the products for word j are issued while the sums
   for the previous word are folded and stored one slot behind tp */
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	/* drain the products still in flight, then store the two top words
	   and the carry out of their sum */
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
/* remaining passes, i = 1 .. num-1: same scheme as the first pass, with the
   previously stored temporary words added into the ap[]*bp[i] sum */
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)		#L0
	nop				#U1
	ldq	$aj,0($aj)		#L1
	s8addq	$j,$np,$nj		#U0

	ldq	$nj,0($nj)		#L0
	nop				#U1
	addq	$alo,$hi0,$lo0		#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo		#U1
	cmpult	$lo0,$hi0,AT		#L0
	addq	$nlo,$hi1,$lo1		#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo		#U1
	addq	$ahi,AT,$hi0		#L0
	addq	$lo0,$tj,$lo0		#L1
	cmpult	$lo1,$hi1,v0		#U0

	umulh	$aj,$bi,$ahi		#U1
	cmpult	$lo0,$tj,AT		#L0
	addq	$lo1,$lo0,$lo1		#L1
	addq	$nhi,v0,$hi1		#U0

	umulh	$nj,$m1,$nhi		#U1
	s8addq	$j,$ap,$aj		#L0
	cmpult	$lo1,$lo0,v0		#L1
	cmplt	$j,$num,$tj		#U0	# borrow $tj

	addq	$hi0,AT,$hi0		#L0
	addq	$hi1,v0,$hi1		#U1
	stq	$lo1,-8($tp)		#L1
	bne	$tj,.Linner		#U0
	.set	reorder

	/* drain the products still in flight; j is reused as a scratch
	   register here because the inner loop has finished */
	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj		# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter


	s8addq	$num,sp,$tj		# &tp[num]
	mov	$rp,$bp			# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0			# clear borrow bit

/* rp[] = tp[] - np[] over num words, borrow carried between words */
.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1		# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0		# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp			# restore rp

/* per word: when the selector computed above is zero the rp word written by
   .Lsub is kept, otherwise the tp word replaces it; tp is zeroed as it is
   read; v0 = 1 once the copy is done */
.align	4
.Lcopy:	ldq	$aj,0($tp)		# conditional copy
	ldq	$nj,0($rp)
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	cmoveq	$hi0,$nj,$aj
	stq	zero,-8($tp)		# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

/* epilogue: sp is recovered from fp, saved registers are reloaded; the
   reload of ra is left commented out */
.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;
close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette