#! /usr/bin/env perl
# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on processor in question, meaning
# that dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue multiplication every cycle, which
# explains moderate improvement coefficients in comparison to
# compiler-generated code. Recall that compiler is instructed to use
# umulh and therefore uses same amount of multiplication instructions
# to do the job. Assembly's edge is to minimize number of "collateral"
# instructions and of course instruction scheduling.
#
# April 2015
#
# Squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 improvement
# is still modest on longest key lengths, while others exhibit e.g.
# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
# on Cortex-A57 and ~60-100% faster on others.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm translator: first next to this script, then in the
# shared ../../perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe all generated code through the translator.  On failure report the
# OS error $! — not the numbered capture $1, which at this point still
# holds the directory matched from $0 above and says nothing about the
# pipe failure.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Scratch integer registers for the scalar Montgomery loop.  Note the
# range skips x18 (presumably reserved as the platform register — the
# map goes 6..17 then 19..24).
($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);

# Generic scalar Montgomery multiplication, plus the dispatch logic:
# num%4!=0 always takes the scalar loop (.Lmul_mont); num>32 may branch
# to bn_mul8x_mont_neon when the OPENSSL_armv8_rsa_neonized flag is set;
# otherwise num%8==0 goes to __bn_sqr8x_mont and num%4==0 to
# __bn_mul4x_mont.  The scalar path allocates num*8 bytes of tp[] on the
# stack, runs the textbook outer (.Louter) / inner (.L1st/.Linner)
# word-by-word loops, then does the conditional final subtraction and
# wipes tp[].
$code.=<<___;
#ifndef	__KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armv8_rsa_neonized
.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
.Lbn_mul_mont:
	tst	$num,#3
	b.ne	.Lmul_mont
	cmp	$num,#32
	b.le	.Lscalar_impl
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	$lo0 being non-zero. So that carry can be calculated
	//	by adding -1 to $lo0. That's what next instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	stur	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	stur	xzr,[$tp,#-16]		// wipe tp
	stur	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	stur	xzr,[$tp,#-8]		// wipe tp
	stur	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
# NEON code path.  The operands are processed as 32-bit words ($num is
# doubled below and b[]/n0 are read with 4-byte strides), accumulating
# into eight 2x64-bit NEON accumulators @ACC[0..7] that are rotated
# with push(@ACC,shift(@ACC)) as code is generated.
my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
my ($Z,$Temp)=("v4.16b","v5");
my @ACC=map("v$_",(6..13));
my ($Bi,$Ni,$M0)=map("v$_",(28..30));
my $sBi="s28";		# scalar view of $Bi for 32-bit loads
my $sM0="s30";		# scalar view of $M0 for 32-bit loads
my $zero="v14";
my $temp="v15";
my $ACCTemp="v16";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));

$code.=<<___;
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	$num,$num,#1
	eor	$zero.16b,$zero.16b,$zero.16b

.align	4
.LNEON_8n:
	eor	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b
	sub	$toutptr,sp,#128
	eor	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b
	sub	$toutptr,$toutptr,$num,lsl#4
	eor	@ACC[2].16b,@ACC[2].16b,@ACC[2].16b
	and	$toutptr,$toutptr,#-64
	eor	@ACC[3].16b,@ACC[3].16b,@ACC[3].16b
	mov	sp,$toutptr		// alloca
	eor	@ACC[4].16b,@ACC[4].16b,@ACC[4].16b
	add	$toutptr,$toutptr,#256
	eor	@ACC[5].16b,@ACC[5].16b,@ACC[5].16b
	sub	$inner,$num,#8
	eor	@ACC[6].16b,@ACC[6].16b,@ACC[6].16b
	eor	@ACC[7].16b,@ACC[7].16b,@ACC[7].16b

.LNEON_8n_init:
	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	subs	$inner,$inner,#8
	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1	{@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
	bne	.LNEON_8n_init

	add	$tinptr,sp,#256
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	add	$bnptr,sp,#8
	ldr	$sM0,[$n0],#4
	mov	$outer,$num
	b	.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	ldr	$sBi,[$bptr],#4		// *b++
	uxtl	$Bi.4s,$Bi.4h
	add	$toutptr,sp,#128
	ld1	{$N0.4s,$N1.4s},[$nptr],#32

	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	shl	$Ni.2d,@ACC[0].2d,#16
	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	add	$Ni.2d,$Ni.2d,@ACC[0].2d
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	mul	$Ni.2s,$Ni.2s,$M0.2s
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	st1	{$Bi.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl	$Ni.4s,$Ni.4h
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
# Generate the remaining 7 b-words of the 8-word outer chunk; each pass
# emits a reduction step with the previous $Ni followed by the multiply
# step with the freshly loaded $Bi, rotating @ACC by one.
for ($i=0; $i<7;) {
$code.=<<___;
	ldr	$sBi,[$bptr],#4		// *b++
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	uxtl	$Bi.4s,$Bi.4h
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	ushr	$temp.2d,@ACC[0].2d,#16
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr	@ACC[0].2d,@ACC[0].2d,#16
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	add	$ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
	ins	@ACC[1].d[0],$ACCTemp.d[0]
	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr],#16
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	shl	$Ni.2d,@ACC[0].2d,#16
	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	add	$Ni.2d,$Ni.2d,@ACC[0].2d
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	mul	$Ni.2s,$Ni.2s,$M0.2s
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	st1	{$Bi.2s},[$bnptr],#8	// put aside smashed b[8*i+$i]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl	$Ni.4s,$Ni.4h
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	mov	$Temp.16b,@ACC[0].16b
	ushr	$Temp.2d,$Temp.2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	add	@ACC[0].2d,@ACC[0].2d,$Temp.2d
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr	@ACC[0].2d,@ACC[0].2d,#16
	eor	$temp.16b,$temp.16b,$temp.16b
	ins	@ACC[0].d[1],$temp.d[0]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	add	@ACC[1].2d,@ACC[1].2d,@ACC[0].2d
	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
	add	$bnptr,sp,#8		// rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub	$inner,$num,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	$inner,$inner,#8
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+0]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	ld1	{$N0.4s,$N1.4s},[$nptr],#32
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	b.eq	.LInner_jump
	add	$tinptr,$tinptr,#16	// don't advance in last iteration
.LInner_jump:
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
# Inner loop body for the remaining 7 smashed b/m pairs of the chunk;
# the per-$i labels (.LInner_jump$i / .LInner_after_rewind$i) keep the
# generated label names unique across unrolled iterations.
for ($i=1; $i<8; $i++) {
$code.=<<___;
	ld1	{$Bi.2s},[$bnptr],#8	// pull smashed b[8*i+$i]
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	st1	{@ACC[0].2d},[$toutptr],#16
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+$i]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	b.eq	.LInner_jump$i
	add	$tinptr,$tinptr,#16	// don't advance in last iteration
.LInner_jump$i:
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	b.ne	.LInner_after_rewind$i
	sub	$aptr,$aptr,$num,lsl#2	// rewind
.LInner_after_rewind$i:
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	add	$bnptr,sp,#8		// rewind
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	st1	{@ACC[0].2d},[$toutptr],#16
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]

	bne	.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add	$tinptr,sp,#128
	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	eor	$N0.16b,$N0.16b,$N0.16b	// $N0
	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	eor	$N1.16b,$N1.16b,$N1.16b	// $N1
	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1	{@ACC[6].2d},[$toutptr]

	subs	$outer,$outer,#8
	ld1	{@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
	ld1	{@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
	ld1	{@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
	ld1	{@ACC[6].2d,@ACC[7].2d},[$tinptr],#32

	b.eq	.LInner_8n_jump_2steps
	sub	$nptr,$nptr,$num,lsl#2	// rewind
	b	.LNEON_8n_outer

.LInner_8n_jump_2steps:
	add	$toutptr,sp,#128
	st1	{$N0.2d,$N1.2d}, [sp],#32	// start wiping stack frame
	mov	$Temp.16b,@ACC[0].16b
	ushr	$temp.2d,@ACC[0].2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	st1	{$N0.2d,$N1.2d}, [sp],#32
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	st1	{$N0.2d,$N1.2d}, [sp],#32
	ushr	$temp.2d,@ACC[0].2d,#16
	st1	{$N0.2d,$N1.2d}, [sp],#32
	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins	$temp.d[1],$zero.d[0]

	mov	$inner,$num
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	mov	$Temp.16b,@ACC[0].16b
	ushr	$temp.2d,@ACC[0].2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	ld1	{@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	ld1	{@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
	ushr	$temp.2d,@ACC[0].2d,#16
	ld1	{@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins	$temp.d[1],$zero.d[0]

.LNEON_tail_entry:
___
# Carry propagation through the seven remaining accumulators, emitting
# one 32-bit result word per rotation.
for ($i=1; $i<8; $i++) {
$code.=<<___;
	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
	st1	{@ACC[0].s}[0], [$toutptr],#4
	ushr	$temp.2d,@ACC[1].2d,#16
	mov	$Temp.16b,@ACC[1].16b
	ext	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
	ushr	$temp.2d,@ACC[1].2d,#16
	zip1	@ACC[1].4h,$Temp.4h,@ACC[1].4h
	ins	$temp.d[1],$zero.d[0]
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	ld1	{@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
	subs	$inner,$inner,#8
	st1	{@ACC[7].s}[0], [$toutptr],#4
	bne	.LNEON_tail

	st1	{$temp.s}[0], [$toutptr],#4	// top-most bit
	sub	$nptr,$nptr,$num,lsl#2		// rewind $nptr
	subs	$aptr,sp,#0			// clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	ldp	w8,w9,[$nptr],#8
	ldp	w10,w11,[$nptr],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,$bptr,$aptr
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	cbnz	x17,.LNEON_sub

	ldr	w10, [$aptr]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,$bptr,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	$aptr,sp
	sub	$rptr,$rptr,x11		// rewind $rptr
	mov	$nptr,$bptr		// second 3/4th of frame
	sbcs	w10,w10,wzr		// result is carry flag

.LNEON_copy_n_zap:
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	ldp	w8,w9,[$rptr],#8
	ldp	w10,w11,[$rptr]
	sub	$rptr,$rptr,#8
	b.cs	.LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
.LCopy_1:
	st1	{v0.2d,v1.2d}, [$nptr],#32	// wipe
	st1	{v0.2d,v1.2d}, [$nptr],#32	// wipe
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	sub	$aptr,$aptr,#32
	ldp	w8,w9,[$rptr],#8
	ldp	w10,w11,[$rptr]
	sub	$rptr,$rptr,#8
	b.cs	.LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
.LCopy_2:
	st1	{v0.2d,v1.2d}, [$aptr],#32	// wipe
	st1	{v0.2d,v1.2d}, [$nptr],#32	// wipe
	sub	x17,$bptr,$aptr			// preserves carry
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	cbnz	x17,.LNEON_copy_n_zap

	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	ret					// bx lr

.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
___
}
657 | {
|
---|
658 | ########################################################################
|
---|
659 | # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
|
---|
660 |
|
---|
661 | my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
|
---|
662 | my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
|
---|
663 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
|
---|
664 | my ($cnt,$carry,$topmost)=("x27","x28","x30");
|
---|
665 | my ($tp,$ap_end,$na0)=($bp,$np,$carry);
|
---|
666 |
|
---|
667 | $code.=<<___;
|
---|
668 | .type __bn_sqr8x_mont,%function
|
---|
669 | .align 5
|
---|
670 | __bn_sqr8x_mont:
|
---|
671 | cmp $ap,$bp
|
---|
672 | b.ne __bn_mul4x_mont
|
---|
673 | .Lsqr8x_mont:
|
---|
674 | .inst 0xd503233f // paciasp
|
---|
675 | stp x29,x30,[sp,#-128]!
|
---|
676 | add x29,sp,#0
|
---|
677 | stp x19,x20,[sp,#16]
|
---|
678 | stp x21,x22,[sp,#32]
|
---|
679 | stp x23,x24,[sp,#48]
|
---|
680 | stp x25,x26,[sp,#64]
|
---|
681 | stp x27,x28,[sp,#80]
|
---|
682 | stp $rp,$np,[sp,#96] // offload rp and np
|
---|
683 |
|
---|
684 | ldp $a0,$a1,[$ap,#8*0]
|
---|
685 | ldp $a2,$a3,[$ap,#8*2]
|
---|
686 | ldp $a4,$a5,[$ap,#8*4]
|
---|
687 | ldp $a6,$a7,[$ap,#8*6]
|
---|
688 |
|
---|
689 | sub $tp,sp,$num,lsl#4
|
---|
690 | lsl $num,$num,#3
|
---|
691 | ldr $n0,[$n0] // *n0
|
---|
692 | mov sp,$tp // alloca
|
---|
693 | sub $cnt,$num,#8*8
|
---|
694 | b .Lsqr8x_zero_start
|
---|
695 |
|
---|
696 | .Lsqr8x_zero:
|
---|
697 | sub $cnt,$cnt,#8*8
|
---|
698 | stp xzr,xzr,[$tp,#8*0]
|
---|
699 | stp xzr,xzr,[$tp,#8*2]
|
---|
700 | stp xzr,xzr,[$tp,#8*4]
|
---|
701 | stp xzr,xzr,[$tp,#8*6]
|
---|
702 | .Lsqr8x_zero_start:
|
---|
703 | stp xzr,xzr,[$tp,#8*8]
|
---|
704 | stp xzr,xzr,[$tp,#8*10]
|
---|
705 | stp xzr,xzr,[$tp,#8*12]
|
---|
706 | stp xzr,xzr,[$tp,#8*14]
|
---|
707 | add $tp,$tp,#8*16
|
---|
708 | cbnz $cnt,.Lsqr8x_zero
|
---|
709 |
|
---|
710 | add $ap_end,$ap,$num
|
---|
711 | add $ap,$ap,#8*8
|
---|
712 | mov $acc0,xzr
|
---|
713 | mov $acc1,xzr
|
---|
714 | mov $acc2,xzr
|
---|
715 | mov $acc3,xzr
|
---|
716 | mov $acc4,xzr
|
---|
717 | mov $acc5,xzr
|
---|
718 | mov $acc6,xzr
|
---|
719 | mov $acc7,xzr
|
---|
720 | mov $tp,sp
|
---|
721 | str $n0,[x29,#112] // offload n0
|
---|
722 |
|
---|
723 | // Multiply everything but a[i]*a[i]
|
---|
724 | .align 4
|
---|
725 | .Lsqr8x_outer_loop:
|
---|
726 | // a[1]a[0] (i)
|
---|
727 | // a[2]a[0]
|
---|
728 | // a[3]a[0]
|
---|
729 | // a[4]a[0]
|
---|
730 | // a[5]a[0]
|
---|
731 | // a[6]a[0]
|
---|
732 | // a[7]a[0]
|
---|
733 | // a[2]a[1] (ii)
|
---|
734 | // a[3]a[1]
|
---|
735 | // a[4]a[1]
|
---|
736 | // a[5]a[1]
|
---|
737 | // a[6]a[1]
|
---|
738 | // a[7]a[1]
|
---|
739 | // a[3]a[2] (iii)
|
---|
740 | // a[4]a[2]
|
---|
741 | // a[5]a[2]
|
---|
742 | // a[6]a[2]
|
---|
743 | // a[7]a[2]
|
---|
744 | // a[4]a[3] (iv)
|
---|
745 | // a[5]a[3]
|
---|
746 | // a[6]a[3]
|
---|
747 | // a[7]a[3]
|
---|
748 | // a[5]a[4] (v)
|
---|
749 | // a[6]a[4]
|
---|
750 | // a[7]a[4]
|
---|
751 | // a[6]a[5] (vi)
|
---|
752 | // a[7]a[5]
|
---|
753 | // a[7]a[6] (vii)
|
---|
754 |
|
---|
755 | mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
|
---|
756 | mul $t1,$a2,$a0
|
---|
757 | mul $t2,$a3,$a0
|
---|
758 | mul $t3,$a4,$a0
|
---|
759 | adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
|
---|
760 | mul $t0,$a5,$a0
|
---|
761 | adcs $acc2,$acc2,$t1
|
---|
762 | mul $t1,$a6,$a0
|
---|
763 | adcs $acc3,$acc3,$t2
|
---|
764 | mul $t2,$a7,$a0
|
---|
765 | adcs $acc4,$acc4,$t3
|
---|
766 | umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
|
---|
767 | adcs $acc5,$acc5,$t0
|
---|
768 | umulh $t0,$a2,$a0
|
---|
769 | adcs $acc6,$acc6,$t1
|
---|
770 | umulh $t1,$a3,$a0
|
---|
771 | adcs $acc7,$acc7,$t2
|
---|
772 | umulh $t2,$a4,$a0
|
---|
773 | stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
|
---|
774 | adc $acc0,xzr,xzr // t[8]
|
---|
775 | adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
|
---|
776 | umulh $t3,$a5,$a0
|
---|
777 | adcs $acc3,$acc3,$t0
|
---|
778 | umulh $t0,$a6,$a0
|
---|
779 | adcs $acc4,$acc4,$t1
|
---|
780 | umulh $t1,$a7,$a0
|
---|
781 | adcs $acc5,$acc5,$t2
|
---|
782 | mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
|
---|
783 | adcs $acc6,$acc6,$t3
|
---|
784 | mul $t3,$a3,$a1
|
---|
785 | adcs $acc7,$acc7,$t0
|
---|
786 | mul $t0,$a4,$a1
|
---|
787 | adc $acc0,$acc0,$t1
|
---|
788 |
|
---|
789 | mul $t1,$a5,$a1
|
---|
790 | adds $acc3,$acc3,$t2
|
---|
791 | mul $t2,$a6,$a1
|
---|
792 | adcs $acc4,$acc4,$t3
|
---|
793 | mul $t3,$a7,$a1
|
---|
794 | adcs $acc5,$acc5,$t0
|
---|
795 | umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
|
---|
796 | adcs $acc6,$acc6,$t1
|
---|
797 | umulh $t1,$a3,$a1
|
---|
798 | adcs $acc7,$acc7,$t2
|
---|
799 | umulh $t2,$a4,$a1
|
---|
800 | adcs $acc0,$acc0,$t3
|
---|
801 | umulh $t3,$a5,$a1
|
---|
802 | stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
|
---|
803 | adc $acc1,xzr,xzr // t[9]
|
---|
804 | adds $acc4,$acc4,$t0
|
---|
805 | umulh $t0,$a6,$a1
|
---|
806 | adcs $acc5,$acc5,$t1
|
---|
807 | umulh $t1,$a7,$a1
|
---|
808 | adcs $acc6,$acc6,$t2
|
---|
809 | mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
|
---|
810 | adcs $acc7,$acc7,$t3
|
---|
811 | mul $t3,$a4,$a2
|
---|
812 | adcs $acc0,$acc0,$t0
|
---|
813 | mul $t0,$a5,$a2
|
---|
814 | adc $acc1,$acc1,$t1
|
---|
815 |
|
---|
816 | mul $t1,$a6,$a2
|
---|
817 | adds $acc5,$acc5,$t2
|
---|
818 | mul $t2,$a7,$a2
|
---|
819 | adcs $acc6,$acc6,$t3
|
---|
820 | umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
|
---|
821 | adcs $acc7,$acc7,$t0
|
---|
822 | umulh $t0,$a4,$a2
|
---|
823 | adcs $acc0,$acc0,$t1
|
---|
824 | umulh $t1,$a5,$a2
|
---|
825 | adcs $acc1,$acc1,$t2
|
---|
826 | umulh $t2,$a6,$a2
|
---|
827 | stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
|
---|
828 | adc $acc2,xzr,xzr // t[10]
|
---|
829 | adds $acc6,$acc6,$t3
|
---|
830 | umulh $t3,$a7,$a2
|
---|
831 | adcs $acc7,$acc7,$t0
|
---|
832 | mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
|
---|
833 | adcs $acc0,$acc0,$t1
|
---|
834 | mul $t1,$a5,$a3
|
---|
835 | adcs $acc1,$acc1,$t2
|
---|
836 | mul $t2,$a6,$a3
|
---|
837 | adc $acc2,$acc2,$t3
|
---|
838 |
|
---|
839 | mul $t3,$a7,$a3
|
---|
840 | adds $acc7,$acc7,$t0
|
---|
841 | umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
|
---|
842 | adcs $acc0,$acc0,$t1
|
---|
843 | umulh $t1,$a5,$a3
|
---|
844 | adcs $acc1,$acc1,$t2
|
---|
845 | umulh $t2,$a6,$a3
|
---|
846 | adcs $acc2,$acc2,$t3
|
---|
847 | umulh $t3,$a7,$a3
|
---|
848 | stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
|
---|
849 | adc $acc3,xzr,xzr // t[11]
|
---|
850 | adds $acc0,$acc0,$t0
|
---|
851 | mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
|
---|
852 | adcs $acc1,$acc1,$t1
|
---|
853 | mul $t1,$a6,$a4
|
---|
854 | adcs $acc2,$acc2,$t2
|
---|
855 | mul $t2,$a7,$a4
|
---|
856 | adc $acc3,$acc3,$t3
|
---|
857 |
|
---|
858 | umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
|
---|
859 | adds $acc1,$acc1,$t0
|
---|
860 | umulh $t0,$a6,$a4
|
---|
861 | adcs $acc2,$acc2,$t1
|
---|
862 | umulh $t1,$a7,$a4
|
---|
863 | adcs $acc3,$acc3,$t2
|
---|
864 | mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
|
---|
865 | adc $acc4,xzr,xzr // t[12]
|
---|
866 | adds $acc2,$acc2,$t3
|
---|
867 | mul $t3,$a7,$a5
|
---|
868 | adcs $acc3,$acc3,$t0
|
---|
869 | umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
|
---|
870 | adc $acc4,$acc4,$t1
|
---|
871 |
|
---|
872 | umulh $t1,$a7,$a5
|
---|
873 | adds $acc3,$acc3,$t2
|
---|
874 | mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
|
---|
875 | adcs $acc4,$acc4,$t3
|
---|
876 | umulh $t3,$a7,$a6 // hi(a[7]*a[6])
|
---|
877 | adc $acc5,xzr,xzr // t[13]
|
---|
878 | adds $acc4,$acc4,$t0
|
---|
879 | sub $cnt,$ap_end,$ap // done yet?
|
---|
880 | adc $acc5,$acc5,$t1
|
---|
881 |
|
---|
882 | adds $acc5,$acc5,$t2
|
---|
883 | sub $t0,$ap_end,$num // rewinded ap
|
---|
884 | adc $acc6,xzr,xzr // t[14]
|
---|
885 | add $acc6,$acc6,$t3
|
---|
886 |
|
---|
887 | cbz $cnt,.Lsqr8x_outer_break
|
---|
888 |
|
---|
889 | mov $n0,$a0
|
---|
890 | ldp $a0,$a1,[$tp,#8*0]
|
---|
891 | ldp $a2,$a3,[$tp,#8*2]
|
---|
892 | ldp $a4,$a5,[$tp,#8*4]
|
---|
893 | ldp $a6,$a7,[$tp,#8*6]
|
---|
894 | adds $acc0,$acc0,$a0
|
---|
895 | adcs $acc1,$acc1,$a1
|
---|
896 | ldp $a0,$a1,[$ap,#8*0]
|
---|
897 | adcs $acc2,$acc2,$a2
|
---|
898 | adcs $acc3,$acc3,$a3
|
---|
899 | ldp $a2,$a3,[$ap,#8*2]
|
---|
900 | adcs $acc4,$acc4,$a4
|
---|
901 | adcs $acc5,$acc5,$a5
|
---|
902 | ldp $a4,$a5,[$ap,#8*4]
|
---|
903 | adcs $acc6,$acc6,$a6
|
---|
904 | mov $rp,$ap
|
---|
905 | adcs $acc7,xzr,$a7
|
---|
906 | ldp $a6,$a7,[$ap,#8*6]
|
---|
907 | add $ap,$ap,#8*8
|
---|
908 | //adc $carry,xzr,xzr // moved below
|
---|
909 | mov $cnt,#-8*8
|
---|
910 |
|
---|
911 | // a[8]a[0]
|
---|
912 | // a[9]a[0]
|
---|
913 | // a[a]a[0]
|
---|
914 | // a[b]a[0]
|
---|
915 | // a[c]a[0]
|
---|
916 | // a[d]a[0]
|
---|
917 | // a[e]a[0]
|
---|
918 | // a[f]a[0]
|
---|
919 | // a[8]a[1]
|
---|
920 | // a[f]a[1]........................
|
---|
921 | // a[8]a[2]
|
---|
922 | // a[f]a[2]........................
|
---|
923 | // a[8]a[3]
|
---|
924 | // a[f]a[3]........................
|
---|
925 | // a[8]a[4]
|
---|
926 | // a[f]a[4]........................
|
---|
927 | // a[8]a[5]
|
---|
928 | // a[f]a[5]........................
|
---|
929 | // a[8]a[6]
|
---|
930 | // a[f]a[6]........................
|
---|
931 | // a[8]a[7]
|
---|
932 | // a[f]a[7]........................
|
---|
933 | .Lsqr8x_mul:
|
---|
934 | mul $t0,$a0,$n0
|
---|
935 | adc $carry,xzr,xzr // carry bit, modulo-scheduled
|
---|
936 | mul $t1,$a1,$n0
|
---|
937 | add $cnt,$cnt,#8
|
---|
938 | mul $t2,$a2,$n0
|
---|
939 | mul $t3,$a3,$n0
|
---|
940 | adds $acc0,$acc0,$t0
|
---|
941 | mul $t0,$a4,$n0
|
---|
942 | adcs $acc1,$acc1,$t1
|
---|
943 | mul $t1,$a5,$n0
|
---|
944 | adcs $acc2,$acc2,$t2
|
---|
945 | mul $t2,$a6,$n0
|
---|
946 | adcs $acc3,$acc3,$t3
|
---|
947 | mul $t3,$a7,$n0
|
---|
948 | adcs $acc4,$acc4,$t0
|
---|
949 | umulh $t0,$a0,$n0
|
---|
950 | adcs $acc5,$acc5,$t1
|
---|
951 | umulh $t1,$a1,$n0
|
---|
952 | adcs $acc6,$acc6,$t2
|
---|
953 | umulh $t2,$a2,$n0
|
---|
954 | adcs $acc7,$acc7,$t3
|
---|
955 | umulh $t3,$a3,$n0
|
---|
956 | adc $carry,$carry,xzr
|
---|
957 | str $acc0,[$tp],#8
|
---|
958 | adds $acc0,$acc1,$t0
|
---|
959 | umulh $t0,$a4,$n0
|
---|
960 | adcs $acc1,$acc2,$t1
|
---|
961 | umulh $t1,$a5,$n0
|
---|
962 | adcs $acc2,$acc3,$t2
|
---|
963 | umulh $t2,$a6,$n0
|
---|
964 | adcs $acc3,$acc4,$t3
|
---|
965 | umulh $t3,$a7,$n0
|
---|
966 | ldr $n0,[$rp,$cnt]
|
---|
967 | adcs $acc4,$acc5,$t0
|
---|
968 | adcs $acc5,$acc6,$t1
|
---|
969 | adcs $acc6,$acc7,$t2
|
---|
970 | adcs $acc7,$carry,$t3
|
---|
971 | //adc $carry,xzr,xzr // moved above
|
---|
972 | cbnz $cnt,.Lsqr8x_mul
|
---|
973 | // note that carry flag is guaranteed
|
---|
974 | // to be zero at this point
|
---|
975 | cmp $ap,$ap_end // done yet?
|
---|
976 | b.eq .Lsqr8x_break
|
---|
977 |
|
---|
978 | ldp $a0,$a1,[$tp,#8*0]
|
---|
979 | ldp $a2,$a3,[$tp,#8*2]
|
---|
980 | ldp $a4,$a5,[$tp,#8*4]
|
---|
981 | ldp $a6,$a7,[$tp,#8*6]
|
---|
982 | adds $acc0,$acc0,$a0
|
---|
983 | ldur $n0,[$rp,#-8*8]
|
---|
984 | adcs $acc1,$acc1,$a1
|
---|
985 | ldp $a0,$a1,[$ap,#8*0]
|
---|
986 | adcs $acc2,$acc2,$a2
|
---|
987 | adcs $acc3,$acc3,$a3
|
---|
988 | ldp $a2,$a3,[$ap,#8*2]
|
---|
989 | adcs $acc4,$acc4,$a4
|
---|
990 | adcs $acc5,$acc5,$a5
|
---|
991 | ldp $a4,$a5,[$ap,#8*4]
|
---|
992 | adcs $acc6,$acc6,$a6
|
---|
993 | mov $cnt,#-8*8
|
---|
994 | adcs $acc7,$acc7,$a7
|
---|
995 | ldp $a6,$a7,[$ap,#8*6]
|
---|
996 | add $ap,$ap,#8*8
|
---|
997 | //adc $carry,xzr,xzr // moved above
|
---|
998 | b .Lsqr8x_mul
|
---|
999 |
|
---|
1000 | .align 4
|
---|
1001 | .Lsqr8x_break:
|
---|
1002 | ldp $a0,$a1,[$rp,#8*0]
|
---|
1003 | add $ap,$rp,#8*8
|
---|
1004 | ldp $a2,$a3,[$rp,#8*2]
|
---|
1005 | sub $t0,$ap_end,$ap // is it last iteration?
|
---|
1006 | ldp $a4,$a5,[$rp,#8*4]
|
---|
1007 | sub $t1,$tp,$t0
|
---|
1008 | ldp $a6,$a7,[$rp,#8*6]
|
---|
1009 | cbz $t0,.Lsqr8x_outer_loop
|
---|
1010 |
|
---|
1011 | stp $acc0,$acc1,[$tp,#8*0]
|
---|
1012 | ldp $acc0,$acc1,[$t1,#8*0]
|
---|
1013 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1014 | ldp $acc2,$acc3,[$t1,#8*2]
|
---|
1015 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1016 | ldp $acc4,$acc5,[$t1,#8*4]
|
---|
1017 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1018 | mov $tp,$t1
|
---|
1019 | ldp $acc6,$acc7,[$t1,#8*6]
|
---|
1020 | b .Lsqr8x_outer_loop
|
---|
1021 |
|
---|
1022 | .align 4
|
---|
1023 | .Lsqr8x_outer_break:
|
---|
1024 | // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
|
---|
1025 | ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
|
---|
1026 | ldp $t1,$t2,[sp,#8*1]
|
---|
1027 | ldp $a5,$a7,[$t0,#8*2]
|
---|
1028 | add $ap,$t0,#8*4
|
---|
1029 | ldp $t3,$t0,[sp,#8*3]
|
---|
1030 |
|
---|
1031 | stp $acc0,$acc1,[$tp,#8*0]
|
---|
1032 | mul $acc0,$a1,$a1
|
---|
1033 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1034 | umulh $a1,$a1,$a1
|
---|
1035 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1036 | mul $a2,$a3,$a3
|
---|
1037 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1038 | mov $tp,sp
|
---|
1039 | umulh $a3,$a3,$a3
|
---|
1040 | adds $acc1,$a1,$t1,lsl#1
|
---|
1041 | extr $t1,$t2,$t1,#63
|
---|
1042 | sub $cnt,$num,#8*4
|
---|
1043 |
|
---|
1044 | .Lsqr4x_shift_n_add:
|
---|
1045 | adcs $acc2,$a2,$t1
|
---|
1046 | extr $t2,$t3,$t2,#63
|
---|
1047 | sub $cnt,$cnt,#8*4
|
---|
1048 | adcs $acc3,$a3,$t2
|
---|
1049 | ldp $t1,$t2,[$tp,#8*5]
|
---|
1050 | mul $a4,$a5,$a5
|
---|
1051 | ldp $a1,$a3,[$ap],#8*2
|
---|
1052 | umulh $a5,$a5,$a5
|
---|
1053 | mul $a6,$a7,$a7
|
---|
1054 | umulh $a7,$a7,$a7
|
---|
1055 | extr $t3,$t0,$t3,#63
|
---|
1056 | stp $acc0,$acc1,[$tp,#8*0]
|
---|
1057 | adcs $acc4,$a4,$t3
|
---|
1058 | extr $t0,$t1,$t0,#63
|
---|
1059 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1060 | adcs $acc5,$a5,$t0
|
---|
1061 | ldp $t3,$t0,[$tp,#8*7]
|
---|
1062 | extr $t1,$t2,$t1,#63
|
---|
1063 | adcs $acc6,$a6,$t1
|
---|
1064 | extr $t2,$t3,$t2,#63
|
---|
1065 | adcs $acc7,$a7,$t2
|
---|
1066 | ldp $t1,$t2,[$tp,#8*9]
|
---|
1067 | mul $a0,$a1,$a1
|
---|
1068 | ldp $a5,$a7,[$ap],#8*2
|
---|
1069 | umulh $a1,$a1,$a1
|
---|
1070 | mul $a2,$a3,$a3
|
---|
1071 | umulh $a3,$a3,$a3
|
---|
1072 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1073 | extr $t3,$t0,$t3,#63
|
---|
1074 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1075 | add $tp,$tp,#8*8
|
---|
1076 | adcs $acc0,$a0,$t3
|
---|
1077 | extr $t0,$t1,$t0,#63
|
---|
1078 | adcs $acc1,$a1,$t0
|
---|
1079 | ldp $t3,$t0,[$tp,#8*3]
|
---|
1080 | extr $t1,$t2,$t1,#63
|
---|
1081 | cbnz $cnt,.Lsqr4x_shift_n_add
|
---|
1082 | ___
|
---|
# Squaring proper is finished at this point, so the registers that held
# the a[] pointers are free; alias them as the modulus pointers ($np,
# $np_end) for the Montgomery reduction phase emitted below.
1083 | my ($np,$np_end)=($ap,$ap_end);
|
---|
1084 | $code.=<<___;
|
---|
1085 | ldp $np,$n0,[x29,#104] // pull np and n0
|
---|
1086 |
|
---|
1087 | adcs $acc2,$a2,$t1
|
---|
1088 | extr $t2,$t3,$t2,#63
|
---|
1089 | adcs $acc3,$a3,$t2
|
---|
1090 | ldp $t1,$t2,[$tp,#8*5]
|
---|
1091 | mul $a4,$a5,$a5
|
---|
1092 | umulh $a5,$a5,$a5
|
---|
1093 | stp $acc0,$acc1,[$tp,#8*0]
|
---|
1094 | mul $a6,$a7,$a7
|
---|
1095 | umulh $a7,$a7,$a7
|
---|
1096 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1097 | extr $t3,$t0,$t3,#63
|
---|
1098 | adcs $acc4,$a4,$t3
|
---|
1099 | extr $t0,$t1,$t0,#63
|
---|
1100 | ldp $acc0,$acc1,[sp,#8*0]
|
---|
1101 | adcs $acc5,$a5,$t0
|
---|
1102 | extr $t1,$t2,$t1,#63
|
---|
1103 | ldp $a0,$a1,[$np,#8*0]
|
---|
1104 | adcs $acc6,$a6,$t1
|
---|
1105 | extr $t2,xzr,$t2,#63
|
---|
1106 | ldp $a2,$a3,[$np,#8*2]
|
---|
1107 | adc $acc7,$a7,$t2
|
---|
1108 | ldp $a4,$a5,[$np,#8*4]
|
---|
1109 |
|
---|
1110 | // Reduce by 512 bits per iteration
|
---|
1111 | mul $na0,$n0,$acc0 // t[0]*n0
|
---|
1112 | ldp $a6,$a7,[$np,#8*6]
|
---|
1113 | add $np_end,$np,$num
|
---|
1114 | ldp $acc2,$acc3,[sp,#8*2]
|
---|
1115 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1116 | ldp $acc4,$acc5,[sp,#8*4]
|
---|
1117 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1118 | ldp $acc6,$acc7,[sp,#8*6]
|
---|
1119 | add $np,$np,#8*8
|
---|
1120 | mov $topmost,xzr // initial top-most carry
|
---|
1121 | mov $tp,sp
|
---|
1122 | mov $cnt,#8
|
---|
1123 |
|
---|
1124 | .Lsqr8x_reduction:
|
---|
1125 | // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
|
---|
1126 | mul $t1,$a1,$na0
|
---|
1127 | sub $cnt,$cnt,#1
|
---|
1128 | mul $t2,$a2,$na0
|
---|
1129 | str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
|
---|
1130 | mul $t3,$a3,$na0
|
---|
1131 | // (*) adds xzr,$acc0,$t0
|
---|
1132 | subs xzr,$acc0,#1 // (*)
|
---|
1133 | mul $t0,$a4,$na0
|
---|
1134 | adcs $acc0,$acc1,$t1
|
---|
1135 | mul $t1,$a5,$na0
|
---|
1136 | adcs $acc1,$acc2,$t2
|
---|
1137 | mul $t2,$a6,$na0
|
---|
1138 | adcs $acc2,$acc3,$t3
|
---|
1139 | mul $t3,$a7,$na0
|
---|
1140 | adcs $acc3,$acc4,$t0
|
---|
1141 | umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
|
---|
1142 | adcs $acc4,$acc5,$t1
|
---|
1143 | umulh $t1,$a1,$na0
|
---|
1144 | adcs $acc5,$acc6,$t2
|
---|
1145 | umulh $t2,$a2,$na0
|
---|
1146 | adcs $acc6,$acc7,$t3
|
---|
1147 | umulh $t3,$a3,$na0
|
---|
1148 | adc $acc7,xzr,xzr
|
---|
1149 | adds $acc0,$acc0,$t0
|
---|
1150 | umulh $t0,$a4,$na0
|
---|
1151 | adcs $acc1,$acc1,$t1
|
---|
1152 | umulh $t1,$a5,$na0
|
---|
1153 | adcs $acc2,$acc2,$t2
|
---|
1154 | umulh $t2,$a6,$na0
|
---|
1155 | adcs $acc3,$acc3,$t3
|
---|
1156 | umulh $t3,$a7,$na0
|
---|
1157 | mul $na0,$n0,$acc0 // next t[0]*n0
|
---|
1158 | adcs $acc4,$acc4,$t0
|
---|
1159 | adcs $acc5,$acc5,$t1
|
---|
1160 | adcs $acc6,$acc6,$t2
|
---|
1161 | adc $acc7,$acc7,$t3
|
---|
1162 | cbnz $cnt,.Lsqr8x_reduction
|
---|
1163 |
|
---|
1164 | ldp $t0,$t1,[$tp,#8*0]
|
---|
1165 | ldp $t2,$t3,[$tp,#8*2]
|
---|
1166 | mov $rp,$tp
|
---|
1167 | sub $cnt,$np_end,$np // done yet?
|
---|
1168 | adds $acc0,$acc0,$t0
|
---|
1169 | adcs $acc1,$acc1,$t1
|
---|
1170 | ldp $t0,$t1,[$tp,#8*4]
|
---|
1171 | adcs $acc2,$acc2,$t2
|
---|
1172 | adcs $acc3,$acc3,$t3
|
---|
1173 | ldp $t2,$t3,[$tp,#8*6]
|
---|
1174 | adcs $acc4,$acc4,$t0
|
---|
1175 | adcs $acc5,$acc5,$t1
|
---|
1176 | adcs $acc6,$acc6,$t2
|
---|
1177 | adcs $acc7,$acc7,$t3
|
---|
1178 | //adc $carry,xzr,xzr // moved below
|
---|
1179 | cbz $cnt,.Lsqr8x8_post_condition
|
---|
1180 |
|
---|
1181 | ldur $n0,[$tp,#-8*8]
|
---|
1182 | ldp $a0,$a1,[$np,#8*0]
|
---|
1183 | ldp $a2,$a3,[$np,#8*2]
|
---|
1184 | ldp $a4,$a5,[$np,#8*4]
|
---|
1185 | mov $cnt,#-8*8
|
---|
1186 | ldp $a6,$a7,[$np,#8*6]
|
---|
1187 | add $np,$np,#8*8
|
---|
1188 |
|
---|
1189 | .Lsqr8x_tail:
|
---|
1190 | mul $t0,$a0,$n0
|
---|
1191 | adc $carry,xzr,xzr // carry bit, modulo-scheduled
|
---|
1192 | mul $t1,$a1,$n0
|
---|
1193 | add $cnt,$cnt,#8
|
---|
1194 | mul $t2,$a2,$n0
|
---|
1195 | mul $t3,$a3,$n0
|
---|
1196 | adds $acc0,$acc0,$t0
|
---|
1197 | mul $t0,$a4,$n0
|
---|
1198 | adcs $acc1,$acc1,$t1
|
---|
1199 | mul $t1,$a5,$n0
|
---|
1200 | adcs $acc2,$acc2,$t2
|
---|
1201 | mul $t2,$a6,$n0
|
---|
1202 | adcs $acc3,$acc3,$t3
|
---|
1203 | mul $t3,$a7,$n0
|
---|
1204 | adcs $acc4,$acc4,$t0
|
---|
1205 | umulh $t0,$a0,$n0
|
---|
1206 | adcs $acc5,$acc5,$t1
|
---|
1207 | umulh $t1,$a1,$n0
|
---|
1208 | adcs $acc6,$acc6,$t2
|
---|
1209 | umulh $t2,$a2,$n0
|
---|
1210 | adcs $acc7,$acc7,$t3
|
---|
1211 | umulh $t3,$a3,$n0
|
---|
1212 | adc $carry,$carry,xzr
|
---|
1213 | str $acc0,[$tp],#8
|
---|
1214 | adds $acc0,$acc1,$t0
|
---|
1215 | umulh $t0,$a4,$n0
|
---|
1216 | adcs $acc1,$acc2,$t1
|
---|
1217 | umulh $t1,$a5,$n0
|
---|
1218 | adcs $acc2,$acc3,$t2
|
---|
1219 | umulh $t2,$a6,$n0
|
---|
1220 | adcs $acc3,$acc4,$t3
|
---|
1221 | umulh $t3,$a7,$n0
|
---|
1222 | ldr $n0,[$rp,$cnt]
|
---|
1223 | adcs $acc4,$acc5,$t0
|
---|
1224 | adcs $acc5,$acc6,$t1
|
---|
1225 | adcs $acc6,$acc7,$t2
|
---|
1226 | adcs $acc7,$carry,$t3
|
---|
1227 | //adc $carry,xzr,xzr // moved above
|
---|
1228 | cbnz $cnt,.Lsqr8x_tail
|
---|
1229 | // note that carry flag is guaranteed
|
---|
1230 | // to be zero at this point
|
---|
1231 | ldp $a0,$a1,[$tp,#8*0]
|
---|
1232 | sub $cnt,$np_end,$np // done yet?
|
---|
1233 | sub $t2,$np_end,$num // rewinded np
|
---|
1234 | ldp $a2,$a3,[$tp,#8*2]
|
---|
1235 | ldp $a4,$a5,[$tp,#8*4]
|
---|
1236 | ldp $a6,$a7,[$tp,#8*6]
|
---|
1237 | cbz $cnt,.Lsqr8x_tail_break
|
---|
1238 |
|
---|
1239 | ldur $n0,[$rp,#-8*8]
|
---|
1240 | adds $acc0,$acc0,$a0
|
---|
1241 | adcs $acc1,$acc1,$a1
|
---|
1242 | ldp $a0,$a1,[$np,#8*0]
|
---|
1243 | adcs $acc2,$acc2,$a2
|
---|
1244 | adcs $acc3,$acc3,$a3
|
---|
1245 | ldp $a2,$a3,[$np,#8*2]
|
---|
1246 | adcs $acc4,$acc4,$a4
|
---|
1247 | adcs $acc5,$acc5,$a5
|
---|
1248 | ldp $a4,$a5,[$np,#8*4]
|
---|
1249 | adcs $acc6,$acc6,$a6
|
---|
1250 | mov $cnt,#-8*8
|
---|
1251 | adcs $acc7,$acc7,$a7
|
---|
1252 | ldp $a6,$a7,[$np,#8*6]
|
---|
1253 | add $np,$np,#8*8
|
---|
1254 | //adc $carry,xzr,xzr // moved above
|
---|
1255 | b .Lsqr8x_tail
|
---|
1256 |
|
---|
1257 | .align 4
|
---|
1258 | .Lsqr8x_tail_break:
|
---|
1259 | ldr $n0,[x29,#112] // pull n0
|
---|
1260 | add $cnt,$tp,#8*8 // end of current t[num] window
|
---|
1261 |
|
---|
1262 | subs xzr,$topmost,#1 // "move" top-most carry to carry bit
|
---|
1263 | adcs $t0,$acc0,$a0
|
---|
1264 | adcs $t1,$acc1,$a1
|
---|
1265 | ldp $acc0,$acc1,[$rp,#8*0]
|
---|
1266 | adcs $acc2,$acc2,$a2
|
---|
1267 | ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
|
---|
1268 | adcs $acc3,$acc3,$a3
|
---|
1269 | ldp $a2,$a3,[$t2,#8*2]
|
---|
1270 | adcs $acc4,$acc4,$a4
|
---|
1271 | adcs $acc5,$acc5,$a5
|
---|
1272 | ldp $a4,$a5,[$t2,#8*4]
|
---|
1273 | adcs $acc6,$acc6,$a6
|
---|
1274 | adcs $acc7,$acc7,$a7
|
---|
1275 | ldp $a6,$a7,[$t2,#8*6]
|
---|
1276 | add $np,$t2,#8*8
|
---|
1277 | adc $topmost,xzr,xzr // top-most carry
|
---|
1278 | mul $na0,$n0,$acc0
|
---|
1279 | stp $t0,$t1,[$tp,#8*0]
|
---|
1280 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1281 | ldp $acc2,$acc3,[$rp,#8*2]
|
---|
1282 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1283 | ldp $acc4,$acc5,[$rp,#8*4]
|
---|
1284 | cmp $cnt,x29 // did we hit the bottom?
|
---|
1285 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1286 | mov $tp,$rp // slide the window
|
---|
1287 | ldp $acc6,$acc7,[$rp,#8*6]
|
---|
1288 | mov $cnt,#8
|
---|
1289 | b.ne .Lsqr8x_reduction
|
---|
1290 |
|
---|
1291 | // Final step. We see if result is larger than modulus, and
|
---|
1292 | // if it is, subtract the modulus. But comparison implies
|
---|
1293 | // subtraction. So we subtract modulus, see if it borrowed,
|
---|
1294 | // and conditionally copy original value.
|
---|
1295 | ldr $rp,[x29,#96] // pull rp
|
---|
1296 | add $tp,$tp,#8*8
|
---|
1297 | subs $t0,$acc0,$a0
|
---|
1298 | sbcs $t1,$acc1,$a1
|
---|
1299 | sub $cnt,$num,#8*8
|
---|
1300 | mov $ap_end,$rp // $rp copy
|
---|
1301 |
|
---|
// Conditional-subtraction pass: subtract the modulus n[] from the
// result t[] eight limbs per iteration, streaming the differences to
// rp.  The borrow propagates across iterations through the sbcs chain
// (started just before the loop); loads of the next n[]/t[] limbs are
// interleaved with the subtracts, matching the scheduling style used
// throughout this module.
1302 | .Lsqr8x_sub:
|
---|
1303 | sbcs $t2,$acc2,$a2
|
---|
1304 | ldp $a0,$a1,[$np,#8*0]
|
---|
1305 | sbcs $t3,$acc3,$a3
|
---|
1306 | stp $t0,$t1,[$rp,#8*0]
|
---|
1307 | sbcs $t0,$acc4,$a4
|
---|
1308 | ldp $a2,$a3,[$np,#8*2]
|
---|
1309 | sbcs $t1,$acc5,$a5
|
---|
1310 | stp $t2,$t3,[$rp,#8*2]
|
---|
1311 | sbcs $t2,$acc6,$a6
|
---|
1312 | ldp $a4,$a5,[$np,#8*4]
|
---|
1313 | sbcs $t3,$acc7,$a7
|
---|
1314 | ldp $a6,$a7,[$np,#8*6]
|
---|
1315 | add $np,$np,#8*8
|
---|
1316 | ldp $acc0,$acc1,[$tp,#8*0]
|
---|
1317 | sub $cnt,$cnt,#8*8
|
---|
1318 | ldp $acc2,$acc3,[$tp,#8*2]
|
---|
1319 | ldp $acc4,$acc5,[$tp,#8*4]
|
---|
1320 | ldp $acc6,$acc7,[$tp,#8*6]
|
---|
1321 | add $tp,$tp,#8*8
|
---|
1322 | stp $t0,$t1,[$rp,#8*4]
|
---|
1323 | sbcs $t0,$acc0,$a0
|
---|
1324 | stp $t2,$t3,[$rp,#8*6]
|
---|
1325 | add $rp,$rp,#8*8
|
---|
1326 | sbcs $t1,$acc1,$a1
|
---|
1327 | cbnz $cnt,.Lsqr8x_sub
|
---|
1328 |
|
---|
1329 | sbcs $t2,$acc2,$a2
|
---|
1330 | mov $tp,sp
|
---|
1331 | add $ap,sp,$num
|
---|
1332 | ldp $a0,$a1,[$ap_end,#8*0]
|
---|
1333 | sbcs $t3,$acc3,$a3
|
---|
1334 | stp $t0,$t1,[$rp,#8*0]
|
---|
1335 | sbcs $t0,$acc4,$a4
|
---|
1336 | ldp $a2,$a3,[$ap_end,#8*2]
|
---|
1337 | sbcs $t1,$acc5,$a5
|
---|
1338 | stp $t2,$t3,[$rp,#8*2]
|
---|
1339 | sbcs $t2,$acc6,$a6
|
---|
1340 | ldp $acc0,$acc1,[$ap,#8*0]
|
---|
1341 | sbcs $t3,$acc7,$a7
|
---|
1342 | ldp $acc2,$acc3,[$ap,#8*2]
|
---|
1343 | sbcs xzr,$topmost,xzr // did it borrow?
|
---|
1344 | ldr x30,[x29,#8] // pull return address
|
---|
1345 | stp $t0,$t1,[$rp,#8*4]
|
---|
1346 | stp $t2,$t3,[$rp,#8*6]
|
---|
1347 |
|
---|
1348 | sub $cnt,$num,#8*4
|
---|
1349 | .Lsqr4x_cond_copy:
|
---|
1350 | sub $cnt,$cnt,#8*4
|
---|
1351 | csel $t0,$acc0,$a0,lo
|
---|
1352 | stp xzr,xzr,[$tp,#8*0]
|
---|
1353 | csel $t1,$acc1,$a1,lo
|
---|
1354 | ldp $a0,$a1,[$ap_end,#8*4]
|
---|
1355 | ldp $acc0,$acc1,[$ap,#8*4]
|
---|
1356 | csel $t2,$acc2,$a2,lo
|
---|
1357 | stp xzr,xzr,[$tp,#8*2]
|
---|
1358 | add $tp,$tp,#8*4
|
---|
1359 | csel $t3,$acc3,$a3,lo
|
---|
1360 | ldp $a2,$a3,[$ap_end,#8*6]
|
---|
1361 | ldp $acc2,$acc3,[$ap,#8*6]
|
---|
1362 | add $ap,$ap,#8*4
|
---|
1363 | stp $t0,$t1,[$ap_end,#8*0]
|
---|
1364 | stp $t2,$t3,[$ap_end,#8*2]
|
---|
1365 | add $ap_end,$ap_end,#8*4
|
---|
1366 | stp xzr,xzr,[$ap,#8*0]
|
---|
1367 | stp xzr,xzr,[$ap,#8*2]
|
---|
1368 | cbnz $cnt,.Lsqr4x_cond_copy
|
---|
1369 |
|
---|
1370 | csel $t0,$acc0,$a0,lo
|
---|
1371 | stp xzr,xzr,[$tp,#8*0]
|
---|
1372 | csel $t1,$acc1,$a1,lo
|
---|
1373 | stp xzr,xzr,[$tp,#8*2]
|
---|
1374 | csel $t2,$acc2,$a2,lo
|
---|
1375 | csel $t3,$acc3,$a3,lo
|
---|
1376 | stp $t0,$t1,[$ap_end,#8*0]
|
---|
1377 | stp $t2,$t3,[$ap_end,#8*2]
|
---|
1378 |
|
---|
1379 | b .Lsqr8x_done
|
---|
1380 |
|
---|
1381 | .align 4
|
---|
1382 | .Lsqr8x8_post_condition:
|
---|
1383 | adc $carry,xzr,xzr
|
---|
1384 | ldr x30,[x29,#8] // pull return address
|
---|
1385 | // $acc0-7,$carry hold result, $a0-7 hold modulus
|
---|
1386 | subs $a0,$acc0,$a0
|
---|
1387 | ldr $ap,[x29,#96] // pull rp
|
---|
1388 | sbcs $a1,$acc1,$a1
|
---|
1389 | stp xzr,xzr,[sp,#8*0]
|
---|
1390 | sbcs $a2,$acc2,$a2
|
---|
1391 | stp xzr,xzr,[sp,#8*2]
|
---|
1392 | sbcs $a3,$acc3,$a3
|
---|
1393 | stp xzr,xzr,[sp,#8*4]
|
---|
1394 | sbcs $a4,$acc4,$a4
|
---|
1395 | stp xzr,xzr,[sp,#8*6]
|
---|
1396 | sbcs $a5,$acc5,$a5
|
---|
1397 | stp xzr,xzr,[sp,#8*8]
|
---|
1398 | sbcs $a6,$acc6,$a6
|
---|
1399 | stp xzr,xzr,[sp,#8*10]
|
---|
1400 | sbcs $a7,$acc7,$a7
|
---|
1401 | stp xzr,xzr,[sp,#8*12]
|
---|
1402 | sbcs $carry,$carry,xzr // did it borrow?
|
---|
1403 | stp xzr,xzr,[sp,#8*14]
|
---|
1404 |
|
---|
1405 | // $a0-7 hold result-modulus
|
---|
1406 | csel $a0,$acc0,$a0,lo
|
---|
1407 | csel $a1,$acc1,$a1,lo
|
---|
1408 | csel $a2,$acc2,$a2,lo
|
---|
1409 | csel $a3,$acc3,$a3,lo
|
---|
1410 | stp $a0,$a1,[$ap,#8*0]
|
---|
1411 | csel $a4,$acc4,$a4,lo
|
---|
1412 | csel $a5,$acc5,$a5,lo
|
---|
1413 | stp $a2,$a3,[$ap,#8*2]
|
---|
1414 | csel $a6,$acc6,$a6,lo
|
---|
1415 | csel $a7,$acc7,$a7,lo
|
---|
1416 | stp $a4,$a5,[$ap,#8*4]
|
---|
1417 | stp $a6,$a7,[$ap,#8*6]
|
---|
1418 |
|
---|
1419 | .Lsqr8x_done:
|
---|
1420 | ldp x19,x20,[x29,#16]
|
---|
1421 | mov sp,x29
|
---|
1422 | ldp x21,x22,[x29,#32]
|
---|
1423 | mov x0,#1
|
---|
1424 | ldp x23,x24,[x29,#48]
|
---|
1425 | ldp x25,x26,[x29,#64]
|
---|
1426 | ldp x27,x28,[x29,#80]
|
---|
1427 | ldr x29,[sp],#128
|
---|
1428 | .inst 0xd50323bf // autiasp
|
---|
1429 | ret
|
---|
1430 | .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
|
---|
1431 | ___
|
---|
1432 | }
|
---|
1433 |
|
---|
1434 | {
|
---|
1435 | ########################################################################
|
---|
1436 | # Even though this might look as ARMv8 adaptation of mulx4x_mont from
|
---|
1437 | # x86_64-mont5 module, it's different in sense that it performs
|
---|
1438 | # reduction 256 bits at a time.
|
---|
1439 |
|
---|
# Register map for __bn_mul4x_mont: operand words ($a0..$a3), scratch
# temporaries ($t0..$t3), modulus words ($m0..$m3), accumulators
# ($acc0..$acc4) and loop/pointer state live in x6-x17 and x19-x28.
1440 | my ($a0,$a1,$a2,$a3,
|
---|
1441 | $t0,$t1,$t2,$t3,
|
---|
1442 | $m0,$m1,$m2,$m3,
|
---|
1443 | $acc0,$acc1,$acc2,$acc3,$acc4,
|
---|
1444 | $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
|
---|
# $bp_end and $carry reuse $rp's register: rp itself is offloaded to
# the stack frame early on (see the `stp $rp,$t0,[x29,#96]` below),
# so the register is free during the main loops.  $topmost lives in
# x30 (lr), which is restored from the frame before returning.
1445 | my $bp_end=$rp;
|
---|
1446 | my ($carry,$topmost) = ($rp,"x30");
|
---|
1447 |
|
---|
1448 | $code.=<<___;
|
---|
1449 | .type __bn_mul4x_mont,%function
|
---|
1450 | .align 5
|
---|
1451 | __bn_mul4x_mont:
|
---|
1452 | .inst 0xd503233f // paciasp
|
---|
1453 | stp x29,x30,[sp,#-128]!
|
---|
1454 | add x29,sp,#0
|
---|
1455 | stp x19,x20,[sp,#16]
|
---|
1456 | stp x21,x22,[sp,#32]
|
---|
1457 | stp x23,x24,[sp,#48]
|
---|
1458 | stp x25,x26,[sp,#64]
|
---|
1459 | stp x27,x28,[sp,#80]
|
---|
1460 |
|
---|
1461 | sub $tp,sp,$num,lsl#3
|
---|
1462 | lsl $num,$num,#3
|
---|
1463 | ldr $n0,[$n0] // *n0
|
---|
1464 | sub sp,$tp,#8*4 // alloca
|
---|
1465 |
|
---|
1466 | add $t0,$bp,$num
|
---|
1467 | add $ap_end,$ap,$num
|
---|
1468 | stp $rp,$t0,[x29,#96] // offload rp and &b[num]
|
---|
1469 |
|
---|
1470 | ldr $bi,[$bp,#8*0] // b[0]
|
---|
1471 | ldp $a0,$a1,[$ap,#8*0] // a[0..3]
|
---|
1472 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1473 | add $ap,$ap,#8*4
|
---|
1474 | mov $acc0,xzr
|
---|
1475 | mov $acc1,xzr
|
---|
1476 | mov $acc2,xzr
|
---|
1477 | mov $acc3,xzr
|
---|
1478 | ldp $m0,$m1,[$np,#8*0] // n[0..3]
|
---|
1479 | ldp $m2,$m3,[$np,#8*2]
|
---|
1480 | adds $np,$np,#8*4 // clear carry bit
|
---|
1481 | mov $carry,xzr
|
---|
1482 | mov $cnt,#0
|
---|
1483 | mov $tp,sp
|
---|
1484 |
|
---|
1485 | .Loop_mul4x_1st_reduction:
|
---|
1486 | mul $t0,$a0,$bi // lo(a[0..3]*b[0])
|
---|
1487 | adc $carry,$carry,xzr // modulo-scheduled
|
---|
1488 | mul $t1,$a1,$bi
|
---|
1489 | add $cnt,$cnt,#8
|
---|
1490 | mul $t2,$a2,$bi
|
---|
1491 | and $cnt,$cnt,#31
|
---|
1492 | mul $t3,$a3,$bi
|
---|
1493 | adds $acc0,$acc0,$t0
|
---|
1494 | umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
|
---|
1495 | adcs $acc1,$acc1,$t1
|
---|
1496 | mul $mi,$acc0,$n0 // t[0]*n0
|
---|
1497 | adcs $acc2,$acc2,$t2
|
---|
1498 | umulh $t1,$a1,$bi
|
---|
1499 | adcs $acc3,$acc3,$t3
|
---|
1500 | umulh $t2,$a2,$bi
|
---|
1501 | adc $acc4,xzr,xzr
|
---|
1502 | umulh $t3,$a3,$bi
|
---|
1503 | ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
|
---|
1504 | adds $acc1,$acc1,$t0
|
---|
1505 | // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
|
---|
1506 | str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
|
---|
1507 | adcs $acc2,$acc2,$t1
|
---|
1508 | mul $t1,$m1,$mi
|
---|
1509 | adcs $acc3,$acc3,$t2
|
---|
1510 | mul $t2,$m2,$mi
|
---|
1511 | adc $acc4,$acc4,$t3 // can't overflow
|
---|
1512 | mul $t3,$m3,$mi
|
---|
1513 | // (*) adds xzr,$acc0,$t0
|
---|
1514 | subs xzr,$acc0,#1 // (*)
|
---|
1515 | umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
|
---|
1516 | adcs $acc0,$acc1,$t1
|
---|
1517 | umulh $t1,$m1,$mi
|
---|
1518 | adcs $acc1,$acc2,$t2
|
---|
1519 | umulh $t2,$m2,$mi
|
---|
1520 | adcs $acc2,$acc3,$t3
|
---|
1521 | umulh $t3,$m3,$mi
|
---|
1522 | adcs $acc3,$acc4,$carry
|
---|
1523 | adc $carry,xzr,xzr
|
---|
1524 | adds $acc0,$acc0,$t0
|
---|
1525 | sub $t0,$ap_end,$ap
|
---|
1526 | adcs $acc1,$acc1,$t1
|
---|
1527 | adcs $acc2,$acc2,$t2
|
---|
1528 | adcs $acc3,$acc3,$t3
|
---|
1529 | //adc $carry,$carry,xzr
|
---|
1530 | cbnz $cnt,.Loop_mul4x_1st_reduction
|
---|
1531 |
|
---|
1532 | cbz $t0,.Lmul4x4_post_condition
|
---|
1533 |
|
---|
1534 | ldp $a0,$a1,[$ap,#8*0] // a[4..7]
|
---|
1535 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1536 | add $ap,$ap,#8*4
|
---|
1537 | ldr $mi,[sp] // a[0]*n0
|
---|
1538 | ldp $m0,$m1,[$np,#8*0] // n[4..7]
|
---|
1539 | ldp $m2,$m3,[$np,#8*2]
|
---|
1540 | add $np,$np,#8*4
|
---|
1541 |
|
---|
1542 | .Loop_mul4x_1st_tail:
|
---|
1543 | mul $t0,$a0,$bi // lo(a[4..7]*b[i])
|
---|
1544 | adc $carry,$carry,xzr // modulo-scheduled
|
---|
1545 | mul $t1,$a1,$bi
|
---|
1546 | add $cnt,$cnt,#8
|
---|
1547 | mul $t2,$a2,$bi
|
---|
1548 | and $cnt,$cnt,#31
|
---|
1549 | mul $t3,$a3,$bi
|
---|
1550 | adds $acc0,$acc0,$t0
|
---|
1551 | umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
|
---|
1552 | adcs $acc1,$acc1,$t1
|
---|
1553 | umulh $t1,$a1,$bi
|
---|
1554 | adcs $acc2,$acc2,$t2
|
---|
1555 | umulh $t2,$a2,$bi
|
---|
1556 | adcs $acc3,$acc3,$t3
|
---|
1557 | umulh $t3,$a3,$bi
|
---|
1558 | adc $acc4,xzr,xzr
|
---|
1559 | ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
|
---|
1560 | adds $acc1,$acc1,$t0
|
---|
1561 | mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
|
---|
1562 | adcs $acc2,$acc2,$t1
|
---|
1563 | mul $t1,$m1,$mi
|
---|
1564 | adcs $acc3,$acc3,$t2
|
---|
1565 | mul $t2,$m2,$mi
|
---|
1566 | adc $acc4,$acc4,$t3 // can't overflow
|
---|
1567 | mul $t3,$m3,$mi
|
---|
1568 | adds $acc0,$acc0,$t0
|
---|
1569 | umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
|
---|
1570 | adcs $acc1,$acc1,$t1
|
---|
1571 | umulh $t1,$m1,$mi
|
---|
1572 | adcs $acc2,$acc2,$t2
|
---|
1573 | umulh $t2,$m2,$mi
|
---|
1574 | adcs $acc3,$acc3,$t3
|
---|
1575 | adcs $acc4,$acc4,$carry
|
---|
1576 | umulh $t3,$m3,$mi
|
---|
1577 | adc $carry,xzr,xzr
|
---|
1578 | ldr $mi,[sp,$cnt] // next t[0]*n0
|
---|
1579 | str $acc0,[$tp],#8 // result!!!
|
---|
1580 | adds $acc0,$acc1,$t0
|
---|
1581 | sub $t0,$ap_end,$ap // done yet?
|
---|
1582 | adcs $acc1,$acc2,$t1
|
---|
1583 | adcs $acc2,$acc3,$t2
|
---|
1584 | adcs $acc3,$acc4,$t3
|
---|
1585 | //adc $carry,$carry,xzr
|
---|
1586 | cbnz $cnt,.Loop_mul4x_1st_tail
|
---|
1587 |
|
---|
1588 | sub $t1,$ap_end,$num // rewinded $ap
|
---|
1589 | cbz $t0,.Lmul4x_proceed
|
---|
1590 |
|
---|
1591 | ldp $a0,$a1,[$ap,#8*0]
|
---|
1592 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1593 | add $ap,$ap,#8*4
|
---|
1594 | ldp $m0,$m1,[$np,#8*0]
|
---|
1595 | ldp $m2,$m3,[$np,#8*2]
|
---|
1596 | add $np,$np,#8*4
|
---|
1597 | b .Loop_mul4x_1st_tail
|
---|
1598 |
|
---|
1599 | .align 5
|
---|
1600 | .Lmul4x_proceed:
|
---|
1601 | ldr $bi,[$bp,#8*4]! // *++b
|
---|
1602 | adc $topmost,$carry,xzr
|
---|
1603 | ldp $a0,$a1,[$t1,#8*0] // a[0..3]
|
---|
1604 | sub $np,$np,$num // rewind np
|
---|
1605 | ldp $a2,$a3,[$t1,#8*2]
|
---|
1606 | add $ap,$t1,#8*4
|
---|
1607 |
|
---|
1608 | stp $acc0,$acc1,[$tp,#8*0] // result!!!
|
---|
1609 | ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
|
---|
1610 | stp $acc2,$acc3,[$tp,#8*2] // result!!!
|
---|
1611 | ldp $acc2,$acc3,[sp,#8*6]
|
---|
1612 |
|
---|
1613 | ldp $m0,$m1,[$np,#8*0] // n[0..3]
|
---|
1614 | mov $tp,sp
|
---|
1615 | ldp $m2,$m3,[$np,#8*2]
|
---|
1616 | adds $np,$np,#8*4 // clear carry bit
|
---|
1617 | mov $carry,xzr
|
---|
1618 |
|
---|
1619 | .align 4
|
---|
1620 | .Loop_mul4x_reduction:
|
---|
1621 | mul $t0,$a0,$bi // lo(a[0..3]*b[4])
|
---|
1622 | adc $carry,$carry,xzr // modulo-scheduled
|
---|
1623 | mul $t1,$a1,$bi
|
---|
1624 | add $cnt,$cnt,#8
|
---|
1625 | mul $t2,$a2,$bi
|
---|
1626 | and $cnt,$cnt,#31
|
---|
1627 | mul $t3,$a3,$bi
|
---|
1628 | adds $acc0,$acc0,$t0
|
---|
1629 | umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
|
---|
1630 | adcs $acc1,$acc1,$t1
|
---|
1631 | mul $mi,$acc0,$n0 // t[0]*n0
|
---|
1632 | adcs $acc2,$acc2,$t2
|
---|
1633 | umulh $t1,$a1,$bi
|
---|
1634 | adcs $acc3,$acc3,$t3
|
---|
1635 | umulh $t2,$a2,$bi
|
---|
1636 | adc $acc4,xzr,xzr
|
---|
1637 | umulh $t3,$a3,$bi
|
---|
1638 | ldr $bi,[$bp,$cnt] // next b[i]
|
---|
1639 | adds $acc1,$acc1,$t0
|
---|
1640 | // (*) mul $t0,$m0,$mi
|
---|
1641 | str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
|
---|
1642 | adcs $acc2,$acc2,$t1
|
---|
1643 | mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
|
---|
1644 | adcs $acc3,$acc3,$t2
|
---|
1645 | mul $t2,$m2,$mi
|
---|
1646 | adc $acc4,$acc4,$t3 // can't overflow
|
---|
1647 | mul $t3,$m3,$mi
|
---|
1648 | // (*) adds xzr,$acc0,$t0
|
---|
1649 | subs xzr,$acc0,#1 // (*)
|
---|
1650 | umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
|
---|
1651 | adcs $acc0,$acc1,$t1
|
---|
1652 | umulh $t1,$m1,$mi
|
---|
1653 | adcs $acc1,$acc2,$t2
|
---|
1654 | umulh $t2,$m2,$mi
|
---|
1655 | adcs $acc2,$acc3,$t3
|
---|
1656 | umulh $t3,$m3,$mi
|
---|
1657 | adcs $acc3,$acc4,$carry
|
---|
1658 | adc $carry,xzr,xzr
|
---|
1659 | adds $acc0,$acc0,$t0
|
---|
1660 | adcs $acc1,$acc1,$t1
|
---|
1661 | adcs $acc2,$acc2,$t2
|
---|
1662 | adcs $acc3,$acc3,$t3
|
---|
1663 | //adc $carry,$carry,xzr
|
---|
1664 | cbnz $cnt,.Loop_mul4x_reduction
|
---|
1665 |
|
---|
1666 | adc $carry,$carry,xzr
|
---|
1667 | ldp $t0,$t1,[$tp,#8*4] // t[4..7]
|
---|
1668 | ldp $t2,$t3,[$tp,#8*6]
|
---|
1669 | ldp $a0,$a1,[$ap,#8*0] // a[4..7]
|
---|
1670 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1671 | add $ap,$ap,#8*4
|
---|
1672 | adds $acc0,$acc0,$t0
|
---|
1673 | adcs $acc1,$acc1,$t1
|
---|
1674 | adcs $acc2,$acc2,$t2
|
---|
1675 | adcs $acc3,$acc3,$t3
|
---|
1676 | //adc $carry,$carry,xzr
|
---|
1677 |
|
---|
1678 | ldr $mi,[sp] // t[0]*n0
|
---|
1679 | ldp $m0,$m1,[$np,#8*0] // n[4..7]
|
---|
1680 | ldp $m2,$m3,[$np,#8*2]
|
---|
1681 | add $np,$np,#8*4
|
---|
1682 |
|
---|
1683 | .align 4
|
---|
1684 | .Loop_mul4x_tail:
|
---|
1685 | mul $t0,$a0,$bi // lo(a[4..7]*b[4])
|
---|
1686 | adc $carry,$carry,xzr // modulo-scheduled
|
---|
1687 | mul $t1,$a1,$bi
|
---|
1688 | add $cnt,$cnt,#8
|
---|
1689 | mul $t2,$a2,$bi
|
---|
1690 | and $cnt,$cnt,#31
|
---|
1691 | mul $t3,$a3,$bi
|
---|
1692 | adds $acc0,$acc0,$t0
|
---|
1693 | umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
|
---|
1694 | adcs $acc1,$acc1,$t1
|
---|
1695 | umulh $t1,$a1,$bi
|
---|
1696 | adcs $acc2,$acc2,$t2
|
---|
1697 | umulh $t2,$a2,$bi
|
---|
1698 | adcs $acc3,$acc3,$t3
|
---|
1699 | umulh $t3,$a3,$bi
|
---|
1700 | adc $acc4,xzr,xzr
|
---|
1701 | ldr $bi,[$bp,$cnt] // next b[i]
|
---|
1702 | adds $acc1,$acc1,$t0
|
---|
1703 | mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
|
---|
1704 | adcs $acc2,$acc2,$t1
|
---|
1705 | mul $t1,$m1,$mi
|
---|
1706 | adcs $acc3,$acc3,$t2
|
---|
1707 | mul $t2,$m2,$mi
|
---|
1708 | adc $acc4,$acc4,$t3 // can't overflow
|
---|
1709 | mul $t3,$m3,$mi
|
---|
1710 | adds $acc0,$acc0,$t0
|
---|
1711 | umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
|
---|
1712 | adcs $acc1,$acc1,$t1
|
---|
1713 | umulh $t1,$m1,$mi
|
---|
1714 | adcs $acc2,$acc2,$t2
|
---|
1715 | umulh $t2,$m2,$mi
|
---|
1716 | adcs $acc3,$acc3,$t3
|
---|
1717 | umulh $t3,$m3,$mi
|
---|
1718 | adcs $acc4,$acc4,$carry
|
---|
1719 | ldr $mi,[sp,$cnt] // next a[0]*n0
|
---|
1720 | adc $carry,xzr,xzr
|
---|
1721 | str $acc0,[$tp],#8 // result!!!
|
---|
1722 | adds $acc0,$acc1,$t0
|
---|
1723 | sub $t0,$ap_end,$ap // done yet?
|
---|
1724 | adcs $acc1,$acc2,$t1
|
---|
1725 | adcs $acc2,$acc3,$t2
|
---|
1726 | adcs $acc3,$acc4,$t3
|
---|
1727 | //adc $carry,$carry,xzr
|
---|
1728 | cbnz $cnt,.Loop_mul4x_tail
|
---|
1729 |
|
---|
1730 | sub $t1,$np,$num // rewinded np?
|
---|
1731 | adc $carry,$carry,xzr
|
---|
1732 | cbz $t0,.Loop_mul4x_break
|
---|
1733 |
|
---|
1734 | ldp $t0,$t1,[$tp,#8*4]
|
---|
1735 | ldp $t2,$t3,[$tp,#8*6]
|
---|
1736 | ldp $a0,$a1,[$ap,#8*0]
|
---|
1737 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1738 | add $ap,$ap,#8*4
|
---|
1739 | adds $acc0,$acc0,$t0
|
---|
1740 | adcs $acc1,$acc1,$t1
|
---|
1741 | adcs $acc2,$acc2,$t2
|
---|
1742 | adcs $acc3,$acc3,$t3
|
---|
1743 | //adc $carry,$carry,xzr
|
---|
1744 | ldp $m0,$m1,[$np,#8*0]
|
---|
1745 | ldp $m2,$m3,[$np,#8*2]
|
---|
1746 | add $np,$np,#8*4
|
---|
1747 | b .Loop_mul4x_tail
|
---|
1748 |
|
---|
1749 | .align 4
|
---|
1750 | .Loop_mul4x_break:
|
---|
1751 | ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
|
---|
1752 | adds $acc0,$acc0,$topmost
|
---|
1753 | add $bp,$bp,#8*4 // bp++
|
---|
1754 | adcs $acc1,$acc1,xzr
|
---|
1755 | sub $ap,$ap,$num // rewind ap
|
---|
1756 | adcs $acc2,$acc2,xzr
|
---|
1757 | stp $acc0,$acc1,[$tp,#8*0] // result!!!
|
---|
1758 | adcs $acc3,$acc3,xzr
|
---|
1759 | ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
|
---|
1760 | adc $topmost,$carry,xzr
|
---|
1761 | stp $acc2,$acc3,[$tp,#8*2] // result!!!
|
---|
1762 | cmp $bp,$t3 // done yet?
|
---|
1763 | ldp $acc2,$acc3,[sp,#8*6]
|
---|
1764 | ldp $m0,$m1,[$t1,#8*0] // n[0..3]
|
---|
1765 | ldp $m2,$m3,[$t1,#8*2]
|
---|
1766 | add $np,$t1,#8*4
|
---|
1767 | b.eq .Lmul4x_post
|
---|
1768 |
|
---|
1769 | ldr $bi,[$bp]
|
---|
1770 | ldp $a0,$a1,[$ap,#8*0] // a[0..3]
|
---|
1771 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1772 | adds $ap,$ap,#8*4 // clear carry bit
|
---|
1773 | mov $carry,xzr
|
---|
1774 | mov $tp,sp
|
---|
1775 | b .Loop_mul4x_reduction
|
---|
1776 |
|
---|
1777 | .align 4
|
---|
1778 | .Lmul4x_post:
|
---|
1779 | // Final step. We see if result is larger than modulus, and
|
---|
1780 | // if it is, subtract the modulus. But comparison implies
|
---|
1781 | // subtraction. So we subtract modulus, see if it borrowed,
|
---|
1782 | // and conditionally copy original value.
|
---|
1783 | mov $rp,$t2
|
---|
1784 | mov $ap_end,$t2 // $rp copy
|
---|
1785 | subs $t0,$acc0,$m0
|
---|
1786 | add $tp,sp,#8*8
|
---|
1787 | sbcs $t1,$acc1,$m1
|
---|
1788 | sub $cnt,$num,#8*4
|
---|
1789 |
|
---|
1790 | .Lmul4x_sub:
|
---|
1791 | sbcs $t2,$acc2,$m2
|
---|
1792 | ldp $m0,$m1,[$np,#8*0]
|
---|
1793 | sub $cnt,$cnt,#8*4
|
---|
1794 | ldp $acc0,$acc1,[$tp,#8*0]
|
---|
1795 | sbcs $t3,$acc3,$m3
|
---|
1796 | ldp $m2,$m3,[$np,#8*2]
|
---|
1797 | add $np,$np,#8*4
|
---|
1798 | ldp $acc2,$acc3,[$tp,#8*2]
|
---|
1799 | add $tp,$tp,#8*4
|
---|
1800 | stp $t0,$t1,[$rp,#8*0]
|
---|
1801 | sbcs $t0,$acc0,$m0
|
---|
1802 | stp $t2,$t3,[$rp,#8*2]
|
---|
1803 | add $rp,$rp,#8*4
|
---|
1804 | sbcs $t1,$acc1,$m1
|
---|
1805 | cbnz $cnt,.Lmul4x_sub
|
---|
1806 |
|
---|
1807 | sbcs $t2,$acc2,$m2
|
---|
1808 | mov $tp,sp
|
---|
1809 | add $ap,sp,#8*4
|
---|
1810 | ldp $a0,$a1,[$ap_end,#8*0]
|
---|
1811 | sbcs $t3,$acc3,$m3
|
---|
1812 | stp $t0,$t1,[$rp,#8*0]
|
---|
1813 | ldp $a2,$a3,[$ap_end,#8*2]
|
---|
1814 | stp $t2,$t3,[$rp,#8*2]
|
---|
1815 | ldp $acc0,$acc1,[$ap,#8*0]
|
---|
1816 | ldp $acc2,$acc3,[$ap,#8*2]
|
---|
1817 | sbcs xzr,$topmost,xzr // did it borrow?
|
---|
1818 | ldr x30,[x29,#8] // pull return address
|
---|
1819 |
|
---|
1820 | sub $cnt,$num,#8*4
|
---|
1821 | .Lmul4x_cond_copy:
|
---|
1822 | sub $cnt,$cnt,#8*4
|
---|
1823 | csel $t0,$acc0,$a0,lo
|
---|
1824 | stp xzr,xzr,[$tp,#8*0]
|
---|
1825 | csel $t1,$acc1,$a1,lo
|
---|
1826 | ldp $a0,$a1,[$ap_end,#8*4]
|
---|
1827 | ldp $acc0,$acc1,[$ap,#8*4]
|
---|
1828 | csel $t2,$acc2,$a2,lo
|
---|
1829 | stp xzr,xzr,[$tp,#8*2]
|
---|
1830 | add $tp,$tp,#8*4
|
---|
1831 | csel $t3,$acc3,$a3,lo
|
---|
1832 | ldp $a2,$a3,[$ap_end,#8*6]
|
---|
1833 | ldp $acc2,$acc3,[$ap,#8*6]
|
---|
1834 | add $ap,$ap,#8*4
|
---|
1835 | stp $t0,$t1,[$ap_end,#8*0]
|
---|
1836 | stp $t2,$t3,[$ap_end,#8*2]
|
---|
1837 | add $ap_end,$ap_end,#8*4
|
---|
1838 | cbnz $cnt,.Lmul4x_cond_copy
|
---|
1839 |
|
---|
1840 | csel $t0,$acc0,$a0,lo
|
---|
1841 | stp xzr,xzr,[$tp,#8*0]
|
---|
1842 | csel $t1,$acc1,$a1,lo
|
---|
1843 | stp xzr,xzr,[$tp,#8*2]
|
---|
1844 | csel $t2,$acc2,$a2,lo
|
---|
1845 | stp xzr,xzr,[$tp,#8*3]
|
---|
1846 | csel $t3,$acc3,$a3,lo
|
---|
1847 | stp xzr,xzr,[$tp,#8*4]
|
---|
1848 | stp $t0,$t1,[$ap_end,#8*0]
|
---|
1849 | stp $t2,$t3,[$ap_end,#8*2]
|
---|
1850 |
|
---|
1851 | b .Lmul4x_done
|
---|
1852 |
|
---|
1853 | .align 4
|
---|
1854 | .Lmul4x4_post_condition:
|
---|
1855 | adc $carry,$carry,xzr
|
---|
1856 | ldr $ap,[x29,#96] // pull rp
|
---|
1857 | // $acc0-3,$carry hold result, $m0-7 hold modulus
|
---|
1858 | subs $a0,$acc0,$m0
|
---|
1859 | ldr x30,[x29,#8] // pull return address
|
---|
1860 | sbcs $a1,$acc1,$m1
|
---|
1861 | stp xzr,xzr,[sp,#8*0]
|
---|
1862 | sbcs $a2,$acc2,$m2
|
---|
1863 | stp xzr,xzr,[sp,#8*2]
|
---|
1864 | sbcs $a3,$acc3,$m3
|
---|
1865 | stp xzr,xzr,[sp,#8*4]
|
---|
1866 | sbcs xzr,$carry,xzr // did it borrow?
|
---|
1867 | stp xzr,xzr,[sp,#8*6]
|
---|
1868 |
|
---|
1869 | // $a0-3 hold result-modulus
|
---|
1870 | csel $a0,$acc0,$a0,lo
|
---|
1871 | csel $a1,$acc1,$a1,lo
|
---|
1872 | csel $a2,$acc2,$a2,lo
|
---|
1873 | csel $a3,$acc3,$a3,lo
|
---|
1874 | stp $a0,$a1,[$ap,#8*0]
|
---|
1875 | stp $a2,$a3,[$ap,#8*2]
|
---|
1876 |
|
---|
1877 | .Lmul4x_done:
|
---|
1878 | ldp x19,x20,[x29,#16]
|
---|
1879 | mov sp,x29
|
---|
1880 | ldp x21,x22,[x29,#32]
|
---|
1881 | mov x0,#1
|
---|
1882 | ldp x23,x24,[x29,#48]
|
---|
1883 | ldp x25,x26,[x29,#64]
|
---|
1884 | ldp x27,x28,[x29,#80]
|
---|
1885 | ldr x29,[sp],#128
|
---|
1886 | .inst 0xd50323bf // autiasp
|
---|
1887 | ret
|
---|
1888 | .size __bn_mul4x_mont,.-__bn_mul4x_mont
|
---|
1889 | ___
|
---|
1890 | }
|
---|
# Trailer: append the CRYPTOGAMS identification string to the generated
# module.  The "\@" keeps Perl from interpolating an array inside this
# interpolating here-document; the emitted assembly contains a plain "@".
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Emit the accumulated assembly on stdout (the build system redirects it
# into the target .S file).
print $code;

# Checking close() is what surfaces buffered-write/flush errors on STDOUT;
# a bare `print` can succeed even when the final flush would fail.
close STDOUT or die "error closing STDOUT: $!";
|
---|