VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.3/crypto/bn/asm/armv8-mont.pl@96662

Last change on this file since 96662 was 94082, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

1#! /usr/bin/env perl
2# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# March 2015
18#
19# "Teaser" Montgomery multiplication module for ARMv8. Needs more
20# work. While it does improve RSA sign performance by 20-30% (less for
21# longer keys) on most processors, for some reason RSA2048 is not
22# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23# instruction issue rate is limited on processor in question, meaning
24# that dedicated squaring procedure is a must. Well, actually all
25# contemporary AArch64 processors seem to have limited multiplication
26# issue rate, i.e. they can't issue multiplication every cycle, which
27# explains moderate improvement coefficients in comparison to
28# compiler-generated code. Recall that compiler is instructed to use
29# umulh and therefore uses same amount of multiplication instructions
30# to do the job. Assembly's edge is to minimize number of "collateral"
31# instructions and of course instruction scheduling.
32#
33# April 2015
34#
35# Squaring procedure that handles lengths divisible by 8 improves
36# RSA/DSA performance by 25-40-60% depending on processor and key
37# length. Overall improvement coefficients are always positive in
38# comparison to compiler-generated code. On Cortex-A57 improvement
39# is still modest on longest key lengths, while others exhibit e.g.
40# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41# on Cortex-A57 and ~60-100% faster on others.
42
43# $output is the last argument if it looks like a file (it has an extension)
44# $flavour is the first argument if it doesn't look like a file
45my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
46my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
51die "can't locate arm-xlate.pl";
52
53open OUT,"| \"$^X\" $xlate $flavour \"$output\""
54 or die "can't call $xlate: $!";
55*STDOUT=*OUT;
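# The output is piped through arm-xlate.pl, which translates the perlasm
# into the flavour-specific assembler syntax.  A typical manual invocation
# (the build system normally drives this) would be something like
#   perl armv8-mont.pl linux64 armv8-mont.S
# i.e. flavour first, output file last, matching the parsing above.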
56
57($lo0,$hi0,$aj,$m0,$alo,$ahi,
58 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
59 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
60
61# int bn_mul_mont(
62$rp="x0"; # BN_ULONG *rp,
63$ap="x1"; # const BN_ULONG *ap,
64$bp="x2"; # const BN_ULONG *bp,
65$np="x3"; # const BN_ULONG *np,
66$n0="x4"; # const BN_ULONG *n0,
67$num="x5"; # int num);
68
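# bn_mul_mont() computes the Montgomery product
#     rp[] = ap[] * bp[] * R^-1 mod np[],  R = 2^(64*num),
# where *n0 holds -np[0]^-1 mod 2^64, precomputed by the caller.
# A rough, non-interleaved word-level sketch of what the scalar code
# below does (orientation only; the real loops interleave the passes):
#
#     for (i = 0; i < num; i++) {
#         tp[] += ap[] * bp[i];        # num+1 words of tp in use
#         m = tp[0] * n0 mod 2^64;     # chosen so that tp[0] vanishes
#         tp[] += np[] * m;
#         tp[] >>= 64;                 # exact shift: low word is zero
#     }
#     if (tp >= np) tp -= np;          # final conditional subtraction
#     rp[] = tp[];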
69$code.=<<___;
70#ifndef __KERNEL__
71# include "arm_arch.h"
72.extern OPENSSL_armv8_rsa_neonized
73.hidden OPENSSL_armv8_rsa_neonized
74#endif
75.text
76
77.globl bn_mul_mont
78.type bn_mul_mont,%function
79.align 5
80bn_mul_mont:
81.Lbn_mul_mont:
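	// Pick an implementation based on the length in 64-bit words:
	// num%4 != 0           -> generic .Lmul_mont below;
	// num%4 == 0, num > 32 -> bn_mul8x_mont_neon, if it was enabled
	//                         at run time (OPENSSL_armv8_rsa_neonized);
	// otherwise num%8 == 0 -> __bn_sqr8x_mont (squaring-capable path),
	//           num%4 == 0 -> __bn_mul4x_mont.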
82 tst $num,#3
83 b.ne .Lmul_mont
84 cmp $num,#32
85 b.le .Lscalar_impl
86#ifndef __KERNEL__
87 adrp x17,OPENSSL_armv8_rsa_neonized
88 ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
89 cbnz w17, bn_mul8x_mont_neon
90#endif
91
92.Lscalar_impl:
93 tst $num,#7
94 b.eq __bn_sqr8x_mont
95 tst $num,#3
96 b.eq __bn_mul4x_mont
97
98.Lmul_mont:
99 stp x29,x30,[sp,#-64]!
100 add x29,sp,#0
101 stp x19,x20,[sp,#16]
102 stp x21,x22,[sp,#32]
103 stp x23,x24,[sp,#48]
104
105 ldr $m0,[$bp],#8 // bp[0]
106 sub $tp,sp,$num,lsl#3
107 ldp $hi0,$aj,[$ap],#16 // ap[0..1]
108 lsl $num,$num,#3
109 ldr $n0,[$n0] // *n0
110 and $tp,$tp,#-16 // ABI says so
111 ldp $hi1,$nj,[$np],#16 // np[0..1]
112
113 mul $lo0,$hi0,$m0 // ap[0]*bp[0]
114 sub $j,$num,#16 // j=num-2
115 umulh $hi0,$hi0,$m0
116 mul $alo,$aj,$m0 // ap[1]*bp[0]
117 umulh $ahi,$aj,$m0
118
119 mul $m1,$lo0,$n0 // "tp[0]"*n0
120 mov sp,$tp // alloca
121
122 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
123 umulh $hi1,$hi1,$m1
124 mul $nlo,$nj,$m1 // np[1]*m1
125 // (*) adds $lo1,$lo1,$lo0 // discarded
126 // (*) As for removal of first multiplication and addition
127 // instructions. The outcome of first addition is
128 // guaranteed to be zero, which leaves two computationally
129 // significant outcomes: it either carries or not. Then
130 // question is when does it carry? Is there alternative
131 // way to deduce it? If you follow operations, you can
132 // observe that condition for carry is quite simple:
133 // $lo0 being non-zero. So that carry can be calculated
134 // by adding -1 to $lo0. That's what next instruction does.
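	// Worked example: if $lo0 is, say, 5, then by the choice of m1
	// lo(np[0]*m1) equals 2^64-5, so the omitted addition would wrap
	// to zero and set the carry; "subs xzr,$lo0,#1" (5-1) likewise
	// leaves C=1.  If $lo0 is 0, lo(np[0]*m1) is 0 and there is no
	// carry; 0-1 borrows, so C=0, matching again.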
135 subs xzr,$lo0,#1 // (*)
136 umulh $nhi,$nj,$m1
137 adc $hi1,$hi1,xzr
138 cbz $j,.L1st_skip
139
140.L1st:
141 ldr $aj,[$ap],#8
142 adds $lo0,$alo,$hi0
143 sub $j,$j,#8 // j--
144 adc $hi0,$ahi,xzr
145
146 ldr $nj,[$np],#8
147 adds $lo1,$nlo,$hi1
148 mul $alo,$aj,$m0 // ap[j]*bp[0]
149 adc $hi1,$nhi,xzr
150 umulh $ahi,$aj,$m0
151
152 adds $lo1,$lo1,$lo0
153 mul $nlo,$nj,$m1 // np[j]*m1
154 adc $hi1,$hi1,xzr
155 umulh $nhi,$nj,$m1
156 str $lo1,[$tp],#8 // tp[j-1]
157 cbnz $j,.L1st
158
159.L1st_skip:
160 adds $lo0,$alo,$hi0
161 sub $ap,$ap,$num // rewind $ap
162 adc $hi0,$ahi,xzr
163
164 adds $lo1,$nlo,$hi1
165 sub $np,$np,$num // rewind $np
166 adc $hi1,$nhi,xzr
167
168 adds $lo1,$lo1,$lo0
169 sub $i,$num,#8 // i=num-1
170 adcs $hi1,$hi1,$hi0
171
172 adc $ovf,xzr,xzr // upmost overflow bit
173 stp $lo1,$hi1,[$tp]
174
175.Louter:
176 ldr $m0,[$bp],#8 // bp[i]
177 ldp $hi0,$aj,[$ap],#16
178 ldr $tj,[sp] // tp[0]
179 add $tp,sp,#8
180
181 mul $lo0,$hi0,$m0 // ap[0]*bp[i]
182 sub $j,$num,#16 // j=num-2
183 umulh $hi0,$hi0,$m0
184 ldp $hi1,$nj,[$np],#16
185 mul $alo,$aj,$m0 // ap[1]*bp[i]
186 adds $lo0,$lo0,$tj
187 umulh $ahi,$aj,$m0
188 adc $hi0,$hi0,xzr
189
190 mul $m1,$lo0,$n0
191 sub $i,$i,#8 // i--
192
193 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
194 umulh $hi1,$hi1,$m1
195 mul $nlo,$nj,$m1 // np[1]*m1
196 // (*) adds $lo1,$lo1,$lo0
197 subs xzr,$lo0,#1 // (*)
198 umulh $nhi,$nj,$m1
199 cbz $j,.Linner_skip
200
201.Linner:
202 ldr $aj,[$ap],#8
203 adc $hi1,$hi1,xzr
204 ldr $tj,[$tp],#8 // tp[j]
205 adds $lo0,$alo,$hi0
206 sub $j,$j,#8 // j--
207 adc $hi0,$ahi,xzr
208
209 adds $lo1,$nlo,$hi1
210 ldr $nj,[$np],#8
211 adc $hi1,$nhi,xzr
212
213 mul $alo,$aj,$m0 // ap[j]*bp[i]
214 adds $lo0,$lo0,$tj
215 umulh $ahi,$aj,$m0
216 adc $hi0,$hi0,xzr
217
218 mul $nlo,$nj,$m1 // np[j]*m1
219 adds $lo1,$lo1,$lo0
220 umulh $nhi,$nj,$m1
221 stur $lo1,[$tp,#-16] // tp[j-1]
222 cbnz $j,.Linner
223
224.Linner_skip:
225 ldr $tj,[$tp],#8 // tp[j]
226 adc $hi1,$hi1,xzr
227 adds $lo0,$alo,$hi0
228 sub $ap,$ap,$num // rewind $ap
229 adc $hi0,$ahi,xzr
230
231 adds $lo1,$nlo,$hi1
232 sub $np,$np,$num // rewind $np
233 adcs $hi1,$nhi,$ovf
234 adc $ovf,xzr,xzr
235
236 adds $lo0,$lo0,$tj
237 adc $hi0,$hi0,xzr
238
239 adds $lo1,$lo1,$lo0
240 adcs $hi1,$hi1,$hi0
241 adc $ovf,$ovf,xzr // upmost overflow bit
242 stp $lo1,$hi1,[$tp,#-16]
243
244 cbnz $i,.Louter
245
246 // Final step. We see if result is larger than modulus, and
247 // if it is, subtract the modulus. But comparison implies
248 // subtraction. So we subtract modulus, see if it borrowed,
249 // and conditionally copy original value.
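	// Conceptually: rp[] = (tp >= np) ? tp - np : tp, done without a
	// data-dependent branch: the sbcs chain in .Lsub computes tp-np
	// and the borrow, and .Lcond_copy csel's between the two values.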
250 ldr $tj,[sp] // tp[0]
251 add $tp,sp,#8
252 ldr $nj,[$np],#8 // np[0]
253 subs $j,$num,#8 // j=num-1 and clear borrow
254 mov $ap,$rp
255.Lsub:
256 sbcs $aj,$tj,$nj // tp[j]-np[j]
257 ldr $tj,[$tp],#8
258 sub $j,$j,#8 // j--
259 ldr $nj,[$np],#8
260 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
261 cbnz $j,.Lsub
262
263 sbcs $aj,$tj,$nj
264 sbcs $ovf,$ovf,xzr // did it borrow?
265 str $aj,[$ap],#8 // rp[num-1]
266
267 ldr $tj,[sp] // tp[0]
268 add $tp,sp,#8
269 ldr $aj,[$rp],#8 // rp[0]
270 sub $num,$num,#8 // num--
271 nop
272.Lcond_copy:
273 sub $num,$num,#8 // num--
274 csel $nj,$tj,$aj,lo // did it borrow?
275 ldr $tj,[$tp],#8
276 ldr $aj,[$rp],#8
277 stur xzr,[$tp,#-16] // wipe tp
278 stur $nj,[$rp,#-16]
279 cbnz $num,.Lcond_copy
280
281 csel $nj,$tj,$aj,lo
282 stur xzr,[$tp,#-8] // wipe tp
283 stur $nj,[$rp,#-8]
284
285 ldp x19,x20,[x29,#16]
286 mov sp,x29
287 ldp x21,x22,[x29,#32]
288 mov x0,#1
289 ldp x23,x24,[x29,#48]
290 ldr x29,[sp],#64
291 ret
292.size bn_mul_mont,.-bn_mul_mont
293___
294{
295my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
296my ($Z,$Temp)=("v4.16b","v5");
297my @ACC=map("v$_",(6..13));
298my ($Bi,$Ni,$M0)=map("v$_",(28..30));
299my $sBi="s28";
300my $sM0="s30";
301my $zero="v14";
302my $temp="v15";
303my $ACCTemp="v16";
304
305my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
306my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
307
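# Roughly: the NEON path works on 32-bit limbs.  $num is doubled on entry
# and a[], b[], n[] are consumed as 32-bit words, with 32x32->64-bit
# partial products accumulated via umlal into eight pairs of 64-bit lanes
# (@ACC).  The redundant accumulator words are only resolved into a normal
# radix-2^32 result in the .LNEON_tail pass, which propagates carries
# 16 bits at a time.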
308$code.=<<___;
309.type bn_mul8x_mont_neon,%function
310.align 5
311bn_mul8x_mont_neon:
312 stp x29,x30,[sp,#-80]!
313 mov x16,sp
314 stp d8,d9,[sp,#16]
315 stp d10,d11,[sp,#32]
316 stp d12,d13,[sp,#48]
317 stp d14,d15,[sp,#64]
318 lsl $num,$num,#1
319 eor $zero.16b,$zero.16b,$zero.16b
320
321.align 4
322.LNEON_8n:
323 eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
324 sub $toutptr,sp,#128
325 eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
326 sub $toutptr,$toutptr,$num,lsl#4
327 eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
328 and $toutptr,$toutptr,#-64
329 eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
330 mov sp,$toutptr // alloca
331 eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
332 add $toutptr,$toutptr,#256
333 eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
334 sub $inner,$num,#8
335 eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
336 eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
337
338.LNEON_8n_init:
339 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
340 subs $inner,$inner,#8
341 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
342 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
343 st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
344 bne .LNEON_8n_init
345
346 add $tinptr,sp,#256
347 ld1 {$A0.4s,$A1.4s},[$aptr],#32
348 add $bnptr,sp,#8
349 ldr $sM0,[$n0],#4
350 mov $outer,$num
351 b .LNEON_8n_outer
352
353.align 4
354.LNEON_8n_outer:
355 ldr $sBi,[$bptr],#4 // *b++
356 uxtl $Bi.4s,$Bi.4h
357 add $toutptr,sp,#128
358 ld1 {$N0.4s,$N1.4s},[$nptr],#32
359
360 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
361 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
362 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
363 shl $Ni.2d,@ACC[0].2d,#16
364 ext $Ni.16b,$Ni.16b,$Ni.16b,#8
365 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
366 add $Ni.2d,$Ni.2d,@ACC[0].2d
367 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
368 mul $Ni.2s,$Ni.2s,$M0.2s
369 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
370 st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
371 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
372 uxtl $Ni.4s,$Ni.4h
373 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
374___
375for ($i=0; $i<7;) {
376$code.=<<___;
377 ldr $sBi,[$bptr],#4 // *b++
378 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
379 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
380 uxtl $Bi.4s,$Bi.4h
381 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
382 ushr $temp.2d,@ACC[0].2d,#16
383 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
384 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
385 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
386 add @ACC[0].2d,@ACC[0].2d,$temp.2d
387 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
388 ushr @ACC[0].2d,@ACC[0].2d,#16
389 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
390 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
391 add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
392 ins @ACC[1].d[0],$ACCTemp.d[0]
393 st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
394___
395 push(@ACC,shift(@ACC)); $i++;
396$code.=<<___;
397 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
398 ld1 {@ACC[7].2d},[$tinptr],#16
399 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
400 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
401 shl $Ni.2d,@ACC[0].2d,#16
402 ext $Ni.16b,$Ni.16b,$Ni.16b,#8
403 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
404 add $Ni.2d,$Ni.2d,@ACC[0].2d
405 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
406 mul $Ni.2s,$Ni.2s,$M0.2s
407 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
408 st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
409 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
410 uxtl $Ni.4s,$Ni.4h
411 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
412___
413}
414$code.=<<___;
415 ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
416 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
417 ld1 {$A0.4s,$A1.4s},[$aptr],#32
418 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
419 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
420 mov $Temp.16b,@ACC[0].16b
421 ushr $Temp.2d,$Temp.2d,#16
422 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
423 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
424 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
425 add @ACC[0].2d,@ACC[0].2d,$Temp.2d
426 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
427 ushr @ACC[0].2d,@ACC[0].2d,#16
428 eor $temp.16b,$temp.16b,$temp.16b
429 ins @ACC[0].d[1],$temp.d[0]
430 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
431 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
432 add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
433 st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
434 add $bnptr,sp,#8 // rewind
435___
436 push(@ACC,shift(@ACC));
437$code.=<<___;
438 sub $inner,$num,#8
439 b .LNEON_8n_inner
440
441.align 4
442.LNEON_8n_inner:
443 subs $inner,$inner,#8
444 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
445 ld1 {@ACC[7].2d},[$tinptr]
446 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
447 ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
448 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
449 ld1 {$N0.4s,$N1.4s},[$nptr],#32
450 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
451 b.eq .LInner_jump
452 add $tinptr,$tinptr,#16 // don't advance in last iteration
453.LInner_jump:
454 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
455 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
456 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
457 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
458___
459for ($i=1; $i<8; $i++) {
460$code.=<<___;
461 ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
462 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
463 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
464 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
465 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
466 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
467 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
468 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
469 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
470 st1 {@ACC[0].2d},[$toutptr],#16
471___
472 push(@ACC,shift(@ACC));
473$code.=<<___;
474 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
475 ld1 {@ACC[7].2d},[$tinptr]
476 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
477 ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
478 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
479 b.eq .LInner_jump$i
480 add $tinptr,$tinptr,#16 // don't advance in last iteration
481.LInner_jump$i:
482 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
483 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
484 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
485 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
486 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
487___
488}
489$code.=<<___;
490 b.ne .LInner_after_rewind$i
491 sub $aptr,$aptr,$num,lsl#2 // rewind
492.LInner_after_rewind$i:
493 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
494 ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
495 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
496 ld1 {$A0.4s,$A1.4s},[$aptr],#32
497 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
498 add $bnptr,sp,#8 // rewind
499 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
500 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
501 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
502 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
503 st1 {@ACC[0].2d},[$toutptr],#16
504 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
505
506 bne .LNEON_8n_inner
507___
508 push(@ACC,shift(@ACC));
509$code.=<<___;
510 add $tinptr,sp,#128
511 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
512 eor $N0.16b,$N0.16b,$N0.16b // $N0
513 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
514 eor $N1.16b,$N1.16b,$N1.16b // $N1
515 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
516 st1 {@ACC[6].2d},[$toutptr]
517
518 subs $outer,$outer,#8
519 ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
520 ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
521 ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
522 ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
523
524 b.eq .LInner_8n_jump_2steps
525 sub $nptr,$nptr,$num,lsl#2 // rewind
526 b .LNEON_8n_outer
527
528.LInner_8n_jump_2steps:
529 add $toutptr,sp,#128
530 st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
531 mov $Temp.16b,@ACC[0].16b
532 ushr $temp.2d,@ACC[0].2d,#16
533 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
534 st1 {$N0.2d,$N1.2d}, [sp],#32
535 add @ACC[0].2d,@ACC[0].2d,$temp.2d
536 st1 {$N0.2d,$N1.2d}, [sp],#32
537 ushr $temp.2d,@ACC[0].2d,#16
538 st1 {$N0.2d,$N1.2d}, [sp],#32
539 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
540 ins $temp.d[1],$zero.d[0]
541
542 mov $inner,$num
543 b .LNEON_tail_entry
544
545.align 4
546.LNEON_tail:
547 add @ACC[0].2d,@ACC[0].2d,$temp.2d
548 mov $Temp.16b,@ACC[0].16b
549 ushr $temp.2d,@ACC[0].2d,#16
550 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
551 ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
552 add @ACC[0].2d,@ACC[0].2d,$temp.2d
553 ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
554 ushr $temp.2d,@ACC[0].2d,#16
555 ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
556 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
557 ins $temp.d[1],$zero.d[0]
558
559.LNEON_tail_entry:
560___
561for ($i=1; $i<8; $i++) {
562$code.=<<___;
563 add @ACC[1].2d,@ACC[1].2d,$temp.2d
564 st1 {@ACC[0].s}[0], [$toutptr],#4
565 ushr $temp.2d,@ACC[1].2d,#16
566 mov $Temp.16b,@ACC[1].16b
567 ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
568 add @ACC[1].2d,@ACC[1].2d,$temp.2d
569 ushr $temp.2d,@ACC[1].2d,#16
570 zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
571 ins $temp.d[1],$zero.d[0]
572___
573 push(@ACC,shift(@ACC));
574}
575 push(@ACC,shift(@ACC));
576$code.=<<___;
577 ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
578 subs $inner,$inner,#8
579 st1 {@ACC[7].s}[0], [$toutptr],#4
580 bne .LNEON_tail
581
582 st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
583 sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
584 subs $aptr,sp,#0 // clear carry flag
585 add $bptr,sp,$num,lsl#2
586
587.LNEON_sub:
588 ldp w4,w5,[$aptr],#8
589 ldp w6,w7,[$aptr],#8
590 ldp w8,w9,[$nptr],#8
591 ldp w10,w11,[$nptr],#8
592 sbcs w8,w4,w8
593 sbcs w9,w5,w9
594 sbcs w10,w6,w10
595 sbcs w11,w7,w11
596 sub x17,$bptr,$aptr
597 stp w8,w9,[$rptr],#8
598 stp w10,w11,[$rptr],#8
599 cbnz x17,.LNEON_sub
600
601 ldr w10, [$aptr] // load top-most bit
602 mov x11,sp
603 eor v0.16b,v0.16b,v0.16b
604 sub x11,$bptr,x11 // this is num*4
605 eor v1.16b,v1.16b,v1.16b
606 mov $aptr,sp
607 sub $rptr,$rptr,x11 // rewind $rptr
608 mov $nptr,$bptr // second 3/4th of frame
609 sbcs w10,w10,wzr // result is carry flag
610
611.LNEON_copy_n_zap:
612 ldp w4,w5,[$aptr],#8
613 ldp w6,w7,[$aptr],#8
614 ldp w8,w9,[$rptr],#8
615 ldp w10,w11,[$rptr]
616 sub $rptr,$rptr,#8
617 b.cs .LCopy_1
618 mov w8,w4
619 mov w9,w5
620 mov w10,w6
621 mov w11,w7
622.LCopy_1:
623 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
624 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
625 ldp w4,w5,[$aptr],#8
626 ldp w6,w7,[$aptr],#8
627 stp w8,w9,[$rptr],#8
628 stp w10,w11,[$rptr],#8
629 sub $aptr,$aptr,#32
630 ldp w8,w9,[$rptr],#8
631 ldp w10,w11,[$rptr]
632 sub $rptr,$rptr,#8
633 b.cs .LCopy_2
634 mov w8, w4
635 mov w9, w5
636 mov w10, w6
637 mov w11, w7
638.LCopy_2:
639 st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
640 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
641 sub x17,$bptr,$aptr // preserves carry
642 stp w8,w9,[$rptr],#8
643 stp w10,w11,[$rptr],#8
644 cbnz x17,.LNEON_copy_n_zap
645
646 mov sp,x16
647 ldp d14,d15,[sp,#64]
648 ldp d12,d13,[sp,#48]
649 ldp d10,d11,[sp,#32]
650 ldp d8,d9,[sp,#16]
651 ldr x29,[sp],#80
652 ret // bx lr
653
654.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
655___
656}
657{
658########################################################################
659# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
660
661my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
662my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
663my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
664my ($cnt,$carry,$topmost)=("x27","x28","x30");
665my ($tp,$ap_end,$na0)=($bp,$np,$carry);
666
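# Rough outline: only genuine squaring (ap == bp) is handled here; anything
# else is bounced to __bn_mul4x_mont.  The squaring first forms all
# off-diagonal products a[i]*a[j], i<j (.Lsqr8x_outer_loop), then doubles
# that triangle and adds the diagonal a[i]^2 terms (.Lsqr4x_shift_n_add),
# and finally runs the Montgomery reduction eight words of t[] per window
# (.Lsqr8x_reduction).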
667$code.=<<___;
668.type __bn_sqr8x_mont,%function
669.align 5
670__bn_sqr8x_mont:
671 cmp $ap,$bp
672 b.ne __bn_mul4x_mont
673.Lsqr8x_mont:
674 .inst 0xd503233f // paciasp
675 stp x29,x30,[sp,#-128]!
676 add x29,sp,#0
677 stp x19,x20,[sp,#16]
678 stp x21,x22,[sp,#32]
679 stp x23,x24,[sp,#48]
680 stp x25,x26,[sp,#64]
681 stp x27,x28,[sp,#80]
682 stp $rp,$np,[sp,#96] // offload rp and np
683
684 ldp $a0,$a1,[$ap,#8*0]
685 ldp $a2,$a3,[$ap,#8*2]
686 ldp $a4,$a5,[$ap,#8*4]
687 ldp $a6,$a7,[$ap,#8*6]
688
689 sub $tp,sp,$num,lsl#4
690 lsl $num,$num,#3
691 ldr $n0,[$n0] // *n0
692 mov sp,$tp // alloca
693 sub $cnt,$num,#8*8
694 b .Lsqr8x_zero_start
695
696.Lsqr8x_zero:
697 sub $cnt,$cnt,#8*8
698 stp xzr,xzr,[$tp,#8*0]
699 stp xzr,xzr,[$tp,#8*2]
700 stp xzr,xzr,[$tp,#8*4]
701 stp xzr,xzr,[$tp,#8*6]
702.Lsqr8x_zero_start:
703 stp xzr,xzr,[$tp,#8*8]
704 stp xzr,xzr,[$tp,#8*10]
705 stp xzr,xzr,[$tp,#8*12]
706 stp xzr,xzr,[$tp,#8*14]
707 add $tp,$tp,#8*16
708 cbnz $cnt,.Lsqr8x_zero
709
710 add $ap_end,$ap,$num
711 add $ap,$ap,#8*8
712 mov $acc0,xzr
713 mov $acc1,xzr
714 mov $acc2,xzr
715 mov $acc3,xzr
716 mov $acc4,xzr
717 mov $acc5,xzr
718 mov $acc6,xzr
719 mov $acc7,xzr
720 mov $tp,sp
721 str $n0,[x29,#112] // offload n0
722
723 // Multiply everything but a[i]*a[i]
724.align 4
725.Lsqr8x_outer_loop:
726 // a[1]a[0] (i)
727 // a[2]a[0]
728 // a[3]a[0]
729 // a[4]a[0]
730 // a[5]a[0]
731 // a[6]a[0]
732 // a[7]a[0]
733 // a[2]a[1] (ii)
734 // a[3]a[1]
735 // a[4]a[1]
736 // a[5]a[1]
737 // a[6]a[1]
738 // a[7]a[1]
739 // a[3]a[2] (iii)
740 // a[4]a[2]
741 // a[5]a[2]
742 // a[6]a[2]
743 // a[7]a[2]
744 // a[4]a[3] (iv)
745 // a[5]a[3]
746 // a[6]a[3]
747 // a[7]a[3]
748 // a[5]a[4] (v)
749 // a[6]a[4]
750 // a[7]a[4]
751 // a[6]a[5] (vi)
752 // a[7]a[5]
753 // a[7]a[6] (vii)
754
755 mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
756 mul $t1,$a2,$a0
757 mul $t2,$a3,$a0
758 mul $t3,$a4,$a0
759 adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
760 mul $t0,$a5,$a0
761 adcs $acc2,$acc2,$t1
762 mul $t1,$a6,$a0
763 adcs $acc3,$acc3,$t2
764 mul $t2,$a7,$a0
765 adcs $acc4,$acc4,$t3
766 umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
767 adcs $acc5,$acc5,$t0
768 umulh $t0,$a2,$a0
769 adcs $acc6,$acc6,$t1
770 umulh $t1,$a3,$a0
771 adcs $acc7,$acc7,$t2
772 umulh $t2,$a4,$a0
773 stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
774 adc $acc0,xzr,xzr // t[8]
775 adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
776 umulh $t3,$a5,$a0
777 adcs $acc3,$acc3,$t0
778 umulh $t0,$a6,$a0
779 adcs $acc4,$acc4,$t1
780 umulh $t1,$a7,$a0
781 adcs $acc5,$acc5,$t2
782 mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
783 adcs $acc6,$acc6,$t3
784 mul $t3,$a3,$a1
785 adcs $acc7,$acc7,$t0
786 mul $t0,$a4,$a1
787 adc $acc0,$acc0,$t1
788
789 mul $t1,$a5,$a1
790 adds $acc3,$acc3,$t2
791 mul $t2,$a6,$a1
792 adcs $acc4,$acc4,$t3
793 mul $t3,$a7,$a1
794 adcs $acc5,$acc5,$t0
795 umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
796 adcs $acc6,$acc6,$t1
797 umulh $t1,$a3,$a1
798 adcs $acc7,$acc7,$t2
799 umulh $t2,$a4,$a1
800 adcs $acc0,$acc0,$t3
801 umulh $t3,$a5,$a1
802 stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
803 adc $acc1,xzr,xzr // t[9]
804 adds $acc4,$acc4,$t0
805 umulh $t0,$a6,$a1
806 adcs $acc5,$acc5,$t1
807 umulh $t1,$a7,$a1
808 adcs $acc6,$acc6,$t2
809 mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
810 adcs $acc7,$acc7,$t3
811 mul $t3,$a4,$a2
812 adcs $acc0,$acc0,$t0
813 mul $t0,$a5,$a2
814 adc $acc1,$acc1,$t1
815
816 mul $t1,$a6,$a2
817 adds $acc5,$acc5,$t2
818 mul $t2,$a7,$a2
819 adcs $acc6,$acc6,$t3
820 umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
821 adcs $acc7,$acc7,$t0
822 umulh $t0,$a4,$a2
823 adcs $acc0,$acc0,$t1
824 umulh $t1,$a5,$a2
825 adcs $acc1,$acc1,$t2
826 umulh $t2,$a6,$a2
827 stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
828 adc $acc2,xzr,xzr // t[10]
829 adds $acc6,$acc6,$t3
830 umulh $t3,$a7,$a2
831 adcs $acc7,$acc7,$t0
832 mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
833 adcs $acc0,$acc0,$t1
834 mul $t1,$a5,$a3
835 adcs $acc1,$acc1,$t2
836 mul $t2,$a6,$a3
837 adc $acc2,$acc2,$t3
838
839 mul $t3,$a7,$a3
840 adds $acc7,$acc7,$t0
841 umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
842 adcs $acc0,$acc0,$t1
843 umulh $t1,$a5,$a3
844 adcs $acc1,$acc1,$t2
845 umulh $t2,$a6,$a3
846 adcs $acc2,$acc2,$t3
847 umulh $t3,$a7,$a3
848 stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
849 adc $acc3,xzr,xzr // t[11]
850 adds $acc0,$acc0,$t0
851 mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
852 adcs $acc1,$acc1,$t1
853 mul $t1,$a6,$a4
854 adcs $acc2,$acc2,$t2
855 mul $t2,$a7,$a4
856 adc $acc3,$acc3,$t3
857
858 umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
859 adds $acc1,$acc1,$t0
860 umulh $t0,$a6,$a4
861 adcs $acc2,$acc2,$t1
862 umulh $t1,$a7,$a4
863 adcs $acc3,$acc3,$t2
864 mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
865 adc $acc4,xzr,xzr // t[12]
866 adds $acc2,$acc2,$t3
867 mul $t3,$a7,$a5
868 adcs $acc3,$acc3,$t0
869 umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
870 adc $acc4,$acc4,$t1
871
872 umulh $t1,$a7,$a5
873 adds $acc3,$acc3,$t2
874 mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
875 adcs $acc4,$acc4,$t3
876 umulh $t3,$a7,$a6 // hi(a[7]*a[6])
877 adc $acc5,xzr,xzr // t[13]
878 adds $acc4,$acc4,$t0
879 sub $cnt,$ap_end,$ap // done yet?
880 adc $acc5,$acc5,$t1
881
882 adds $acc5,$acc5,$t2
883 sub $t0,$ap_end,$num // rewinded ap
884 adc $acc6,xzr,xzr // t[14]
885 add $acc6,$acc6,$t3
886
887 cbz $cnt,.Lsqr8x_outer_break
888
889 mov $n0,$a0
890 ldp $a0,$a1,[$tp,#8*0]
891 ldp $a2,$a3,[$tp,#8*2]
892 ldp $a4,$a5,[$tp,#8*4]
893 ldp $a6,$a7,[$tp,#8*6]
894 adds $acc0,$acc0,$a0
895 adcs $acc1,$acc1,$a1
896 ldp $a0,$a1,[$ap,#8*0]
897 adcs $acc2,$acc2,$a2
898 adcs $acc3,$acc3,$a3
899 ldp $a2,$a3,[$ap,#8*2]
900 adcs $acc4,$acc4,$a4
901 adcs $acc5,$acc5,$a5
902 ldp $a4,$a5,[$ap,#8*4]
903 adcs $acc6,$acc6,$a6
904 mov $rp,$ap
905 adcs $acc7,xzr,$a7
906 ldp $a6,$a7,[$ap,#8*6]
907 add $ap,$ap,#8*8
908 //adc $carry,xzr,xzr // moved below
909 mov $cnt,#-8*8
910
911 // a[8]a[0]
912 // a[9]a[0]
913 // a[a]a[0]
914 // a[b]a[0]
915 // a[c]a[0]
916 // a[d]a[0]
917 // a[e]a[0]
918 // a[f]a[0]
919 // a[8]a[1]
920 // a[f]a[1]........................
921 // a[8]a[2]
922 // a[f]a[2]........................
923 // a[8]a[3]
924 // a[f]a[3]........................
925 // a[8]a[4]
926 // a[f]a[4]........................
927 // a[8]a[5]
928 // a[f]a[5]........................
929 // a[8]a[6]
930 // a[f]a[6]........................
931 // a[8]a[7]
932 // a[f]a[7]........................
933.Lsqr8x_mul:
934 mul $t0,$a0,$n0
935 adc $carry,xzr,xzr // carry bit, modulo-scheduled
936 mul $t1,$a1,$n0
937 add $cnt,$cnt,#8
938 mul $t2,$a2,$n0
939 mul $t3,$a3,$n0
940 adds $acc0,$acc0,$t0
941 mul $t0,$a4,$n0
942 adcs $acc1,$acc1,$t1
943 mul $t1,$a5,$n0
944 adcs $acc2,$acc2,$t2
945 mul $t2,$a6,$n0
946 adcs $acc3,$acc3,$t3
947 mul $t3,$a7,$n0
948 adcs $acc4,$acc4,$t0
949 umulh $t0,$a0,$n0
950 adcs $acc5,$acc5,$t1
951 umulh $t1,$a1,$n0
952 adcs $acc6,$acc6,$t2
953 umulh $t2,$a2,$n0
954 adcs $acc7,$acc7,$t3
955 umulh $t3,$a3,$n0
956 adc $carry,$carry,xzr
957 str $acc0,[$tp],#8
958 adds $acc0,$acc1,$t0
959 umulh $t0,$a4,$n0
960 adcs $acc1,$acc2,$t1
961 umulh $t1,$a5,$n0
962 adcs $acc2,$acc3,$t2
963 umulh $t2,$a6,$n0
964 adcs $acc3,$acc4,$t3
965 umulh $t3,$a7,$n0
966 ldr $n0,[$rp,$cnt]
967 adcs $acc4,$acc5,$t0
968 adcs $acc5,$acc6,$t1
969 adcs $acc6,$acc7,$t2
970 adcs $acc7,$carry,$t3
971 //adc $carry,xzr,xzr // moved above
972 cbnz $cnt,.Lsqr8x_mul
973 // note that carry flag is guaranteed
974 // to be zero at this point
975 cmp $ap,$ap_end // done yet?
976 b.eq .Lsqr8x_break
977
978 ldp $a0,$a1,[$tp,#8*0]
979 ldp $a2,$a3,[$tp,#8*2]
980 ldp $a4,$a5,[$tp,#8*4]
981 ldp $a6,$a7,[$tp,#8*6]
982 adds $acc0,$acc0,$a0
983 ldur $n0,[$rp,#-8*8]
984 adcs $acc1,$acc1,$a1
985 ldp $a0,$a1,[$ap,#8*0]
986 adcs $acc2,$acc2,$a2
987 adcs $acc3,$acc3,$a3
988 ldp $a2,$a3,[$ap,#8*2]
989 adcs $acc4,$acc4,$a4
990 adcs $acc5,$acc5,$a5
991 ldp $a4,$a5,[$ap,#8*4]
992 adcs $acc6,$acc6,$a6
993 mov $cnt,#-8*8
994 adcs $acc7,$acc7,$a7
995 ldp $a6,$a7,[$ap,#8*6]
996 add $ap,$ap,#8*8
997 //adc $carry,xzr,xzr // moved above
998 b .Lsqr8x_mul
999
1000.align 4
1001.Lsqr8x_break:
1002 ldp $a0,$a1,[$rp,#8*0]
1003 add $ap,$rp,#8*8
1004 ldp $a2,$a3,[$rp,#8*2]
1005 sub $t0,$ap_end,$ap // is it last iteration?
1006 ldp $a4,$a5,[$rp,#8*4]
1007 sub $t1,$tp,$t0
1008 ldp $a6,$a7,[$rp,#8*6]
1009 cbz $t0,.Lsqr8x_outer_loop
1010
1011 stp $acc0,$acc1,[$tp,#8*0]
1012 ldp $acc0,$acc1,[$t1,#8*0]
1013 stp $acc2,$acc3,[$tp,#8*2]
1014 ldp $acc2,$acc3,[$t1,#8*2]
1015 stp $acc4,$acc5,[$tp,#8*4]
1016 ldp $acc4,$acc5,[$t1,#8*4]
1017 stp $acc6,$acc7,[$tp,#8*6]
1018 mov $tp,$t1
1019 ldp $acc6,$acc7,[$t1,#8*6]
1020 b .Lsqr8x_outer_loop
1021
1022.align 4
1023.Lsqr8x_outer_break:
1024 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
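	// (The doubling is done on the fly: "lsl#1" and the extr's with
	// #63 below produce t[i]*2, with the shifted-out top bit of each
	// word feeding the next one.)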
1025 ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
1026 ldp $t1,$t2,[sp,#8*1]
1027 ldp $a5,$a7,[$t0,#8*2]
1028 add $ap,$t0,#8*4
1029 ldp $t3,$t0,[sp,#8*3]
1030
1031 stp $acc0,$acc1,[$tp,#8*0]
1032 mul $acc0,$a1,$a1
1033 stp $acc2,$acc3,[$tp,#8*2]
1034 umulh $a1,$a1,$a1
1035 stp $acc4,$acc5,[$tp,#8*4]
1036 mul $a2,$a3,$a3
1037 stp $acc6,$acc7,[$tp,#8*6]
1038 mov $tp,sp
1039 umulh $a3,$a3,$a3
1040 adds $acc1,$a1,$t1,lsl#1
1041 extr $t1,$t2,$t1,#63
1042 sub $cnt,$num,#8*4
1043
1044.Lsqr4x_shift_n_add:
1045 adcs $acc2,$a2,$t1
1046 extr $t2,$t3,$t2,#63
1047 sub $cnt,$cnt,#8*4
1048 adcs $acc3,$a3,$t2
1049 ldp $t1,$t2,[$tp,#8*5]
1050 mul $a4,$a5,$a5
1051 ldp $a1,$a3,[$ap],#8*2
1052 umulh $a5,$a5,$a5
1053 mul $a6,$a7,$a7
1054 umulh $a7,$a7,$a7
1055 extr $t3,$t0,$t3,#63
1056 stp $acc0,$acc1,[$tp,#8*0]
1057 adcs $acc4,$a4,$t3
1058 extr $t0,$t1,$t0,#63
1059 stp $acc2,$acc3,[$tp,#8*2]
1060 adcs $acc5,$a5,$t0
1061 ldp $t3,$t0,[$tp,#8*7]
1062 extr $t1,$t2,$t1,#63
1063 adcs $acc6,$a6,$t1
1064 extr $t2,$t3,$t2,#63
1065 adcs $acc7,$a7,$t2
1066 ldp $t1,$t2,[$tp,#8*9]
1067 mul $a0,$a1,$a1
1068 ldp $a5,$a7,[$ap],#8*2
1069 umulh $a1,$a1,$a1
1070 mul $a2,$a3,$a3
1071 umulh $a3,$a3,$a3
1072 stp $acc4,$acc5,[$tp,#8*4]
1073 extr $t3,$t0,$t3,#63
1074 stp $acc6,$acc7,[$tp,#8*6]
1075 add $tp,$tp,#8*8
1076 adcs $acc0,$a0,$t3
1077 extr $t0,$t1,$t0,#63
1078 adcs $acc1,$a1,$t0
1079 ldp $t3,$t0,[$tp,#8*3]
1080 extr $t1,$t2,$t1,#63
1081 cbnz $cnt,.Lsqr4x_shift_n_add
1082___
1083my ($np,$np_end)=($ap,$ap_end);
1084$code.=<<___;
1085 ldp $np,$n0,[x29,#104] // pull np and n0
1086
1087 adcs $acc2,$a2,$t1
1088 extr $t2,$t3,$t2,#63
1089 adcs $acc3,$a3,$t2
1090 ldp $t1,$t2,[$tp,#8*5]
1091 mul $a4,$a5,$a5
1092 umulh $a5,$a5,$a5
1093 stp $acc0,$acc1,[$tp,#8*0]
1094 mul $a6,$a7,$a7
1095 umulh $a7,$a7,$a7
1096 stp $acc2,$acc3,[$tp,#8*2]
1097 extr $t3,$t0,$t3,#63
1098 adcs $acc4,$a4,$t3
1099 extr $t0,$t1,$t0,#63
1100 ldp $acc0,$acc1,[sp,#8*0]
1101 adcs $acc5,$a5,$t0
1102 extr $t1,$t2,$t1,#63
1103 ldp $a0,$a1,[$np,#8*0]
1104 adcs $acc6,$a6,$t1
1105 extr $t2,xzr,$t2,#63
1106 ldp $a2,$a3,[$np,#8*2]
1107 adc $acc7,$a7,$t2
1108 ldp $a4,$a5,[$np,#8*4]
1109
1110 // Reduce by 512 bits per iteration
1111 mul $na0,$n0,$acc0 // t[0]*n0
1112 ldp $a6,$a7,[$np,#8*6]
1113 add $np_end,$np,$num
1114 ldp $acc2,$acc3,[sp,#8*2]
1115 stp $acc4,$acc5,[$tp,#8*4]
1116 ldp $acc4,$acc5,[sp,#8*4]
1117 stp $acc6,$acc7,[$tp,#8*6]
1118 ldp $acc6,$acc7,[sp,#8*6]
1119 add $np,$np,#8*8
1120 mov $topmost,xzr // initial top-most carry
1121 mov $tp,sp
1122 mov $cnt,#8
1123
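	// Each pass below retires one 64-bit word of t[]: $na0 = t[0]*n0
	// is chosen so that adding n[0..7]*$na0 clears t[0], and eight
	// such passes dispose of a 512-bit window before it slides.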
1124.Lsqr8x_reduction:
1125 // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
1126 mul $t1,$a1,$na0
1127 sub $cnt,$cnt,#1
1128 mul $t2,$a2,$na0
1129 str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
1130 mul $t3,$a3,$na0
1131 // (*) adds xzr,$acc0,$t0
1132 subs xzr,$acc0,#1 // (*)
1133 mul $t0,$a4,$na0
1134 adcs $acc0,$acc1,$t1
1135 mul $t1,$a5,$na0
1136 adcs $acc1,$acc2,$t2
1137 mul $t2,$a6,$na0
1138 adcs $acc2,$acc3,$t3
1139 mul $t3,$a7,$na0
1140 adcs $acc3,$acc4,$t0
1141 umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
1142 adcs $acc4,$acc5,$t1
1143 umulh $t1,$a1,$na0
1144 adcs $acc5,$acc6,$t2
1145 umulh $t2,$a2,$na0
1146 adcs $acc6,$acc7,$t3
1147 umulh $t3,$a3,$na0
1148 adc $acc7,xzr,xzr
1149 adds $acc0,$acc0,$t0
1150 umulh $t0,$a4,$na0
1151 adcs $acc1,$acc1,$t1
1152 umulh $t1,$a5,$na0
1153 adcs $acc2,$acc2,$t2
1154 umulh $t2,$a6,$na0
1155 adcs $acc3,$acc3,$t3
1156 umulh $t3,$a7,$na0
1157 mul $na0,$n0,$acc0 // next t[0]*n0
1158 adcs $acc4,$acc4,$t0
1159 adcs $acc5,$acc5,$t1
1160 adcs $acc6,$acc6,$t2
1161 adc $acc7,$acc7,$t3
1162 cbnz $cnt,.Lsqr8x_reduction
1163
1164 ldp $t0,$t1,[$tp,#8*0]
1165 ldp $t2,$t3,[$tp,#8*2]
1166 mov $rp,$tp
1167 sub $cnt,$np_end,$np // done yet?
1168 adds $acc0,$acc0,$t0
1169 adcs $acc1,$acc1,$t1
1170 ldp $t0,$t1,[$tp,#8*4]
1171 adcs $acc2,$acc2,$t2
1172 adcs $acc3,$acc3,$t3
1173 ldp $t2,$t3,[$tp,#8*6]
1174 adcs $acc4,$acc4,$t0
1175 adcs $acc5,$acc5,$t1
1176 adcs $acc6,$acc6,$t2
1177 adcs $acc7,$acc7,$t3
1178 //adc $carry,xzr,xzr // moved below
1179 cbz $cnt,.Lsqr8x8_post_condition
1180
1181 ldur $n0,[$tp,#-8*8]
1182 ldp $a0,$a1,[$np,#8*0]
1183 ldp $a2,$a3,[$np,#8*2]
1184 ldp $a4,$a5,[$np,#8*4]
1185 mov $cnt,#-8*8
1186 ldp $a6,$a7,[$np,#8*6]
1187 add $np,$np,#8*8
1188
1189.Lsqr8x_tail:
1190 mul $t0,$a0,$n0
1191 adc $carry,xzr,xzr // carry bit, modulo-scheduled
1192 mul $t1,$a1,$n0
1193 add $cnt,$cnt,#8
1194 mul $t2,$a2,$n0
1195 mul $t3,$a3,$n0
1196 adds $acc0,$acc0,$t0
1197 mul $t0,$a4,$n0
1198 adcs $acc1,$acc1,$t1
1199 mul $t1,$a5,$n0
1200 adcs $acc2,$acc2,$t2
1201 mul $t2,$a6,$n0
1202 adcs $acc3,$acc3,$t3
1203 mul $t3,$a7,$n0
1204 adcs $acc4,$acc4,$t0
1205 umulh $t0,$a0,$n0
1206 adcs $acc5,$acc5,$t1
1207 umulh $t1,$a1,$n0
1208 adcs $acc6,$acc6,$t2
1209 umulh $t2,$a2,$n0
1210 adcs $acc7,$acc7,$t3
1211 umulh $t3,$a3,$n0
1212 adc $carry,$carry,xzr
1213 str $acc0,[$tp],#8
1214 adds $acc0,$acc1,$t0
1215 umulh $t0,$a4,$n0
1216 adcs $acc1,$acc2,$t1
1217 umulh $t1,$a5,$n0
1218 adcs $acc2,$acc3,$t2
1219 umulh $t2,$a6,$n0
1220 adcs $acc3,$acc4,$t3
1221 umulh $t3,$a7,$n0
1222 ldr $n0,[$rp,$cnt]
1223 adcs $acc4,$acc5,$t0
1224 adcs $acc5,$acc6,$t1
1225 adcs $acc6,$acc7,$t2
1226 adcs $acc7,$carry,$t3
1227 //adc $carry,xzr,xzr // moved above
1228 cbnz $cnt,.Lsqr8x_tail
1229 // note that carry flag is guaranteed
1230 // to be zero at this point
1231 ldp $a0,$a1,[$tp,#8*0]
1232 sub $cnt,$np_end,$np // done yet?
1233 sub $t2,$np_end,$num // rewinded np
1234 ldp $a2,$a3,[$tp,#8*2]
1235 ldp $a4,$a5,[$tp,#8*4]
1236 ldp $a6,$a7,[$tp,#8*6]
1237 cbz $cnt,.Lsqr8x_tail_break
1238
1239 ldur $n0,[$rp,#-8*8]
1240 adds $acc0,$acc0,$a0
1241 adcs $acc1,$acc1,$a1
1242 ldp $a0,$a1,[$np,#8*0]
1243 adcs $acc2,$acc2,$a2
1244 adcs $acc3,$acc3,$a3
1245 ldp $a2,$a3,[$np,#8*2]
1246 adcs $acc4,$acc4,$a4
1247 adcs $acc5,$acc5,$a5
1248 ldp $a4,$a5,[$np,#8*4]
1249 adcs $acc6,$acc6,$a6
1250 mov $cnt,#-8*8
1251 adcs $acc7,$acc7,$a7
1252 ldp $a6,$a7,[$np,#8*6]
1253 add $np,$np,#8*8
1254 //adc $carry,xzr,xzr // moved above
1255 b .Lsqr8x_tail
1256
1257.align 4
1258.Lsqr8x_tail_break:
1259 ldr $n0,[x29,#112] // pull n0
1260 add $cnt,$tp,#8*8 // end of current t[num] window
1261
1262 subs xzr,$topmost,#1 // "move" top-most carry to carry bit
1263 adcs $t0,$acc0,$a0
1264 adcs $t1,$acc1,$a1
1265 ldp $acc0,$acc1,[$rp,#8*0]
1266 adcs $acc2,$acc2,$a2
1267 ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
1268 adcs $acc3,$acc3,$a3
1269 ldp $a2,$a3,[$t2,#8*2]
1270 adcs $acc4,$acc4,$a4
1271 adcs $acc5,$acc5,$a5
1272 ldp $a4,$a5,[$t2,#8*4]
1273 adcs $acc6,$acc6,$a6
1274 adcs $acc7,$acc7,$a7
1275 ldp $a6,$a7,[$t2,#8*6]
1276 add $np,$t2,#8*8
1277 adc $topmost,xzr,xzr // top-most carry
1278 mul $na0,$n0,$acc0
1279 stp $t0,$t1,[$tp,#8*0]
1280 stp $acc2,$acc3,[$tp,#8*2]
1281 ldp $acc2,$acc3,[$rp,#8*2]
1282 stp $acc4,$acc5,[$tp,#8*4]
1283 ldp $acc4,$acc5,[$rp,#8*4]
1284 cmp $cnt,x29 // did we hit the bottom?
1285 stp $acc6,$acc7,[$tp,#8*6]
1286 mov $tp,$rp // slide the window
1287 ldp $acc6,$acc7,[$rp,#8*6]
1288 mov $cnt,#8
1289 b.ne .Lsqr8x_reduction
1290
1291 // Final step. We see if result is larger than modulus, and
1292 // if it is, subtract the modulus. But comparison implies
1293 // subtraction. So we subtract modulus, see if it borrowed,
1294 // and conditionally copy original value.
1295 ldr $rp,[x29,#96] // pull rp
1296 add $tp,$tp,#8*8
1297 subs $t0,$acc0,$a0
1298 sbcs $t1,$acc1,$a1
1299 sub $cnt,$num,#8*8
1300 mov $ap_end,$rp // $rp copy
1301
1302.Lsqr8x_sub:
1303 sbcs $t2,$acc2,$a2
1304 ldp $a0,$a1,[$np,#8*0]
1305 sbcs $t3,$acc3,$a3
1306 stp $t0,$t1,[$rp,#8*0]
1307 sbcs $t0,$acc4,$a4
1308 ldp $a2,$a3,[$np,#8*2]
1309 sbcs $t1,$acc5,$a5
1310 stp $t2,$t3,[$rp,#8*2]
1311 sbcs $t2,$acc6,$a6
1312 ldp $a4,$a5,[$np,#8*4]
1313 sbcs $t3,$acc7,$a7
1314 ldp $a6,$a7,[$np,#8*6]
1315 add $np,$np,#8*8
1316 ldp $acc0,$acc1,[$tp,#8*0]
1317 sub $cnt,$cnt,#8*8
1318 ldp $acc2,$acc3,[$tp,#8*2]
1319 ldp $acc4,$acc5,[$tp,#8*4]
1320 ldp $acc6,$acc7,[$tp,#8*6]
1321 add $tp,$tp,#8*8
1322 stp $t0,$t1,[$rp,#8*4]
1323 sbcs $t0,$acc0,$a0
1324 stp $t2,$t3,[$rp,#8*6]
1325 add $rp,$rp,#8*8
1326 sbcs $t1,$acc1,$a1
1327 cbnz $cnt,.Lsqr8x_sub
1328
1329 sbcs $t2,$acc2,$a2
1330 mov $tp,sp
1331 add $ap,sp,$num
1332 ldp $a0,$a1,[$ap_end,#8*0]
1333 sbcs $t3,$acc3,$a3
1334 stp $t0,$t1,[$rp,#8*0]
1335 sbcs $t0,$acc4,$a4
1336 ldp $a2,$a3,[$ap_end,#8*2]
1337 sbcs $t1,$acc5,$a5
1338 stp $t2,$t3,[$rp,#8*2]
1339 sbcs $t2,$acc6,$a6
1340 ldp $acc0,$acc1,[$ap,#8*0]
1341 sbcs $t3,$acc7,$a7
1342 ldp $acc2,$acc3,[$ap,#8*2]
1343 sbcs xzr,$topmost,xzr // did it borrow?
1344 ldr x30,[x29,#8] // pull return address
1345 stp $t0,$t1,[$rp,#8*4]
1346 stp $t2,$t3,[$rp,#8*6]
1347
1348 sub $cnt,$num,#8*4
1349.Lsqr4x_cond_copy:
1350 sub $cnt,$cnt,#8*4
1351 csel $t0,$acc0,$a0,lo
1352 stp xzr,xzr,[$tp,#8*0]
1353 csel $t1,$acc1,$a1,lo
1354 ldp $a0,$a1,[$ap_end,#8*4]
1355 ldp $acc0,$acc1,[$ap,#8*4]
1356 csel $t2,$acc2,$a2,lo
1357 stp xzr,xzr,[$tp,#8*2]
1358 add $tp,$tp,#8*4
1359 csel $t3,$acc3,$a3,lo
1360 ldp $a2,$a3,[$ap_end,#8*6]
1361 ldp $acc2,$acc3,[$ap,#8*6]
1362 add $ap,$ap,#8*4
1363 stp $t0,$t1,[$ap_end,#8*0]
1364 stp $t2,$t3,[$ap_end,#8*2]
1365 add $ap_end,$ap_end,#8*4
1366 stp xzr,xzr,[$ap,#8*0]
1367 stp xzr,xzr,[$ap,#8*2]
1368 cbnz $cnt,.Lsqr4x_cond_copy
1369
1370 csel $t0,$acc0,$a0,lo
1371 stp xzr,xzr,[$tp,#8*0]
1372 csel $t1,$acc1,$a1,lo
1373 stp xzr,xzr,[$tp,#8*2]
1374 csel $t2,$acc2,$a2,lo
1375 csel $t3,$acc3,$a3,lo
1376 stp $t0,$t1,[$ap_end,#8*0]
1377 stp $t2,$t3,[$ap_end,#8*2]
1378
1379 b .Lsqr8x_done
1380
1381.align 4
1382.Lsqr8x8_post_condition:
1383 adc $carry,xzr,xzr
1384 ldr x30,[x29,#8] // pull return address
1385 // $acc0-7,$carry hold result, $a0-7 hold modulus
1386 subs $a0,$acc0,$a0
1387 ldr $ap,[x29,#96] // pull rp
1388 sbcs $a1,$acc1,$a1
1389 stp xzr,xzr,[sp,#8*0]
1390 sbcs $a2,$acc2,$a2
1391 stp xzr,xzr,[sp,#8*2]
1392 sbcs $a3,$acc3,$a3
1393 stp xzr,xzr,[sp,#8*4]
1394 sbcs $a4,$acc4,$a4
1395 stp xzr,xzr,[sp,#8*6]
1396 sbcs $a5,$acc5,$a5
1397 stp xzr,xzr,[sp,#8*8]
1398 sbcs $a6,$acc6,$a6
1399 stp xzr,xzr,[sp,#8*10]
1400 sbcs $a7,$acc7,$a7
1401 stp xzr,xzr,[sp,#8*12]
1402 sbcs $carry,$carry,xzr // did it borrow?
1403 stp xzr,xzr,[sp,#8*14]
1404
1405 // $a0-7 hold result-modulus
1406 csel $a0,$acc0,$a0,lo
1407 csel $a1,$acc1,$a1,lo
1408 csel $a2,$acc2,$a2,lo
1409 csel $a3,$acc3,$a3,lo
1410 stp $a0,$a1,[$ap,#8*0]
1411 csel $a4,$acc4,$a4,lo
1412 csel $a5,$acc5,$a5,lo
1413 stp $a2,$a3,[$ap,#8*2]
1414 csel $a6,$acc6,$a6,lo
1415 csel $a7,$acc7,$a7,lo
1416 stp $a4,$a5,[$ap,#8*4]
1417 stp $a6,$a7,[$ap,#8*6]
1418
1419.Lsqr8x_done:
1420 ldp x19,x20,[x29,#16]
1421 mov sp,x29
1422 ldp x21,x22,[x29,#32]
1423 mov x0,#1
1424 ldp x23,x24,[x29,#48]
1425 ldp x25,x26,[x29,#64]
1426 ldp x27,x28,[x29,#80]
1427 ldr x29,[sp],#128
1428 .inst 0xd50323bf // autiasp
1429 ret
1430.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1431___
1432}
1433
1434{
1435########################################################################
1436# Even though this might look as ARMv8 adaptation of mulx4x_mont from
1437# x86_64-mont5 module, it's different in sense that it performs
1438# reduction 256 bits at a time.
1439
1440my ($a0,$a1,$a2,$a3,
1441 $t0,$t1,$t2,$t3,
1442 $m0,$m1,$m2,$m3,
1443 $acc0,$acc1,$acc2,$acc3,$acc4,
1444 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1445my $bp_end=$rp;
1446my ($carry,$topmost) = ($rp,"x30");
1447
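# Sketch of the flow: b[] is consumed four words at a time.  For each such
# chunk .Loop_mul4x_reduction accumulates a[0..3]*b[i] and folds in
# n[0..3]*(t[0]*n0), i.e. the reduction proceeds 256 bits at a time as
# noted above, while .Loop_mul4x_tail walks the remaining a[]/n[] words.
# The usual conditional subtraction of the modulus finishes the job.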
1448$code.=<<___;
1449.type __bn_mul4x_mont,%function
1450.align 5
1451__bn_mul4x_mont:
1452 .inst 0xd503233f // paciasp
1453 stp x29,x30,[sp,#-128]!
1454 add x29,sp,#0
1455 stp x19,x20,[sp,#16]
1456 stp x21,x22,[sp,#32]
1457 stp x23,x24,[sp,#48]
1458 stp x25,x26,[sp,#64]
1459 stp x27,x28,[sp,#80]
1460
1461 sub $tp,sp,$num,lsl#3
1462 lsl $num,$num,#3
1463 ldr $n0,[$n0] // *n0
1464 sub sp,$tp,#8*4 // alloca
1465
1466 add $t0,$bp,$num
1467 add $ap_end,$ap,$num
1468 stp $rp,$t0,[x29,#96] // offload rp and &b[num]
1469
1470 ldr $bi,[$bp,#8*0] // b[0]
1471 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1472 ldp $a2,$a3,[$ap,#8*2]
1473 add $ap,$ap,#8*4
1474 mov $acc0,xzr
1475 mov $acc1,xzr
1476 mov $acc2,xzr
1477 mov $acc3,xzr
1478 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1479 ldp $m2,$m3,[$np,#8*2]
1480 adds $np,$np,#8*4 // clear carry bit
1481 mov $carry,xzr
1482 mov $cnt,#0
1483 mov $tp,sp
1484
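	// $cnt cycles through 8,16,24,0: it serves both as loop counter
	// and as the byte offset used to fetch the next b[i] here and the
	// stashed t[0]*n0 values (from sp) in the tail loops.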
1485.Loop_mul4x_1st_reduction:
1486 mul $t0,$a0,$bi // lo(a[0..3]*b[0])
1487 adc $carry,$carry,xzr // modulo-scheduled
1488 mul $t1,$a1,$bi
1489 add $cnt,$cnt,#8
1490 mul $t2,$a2,$bi
1491 and $cnt,$cnt,#31
1492 mul $t3,$a3,$bi
1493 adds $acc0,$acc0,$t0
1494 umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
1495 adcs $acc1,$acc1,$t1
1496 mul $mi,$acc0,$n0 // t[0]*n0
1497 adcs $acc2,$acc2,$t2
1498 umulh $t1,$a1,$bi
1499 adcs $acc3,$acc3,$t3
1500 umulh $t2,$a2,$bi
1501 adc $acc4,xzr,xzr
1502 umulh $t3,$a3,$bi
1503 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1504 adds $acc1,$acc1,$t0
1505 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
1506 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1507 adcs $acc2,$acc2,$t1
1508 mul $t1,$m1,$mi
1509 adcs $acc3,$acc3,$t2
1510 mul $t2,$m2,$mi
1511 adc $acc4,$acc4,$t3 // can't overflow
1512 mul $t3,$m3,$mi
1513 // (*) adds xzr,$acc0,$t0
1514 subs xzr,$acc0,#1 // (*)
1515 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1516 adcs $acc0,$acc1,$t1
1517 umulh $t1,$m1,$mi
1518 adcs $acc1,$acc2,$t2
1519 umulh $t2,$m2,$mi
1520 adcs $acc2,$acc3,$t3
1521 umulh $t3,$m3,$mi
1522 adcs $acc3,$acc4,$carry
1523 adc $carry,xzr,xzr
1524 adds $acc0,$acc0,$t0
1525 sub $t0,$ap_end,$ap
1526 adcs $acc1,$acc1,$t1
1527 adcs $acc2,$acc2,$t2
1528 adcs $acc3,$acc3,$t3
1529 //adc $carry,$carry,xzr
1530 cbnz $cnt,.Loop_mul4x_1st_reduction
1531
1532 cbz $t0,.Lmul4x4_post_condition
1533
1534 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1535 ldp $a2,$a3,[$ap,#8*2]
1536 add $ap,$ap,#8*4
1537 ldr $mi,[sp] // a[0]*n0
1538 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1539 ldp $m2,$m3,[$np,#8*2]
1540 add $np,$np,#8*4
1541
1542.Loop_mul4x_1st_tail:
1543 mul $t0,$a0,$bi // lo(a[4..7]*b[i])
1544 adc $carry,$carry,xzr // modulo-scheduled
1545 mul $t1,$a1,$bi
1546 add $cnt,$cnt,#8
1547 mul $t2,$a2,$bi
1548 and $cnt,$cnt,#31
1549 mul $t3,$a3,$bi
1550 adds $acc0,$acc0,$t0
1551 umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
1552 adcs $acc1,$acc1,$t1
1553 umulh $t1,$a1,$bi
1554 adcs $acc2,$acc2,$t2
1555 umulh $t2,$a2,$bi
1556 adcs $acc3,$acc3,$t3
1557 umulh $t3,$a3,$bi
1558 adc $acc4,xzr,xzr
1559 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1560 adds $acc1,$acc1,$t0
1561 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
1562 adcs $acc2,$acc2,$t1
1563 mul $t1,$m1,$mi
1564 adcs $acc3,$acc3,$t2
1565 mul $t2,$m2,$mi
1566 adc $acc4,$acc4,$t3 // can't overflow
1567 mul $t3,$m3,$mi
1568 adds $acc0,$acc0,$t0
1569 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
1570 adcs $acc1,$acc1,$t1
1571 umulh $t1,$m1,$mi
1572 adcs $acc2,$acc2,$t2
1573 umulh $t2,$m2,$mi
1574 adcs $acc3,$acc3,$t3
1575 adcs $acc4,$acc4,$carry
1576 umulh $t3,$m3,$mi
1577 adc $carry,xzr,xzr
1578 ldr $mi,[sp,$cnt] // next t[0]*n0
1579 str $acc0,[$tp],#8 // result!!!
1580 adds $acc0,$acc1,$t0
1581 sub $t0,$ap_end,$ap // done yet?
1582 adcs $acc1,$acc2,$t1
1583 adcs $acc2,$acc3,$t2
1584 adcs $acc3,$acc4,$t3
1585 //adc $carry,$carry,xzr
1586 cbnz $cnt,.Loop_mul4x_1st_tail
1587
1588 sub $t1,$ap_end,$num // rewinded $ap
1589 cbz $t0,.Lmul4x_proceed
1590
1591 ldp $a0,$a1,[$ap,#8*0]
1592 ldp $a2,$a3,[$ap,#8*2]
1593 add $ap,$ap,#8*4
1594 ldp $m0,$m1,[$np,#8*0]
1595 ldp $m2,$m3,[$np,#8*2]
1596 add $np,$np,#8*4
1597 b .Loop_mul4x_1st_tail
1598
1599.align 5
1600.Lmul4x_proceed:
1601 ldr $bi,[$bp,#8*4]! // *++b
1602 adc $topmost,$carry,xzr
1603 ldp $a0,$a1,[$t1,#8*0] // a[0..3]
1604 sub $np,$np,$num // rewind np
1605 ldp $a2,$a3,[$t1,#8*2]
1606 add $ap,$t1,#8*4
1607
1608 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1609 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1610 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1611 ldp $acc2,$acc3,[sp,#8*6]
1612
1613 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1614 mov $tp,sp
1615 ldp $m2,$m3,[$np,#8*2]
1616 adds $np,$np,#8*4 // clear carry bit
1617 mov $carry,xzr
1618
1619.align 4
1620.Loop_mul4x_reduction:
1621 mul $t0,$a0,$bi // lo(a[0..3]*b[4])
1622 adc $carry,$carry,xzr // modulo-scheduled
1623 mul $t1,$a1,$bi
1624 add $cnt,$cnt,#8
1625 mul $t2,$a2,$bi
1626 and $cnt,$cnt,#31
1627 mul $t3,$a3,$bi
1628 adds $acc0,$acc0,$t0
1629 umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
1630 adcs $acc1,$acc1,$t1
1631 mul $mi,$acc0,$n0 // t[0]*n0
1632 adcs $acc2,$acc2,$t2
1633 umulh $t1,$a1,$bi
1634 adcs $acc3,$acc3,$t3
1635 umulh $t2,$a2,$bi
1636 adc $acc4,xzr,xzr
1637 umulh $t3,$a3,$bi
1638 ldr $bi,[$bp,$cnt] // next b[i]
1639 adds $acc1,$acc1,$t0
1640 // (*) mul $t0,$m0,$mi
1641 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1642 adcs $acc2,$acc2,$t1
1643 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
1644 adcs $acc3,$acc3,$t2
1645 mul $t2,$m2,$mi
1646 adc $acc4,$acc4,$t3 // can't overflow
1647 mul $t3,$m3,$mi
1648 // (*) adds xzr,$acc0,$t0
1649 subs xzr,$acc0,#1 // (*)
1650 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1651 adcs $acc0,$acc1,$t1
1652 umulh $t1,$m1,$mi
1653 adcs $acc1,$acc2,$t2
1654 umulh $t2,$m2,$mi
1655 adcs $acc2,$acc3,$t3
1656 umulh $t3,$m3,$mi
1657 adcs $acc3,$acc4,$carry
1658 adc $carry,xzr,xzr
1659 adds $acc0,$acc0,$t0
1660 adcs $acc1,$acc1,$t1
1661 adcs $acc2,$acc2,$t2
1662 adcs $acc3,$acc3,$t3
1663 //adc $carry,$carry,xzr
1664 cbnz $cnt,.Loop_mul4x_reduction
1665
1666 adc $carry,$carry,xzr
1667 ldp $t0,$t1,[$tp,#8*4] // t[4..7]
1668 ldp $t2,$t3,[$tp,#8*6]
1669 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1670 ldp $a2,$a3,[$ap,#8*2]
1671 add $ap,$ap,#8*4
1672 adds $acc0,$acc0,$t0
1673 adcs $acc1,$acc1,$t1
1674 adcs $acc2,$acc2,$t2
1675 adcs $acc3,$acc3,$t3
1676 //adc $carry,$carry,xzr
1677
1678 ldr $mi,[sp] // t[0]*n0
1679 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1680 ldp $m2,$m3,[$np,#8*2]
1681 add $np,$np,#8*4
1682
1683.align 4
1684.Loop_mul4x_tail:
1685 mul $t0,$a0,$bi // lo(a[4..7]*b[4])
1686 adc $carry,$carry,xzr // modulo-scheduled
1687 mul $t1,$a1,$bi
1688 add $cnt,$cnt,#8
1689 mul $t2,$a2,$bi
1690 and $cnt,$cnt,#31
1691 mul $t3,$a3,$bi
1692 adds $acc0,$acc0,$t0
1693 umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
1694 adcs $acc1,$acc1,$t1
1695 umulh $t1,$a1,$bi
1696 adcs $acc2,$acc2,$t2
1697 umulh $t2,$a2,$bi
1698 adcs $acc3,$acc3,$t3
1699 umulh $t3,$a3,$bi
1700 adc $acc4,xzr,xzr
1701 ldr $bi,[$bp,$cnt] // next b[i]
1702 adds $acc1,$acc1,$t0
1703 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1704 adcs $acc2,$acc2,$t1
1705 mul $t1,$m1,$mi
1706 adcs $acc3,$acc3,$t2
1707 mul $t2,$m2,$mi
1708 adc $acc4,$acc4,$t3 // can't overflow
1709 mul $t3,$m3,$mi
1710 adds $acc0,$acc0,$t0
1711 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1712 adcs $acc1,$acc1,$t1
1713 umulh $t1,$m1,$mi
1714 adcs $acc2,$acc2,$t2
1715 umulh $t2,$m2,$mi
1716 adcs $acc3,$acc3,$t3
1717 umulh $t3,$m3,$mi
1718 adcs $acc4,$acc4,$carry
1719 ldr $mi,[sp,$cnt] // next a[0]*n0
1720 adc $carry,xzr,xzr
1721 str $acc0,[$tp],#8 // result!!!
1722 adds $acc0,$acc1,$t0
1723 sub $t0,$ap_end,$ap // done yet?
1724 adcs $acc1,$acc2,$t1
1725 adcs $acc2,$acc3,$t2
1726 adcs $acc3,$acc4,$t3
1727 //adc $carry,$carry,xzr
1728 cbnz $cnt,.Loop_mul4x_tail
1729
1730 sub $t1,$np,$num // rewinded np?
1731 adc $carry,$carry,xzr
1732 cbz $t0,.Loop_mul4x_break
1733
1734 ldp $t0,$t1,[$tp,#8*4]
1735 ldp $t2,$t3,[$tp,#8*6]
1736 ldp $a0,$a1,[$ap,#8*0]
1737 ldp $a2,$a3,[$ap,#8*2]
1738 add $ap,$ap,#8*4
1739 adds $acc0,$acc0,$t0
1740 adcs $acc1,$acc1,$t1
1741 adcs $acc2,$acc2,$t2
1742 adcs $acc3,$acc3,$t3
1743 //adc $carry,$carry,xzr
1744 ldp $m0,$m1,[$np,#8*0]
1745 ldp $m2,$m3,[$np,#8*2]
1746 add $np,$np,#8*4
1747 b .Loop_mul4x_tail
1748
1749.align 4
1750.Loop_mul4x_break:
1751 ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
1752 adds $acc0,$acc0,$topmost
1753 add $bp,$bp,#8*4 // bp++
1754 adcs $acc1,$acc1,xzr
1755 sub $ap,$ap,$num // rewind ap
1756 adcs $acc2,$acc2,xzr
1757 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1758 adcs $acc3,$acc3,xzr
1759 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1760 adc $topmost,$carry,xzr
1761 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1762 cmp $bp,$t3 // done yet?
1763 ldp $acc2,$acc3,[sp,#8*6]
1764 ldp $m0,$m1,[$t1,#8*0] // n[0..3]
1765 ldp $m2,$m3,[$t1,#8*2]
1766 add $np,$t1,#8*4
1767 b.eq .Lmul4x_post
1768
1769 ldr $bi,[$bp]
1770 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1771 ldp $a2,$a3,[$ap,#8*2]
1772 adds $ap,$ap,#8*4 // clear carry bit
1773 mov $carry,xzr
1774 mov $tp,sp
1775 b .Loop_mul4x_reduction
1776
1777.align 4
1778.Lmul4x_post:
1779 // Final step. We see if result is larger than modulus, and
1780 // if it is, subtract the modulus. But comparison implies
1781 // subtraction. So we subtract modulus, see if it borrowed,
1782 // and conditionally copy original value.
1783 mov $rp,$t2
1784 mov $ap_end,$t2 // $rp copy
1785 subs $t0,$acc0,$m0
1786 add $tp,sp,#8*8
1787 sbcs $t1,$acc1,$m1
1788 sub $cnt,$num,#8*4
1789
1790.Lmul4x_sub:
1791 sbcs $t2,$acc2,$m2
1792 ldp $m0,$m1,[$np,#8*0]
1793 sub $cnt,$cnt,#8*4
1794 ldp $acc0,$acc1,[$tp,#8*0]
1795 sbcs $t3,$acc3,$m3
1796 ldp $m2,$m3,[$np,#8*2]
1797 add $np,$np,#8*4
1798 ldp $acc2,$acc3,[$tp,#8*2]
1799 add $tp,$tp,#8*4
1800 stp $t0,$t1,[$rp,#8*0]
1801 sbcs $t0,$acc0,$m0
1802 stp $t2,$t3,[$rp,#8*2]
1803 add $rp,$rp,#8*4
1804 sbcs $t1,$acc1,$m1
1805 cbnz $cnt,.Lmul4x_sub
1806
1807 sbcs $t2,$acc2,$m2
1808 mov $tp,sp
1809 add $ap,sp,#8*4
1810 ldp $a0,$a1,[$ap_end,#8*0]
1811 sbcs $t3,$acc3,$m3
1812 stp $t0,$t1,[$rp,#8*0]
1813 ldp $a2,$a3,[$ap_end,#8*2]
1814 stp $t2,$t3,[$rp,#8*2]
1815 ldp $acc0,$acc1,[$ap,#8*0]
1816 ldp $acc2,$acc3,[$ap,#8*2]
1817 sbcs xzr,$topmost,xzr // did it borrow?
1818 ldr x30,[x29,#8] // pull return address
1819
1820 sub $cnt,$num,#8*4
1821.Lmul4x_cond_copy:
1822 sub $cnt,$cnt,#8*4
1823 csel $t0,$acc0,$a0,lo
1824 stp xzr,xzr,[$tp,#8*0]
1825 csel $t1,$acc1,$a1,lo
1826 ldp $a0,$a1,[$ap_end,#8*4]
1827 ldp $acc0,$acc1,[$ap,#8*4]
1828 csel $t2,$acc2,$a2,lo
1829 stp xzr,xzr,[$tp,#8*2]
1830 add $tp,$tp,#8*4
1831 csel $t3,$acc3,$a3,lo
1832 ldp $a2,$a3,[$ap_end,#8*6]
1833 ldp $acc2,$acc3,[$ap,#8*6]
1834 add $ap,$ap,#8*4
1835 stp $t0,$t1,[$ap_end,#8*0]
1836 stp $t2,$t3,[$ap_end,#8*2]
1837 add $ap_end,$ap_end,#8*4
1838 cbnz $cnt,.Lmul4x_cond_copy
1839
1840 csel $t0,$acc0,$a0,lo
1841 stp xzr,xzr,[$tp,#8*0]
1842 csel $t1,$acc1,$a1,lo
1843 stp xzr,xzr,[$tp,#8*2]
1844 csel $t2,$acc2,$a2,lo
1845 stp xzr,xzr,[$tp,#8*3]
1846 csel $t3,$acc3,$a3,lo
1847 stp xzr,xzr,[$tp,#8*4]
1848 stp $t0,$t1,[$ap_end,#8*0]
1849 stp $t2,$t3,[$ap_end,#8*2]
1850
1851 b .Lmul4x_done
1852
1853.align 4
1854.Lmul4x4_post_condition:
1855 adc $carry,$carry,xzr
1856 ldr $ap,[x29,#96] // pull rp
1857 // $acc0-3,$carry hold result, $m0-7 hold modulus
1858 subs $a0,$acc0,$m0
1859 ldr x30,[x29,#8] // pull return address
1860 sbcs $a1,$acc1,$m1
1861 stp xzr,xzr,[sp,#8*0]
1862 sbcs $a2,$acc2,$m2
1863 stp xzr,xzr,[sp,#8*2]
1864 sbcs $a3,$acc3,$m3
1865 stp xzr,xzr,[sp,#8*4]
1866 sbcs xzr,$carry,xzr // did it borrow?
1867 stp xzr,xzr,[sp,#8*6]
1868
1869 // $a0-3 hold result-modulus
1870 csel $a0,$acc0,$a0,lo
1871 csel $a1,$acc1,$a1,lo
1872 csel $a2,$acc2,$a2,lo
1873 csel $a3,$acc3,$a3,lo
1874 stp $a0,$a1,[$ap,#8*0]
1875 stp $a2,$a3,[$ap,#8*2]
1876
1877.Lmul4x_done:
1878 ldp x19,x20,[x29,#16]
1879 mov sp,x29
1880 ldp x21,x22,[x29,#32]
1881 mov x0,#1
1882 ldp x23,x24,[x29,#48]
1883 ldp x25,x26,[x29,#64]
1884 ldp x27,x28,[x29,#80]
1885 ldr x29,[sp],#128
1886 .inst 0xd50323bf // autiasp
1887 ret
1888.size __bn_mul4x_mont,.-__bn_mul4x_mont
1889___
1890}
1891$code.=<<___;
1892.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
1893.align 4
1894___
1895
1896print $code;
1897
1898close STDOUT or die "error closing STDOUT: $!";