VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/bn/asm/armv4-mont.pl@ 69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago

Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified.
bugref:8070: src/libs maintenance

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
File size: 19.2 KB
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

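# [Added note -- illustrative sketch, not part of the upstream file.]
# For reference, bn_mul_mont() below computes the Montgomery product
# rp = ap*bp*R^-1 mod np, where R = 2^(32*num) and the caller supplies
# n0 = -np^-1 mod 2^32. The word-by-word loops in the generated assembly
# are an interleaved form of the whole-number identity sketched here; the
# helper name mont_mul_ref is made up for illustration and is never called.

use Math::BigInt;

sub mont_mul_ref {
	my ($a, $b, $n, $num) = @_;		# $a,$b < $n; $num = word count of $n
	my $R    = Math::BigInt->new(1)->blsft(32*$num);
	my $ninv = $n->copy->bmodinv($R);	# np^-1 mod R; n0 is its negated low word
	my $t    = $a * $b;
	my $m    = ($t * -$ninv) % $R;		# chosen so that t + m*n == 0 (mod R)
	my $u    = ($t + $m * $n) / $R;		# exact division, i.e. a plain word shift
	$u -= $n if $u >= $n;			# one conditional subtraction suffices
	return $u;				# == a*b*R^-1 mod n
}
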
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
	die "can't locate arm-xlate.pl";

	open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
	open STDOUT,">$output";
}
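
# [Added usage note, not part of the upstream file.] The two arguments handled
# above are the perlasm "flavour" and the output file. For a Linux/EABI build
# the generator is typically invoked along the lines of
#
#	perl armv4-mont.pl linux32 armv4-mont.S
#
# which pipes the result through arm-xlate.pl; with no flavour (or "void") the
# raw perlasm output is written directly to the named file.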

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;
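
# [Added annotation, not part of the upstream file.] After the prologue below
# has pushed {r0,r2}, saved r4-r12,lr and alloca'd num+1 words for tp[], the
# frame looks like this (low to high addresses), which is where the offsets
# #12*4..#15*4 above come from:
#
#	sp		-> tp[0] .. tp[num-1]	($num register = &tp[num-1])
#	$num+#1*4	-> tp[num]		(extra carry word)
#	$num+#2*4	-> saved r4-r12,lr	(10 words)
#	$num+#12*4	-> saved r0 (rp)
#	$num+#13*4	-> saved r2 (bp)
#	$num+#14*4	-> caller's &n0 argument
#	$num+#15*4	-> caller's num argument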

$code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global bn_mul_mont
.type bn_mul_mont,%function

.align 5
bn_mul_mont:
.Lbn_mul_mont:
	ldr ip,[sp,#4] @ load num
	stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst ip,#7
	bne .Lialu
	adr r0,.Lbn_mul_mont
	ldr r2,.LOPENSSL_armcap
	ldr r0,[r0,r2]
#ifdef __APPLE__
	ldr r0,[r0]
#endif
	tst r0,#ARMV7_NEON @ NEON available?
	ldmia sp, {r0,r2}
	beq .Lialu
	add sp,sp,#8
	b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
	cmp ip,#2
	mov $num,ip @ load num
#ifdef __thumb2__
	ittt lt
#endif
	movlt r0,#0
	addlt sp,sp,#2*4
	blt .Labrt

	stmdb sp!,{r4-r12,lr} @ save 10 registers

	mov $num,$num,lsl#2 @ rescale $num for byte count
	sub sp,sp,$num @ alloca(4*num)
	sub sp,sp,#4 @ +extra dword
	sub $num,$num,#4 @ "num=num-1"
	add $tp,$bp,$num @ &bp[num-1]

	add $num,sp,$num @ $num to point at &tp[num-1]
	ldr $n0,[$_n0] @ &n0
	ldr $bi,[$bp] @ bp[0]
	ldr $aj,[$ap],#4 @ ap[0],ap++
	ldr $nj,[$np],#4 @ np[0],np++
	ldr $n0,[$n0] @ *n0
	str $tp,[$_bpend] @ save &bp[num]

	umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
	str $n0,[$_n0] @ save n0 value
	mul $n0,$alo,$n0 @ "tp[0]"*n0
	mov $nlo,#0
	umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
	mov $tp,sp

.L1st:
	ldr $aj,[$ap],#4 @ ap[j],ap++
	mov $alo,$ahi
	ldr $nj,[$np],#4 @ np[j],np++
	mov $ahi,#0
	umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
	mov $nhi,#0
	umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
	adds $nlo,$nlo,$alo
	str $nlo,[$tp],#4 @ tp[j-1]=,tp++
	adc $nlo,$nhi,#0
	cmp $tp,$num
	bne .L1st

	adds $nlo,$nlo,$ahi
	ldr $tp,[$_bp] @ restore bp
	mov $nhi,#0
	ldr $n0,[$_n0] @ restore n0
	adc $nhi,$nhi,#0
	str $nlo,[$num] @ tp[num-1]=
	mov $tj,sp
	str $nhi,[$num,#4] @ tp[num]=


.Louter:
	sub $tj,$num,$tj @ "original" $num-1 value
	sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
	ldr $bi,[$tp,#4]! @ *(++bp)
	sub $np,$np,$tj @ "rewind" np to &np[1]
	ldr $aj,[$ap,#-4] @ ap[0]
	ldr $alo,[sp] @ tp[0]
	ldr $nj,[$np,#-4] @ np[0]
	ldr $tj,[sp,#4] @ tp[1]

	mov $ahi,#0
	umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
	str $tp,[$_bp] @ save bp
	mul $n0,$alo,$n0
	mov $nlo,#0
	umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
	mov $tp,sp

.Linner:
	ldr $aj,[$ap],#4 @ ap[j],ap++
	adds $alo,$ahi,$tj @ +=tp[j]
	ldr $nj,[$np],#4 @ np[j],np++
	mov $ahi,#0
	umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
	mov $nhi,#0
	umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
	adc $ahi,$ahi,#0
	ldr $tj,[$tp,#8] @ tp[j+1]
	adds $nlo,$nlo,$alo
	str $nlo,[$tp],#4 @ tp[j-1]=,tp++
	adc $nlo,$nhi,#0
	cmp $tp,$num
	bne .Linner

	adds $nlo,$nlo,$ahi
	mov $nhi,#0
	ldr $tp,[$_bp] @ restore bp
	adc $nhi,$nhi,#0
	ldr $n0,[$_n0] @ restore n0
	adds $nlo,$nlo,$tj
	ldr $tj,[$_bpend] @ restore &bp[num]
	adc $nhi,$nhi,#0
	str $nlo,[$num] @ tp[num-1]=
	str $nhi,[$num,#4] @ tp[num]=

	cmp $tp,$tj
#ifdef __thumb2__
	itt ne
#endif
	movne $tj,sp
	bne .Louter


	ldr $rp,[$_rp] @ pull rp
	mov $aj,sp
	add $num,$num,#4 @ $num to point at &tp[num]
	sub $aj,$num,$aj @ "original" num value
	mov $tp,sp @ "rewind" $tp
	mov $ap,$tp @ "borrow" $ap
	sub $np,$np,$aj @ "rewind" $np to &np[0]

	subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
	ldr $nj,[$np],#4
	sbcs $tj,$tj,$nj @ tp[j]-np[j]
	str $tj,[$rp],#4 @ rp[j]=
	teq $tp,$num @ preserve carry
	bne .Lsub
	sbcs $nhi,$nhi,#0 @ upmost carry
	mov $tp,sp @ "rewind" $tp
	sub $rp,$rp,$aj @ "rewind" $rp

	and $ap,$tp,$nhi
	bic $np,$rp,$nhi
	orr $ap,$ap,$np @ ap=borrow?tp:rp

.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
	str sp,[$tp],#4 @ zap tp
	str $tj,[$rp],#4
	cmp $tp,$num
	bne .Lcopy

	mov sp,$num
	add sp,sp,#4 @ skip over tp[num+1]
	ldmia sp!,{r4-r12,lr} @ restore registers
	add sp,sp,#2*4 @ skip over {r0,r2}
	mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret @ bx lr
#else
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

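# [Added annotation, not part of the upstream file.] In the NEON path each
# 32-bit word of b[] and of the Montgomery factor (the "smashed" $Bi/$Ni) is
# split into two 16-bit halves with vzip.16 against a zero register, so every
# vmlal.u32 accumulates 32x16-bit partial products into 64-bit lanes. Carry
# propagation is thereby deferred and done 16 bits at a time in the
# vshr.u64/vadd.u64 sequences and the .LNEON_tail loop.
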
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
	mov ip,sp
	stmdb sp!,{r4-r11}
	vstmdb sp!,{d8-d15} @ ABI specification says so
	ldmia ip,{r4-r5} @ load rest of parameter block
	mov ip,sp

	cmp $num,#8
	bhi .LNEON_8n

	@ special case for $num==8, everything is in register bank...

	vld1.32 {${Bi}[0]}, [$bptr,:32]!
	veor $zero,$zero,$zero
	sub $toutptr,sp,$num,lsl#4
	vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
	and $toutptr,$toutptr,#-64
	vld1.32 {${M0}[0]}, [$n0,:32]
	mov sp,$toutptr @ alloca
	vzip.16 $Bi,$zero

	vmull.u32 @ACC[0],$Bi,${A0}[0]
	vmull.u32 @ACC[1],$Bi,${A0}[1]
	vmull.u32 @ACC[2],$Bi,${A1}[0]
	vshl.i64 $Ni,@ACC[0]#hi,#16
	vmull.u32 @ACC[3],$Bi,${A1}[1]

	vadd.u64 $Ni,$Ni,@ACC[0]#lo
	veor $zero,$zero,$zero
	vmul.u32 $Ni,$Ni,$M0

	vmull.u32 @ACC[4],$Bi,${A2}[0]
	vld1.32 {$N0-$N3}, [$nptr]!
	vmull.u32 @ACC[5],$Bi,${A2}[1]
	vmull.u32 @ACC[6],$Bi,${A3}[0]
	vzip.16 $Ni,$zero
	vmull.u32 @ACC[7],$Bi,${A3}[1]

	vmlal.u32 @ACC[0],$Ni,${N0}[0]
	sub $outer,$num,#1
	vmlal.u32 @ACC[1],$Ni,${N0}[1]
	vmlal.u32 @ACC[2],$Ni,${N1}[0]
	vmlal.u32 @ACC[3],$Ni,${N1}[1]

	vmlal.u32 @ACC[4],$Ni,${N2}[0]
	vmov $Temp,@ACC[0]
	vmlal.u32 @ACC[5],$Ni,${N2}[1]
	vmov @ACC[0],@ACC[1]
	vmlal.u32 @ACC[6],$Ni,${N3}[0]
	vmov @ACC[1],@ACC[2]
	vmlal.u32 @ACC[7],$Ni,${N3}[1]
	vmov @ACC[2],@ACC[3]
	vmov @ACC[3],@ACC[4]
	vshr.u64 $temp,$temp,#16
	vmov @ACC[4],@ACC[5]
	vmov @ACC[5],@ACC[6]
	vadd.u64 $temp,$temp,$Temp#hi
	vmov @ACC[6],@ACC[7]
	veor @ACC[7],@ACC[7]
	vshr.u64 $temp,$temp,#16

	b .LNEON_outer8

.align 4
.LNEON_outer8:
	vld1.32 {${Bi}[0]}, [$bptr,:32]!
	veor $zero,$zero,$zero
	vzip.16 $Bi,$zero
	vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp

	vmlal.u32 @ACC[0],$Bi,${A0}[0]
	vmlal.u32 @ACC[1],$Bi,${A0}[1]
	vmlal.u32 @ACC[2],$Bi,${A1}[0]
	vshl.i64 $Ni,@ACC[0]#hi,#16
	vmlal.u32 @ACC[3],$Bi,${A1}[1]

	vadd.u64 $Ni,$Ni,@ACC[0]#lo
	veor $zero,$zero,$zero
	subs $outer,$outer,#1
	vmul.u32 $Ni,$Ni,$M0

	vmlal.u32 @ACC[4],$Bi,${A2}[0]
	vmlal.u32 @ACC[5],$Bi,${A2}[1]
	vmlal.u32 @ACC[6],$Bi,${A3}[0]
	vzip.16 $Ni,$zero
	vmlal.u32 @ACC[7],$Bi,${A3}[1]

	vmlal.u32 @ACC[0],$Ni,${N0}[0]
	vmlal.u32 @ACC[1],$Ni,${N0}[1]
	vmlal.u32 @ACC[2],$Ni,${N1}[0]
	vmlal.u32 @ACC[3],$Ni,${N1}[1]

	vmlal.u32 @ACC[4],$Ni,${N2}[0]
	vmov $Temp,@ACC[0]
	vmlal.u32 @ACC[5],$Ni,${N2}[1]
	vmov @ACC[0],@ACC[1]
	vmlal.u32 @ACC[6],$Ni,${N3}[0]
	vmov @ACC[1],@ACC[2]
	vmlal.u32 @ACC[7],$Ni,${N3}[1]
	vmov @ACC[2],@ACC[3]
	vmov @ACC[3],@ACC[4]
	vshr.u64 $temp,$temp,#16
	vmov @ACC[4],@ACC[5]
	vmov @ACC[5],@ACC[6]
	vadd.u64 $temp,$temp,$Temp#hi
	vmov @ACC[6],@ACC[7]
	veor @ACC[7],@ACC[7]
	vshr.u64 $temp,$temp,#16

	bne .LNEON_outer8

	vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
	mov $toutptr,sp
	vshr.u64 $temp,@ACC[0]#lo,#16
	mov $inner,$num
	vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
	add $tinptr,sp,#96
	vshr.u64 $temp,@ACC[0]#hi,#16
	vzip.16 @ACC[0]#lo,@ACC[0]#hi

	b .LNEON_tail_entry

.align 4
.LNEON_8n:
	veor @ACC[0],@ACC[0],@ACC[0]
	sub $toutptr,sp,#128
	veor @ACC[1],@ACC[1],@ACC[1]
	sub $toutptr,$toutptr,$num,lsl#4
	veor @ACC[2],@ACC[2],@ACC[2]
	and $toutptr,$toutptr,#-64
	veor @ACC[3],@ACC[3],@ACC[3]
	mov sp,$toutptr @ alloca
	veor @ACC[4],@ACC[4],@ACC[4]
	add $toutptr,$toutptr,#256
	veor @ACC[5],@ACC[5],@ACC[5]
	sub $inner,$num,#8
	veor @ACC[6],@ACC[6],@ACC[6]
	veor @ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
	vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
	subs $inner,$inner,#8
	vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
	vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
	bne .LNEON_8n_init

	add $tinptr,sp,#256
	vld1.32 {$A0-$A3},[$aptr]!
	add $bnptr,sp,#8
	vld1.32 {${M0}[0]},[$n0,:32]
	mov $outer,$num
	b .LNEON_8n_outer

.align 4
.LNEON_8n_outer:
	vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
	veor $zero,$zero,$zero
	vzip.16 $Bi,$zero
	add $toutptr,sp,#128
	vld1.32 {$N0-$N3},[$nptr]!

	vmlal.u32 @ACC[0],$Bi,${A0}[0]
	vmlal.u32 @ACC[1],$Bi,${A0}[1]
	veor $zero,$zero,$zero
	vmlal.u32 @ACC[2],$Bi,${A1}[0]
	vshl.i64 $Ni,@ACC[0]#hi,#16
	vmlal.u32 @ACC[3],$Bi,${A1}[1]
	vadd.u64 $Ni,$Ni,@ACC[0]#lo
	vmlal.u32 @ACC[4],$Bi,${A2}[0]
	vmul.u32 $Ni,$Ni,$M0
	vmlal.u32 @ACC[5],$Bi,${A2}[1]
	vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
	vmlal.u32 @ACC[6],$Bi,${A3}[0]
	vzip.16 $Ni,$zero
	vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
	vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
	vmlal.u32 @ACC[0],$Ni,${N0}[0]
	veor $temp,$temp,$temp
	vmlal.u32 @ACC[1],$Ni,${N0}[1]
	vzip.16 $Bi,$temp
	vmlal.u32 @ACC[2],$Ni,${N1}[0]
	vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32 @ACC[3],$Ni,${N1}[1]
	vmlal.u32 @ACC[4],$Ni,${N2}[0]
	vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32 @ACC[5],$Ni,${N2}[1]
	vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32 @ACC[6],$Ni,${N3}[0]
	vmlal.u32 @ACC[7],$Ni,${N3}[1]
	vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
	vmlal.u32 @ACC[0],$Bi,${A0}[0]
	vld1.64 {@ACC[7]},[$tinptr,:128]!
	vmlal.u32 @ACC[1],$Bi,${A0}[1]
	veor $zero,$zero,$zero
	vmlal.u32 @ACC[2],$Bi,${A1}[0]
	vshl.i64 $Ni,@ACC[0]#hi,#16
	vmlal.u32 @ACC[3],$Bi,${A1}[1]
	vadd.u64 $Ni,$Ni,@ACC[0]#lo
	vmlal.u32 @ACC[4],$Bi,${A2}[0]
	vmul.u32 $Ni,$Ni,$M0
	vmlal.u32 @ACC[5],$Bi,${A2}[1]
	vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
	vmlal.u32 @ACC[6],$Bi,${A3}[0]
	vzip.16 $Ni,$zero
	vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
	vmlal.u32 @ACC[0],$Ni,${N0}[0]
	vld1.32 {$A0-$A3},[$aptr]!
	vmlal.u32 @ACC[1],$Ni,${N0}[1]
	vmlal.u32 @ACC[2],$Ni,${N1}[0]
	vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32 @ACC[3],$Ni,${N1}[1]
	vmlal.u32 @ACC[4],$Ni,${N2}[0]
	vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32 @ACC[5],$Ni,${N2}[1]
	vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32 @ACC[6],$Ni,${N3}[0]
	vmlal.u32 @ACC[7],$Ni,${N3}[1]
	vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
	add $bnptr,sp,#8 @ rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub $inner,$num,#8
	b .LNEON_8n_inner

.align 4
.LNEON_8n_inner:
	subs $inner,$inner,#8
	vmlal.u32 @ACC[0],$Bi,${A0}[0]
	vld1.64 {@ACC[7]},[$tinptr,:128]
	vmlal.u32 @ACC[1],$Bi,${A0}[1]
	vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
	vmlal.u32 @ACC[2],$Bi,${A1}[0]
	vld1.32 {$N0-$N3},[$nptr]!
	vmlal.u32 @ACC[3],$Bi,${A1}[1]
	it ne
	addne $tinptr,$tinptr,#16 @ don't advance in last iteration
	vmlal.u32 @ACC[4],$Bi,${A2}[0]
	vmlal.u32 @ACC[5],$Bi,${A2}[1]
	vmlal.u32 @ACC[6],$Bi,${A3}[0]
	vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
	vmlal.u32 @ACC[0],$Ni,${N0}[0]
	vmlal.u32 @ACC[1],$Ni,${N0}[1]
	vmlal.u32 @ACC[2],$Ni,${N1}[0]
	vmlal.u32 @ACC[3],$Ni,${N1}[1]
	vmlal.u32 @ACC[4],$Ni,${N2}[0]
	vmlal.u32 @ACC[5],$Ni,${N2}[1]
	vmlal.u32 @ACC[6],$Ni,${N3}[0]
	vmlal.u32 @ACC[7],$Ni,${N3}[1]
	vst1.64 {@ACC[0]},[$toutptr,:128]!
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	vmlal.u32 @ACC[0],$Bi,${A0}[0]
	vld1.64 {@ACC[7]},[$tinptr,:128]
	vmlal.u32 @ACC[1],$Bi,${A0}[1]
	vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
	vmlal.u32 @ACC[2],$Bi,${A1}[0]
	it ne
	addne $tinptr,$tinptr,#16 @ don't advance in last iteration
	vmlal.u32 @ACC[3],$Bi,${A1}[1]
	vmlal.u32 @ACC[4],$Bi,${A2}[0]
	vmlal.u32 @ACC[5],$Bi,${A2}[1]
	vmlal.u32 @ACC[6],$Bi,${A3}[0]
	vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	it eq
	subeq $aptr,$aptr,$num,lsl#2 @ rewind
	vmlal.u32 @ACC[0],$Ni,${N0}[0]
	vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
	vmlal.u32 @ACC[1],$Ni,${N0}[1]
	vld1.32 {$A0-$A3},[$aptr]!
	vmlal.u32 @ACC[2],$Ni,${N1}[0]
	add $bnptr,sp,#8 @ rewind
	vmlal.u32 @ACC[3],$Ni,${N1}[1]
	vmlal.u32 @ACC[4],$Ni,${N2}[0]
	vmlal.u32 @ACC[5],$Ni,${N2}[1]
	vmlal.u32 @ACC[6],$Ni,${N3}[0]
	vst1.64 {@ACC[0]},[$toutptr,:128]!
	vmlal.u32 @ACC[7],$Ni,${N3}[1]

	bne .LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add $tinptr,sp,#128
	vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor q2,q2,q2 @ $N0-$N1
	vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor q3,q3,q3 @ $N2-$N3
	vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64 {@ACC[6]},[$toutptr,:128]

	subs $outer,$outer,#8
	vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!

	itt ne
	subne $nptr,$nptr,$num,lsl#2 @ rewind
	bne .LNEON_8n_outer

	add $toutptr,sp,#128
	vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
	vshr.u64 $temp,@ACC[0]#lo,#16
	vst1.64 {q2-q3},[sp,:256]!
	vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64 {q2-q3}, [sp,:256]!
	vshr.u64 $temp,@ACC[0]#hi,#16
	vst1.64 {q2-q3}, [sp,:256]!
	vzip.16 @ACC[0]#lo,@ACC[0]#hi

	mov $inner,$num
	b .LNEON_tail_entry

.align 4
.LNEON_tail:
	vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64 $temp,@ACC[0]#lo,#16
	vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64 $temp,@ACC[0]#hi,#16
	vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
	vzip.16 @ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
	vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
	vshr.u64 $temp,@ACC[1]#lo,#16
	vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
	vshr.u64 $temp,@ACC[1]#hi,#16
	vzip.16 @ACC[1]#lo,@ACC[1]#hi
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
	subs $inner,$inner,#8
	vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
	bne .LNEON_tail

	vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
	sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
	subs $aptr,sp,#0 @ clear carry flag
	add $bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia $aptr!, {r4-r7}
	ldmia $nptr!, {r8-r11}
	sbcs r8, r4,r8
	sbcs r9, r5,r9
	sbcs r10,r6,r10
	sbcs r11,r7,r11
	teq $aptr,$bptr @ preserves carry
	stmia $rptr!, {r8-r11}
	bne .LNEON_sub

	ldr r10, [$aptr] @ load top-most bit
	mov r11,sp
	veor q0,q0,q0
	sub r11,$bptr,r11 @ this is num*4
	veor q1,q1,q1
	mov $aptr,sp
	sub $rptr,$rptr,r11 @ rewind $rptr
	mov $nptr,$bptr @ second 3/4th of frame
	sbcs r10,r10,#0 @ result is carry flag

.LNEON_copy_n_zap:
	ldmia $aptr!, {r4-r7}
	ldmia $rptr, {r8-r11}
	it cc
	movcc r8, r4
	vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
	itt cc
	movcc r9, r5
	movcc r10,r6
	vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
	it cc
	movcc r11,r7
	ldmia $aptr, {r4-r7}
	stmia $rptr!, {r8-r11}
	sub $aptr,$aptr,#16
	ldmia $rptr, {r8-r11}
	it cc
	movcc r8, r4
	vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
	itt cc
	movcc r9, r5
	movcc r10,r6
	vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
	it cc
	movcc r11,r7
	teq $aptr,$bptr @ preserves carry
	stmia $rptr!, {r8-r11}
	bne .LNEON_copy_n_zap

	mov sp,ip
	vldmia sp!,{d8-d15}
	ldmia sp!,{r4-r11}
	ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	# q<N>#lo/#hi is this file's shorthand for the two D-register halves of
	# a Q register; map it to d<2N>/d<2N+1> here.
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
	s/\bret\b/bx lr/g	or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT;