VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/ec/asm/ecp_nistz256-armv4.pl@ 91772

Last change on this file since 91772 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

1#! /usr/bin/env perl
2# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# ECP_NISTZ256 module for ARMv4.
18#
19# October 2014.
20#
21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22# http://eprint.iacr.org/2013/816. In the process of adaptation
23# original .c module was made 32-bit savvy in order to make this
24# implementation possible.
25#
26# with/without -DECP_NISTZ256_ASM
27# Cortex-A8 +53-170%
28# Cortex-A9 +76-205%
29# Cortex-A15 +100-316%
30# Snapdragon S4 +66-187%
31#
32# Ranges denote minimum and maximum improvement coefficients depending
33# on benchmark. Lower coefficients are for ECDSA sign, server-side
34# operation. Keep in mind that +200% means 3x improvement.
35
36$flavour = shift;
37if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
38else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
39
40if ($flavour && $flavour ne "void") {
41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
43 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
44 die "can't locate arm-xlate.pl";
45
46 open STDOUT,"| \"$^X\" $xlate $flavour $output";
47} else {
48 open STDOUT,">$output";
49}
50
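# The block above follows the usual perlasm calling convention: the first
# argument is the assembler "flavour" (e.g. linux32 or ios32, or "void" for
# raw output) and the last argument ending in a file extension is the output
# file, so a typical build step looks roughly like (paths are illustrative):
#
#	perl ecp_nistz256-armv4.pl linux32 ecp_nistz256-armv4.S
#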
51$code.=<<___;
52#include "arm_arch.h"
53
54.text
55#if defined(__thumb2__)
56.syntax unified
57.thumb
58#else
59.code 32
60#endif
61___
62########################################################################
63# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
64#
65$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
66open TABLE,"<ecp_nistz256_table.c" or
67open TABLE,"<${dir}../ecp_nistz256_table.c" or
68die "failed to open ecp_nistz256_table.c:",$!;
69
70use integer;
71
72foreach(<TABLE>) {
73 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
74}
75close TABLE;
76
77# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
78# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
79# the number of elements.
80die "insane number of elements" if ($#arr != 64*16*37-1);
81
82$code.=<<___;
83.globl ecp_nistz256_precomputed
84.type ecp_nistz256_precomputed,%object
85.align 12
86ecp_nistz256_precomputed:
87___
88########################################################################
89# this conversion smashes P256_POINT_AFFINE by individual bytes with
90# 64 byte interval, similar to
91# 1111222233334444
92# 1234123412341234
93for(1..37) {
94 @tbl = splice(@arr,0,64*16);
95 for($i=0;$i<64;$i++) {
96 undef @line;
97 for($j=0;$j<64;$j++) {
98 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
99 }
100 $code.=".byte\t";
101 $code.=join(',',map { sprintf "0x%02x",$_} @line);
102 $code.="\n";
103 }
104}
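# Illustrative sketch (disabled; the helper name is made up): in the layout
# just emitted each 64*64-byte block holds 64 points byte-sliced, i.e. byte
# $b of table slot $j sits at offset $b*64+$j.  That is why the gather_w7
# routine further down can extract one point with 64 single-byte loads at a
# 64-byte stride, masking the result to zero when the caller asks for
# index 0 (the point at infinity).
if (0) {
	my $gather_w7 = sub {
		my ($block,$index) = @_;	# $block: one 4096-byte block, $index: 0 = zeros, else 1-based slot
		return "\0"x64 if $index == 0;
		join('', map { substr($block,$_*64+$index-1,1) } (0..63));
	};
}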
105$code.=<<___;
106.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
107.align 5
108.LRR: @ 2^512 mod P precomputed for NIST P256 polynomial
109.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
110.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
111.Lone:
112.long 1,0,0,0,0,0,0,0
113.asciz "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
114.align 6
115___
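# Reference sketch (disabled, assumes Math::BigInt is available): .LRR above
# is RR = 2^512 mod P, so ecp_nistz256_to_mont(r,a) = mul_mont(a,RR) =
# a*2^256 mod P enters the Montgomery domain, while ecp_nistz256_from_mont
# multiplies by .Lone and so divides by 2^256 mod P on the way out.
if (0) {
	no integer;
	require Math::BigInt;
	my $p  = Math::BigInt->from_hex("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my $rr = Math::BigInt->new(2)->bpow(512)->bmod($p);
	my @limbs = map { hex } qw(00000003 00000000 ffffffff fffffffb
	                           fffffffe ffffffff fffffffd 00000004);
	my $val = Math::BigInt->bzero();
	$val->blsft(32)->badd($limbs[$_]) for reverse(0..7);
	die ".LRR does not match 2^512 mod P" unless $val == $rr;
}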
116
117########################################################################
118# common register layout, note that $t2 is link register, so that if
119# internal subroutine uses $t2, then it has to offload lr...
120
121($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
122 map("r$_",(0..12,14));
123($t0,$t3)=($ff,$a_ptr);
124
125$code.=<<___;
126@ void ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
127.globl ecp_nistz256_to_mont
128.type ecp_nistz256_to_mont,%function
129ecp_nistz256_to_mont:
130 adr $b_ptr,.LRR
131 b .Lecp_nistz256_mul_mont
132.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
133
134@ void ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
135.globl ecp_nistz256_from_mont
136.type ecp_nistz256_from_mont,%function
137ecp_nistz256_from_mont:
138 adr $b_ptr,.Lone
139 b .Lecp_nistz256_mul_mont
140.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
141
142@ void ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
143.globl ecp_nistz256_mul_by_2
144.type ecp_nistz256_mul_by_2,%function
145.align 4
146ecp_nistz256_mul_by_2:
147 stmdb sp!,{r4-r12,lr}
148 bl __ecp_nistz256_mul_by_2
149#if __ARM_ARCH__>=5 || !defined(__thumb__)
150 ldmia sp!,{r4-r12,pc}
151#else
152 ldmia sp!,{r4-r12,lr}
153 bx lr @ interoperable with Thumb ISA:-)
154#endif
155.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
156
157.type __ecp_nistz256_mul_by_2,%function
158.align 4
159__ecp_nistz256_mul_by_2:
160 ldr $a0,[$a_ptr,#0]
161 ldr $a1,[$a_ptr,#4]
162 ldr $a2,[$a_ptr,#8]
163 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself
164 ldr $a3,[$a_ptr,#12]
165 adcs $a1,$a1,$a1
166 ldr $a4,[$a_ptr,#16]
167 adcs $a2,$a2,$a2
168 ldr $a5,[$a_ptr,#20]
169 adcs $a3,$a3,$a3
170 ldr $a6,[$a_ptr,#24]
171 adcs $a4,$a4,$a4
172 ldr $a7,[$a_ptr,#28]
173 adcs $a5,$a5,$a5
174 adcs $a6,$a6,$a6
175 mov $ff,#0
176 adcs $a7,$a7,$a7
177 adc $ff,$ff,#0
178
179 b .Lreduce_by_sub
180.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
181
182@ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
183@ const BN_ULONG r2[8]);
184.globl ecp_nistz256_add
185.type ecp_nistz256_add,%function
186.align 4
187ecp_nistz256_add:
188 stmdb sp!,{r4-r12,lr}
189 bl __ecp_nistz256_add
190#if __ARM_ARCH__>=5 || !defined(__thumb__)
191 ldmia sp!,{r4-r12,pc}
192#else
193 ldmia sp!,{r4-r12,lr}
194 bx lr @ interoperable with Thumb ISA:-)
195#endif
196.size ecp_nistz256_add,.-ecp_nistz256_add
197
198.type __ecp_nistz256_add,%function
199.align 4
200__ecp_nistz256_add:
201 str lr,[sp,#-4]! @ push lr
202
203 ldr $a0,[$a_ptr,#0]
204 ldr $a1,[$a_ptr,#4]
205 ldr $a2,[$a_ptr,#8]
206 ldr $a3,[$a_ptr,#12]
207 ldr $a4,[$a_ptr,#16]
208 ldr $t0,[$b_ptr,#0]
209 ldr $a5,[$a_ptr,#20]
210 ldr $t1,[$b_ptr,#4]
211 ldr $a6,[$a_ptr,#24]
212 ldr $t2,[$b_ptr,#8]
213 ldr $a7,[$a_ptr,#28]
214 ldr $t3,[$b_ptr,#12]
215 adds $a0,$a0,$t0
216 ldr $t0,[$b_ptr,#16]
217 adcs $a1,$a1,$t1
218 ldr $t1,[$b_ptr,#20]
219 adcs $a2,$a2,$t2
220 ldr $t2,[$b_ptr,#24]
221 adcs $a3,$a3,$t3
222 ldr $t3,[$b_ptr,#28]
223 adcs $a4,$a4,$t0
224 adcs $a5,$a5,$t1
225 adcs $a6,$a6,$t2
226 mov $ff,#0
227 adcs $a7,$a7,$t3
228 adc $ff,$ff,#0
229 ldr lr,[sp],#4 @ pop lr
230
231.Lreduce_by_sub:
232
233 @ if a+b >= modulus, subtract modulus.
234 @
235 @ But since comparison implies subtraction, we subtract
236 @ modulus and then add it back if subtraction borrowed.
237
238 subs $a0,$a0,#-1
239 sbcs $a1,$a1,#-1
240 sbcs $a2,$a2,#-1
241 sbcs $a3,$a3,#0
242 sbcs $a4,$a4,#0
243 sbcs $a5,$a5,#0
244 sbcs $a6,$a6,#1
245 sbcs $a7,$a7,#-1
246 sbc $ff,$ff,#0
247
248 @ Note that because mod has special form, i.e. consists of
249 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
250 @ using value of borrow as a whole or extracting single bit.
251 @ Follow $ff register...
252
253 adds $a0,$a0,$ff @ add synthesized modulus
254 adcs $a1,$a1,$ff
255 str $a0,[$r_ptr,#0]
256 adcs $a2,$a2,$ff
257 str $a1,[$r_ptr,#4]
258 adcs $a3,$a3,#0
259 str $a2,[$r_ptr,#8]
260 adcs $a4,$a4,#0
261 str $a3,[$r_ptr,#12]
262 adcs $a5,$a5,#0
263 str $a4,[$r_ptr,#16]
264 adcs $a6,$a6,$ff,lsr#31
265 str $a5,[$r_ptr,#20]
266 adcs $a7,$a7,$ff
267 str $a6,[$r_ptr,#24]
268 str $a7,[$r_ptr,#28]
269
270 mov pc,lr
271.size __ecp_nistz256_add,.-__ecp_nistz256_add
272
273@ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
274.globl ecp_nistz256_mul_by_3
275.type ecp_nistz256_mul_by_3,%function
276.align 4
277ecp_nistz256_mul_by_3:
278 stmdb sp!,{r4-r12,lr}
279 bl __ecp_nistz256_mul_by_3
280#if __ARM_ARCH__>=5 || !defined(__thumb__)
281 ldmia sp!,{r4-r12,pc}
282#else
283 ldmia sp!,{r4-r12,lr}
284 bx lr @ interoperable with Thumb ISA:-)
285#endif
286.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
287
288.type __ecp_nistz256_mul_by_3,%function
289.align 4
290__ecp_nistz256_mul_by_3:
291 str lr,[sp,#-4]! @ push lr
292
293 @ As multiplication by 3 is performed as 2*n+n, below are inline
294 @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
295 @ corresponding subroutines for details.
296
297 ldr $a0,[$a_ptr,#0]
298 ldr $a1,[$a_ptr,#4]
299 ldr $a2,[$a_ptr,#8]
300 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
301 ldr $a3,[$a_ptr,#12]
302 adcs $a1,$a1,$a1
303 ldr $a4,[$a_ptr,#16]
304 adcs $a2,$a2,$a2
305 ldr $a5,[$a_ptr,#20]
306 adcs $a3,$a3,$a3
307 ldr $a6,[$a_ptr,#24]
308 adcs $a4,$a4,$a4
309 ldr $a7,[$a_ptr,#28]
310 adcs $a5,$a5,$a5
311 adcs $a6,$a6,$a6
312 mov $ff,#0
313 adcs $a7,$a7,$a7
314 adc $ff,$ff,#0
315
316 subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores
317 sbcs $a1,$a1,#-1
318 sbcs $a2,$a2,#-1
319 sbcs $a3,$a3,#0
320 sbcs $a4,$a4,#0
321 sbcs $a5,$a5,#0
322 sbcs $a6,$a6,#1
323 sbcs $a7,$a7,#-1
324 sbc $ff,$ff,#0
325
326 adds $a0,$a0,$ff @ add synthesized modulus
327 adcs $a1,$a1,$ff
328 adcs $a2,$a2,$ff
329 adcs $a3,$a3,#0
330 adcs $a4,$a4,#0
331 ldr $b_ptr,[$a_ptr,#0]
332 adcs $a5,$a5,#0
333 ldr $t1,[$a_ptr,#4]
334 adcs $a6,$a6,$ff,lsr#31
335 ldr $t2,[$a_ptr,#8]
336 adc $a7,$a7,$ff
337
338 ldr $t0,[$a_ptr,#12]
339 adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7]
340 ldr $b_ptr,[$a_ptr,#16]
341 adcs $a1,$a1,$t1
342 ldr $t1,[$a_ptr,#20]
343 adcs $a2,$a2,$t2
344 ldr $t2,[$a_ptr,#24]
345 adcs $a3,$a3,$t0
346 ldr $t3,[$a_ptr,#28]
347 adcs $a4,$a4,$b_ptr
348 adcs $a5,$a5,$t1
349 adcs $a6,$a6,$t2
350 mov $ff,#0
351 adcs $a7,$a7,$t3
352 adc $ff,$ff,#0
353 ldr lr,[sp],#4 @ pop lr
354
355 b .Lreduce_by_sub
356.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
357
358@ void ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
359.globl ecp_nistz256_div_by_2
360.type ecp_nistz256_div_by_2,%function
361.align 4
362ecp_nistz256_div_by_2:
363 stmdb sp!,{r4-r12,lr}
364 bl __ecp_nistz256_div_by_2
365#if __ARM_ARCH__>=5 || !defined(__thumb__)
366 ldmia sp!,{r4-r12,pc}
367#else
368 ldmia sp!,{r4-r12,lr}
369 bx lr @ interoperable with Thumb ISA:-)
370#endif
371.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
372
373.type __ecp_nistz256_div_by_2,%function
374.align 4
375__ecp_nistz256_div_by_2:
376 @ ret = (a is odd ? a+mod : a) >> 1
377
378 ldr $a0,[$a_ptr,#0]
379 ldr $a1,[$a_ptr,#4]
380 ldr $a2,[$a_ptr,#8]
381 mov $ff,$a0,lsl#31 @ place least significant bit to most
382 @ significant position, now arithmetic
383 @ right shift by 31 will produce -1 or
384 @ 0, while logical right shift 1 or 0,
385 @ this is how modulus is conditionally
386 @ synthesized in this case...
387 ldr $a3,[$a_ptr,#12]
388 adds $a0,$a0,$ff,asr#31
389 ldr $a4,[$a_ptr,#16]
390 adcs $a1,$a1,$ff,asr#31
391 ldr $a5,[$a_ptr,#20]
392 adcs $a2,$a2,$ff,asr#31
393 ldr $a6,[$a_ptr,#24]
394 adcs $a3,$a3,#0
395 ldr $a7,[$a_ptr,#28]
396 adcs $a4,$a4,#0
397 mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early
398 @ because it doesn't affect flags
399 adcs $a5,$a5,#0
400 orr $a0,$a0,$a1,lsl#31
401 adcs $a6,$a6,$ff,lsr#31
402 mov $b_ptr,#0
403 adcs $a7,$a7,$ff,asr#31
404 mov $a1,$a1,lsr#1
405 adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition
406
407 orr $a1,$a1,$a2,lsl#31
408 mov $a2,$a2,lsr#1
409 str $a0,[$r_ptr,#0]
410 orr $a2,$a2,$a3,lsl#31
411 mov $a3,$a3,lsr#1
412 str $a1,[$r_ptr,#4]
413 orr $a3,$a3,$a4,lsl#31
414 mov $a4,$a4,lsr#1
415 str $a2,[$r_ptr,#8]
416 orr $a4,$a4,$a5,lsl#31
417 mov $a5,$a5,lsr#1
418 str $a3,[$r_ptr,#12]
419 orr $a5,$a5,$a6,lsl#31
420 mov $a6,$a6,lsr#1
421 str $a4,[$r_ptr,#16]
422 orr $a6,$a6,$a7,lsl#31
423 mov $a7,$a7,lsr#1
424 str $a5,[$r_ptr,#20]
425 orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit
426 str $a6,[$r_ptr,#24]
427 str $a7,[$r_ptr,#28]
428
429 mov pc,lr
430.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
431
432@ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
433@ const BN_ULONG r2[8]);
434.globl ecp_nistz256_sub
435.type ecp_nistz256_sub,%function
436.align 4
437ecp_nistz256_sub:
438 stmdb sp!,{r4-r12,lr}
439 bl __ecp_nistz256_sub
440#if __ARM_ARCH__>=5 || !defined(__thumb__)
441 ldmia sp!,{r4-r12,pc}
442#else
443 ldmia sp!,{r4-r12,lr}
444 bx lr @ interoperable with Thumb ISA:-)
445#endif
446.size ecp_nistz256_sub,.-ecp_nistz256_sub
447
448.type __ecp_nistz256_sub,%function
449.align 4
450__ecp_nistz256_sub:
451 str lr,[sp,#-4]! @ push lr
452
453 ldr $a0,[$a_ptr,#0]
454 ldr $a1,[$a_ptr,#4]
455 ldr $a2,[$a_ptr,#8]
456 ldr $a3,[$a_ptr,#12]
457 ldr $a4,[$a_ptr,#16]
458 ldr $t0,[$b_ptr,#0]
459 ldr $a5,[$a_ptr,#20]
460 ldr $t1,[$b_ptr,#4]
461 ldr $a6,[$a_ptr,#24]
462 ldr $t2,[$b_ptr,#8]
463 ldr $a7,[$a_ptr,#28]
464 ldr $t3,[$b_ptr,#12]
465 subs $a0,$a0,$t0
466 ldr $t0,[$b_ptr,#16]
467 sbcs $a1,$a1,$t1
468 ldr $t1,[$b_ptr,#20]
469 sbcs $a2,$a2,$t2
470 ldr $t2,[$b_ptr,#24]
471 sbcs $a3,$a3,$t3
472 ldr $t3,[$b_ptr,#28]
473 sbcs $a4,$a4,$t0
474 sbcs $a5,$a5,$t1
475 sbcs $a6,$a6,$t2
476 sbcs $a7,$a7,$t3
477 sbc $ff,$ff,$ff @ broadcast borrow bit
478 ldr lr,[sp],#4 @ pop lr
479
480.Lreduce_by_add:
481
482 @ if a-b borrows, add modulus.
483 @
484 @ Note that because mod has special form, i.e. consists of
485 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
486 @ broadcasting borrow bit to a register, $ff, and using it as
487 @ a whole or extracting single bit.
488
489 adds $a0,$a0,$ff @ add synthesized modulus
490 adcs $a1,$a1,$ff
491 str $a0,[$r_ptr,#0]
492 adcs $a2,$a2,$ff
493 str $a1,[$r_ptr,#4]
494 adcs $a3,$a3,#0
495 str $a2,[$r_ptr,#8]
496 adcs $a4,$a4,#0
497 str $a3,[$r_ptr,#12]
498 adcs $a5,$a5,#0
499 str $a4,[$r_ptr,#16]
500 adcs $a6,$a6,$ff,lsr#31
501 str $a5,[$r_ptr,#20]
502 adcs $a7,$a7,$ff
503 str $a6,[$r_ptr,#24]
504 str $a7,[$r_ptr,#28]
505
506 mov pc,lr
507.size __ecp_nistz256_sub,.-__ecp_nistz256_sub
508
509@ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
510.globl ecp_nistz256_neg
511.type ecp_nistz256_neg,%function
512.align 4
513ecp_nistz256_neg:
514 stmdb sp!,{r4-r12,lr}
515 bl __ecp_nistz256_neg
516#if __ARM_ARCH__>=5 || !defined(__thumb__)
517 ldmia sp!,{r4-r12,pc}
518#else
519 ldmia sp!,{r4-r12,lr}
520 bx lr @ interoperable with Thumb ISA:-)
521#endif
522.size ecp_nistz256_neg,.-ecp_nistz256_neg
523
524.type __ecp_nistz256_neg,%function
525.align 4
526__ecp_nistz256_neg:
527 ldr $a0,[$a_ptr,#0]
528 eor $ff,$ff,$ff
529 ldr $a1,[$a_ptr,#4]
530 ldr $a2,[$a_ptr,#8]
531 subs $a0,$ff,$a0
532 ldr $a3,[$a_ptr,#12]
533 sbcs $a1,$ff,$a1
534 ldr $a4,[$a_ptr,#16]
535 sbcs $a2,$ff,$a2
536 ldr $a5,[$a_ptr,#20]
537 sbcs $a3,$ff,$a3
538 ldr $a6,[$a_ptr,#24]
539 sbcs $a4,$ff,$a4
540 ldr $a7,[$a_ptr,#28]
541 sbcs $a5,$ff,$a5
542 sbcs $a6,$ff,$a6
543 sbcs $a7,$ff,$a7
544 sbc $ff,$ff,$ff
545
546 b .Lreduce_by_add
547.size __ecp_nistz256_neg,.-__ecp_nistz256_neg
548___
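# Illustrative limb-level sketch (disabled, assumes a 64-bit perl; the helper
# name is made up) of the .Lreduce_by_sub/.Lreduce_by_add trick used above:
# because P's 32-bit limbs are only 0xffffffff, 0x00000001 and 0x00000000,
# the value that may have to be added back can be synthesized from the borrow
# mask itself ($ff = 0 or 0xffffffff) as ($ff,$ff,$ff,0,0,0,$ff>>31,$ff)
# instead of being loaded, so no data-dependent branch is needed.
if (0) {
	no integer;
	my $cond_add_p = sub {
		my ($r,$ff) = @_;		# $r: ref to 8 little-endian limbs, $ff: 0 or 0xffffffff
		my @m = ($ff,$ff,$ff,0,0,0,$ff>>31,$ff);
		my $c = 0;
		for my $i (0..7) {
			my $s = $r->[$i] + $m[$i] + $c;
			$r->[$i] = $s & 0xffffffff;
			$c = $s >> 32;
		}
		return $c;			# carry out, discarded by the code above
	};
}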
549{
550my @acc=map("r$_",(3..11));
551my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
552
553$code.=<<___;
554@ void ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
555.globl ecp_nistz256_sqr_mont
556.type ecp_nistz256_sqr_mont,%function
557.align 4
558ecp_nistz256_sqr_mont:
559 mov $b_ptr,$a_ptr
560 b .Lecp_nistz256_mul_mont
561.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
562
563@ void ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
564@ const BN_ULONG r2[8]);
565.globl ecp_nistz256_mul_mont
566.type ecp_nistz256_mul_mont,%function
567.align 4
568ecp_nistz256_mul_mont:
569.Lecp_nistz256_mul_mont:
570 stmdb sp!,{r4-r12,lr}
571 bl __ecp_nistz256_mul_mont
572#if __ARM_ARCH__>=5 || !defined(__thumb__)
573 ldmia sp!,{r4-r12,pc}
574#else
575 ldmia sp!,{r4-r12,lr}
576 bx lr @ interoperable with Thumb ISA:-)
577#endif
578.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
579
580.type __ecp_nistz256_mul_mont,%function
581.align 4
582__ecp_nistz256_mul_mont:
583 stmdb sp!,{r0-r2,lr} @ make a copy of arguments too
584
585 ldr $bj,[$b_ptr,#0] @ b[0]
586 ldmia $a_ptr,{@acc[1]-@acc[8]}
587
588 umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0]
589 stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so
590 @ that it can be addressed
591 @ without spending register
592 @ on address
593 umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0]
594 umull @acc[2],$t1,@acc[3],$bj
595 adds @acc[1],@acc[1],$t3 @ accumulate high part of mult
596 umull @acc[3],$t2,@acc[4],$bj
597 adcs @acc[2],@acc[2],$t0
598 umull @acc[4],$t3,@acc[5],$bj
599 adcs @acc[3],@acc[3],$t1
600 umull @acc[5],$t0,@acc[6],$bj
601 adcs @acc[4],@acc[4],$t2
602 umull @acc[6],$t1,@acc[7],$bj
603 adcs @acc[5],@acc[5],$t3
604 umull @acc[7],$t2,@acc[8],$bj
605 adcs @acc[6],@acc[6],$t0
606 adcs @acc[7],@acc[7],$t1
607 eor $t3,$t3,$t3 @ first overflow bit is zero
608 adc @acc[8],$t2,#0
609___
610for(my $i=1;$i<8;$i++) {
611my $t4=@acc[0];
612
613 # Reduction iteration is normally performed by accumulating
614 # result of multiplication of modulus by "magic" digit [and
615 # omitting least significant word, which is guaranteed to
616 # be 0], but thanks to special form of modulus and "magic"
617 # digit being equal to least significant word, it can be
618 # performed with additions and subtractions alone. Indeed:
619 #
620 # ffff.0001.0000.0000.0000.ffff.ffff.ffff
621 # * abcd
622 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
623 #
624 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
625 # rewrite above as:
626 #
627 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
628 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
629 # - abcd.0000.0000.0000.0000.0000.0000.abcd
630 #
631 # or marking redundant operations:
632 #
633 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
634 # + abcd.0000.abcd.0000.0000.abcd.----.----.----
635 # - abcd.----.----.----.----.----.----.----
636
637$code.=<<___;
638 @ multiplication-less reduction $i
639 adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0]
640 ldr $bj,[sp,#40] @ restore b_ptr
641 adcs @acc[4],@acc[4],#0 @ r[4]+=0
642 adcs @acc[5],@acc[5],#0 @ r[5]+=0
643 adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0]
644 ldr $t1,[sp,#0] @ load a[0]
645 adcs @acc[7],@acc[7],#0 @ r[7]+=0
646 ldr $bj,[$bj,#4*$i] @ load b[i]
647 adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0]
648 eor $t0,$t0,$t0
649 adc $t3,$t3,#0 @ overflow bit
650 subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0]
651 ldr $t2,[sp,#4] @ a[1]
652 sbcs @acc[8],@acc[8],#0 @ r[8]-=0
653 umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i]
654 eor $t1,$t1,$t1
655 sbc @acc[0],$t3,#0 @ overflow bit, keep in mind
656 @ that net result is
657 @ addition of a value which
658 @ makes underflow impossible
659
660 ldr $t3,[sp,#8] @ a[2]
661 umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i]
662 str @acc[0],[sp,#36] @ temporarily offload overflow
663 eor $t2,$t2,$t2
664 ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0]
665 umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i]
666 eor $t3,$t3,$t3
667 adds @acc[2],@acc[2],$t0 @ accumulate high part of mult
668 ldr $t0,[sp,#16] @ a[4]
669 umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i]
670 eor $t4,$t4,$t4
671 adcs @acc[3],@acc[3],$t1
672 ldr $t1,[sp,#20] @ a[5]
673 umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i]
674 eor $t0,$t0,$t0
675 adcs @acc[4],@acc[4],$t2
676 ldr $t2,[sp,#24] @ a[6]
677 umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i]
678 eor $t1,$t1,$t1
679 adcs @acc[5],@acc[5],$t3
680 ldr $t3,[sp,#28] @ a[7]
681 umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i]
682 eor $t2,$t2,$t2
683 adcs @acc[6],@acc[6],$t4
684 ldr @acc[0],[sp,#36] @ restore overflow bit
685 umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i]
686 eor $t3,$t3,$t3
687 adcs @acc[7],@acc[7],$t0
688 adcs @acc[8],@acc[8],$t1
689 adcs @acc[0],$acc[0],$t2
690 adc $t3,$t3,#0 @ new overflow bit
691___
692 push(@acc,shift(@acc)); # rotate registers, so that
693 # "r[i]" becomes r[i]
694}
695$code.=<<___;
696 @ last multiplication-less reduction
697 adds @acc[3],@acc[3],@acc[0]
698 ldr $r_ptr,[sp,#32] @ restore r_ptr
699 adcs @acc[4],@acc[4],#0
700 adcs @acc[5],@acc[5],#0
701 adcs @acc[6],@acc[6],@acc[0]
702 adcs @acc[7],@acc[7],#0
703 adcs @acc[8],@acc[8],@acc[0]
704 adc $t3,$t3,#0
705 subs @acc[7],@acc[7],@acc[0]
706 sbcs @acc[8],@acc[8],#0
707 sbc @acc[0],$t3,#0 @ overflow bit
708
709 @ Final step is "if result > mod, subtract mod", but we do it
710 @ "other way around", namely subtract modulus from result
711 @ and if it borrowed, add modulus back.
712
713 adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1
714 adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1
715 adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1
716 sbcs @acc[4],@acc[4],#0
717 sbcs @acc[5],@acc[5],#0
718 sbcs @acc[6],@acc[6],#0
719 sbcs @acc[7],@acc[7],#1
720 adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1
721 ldr lr,[sp,#44] @ restore lr
722 sbc @acc[0],@acc[0],#0 @ broadcast borrow bit
723 add sp,sp,#48
724
725 @ Note that because mod has special form, i.e. consists of
726 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
727 @ broadcasting borrow bit to a register, @acc[0], and using it as
728 @ a whole or extracting single bit.
729
730 adds @acc[1],@acc[1],@acc[0] @ add modulus or zero
731 adcs @acc[2],@acc[2],@acc[0]
732 str @acc[1],[$r_ptr,#0]
733 adcs @acc[3],@acc[3],@acc[0]
734 str @acc[2],[$r_ptr,#4]
735 adcs @acc[4],@acc[4],#0
736 str @acc[3],[$r_ptr,#8]
737 adcs @acc[5],@acc[5],#0
738 str @acc[4],[$r_ptr,#12]
739 adcs @acc[6],@acc[6],#0
740 str @acc[5],[$r_ptr,#16]
741 adcs @acc[7],@acc[7],@acc[0],lsr#31
742 str @acc[6],[$r_ptr,#20]
743 adc @acc[8],@acc[8],@acc[0]
744 str @acc[7],[$r_ptr,#24]
745 str @acc[8],[$r_ptr,#28]
746
747 mov pc,lr
748.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
749___
750}
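# Value-level sketch (disabled, assumes Math::BigInt; the helper name is made
# up) of what __ecp_nistz256_mul_mont computes: eight interleaved "multiply
# by one word of b, then reduce" steps, where each reduction adds q*P for
# q = current low word and drops that (now zero) word.  Since
# P = 2^256 - 2^224 + 2^192 + 2^96 - 1, adding q*P is exactly the shifted
# add/sub pattern in the loop above, and the end result is the Montgomery
# product a*b/2^256 mod P.
if (0) {
	no integer;
	require Math::BigInt;
	my $p = Math::BigInt->from_hex("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my $mul_mont = sub {
		my ($a,$b) = @_;		# Math::BigInt, both < P
		my $t = Math::BigInt->bzero();
		for my $i (0..7) {
			my $bi = $b->copy->brsft(32*$i)->band(0xffffffff);
			$t->badd($a->copy->bmul($bi));
			my $q = $t->copy->band(0xffffffff);
			$t->badd($q->bmul($p))->brsft(32);	# low word is now zero, drop it
		}
		$t->bsub($p) if $t->bcmp($p) >= 0;
		return $t;			# a*b*2^-256 mod P
	};
	# self-check: mul_mont(a,b)*2^256 == a*b (mod P)
	my ($a,$b) = map { Math::BigInt->new($_) } (12345,67890);
	my $m = $mul_mont->($a,$b);
	die "mul_mont model broken"
	    unless $m->copy->blsft(256)->bmod($p) == $a->copy->bmul($b)->bmod($p);
}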
751
752{
753my ($out,$inp,$index,$mask)=map("r$_",(0..3));
754$code.=<<___;
755@ void ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
756@ int r2);
757.globl ecp_nistz256_scatter_w5
758.type ecp_nistz256_scatter_w5,%function
759.align 5
760ecp_nistz256_scatter_w5:
761 stmdb sp!,{r4-r11}
762
763 add $out,$out,$index,lsl#2
764
765 ldmia $inp!,{r4-r11} @ X
766 str r4,[$out,#64*0-4]
767 str r5,[$out,#64*1-4]
768 str r6,[$out,#64*2-4]
769 str r7,[$out,#64*3-4]
770 str r8,[$out,#64*4-4]
771 str r9,[$out,#64*5-4]
772 str r10,[$out,#64*6-4]
773 str r11,[$out,#64*7-4]
774 add $out,$out,#64*8
775
776 ldmia $inp!,{r4-r11} @ Y
777 str r4,[$out,#64*0-4]
778 str r5,[$out,#64*1-4]
779 str r6,[$out,#64*2-4]
780 str r7,[$out,#64*3-4]
781 str r8,[$out,#64*4-4]
782 str r9,[$out,#64*5-4]
783 str r10,[$out,#64*6-4]
784 str r11,[$out,#64*7-4]
785 add $out,$out,#64*8
786
787 ldmia $inp,{r4-r11} @ Z
788 str r4,[$out,#64*0-4]
789 str r5,[$out,#64*1-4]
790 str r6,[$out,#64*2-4]
791 str r7,[$out,#64*3-4]
792 str r8,[$out,#64*4-4]
793 str r9,[$out,#64*5-4]
794 str r10,[$out,#64*6-4]
795 str r11,[$out,#64*7-4]
796
797 ldmia sp!,{r4-r11}
798#if __ARM_ARCH__>=5 || defined(__thumb__)
799 bx lr
800#else
801 mov pc,lr
802#endif
803.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
804
805@ void ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
806@ int r2);
807.globl ecp_nistz256_gather_w5
808.type ecp_nistz256_gather_w5,%function
809.align 5
810ecp_nistz256_gather_w5:
811 stmdb sp!,{r4-r11}
812
813 cmp $index,#0
814 mov $mask,#0
815#ifdef __thumb2__
816 itt ne
817#endif
818 subne $index,$index,#1
819 movne $mask,#-1
820 add $inp,$inp,$index,lsl#2
821
822 ldr r4,[$inp,#64*0]
823 ldr r5,[$inp,#64*1]
824 ldr r6,[$inp,#64*2]
825 and r4,r4,$mask
826 ldr r7,[$inp,#64*3]
827 and r5,r5,$mask
828 ldr r8,[$inp,#64*4]
829 and r6,r6,$mask
830 ldr r9,[$inp,#64*5]
831 and r7,r7,$mask
832 ldr r10,[$inp,#64*6]
833 and r8,r8,$mask
834 ldr r11,[$inp,#64*7]
835 add $inp,$inp,#64*8
836 and r9,r9,$mask
837 and r10,r10,$mask
838 and r11,r11,$mask
839 stmia $out!,{r4-r11} @ X
840
841 ldr r4,[$inp,#64*0]
842 ldr r5,[$inp,#64*1]
843 ldr r6,[$inp,#64*2]
844 and r4,r4,$mask
845 ldr r7,[$inp,#64*3]
846 and r5,r5,$mask
847 ldr r8,[$inp,#64*4]
848 and r6,r6,$mask
849 ldr r9,[$inp,#64*5]
850 and r7,r7,$mask
851 ldr r10,[$inp,#64*6]
852 and r8,r8,$mask
853 ldr r11,[$inp,#64*7]
854 add $inp,$inp,#64*8
855 and r9,r9,$mask
856 and r10,r10,$mask
857 and r11,r11,$mask
858 stmia $out!,{r4-r11} @ Y
859
860 ldr r4,[$inp,#64*0]
861 ldr r5,[$inp,#64*1]
862 ldr r6,[$inp,#64*2]
863 and r4,r4,$mask
864 ldr r7,[$inp,#64*3]
865 and r5,r5,$mask
866 ldr r8,[$inp,#64*4]
867 and r6,r6,$mask
868 ldr r9,[$inp,#64*5]
869 and r7,r7,$mask
870 ldr r10,[$inp,#64*6]
871 and r8,r8,$mask
872 ldr r11,[$inp,#64*7]
873 and r9,r9,$mask
874 and r10,r10,$mask
875 and r11,r11,$mask
876 stmia $out,{r4-r11} @ Z
877
878 ldmia sp!,{r4-r11}
879#if __ARM_ARCH__>=5 || defined(__thumb__)
880 bx lr
881#else
882 mov pc,lr
883#endif
884.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
885
886@ void ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
887@ int r2);
888.globl ecp_nistz256_scatter_w7
889.type ecp_nistz256_scatter_w7,%function
890.align 5
891ecp_nistz256_scatter_w7:
892 add $out,$out,$index
893 mov $index,#64/4
894.Loop_scatter_w7:
895 ldr $mask,[$inp],#4
896 subs $index,$index,#1
897 strb $mask,[$out,#64*0]
898 mov $mask,$mask,lsr#8
899 strb $mask,[$out,#64*1]
900 mov $mask,$mask,lsr#8
901 strb $mask,[$out,#64*2]
902 mov $mask,$mask,lsr#8
903 strb $mask,[$out,#64*3]
904 add $out,$out,#64*4
905 bne .Loop_scatter_w7
906
907#if __ARM_ARCH__>=5 || defined(__thumb__)
908 bx lr
909#else
910 mov pc,lr
911#endif
912.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
913
914@ void ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
915@ int r2);
916.globl ecp_nistz256_gather_w7
917.type ecp_nistz256_gather_w7,%function
918.align 5
919ecp_nistz256_gather_w7:
920 stmdb sp!,{r4-r7}
921
922 cmp $index,#0
923 mov $mask,#0
924#ifdef __thumb2__
925 itt ne
926#endif
927 subne $index,$index,#1
928 movne $mask,#-1
929 add $inp,$inp,$index
930 mov $index,#64/4
931 nop
932.Loop_gather_w7:
933 ldrb r4,[$inp,#64*0]
934 subs $index,$index,#1
935 ldrb r5,[$inp,#64*1]
936 ldrb r6,[$inp,#64*2]
937 ldrb r7,[$inp,#64*3]
938 add $inp,$inp,#64*4
939 orr r4,r4,r5,lsl#8
940 orr r4,r4,r6,lsl#16
941 orr r4,r4,r7,lsl#24
942 and r4,r4,$mask
943 str r4,[$out],#4
944 bne .Loop_gather_w7
945
946 ldmia sp!,{r4-r7}
947#if __ARM_ARCH__>=5 || defined(__thumb__)
948 bx lr
949#else
950 mov pc,lr
951#endif
952.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
953___
954}
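# Sketch (disabled; the helper name is made up): scatter_w5/gather_w5 above
# keep a window table of full Jacobian points word-sliced: 32-bit word $w of
# the point in slot $idx-1 lives at byte offset ($idx-1)*4 + $w*64 within
# each 64*8-byte group, and the X, Y and Z groups are laid out back to back.
# An index of 0 makes gather_w5 return all zeros (point at infinity).
if (0) {
	my $gather_w5 = sub {
		my ($table,$idx) = @_;		# $table: raw row bytes, $idx: 0 = zeros, else 1-based slot
		return "\0"x96 if $idx == 0;
		my $out = '';
		for my $group (0..2) {		# X, Y, Z
			for my $w (0..7) {
				$out .= substr($table, $group*512 + $w*64 + ($idx-1)*4, 4);
			}
		}
		return $out;
	};
}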
955if (0) {
956# In comparison to integer-only equivalent of below subroutine:
957#
958# Cortex-A8 +10%
959# Cortex-A9 -10%
960# Snapdragon S4 +5%
961#
962# As not all time is spent in multiplication, overall impact is deemed
963# too low to care about.
964
965my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
966my $mask="q4";
967my $mult="q5";
968my @AxB=map("q$_",(8..15));
969
970my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));
971
972$code.=<<___;
973#if __ARM_ARCH__>=7
974.fpu neon
975
976.globl ecp_nistz256_mul_mont_neon
977.type ecp_nistz256_mul_mont_neon,%function
978.align 5
979ecp_nistz256_mul_mont_neon:
980 mov ip,sp
981 stmdb sp!,{r4-r9}
982 vstmdb sp!,{q4-q5} @ ABI specification says so
983
984 sub $toutptr,sp,#40
985 vld1.32 {${Bi}[0]},[$bptr,:32]!
986 veor $zero,$zero,$zero
987 vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-(
988 vzip.16 $Bi,$zero
989 mov sp,$toutptr @ alloca
990 vmov.i64 $mask,#0xffff
991
992 vmull.u32 @AxB[0],$Bi,${A0}[0]
993 vmull.u32 @AxB[1],$Bi,${A0}[1]
994 vmull.u32 @AxB[2],$Bi,${A1}[0]
995 vmull.u32 @AxB[3],$Bi,${A1}[1]
996 vshr.u64 $temp,@AxB[0]#lo,#16
997 vmull.u32 @AxB[4],$Bi,${A2}[0]
998 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
999 vmull.u32 @AxB[5],$Bi,${A2}[1]
1000 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0]
1001 vmull.u32 @AxB[6],$Bi,${A3}[0]
1002 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
1003 vmull.u32 @AxB[7],$Bi,${A3}[1]
1004___
1005for($i=1;$i<8;$i++) {
1006$code.=<<___;
1007 vld1.32 {${Bi}[0]},[$bptr,:32]!
1008 veor $zero,$zero,$zero
1009 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction
1010 vshl.u64 $mult,@AxB[0],#32
1011 vadd.u64 @AxB[3],@AxB[3],@AxB[0]
1012 vsub.u64 $mult,$mult,@AxB[0]
1013 vzip.16 $Bi,$zero
1014 vadd.u64 @AxB[6],@AxB[6],@AxB[0]
1015 vadd.u64 @AxB[7],@AxB[7],$mult
1016___
1017 push(@AxB,shift(@AxB));
1018$code.=<<___;
1019 vmlal.u32 @AxB[0],$Bi,${A0}[0]
1020 vmlal.u32 @AxB[1],$Bi,${A0}[1]
1021 vmlal.u32 @AxB[2],$Bi,${A1}[0]
1022 vmlal.u32 @AxB[3],$Bi,${A1}[1]
1023 vshr.u64 $temp,@AxB[0]#lo,#16
1024 vmlal.u32 @AxB[4],$Bi,${A2}[0]
1025 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
1026 vmlal.u32 @AxB[5],$Bi,${A2}[1]
1027 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0]
1028 vmlal.u32 @AxB[6],$Bi,${A3}[0]
1029 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
1030 vmull.u32 @AxB[7],$Bi,${A3}[1]
1031___
1032}
1033$code.=<<___;
1034 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction
1035 vshl.u64 $mult,@AxB[0],#32
1036 vadd.u64 @AxB[3],@AxB[3],@AxB[0]
1037 vsub.u64 $mult,$mult,@AxB[0]
1038 vadd.u64 @AxB[6],@AxB[6],@AxB[0]
1039 vadd.u64 @AxB[7],@AxB[7],$mult
1040
1041 vshr.u64 $temp,@AxB[1]#lo,#16 @ convert
1042 vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
1043 vshr.u64 $temp,@AxB[1]#hi,#16
1044 vzip.16 @AxB[1]#lo,@AxB[1]#hi
1045___
1046foreach (2..7) {
1047$code.=<<___;
1048 vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
1049 vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]!
1050 vshr.u64 $temp,@AxB[$_]#lo,#16
1051 vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
1052 vshr.u64 $temp,@AxB[$_]#hi,#16
1053 vzip.16 @AxB[$_]#lo,@AxB[$_]#hi
1054___
1055}
1056$code.=<<___;
1057 vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]!
1058 vst1.32 {$temp},[$toutptr] @ upper 33 bits
1059
1060 ldr r1,[sp,#0]
1061 ldr r2,[sp,#4]
1062 ldr r3,[sp,#8]
1063 subs r1,r1,#-1
1064 ldr r4,[sp,#12]
1065 sbcs r2,r2,#-1
1066 ldr r5,[sp,#16]
1067 sbcs r3,r3,#-1
1068 ldr r6,[sp,#20]
1069 sbcs r4,r4,#0
1070 ldr r7,[sp,#24]
1071 sbcs r5,r5,#0
1072 ldr r8,[sp,#28]
1073 sbcs r6,r6,#0
1074 ldr r9,[sp,#32] @ top-most bit
1075 sbcs r7,r7,#1
1076 sub sp,ip,#40+16
1077 sbcs r8,r8,#-1
1078 sbc r9,r9,#0
1079 vldmia sp!,{q4-q5}
1080
1081 adds r1,r1,r9
1082 adcs r2,r2,r9
1083 str r1,[$rptr,#0]
1084 adcs r3,r3,r9
1085 str r2,[$rptr,#4]
1086 adcs r4,r4,#0
1087 str r3,[$rptr,#8]
1088 adcs r5,r5,#0
1089 str r4,[$rptr,#12]
1090 adcs r6,r6,#0
1091 str r5,[$rptr,#16]
1092 adcs r7,r7,r9,lsr#31
1093 str r6,[$rptr,#20]
1094 adcs r8,r8,r9
1095 str r7,[$rptr,#24]
1096 str r8,[$rptr,#28]
1097
1098 ldmia sp!,{r4-r9}
1099 bx lr
1100.size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
1101#endif
1102___
1103}
1104
1105{{{
1106########################################################################
1107# Below $aN assignment matches order in which 256-bit result appears in
1108# register bank at return from __ecp_nistz256_mul_mont, so that we can
1109# skip over reloading it from memory. This means that below functions
1110# use custom calling sequence accepting 256-bit input in registers,
1111# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
1112#
1113# See their "normal" counterparts for insights on calculations.
1114
1115my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
1116 $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
1117my $ff=$b_ptr;
1118
1119$code.=<<___;
1120.type __ecp_nistz256_sub_from,%function
1121.align 5
1122__ecp_nistz256_sub_from:
1123 str lr,[sp,#-4]! @ push lr
1124
1125 ldr $t0,[$b_ptr,#0]
1126 ldr $t1,[$b_ptr,#4]
1127 ldr $t2,[$b_ptr,#8]
1128 ldr $t3,[$b_ptr,#12]
1129 subs $a0,$a0,$t0
1130 ldr $t0,[$b_ptr,#16]
1131 sbcs $a1,$a1,$t1
1132 ldr $t1,[$b_ptr,#20]
1133 sbcs $a2,$a2,$t2
1134 ldr $t2,[$b_ptr,#24]
1135 sbcs $a3,$a3,$t3
1136 ldr $t3,[$b_ptr,#28]
1137 sbcs $a4,$a4,$t0
1138 sbcs $a5,$a5,$t1
1139 sbcs $a6,$a6,$t2
1140 sbcs $a7,$a7,$t3
1141 sbc $ff,$ff,$ff @ broadcast borrow bit
1142 ldr lr,[sp],#4 @ pop lr
1143
1144 adds $a0,$a0,$ff @ add synthesized modulus
1145 adcs $a1,$a1,$ff
1146 str $a0,[$r_ptr,#0]
1147 adcs $a2,$a2,$ff
1148 str $a1,[$r_ptr,#4]
1149 adcs $a3,$a3,#0
1150 str $a2,[$r_ptr,#8]
1151 adcs $a4,$a4,#0
1152 str $a3,[$r_ptr,#12]
1153 adcs $a5,$a5,#0
1154 str $a4,[$r_ptr,#16]
1155 adcs $a6,$a6,$ff,lsr#31
1156 str $a5,[$r_ptr,#20]
1157 adcs $a7,$a7,$ff
1158 str $a6,[$r_ptr,#24]
1159 str $a7,[$r_ptr,#28]
1160
1161 mov pc,lr
1162.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
1163
1164.type __ecp_nistz256_sub_morf,%function
1165.align 5
1166__ecp_nistz256_sub_morf:
1167 str lr,[sp,#-4]! @ push lr
1168
1169 ldr $t0,[$b_ptr,#0]
1170 ldr $t1,[$b_ptr,#4]
1171 ldr $t2,[$b_ptr,#8]
1172 ldr $t3,[$b_ptr,#12]
1173 subs $a0,$t0,$a0
1174 ldr $t0,[$b_ptr,#16]
1175 sbcs $a1,$t1,$a1
1176 ldr $t1,[$b_ptr,#20]
1177 sbcs $a2,$t2,$a2
1178 ldr $t2,[$b_ptr,#24]
1179 sbcs $a3,$t3,$a3
1180 ldr $t3,[$b_ptr,#28]
1181 sbcs $a4,$t0,$a4
1182 sbcs $a5,$t1,$a5
1183 sbcs $a6,$t2,$a6
1184 sbcs $a7,$t3,$a7
1185 sbc $ff,$ff,$ff @ broadcast borrow bit
1186 ldr lr,[sp],#4 @ pop lr
1187
1188 adds $a0,$a0,$ff @ add synthesized modulus
1189 adcs $a1,$a1,$ff
1190 str $a0,[$r_ptr,#0]
1191 adcs $a2,$a2,$ff
1192 str $a1,[$r_ptr,#4]
1193 adcs $a3,$a3,#0
1194 str $a2,[$r_ptr,#8]
1195 adcs $a4,$a4,#0
1196 str $a3,[$r_ptr,#12]
1197 adcs $a5,$a5,#0
1198 str $a4,[$r_ptr,#16]
1199 adcs $a6,$a6,$ff,lsr#31
1200 str $a5,[$r_ptr,#20]
1201 adcs $a7,$a7,$ff
1202 str $a6,[$r_ptr,#24]
1203 str $a7,[$r_ptr,#28]
1204
1205 mov pc,lr
1206.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
1207
1208.type __ecp_nistz256_add_self,%function
1209.align 4
1210__ecp_nistz256_add_self:
1211 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
1212 adcs $a1,$a1,$a1
1213 adcs $a2,$a2,$a2
1214 adcs $a3,$a3,$a3
1215 adcs $a4,$a4,$a4
1216 adcs $a5,$a5,$a5
1217 adcs $a6,$a6,$a6
1218 mov $ff,#0
1219 adcs $a7,$a7,$a7
1220 adc $ff,$ff,#0
1221
1222 @ if a+b >= modulus, subtract modulus.
1223 @
1224 @ But since comparison implies subtraction, we subtract
1225 @ modulus and then add it back if subtraction borrowed.
1226
1227 subs $a0,$a0,#-1
1228 sbcs $a1,$a1,#-1
1229 sbcs $a2,$a2,#-1
1230 sbcs $a3,$a3,#0
1231 sbcs $a4,$a4,#0
1232 sbcs $a5,$a5,#0
1233 sbcs $a6,$a6,#1
1234 sbcs $a7,$a7,#-1
1235 sbc $ff,$ff,#0
1236
1237 @ Note that because mod has special form, i.e. consists of
1238 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
1239 @ using value of borrow as a whole or extracting single bit.
1240 @ Follow $ff register...
1241
1242 adds $a0,$a0,$ff @ add synthesized modulus
1243 adcs $a1,$a1,$ff
1244 str $a0,[$r_ptr,#0]
1245 adcs $a2,$a2,$ff
1246 str $a1,[$r_ptr,#4]
1247 adcs $a3,$a3,#0
1248 str $a2,[$r_ptr,#8]
1249 adcs $a4,$a4,#0
1250 str $a3,[$r_ptr,#12]
1251 adcs $a5,$a5,#0
1252 str $a4,[$r_ptr,#16]
1253 adcs $a6,$a6,$ff,lsr#31
1254 str $a5,[$r_ptr,#20]
1255 adcs $a7,$a7,$ff
1256 str $a6,[$r_ptr,#24]
1257 str $a7,[$r_ptr,#28]
1258
1259 mov pc,lr
1260.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self
1261
1262___
1263
1264########################################################################
1265# following subroutines are "literal" implementation of those found in
1266# ecp_nistz256.c
1267#
1268########################################################################
1269# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1270#
1271{
1272my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1273# above map() describes stack layout with 5 temporary
1274# 256-bit vectors on top. Then note that we push
1275# starting from r0, which means that we have copy of
1276# input arguments just below these temporary vectors.
1277
1278$code.=<<___;
1279.globl ecp_nistz256_point_double
1280.type ecp_nistz256_point_double,%function
1281.align 5
1282ecp_nistz256_point_double:
1283 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1284 sub sp,sp,#32*5
1285
1286.Lpoint_double_shortcut:
1287 add r3,sp,#$in_x
1288 ldmia $a_ptr!,{r4-r11} @ copy in_x
1289 stmia r3,{r4-r11}
1290
1291 add $r_ptr,sp,#$S
1292 bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
1293
1294 add $b_ptr,$a_ptr,#32
1295 add $a_ptr,$a_ptr,#32
1296 add $r_ptr,sp,#$Zsqr
1297 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
1298
1299 add $a_ptr,sp,#$S
1300 add $b_ptr,sp,#$S
1301 add $r_ptr,sp,#$S
1302 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
1303
1304 ldr $b_ptr,[sp,#32*5+4]
1305 add $a_ptr,$b_ptr,#32
1306 add $b_ptr,$b_ptr,#64
1307 add $r_ptr,sp,#$tmp0
1308 bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
1309
1310 ldr $r_ptr,[sp,#32*5]
1311 add $r_ptr,$r_ptr,#64
1312 bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0);
1313
1314 add $a_ptr,sp,#$in_x
1315 add $b_ptr,sp,#$Zsqr
1316 add $r_ptr,sp,#$M
1317 bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
1318
1319 add $a_ptr,sp,#$in_x
1320 add $b_ptr,sp,#$Zsqr
1321 add $r_ptr,sp,#$Zsqr
1322 bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
1323
1324 add $a_ptr,sp,#$S
1325 add $b_ptr,sp,#$S
1326 add $r_ptr,sp,#$tmp0
1327 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
1328
1329 add $a_ptr,sp,#$Zsqr
1330 add $b_ptr,sp,#$M
1331 add $r_ptr,sp,#$M
1332 bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
1333
1334 ldr $r_ptr,[sp,#32*5]
1335 add $a_ptr,sp,#$tmp0
1336 add $r_ptr,$r_ptr,#32
1337 bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
1338
1339 add $a_ptr,sp,#$M
1340 add $r_ptr,sp,#$M
1341 bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
1342
1343 add $a_ptr,sp,#$in_x
1344 add $b_ptr,sp,#$S
1345 add $r_ptr,sp,#$S
1346 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
1347
1348 add $r_ptr,sp,#$tmp0
1349 bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S);
1350
1351 ldr $r_ptr,[sp,#32*5]
1352 add $a_ptr,sp,#$M
1353 add $b_ptr,sp,#$M
1354 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
1355
1356 add $b_ptr,sp,#$tmp0
1357 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0);
1358
1359 add $b_ptr,sp,#$S
1360 add $r_ptr,sp,#$S
1361 bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x);
1362
1363 add $a_ptr,sp,#$M
1364 add $b_ptr,sp,#$S
1365 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
1366
1367 ldr $r_ptr,[sp,#32*5]
1368 add $b_ptr,$r_ptr,#32
1369 add $r_ptr,$r_ptr,#32
1370 bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y);
1371
1372 add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3"
1373#if __ARM_ARCH__>=5 || !defined(__thumb__)
1374 ldmia sp!,{r4-r12,pc}
1375#else
1376 ldmia sp!,{r4-r12,lr}
1377 bx lr @ interoperable with Thumb ISA:-)
1378#endif
1379.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
1380___
1381}
1382
1383########################################################################
1384# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1385# const P256_POINT *in2);
1386{
1387my ($res_x,$res_y,$res_z,
1388 $in1_x,$in1_y,$in1_z,
1389 $in2_x,$in2_y,$in2_z,
1390 $H,$Hsqr,$R,$Rsqr,$Hcub,
1391 $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
1392my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1393# above map() describes stack layout with 18 temporary
1394# 256-bit vectors on top. Then note that we push
1395# starting from r0, which means that we have copy of
1396# input arguments just below these temporary vectors.
1397# We use three of them for ~in1infty, ~in2infty and
1398# result of check for zero.
1399
1400$code.=<<___;
1401.globl ecp_nistz256_point_add
1402.type ecp_nistz256_point_add,%function
1403.align 5
1404ecp_nistz256_point_add:
1405 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1406 sub sp,sp,#32*18+16
1407
1408 ldmia $b_ptr!,{r4-r11} @ copy in2_x
1409 add r3,sp,#$in2_x
1410 stmia r3!,{r4-r11}
1411 ldmia $b_ptr!,{r4-r11} @ copy in2_y
1412 stmia r3!,{r4-r11}
1413 ldmia $b_ptr,{r4-r11} @ copy in2_z
1414 orr r12,r4,r5
1415 orr r12,r12,r6
1416 orr r12,r12,r7
1417 orr r12,r12,r8
1418 orr r12,r12,r9
1419 orr r12,r12,r10
1420 orr r12,r12,r11
1421 cmp r12,#0
1422#ifdef __thumb2__
1423 it ne
1424#endif
1425 movne r12,#-1
1426 stmia r3,{r4-r11}
1427 str r12,[sp,#32*18+8] @ ~in2infty
1428
1429 ldmia $a_ptr!,{r4-r11} @ copy in1_x
1430 add r3,sp,#$in1_x
1431 stmia r3!,{r4-r11}
1432 ldmia $a_ptr!,{r4-r11} @ copy in1_y
1433 stmia r3!,{r4-r11}
1434 ldmia $a_ptr,{r4-r11} @ copy in1_z
1435 orr r12,r4,r5
1436 orr r12,r12,r6
1437 orr r12,r12,r7
1438 orr r12,r12,r8
1439 orr r12,r12,r9
1440 orr r12,r12,r10
1441 orr r12,r12,r11
1442 cmp r12,#0
1443#ifdef __thumb2__
1444 it ne
1445#endif
1446 movne r12,#-1
1447 stmia r3,{r4-r11}
1448 str r12,[sp,#32*18+4] @ ~in1infty
1449
1450 add $a_ptr,sp,#$in2_z
1451 add $b_ptr,sp,#$in2_z
1452 add $r_ptr,sp,#$Z2sqr
1453 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z);
1454
1455 add $a_ptr,sp,#$in1_z
1456 add $b_ptr,sp,#$in1_z
1457 add $r_ptr,sp,#$Z1sqr
1458 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
1459
1460 add $a_ptr,sp,#$in2_z
1461 add $b_ptr,sp,#$Z2sqr
1462 add $r_ptr,sp,#$S1
1463 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z);
1464
1465 add $a_ptr,sp,#$in1_z
1466 add $b_ptr,sp,#$Z1sqr
1467 add $r_ptr,sp,#$S2
1468 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
1469
1470 add $a_ptr,sp,#$in1_y
1471 add $b_ptr,sp,#$S1
1472 add $r_ptr,sp,#$S1
1473 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y);
1474
1475 add $a_ptr,sp,#$in2_y
1476 add $b_ptr,sp,#$S2
1477 add $r_ptr,sp,#$S2
1478 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
1479
1480 add $b_ptr,sp,#$S1
1481 add $r_ptr,sp,#$R
1482 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1);
1483
1484 orr $a0,$a0,$a1 @ see if result is zero
1485 orr $a2,$a2,$a3
1486 orr $a4,$a4,$a5
1487 orr $a0,$a0,$a2
1488 orr $a4,$a4,$a6
1489 orr $a0,$a0,$a7
1490 add $a_ptr,sp,#$in1_x
1491 orr $a0,$a0,$a4
1492 add $b_ptr,sp,#$Z2sqr
1493 str $a0,[sp,#32*18+12]
1494
1495 add $r_ptr,sp,#$U1
1496 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr);
1497
1498 add $a_ptr,sp,#$in2_x
1499 add $b_ptr,sp,#$Z1sqr
1500 add $r_ptr,sp,#$U2
1501 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr);
1502
1503 add $b_ptr,sp,#$U1
1504 add $r_ptr,sp,#$H
1505 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1);
1506
1507 orr $a0,$a0,$a1 @ see if result is zero
1508 orr $a2,$a2,$a3
1509 orr $a4,$a4,$a5
1510 orr $a0,$a0,$a2
1511 orr $a4,$a4,$a6
1512 orr $a0,$a0,$a7
1513 orr $a0,$a0,$a4 @ ~is_equal(U1,U2)
1514
1515 ldr $t0,[sp,#32*18+4] @ ~in1infty
1516 ldr $t1,[sp,#32*18+8] @ ~in2infty
1517 ldr $t2,[sp,#32*18+12] @ ~is_equal(S1,S2)
1518 mvn $t0,$t0 @ -1/0 -> 0/-1
1519 mvn $t1,$t1 @ -1/0 -> 0/-1
1520 orr $a0,$a0,$t0
1521 orr $a0,$a0,$t1
1522 orrs $a0,$a0,$t2 @ set flags
1523
1524 @ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
1525 bne .Ladd_proceed
1526
1527.Ladd_double:
1528 ldr $a_ptr,[sp,#32*18+20]
1529 add sp,sp,#32*(18-5)+16 @ difference in frame sizes
1530 b .Lpoint_double_shortcut
1531
1532.align 4
1533.Ladd_proceed:
1534 add $a_ptr,sp,#$R
1535 add $b_ptr,sp,#$R
1536 add $r_ptr,sp,#$Rsqr
1537 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
1538
1539 add $a_ptr,sp,#$H
1540 add $b_ptr,sp,#$in1_z
1541 add $r_ptr,sp,#$res_z
1542 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
1543
1544 add $a_ptr,sp,#$H
1545 add $b_ptr,sp,#$H
1546 add $r_ptr,sp,#$Hsqr
1547 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
1548
1549 add $a_ptr,sp,#$in2_z
1550 add $b_ptr,sp,#$res_z
1551 add $r_ptr,sp,#$res_z
1552 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z);
1553
1554 add $a_ptr,sp,#$H
1555 add $b_ptr,sp,#$Hsqr
1556 add $r_ptr,sp,#$Hcub
1557 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
1558
1559 add $a_ptr,sp,#$Hsqr
1560 add $b_ptr,sp,#$U1
1561 add $r_ptr,sp,#$U2
1562 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr);
1563
1564 add $r_ptr,sp,#$Hsqr
1565 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
1566
1567 add $b_ptr,sp,#$Rsqr
1568 add $r_ptr,sp,#$res_x
1569 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
1570
1571 add $b_ptr,sp,#$Hcub
1572 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
1573
1574 add $b_ptr,sp,#$U2
1575 add $r_ptr,sp,#$res_y
1576 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
1577
1578 add $a_ptr,sp,#$Hcub
1579 add $b_ptr,sp,#$S1
1580 add $r_ptr,sp,#$S2
1581 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub);
1582
1583 add $a_ptr,sp,#$R
1584 add $b_ptr,sp,#$res_y
1585 add $r_ptr,sp,#$res_y
1586 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
1587
1588 add $b_ptr,sp,#$S2
1589 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
1590
1591 ldr r11,[sp,#32*18+4] @ ~in1infty
1592 ldr r12,[sp,#32*18+8] @ ~in2infty
1593 add r1,sp,#$res_x
1594 add r2,sp,#$in2_x
1595 and r10,r11,r12 @ ~in1infty & ~in2infty
1596 mvn r11,r11
1597 add r3,sp,#$in1_x
1598 and r11,r11,r12 @ in1infty & ~in2infty
1599 mvn r12,r12 @ in2infty
1600 ldr $r_ptr,[sp,#32*18+16]
1601___
1602for($i=0;$i<96;$i+=8) { # conditional moves
1603$code.=<<___;
1604 ldmia r1!,{r4-r5} @ res_x
1605 ldmia r2!,{r6-r7} @ in2_x
1606 ldmia r3!,{r8-r9} @ in1_x
1607 and r4,r4,r10 @ ~in1infty & ~in2infty
1608 and r5,r5,r10
1609 and r6,r6,r11 @ in1infty & ~in2infty
1610 and r7,r7,r11
1611 and r8,r8,r12 @ in2infty
1612 and r9,r9,r12
1613 orr r4,r4,r6
1614 orr r5,r5,r7
1615 orr r4,r4,r8
1616 orr r5,r5,r9
1617 stmia $r_ptr!,{r4-r5}
1618___
1619}
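# The copy loop above is a branch-free select: r10, r11 and r12 are all-ones
# or all-zero masks, and each output word is
#	(res & (~in1infty & ~in2infty)) | (in2 & (in1infty & ~in2infty)) | (in1 & in2infty)
# i.e. the result is in2 if only point 1 was at infinity, in1 if point 2 was
# at infinity (which also covers the both-infinite case), and the freshly
# computed sum otherwise.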
1620$code.=<<___;
1621.Ladd_done:
1622 add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3"
1623#if __ARM_ARCH__>=5 || !defined(__thumb__)
1624 ldmia sp!,{r4-r12,pc}
1625#else
1626 ldmia sp!,{r4-r12,lr}
1627 bx lr @ interoperable with Thumb ISA:-)
1628#endif
1629.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1630___
1631}
1632
1633########################################################################
1634# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1635# const P256_POINT_AFFINE *in2);
1636{
1637my ($res_x,$res_y,$res_z,
1638 $in1_x,$in1_y,$in1_z,
1639 $in2_x,$in2_y,
1640 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
1641my $Z1sqr = $S2;
1642# above map() describes stack layout with 15 temporary
1643# 256-bit vectors on top. Then note that we push
1644# starting from r0, which means that we have copy of
1645# input arguments just below these temporary vectors.
1646# We use two of them for ~in1infty, ~in2infty.
1647
1648my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
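# Sketch (disabled, assumes Math::BigInt): @ONE_mont is the number 1 in the
# Montgomery domain, i.e. 2^256 mod P, written as eight signed 32-bit limbs
# so they can be used as immediates in the conditional-move loop below.
if (0) {
	no integer;
	require Math::BigInt;
	my $p   = Math::BigInt->from_hex("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my $one = Math::BigInt->new(2)->bpow(256)->bmod($p);
	my $val = Math::BigInt->bzero();
	$val->blsft(32)->badd($ONE_mont[$_] & 0xffffffff) for reverse(0..7);
	die "ONE_mont mismatch" unless $val == $one;
}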
1649
1650$code.=<<___;
1651.globl ecp_nistz256_point_add_affine
1652.type ecp_nistz256_point_add_affine,%function
1653.align 5
1654ecp_nistz256_point_add_affine:
1655 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1656 sub sp,sp,#32*15
1657
1658 ldmia $a_ptr!,{r4-r11} @ copy in1_x
1659 add r3,sp,#$in1_x
1660 stmia r3!,{r4-r11}
1661 ldmia $a_ptr!,{r4-r11} @ copy in1_y
1662 stmia r3!,{r4-r11}
1663 ldmia $a_ptr,{r4-r11} @ copy in1_z
1664 orr r12,r4,r5
1665 orr r12,r12,r6
1666 orr r12,r12,r7
1667 orr r12,r12,r8
1668 orr r12,r12,r9
1669 orr r12,r12,r10
1670 orr r12,r12,r11
1671 cmp r12,#0
1672#ifdef __thumb2__
1673 it ne
1674#endif
1675 movne r12,#-1
1676 stmia r3,{r4-r11}
1677 str r12,[sp,#32*15+4] @ ~in1infty
1678
1679 ldmia $b_ptr!,{r4-r11} @ copy in2_x
1680 add r3,sp,#$in2_x
1681 orr r12,r4,r5
1682 orr r12,r12,r6
1683 orr r12,r12,r7
1684 orr r12,r12,r8
1685 orr r12,r12,r9
1686 orr r12,r12,r10
1687 orr r12,r12,r11
1688 stmia r3!,{r4-r11}
1689 ldmia $b_ptr!,{r4-r11} @ copy in2_y
1690 orr r12,r12,r4
1691 orr r12,r12,r5
1692 orr r12,r12,r6
1693 orr r12,r12,r7
1694 orr r12,r12,r8
1695 orr r12,r12,r9
1696 orr r12,r12,r10
1697 orr r12,r12,r11
1698 stmia r3!,{r4-r11}
1699 cmp r12,#0
1700#ifdef __thumb2__
1701 it ne
1702#endif
1703 movne r12,#-1
1704 str r12,[sp,#32*15+8] @ ~in2infty
1705
1706 add $a_ptr,sp,#$in1_z
1707 add $b_ptr,sp,#$in1_z
1708 add $r_ptr,sp,#$Z1sqr
1709 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
1710
1711 add $a_ptr,sp,#$Z1sqr
1712 add $b_ptr,sp,#$in2_x
1713 add $r_ptr,sp,#$U2
1714 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x);
1715
1716 add $b_ptr,sp,#$in1_x
1717 add $r_ptr,sp,#$H
1718 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x);
1719
1720 add $a_ptr,sp,#$Z1sqr
1721 add $b_ptr,sp,#$in1_z
1722 add $r_ptr,sp,#$S2
1723 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
1724
1725 add $a_ptr,sp,#$H
1726 add $b_ptr,sp,#$in1_z
1727 add $r_ptr,sp,#$res_z
1728 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
1729
1730 add $a_ptr,sp,#$in2_y
1731 add $b_ptr,sp,#$S2
1732 add $r_ptr,sp,#$S2
1733 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
1734
1735 add $b_ptr,sp,#$in1_y
1736 add $r_ptr,sp,#$R
1737 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y);
1738
1739 add $a_ptr,sp,#$H
1740 add $b_ptr,sp,#$H
1741 add $r_ptr,sp,#$Hsqr
1742 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
1743
1744 add $a_ptr,sp,#$R
1745 add $b_ptr,sp,#$R
1746 add $r_ptr,sp,#$Rsqr
1747 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
1748
1749 add $a_ptr,sp,#$H
1750 add $b_ptr,sp,#$Hsqr
1751 add $r_ptr,sp,#$Hcub
1752 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
1753
1754 add $a_ptr,sp,#$Hsqr
1755 add $b_ptr,sp,#$in1_x
1756 add $r_ptr,sp,#$U2
1757 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr);
1758
1759 add $r_ptr,sp,#$Hsqr
1760 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
1761
1762 add $b_ptr,sp,#$Rsqr
1763 add $r_ptr,sp,#$res_x
1764 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
1765
1766 add $b_ptr,sp,#$Hcub
1767 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
1768
1769 add $b_ptr,sp,#$U2
1770 add $r_ptr,sp,#$res_y
1771 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
1772
1773 add $a_ptr,sp,#$Hcub
1774 add $b_ptr,sp,#$in1_y
1775 add $r_ptr,sp,#$S2
1776 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub);
1777
1778 add $a_ptr,sp,#$R
1779 add $b_ptr,sp,#$res_y
1780 add $r_ptr,sp,#$res_y
1781 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
1782
1783 add $b_ptr,sp,#$S2
1784 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
1785
1786 ldr r11,[sp,#32*15+4] @ ~in1infty
1787 ldr r12,[sp,#32*15+8] @ ~in2infty
1788 add r1,sp,#$res_x
1789 add r2,sp,#$in2_x
1790 and r10,r11,r12 @ ~in1infty & ~in2infty
1791 mvn r11,r11
1792 add r3,sp,#$in1_x
1793 and r11,r11,r12 @ in1infty & ~in2infty
1794 mvn r12,r12 @ in2infty
1795 ldr $r_ptr,[sp,#32*15]
1796___
1797for($i=0;$i<64;$i+=8) { # conditional moves
1798$code.=<<___;
1799 ldmia r1!,{r4-r5} @ res_x
1800 ldmia r2!,{r6-r7} @ in2_x
1801 ldmia r3!,{r8-r9} @ in1_x
1802 and r4,r4,r10 @ ~in1infty & ~in2infty
1803 and r5,r5,r10
1804 and r6,r6,r11 @ in1infty & ~in2infty
1805 and r7,r7,r11
1806 and r8,r8,r12 @ in2infty
1807 and r9,r9,r12
1808 orr r4,r4,r6
1809 orr r5,r5,r7
1810 orr r4,r4,r8
1811 orr r5,r5,r9
1812 stmia $r_ptr!,{r4-r5}
1813___
1814}
1815for(;$i<96;$i+=8) {
1816my $j=($i-64)/4;
1817$code.=<<___;
1818 ldmia r1!,{r4-r5} @ res_z
1819 ldmia r3!,{r8-r9} @ in1_z
1820 and r4,r4,r10
1821 and r5,r5,r10
1822 and r6,r11,#@ONE_mont[$j]
1823 and r7,r11,#@ONE_mont[$j+1]
1824 and r8,r8,r12
1825 and r9,r9,r12
1826 orr r4,r4,r6
1827 orr r5,r5,r7
1828 orr r4,r4,r8
1829 orr r5,r5,r9
1830 stmia $r_ptr!,{r4-r5}
1831___
1832}
1833$code.=<<___;
1834 add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3"
1835#if __ARM_ARCH__>=5 || !defined(__thumb__)
1836 ldmia sp!,{r4-r12,pc}
1837#else
1838 ldmia sp!,{r4-r12,lr}
1839 bx lr @ interoperable with Thumb ISA:-)
1840#endif
1841.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1842___
1843} }}}
1844
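# Post-processing: resolve any `...` expressions and translate the synthetic
# "qN#lo"/"qN#hi" notation used by the (disabled) NEON code into the matching
# d-register halves (e.g. q5#lo -> d10, q5#hi -> d11) before the result goes
# to arm-xlate.pl / stdout.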
1845foreach (split("\n",$code)) {
1846 s/\`([^\`]*)\`/eval $1/geo;
1847
1848 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1849
1850 print $_,"\n";
1851}
1852close STDOUT or die "error closing STDOUT: $!"; # enforce flush