#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# The original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation the
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
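
# A typical invocation (an illustration of common perlasm usage, not
# mandated by this file; the build system supplies the actual arguments)
# would be
#
#	perl ecp_nistz256-armv4.pl linux32 ecp_nistz256-armv4.S
#
# where the flavour is interpreted by arm-xlate.pl; with no file-like
# last argument the generated code goes to stdout.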

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

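# Each TOBN(hi,lo) 64-bit constant is pushed low word first, so @arr
# ends up holding the table as little-endian 32-bit limbs.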
# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
# the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.rodata
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE into individual bytes at a
# 64-byte interval, similar to
#	1111222233334444
#	1234123412341234
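# Each of the 37 blocks holds 64 points of 16 words (64 bytes) each;
# output line $i collects byte $i%4 of word $i/4 from every point in
# the block, so that ecp_nistz_gather_w7 can fetch one byte per point
# at a 64-byte stride.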
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed

.text
.align	5
.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long	1,0,0,0,0,0,0,0
.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	6
___

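# RR = 2^512 mod p: Montgomery multiplication computes a*b/2^256 mod p,
# so mul_mont(a, RR) = a*2^256 mod p converts into Montgomery form,
# while mul_mont(a, 1) = a/2^256 mod p converts back (see .Lone above).
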
########################################################################
# common register layout; note that $t2 is the link register, so an
# internal subroutine that uses $t2 has to offload lr first...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
		map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);

$code.=<<___;
@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
ecp_nistz256_to_mont:
	adr	$b_ptr,.LRR
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

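	@ The NIST P-256 modulus is p = 2^256-2^224+2^192+2^96-1; its
	@ eight 32-bit limbs, least significant first, are
	@   ffffffff ffffffff ffffffff 00000000 00000000 00000000 00000001 ffffffff
	@ hence the immediates -1,-1,-1,0,0,0,1,-1 below.
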
	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3

@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

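	@ This is correct because p is odd: if a is odd, a+p is even and
	@ (a+p)/2 = a/2 mod p; if a is even, the plain shift already is
	@ a/2 mod p. The addition may carry into bit 256, hence the
	@ top-most carry bit handling below.
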
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2

@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub

@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));

$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0-r2,lr}		@ make a copy of arguments too

	ldr	$bj,[$b_ptr,#0]		@ b[0]
	ldmia	$a_ptr,{@acc[1]-@acc[8]}

	umull	@acc[0],$t3,@acc[1],$bj	@ r[0]=a[0]*b[0]
	stmdb	sp!,{$acc[1]-@acc[8]}	@ copy a[0-7] to stack, so
					@ that it can be addressed
					@ without spending register
					@ on address
	umull	@acc[1],$t0,@acc[2],$bj	@ r[1]=a[1]*b[0]
	umull	@acc[2],$t1,@acc[3],$bj
	adds	@acc[1],@acc[1],$t3	@ accumulate high part of mult
	umull	@acc[3],$t2,@acc[4],$bj
	adcs	@acc[2],@acc[2],$t0
	umull	@acc[4],$t3,@acc[5],$bj
	adcs	@acc[3],@acc[3],$t1
	umull	@acc[5],$t0,@acc[6],$bj
	adcs	@acc[4],@acc[4],$t2
	umull	@acc[6],$t1,@acc[7],$bj
	adcs	@acc[5],@acc[5],$t3
	umull	@acc[7],$t2,@acc[8],$bj
	adcs	@acc[6],@acc[6],$t0
	adcs	@acc[7],@acc[7],$t1
	eor	$t3,$t3,$t3		@ first overflow bit is zero
	adc	@acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
	# *                                         abcd
	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	# - abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
	# - abcd.----.----.----.----.----.----.----

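	# In the code below r[0] plays the role of the "magic" digit
	# abcd: the "+ abcd" row becomes r[3]+=r[0], r[6]+=r[0] and
	# r[8]+=r[0], and the "- abcd" row becomes r[7]-=r[0], with the
	# borrow propagated into r[8] and the overflow word.
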
$code.=<<___;
	@ multiplication-less reduction $i
	adds	@acc[3],@acc[3],@acc[0]	@ r[3]+=r[0]
	ldr	$bj,[sp,#40]		@ restore b_ptr
	adcs	@acc[4],@acc[4],#0	@ r[4]+=0
	adcs	@acc[5],@acc[5],#0	@ r[5]+=0
	adcs	@acc[6],@acc[6],@acc[0]	@ r[6]+=r[0]
	ldr	$t1,[sp,#0]		@ load a[0]
	adcs	@acc[7],@acc[7],#0	@ r[7]+=0
	ldr	$bj,[$bj,#4*$i]		@ load b[i]
	adcs	@acc[8],@acc[8],@acc[0]	@ r[8]+=r[0]
	eor	$t0,$t0,$t0
	adc	$t3,$t3,#0		@ overflow bit
	subs	@acc[7],@acc[7],@acc[0]	@ r[7]-=r[0]
	ldr	$t2,[sp,#4]		@ a[1]
	sbcs	@acc[8],@acc[8],#0	@ r[8]-=0
	umlal	@acc[1],$t0,$t1,$bj	@ "r[0]"+=a[0]*b[i]
	eor	$t1,$t1,$t1
	sbc	@acc[0],$t3,#0		@ overflow bit, keep in mind
					@ that net result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	$t3,[sp,#8]		@ a[2]
	umlal	@acc[2],$t1,$t2,$bj	@ "r[1]"+=a[1]*b[i]
	str	@acc[0],[sp,#36]	@ temporarily offload overflow
	eor	$t2,$t2,$t2
	ldr	$t4,[sp,#12]		@ a[3], $t4 is alias @acc[0]
	umlal	@acc[3],$t2,$t3,$bj	@ "r[2]"+=a[2]*b[i]
	eor	$t3,$t3,$t3
	adds	@acc[2],@acc[2],$t0	@ accumulate high part of mult
	ldr	$t0,[sp,#16]		@ a[4]
	umlal	@acc[4],$t3,$t4,$bj	@ "r[3]"+=a[3]*b[i]
	eor	$t4,$t4,$t4
	adcs	@acc[3],@acc[3],$t1
	ldr	$t1,[sp,#20]		@ a[5]
	umlal	@acc[5],$t4,$t0,$bj	@ "r[4]"+=a[4]*b[i]
	eor	$t0,$t0,$t0
	adcs	@acc[4],@acc[4],$t2
	ldr	$t2,[sp,#24]		@ a[6]
	umlal	@acc[6],$t0,$t1,$bj	@ "r[5]"+=a[5]*b[i]
	eor	$t1,$t1,$t1
	adcs	@acc[5],@acc[5],$t3
	ldr	$t3,[sp,#28]		@ a[7]
	umlal	@acc[7],$t1,$t2,$bj	@ "r[6]"+=a[6]*b[i]
	eor	$t2,$t2,$t2
	adcs	@acc[6],@acc[6],$t4
	ldr	@acc[0],[sp,#36]	@ restore overflow bit
	umlal	@acc[8],$t2,$t3,$bj	@ "r[7]"+=a[7]*b[i]
	eor	$t3,$t3,$t3
	adcs	@acc[7],@acc[7],$t0
	adcs	@acc[8],@acc[8],$t1
	adcs	@acc[0],$acc[0],$t2
	adc	$t3,$t3,#0		@ new overflow bit
___
	push(@acc,shift(@acc));		# rotate registers, so that
					# "r[i]" becomes r[i]
}
$code.=<<___;
	@ last multiplication-less reduction
	adds	@acc[3],@acc[3],@acc[0]
	ldr	$r_ptr,[sp,#32]		@ restore r_ptr
	adcs	@acc[4],@acc[4],#0
	adcs	@acc[5],@acc[5],#0
	adcs	@acc[6],@acc[6],@acc[0]
	adcs	@acc[7],@acc[7],#0
	adcs	@acc[8],@acc[8],@acc[0]
	adc	$t3,$t3,#0
	subs	@acc[7],@acc[7],@acc[0]
	sbcs	@acc[8],@acc[8],#0
	sbc	@acc[0],$t3,#0		@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.

	adds	@acc[1],@acc[1],#1	@ subs	@acc[1],@acc[1],#-1
	adcs	@acc[2],@acc[2],#0	@ sbcs	@acc[2],@acc[2],#-1
	adcs	@acc[3],@acc[3],#0	@ sbcs	@acc[3],@acc[3],#-1
	sbcs	@acc[4],@acc[4],#0
	sbcs	@acc[5],@acc[5],#0
	sbcs	@acc[6],@acc[6],#0
	sbcs	@acc[7],@acc[7],#1
	adcs	@acc[8],@acc[8],#0	@ sbcs	@acc[8],@acc[8],#-1
	ldr	lr,[sp,#44]		@ restore lr
	sbc	@acc[0],@acc[0],#0	@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, @acc[0], and using it as
	@ a whole or extracting single bit.

	adds	@acc[1],@acc[1],@acc[0]	@ add modulus or zero
	adcs	@acc[2],@acc[2],@acc[0]
	str	@acc[1],[$r_ptr,#0]
	adcs	@acc[3],@acc[3],@acc[0]
	str	@acc[2],[$r_ptr,#4]
	adcs	@acc[4],@acc[4],#0
	str	@acc[3],[$r_ptr,#8]
	adcs	@acc[5],@acc[5],#0
	str	@acc[4],[$r_ptr,#12]
	adcs	@acc[6],@acc[6],#0
	str	@acc[5],[$r_ptr,#16]
	adcs	@acc[7],@acc[7],@acc[0],lsr#31
	str	@acc[6],[$r_ptr,#20]
	adc	@acc[8],@acc[8],@acc[0]
	str	@acc[7],[$r_ptr,#24]
	str	@acc[8],[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}
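
# The scatter/gather helpers below use an interleaved table layout: for
# w5, word k of an entry lives at a 64-byte stride (16 entries x 4 bytes
# per row), and for w7, byte k of an entry likewise sits 64 bytes apart.
# A gather therefore walks the same sequence of 64-byte rows whatever
# the index, which is what keeps the lookup cache-timing neutral.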

{
my ($out,$inp,$index,$mask)=map("r$_",(0..3));
$code.=<<___;
@ void	ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	5
ecp_nistz256_scatter_w5:
	stmdb	sp!,{r4-r11}

	add	$out,$out,$index,lsl#2

	ldmia	$inp!,{r4-r11}		@ X
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp!,{r4-r11}		@ Y
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp,{r4-r11}		@ Z
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

@ void	ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
@					      int r2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	5
ecp_nistz256_gather_w5:
	stmdb	sp!,{r4-r11}

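	@ Index 0 stands for the point at infinity: $mask then stays
	@ zero and all-zero words are returned. Otherwise entry
	@ $index-1 is read under an all-ones mask.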
	cmp	$index,#0
	mov	$mask,#0
#ifdef __thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index,lsl#2

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ X

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ Y

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out,{r4-r11}		@ Z

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

@ void	ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	5
ecp_nistz256_scatter_w7:
	add	$out,$out,$index
	mov	$index,#64/4
.Loop_scatter_w7:
	ldr	$mask,[$inp],#4
	subs	$index,$index,#1
	strb	$mask,[$out,#64*0]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*1]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*2]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*3]
	add	$out,$out,#64*4
	bne	.Loop_scatter_w7

#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

@ void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
@						      int r2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	5
ecp_nistz256_gather_w7:
	stmdb	sp!,{r4-r7}

	cmp	$index,#0
	mov	$mask,#0
#ifdef __thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index
	mov	$index,#64/4
	nop
.Loop_gather_w7:
	ldrb	r4,[$inp,#64*0]
	subs	$index,$index,#1
	ldrb	r5,[$inp,#64*1]
	ldrb	r6,[$inp,#64*2]
	ldrb	r7,[$inp,#64*3]
	add	$inp,$inp,#64*4
	orr	r4,r4,r5,lsl#8
	orr	r4,r4,r6,lsl#16
	orr	r4,r4,r7,lsl#24
	and	r4,r4,$mask
	str	r4,[$out],#4
	bne	.Loop_gather_w7

	ldmia	sp!,{r4-r7}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if (0) {
# In comparison to integer-only equivalent of below subroutine:
#
# Cortex-A8	+10%
# Cortex-A9	-10%
# Snapdragon S4	+5%
#
# As not all time is spent in multiplication, overall impact is deemed
# too low to care about.

my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
my $mask="q4";
my $mult="q5";
my @AxB=map("q$_",(8..15));

my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.globl	ecp_nistz256_mul_mont_neon
.type	ecp_nistz256_mul_mont_neon,%function
.align	5
ecp_nistz256_mul_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r9}
	vstmdb	sp!,{q4-q5}		@ ABI specification says so

	sub	$toutptr,sp,#40
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vld1.32	{$A0-$A3}, [$aptr]	@ can't specify :32 :-(
	vzip.16	$Bi,$zero
	mov	sp,$toutptr		@ alloca
	vmov.i64 $mask,#0xffff

	vmull.u32 @AxB[0],$Bi,${A0}[0]
	vmull.u32 @AxB[1],$Bi,${A0}[1]
	vmull.u32 @AxB[2],$Bi,${A1}[0]
	vmull.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmull.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmull.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 32 bits of a[0]*b[0]
	vmull.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
for($i=1;$i<8;$i++) {
$code.=<<___;
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vzip.16	$Bi,$zero
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult
___
	push(@AxB,shift(@AxB));
$code.=<<___;
	vmlal.u32 @AxB[0],$Bi,${A0}[0]
	vmlal.u32 @AxB[1],$Bi,${A0}[1]
	vmlal.u32 @AxB[2],$Bi,${A1}[0]
	vmlal.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmlal.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmlal.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 33 bits of a[0]*b[i]+t[0]
	vmlal.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult

	vshr.u64 $temp,@AxB[1]#lo,#16		@ convert
	vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
	vshr.u64 $temp,@AxB[1]#hi,#16
	vzip.16	@AxB[1]#lo,@AxB[1]#hi
___
foreach (2..7) {
$code.=<<___;
	vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
	vst1.32	{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
	vshr.u64 $temp,@AxB[$_]#lo,#16
	vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
	vshr.u64 $temp,@AxB[$_]#hi,#16
	vzip.16	@AxB[$_]#lo,@AxB[$_]#hi
___
}
$code.=<<___;
	vst1.32	{@AxB[7]#lo[0]},[$toutptr,:32]!
	vst1.32	{$temp},[$toutptr]		@ upper 33 bits

	ldr	r1,[sp,#0]
	ldr	r2,[sp,#4]
	ldr	r3,[sp,#8]
	subs	r1,r1,#-1
	ldr	r4,[sp,#12]
	sbcs	r2,r2,#-1
	ldr	r5,[sp,#16]
	sbcs	r3,r3,#-1
	ldr	r6,[sp,#20]
	sbcs	r4,r4,#0
	ldr	r7,[sp,#24]
	sbcs	r5,r5,#0
	ldr	r8,[sp,#28]
	sbcs	r6,r6,#0
	ldr	r9,[sp,#32]			@ top-most bit
	sbcs	r7,r7,#1
	sub	sp,ip,#40+16
	sbcs	r8,r8,#-1
	sbc	r9,r9,#0
	vldmia	sp!,{q4-q5}

	adds	r1,r1,r9
	adcs	r2,r2,r9
	str	r1,[$rptr,#0]
	adcs	r3,r3,r9
	str	r2,[$rptr,#4]
	adcs	r4,r4,#0
	str	r3,[$rptr,#8]
	adcs	r5,r5,#0
	str	r4,[$rptr,#12]
	adcs	r6,r6,#0
	str	r5,[$rptr,#16]
	adcs	r7,r7,r9,lsr#31
	str	r6,[$rptr,#20]
	adcs	r8,r8,r9
	str	r7,[$rptr,#24]
	str	r8,[$rptr,#28]

	ldmia	sp!,{r4-r9}
	bx	lr
.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
#endif
___
}

{{{
########################################################################
# Below $aN assignment matches order in which 256-bit result appears in
# register bank at return from __ecp_nistz256_mul_mont, so that we can
# skip over reloading it from memory. This means that below functions
# use custom calling sequence accepting 256-bit input in registers,
# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
#
# See their "normal" counterparts for insights on calculations.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
my $ff=$b_ptr;

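# For example (taken from ecp_nistz256_point_double below), a typical
# sequence is:
#
#	bl	__ecp_nistz256_mul_mont		@ result stays in $a0-$a7
#	add	$b_ptr,sp,#$tmp0
#	bl	__ecp_nistz256_sub_from		@ consumes it right away
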
$code.=<<___;
.type	__ecp_nistz256_sub_from,%function
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

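	@ __ecp_nistz256_sub_morf is __ecp_nistz256_sub_from with the
	@ operand order reversed ("morf" is "from" backwards): it
	@ computes b - a instead of a - b.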
.type	__ecp_nistz256_sub_morf,%function
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$t0,$a0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$t1,$a1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$t2,$a2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$t3,$a3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$t0,$a4
	sbcs	$a5,$t1,$a5
	sbcs	$a6,$t2,$a6
	sbcs	$a7,$t3,$a7
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_add_self,%function
.align	4
__ecp_nistz256_add_self:
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	adcs	$a1,$a1,$a1
	adcs	$a2,$a2,$a2
	adcs	$a3,$a3,$a3
	adcs	$a4,$a4,$a4
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self

___

########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# above map() describes stack layout with 5 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.

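# The resulting frame is: $S at sp+0, $M at sp+32, $Zsqr at sp+64,
# $in_x at sp+96, $tmp0 at sp+128, with the pushed r0-r3 right above,
# so that [sp,#32*5] is the saved $r_ptr and [sp,#32*5+4] the saved
# $a_ptr.
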
$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

.Lpoint_double_shortcut:
	add	r3,sp,#$in_x
	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
	stmia	r3,{r4-r11}

	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	$b_ptr,$a_ptr,#32
	add	$a_ptr,$a_ptr,#32
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	$b_ptr,[sp,#32*5+4]
	add	$a_ptr,$b_ptr,#32
	add	$b_ptr,$b_ptr,#64
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	$r_ptr,[sp,#32*5]
	add	$r_ptr,$r_ptr,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	$a_ptr,sp,#$Zsqr
	add	$b_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$tmp0
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	$a_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	$b_ptr,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	$r_ptr,[sp,#32*5]
	add	$b_ptr,$r_ptr,#32
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "skip over the saved r0-r3 too"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 18 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use three of them for ~in1infty, ~in2infty and
# result of check for zero.

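# Concretely: ~in1infty lives at [sp,#32*18+4], ~in2infty at
# [sp,#32*18+8], the is-zero check of R (i.e. ~is_equal(S1,S2)) at
# [sp,#32*18+12], and the pushed r0-r3 start at [sp,#32*18+16].
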
$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*18+16

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+8]	@ ~in2infty

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+4]	@ ~in1infty

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$in2_z
	add	$r_ptr,sp,#$Z2sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$Z2sqr
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$in1_y
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);

	orr	$a0,$a0,$a1		@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	add	$a_ptr,sp,#$in1_x
	orr	$a0,$a0,$a4
	add	$b_ptr,sp,#$Z2sqr
	str	$a0,[sp,#32*18+12]

	add	$r_ptr,sp,#$U1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);

	add	$a_ptr,sp,#$in2_x
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);

	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);

	orr	$a0,$a0,$a1		@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	orr	$a0,$a0,$a4		@ ~is_equal(U1,U2)

	ldr	$t0,[sp,#32*18+4]	@ ~in1infty
	ldr	$t1,[sp,#32*18+8]	@ ~in2infty
	ldr	$t2,[sp,#32*18+12]	@ ~is_equal(S1,S2)
	mvn	$t0,$t0			@ -1/0 -> 0/-1
	mvn	$t1,$t1			@ -1/0 -> 0/-1
	orr	$a0,$a0,$t0
	orr	$a0,$a0,$t1
	orrs	$a0,$a0,$t2		@ set flags

	@ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
	bne	.Ladd_proceed
1530 |
|
---|
1531 | .Ladd_double:
|
---|
1532 | ldr $a_ptr,[sp,#32*18+20]
|
---|
1533 | add sp,sp,#32*(18-5)+16 @ difference in frame sizes
|
---|
1534 | b .Lpoint_double_shortcut
|
---|
1535 |
|
---|
1536 | .align 4
|
---|
1537 | .Ladd_proceed:
|
---|
1538 | add $a_ptr,sp,#$R
|
---|
1539 | add $b_ptr,sp,#$R
|
---|
1540 | add $r_ptr,sp,#$Rsqr
|
---|
1541 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
|
---|
1542 |
|
---|
1543 | add $a_ptr,sp,#$H
|
---|
1544 | add $b_ptr,sp,#$in1_z
|
---|
1545 | add $r_ptr,sp,#$res_z
|
---|
1546 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
|
---|
1547 |
|
---|
1548 | add $a_ptr,sp,#$H
|
---|
1549 | add $b_ptr,sp,#$H
|
---|
1550 | add $r_ptr,sp,#$Hsqr
|
---|
1551 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
|
---|
1552 |
|
---|
1553 | add $a_ptr,sp,#$in2_z
|
---|
1554 | add $b_ptr,sp,#$res_z
|
---|
1555 | add $r_ptr,sp,#$res_z
|
---|
1556 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z);
|
---|
1557 |
|
---|
1558 | add $a_ptr,sp,#$H
|
---|
1559 | add $b_ptr,sp,#$Hsqr
|
---|
1560 | add $r_ptr,sp,#$Hcub
|
---|
1561 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
|
---|
1562 |
|
---|
1563 | add $a_ptr,sp,#$Hsqr
|
---|
1564 | add $b_ptr,sp,#$U1
|
---|
1565 | add $r_ptr,sp,#$U2
|
---|
1566 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr);
|
---|
1567 |
|
---|
1568 | add $r_ptr,sp,#$Hsqr
|
---|
1569 | bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
|
---|
1570 |
|
---|
1571 | add $b_ptr,sp,#$Rsqr
|
---|
1572 | add $r_ptr,sp,#$res_x
|
---|
1573 | bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
|
---|
1574 |
|
---|
1575 | add $b_ptr,sp,#$Hcub
|
---|
1576 | bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
|
---|
1577 |
|
---|
1578 | add $b_ptr,sp,#$U2
|
---|
1579 | add $r_ptr,sp,#$res_y
|
---|
1580 | bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
|
---|
1581 |
|
---|
1582 | add $a_ptr,sp,#$Hcub
|
---|
1583 | add $b_ptr,sp,#$S1
|
---|
1584 | add $r_ptr,sp,#$S2
|
---|
1585 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub);
|
---|
1586 |
|
---|
1587 | add $a_ptr,sp,#$R
|
---|
1588 | add $b_ptr,sp,#$res_y
|
---|
1589 | add $r_ptr,sp,#$res_y
|
---|
1590 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
|
---|
1591 |
|
---|
1592 | add $b_ptr,sp,#$S2
|
---|
1593 | bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
|
---|
1594 |
|
---|
1595 | ldr r11,[sp,#32*18+4] @ ~in1infty
|
---|
1596 | ldr r12,[sp,#32*18+8] @ ~in2infty
|
---|
1597 | add r1,sp,#$res_x
|
---|
1598 | add r2,sp,#$in2_x
|
---|
1599 | and r10,r11,r12 @ ~in1infty & ~in2infty
|
---|
1600 | mvn r11,r11
|
---|
1601 | add r3,sp,#$in1_x
|
---|
1602 | and r11,r11,r12 @ in1infty & ~in2infty
|
---|
1603 | mvn r12,r12 @ in2infty
|
---|
1604 | ldr $r_ptr,[sp,#32*18+16]
|
---|
1605 | ___
|
---|
1606 | for($i=0;$i<96;$i+=8) { # conditional moves
|
---|
1607 | $code.=<<___;
|
---|
1608 | ldmia r1!,{r4-r5} @ res_x
|
---|
1609 | ldmia r2!,{r6-r7} @ in2_x
|
---|
1610 | ldmia r3!,{r8-r9} @ in1_x
|
---|
1611 | and r4,r4,r10 @ ~in1infty & ~in2infty
|
---|
1612 | and r5,r5,r10
|
---|
1613 | and r6,r6,r11 @ in1infty & ~in2infty
|
---|
1614 | and r7,r7,r11
|
---|
1615 | and r8,r8,r12 @ in2infty
|
---|
1616 | and r9,r9,r12
|
---|
1617 | orr r4,r4,r6
|
---|
1618 | orr r5,r5,r7
|
---|
1619 | orr r4,r4,r8
|
---|
1620 | orr r5,r5,r9
|
---|
1621 | stmia $r_ptr!,{r4-r5}
|
---|
1622 | ___
|
---|
1623 | }
|
---|
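
# Reference model (illustrative only, never called by this script) of the
# constant-time selection the loop above emits: each output word is built
# from three mutually exclusive all-ones/all-zeros masks instead of
# data-dependent branches, with r10 = ~in1infty & ~in2infty,
# r11 = in1infty & ~in2infty and r12 = in2infty.
sub cmov_ref {
	my ($res, $in2, $in1, $m_both, $m_1inf, $m_2inf) = @_;
	return ($res & $m_both)		# both inputs finite: computed sum
	     | ($in2 & $m_1inf)		# in1 at infinity: answer is in2
	     | ($in1 & $m_2inf);	# in2 at infinity: answer is in1
}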
$code.=<<___;
.Ladd_done:
	add	sp,sp,#32*18+16+16	@ extra +16 skips the saved r0-r3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}
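
# Illustrative restatement (hypothetical helper, not called anywhere) of
# the .Ladd_proceed test above: all four inputs are zero/non-zero words,
# so a branch-free OR decides whether the general addition must run; the
# doubling shortcut is taken only when both inputs are finite and their
# affine coordinates agree, i.e. U1 == U2 and S1 == S2.
sub add_proceed_ref {
	my ($not_equal_U, $in1infty, $in2infty, $not_equal_S) = @_;
	return $not_equal_U | $in1infty | $in2infty | $not_equal_S;	# non-zero -> .Ladd_proceed
}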

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
my $Z1sqr = $S2;
# The above map() describes the stack layout with 15 temporary
# 256-bit vectors on top. Then note that we push starting from
# r0, which means that a copy of the input arguments sits just
# below these temporary vectors. We reuse two of those
# saved-argument slots for ~in1infty and ~in2infty.
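
# With in2 affine (implicit Z2 == 1), the calls below compute the usual
# Jacobian-coordinate sums; as an illustrative summary derived from the
# call sequence (not part of the original formulas' notation):
#	U2 = in2_x*Z1^2,	H = U2 - in1_x
#	S2 = in2_y*Z1^3,	R = S2 - in1_y
#	Z3 = H*Z1
#	X3 = R^2 - H^3 - 2*in1_x*H^2
#	Y3 = R*(in1_x*H^2 - X3) - in1_y*H^3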

my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
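
# Where @ONE_mont comes from (a self-check sketch, not used by the code
# above): 1 in Montgomery representation is 2^256 mod p, expressed as
# eight 32-bit limbs, least significant first; -1 and -2 above stand for
# 0xffffffff and 0xfffffffe.
sub one_mont_ref {
	use bigint;				# exact 256-bit arithmetic
	my $p = 2**256 - 2**224 + 2**192 + 2**96 - 1;	# NIST P-256 prime
	my $one = 2**256 % $p;			# Montgomery form of 1
	my @limbs;
	for (0..7) {				# split into 32-bit limbs
		push @limbs, $one % 2**32;
		$one >>= 32;
	}
	return @limbs;		# (1,0,0,2^32-1,2^32-1,2^32-1,2^32-2,0)
}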

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*15

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*15+4]	@ ~in1infty

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	orr	r12,r12,r4
	orr	r12,r12,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	str	r12,[sp,#32*15+8]	@ ~in2infty

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in2_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);

	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*15+4]	@ ~in1infty
	ldr	r12,[sp,#32*15+8]	@ ~in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12		@ ~in1infty & ~in2infty
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12		@ in1infty & ~in2infty
	mvn	r12,r12			@ in2infty
	ldr	$r_ptr,[sp,#32*15]
___
for($i=0;$i<64;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}	@ res_x
	ldmia	r2!,{r6-r7}	@ in2_x
	ldmia	r3!,{r8-r9}	@ in1_x
	and	r4,r4,r10	@ ~in1infty & ~in2infty
	and	r5,r5,r10
	and	r6,r6,r11	@ in1infty & ~in2infty
	and	r7,r7,r11
	and	r8,r8,r12	@ in2infty
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
for(;$i<96;$i+=8) {
my $j=($i-64)/4;
$code.=<<___;
	ldmia	r1!,{r4-r5}	@ res_z
	ldmia	r3!,{r8-r9}	@ in1_z
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r11,#@ONE_mont[$j]
	and	r7,r11,#@ONE_mont[$j+1]
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
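
# Companion sketch (illustrative only, never called) for the Z-coordinate
# selection above: an affine in2 carries an implicit Z of 1, so when in1
# is at infinity the result's Z limbs come from @ONE_mont rather than
# from an in2_z vector.
sub cmov_z_ref {
	my ($res_z, $one_mont, $in1_z, $m_both, $m_1inf, $m_2inf) = @_;
	return ($res_z    & $m_both)	# both finite: computed Z
	     | ($one_mont & $m_1inf)	# in1 at infinity: Z = 1 in Montgomery form
	     | ($in1_z    & $m_2inf);	# in2 at infinity: Z taken from in1
}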
$code.=<<___;
	add	sp,sp,#32*15+16		@ +16 skips the saved r0-r3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
} }}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;	# evaluate back-quoted Perl expressions

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;	# q<N>#lo/hi -> d<2N>/d<2N+1>

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush