#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
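# A typical invocation, for reference (an illustrative example, not a
# requirement imposed by this file): the first argument is the perlasm
# "flavour", the second the output file, e.g.
#
#	perl ecp_nistz256-armv4.pl linux32 ecp_nistz256-armv4.S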
$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr,
# not the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);
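# (For example, $#arr is 2 for a three-element @arr, while scalar(@arr)
# is 3; hence the -1 above.)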

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# This conversion smashes P256_POINT_AFFINE by individual bytes with
# a 64-byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
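# For reference, a minimal sketch of the inverse (gather) transform,
# illustrative only and not part of the generator ($window and $idx are
# hypothetical names): byte $k of point number $idx within a 64-point
# window lives at offset $idx + 64*$k, which is the uniform access
# pattern ecp_nistz256_gather_w7 below relies on:
#
#	my @point;				# 64 bytes of one point
#	for my $k (0..63) {
#		$point[$k] = $window[$idx + 64*$k];	# $window: 64x64 bytes
#	}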
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align	5
.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long	1,0,0,0,0,0,0,0
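@ For reference, why these constants work: mul_mont(a,b) computes
@ a*b/2^256 mod P, so mul_mont(a,.LRR) = a*2^512/2^256 = a*2^256 mod P
@ (conversion to Montgomery form), while mul_mont(a,.Lone) = a/2^256
@ mod P (conversion back from Montgomery form).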
.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	6
___

########################################################################
# Common register layout; note that $t2 is the link register, so if an
# internal subroutine uses $t2, it has to offload lr...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
		map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);

$code.=<<___;
@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
ecp_nistz256_to_mont:
	adr	$b_ptr,.LRR
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...
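	@ A worked instance of the synthesis: with $ff=-1 after a borrow,
	@ the words added back are $ff,$ff,$ff,0,0,0,$ff>>31,$ff =
	@ 0xffffffff,0xffffffff,0xffffffff,0,0,0,1,0xffffffff, i.e.
	@ exactly the P-256 modulus in little-endian word order; with
	@ $ff=0 they are all zero and the addition is a no-op.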

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3

@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit in the
					@ most significant position; an
					@ arithmetic right shift by 31 then
					@ produces -1 or 0, while a logical
					@ right shift gives 1 or 0; this is
					@ how the modulus is conditionally
					@ synthesized in this case...
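	@ A worked example: for odd a the mask is -1 and a+mod is
	@ computed; odd+odd is even, so the shift below loses nothing and
	@ the result is a/2 mod P. E.g. a=1 yields (P+1)/2, the modular
	@ inverse of 2.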
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2

@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub

@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));

$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0-r2,lr}		@ make a copy of arguments too

	ldr	$bj,[$b_ptr,#0]		@ b[0]
	ldmia	$a_ptr,{@acc[1]-@acc[8]}

	umull	@acc[0],$t3,@acc[1],$bj	@ r[0]=a[0]*b[0]
	stmdb	sp!,{@acc[1]-@acc[8]}	@ copy a[0-7] to stack, so
					@ that it can be addressed
					@ without spending register
					@ on address
	umull	@acc[1],$t0,@acc[2],$bj	@ r[1]=a[1]*b[0]
	umull	@acc[2],$t1,@acc[3],$bj
	adds	@acc[1],@acc[1],$t3	@ accumulate high part of mult
	umull	@acc[3],$t2,@acc[4],$bj
	adcs	@acc[2],@acc[2],$t0
	umull	@acc[4],$t3,@acc[5],$bj
	adcs	@acc[3],@acc[3],$t1
	umull	@acc[5],$t0,@acc[6],$bj
	adcs	@acc[4],@acc[4],$t2
	umull	@acc[6],$t1,@acc[7],$bj
	adcs	@acc[5],@acc[5],$t3
	umull	@acc[7],$t2,@acc[8],$bj
	adcs	@acc[6],@acc[6],$t0
	adcs	@acc[7],@acc[7],$t1
	eor	$t3,$t3,$t3		@ first overflow bit is zero
	adc	@acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
	# *                                         abcd
	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite the above as:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	# - abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
	# - abcd.----.----.----.----.----.----.----
	#
	# (A Math::BigInt self-check of this identity can be found
	# right after this loop.)

$code.=<<___;
	@ multiplication-less reduction $i
	adds	@acc[3],@acc[3],@acc[0]		@ r[3]+=r[0]
	ldr	$bj,[sp,#40]			@ restore b_ptr
	adcs	@acc[4],@acc[4],#0		@ r[4]+=0
	adcs	@acc[5],@acc[5],#0		@ r[5]+=0
	adcs	@acc[6],@acc[6],@acc[0]		@ r[6]+=r[0]
	ldr	$t1,[sp,#0]			@ load a[0]
	adcs	@acc[7],@acc[7],#0		@ r[7]+=0
	ldr	$bj,[$bj,#4*$i]			@ load b[i]
	adcs	@acc[8],@acc[8],@acc[0]		@ r[8]+=r[0]
	eor	$t0,$t0,$t0
	adc	$t3,$t3,#0			@ overflow bit
	subs	@acc[7],@acc[7],@acc[0]		@ r[7]-=r[0]
	ldr	$t2,[sp,#4]			@ a[1]
	sbcs	@acc[8],@acc[8],#0		@ r[8]-=0
	umlal	@acc[1],$t0,$t1,$bj		@ "r[0]"+=a[0]*b[i]
	eor	$t1,$t1,$t1
	sbc	@acc[0],$t3,#0			@ overflow bit, keep in mind
						@ that the net result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	$t3,[sp,#8]			@ a[2]
	umlal	@acc[2],$t1,$t2,$bj		@ "r[1]"+=a[1]*b[i]
	str	@acc[0],[sp,#36]		@ temporarily offload overflow
	eor	$t2,$t2,$t2
	ldr	$t4,[sp,#12]			@ a[3], $t4 is an alias of @acc[0]
	umlal	@acc[3],$t2,$t3,$bj		@ "r[2]"+=a[2]*b[i]
	eor	$t3,$t3,$t3
	adds	@acc[2],@acc[2],$t0		@ accumulate high part of mult
	ldr	$t0,[sp,#16]			@ a[4]
	umlal	@acc[4],$t3,$t4,$bj		@ "r[3]"+=a[3]*b[i]
	eor	$t4,$t4,$t4
	adcs	@acc[3],@acc[3],$t1
	ldr	$t1,[sp,#20]			@ a[5]
	umlal	@acc[5],$t4,$t0,$bj		@ "r[4]"+=a[4]*b[i]
	eor	$t0,$t0,$t0
	adcs	@acc[4],@acc[4],$t2
	ldr	$t2,[sp,#24]			@ a[6]
	umlal	@acc[6],$t0,$t1,$bj		@ "r[5]"+=a[5]*b[i]
	eor	$t1,$t1,$t1
	adcs	@acc[5],@acc[5],$t3
	ldr	$t3,[sp,#28]			@ a[7]
	umlal	@acc[7],$t1,$t2,$bj		@ "r[6]"+=a[6]*b[i]
	eor	$t2,$t2,$t2
	adcs	@acc[6],@acc[6],$t4
	ldr	@acc[0],[sp,#36]		@ restore overflow bit
	umlal	@acc[8],$t2,$t3,$bj		@ "r[7]"+=a[7]*b[i]
	eor	$t3,$t3,$t3
	adcs	@acc[7],@acc[7],$t0
	adcs	@acc[8],@acc[8],$t1
	adcs	@acc[0],@acc[0],$t2
	adc	$t3,$t3,#0			@ new overflow bit
___
	push(@acc,shift(@acc));	# rotate registers, so that
				# "r[i]" becomes r[i]
}
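# A minimal self-check of the multiplication-less reduction identity
# used in the loop above (an illustrative sketch, deliberately never
# executed; Math::BigInt is core Perl, and the digit 0xabcd is an
# arbitrary example value):
if (0) {
	require Math::BigInt;
	my $p = Math::BigInt->new("0xffffffff00000001".("0"x24).("f"x24));
	my $d = Math::BigInt->new("0xabcd");
	my $lhs = $p->copy()->bmul($d);
	# P*d == (d<<256) + (d<<192) + (d<<96) - (d<<224) - d, i.e. the
	# very add/subtract pattern the generated assembly performs.
	my $rhs = $d->copy()->blsft(256)
		->badd($d->copy()->blsft(192))
		->badd($d->copy()->blsft(96))
		->bsub($d->copy()->blsft(224))
		->bsub($d);
	die "reduction identity broken" if $lhs != $rhs;
}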
$code.=<<___;
	@ last multiplication-less reduction
	adds	@acc[3],@acc[3],@acc[0]
	ldr	$r_ptr,[sp,#32]		@ restore r_ptr
	adcs	@acc[4],@acc[4],#0
	adcs	@acc[5],@acc[5],#0
	adcs	@acc[6],@acc[6],@acc[0]
	adcs	@acc[7],@acc[7],#0
	adcs	@acc[8],@acc[8],@acc[0]
	adc	$t3,$t3,#0
	subs	@acc[7],@acc[7],@acc[0]
	sbcs	@acc[8],@acc[8],#0
	sbc	@acc[0],$t3,#0		@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.

	adds	@acc[1],@acc[1],#1	@ subs	@acc[1],@acc[1],#-1
	adcs	@acc[2],@acc[2],#0	@ sbcs	@acc[2],@acc[2],#-1
	adcs	@acc[3],@acc[3],#0	@ sbcs	@acc[3],@acc[3],#-1
	sbcs	@acc[4],@acc[4],#0
	sbcs	@acc[5],@acc[5],#0
	sbcs	@acc[6],@acc[6],#0
	sbcs	@acc[7],@acc[7],#1
	adcs	@acc[8],@acc[8],#0	@ sbcs	@acc[8],@acc[8],#-1
	ldr	lr,[sp,#44]		@ restore lr
	sbc	@acc[0],@acc[0],#0	@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, @acc[0], and using it as
	@ a whole or extracting single bit.

	adds	@acc[1],@acc[1],@acc[0]	@ add modulus or zero
	adcs	@acc[2],@acc[2],@acc[0]
	str	@acc[1],[$r_ptr,#0]
	adcs	@acc[3],@acc[3],@acc[0]
	str	@acc[2],[$r_ptr,#4]
	adcs	@acc[4],@acc[4],#0
	str	@acc[3],[$r_ptr,#8]
	adcs	@acc[5],@acc[5],#0
	str	@acc[4],[$r_ptr,#12]
	adcs	@acc[6],@acc[6],#0
	str	@acc[5],[$r_ptr,#16]
	adcs	@acc[7],@acc[7],@acc[0],lsr#31
	str	@acc[6],[$r_ptr,#20]
	adc	@acc[8],@acc[8],@acc[0]
	str	@acc[7],[$r_ptr,#24]
	str	@acc[8],[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}

{
my ($out,$inp,$index,$mask)=map("r$_",(0..3));
$code.=<<___;
@ void	ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	5
ecp_nistz256_scatter_w5:
	stmdb	sp!,{r4-r11}

	add	$out,$out,$index,lsl#2

	ldmia	$inp!,{r4-r11}		@ X
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp!,{r4-r11}		@ Y
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp,{r4-r11}		@ Z
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

@ void	ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
@					      int r2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	5
ecp_nistz256_gather_w5:
	stmdb	sp!,{r4-r11}

	cmp	$index,#0
	mov	$mask,#0
#ifdef	__thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index,lsl#2

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ X

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ Y

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out,{r4-r11}		@ Z

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

@ void	ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	5
ecp_nistz256_scatter_w7:
	add	$out,$out,$index
	mov	$index,#64/4
.Loop_scatter_w7:
	ldr	$mask,[$inp],#4
	subs	$index,$index,#1
	strb	$mask,[$out,#64*0]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*1]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*2]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*3]
	add	$out,$out,#64*4
	bne	.Loop_scatter_w7

#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

@ void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
@						      int r2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	5
ecp_nistz256_gather_w7:
	stmdb	sp!,{r4-r7}

	cmp	$index,#0
	mov	$mask,#0
#ifdef	__thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index
	mov	$index,#64/4
	nop
.Loop_gather_w7:
	ldrb	r4,[$inp,#64*0]
	subs	$index,$index,#1
	ldrb	r5,[$inp,#64*1]
	ldrb	r6,[$inp,#64*2]
	ldrb	r7,[$inp,#64*3]
	add	$inp,$inp,#64*4
	orr	r4,r4,r5,lsl#8
	orr	r4,r4,r6,lsl#16
	orr	r4,r4,r7,lsl#24
	and	r4,r4,$mask
	str	r4,[$out],#4
	bne	.Loop_gather_w7

	ldmia	sp!,{r4-r7}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if (0) {
# In comparison to the integer-only equivalent of the subroutine below:
#
# Cortex-A8	+10%
# Cortex-A9	-10%
# Snapdragon S4	+5%
#
# As not all time is spent in multiplication, overall impact is deemed
# too low to care about.

my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
my $mask="q4";
my $mult="q5";
my @AxB=map("q$_",(8..15));

my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.globl	ecp_nistz256_mul_mont_neon
.type	ecp_nistz256_mul_mont_neon,%function
.align	5
ecp_nistz256_mul_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r9}
	vstmdb	sp!,{q4-q5}		@ ABI specification says so

	sub	$toutptr,sp,#40
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vld1.32	{$A0-$A3}, [$aptr]	@ can't specify :32 :-(
	vzip.16	$Bi,$zero
	mov	sp,$toutptr		@ alloca
	vmov.i64 $mask,#0xffff

	vmull.u32 @AxB[0],$Bi,${A0}[0]
	vmull.u32 @AxB[1],$Bi,${A0}[1]
	vmull.u32 @AxB[2],$Bi,${A1}[0]
	vmull.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmull.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmull.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 32 bits of a[0]*b[0]
	vmull.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
for($i=1;$i<8;$i++) {
$code.=<<___;
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vzip.16	$Bi,$zero
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult
___
	push(@AxB,shift(@AxB));
$code.=<<___;
	vmlal.u32 @AxB[0],$Bi,${A0}[0]
	vmlal.u32 @AxB[1],$Bi,${A0}[1]
	vmlal.u32 @AxB[2],$Bi,${A1}[0]
	vmlal.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmlal.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmlal.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 33 bits of a[0]*b[i]+t[0]
	vmlal.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[i]+t[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult

	vshr.u64 $temp,@AxB[1]#lo,#16		@ convert
	vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
	vshr.u64 $temp,@AxB[1]#hi,#16
	vzip.16	@AxB[1]#lo,@AxB[1]#hi
___
foreach (2..7) {
$code.=<<___;
	vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
	vst1.32	{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
	vshr.u64 $temp,@AxB[$_]#lo,#16
	vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
	vshr.u64 $temp,@AxB[$_]#hi,#16
	vzip.16	@AxB[$_]#lo,@AxB[$_]#hi
___
}
$code.=<<___;
	vst1.32	{@AxB[7]#lo[0]},[$toutptr,:32]!
	vst1.32	{$temp},[$toutptr]		@ upper 33 bits

	ldr	r1,[sp,#0]
	ldr	r2,[sp,#4]
	ldr	r3,[sp,#8]
	subs	r1,r1,#-1
	ldr	r4,[sp,#12]
	sbcs	r2,r2,#-1
	ldr	r5,[sp,#16]
	sbcs	r3,r3,#-1
	ldr	r6,[sp,#20]
	sbcs	r4,r4,#0
	ldr	r7,[sp,#24]
	sbcs	r5,r5,#0
	ldr	r8,[sp,#28]
	sbcs	r6,r6,#0
	ldr	r9,[sp,#32]			@ top-most bit
	sbcs	r7,r7,#1
	sub	sp,ip,#40+16
	sbcs	r8,r8,#-1
	sbc	r9,r9,#0
	vldmia	sp!,{q4-q5}

	adds	r1,r1,r9
	adcs	r2,r2,r9
	str	r1,[$rptr,#0]
	adcs	r3,r3,r9
	str	r2,[$rptr,#4]
	adcs	r4,r4,#0
	str	r3,[$rptr,#8]
	adcs	r5,r5,#0
	str	r4,[$rptr,#12]
	adcs	r6,r6,#0
	str	r5,[$rptr,#16]
	adcs	r7,r7,r9,lsr#31
	str	r6,[$rptr,#20]
	adcs	r8,r8,r9
	str	r7,[$rptr,#24]
	str	r8,[$rptr,#28]

	ldmia	sp!,{r4-r9}
	bx	lr
.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
#endif
___
}

{{{
########################################################################
# The $aN assignment below matches the order in which the 256-bit result
# appears in the register bank on return from __ecp_nistz256_mul_mont,
# so that reloading it from memory can be skipped. This means that the
# functions below use a custom calling sequence accepting 256-bit input
# in registers, the output pointer in r0, $r_ptr, and an optional
# pointer in r2, $b_ptr.
#
# See their "normal" counterparts for insights on calculations.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
my $ff=$b_ptr;

$code.=<<___;
.type	__ecp_nistz256_sub_from,%function
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$t0,$a0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$t1,$a1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$t2,$a2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$t3,$a3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$t0,$a4
	sbcs	$a5,$t1,$a5
	sbcs	$a6,$t2,$a6
	sbcs	$a7,$t3,$a7
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_add_self,%function
.align	4
__ecp_nistz256_add_self:
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	adcs	$a1,$a1,$a1
	adcs	$a2,$a2,$a2
	adcs	$a3,$a3,$a3
	adcs	$a4,$a4,$a4
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self

___

########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# The map() above describes the stack layout with 5 temporary
# 256-bit vectors on top. Note that we push starting from r0,
# which means that a copy of the input arguments sits just below
# these temporary vectors.
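# For reference, the resulting frame (derived from the code below;
# byte offsets from sp after "sub sp,sp,#32*5"):
#
#	sp+0	S	sp+32	M	sp+64	Zsqr
#	sp+96	in_x	sp+128	tmp0
#	sp+32*5+0	saved r0, $r_ptr (out)
#	sp+32*5+4	saved r1, $a_ptr (inp)
#
# which is why pointers are reloaded via [sp,#32*5] and [sp,#32*5+4].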

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

.Lpoint_double_shortcut:
	add	r3,sp,#$in_x
	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
	stmia	r3,{r4-r11}

	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	$b_ptr,$a_ptr,#32
	add	$a_ptr,$a_ptr,#32
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	$b_ptr,[sp,#32*5+4]
	add	$a_ptr,$b_ptr,#32
	add	$b_ptr,$b_ptr,#64
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	$r_ptr,[sp,#32*5]
	add	$r_ptr,$r_ptr,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	$a_ptr,sp,#$Zsqr
	add	$b_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$tmp0
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	$a_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	$b_ptr,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	$r_ptr,[sp,#32*5]
	add	$b_ptr,$r_ptr,#32
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "skip over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# The map() above describes the stack layout with 18 temporary
# 256-bit vectors on top. Note that we push starting from r0,
# which means that a copy of the input arguments sits just below
# these temporary vectors. We use three extra words for ~in1infty,
# ~in2infty and the result of the check for zero.
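# For reference (derived from the code below), the extra 16 bytes
# reserved past the 18 vectors are used as:
#
#	sp+32*18+4	~in1infty (all-ones mask if in1_z is non-zero)
#	sp+32*18+8	~in2infty (all-ones mask if in2_z is non-zero)
#	sp+32*18+12	OR of the words of R, i.e. ~is_equal(S1,S2)
#	sp+32*18+16	saved r0, $r_ptr (out)
#	sp+32*18+20	saved r1, $a_ptr (in1)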
1399 |
|
---|
1400 | $code.=<<___;
|
---|
1401 | .globl ecp_nistz256_point_add
|
---|
1402 | .type ecp_nistz256_point_add,%function
|
---|
1403 | .align 5
|
---|
1404 | ecp_nistz256_point_add:
|
---|
1405 | stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
|
---|
1406 | sub sp,sp,#32*18+16
|
---|
1407 |
|
---|
1408 | ldmia $b_ptr!,{r4-r11} @ copy in2_x
|
---|
1409 | add r3,sp,#$in2_x
|
---|
1410 | stmia r3!,{r4-r11}
|
---|
1411 | ldmia $b_ptr!,{r4-r11} @ copy in2_y
|
---|
1412 | stmia r3!,{r4-r11}
|
---|
1413 | ldmia $b_ptr,{r4-r11} @ copy in2_z
|
---|
1414 | orr r12,r4,r5
|
---|
1415 | orr r12,r12,r6
|
---|
1416 | orr r12,r12,r7
|
---|
1417 | orr r12,r12,r8
|
---|
1418 | orr r12,r12,r9
|
---|
1419 | orr r12,r12,r10
|
---|
1420 | orr r12,r12,r11
|
---|
1421 | cmp r12,#0
|
---|
1422 | #ifdef __thumb2__
|
---|
1423 | it ne
|
---|
1424 | #endif
|
---|
1425 | movne r12,#-1
|
---|
1426 | stmia r3,{r4-r11}
|
---|
1427 | str r12,[sp,#32*18+8] @ ~in2infty
|
---|
1428 |
|
---|
1429 | ldmia $a_ptr!,{r4-r11} @ copy in1_x
|
---|
1430 | add r3,sp,#$in1_x
|
---|
1431 | stmia r3!,{r4-r11}
|
---|
1432 | ldmia $a_ptr!,{r4-r11} @ copy in1_y
|
---|
1433 | stmia r3!,{r4-r11}
|
---|
1434 | ldmia $a_ptr,{r4-r11} @ copy in1_z
|
---|
1435 | orr r12,r4,r5
|
---|
1436 | orr r12,r12,r6
|
---|
1437 | orr r12,r12,r7
|
---|
1438 | orr r12,r12,r8
|
---|
1439 | orr r12,r12,r9
|
---|
1440 | orr r12,r12,r10
|
---|
1441 | orr r12,r12,r11
|
---|
1442 | cmp r12,#0
|
---|
1443 | #ifdef __thumb2__
|
---|
1444 | it ne
|
---|
1445 | #endif
|
---|
1446 | movne r12,#-1
|
---|
1447 | stmia r3,{r4-r11}
|
---|
1448 | str r12,[sp,#32*18+4] @ ~in1infty
|
---|
1449 |
|
---|
1450 | add $a_ptr,sp,#$in2_z
|
---|
1451 | add $b_ptr,sp,#$in2_z
|
---|
1452 | add $r_ptr,sp,#$Z2sqr
|
---|
1453 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z);
|
---|
1454 |
|
---|
1455 | add $a_ptr,sp,#$in1_z
|
---|
1456 | add $b_ptr,sp,#$in1_z
|
---|
1457 | add $r_ptr,sp,#$Z1sqr
|
---|
1458 | bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
|
---|
1459 |
|
---|
1460 | add $a_ptr,sp,#$in2_z
|
---|
1461 | add $b_ptr,sp,#$Z2sqr
|
---|
1462 | add $r_ptr,sp,#$S1
|
---|
1463 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z);
|
---|
1464 |
|
---|
1465 | add $a_ptr,sp,#$in1_z
|
---|
1466 | add $b_ptr,sp,#$Z1sqr
|
---|
1467 | add $r_ptr,sp,#$S2
|
---|
1468 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
|
---|
1469 |
|
---|
1470 | add $a_ptr,sp,#$in1_y
|
---|
1471 | add $b_ptr,sp,#$S1
|
---|
1472 | add $r_ptr,sp,#$S1
|
---|
1473 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y);
|
---|
1474 |
|
---|
1475 | add $a_ptr,sp,#$in2_y
|
---|
1476 | add $b_ptr,sp,#$S2
|
---|
1477 | add $r_ptr,sp,#$S2
|
---|
1478 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
|
---|
1479 |
|
---|
1480 | add $b_ptr,sp,#$S1
|
---|
1481 | add $r_ptr,sp,#$R
|
---|
1482 | bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1);
|
---|
1483 |
|
---|
1484 | orr $a0,$a0,$a1 @ see if result is zero
|
---|
1485 | orr $a2,$a2,$a3
|
---|
1486 | orr $a4,$a4,$a5
|
---|
1487 | orr $a0,$a0,$a2
|
---|
1488 | orr $a4,$a4,$a6
|
---|
1489 | orr $a0,$a0,$a7
|
---|
1490 | add $a_ptr,sp,#$in1_x
|
---|
1491 | orr $a0,$a0,$a4
|
---|
1492 | add $b_ptr,sp,#$Z2sqr
|
---|
1493 | str $a0,[sp,#32*18+12]
|
---|
1494 |
|
---|
1495 | add $r_ptr,sp,#$U1
|
---|
1496 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr);
|
---|
1497 |
|
---|
1498 | add $a_ptr,sp,#$in2_x
|
---|
1499 | add $b_ptr,sp,#$Z1sqr
|
---|
1500 | add $r_ptr,sp,#$U2
|
---|
1501 | bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr);
|
---|
1502 |
|
---|
1503 | add $b_ptr,sp,#$U1
|
---|
1504 | add $r_ptr,sp,#$H
|
---|
1505 | bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1);
|
---|
1506 |
|
---|
1507 | orr $a0,$a0,$a1 @ see if result is zero
|
---|
1508 | orr $a2,$a2,$a3
|
---|
1509 | orr $a4,$a4,$a5
|
---|
1510 | orr $a0,$a0,$a2
|
---|
1511 | orr $a4,$a4,$a6
|
---|
1512 | orr $a0,$a0,$a7
|
---|
1513 | orr $a0,$a0,$a4 @ ~is_equal(U1,U2)
|
---|
1514 |
|
---|
1515 | ldr $t0,[sp,#32*18+4] @ ~in1infty
|
---|
1516 | ldr $t1,[sp,#32*18+8] @ ~in2infty
|
---|
1517 | ldr $t2,[sp,#32*18+12] @ ~is_equal(S1,S2)
|
---|
1518 | mvn $t0,$t0 @ -1/0 -> 0/-1
|
---|
1519 | mvn $t1,$t1 @ -1/0 -> 0/-1
|
---|
1520 | orr $a0,$a0,$t0
|
---|
1521 | orr $a0,$a0,$t1
|
---|
1522 | orrs $a0,$a0,$t2 @ set flags
|
---|
1523 |
|
---|
1524 | @ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
|
---|
1525 | bne .Ladd_proceed
|
---|

.Ladd_double:
	ldr	$a_ptr,[sp,#32*18+20]
	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
	b	.Lpoint_double_shortcut
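
	@ Reached when both inputs are finite and identical: a_ptr is
	@ reloaded with the saved in1 pointer (the r1 pushed in the
	@ prologue sits at #32*18+20), and sp is advanced by the
	@ difference between this 32*18+16 frame and point_double's
	@ smaller one (the 32*(18-5)+16 adjustment leaves a 32*5 frame),
	@ so .Lpoint_double_shortcut runs in the layout it expects.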

.align	4
.Ladd_proceed:
	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$res_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*18+4]	@ ~in1infty
	ldr	r12,[sp,#32*18+8]	@ ~in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12		@ ~in1infty & ~in2infty
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12		@ in1infty & ~in2infty
	mvn	r12,r12			@ in2infty
	ldr	$r_ptr,[sp,#32*18+16]
___
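# The loop below is a constant-time three-way select over the 96-byte
# result. With the masks computed above it amounts to (sketch in C):
#
#	res = (res & r10) | (in2 & r11) | (in1 & r12);
#
# where r10 = ~in1infty & ~in2infty, r11 = in1infty & ~in2infty and
# r12 = in2infty. Exactly one mask is all-ones, so the returned point
# is the computed sum, in2 or in1 respectively, with no data-dependent
# branches.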
for($i=0;$i<96;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10		@ ~in1infty & ~in2infty
	and	r5,r5,r10
	and	r6,r6,r11		@ in1infty & ~in2infty
	and	r7,r7,r11
	and	r8,r8,r12		@ in2infty
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
.Ladd_done:
	add	sp,sp,#32*18+16+16	@ final +16 skips over the saved r0-r3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}		@ ldm into pc interworks on ARMv5+
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void	ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#					const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
my $Z1sqr = $S2;
# above map() describes stack layout with 15 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have a copy of
# the input arguments just below these temporary vectors.
# We use two of them for ~in1infty, ~in2infty.

my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
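# ONE_mont is 1 in the Montgomery domain, i.e. 2^256 mod P-256, stored
# as little-endian 32-bit limbs (-1 and -2 stand for 0xffffffff and
# 0xfffffffe). It supplies the implicit Z=1 coordinate of the affine
# in2 in the final conditional-move loop below.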

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*15

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*15+4]	@ ~in1infty

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	orr	r12,r12,r4
	orr	r12,r12,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	str	r12,[sp,#32*15+8]	@ ~in2infty
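
	@ in2 is affine, so there is no z coordinate to test; the point
	@ at infinity is encoded as all-zero (x,y), which is why the
	@ limbs of both coordinates were OR-ed into a single flag word
	@ above.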

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in2_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);

	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*15+4]	@ ~in1infty
	ldr	r12,[sp,#32*15+8]	@ ~in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12		@ ~in1infty & ~in2infty
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12		@ in1infty & ~in2infty
	mvn	r12,r12			@ in2infty
	ldr	$r_ptr,[sp,#32*15]
___
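# The x and y halves of the result are selected exactly as in
# ecp_nistz256_point_add above: a branch-free three-way choice between
# the computed sum, in2 and in1.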
for($i=0;$i<64;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10		@ ~in1infty & ~in2infty
	and	r5,r5,r10
	and	r6,r6,r11		@ in1infty & ~in2infty
	and	r7,r7,r11
	and	r8,r8,r12		@ in2infty
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
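# For the z limbs there is no in2_z to fall back on: the affine in2
# carries an implicit Z=1, so its Montgomery form is wired in as the
# ONE_mont immediates below, selected by the r11 mask when only in1
# is at infinity.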
for(;$i<96;$i+=8) {
my $j=($i-64)/4;
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_z
	ldmia	r3!,{r8-r9}		@ in1_z
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r11,#@ONE_mont[$j]
	and	r7,r11,#@ONE_mont[$j+1]
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
	add	sp,sp,#32*15+16		@ +16 skips over the saved r0-r3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}		@ ldm into pc interworks on ARMv5+
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}	}}}
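# Final pass over the generated code: evaluate `...` expressions into
# literal constants, and rewrite q<N>#lo/q<N>#hi register aliases into
# the corresponding d registers (e.g. q1#lo -> d2, q1#hi -> d3).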
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush