1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # ECP_NISTZ256 module for PPC64.
|
---|
18 | #
|
---|
19 | # August 2016.
|
---|
20 | #
|
---|
21 | # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
|
---|
22 | # http://eprint.iacr.org/2013/816.
|
---|
23 | #
|
---|
24 | # with/without -DECP_NISTZ256_ASM
|
---|
25 | # POWER7 +260-530%
|
---|
26 | # POWER8 +220-340%
|
---|
27 |
|
---|
28 | $flavour = shift;
|
---|
29 | while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
---|
30 |
|
---|
31 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
---|
32 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
|
---|
33 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
|
---|
34 | die "can't locate ppc-xlate.pl";
|
---|
35 |
|
---|
36 | open OUT,"| \"$^X\" $xlate $flavour $output";
|
---|
37 | *STDOUT=*OUT;
|
---|
38 |
|
---|
39 | my $sp="r1";
|
---|
40 |
|
---|
41 | {
|
---|
42 | my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
|
---|
43 | $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
|
---|
44 | map("r$_",(3..12,22..31));
|
---|
45 |
|
---|
46 | my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont
|
---|
47 |
|
---|
48 | $code.=<<___;
|
---|
49 | .machine "any"
|
---|
50 | .text
|
---|
51 | ___
|
---|
52 | ########################################################################
|
---|
53 | # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
|
---|
54 | #
|
---|
55 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
---|
56 | open TABLE,"<ecp_nistz256_table.c" or
|
---|
57 | open TABLE,"<${dir}../ecp_nistz256_table.c" or
|
---|
58 | die "failed to open ecp_nistz256_table.c:",$!;
|
---|
59 |
|
---|
60 | use integer;
|
---|
61 |
|
---|
62 | foreach(<TABLE>) {
|
---|
63 | s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
|
---|
64 | }
|
---|
65 | close TABLE;
|
---|
66 |
|
---|
67 | # See ecp_nistz256_table.c for explanation for why it's 64*16*37.
|
---|
68 | # 64*16*37-1 is because $#arr returns last valid index or @arr, not
|
---|
69 | # amount of elements.
|
---|
70 | die "insane number of elements" if ($#arr != 64*16*37-1);
|
---|
71 |
|
---|
72 | $code.=<<___;
|
---|
73 | .type ecp_nistz256_precomputed,\@object
|
---|
74 | .globl ecp_nistz256_precomputed
|
---|
75 | .align 12
|
---|
76 | ecp_nistz256_precomputed:
|
---|
77 | ___
|
---|
78 | ########################################################################
|
---|
79 | # this conversion smashes P256_POINT_AFFINE by individual bytes with
|
---|
80 | # 64 byte interval, similar to
|
---|
81 | # 1111222233334444
|
---|
82 | # 1234123412341234
|
---|
83 | for(1..37) {
|
---|
84 | @tbl = splice(@arr,0,64*16);
|
---|
85 | for($i=0;$i<64;$i++) {
|
---|
86 | undef @line;
|
---|
87 | for($j=0;$j<64;$j++) {
|
---|
88 | push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
|
---|
89 | }
|
---|
90 | $code.=".byte\t";
|
---|
91 | $code.=join(',',map { sprintf "0x%02x",$_} @line);
|
---|
92 | $code.="\n";
|
---|
93 | }
|
---|
94 | }
|
---|
95 |
|
---|
96 | $code.=<<___;
|
---|
97 | .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
|
---|
98 | .asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
99 |
|
---|
100 | # void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
|
---|
101 | # const BN_ULONG x2[4]);
|
---|
102 | .globl ecp_nistz256_mul_mont
|
---|
103 | .align 5
|
---|
104 | ecp_nistz256_mul_mont:
|
---|
105 | stdu $sp,-128($sp)
|
---|
106 | mflr r0
|
---|
107 | std r22,48($sp)
|
---|
108 | std r23,56($sp)
|
---|
109 | std r24,64($sp)
|
---|
110 | std r25,72($sp)
|
---|
111 | std r26,80($sp)
|
---|
112 | std r27,88($sp)
|
---|
113 | std r28,96($sp)
|
---|
114 | std r29,104($sp)
|
---|
115 | std r30,112($sp)
|
---|
116 | std r31,120($sp)
|
---|
117 |
|
---|
118 | ld $a0,0($ap)
|
---|
119 | ld $bi,0($bp)
|
---|
120 | ld $a1,8($ap)
|
---|
121 | ld $a2,16($ap)
|
---|
122 | ld $a3,24($ap)
|
---|
123 |
|
---|
124 | li $poly1,-1
|
---|
125 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
126 | li $poly3,1
|
---|
127 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
128 |
|
---|
129 | bl __ecp_nistz256_mul_mont
|
---|
130 |
|
---|
131 | mtlr r0
|
---|
132 | ld r22,48($sp)
|
---|
133 | ld r23,56($sp)
|
---|
134 | ld r24,64($sp)
|
---|
135 | ld r25,72($sp)
|
---|
136 | ld r26,80($sp)
|
---|
137 | ld r27,88($sp)
|
---|
138 | ld r28,96($sp)
|
---|
139 | ld r29,104($sp)
|
---|
140 | ld r30,112($sp)
|
---|
141 | ld r31,120($sp)
|
---|
142 | addi $sp,$sp,128
|
---|
143 | blr
|
---|
144 | .long 0
|
---|
145 | .byte 0,12,4,0,0x80,10,3,0
|
---|
146 | .long 0
|
---|
147 | .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
|
---|
148 |
|
---|
149 | # void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
---|
150 | .globl ecp_nistz256_sqr_mont
|
---|
151 | .align 4
|
---|
152 | ecp_nistz256_sqr_mont:
|
---|
153 | stdu $sp,-128($sp)
|
---|
154 | mflr r0
|
---|
155 | std r22,48($sp)
|
---|
156 | std r23,56($sp)
|
---|
157 | std r24,64($sp)
|
---|
158 | std r25,72($sp)
|
---|
159 | std r26,80($sp)
|
---|
160 | std r27,88($sp)
|
---|
161 | std r28,96($sp)
|
---|
162 | std r29,104($sp)
|
---|
163 | std r30,112($sp)
|
---|
164 | std r31,120($sp)
|
---|
165 |
|
---|
166 | ld $a0,0($ap)
|
---|
167 | ld $a1,8($ap)
|
---|
168 | ld $a2,16($ap)
|
---|
169 | ld $a3,24($ap)
|
---|
170 |
|
---|
171 | li $poly1,-1
|
---|
172 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
173 | li $poly3,1
|
---|
174 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
175 |
|
---|
176 | bl __ecp_nistz256_sqr_mont
|
---|
177 |
|
---|
178 | mtlr r0
|
---|
179 | ld r22,48($sp)
|
---|
180 | ld r23,56($sp)
|
---|
181 | ld r24,64($sp)
|
---|
182 | ld r25,72($sp)
|
---|
183 | ld r26,80($sp)
|
---|
184 | ld r27,88($sp)
|
---|
185 | ld r28,96($sp)
|
---|
186 | ld r29,104($sp)
|
---|
187 | ld r30,112($sp)
|
---|
188 | ld r31,120($sp)
|
---|
189 | addi $sp,$sp,128
|
---|
190 | blr
|
---|
191 | .long 0
|
---|
192 | .byte 0,12,4,0,0x80,10,2,0
|
---|
193 | .long 0
|
---|
194 | .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
|
---|
195 |
|
---|
196 | # void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
|
---|
197 | # const BN_ULONG x2[4]);
|
---|
198 | .globl ecp_nistz256_add
|
---|
199 | .align 4
|
---|
200 | ecp_nistz256_add:
|
---|
201 | stdu $sp,-128($sp)
|
---|
202 | mflr r0
|
---|
203 | std r28,96($sp)
|
---|
204 | std r29,104($sp)
|
---|
205 | std r30,112($sp)
|
---|
206 | std r31,120($sp)
|
---|
207 |
|
---|
208 | ld $acc0,0($ap)
|
---|
209 | ld $t0, 0($bp)
|
---|
210 | ld $acc1,8($ap)
|
---|
211 | ld $t1, 8($bp)
|
---|
212 | ld $acc2,16($ap)
|
---|
213 | ld $t2, 16($bp)
|
---|
214 | ld $acc3,24($ap)
|
---|
215 | ld $t3, 24($bp)
|
---|
216 |
|
---|
217 | li $poly1,-1
|
---|
218 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
219 | li $poly3,1
|
---|
220 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
221 |
|
---|
222 | bl __ecp_nistz256_add
|
---|
223 |
|
---|
224 | mtlr r0
|
---|
225 | ld r28,96($sp)
|
---|
226 | ld r29,104($sp)
|
---|
227 | ld r30,112($sp)
|
---|
228 | ld r31,120($sp)
|
---|
229 | addi $sp,$sp,128
|
---|
230 | blr
|
---|
231 | .long 0
|
---|
232 | .byte 0,12,4,0,0x80,4,3,0
|
---|
233 | .long 0
|
---|
234 | .size ecp_nistz256_add,.-ecp_nistz256_add
|
---|
235 |
|
---|
236 | # void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
---|
237 | .globl ecp_nistz256_div_by_2
|
---|
238 | .align 4
|
---|
239 | ecp_nistz256_div_by_2:
|
---|
240 | stdu $sp,-128($sp)
|
---|
241 | mflr r0
|
---|
242 | std r28,96($sp)
|
---|
243 | std r29,104($sp)
|
---|
244 | std r30,112($sp)
|
---|
245 | std r31,120($sp)
|
---|
246 |
|
---|
247 | ld $acc0,0($ap)
|
---|
248 | ld $acc1,8($ap)
|
---|
249 | ld $acc2,16($ap)
|
---|
250 | ld $acc3,24($ap)
|
---|
251 |
|
---|
252 | li $poly1,-1
|
---|
253 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
254 | li $poly3,1
|
---|
255 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
256 |
|
---|
257 | bl __ecp_nistz256_div_by_2
|
---|
258 |
|
---|
259 | mtlr r0
|
---|
260 | ld r28,96($sp)
|
---|
261 | ld r29,104($sp)
|
---|
262 | ld r30,112($sp)
|
---|
263 | ld r31,120($sp)
|
---|
264 | addi $sp,$sp,128
|
---|
265 | blr
|
---|
266 | .long 0
|
---|
267 | .byte 0,12,4,0,0x80,4,2,0
|
---|
268 | .long 0
|
---|
269 | .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
|
---|
270 |
|
---|
271 | # void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
---|
272 | .globl ecp_nistz256_mul_by_2
|
---|
273 | .align 4
|
---|
274 | ecp_nistz256_mul_by_2:
|
---|
275 | stdu $sp,-128($sp)
|
---|
276 | mflr r0
|
---|
277 | std r28,96($sp)
|
---|
278 | std r29,104($sp)
|
---|
279 | std r30,112($sp)
|
---|
280 | std r31,120($sp)
|
---|
281 |
|
---|
282 | ld $acc0,0($ap)
|
---|
283 | ld $acc1,8($ap)
|
---|
284 | ld $acc2,16($ap)
|
---|
285 | ld $acc3,24($ap)
|
---|
286 |
|
---|
287 | mr $t0,$acc0
|
---|
288 | mr $t1,$acc1
|
---|
289 | mr $t2,$acc2
|
---|
290 | mr $t3,$acc3
|
---|
291 |
|
---|
292 | li $poly1,-1
|
---|
293 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
294 | li $poly3,1
|
---|
295 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
296 |
|
---|
297 | bl __ecp_nistz256_add # ret = a+a // 2*a
|
---|
298 |
|
---|
299 | mtlr r0
|
---|
300 | ld r28,96($sp)
|
---|
301 | ld r29,104($sp)
|
---|
302 | ld r30,112($sp)
|
---|
303 | ld r31,120($sp)
|
---|
304 | addi $sp,$sp,128
|
---|
305 | blr
|
---|
306 | .long 0
|
---|
307 | .byte 0,12,4,0,0x80,4,3,0
|
---|
308 | .long 0
|
---|
309 | .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
|
---|
310 |
|
---|
311 | # void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
---|
312 | .globl ecp_nistz256_mul_by_3
|
---|
313 | .align 4
|
---|
314 | ecp_nistz256_mul_by_3:
|
---|
315 | stdu $sp,-128($sp)
|
---|
316 | mflr r0
|
---|
317 | std r28,96($sp)
|
---|
318 | std r29,104($sp)
|
---|
319 | std r30,112($sp)
|
---|
320 | std r31,120($sp)
|
---|
321 |
|
---|
322 | ld $acc0,0($ap)
|
---|
323 | ld $acc1,8($ap)
|
---|
324 | ld $acc2,16($ap)
|
---|
325 | ld $acc3,24($ap)
|
---|
326 |
|
---|
327 | mr $t0,$acc0
|
---|
328 | std $acc0,64($sp)
|
---|
329 | mr $t1,$acc1
|
---|
330 | std $acc1,72($sp)
|
---|
331 | mr $t2,$acc2
|
---|
332 | std $acc2,80($sp)
|
---|
333 | mr $t3,$acc3
|
---|
334 | std $acc3,88($sp)
|
---|
335 |
|
---|
336 | li $poly1,-1
|
---|
337 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
338 | li $poly3,1
|
---|
339 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
340 |
|
---|
341 | bl __ecp_nistz256_add # ret = a+a // 2*a
|
---|
342 |
|
---|
343 | ld $t0,64($sp)
|
---|
344 | ld $t1,72($sp)
|
---|
345 | ld $t2,80($sp)
|
---|
346 | ld $t3,88($sp)
|
---|
347 |
|
---|
348 | bl __ecp_nistz256_add # ret += a // 2*a+a=3*a
|
---|
349 |
|
---|
350 | mtlr r0
|
---|
351 | ld r28,96($sp)
|
---|
352 | ld r29,104($sp)
|
---|
353 | ld r30,112($sp)
|
---|
354 | ld r31,120($sp)
|
---|
355 | addi $sp,$sp,128
|
---|
356 | blr
|
---|
357 | .long 0
|
---|
358 | .byte 0,12,4,0,0x80,4,2,0
|
---|
359 | .long 0
|
---|
360 | .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
|
---|
361 |
|
---|
362 | # void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
|
---|
363 | # const BN_ULONG x2[4]);
|
---|
364 | .globl ecp_nistz256_sub
|
---|
365 | .align 4
|
---|
366 | ecp_nistz256_sub:
|
---|
367 | stdu $sp,-128($sp)
|
---|
368 | mflr r0
|
---|
369 | std r28,96($sp)
|
---|
370 | std r29,104($sp)
|
---|
371 | std r30,112($sp)
|
---|
372 | std r31,120($sp)
|
---|
373 |
|
---|
374 | ld $acc0,0($ap)
|
---|
375 | ld $acc1,8($ap)
|
---|
376 | ld $acc2,16($ap)
|
---|
377 | ld $acc3,24($ap)
|
---|
378 |
|
---|
379 | li $poly1,-1
|
---|
380 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
381 | li $poly3,1
|
---|
382 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
383 |
|
---|
384 | bl __ecp_nistz256_sub_from
|
---|
385 |
|
---|
386 | mtlr r0
|
---|
387 | ld r28,96($sp)
|
---|
388 | ld r29,104($sp)
|
---|
389 | ld r30,112($sp)
|
---|
390 | ld r31,120($sp)
|
---|
391 | addi $sp,$sp,128
|
---|
392 | blr
|
---|
393 | .long 0
|
---|
394 | .byte 0,12,4,0,0x80,4,3,0
|
---|
395 | .long 0
|
---|
396 | .size ecp_nistz256_sub,.-ecp_nistz256_sub
|
---|
397 |
|
---|
398 | # void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
|
---|
399 | .globl ecp_nistz256_neg
|
---|
400 | .align 4
|
---|
401 | ecp_nistz256_neg:
|
---|
402 | stdu $sp,-128($sp)
|
---|
403 | mflr r0
|
---|
404 | std r28,96($sp)
|
---|
405 | std r29,104($sp)
|
---|
406 | std r30,112($sp)
|
---|
407 | std r31,120($sp)
|
---|
408 |
|
---|
409 | mr $bp,$ap
|
---|
410 | li $acc0,0
|
---|
411 | li $acc1,0
|
---|
412 | li $acc2,0
|
---|
413 | li $acc3,0
|
---|
414 |
|
---|
415 | li $poly1,-1
|
---|
416 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
417 | li $poly3,1
|
---|
418 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
419 |
|
---|
420 | bl __ecp_nistz256_sub_from
|
---|
421 |
|
---|
422 | mtlr r0
|
---|
423 | ld r28,96($sp)
|
---|
424 | ld r29,104($sp)
|
---|
425 | ld r30,112($sp)
|
---|
426 | ld r31,120($sp)
|
---|
427 | addi $sp,$sp,128
|
---|
428 | blr
|
---|
429 | .long 0
|
---|
430 | .byte 0,12,4,0,0x80,4,2,0
|
---|
431 | .long 0
|
---|
432 | .size ecp_nistz256_neg,.-ecp_nistz256_neg
|
---|
433 |
|
---|
434 | # note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
|
---|
435 | # to $a0-$a3 and b[0] - to $bi
|
---|
436 | .type __ecp_nistz256_mul_mont,\@function
|
---|
437 | .align 4
|
---|
438 | __ecp_nistz256_mul_mont:
|
---|
439 | mulld $acc0,$a0,$bi # a[0]*b[0]
|
---|
440 | mulhdu $t0,$a0,$bi
|
---|
441 |
|
---|
442 | mulld $acc1,$a1,$bi # a[1]*b[0]
|
---|
443 | mulhdu $t1,$a1,$bi
|
---|
444 |
|
---|
445 | mulld $acc2,$a2,$bi # a[2]*b[0]
|
---|
446 | mulhdu $t2,$a2,$bi
|
---|
447 |
|
---|
448 | mulld $acc3,$a3,$bi # a[3]*b[0]
|
---|
449 | mulhdu $t3,$a3,$bi
|
---|
450 | ld $bi,8($bp) # b[1]
|
---|
451 |
|
---|
452 | addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
|
---|
453 | sldi $t0,$acc0,32
|
---|
454 | adde $acc2,$acc2,$t1
|
---|
455 | srdi $t1,$acc0,32
|
---|
456 | adde $acc3,$acc3,$t2
|
---|
457 | addze $acc4,$t3
|
---|
458 | li $acc5,0
|
---|
459 | ___
|
---|
460 | for($i=1;$i<4;$i++) {
|
---|
461 | ################################################################
|
---|
462 | # Reduction iteration is normally performed by accumulating
|
---|
463 | # result of multiplication of modulus by "magic" digit [and
|
---|
464 | # omitting least significant word, which is guaranteed to
|
---|
465 | # be 0], but thanks to special form of modulus and "magic"
|
---|
466 | # digit being equal to least significant word, it can be
|
---|
467 | # performed with additions and subtractions alone. Indeed:
|
---|
468 | #
|
---|
469 | # ffff0001.00000000.0000ffff.ffffffff
|
---|
470 | # * abcdefgh
|
---|
471 | # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
|
---|
472 | #
|
---|
473 | # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
|
---|
474 | # rewrite above as:
|
---|
475 | #
|
---|
476 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
|
---|
477 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
|
---|
478 | # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
|
---|
479 | #
|
---|
480 | # or marking redundant operations:
|
---|
481 | #
|
---|
482 | # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
|
---|
483 | # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
|
---|
484 | # - 0000abcd.efgh0000.--------.--------.--------
|
---|
485 |
|
---|
486 | $code.=<<___;
|
---|
487 | subfc $t2,$t0,$acc0 # "*0xffff0001"
|
---|
488 | subfe $t3,$t1,$acc0
|
---|
489 | addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
|
---|
490 | adde $acc1,$acc2,$t1
|
---|
491 | adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
|
---|
492 | adde $acc3,$acc4,$t3
|
---|
493 | addze $acc4,$acc5
|
---|
494 |
|
---|
495 | mulld $t0,$a0,$bi # lo(a[0]*b[i])
|
---|
496 | mulld $t1,$a1,$bi # lo(a[1]*b[i])
|
---|
497 | mulld $t2,$a2,$bi # lo(a[2]*b[i])
|
---|
498 | mulld $t3,$a3,$bi # lo(a[3]*b[i])
|
---|
499 | addc $acc0,$acc0,$t0 # accumulate low parts of multiplication
|
---|
500 | mulhdu $t0,$a0,$bi # hi(a[0]*b[i])
|
---|
501 | adde $acc1,$acc1,$t1
|
---|
502 | mulhdu $t1,$a1,$bi # hi(a[1]*b[i])
|
---|
503 | adde $acc2,$acc2,$t2
|
---|
504 | mulhdu $t2,$a2,$bi # hi(a[2]*b[i])
|
---|
505 | adde $acc3,$acc3,$t3
|
---|
506 | mulhdu $t3,$a3,$bi # hi(a[3]*b[i])
|
---|
507 | addze $acc4,$acc4
|
---|
508 | ___
|
---|
509 | $code.=<<___ if ($i<3);
|
---|
510 | ld $bi,8*($i+1)($bp) # b[$i+1]
|
---|
511 | ___
|
---|
512 | $code.=<<___;
|
---|
513 | addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
|
---|
514 | sldi $t0,$acc0,32
|
---|
515 | adde $acc2,$acc2,$t1
|
---|
516 | srdi $t1,$acc0,32
|
---|
517 | adde $acc3,$acc3,$t2
|
---|
518 | adde $acc4,$acc4,$t3
|
---|
519 | li $acc5,0
|
---|
520 | addze $acc5,$acc5
|
---|
521 | ___
|
---|
522 | }
|
---|
523 | $code.=<<___;
|
---|
524 | # last reduction
|
---|
525 | subfc $t2,$t0,$acc0 # "*0xffff0001"
|
---|
526 | subfe $t3,$t1,$acc0
|
---|
527 | addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
|
---|
528 | adde $acc1,$acc2,$t1
|
---|
529 | adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
|
---|
530 | adde $acc3,$acc4,$t3
|
---|
531 | addze $acc4,$acc5
|
---|
532 |
|
---|
533 | li $t2,0
|
---|
534 | addic $acc0,$acc0,1 # ret -= modulus
|
---|
535 | subfe $acc1,$poly1,$acc1
|
---|
536 | subfe $acc2,$t2,$acc2
|
---|
537 | subfe $acc3,$poly3,$acc3
|
---|
538 | subfe $acc4,$t2,$acc4
|
---|
539 |
|
---|
540 | addc $acc0,$acc0,$acc4 # ret += modulus if borrow
|
---|
541 | and $t1,$poly1,$acc4
|
---|
542 | and $t3,$poly3,$acc4
|
---|
543 | adde $acc1,$acc1,$t1
|
---|
544 | addze $acc2,$acc2
|
---|
545 | adde $acc3,$acc3,$t3
|
---|
546 |
|
---|
547 | std $acc0,0($rp)
|
---|
548 | std $acc1,8($rp)
|
---|
549 | std $acc2,16($rp)
|
---|
550 | std $acc3,24($rp)
|
---|
551 |
|
---|
552 | blr
|
---|
553 | .long 0
|
---|
554 | .byte 0,12,0x14,0,0,0,1,0
|
---|
555 | .long 0
|
---|
556 | .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
|
---|
557 |
|
---|
558 | # note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
|
---|
559 | # to $a0-$a3
|
---|
560 | .type __ecp_nistz256_sqr_mont,\@function
|
---|
561 | .align 4
|
---|
562 | __ecp_nistz256_sqr_mont:
|
---|
563 | ################################################################
|
---|
564 | # | | | | | |a1*a0| |
|
---|
565 | # | | | | |a2*a0| | |
|
---|
566 | # | |a3*a2|a3*a0| | | |
|
---|
567 | # | | | |a2*a1| | | |
|
---|
568 | # | | |a3*a1| | | | |
|
---|
569 | # *| | | | | | | | 2|
|
---|
570 | # +|a3*a3|a2*a2|a1*a1|a0*a0|
|
---|
571 | # |--+--+--+--+--+--+--+--|
|
---|
572 | # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
|
---|
573 | #
|
---|
574 | # "can't overflow" below mark carrying into high part of
|
---|
575 | # multiplication result, which can't overflow, because it
|
---|
576 | # can never be all ones.
|
---|
577 |
|
---|
578 | mulld $acc1,$a1,$a0 # a[1]*a[0]
|
---|
579 | mulhdu $t1,$a1,$a0
|
---|
580 | mulld $acc2,$a2,$a0 # a[2]*a[0]
|
---|
581 | mulhdu $t2,$a2,$a0
|
---|
582 | mulld $acc3,$a3,$a0 # a[3]*a[0]
|
---|
583 | mulhdu $acc4,$a3,$a0
|
---|
584 |
|
---|
585 | addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
|
---|
586 | mulld $t0,$a2,$a1 # a[2]*a[1]
|
---|
587 | mulhdu $t1,$a2,$a1
|
---|
588 | adde $acc3,$acc3,$t2
|
---|
589 | mulld $t2,$a3,$a1 # a[3]*a[1]
|
---|
590 | mulhdu $t3,$a3,$a1
|
---|
591 | addze $acc4,$acc4 # can't overflow
|
---|
592 |
|
---|
593 | mulld $acc5,$a3,$a2 # a[3]*a[2]
|
---|
594 | mulhdu $acc6,$a3,$a2
|
---|
595 |
|
---|
596 | addc $t1,$t1,$t2 # accumulate high parts of multiplication
|
---|
597 | addze $t2,$t3 # can't overflow
|
---|
598 |
|
---|
599 | addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
|
---|
600 | adde $acc4,$acc4,$t1
|
---|
601 | adde $acc5,$acc5,$t2
|
---|
602 | addze $acc6,$acc6 # can't overflow
|
---|
603 |
|
---|
604 | addc $acc1,$acc1,$acc1 # acc[1-6]*=2
|
---|
605 | adde $acc2,$acc2,$acc2
|
---|
606 | adde $acc3,$acc3,$acc3
|
---|
607 | adde $acc4,$acc4,$acc4
|
---|
608 | adde $acc5,$acc5,$acc5
|
---|
609 | adde $acc6,$acc6,$acc6
|
---|
610 | li $acc7,0
|
---|
611 | addze $acc7,$acc7
|
---|
612 |
|
---|
613 | mulld $acc0,$a0,$a0 # a[0]*a[0]
|
---|
614 | mulhdu $a0,$a0,$a0
|
---|
615 | mulld $t1,$a1,$a1 # a[1]*a[1]
|
---|
616 | mulhdu $a1,$a1,$a1
|
---|
617 | mulld $t2,$a2,$a2 # a[2]*a[2]
|
---|
618 | mulhdu $a2,$a2,$a2
|
---|
619 | mulld $t3,$a3,$a3 # a[3]*a[3]
|
---|
620 | mulhdu $a3,$a3,$a3
|
---|
621 | addc $acc1,$acc1,$a0 # +a[i]*a[i]
|
---|
622 | sldi $t0,$acc0,32
|
---|
623 | adde $acc2,$acc2,$t1
|
---|
624 | srdi $t1,$acc0,32
|
---|
625 | adde $acc3,$acc3,$a1
|
---|
626 | adde $acc4,$acc4,$t2
|
---|
627 | adde $acc5,$acc5,$a2
|
---|
628 | adde $acc6,$acc6,$t3
|
---|
629 | adde $acc7,$acc7,$a3
|
---|
630 | ___
|
---|
631 | for($i=0;$i<3;$i++) { # reductions, see commentary in
|
---|
632 | # multiplication for details
|
---|
633 | $code.=<<___;
|
---|
634 | subfc $t2,$t0,$acc0 # "*0xffff0001"
|
---|
635 | subfe $t3,$t1,$acc0
|
---|
636 | addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
|
---|
637 | sldi $t0,$acc0,32
|
---|
638 | adde $acc1,$acc2,$t1
|
---|
639 | srdi $t1,$acc0,32
|
---|
640 | adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
|
---|
641 | addze $acc3,$t3 # can't overflow
|
---|
642 | ___
|
---|
643 | }
|
---|
644 | $code.=<<___;
|
---|
645 | subfc $t2,$t0,$acc0 # "*0xffff0001"
|
---|
646 | subfe $t3,$t1,$acc0
|
---|
647 | addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
|
---|
648 | adde $acc1,$acc2,$t1
|
---|
649 | adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
|
---|
650 | addze $acc3,$t3 # can't overflow
|
---|
651 |
|
---|
652 | addc $acc0,$acc0,$acc4 # accumulate upper half
|
---|
653 | adde $acc1,$acc1,$acc5
|
---|
654 | adde $acc2,$acc2,$acc6
|
---|
655 | adde $acc3,$acc3,$acc7
|
---|
656 | li $t2,0
|
---|
657 | addze $acc4,$t2
|
---|
658 |
|
---|
659 | addic $acc0,$acc0,1 # ret -= modulus
|
---|
660 | subfe $acc1,$poly1,$acc1
|
---|
661 | subfe $acc2,$t2,$acc2
|
---|
662 | subfe $acc3,$poly3,$acc3
|
---|
663 | subfe $acc4,$t2,$acc4
|
---|
664 |
|
---|
665 | addc $acc0,$acc0,$acc4 # ret += modulus if borrow
|
---|
666 | and $t1,$poly1,$acc4
|
---|
667 | and $t3,$poly3,$acc4
|
---|
668 | adde $acc1,$acc1,$t1
|
---|
669 | addze $acc2,$acc2
|
---|
670 | adde $acc3,$acc3,$t3
|
---|
671 |
|
---|
672 | std $acc0,0($rp)
|
---|
673 | std $acc1,8($rp)
|
---|
674 | std $acc2,16($rp)
|
---|
675 | std $acc3,24($rp)
|
---|
676 |
|
---|
677 | blr
|
---|
678 | .long 0
|
---|
679 | .byte 0,12,0x14,0,0,0,1,0
|
---|
680 | .long 0
|
---|
681 | .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
|
---|
682 |
|
---|
683 | # Note that __ecp_nistz256_add expects both input vectors pre-loaded to
|
---|
684 | # $a0-$a3 and $t0-$t3. This is done because it's used in multiple
|
---|
685 | # contexts, e.g. in multiplication by 2 and 3...
|
---|
686 | .type __ecp_nistz256_add,\@function
|
---|
687 | .align 4
|
---|
688 | __ecp_nistz256_add:
|
---|
689 | addc $acc0,$acc0,$t0 # ret = a+b
|
---|
690 | adde $acc1,$acc1,$t1
|
---|
691 | adde $acc2,$acc2,$t2
|
---|
692 | li $t2,0
|
---|
693 | adde $acc3,$acc3,$t3
|
---|
694 | addze $t0,$t2
|
---|
695 |
|
---|
696 | # if a+b >= modulus, subtract modulus
|
---|
697 | #
|
---|
698 | # But since comparison implies subtraction, we subtract
|
---|
699 | # modulus and then add it back if subtraction borrowed.
|
---|
700 |
|
---|
701 | subic $acc0,$acc0,-1
|
---|
702 | subfe $acc1,$poly1,$acc1
|
---|
703 | subfe $acc2,$t2,$acc2
|
---|
704 | subfe $acc3,$poly3,$acc3
|
---|
705 | subfe $t0,$t2,$t0
|
---|
706 |
|
---|
707 | addc $acc0,$acc0,$t0
|
---|
708 | and $t1,$poly1,$t0
|
---|
709 | and $t3,$poly3,$t0
|
---|
710 | adde $acc1,$acc1,$t1
|
---|
711 | addze $acc2,$acc2
|
---|
712 | adde $acc3,$acc3,$t3
|
---|
713 |
|
---|
714 | std $acc0,0($rp)
|
---|
715 | std $acc1,8($rp)
|
---|
716 | std $acc2,16($rp)
|
---|
717 | std $acc3,24($rp)
|
---|
718 |
|
---|
719 | blr
|
---|
720 | .long 0
|
---|
721 | .byte 0,12,0x14,0,0,0,3,0
|
---|
722 | .long 0
|
---|
723 | .size __ecp_nistz256_add,.-__ecp_nistz256_add
|
---|
724 |
|
---|
725 | .type __ecp_nistz256_sub_from,\@function
|
---|
726 | .align 4
|
---|
727 | __ecp_nistz256_sub_from:
|
---|
728 | ld $t0,0($bp)
|
---|
729 | ld $t1,8($bp)
|
---|
730 | ld $t2,16($bp)
|
---|
731 | ld $t3,24($bp)
|
---|
732 | subfc $acc0,$t0,$acc0 # ret = a-b
|
---|
733 | subfe $acc1,$t1,$acc1
|
---|
734 | subfe $acc2,$t2,$acc2
|
---|
735 | subfe $acc3,$t3,$acc3
|
---|
736 | subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
|
---|
737 |
|
---|
738 | # if a-b borrowed, add modulus
|
---|
739 |
|
---|
740 | addc $acc0,$acc0,$t0 # ret -= modulus & t0
|
---|
741 | and $t1,$poly1,$t0
|
---|
742 | and $t3,$poly3,$t0
|
---|
743 | adde $acc1,$acc1,$t1
|
---|
744 | addze $acc2,$acc2
|
---|
745 | adde $acc3,$acc3,$t3
|
---|
746 |
|
---|
747 | std $acc0,0($rp)
|
---|
748 | std $acc1,8($rp)
|
---|
749 | std $acc2,16($rp)
|
---|
750 | std $acc3,24($rp)
|
---|
751 |
|
---|
752 | blr
|
---|
753 | .long 0
|
---|
754 | .byte 0,12,0x14,0,0,0,3,0
|
---|
755 | .long 0
|
---|
756 | .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
|
---|
757 |
|
---|
758 | .type __ecp_nistz256_sub_morf,\@function
|
---|
759 | .align 4
|
---|
760 | __ecp_nistz256_sub_morf:
|
---|
761 | ld $t0,0($bp)
|
---|
762 | ld $t1,8($bp)
|
---|
763 | ld $t2,16($bp)
|
---|
764 | ld $t3,24($bp)
|
---|
765 | subfc $acc0,$acc0,$t0 # ret = b-a
|
---|
766 | subfe $acc1,$acc1,$t1
|
---|
767 | subfe $acc2,$acc2,$t2
|
---|
768 | subfe $acc3,$acc3,$t3
|
---|
769 | subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
|
---|
770 |
|
---|
771 | # if b-a borrowed, add modulus
|
---|
772 |
|
---|
773 | addc $acc0,$acc0,$t0 # ret -= modulus & t0
|
---|
774 | and $t1,$poly1,$t0
|
---|
775 | and $t3,$poly3,$t0
|
---|
776 | adde $acc1,$acc1,$t1
|
---|
777 | addze $acc2,$acc2
|
---|
778 | adde $acc3,$acc3,$t3
|
---|
779 |
|
---|
780 | std $acc0,0($rp)
|
---|
781 | std $acc1,8($rp)
|
---|
782 | std $acc2,16($rp)
|
---|
783 | std $acc3,24($rp)
|
---|
784 |
|
---|
785 | blr
|
---|
786 | .long 0
|
---|
787 | .byte 0,12,0x14,0,0,0,3,0
|
---|
788 | .long 0
|
---|
789 | .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
|
---|
790 |
|
---|
791 | .type __ecp_nistz256_div_by_2,\@function
|
---|
792 | .align 4
|
---|
793 | __ecp_nistz256_div_by_2:
|
---|
794 | andi. $t0,$acc0,1
|
---|
795 | addic $acc0,$acc0,-1 # a += modulus
|
---|
796 | neg $t0,$t0
|
---|
797 | adde $acc1,$acc1,$poly1
|
---|
798 | not $t0,$t0
|
---|
799 | addze $acc2,$acc2
|
---|
800 | li $t2,0
|
---|
801 | adde $acc3,$acc3,$poly3
|
---|
802 | and $t1,$poly1,$t0
|
---|
803 | addze $ap,$t2 # ap = carry
|
---|
804 | and $t3,$poly3,$t0
|
---|
805 |
|
---|
806 | subfc $acc0,$t0,$acc0 # a -= modulus if a was even
|
---|
807 | subfe $acc1,$t1,$acc1
|
---|
808 | subfe $acc2,$t2,$acc2
|
---|
809 | subfe $acc3,$t3,$acc3
|
---|
810 | subfe $ap, $t2,$ap
|
---|
811 |
|
---|
812 | srdi $acc0,$acc0,1
|
---|
813 | sldi $t0,$acc1,63
|
---|
814 | srdi $acc1,$acc1,1
|
---|
815 | sldi $t1,$acc2,63
|
---|
816 | srdi $acc2,$acc2,1
|
---|
817 | sldi $t2,$acc3,63
|
---|
818 | srdi $acc3,$acc3,1
|
---|
819 | sldi $t3,$ap,63
|
---|
820 | or $acc0,$acc0,$t0
|
---|
821 | or $acc1,$acc1,$t1
|
---|
822 | or $acc2,$acc2,$t2
|
---|
823 | or $acc3,$acc3,$t3
|
---|
824 |
|
---|
825 | std $acc0,0($rp)
|
---|
826 | std $acc1,8($rp)
|
---|
827 | std $acc2,16($rp)
|
---|
828 | std $acc3,24($rp)
|
---|
829 |
|
---|
830 | blr
|
---|
831 | .long 0
|
---|
832 | .byte 0,12,0x14,0,0,0,1,0
|
---|
833 | .long 0
|
---|
834 | .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
|
---|
835 | ___
|
---|
836 | ########################################################################
|
---|
837 | # following subroutines are "literal" implementation of those found in
|
---|
838 | # ecp_nistz256.c
|
---|
839 | #
|
---|
840 | ########################################################################
|
---|
841 | # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
|
---|
842 | #
|
---|
843 | if (1) {
|
---|
844 | my $FRAME=64+32*4+12*8;
|
---|
845 | my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
|
---|
846 | # above map() describes stack layout with 4 temporary
|
---|
847 | # 256-bit vectors on top.
|
---|
848 | my ($rp_real,$ap_real) = map("r$_",(20,21));
|
---|
849 |
|
---|
850 | $code.=<<___;
|
---|
851 | .globl ecp_nistz256_point_double
|
---|
852 | .align 5
|
---|
853 | ecp_nistz256_point_double:
|
---|
854 | stdu $sp,-$FRAME($sp)
|
---|
855 | mflr r0
|
---|
856 | std r20,$FRAME-8*12($sp)
|
---|
857 | std r21,$FRAME-8*11($sp)
|
---|
858 | std r22,$FRAME-8*10($sp)
|
---|
859 | std r23,$FRAME-8*9($sp)
|
---|
860 | std r24,$FRAME-8*8($sp)
|
---|
861 | std r25,$FRAME-8*7($sp)
|
---|
862 | std r26,$FRAME-8*6($sp)
|
---|
863 | std r27,$FRAME-8*5($sp)
|
---|
864 | std r28,$FRAME-8*4($sp)
|
---|
865 | std r29,$FRAME-8*3($sp)
|
---|
866 | std r30,$FRAME-8*2($sp)
|
---|
867 | std r31,$FRAME-8*1($sp)
|
---|
868 |
|
---|
869 | li $poly1,-1
|
---|
870 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
871 | li $poly3,1
|
---|
872 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
873 | .Ldouble_shortcut:
|
---|
874 | ld $acc0,32($ap)
|
---|
875 | ld $acc1,40($ap)
|
---|
876 | ld $acc2,48($ap)
|
---|
877 | ld $acc3,56($ap)
|
---|
878 | mr $t0,$acc0
|
---|
879 | mr $t1,$acc1
|
---|
880 | mr $t2,$acc2
|
---|
881 | mr $t3,$acc3
|
---|
882 | ld $a0,64($ap) # forward load for p256_sqr_mont
|
---|
883 | ld $a1,72($ap)
|
---|
884 | ld $a2,80($ap)
|
---|
885 | ld $a3,88($ap)
|
---|
886 | mr $rp_real,$rp
|
---|
887 | mr $ap_real,$ap
|
---|
888 | addi $rp,$sp,$S
|
---|
889 | bl __ecp_nistz256_add # p256_mul_by_2(S, in_y);
|
---|
890 |
|
---|
891 | addi $rp,$sp,$Zsqr
|
---|
892 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z);
|
---|
893 |
|
---|
894 | ld $t0,0($ap_real)
|
---|
895 | ld $t1,8($ap_real)
|
---|
896 | ld $t2,16($ap_real)
|
---|
897 | ld $t3,24($ap_real)
|
---|
898 | mr $a0,$acc0 # put Zsqr aside for p256_sub
|
---|
899 | mr $a1,$acc1
|
---|
900 | mr $a2,$acc2
|
---|
901 | mr $a3,$acc3
|
---|
902 | addi $rp,$sp,$M
|
---|
903 | bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x);
|
---|
904 |
|
---|
905 | addi $bp,$ap_real,0
|
---|
906 | mr $acc0,$a0 # restore Zsqr
|
---|
907 | mr $acc1,$a1
|
---|
908 | mr $acc2,$a2
|
---|
909 | mr $acc3,$a3
|
---|
910 | ld $a0,$S+0($sp) # forward load for p256_sqr_mont
|
---|
911 | ld $a1,$S+8($sp)
|
---|
912 | ld $a2,$S+16($sp)
|
---|
913 | ld $a3,$S+24($sp)
|
---|
914 | addi $rp,$sp,$Zsqr
|
---|
915 | bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr);
|
---|
916 |
|
---|
917 | addi $rp,$sp,$S
|
---|
918 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S);
|
---|
919 |
|
---|
920 | ld $bi,32($ap_real)
|
---|
921 | ld $a0,64($ap_real)
|
---|
922 | ld $a1,72($ap_real)
|
---|
923 | ld $a2,80($ap_real)
|
---|
924 | ld $a3,88($ap_real)
|
---|
925 | addi $bp,$ap_real,32
|
---|
926 | addi $rp,$sp,$tmp0
|
---|
927 | bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y);
|
---|
928 |
|
---|
929 | mr $t0,$acc0
|
---|
930 | mr $t1,$acc1
|
---|
931 | mr $t2,$acc2
|
---|
932 | mr $t3,$acc3
|
---|
933 | ld $a0,$S+0($sp) # forward load for p256_sqr_mont
|
---|
934 | ld $a1,$S+8($sp)
|
---|
935 | ld $a2,$S+16($sp)
|
---|
936 | ld $a3,$S+24($sp)
|
---|
937 | addi $rp,$rp_real,64
|
---|
938 | bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0);
|
---|
939 |
|
---|
940 | addi $rp,$sp,$tmp0
|
---|
941 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S);
|
---|
942 |
|
---|
943 | ld $bi,$Zsqr($sp) # forward load for p256_mul_mont
|
---|
944 | ld $a0,$M+0($sp)
|
---|
945 | ld $a1,$M+8($sp)
|
---|
946 | ld $a2,$M+16($sp)
|
---|
947 | ld $a3,$M+24($sp)
|
---|
948 | addi $rp,$rp_real,32
|
---|
949 | bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0);
|
---|
950 |
|
---|
951 | addi $bp,$sp,$Zsqr
|
---|
952 | addi $rp,$sp,$M
|
---|
953 | bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr);
|
---|
954 |
|
---|
955 | mr $t0,$acc0 # duplicate M
|
---|
956 | mr $t1,$acc1
|
---|
957 | mr $t2,$acc2
|
---|
958 | mr $t3,$acc3
|
---|
959 | mr $a0,$acc0 # put M aside
|
---|
960 | mr $a1,$acc1
|
---|
961 | mr $a2,$acc2
|
---|
962 | mr $a3,$acc3
|
---|
963 | addi $rp,$sp,$M
|
---|
964 | bl __ecp_nistz256_add
|
---|
965 | mr $t0,$a0 # restore M
|
---|
966 | mr $t1,$a1
|
---|
967 | mr $t2,$a2
|
---|
968 | mr $t3,$a3
|
---|
969 | ld $bi,0($ap_real) # forward load for p256_mul_mont
|
---|
970 | ld $a0,$S+0($sp)
|
---|
971 | ld $a1,$S+8($sp)
|
---|
972 | ld $a2,$S+16($sp)
|
---|
973 | ld $a3,$S+24($sp)
|
---|
974 | bl __ecp_nistz256_add # p256_mul_by_3(M, M);
|
---|
975 |
|
---|
976 | addi $bp,$ap_real,0
|
---|
977 | addi $rp,$sp,$S
|
---|
978 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x);
|
---|
979 |
|
---|
980 | mr $t0,$acc0
|
---|
981 | mr $t1,$acc1
|
---|
982 | mr $t2,$acc2
|
---|
983 | mr $t3,$acc3
|
---|
984 | ld $a0,$M+0($sp) # forward load for p256_sqr_mont
|
---|
985 | ld $a1,$M+8($sp)
|
---|
986 | ld $a2,$M+16($sp)
|
---|
987 | ld $a3,$M+24($sp)
|
---|
988 | addi $rp,$sp,$tmp0
|
---|
989 | bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S);
|
---|
990 |
|
---|
991 | addi $rp,$rp_real,0
|
---|
992 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M);
|
---|
993 |
|
---|
994 | addi $bp,$sp,$tmp0
|
---|
995 | bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0);
|
---|
996 |
|
---|
997 | addi $bp,$sp,$S
|
---|
998 | addi $rp,$sp,$S
|
---|
999 | bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x);
|
---|
1000 |
|
---|
1001 | ld $bi,$M($sp)
|
---|
1002 | mr $a0,$acc0 # copy S
|
---|
1003 | mr $a1,$acc1
|
---|
1004 | mr $a2,$acc2
|
---|
1005 | mr $a3,$acc3
|
---|
1006 | addi $bp,$sp,$M
|
---|
1007 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M);
|
---|
1008 |
|
---|
1009 | addi $bp,$rp_real,32
|
---|
1010 | addi $rp,$rp_real,32
|
---|
1011 | bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y);
|
---|
1012 |
|
---|
1013 | mtlr r0
|
---|
1014 | ld r20,$FRAME-8*12($sp)
|
---|
1015 | ld r21,$FRAME-8*11($sp)
|
---|
1016 | ld r22,$FRAME-8*10($sp)
|
---|
1017 | ld r23,$FRAME-8*9($sp)
|
---|
1018 | ld r24,$FRAME-8*8($sp)
|
---|
1019 | ld r25,$FRAME-8*7($sp)
|
---|
1020 | ld r26,$FRAME-8*6($sp)
|
---|
1021 | ld r27,$FRAME-8*5($sp)
|
---|
1022 | ld r28,$FRAME-8*4($sp)
|
---|
1023 | ld r29,$FRAME-8*3($sp)
|
---|
1024 | ld r30,$FRAME-8*2($sp)
|
---|
1025 | ld r31,$FRAME-8*1($sp)
|
---|
1026 | addi $sp,$sp,$FRAME
|
---|
1027 | blr
|
---|
1028 | .long 0
|
---|
1029 | .byte 0,12,4,0,0x80,12,2,0
|
---|
1030 | .long 0
|
---|
1031 | .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
|
---|
1032 | ___
|
---|
1033 | }
|
---|
1034 |
|
---|
1035 | ########################################################################
|
---|
1036 | # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
|
---|
1037 | # const P256_POINT *in2);
|
---|
1038 | if (1) {
|
---|
1039 | my $FRAME = 64 + 32*12 + 16*8;
|
---|
1040 | my ($res_x,$res_y,$res_z,
|
---|
1041 | $H,$Hsqr,$R,$Rsqr,$Hcub,
|
---|
1042 | $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
|
---|
1043 | my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
|
---|
1044 | # above map() describes stack layout with 12 temporary
|
---|
1045 | # 256-bit vectors on top.
|
---|
1046 | my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
|
---|
1047 |
|
---|
1048 | $code.=<<___;
|
---|
1049 | .globl ecp_nistz256_point_add
|
---|
1050 | .align 5
|
---|
1051 | ecp_nistz256_point_add:
|
---|
1052 | stdu $sp,-$FRAME($sp)
|
---|
1053 | mflr r0
|
---|
1054 | std r16,$FRAME-8*16($sp)
|
---|
1055 | std r17,$FRAME-8*15($sp)
|
---|
1056 | std r18,$FRAME-8*14($sp)
|
---|
1057 | std r19,$FRAME-8*13($sp)
|
---|
1058 | std r20,$FRAME-8*12($sp)
|
---|
1059 | std r21,$FRAME-8*11($sp)
|
---|
1060 | std r22,$FRAME-8*10($sp)
|
---|
1061 | std r23,$FRAME-8*9($sp)
|
---|
1062 | std r24,$FRAME-8*8($sp)
|
---|
1063 | std r25,$FRAME-8*7($sp)
|
---|
1064 | std r26,$FRAME-8*6($sp)
|
---|
1065 | std r27,$FRAME-8*5($sp)
|
---|
1066 | std r28,$FRAME-8*4($sp)
|
---|
1067 | std r29,$FRAME-8*3($sp)
|
---|
1068 | std r30,$FRAME-8*2($sp)
|
---|
1069 | std r31,$FRAME-8*1($sp)
|
---|
1070 |
|
---|
1071 | li $poly1,-1
|
---|
1072 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
1073 | li $poly3,1
|
---|
1074 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
1075 |
|
---|
1076 | ld $a0,64($bp) # in2_z
|
---|
1077 | ld $a1,72($bp)
|
---|
1078 | ld $a2,80($bp)
|
---|
1079 | ld $a3,88($bp)
|
---|
1080 | mr $rp_real,$rp
|
---|
1081 | mr $ap_real,$ap
|
---|
1082 | mr $bp_real,$bp
|
---|
1083 | or $t0,$a0,$a1
|
---|
1084 | or $t2,$a2,$a3
|
---|
1085 | or $in2infty,$t0,$t2
|
---|
1086 | neg $t0,$in2infty
|
---|
1087 | or $in2infty,$in2infty,$t0
|
---|
1088 | sradi $in2infty,$in2infty,63 # !in2infty
|
---|
1089 | addi $rp,$sp,$Z2sqr
|
---|
1090 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z);
|
---|
1091 |
|
---|
1092 | ld $a0,64($ap_real) # in1_z
|
---|
1093 | ld $a1,72($ap_real)
|
---|
1094 | ld $a2,80($ap_real)
|
---|
1095 | ld $a3,88($ap_real)
|
---|
1096 | or $t0,$a0,$a1
|
---|
1097 | or $t2,$a2,$a3
|
---|
1098 | or $in1infty,$t0,$t2
|
---|
1099 | neg $t0,$in1infty
|
---|
1100 | or $in1infty,$in1infty,$t0
|
---|
1101 | sradi $in1infty,$in1infty,63 # !in1infty
|
---|
1102 | addi $rp,$sp,$Z1sqr
|
---|
1103 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
|
---|
1104 |
|
---|
1105 | ld $bi,64($bp_real)
|
---|
1106 | ld $a0,$Z2sqr+0($sp)
|
---|
1107 | ld $a1,$Z2sqr+8($sp)
|
---|
1108 | ld $a2,$Z2sqr+16($sp)
|
---|
1109 | ld $a3,$Z2sqr+24($sp)
|
---|
1110 | addi $bp,$bp_real,64
|
---|
1111 | addi $rp,$sp,$S1
|
---|
1112 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z);
|
---|
1113 |
|
---|
1114 | ld $bi,64($ap_real)
|
---|
1115 | ld $a0,$Z1sqr+0($sp)
|
---|
1116 | ld $a1,$Z1sqr+8($sp)
|
---|
1117 | ld $a2,$Z1sqr+16($sp)
|
---|
1118 | ld $a3,$Z1sqr+24($sp)
|
---|
1119 | addi $bp,$ap_real,64
|
---|
1120 | addi $rp,$sp,$S2
|
---|
1121 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
|
---|
1122 |
|
---|
1123 | ld $bi,32($ap_real)
|
---|
1124 | ld $a0,$S1+0($sp)
|
---|
1125 | ld $a1,$S1+8($sp)
|
---|
1126 | ld $a2,$S1+16($sp)
|
---|
1127 | ld $a3,$S1+24($sp)
|
---|
1128 | addi $bp,$ap_real,32
|
---|
1129 | addi $rp,$sp,$S1
|
---|
1130 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y);
|
---|
1131 |
|
---|
1132 | ld $bi,32($bp_real)
|
---|
1133 | ld $a0,$S2+0($sp)
|
---|
1134 | ld $a1,$S2+8($sp)
|
---|
1135 | ld $a2,$S2+16($sp)
|
---|
1136 | ld $a3,$S2+24($sp)
|
---|
1137 | addi $bp,$bp_real,32
|
---|
1138 | addi $rp,$sp,$S2
|
---|
1139 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
|
---|
1140 |
|
---|
1141 | addi $bp,$sp,$S1
|
---|
1142 | ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont
|
---|
1143 | ld $a0,0($ap_real)
|
---|
1144 | ld $a1,8($ap_real)
|
---|
1145 | ld $a2,16($ap_real)
|
---|
1146 | ld $a3,24($ap_real)
|
---|
1147 | addi $rp,$sp,$R
|
---|
1148 | bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1);
|
---|
1149 |
|
---|
1150 | or $acc0,$acc0,$acc1 # see if result is zero
|
---|
1151 | or $acc2,$acc2,$acc3
|
---|
1152 | or $temp,$acc0,$acc2
|
---|
1153 |
|
---|
1154 | addi $bp,$sp,$Z2sqr
|
---|
1155 | addi $rp,$sp,$U1
|
---|
1156 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr);
|
---|
1157 |
|
---|
1158 | ld $bi,$Z1sqr($sp)
|
---|
1159 | ld $a0,0($bp_real)
|
---|
1160 | ld $a1,8($bp_real)
|
---|
1161 | ld $a2,16($bp_real)
|
---|
1162 | ld $a3,24($bp_real)
|
---|
1163 | addi $bp,$sp,$Z1sqr
|
---|
1164 | addi $rp,$sp,$U2
|
---|
1165 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr);
|
---|
1166 |
|
---|
1167 | addi $bp,$sp,$U1
|
---|
1168 | ld $a0,$R+0($sp) # forward load for p256_sqr_mont
|
---|
1169 | ld $a1,$R+8($sp)
|
---|
1170 | ld $a2,$R+16($sp)
|
---|
1171 | ld $a3,$R+24($sp)
|
---|
1172 | addi $rp,$sp,$H
|
---|
1173 | bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1);
|
---|
1174 |
|
---|
1175 | or $acc0,$acc0,$acc1 # see if result is zero
|
---|
1176 | or $acc2,$acc2,$acc3
|
---|
1177 | or. $acc0,$acc0,$acc2
|
---|
1178 | bne .Ladd_proceed # is_equal(U1,U2)?
|
---|
1179 |
|
---|
1180 | and. $t0,$in1infty,$in2infty
|
---|
1181 | beq .Ladd_proceed # (in1infty || in2infty)?
|
---|
1182 |
|
---|
1183 | cmpldi $temp,0
|
---|
1184 | beq .Ladd_double # is_equal(S1,S2)?
|
---|
1185 |
|
---|
1186 | xor $a0,$a0,$a0
|
---|
1187 | std $a0,0($rp_real)
|
---|
1188 | std $a0,8($rp_real)
|
---|
1189 | std $a0,16($rp_real)
|
---|
1190 | std $a0,24($rp_real)
|
---|
1191 | std $a0,32($rp_real)
|
---|
1192 | std $a0,40($rp_real)
|
---|
1193 | std $a0,48($rp_real)
|
---|
1194 | std $a0,56($rp_real)
|
---|
1195 | std $a0,64($rp_real)
|
---|
1196 | std $a0,72($rp_real)
|
---|
1197 | std $a0,80($rp_real)
|
---|
1198 | std $a0,88($rp_real)
|
---|
1199 | b .Ladd_done
|
---|
1200 |
|
---|
1201 | .align 4
|
---|
1202 | .Ladd_double:
|
---|
1203 | ld $bp,0($sp) # back-link
|
---|
1204 | mr $ap,$ap_real
|
---|
1205 | mr $rp,$rp_real
|
---|
1206 | ld r16,$FRAME-8*16($sp)
|
---|
1207 | ld r17,$FRAME-8*15($sp)
|
---|
1208 | ld r18,$FRAME-8*14($sp)
|
---|
1209 | ld r19,$FRAME-8*13($sp)
|
---|
1210 | stdu $bp,$FRAME-288($sp) # difference in stack frame sizes
|
---|
1211 | b .Ldouble_shortcut
|
---|
1212 |
|
---|
1213 | .align 4
|
---|
1214 | .Ladd_proceed:
|
---|
1215 | addi $rp,$sp,$Rsqr
|
---|
1216 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
|
---|
1217 |
|
---|
1218 | ld $bi,64($ap_real)
|
---|
1219 | ld $a0,$H+0($sp)
|
---|
1220 | ld $a1,$H+8($sp)
|
---|
1221 | ld $a2,$H+16($sp)
|
---|
1222 | ld $a3,$H+24($sp)
|
---|
1223 | addi $bp,$ap_real,64
|
---|
1224 | addi $rp,$sp,$res_z
|
---|
1225 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
|
---|
1226 |
|
---|
1227 | ld $a0,$H+0($sp)
|
---|
1228 | ld $a1,$H+8($sp)
|
---|
1229 | ld $a2,$H+16($sp)
|
---|
1230 | ld $a3,$H+24($sp)
|
---|
1231 | addi $rp,$sp,$Hsqr
|
---|
1232 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
|
---|
1233 |
|
---|
1234 | ld $bi,64($bp_real)
|
---|
1235 | ld $a0,$res_z+0($sp)
|
---|
1236 | ld $a1,$res_z+8($sp)
|
---|
1237 | ld $a2,$res_z+16($sp)
|
---|
1238 | ld $a3,$res_z+24($sp)
|
---|
1239 | addi $bp,$bp_real,64
|
---|
1240 | addi $rp,$sp,$res_z
|
---|
1241 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z);
|
---|
1242 |
|
---|
1243 | ld $bi,$H($sp)
|
---|
1244 | ld $a0,$Hsqr+0($sp)
|
---|
1245 | ld $a1,$Hsqr+8($sp)
|
---|
1246 | ld $a2,$Hsqr+16($sp)
|
---|
1247 | ld $a3,$Hsqr+24($sp)
|
---|
1248 | addi $bp,$sp,$H
|
---|
1249 | addi $rp,$sp,$Hcub
|
---|
1250 | bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
|
---|
1251 |
|
---|
1252 | ld $bi,$Hsqr($sp)
|
---|
1253 | ld $a0,$U1+0($sp)
|
---|
1254 | ld $a1,$U1+8($sp)
|
---|
1255 | ld $a2,$U1+16($sp)
|
---|
1256 | ld $a3,$U1+24($sp)
|
---|
1257 | addi $bp,$sp,$Hsqr
|
---|
1258 | addi $rp,$sp,$U2
|
---|
1259 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr);
|
---|
1260 |
|
---|
1261 | mr $t0,$acc0
|
---|
1262 | mr $t1,$acc1
|
---|
1263 | mr $t2,$acc2
|
---|
1264 | mr $t3,$acc3
|
---|
1265 | addi $rp,$sp,$Hsqr
|
---|
1266 | bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
|
---|
1267 |
|
---|
1268 | addi $bp,$sp,$Rsqr
|
---|
1269 | addi $rp,$sp,$res_x
|
---|
1270 | bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
|
---|
1271 |
|
---|
1272 | addi $bp,$sp,$Hcub
|
---|
1273 | bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
|
---|
1274 |
|
---|
1275 | addi $bp,$sp,$U2
|
---|
1276 | ld $bi,$Hcub($sp) # forward load for p256_mul_mont
|
---|
1277 | ld $a0,$S1+0($sp)
|
---|
1278 | ld $a1,$S1+8($sp)
|
---|
1279 | ld $a2,$S1+16($sp)
|
---|
1280 | ld $a3,$S1+24($sp)
|
---|
1281 | addi $rp,$sp,$res_y
|
---|
1282 | bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
|
---|
1283 |
|
---|
1284 | addi $bp,$sp,$Hcub
|
---|
1285 | addi $rp,$sp,$S2
|
---|
1286 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub);
|
---|
1287 |
|
---|
1288 | ld $bi,$R($sp)
|
---|
1289 | ld $a0,$res_y+0($sp)
|
---|
1290 | ld $a1,$res_y+8($sp)
|
---|
1291 | ld $a2,$res_y+16($sp)
|
---|
1292 | ld $a3,$res_y+24($sp)
|
---|
1293 | addi $bp,$sp,$R
|
---|
1294 | addi $rp,$sp,$res_y
|
---|
1295 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
|
---|
1296 |
|
---|
1297 | addi $bp,$sp,$S2
|
---|
1298 | bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
|
---|
1299 |
|
---|
1300 | ld $t0,0($bp_real) # in2
|
---|
1301 | ld $t1,8($bp_real)
|
---|
1302 | ld $t2,16($bp_real)
|
---|
1303 | ld $t3,24($bp_real)
|
---|
1304 | ld $a0,$res_x+0($sp) # res
|
---|
1305 | ld $a1,$res_x+8($sp)
|
---|
1306 | ld $a2,$res_x+16($sp)
|
---|
1307 | ld $a3,$res_x+24($sp)
|
---|
1308 | ___
|
---|
1309 | for($i=0;$i<64;$i+=32) { # conditional moves
|
---|
1310 | $code.=<<___;
|
---|
1311 | ld $acc0,$i+0($ap_real) # in1
|
---|
1312 | ld $acc1,$i+8($ap_real)
|
---|
1313 | ld $acc2,$i+16($ap_real)
|
---|
1314 | ld $acc3,$i+24($ap_real)
|
---|
1315 | andc $t0,$t0,$in1infty
|
---|
1316 | andc $t1,$t1,$in1infty
|
---|
1317 | andc $t2,$t2,$in1infty
|
---|
1318 | andc $t3,$t3,$in1infty
|
---|
1319 | and $a0,$a0,$in1infty
|
---|
1320 | and $a1,$a1,$in1infty
|
---|
1321 | and $a2,$a2,$in1infty
|
---|
1322 | and $a3,$a3,$in1infty
|
---|
1323 | or $t0,$t0,$a0
|
---|
1324 | or $t1,$t1,$a1
|
---|
1325 | or $t2,$t2,$a2
|
---|
1326 | or $t3,$t3,$a3
|
---|
1327 | andc $acc0,$acc0,$in2infty
|
---|
1328 | andc $acc1,$acc1,$in2infty
|
---|
1329 | andc $acc2,$acc2,$in2infty
|
---|
1330 | andc $acc3,$acc3,$in2infty
|
---|
1331 | and $t0,$t0,$in2infty
|
---|
1332 | and $t1,$t1,$in2infty
|
---|
1333 | and $t2,$t2,$in2infty
|
---|
1334 | and $t3,$t3,$in2infty
|
---|
1335 | or $acc0,$acc0,$t0
|
---|
1336 | or $acc1,$acc1,$t1
|
---|
1337 | or $acc2,$acc2,$t2
|
---|
1338 | or $acc3,$acc3,$t3
|
---|
1339 |
|
---|
1340 | ld $t0,$i+32($bp_real) # in2
|
---|
1341 | ld $t1,$i+40($bp_real)
|
---|
1342 | ld $t2,$i+48($bp_real)
|
---|
1343 | ld $t3,$i+56($bp_real)
|
---|
1344 | ld $a0,$res_x+$i+32($sp)
|
---|
1345 | ld $a1,$res_x+$i+40($sp)
|
---|
1346 | ld $a2,$res_x+$i+48($sp)
|
---|
1347 | ld $a3,$res_x+$i+56($sp)
|
---|
1348 | std $acc0,$i+0($rp_real)
|
---|
1349 | std $acc1,$i+8($rp_real)
|
---|
1350 | std $acc2,$i+16($rp_real)
|
---|
1351 | std $acc3,$i+24($rp_real)
|
---|
1352 | ___
|
---|
1353 | }
|
---|
1354 | $code.=<<___;
|
---|
1355 | ld $acc0,$i+0($ap_real) # in1
|
---|
1356 | ld $acc1,$i+8($ap_real)
|
---|
1357 | ld $acc2,$i+16($ap_real)
|
---|
1358 | ld $acc3,$i+24($ap_real)
|
---|
1359 | andc $t0,$t0,$in1infty
|
---|
1360 | andc $t1,$t1,$in1infty
|
---|
1361 | andc $t2,$t2,$in1infty
|
---|
1362 | andc $t3,$t3,$in1infty
|
---|
1363 | and $a0,$a0,$in1infty
|
---|
1364 | and $a1,$a1,$in1infty
|
---|
1365 | and $a2,$a2,$in1infty
|
---|
1366 | and $a3,$a3,$in1infty
|
---|
1367 | or $t0,$t0,$a0
|
---|
1368 | or $t1,$t1,$a1
|
---|
1369 | or $t2,$t2,$a2
|
---|
1370 | or $t3,$t3,$a3
|
---|
1371 | andc $acc0,$acc0,$in2infty
|
---|
1372 | andc $acc1,$acc1,$in2infty
|
---|
1373 | andc $acc2,$acc2,$in2infty
|
---|
1374 | andc $acc3,$acc3,$in2infty
|
---|
1375 | and $t0,$t0,$in2infty
|
---|
1376 | and $t1,$t1,$in2infty
|
---|
1377 | and $t2,$t2,$in2infty
|
---|
1378 | and $t3,$t3,$in2infty
|
---|
1379 | or $acc0,$acc0,$t0
|
---|
1380 | or $acc1,$acc1,$t1
|
---|
1381 | or $acc2,$acc2,$t2
|
---|
1382 | or $acc3,$acc3,$t3
|
---|
1383 | std $acc0,$i+0($rp_real)
|
---|
1384 | std $acc1,$i+8($rp_real)
|
---|
1385 | std $acc2,$i+16($rp_real)
|
---|
1386 | std $acc3,$i+24($rp_real)
|
---|
1387 |
|
---|
1388 | .Ladd_done:
|
---|
1389 | mtlr r0
|
---|
1390 | ld r16,$FRAME-8*16($sp)
|
---|
1391 | ld r17,$FRAME-8*15($sp)
|
---|
1392 | ld r18,$FRAME-8*14($sp)
|
---|
1393 | ld r19,$FRAME-8*13($sp)
|
---|
1394 | ld r20,$FRAME-8*12($sp)
|
---|
1395 | ld r21,$FRAME-8*11($sp)
|
---|
1396 | ld r22,$FRAME-8*10($sp)
|
---|
1397 | ld r23,$FRAME-8*9($sp)
|
---|
1398 | ld r24,$FRAME-8*8($sp)
|
---|
1399 | ld r25,$FRAME-8*7($sp)
|
---|
1400 | ld r26,$FRAME-8*6($sp)
|
---|
1401 | ld r27,$FRAME-8*5($sp)
|
---|
1402 | ld r28,$FRAME-8*4($sp)
|
---|
1403 | ld r29,$FRAME-8*3($sp)
|
---|
1404 | ld r30,$FRAME-8*2($sp)
|
---|
1405 | ld r31,$FRAME-8*1($sp)
|
---|
1406 | addi $sp,$sp,$FRAME
|
---|
1407 | blr
|
---|
1408 | .long 0
|
---|
1409 | .byte 0,12,4,0,0x80,16,3,0
|
---|
1410 | .long 0
|
---|
1411 | .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
|
---|
1412 | ___
|
---|
1413 | }
|
---|
1414 |
|
---|
1415 | ########################################################################
|
---|
1416 | # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
|
---|
1417 | # const P256_POINT_AFFINE *in2);
|
---|
1418 | if (1) {
|
---|
1419 | my $FRAME = 64 + 32*10 + 16*8;
|
---|
1420 | my ($res_x,$res_y,$res_z,
|
---|
1421 | $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
|
---|
1422 | my $Z1sqr = $S2;
|
---|
1423 | # above map() describes stack layout with 10 temporary
|
---|
1424 | # 256-bit vectors on top.
|
---|
1425 | my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
|
---|
1426 |
|
---|
1427 | $code.=<<___;
|
---|
1428 | .globl ecp_nistz256_point_add_affine
|
---|
1429 | .align 5
|
---|
1430 | ecp_nistz256_point_add_affine:
|
---|
1431 | stdu $sp,-$FRAME($sp)
|
---|
1432 | mflr r0
|
---|
1433 | std r16,$FRAME-8*16($sp)
|
---|
1434 | std r17,$FRAME-8*15($sp)
|
---|
1435 | std r18,$FRAME-8*14($sp)
|
---|
1436 | std r19,$FRAME-8*13($sp)
|
---|
1437 | std r20,$FRAME-8*12($sp)
|
---|
1438 | std r21,$FRAME-8*11($sp)
|
---|
1439 | std r22,$FRAME-8*10($sp)
|
---|
1440 | std r23,$FRAME-8*9($sp)
|
---|
1441 | std r24,$FRAME-8*8($sp)
|
---|
1442 | std r25,$FRAME-8*7($sp)
|
---|
1443 | std r26,$FRAME-8*6($sp)
|
---|
1444 | std r27,$FRAME-8*5($sp)
|
---|
1445 | std r28,$FRAME-8*4($sp)
|
---|
1446 | std r29,$FRAME-8*3($sp)
|
---|
1447 | std r30,$FRAME-8*2($sp)
|
---|
1448 | std r31,$FRAME-8*1($sp)
|
---|
1449 |
|
---|
1450 | li $poly1,-1
|
---|
1451 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
1452 | li $poly3,1
|
---|
1453 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
1454 |
|
---|
1455 | mr $rp_real,$rp
|
---|
1456 | mr $ap_real,$ap
|
---|
1457 | mr $bp_real,$bp
|
---|
1458 |
|
---|
1459 | ld $a0,64($ap) # in1_z
|
---|
1460 | ld $a1,72($ap)
|
---|
1461 | ld $a2,80($ap)
|
---|
1462 | ld $a3,88($ap)
|
---|
1463 | or $t0,$a0,$a1
|
---|
1464 | or $t2,$a2,$a3
|
---|
1465 | or $in1infty,$t0,$t2
|
---|
1466 | neg $t0,$in1infty
|
---|
1467 | or $in1infty,$in1infty,$t0
|
---|
1468 | sradi $in1infty,$in1infty,63 # !in1infty
|
---|
1469 |
|
---|
1470 | ld $acc0,0($bp) # in2_x
|
---|
1471 | ld $acc1,8($bp)
|
---|
1472 | ld $acc2,16($bp)
|
---|
1473 | ld $acc3,24($bp)
|
---|
1474 | ld $t0,32($bp) # in2_y
|
---|
1475 | ld $t1,40($bp)
|
---|
1476 | ld $t2,48($bp)
|
---|
1477 | ld $t3,56($bp)
|
---|
1478 | or $acc0,$acc0,$acc1
|
---|
1479 | or $acc2,$acc2,$acc3
|
---|
1480 | or $acc0,$acc0,$acc2
|
---|
1481 | or $t0,$t0,$t1
|
---|
1482 | or $t2,$t2,$t3
|
---|
1483 | or $t0,$t0,$t2
|
---|
1484 | or $in2infty,$acc0,$t0
|
---|
1485 | neg $t0,$in2infty
|
---|
1486 | or $in2infty,$in2infty,$t0
|
---|
1487 | sradi $in2infty,$in2infty,63 # !in2infty
|
---|
1488 |
|
---|
1489 | addi $rp,$sp,$Z1sqr
|
---|
1490 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
|
---|
1491 |
|
---|
1492 | mr $a0,$acc0
|
---|
1493 | mr $a1,$acc1
|
---|
1494 | mr $a2,$acc2
|
---|
1495 | mr $a3,$acc3
|
---|
1496 | ld $bi,0($bp_real)
|
---|
1497 | addi $bp,$bp_real,0
|
---|
1498 | addi $rp,$sp,$U2
|
---|
1499 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x);
|
---|
1500 |
|
---|
1501 | addi $bp,$ap_real,0
|
---|
1502 | ld $bi,64($ap_real) # forward load for p256_mul_mont
|
---|
1503 | ld $a0,$Z1sqr+0($sp)
|
---|
1504 | ld $a1,$Z1sqr+8($sp)
|
---|
1505 | ld $a2,$Z1sqr+16($sp)
|
---|
1506 | ld $a3,$Z1sqr+24($sp)
|
---|
1507 | addi $rp,$sp,$H
|
---|
1508 | bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x);
|
---|
1509 |
|
---|
1510 | addi $bp,$ap_real,64
|
---|
1511 | addi $rp,$sp,$S2
|
---|
1512 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
|
---|
1513 |
|
---|
1514 | ld $bi,64($ap_real)
|
---|
1515 | ld $a0,$H+0($sp)
|
---|
1516 | ld $a1,$H+8($sp)
|
---|
1517 | ld $a2,$H+16($sp)
|
---|
1518 | ld $a3,$H+24($sp)
|
---|
1519 | addi $bp,$ap_real,64
|
---|
1520 | addi $rp,$sp,$res_z
|
---|
1521 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
|
---|
1522 |
|
---|
1523 | ld $bi,32($bp_real)
|
---|
1524 | ld $a0,$S2+0($sp)
|
---|
1525 | ld $a1,$S2+8($sp)
|
---|
1526 | ld $a2,$S2+16($sp)
|
---|
1527 | ld $a3,$S2+24($sp)
|
---|
1528 | addi $bp,$bp_real,32
|
---|
1529 | addi $rp,$sp,$S2
|
---|
1530 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
|
---|
1531 |
|
---|
1532 | addi $bp,$ap_real,32
|
---|
1533 | ld $a0,$H+0($sp) # forward load for p256_sqr_mont
|
---|
1534 | ld $a1,$H+8($sp)
|
---|
1535 | ld $a2,$H+16($sp)
|
---|
1536 | ld $a3,$H+24($sp)
|
---|
1537 | addi $rp,$sp,$R
|
---|
1538 | bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y);
|
---|
1539 |
|
---|
1540 | addi $rp,$sp,$Hsqr
|
---|
1541 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
|
---|
1542 |
|
---|
1543 | ld $a0,$R+0($sp)
|
---|
1544 | ld $a1,$R+8($sp)
|
---|
1545 | ld $a2,$R+16($sp)
|
---|
1546 | ld $a3,$R+24($sp)
|
---|
1547 | addi $rp,$sp,$Rsqr
|
---|
1548 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
|
---|
1549 |
|
---|
1550 | ld $bi,$H($sp)
|
---|
1551 | ld $a0,$Hsqr+0($sp)
|
---|
1552 | ld $a1,$Hsqr+8($sp)
|
---|
1553 | ld $a2,$Hsqr+16($sp)
|
---|
1554 | ld $a3,$Hsqr+24($sp)
|
---|
1555 | addi $bp,$sp,$H
|
---|
1556 | addi $rp,$sp,$Hcub
|
---|
1557 | bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
|
---|
1558 |
|
---|
1559 | ld $bi,0($ap_real)
|
---|
1560 | ld $a0,$Hsqr+0($sp)
|
---|
1561 | ld $a1,$Hsqr+8($sp)
|
---|
1562 | ld $a2,$Hsqr+16($sp)
|
---|
1563 | ld $a3,$Hsqr+24($sp)
|
---|
1564 | addi $bp,$ap_real,0
|
---|
1565 | addi $rp,$sp,$U2
|
---|
1566 | bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr);
|
---|
1567 |
|
---|
1568 | mr $t0,$acc0
|
---|
1569 | mr $t1,$acc1
|
---|
1570 | mr $t2,$acc2
|
---|
1571 | mr $t3,$acc3
|
---|
1572 | addi $rp,$sp,$Hsqr
|
---|
1573 | bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
|
---|
1574 |
|
---|
1575 | addi $bp,$sp,$Rsqr
|
---|
1576 | addi $rp,$sp,$res_x
|
---|
1577 | bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
|
---|
1578 |
|
---|
1579 | addi $bp,$sp,$Hcub
|
---|
1580 | bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
|
---|
1581 |
|
---|
1582 | addi $bp,$sp,$U2
|
---|
1583 | ld $bi,32($ap_real) # forward load for p256_mul_mont
|
---|
1584 | ld $a0,$Hcub+0($sp)
|
---|
1585 | ld $a1,$Hcub+8($sp)
|
---|
1586 | ld $a2,$Hcub+16($sp)
|
---|
1587 | ld $a3,$Hcub+24($sp)
|
---|
1588 | addi $rp,$sp,$res_y
|
---|
1589 | bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
|
---|
1590 |
|
---|
1591 | addi $bp,$ap_real,32
|
---|
1592 | addi $rp,$sp,$S2
|
---|
1593 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub);
|
---|
1594 |
|
---|
1595 | ld $bi,$R($sp)
|
---|
1596 | ld $a0,$res_y+0($sp)
|
---|
1597 | ld $a1,$res_y+8($sp)
|
---|
1598 | ld $a2,$res_y+16($sp)
|
---|
1599 | ld $a3,$res_y+24($sp)
|
---|
1600 | addi $bp,$sp,$R
|
---|
1601 | addi $rp,$sp,$res_y
|
---|
1602 | bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
|
---|
1603 |
|
---|
1604 | addi $bp,$sp,$S2
|
---|
1605 | bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
|
---|
1606 |
|
---|
1607 | ld $t0,0($bp_real) # in2
|
---|
1608 | ld $t1,8($bp_real)
|
---|
1609 | ld $t2,16($bp_real)
|
---|
1610 | ld $t3,24($bp_real)
|
---|
1611 | ld $a0,$res_x+0($sp) # res
|
---|
1612 | ld $a1,$res_x+8($sp)
|
---|
1613 | ld $a2,$res_x+16($sp)
|
---|
1614 | ld $a3,$res_x+24($sp)
|
---|
1615 | ___
|
---|
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
___
$code.=<<___	if ($i==0);
	ld	$t0,32($bp_real)	# in2
	ld	$t1,40($bp_real)
	ld	$t2,48($bp_real)
	ld	$t3,56($bp_real)
___
$code.=<<___	if ($i==32);
	li	$t0,1			# Lone_mont
	not	$t1,$poly1
	li	$t2,-1
	not	$t3,$poly3
___
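# The $i==32 pass substitutes the missing in2_z: an affine input point has an
# implicit Z coordinate of 1, whose Montgomery representation (2^256 mod P)
# has the limbs 0x1, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe,
# which is what li 1 / not $poly1 / li -1 / not $poly3 produce here.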
$code.=<<___;
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");

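# Both ord_* routines below work modulo the P-256 group order
# n = 0xffffffff00000000ffffffffbce6faada7179e84f3b9cac2fc632551, using
# word-by-word Montgomery reduction: $ord0..$ord3 receive the four 64-bit
# limbs of n, and $ordk receives 0xccd1c8aaee00bc4f = -n^-1 mod 2^64, the
# usual Montgomery n0' constant.  ecp_nistz256_ord_mul_mont(res,a,b)
# therefore returns a*b/2^256 mod n.
#
# A hedged reference sketch with Math::BigInt (illustrative only, never
# executed during the build; variable names are ad hoc):
if (0) {
	require Math::BigInt;
	my $n = Math::BigInt->from_hex("ffffffff00000000ffffffffbce6faada7179e84f3b9cac2fc632551");
	my $k = Math::BigInt->from_hex("ccd1c8aaee00bc4f");	# -n^-1 mod 2^64
	my $R = Math::BigInt->new(1)->blsft(256);		# Montgomery radix 2^256
	# sanity check on the magic constant: n*k + 1 == 0 (mod 2^64)
	die "bad n0'" unless $n->copy->bmul($k)->badd(1)
				->bmod(Math::BigInt->new(1)->blsft(64))->is_zero();
	# what the assembly computes: res = a*b*R^-1 mod n
	my ($x,$y) = map { Math::BigInt->new($_) } (123456789, 987654321);
	my $res    = $x->copy->bmul($y)->bmul($R->copy->bmodinv($n))->bmod($n);
}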
$code.=<<___;
########################################################################
# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.align	5
ecp_nistz256_ord_mul_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$acc4,$a3,$bi

	mulld	$t4,$acc0,$ordk

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	addze	$acc4,$acc4
	li	$acc5,0
___
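# Each iteration of the loop below interleaves the multiplication by the next
# word b[i] with one Montgomery reduction step: $t4 = acc0*$ordk mod 2^64 was
# just computed, acc += $t4*n clears the low word, and the accumulator slides
# down by one word.  Since n's two top limbs are 2^64-1 and 2^64-2^32, their
# products with $t4 are folded in with shifts and subtractions rather than
# multiplications (the "2^n*x - x" rewrite spelled out in the comment inside
# the loop).
#
# One reduction round in isolation, as a hedged Math::BigInt sketch
# (illustrative only, never executed during the build):
if (0) {
	require Math::BigInt;
	my $n    = Math::BigInt->from_hex("ffffffff00000000ffffffffbce6faada7179e84f3b9cac2fc632551");
	my $k    = Math::BigInt->from_hex("ccd1c8aaee00bc4f");		# -n^-1 mod 2^64
	my $mask = Math::BigInt->new(1)->blsft(64)->bsub(1);		# 2^64-1
	my $acc  = Math::BigInt->from_hex("f5" x 32);			# any accumulator value
	my $q    = $acc->copy->band($mask)->bmul($k)->band($mask);	# acc0*ordk mod 2^64
	$acc->badd($q->bmul($n));	# low limb becomes zero...
	$acc->brsft(64);		# ...and is shifted out
}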
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
	ld	$bi,8*$i($bp)		# b[i]

	sldi	$t0,$t4,32
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	mulld	$t0,$a0,$bi
	addze	$t3,$t3
	mulld	$t1,$a1,$bi

	addc	$acc0,$acc1,$t2
	mulld	$t2,$a2,$bi
	adde	$acc1,$acc2,$t3
	mulld	$t3,$a3,$bi
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	addc	$acc0,$acc0,$t0		# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi
	addze	$acc4,$acc4
	mulld	$t4,$acc0,$ordk
	addc	$acc1,$acc1,$t0		# accumulate high parts
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	addze	$acc5,$zr
___
}
$code.=<<___;
	sldi	$t0,$t4,32		# last reduction
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$acc4
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#                                int rep);
.globl	ecp_nistz256_ord_sqr_mont
.align	5
ecp_nistz256_ord_sqr_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	mtctr	$bp

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0
	b	.Loop_ord_sqr

.align	5
.Loop_ord_sqr:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	#  "can't overflow" below mark carrying into high part of
	#  multiplication result, which can't overflow, because it
	#  can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zr

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	mulld	$t4,$acc0,$ordk
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
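# At this point the full 512-bit square sits in $acc0..$acc7 (cross products
# doubled, squares added on the diagonal, per the diagram above).  The loop
# below runs the same per-word Montgomery reduction as in ord_mul_mont four
# times over the low half; the high half is then folded in and a final
# conditional subtraction of n brings the result into range.  The net effect
# of one pass of .Loop_ord_sqr is a single Montgomery squaring,
# x -> x*x/2^256 mod n, and the mtctr/bdnz pair repeats it "rep" times.
#
# Hedged reference sketch (illustrative only, never executed during the build):
if (0) {
	require Math::BigInt;
	my $n    = Math::BigInt->from_hex("ffffffff00000000ffffffffbce6faada7179e84f3b9cac2fc632551");
	my $Rinv = Math::BigInt->new(1)->blsft(256)->bmodinv($n);	# 2^-256 mod n
	my ($x,$rep) = (Math::BigInt->new(1234567), 3);
	$x = $x->copy->bmul($x)->bmul($Rinv)->bmod($n) for (1..$rep);
}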
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$zr,$t4		# can't overflow
___
$code.=<<___	if ($i<3);
	mulld	$t3,$acc0,$ordk
___
$code.=<<___;
	sldi	$t0,$t4,32
	subfc	$acc1,$t4,$acc1
	srdi	$t1,$t4,32
	subfe	$acc2,$t0,$acc2
	subfe	$acc3,$t1,$acc3		# can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	addze	$acc4,$zr

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$a0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$a1,$acc1,$t1
	adde	$a2,$acc2,$acc4
	adde	$a3,$acc3,$t3

	bdnz	.Loop_ord_sqr

	std	$a0,0($rp)
	std	$a1,8($rp)
	std	$a2,16($rp)
	std	$a3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
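# The table helpers below use a strided layout so that a constant-time gather
# touches every table slot at the same offsets: scatter_w5 writes a
# P256_POINT out as 32-bit words placed 64 bytes apart (the window entries
# end up interleaved), while scatter_w7 writes a P256_POINT_AFFINE byte by
# byte, again 64 bytes apart.  The gather routines reassemble the limbs and
# additionally mask the result with r0, which neg+sradi turn into all-ones
# for a non-zero index and all-zeros for index 0, so a zero index returns the
# all-zero element without a branch.
#
# The mask trick in plain perl (illustrative only, never executed during the
# build; assumes a 64-bit perl):
if (0) {
	my $idx = 7;
	my $msk = $idx ? 0xffffffffffffffff : 0;	# what sradi(neg(index),63) yields for a small non-negative index
}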
$code.=<<___;
########################################################################
# void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#                              int index);
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
	slwi	$index,$index,2
	add	$out,$out,$index

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#                             int index);
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#                              int index);
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
	stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#                             int index);
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	add	$inp,$inp,$index
	subi	$out,$out,8

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

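# Emit the generated code: any remaining `...` expression in $code is
# evaluated here (standard perlasm post-processing) and the result is printed
# line by line for ppc-xlate.pl to translate into the target syntax.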
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush