1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # ECP_NISTZ256 module for PPC64.
|
---|
18 | #
|
---|
19 | # August 2016.
|
---|
20 | #
|
---|
21 | # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
|
---|
22 | # http://eprint.iacr.org/2013/816.
|
---|
23 | #
|
---|
24 | # with/without -DECP_NISTZ256_ASM
|
---|
25 | # POWER7 +260-530%
|
---|
26 | # POWER8 +220-340%
|
---|
27 |
|
---|
28 | # $output is the last argument if it looks like a file (it has an extension)
|
---|
29 | # $flavour is the first argument if it doesn't look like a file
|
---|
# Command-line handling: the last argument is treated as the output
# file when it carries a file extension; the first argument is treated
# as the perlasm "flavour" when it does not look like a file name.
$output = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV) : undef;
$flavour = (@ARGV && $ARGV[0] !~ m|\.|) ? shift(@ARGV) : undef;

# Locate the PPC perlasm translator, first next to this script, then in
# the shared perlasm directory; give up if neither copy exists.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
$xlate = "${dir}ppc-xlate.pl";
if (!-f $xlate) {
    $xlate = "${dir}../../perlasm/ppc-xlate.pl";
    -f $xlate or die "can't locate ppc-xlate.pl";
}

# Everything printed below is piped through the translator, so the
# generated assembly comes out in the requested flavour.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
|
---|
41 |
|
---|
my $sp="r1";	# PPC stack pointer

{
# Register assignment from the map() below:
#   $rp=r3, $ap=r4, $bp=r5, $bi=r6 (current b[] word),
#   $acc0-$acc3=r7-r10, $poly1=r11, $poly3=r12,
#   $acc4/$acc5=r22/r23, $a0-$a3=r24-r27, $t0-$t3=r28-r31.
# $poly1/$poly3 hold the two non-trivial 64-bit words of the P-256
# modulus (set up by the li/srdi/li/orc sequences in the wrappers).
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
    $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
	map("r$_",(3..12,22..31));

# $bp/$bi are free once squaring starts, so their registers double as
# the two extra accumulator limbs there.
my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont

$code.=<<___;
.machine	"any"
.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# Changes vs. the original: three-argument open() on a lexical
# filehandle (the two-argument bareword form is a known Perl footgun),
# scalar element access $tbl[...] instead of the one-element slice
# @tbl[...], and lexical loop variables.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
my $TABLE;
open $TABLE, '<', "ecp_nistz256_table.c"		or
open $TABLE, '<', "${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

my @arr;
foreach (<$TABLE>) {
	# Each TOBN(hi,lo) contributes two 64-bit words, low word first.
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close $TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index or @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.type	ecp_nistz256_precomputed,\@object
.globl	ecp_nistz256_precomputed
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
for (1..37) {
	my @tbl = splice(@arr,0,64*16);
	for (my $i=0;$i<64;$i++) {
		my @line;
		for (my $j=0;$j<64;$j++) {
			# integer pragma is in effect, so $i/4 truncates.
			push @line,($tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
98 |
|
---|
# Assembler text for the exported entry points.  Each wrapper saves the
# non-volatile registers it clobbers on a fresh 128-byte frame, loads
# its operands, materializes the two non-trivial modulus words in
# $poly1/$poly3, and branches to the local __ecp_nistz256_* worker.
# The heredoc below ends in the middle of __ecp_nistz256_mul_mont; the
# Perl loop that follows emits its per-word iterations.
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.asciz	"ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"

# void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.align	5
ecp_nistz256_mul_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_mul_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

# void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.align	4
ecp_nistz256_sqr_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sqr_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

# void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.align	4
ecp_nistz256_add:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$t0,	0($bp)
	ld	$acc1,8($ap)
	ld	$t1,	8($bp)
	ld	$acc2,16($ap)
	ld	$t2,	16($bp)
	ld	$acc3,24($ap)
	ld	$t3,	24($bp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_add,.-ecp_nistz256_add

# void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.align	4
ecp_nistz256_div_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_div_by_2

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

# void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.align	4
ecp_nistz256_mul_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

# void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.align	4
ecp_nistz256_mul_by_3:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	std	$acc0,64($sp)
	mr	$t1,$acc1
	std	$acc1,72($sp)
	mr	$t2,$acc2
	std	$acc2,80($sp)
	mr	$t3,$acc3
	std	$acc3,88($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	ld	$t0,64($sp)
	ld	$t1,72($sp)
	ld	$t2,80($sp)
	ld	$t3,88($sp)

	bl	__ecp_nistz256_add	# ret += a	// 2*a+a=3*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

# void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.align	4
ecp_nistz256_sub:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

# void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.align	4
ecp_nistz256_neg:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	mr	$bp,$ap
	li	$acc0,0
	li	$acc1,0
	li	$acc2,0
	li	$acc3,0

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
# to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,\@function
.align	4
__ecp_nistz256_mul_mont:
	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
	ld	$bi,8($bp)		# b[1]

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	addze	$acc4,$t3
	li	$acc5,0
___
# Emit the three remaining iterations of the Montgomery multiplication:
# one reduction step folding acc[0] back in, then accumulation of
# a[]*b[$i].  The loop index only changes which b[] word is fetched.
for($i=1;$i<4;$i++) {
	################################################################
	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	# ffff0001.00000000.0000ffff.ffffffff
	# * abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------

$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	mulld	$t0,$a0,$bi		# lo(a[0]*b[i])
	mulld	$t1,$a1,$bi		# lo(a[1]*b[i])
	mulld	$t2,$a2,$bi		# lo(a[2]*b[i])
	mulld	$t3,$a3,$bi		# lo(a[3]*b[i])
	addc	$acc0,$acc0,$t0		# accumulate low parts of multiplication
	mulhdu	$t0,$a0,$bi		# hi(a[0]*b[i])
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi		# hi(a[1]*b[i])
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi		# hi(a[2]*b[i])
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi		# hi(a[3]*b[i])
	addze	$acc4,$acc4
___
$code.=<<___	if ($i<3);
	ld	$bi,8*($i+1)($bp)	# b[$i+1]
___
$code.=<<___;
	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	li	$acc5,0
	addze	$acc5,$acc5
___
}
# Final reduction and conditional modulus subtraction for mul_mont,
# followed by the first half of __ecp_nistz256_sqr_mont (cross
# products, doubling, and the squares on the diagonal).
$code.=<<___;
	# last reduction
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	li	$t2,0
	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
# to $a0-$a3
.type	__ecp_nistz256_sqr_mont,\@function
.align	4
__ecp_nistz256_sqr_mont:
	################################################################
	# | | | | | |a1*a0| |
	# | | | | |a2*a0| | |
	# | |a3*a2|a3*a0| | | |
	# | | | |a2*a1| | | |
	# | | |a3*a1| | | | |
	# *| | | | | | | | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	# |--+--+--+--+--+--+--+--|
	# |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# "can't overflow" below mark carrying into high part of
	# multiplication result, which can't overflow, because it
	# can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	adde	$acc4,$acc4,$t1
	adde	$acc5,$acc5,$t2
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	adde	$acc2,$acc2,$acc2
	adde	$acc3,$acc3,$acc3
	adde	$acc4,$acc4,$acc4
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	li	$acc7,0
	addze	$acc7,$acc7

	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	mulhdu	$a0,$a0,$a0
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	mulhdu	$a1,$a1,$a1
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	mulhdu	$a2,$a2,$a2
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	mulhdu	$a3,$a3,$a3
	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
# Emit three of the four reduction steps for the squaring path; each
# folds acc[0] back in using the same add/sub trick documented in the
# multiplication loop above.
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	sldi	$t0,$acc0,32
	adde	$acc1,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow
___
}
# Last reduction and conditional subtraction for sqr_mont, then the
# shared local helpers: modular add, two subtraction flavours (a-b and
# b-a) and division by 2.  All operate on the register conventions
# documented at the top of this scope.
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow

	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	li	$t2,0
	addze	$acc4,$t2

	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
# $a0-$a3 and $t0-$t3. This is done because it's used in multiple
# contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,\@function
.align	4
__ecp_nistz256_add:
	addc	$acc0,$acc0,$t0		# ret = a+b
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	li	$t2,0
	adde	$acc3,$acc3,$t3
	addze	$t0,$t2

	# if a+b >= modulus, subtract modulus
	#
	# But since comparison implies subtraction, we subtract
	# modulus and then add it back if subtraction borrowed.

	subic	$acc0,$acc0,-1
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$t0,$t2,$t0

	addc	$acc0,$acc0,$t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,\@function
.align	4
__ecp_nistz256_sub_from:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$t0,$acc0		# ret = a-b
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if a-b borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret -= modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,\@function
.align	4
__ecp_nistz256_sub_morf:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$acc0,$t0		# ret = b-a
	subfe	$acc1,$acc1,$t1
	subfe	$acc2,$acc2,$t2
	subfe	$acc3,$acc3,$t3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if b-a borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret -= modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,\@function
.align	4
__ecp_nistz256_div_by_2:
	andi.	$t0,$acc0,1
	addic	$acc0,$acc0,-1		# a += modulus
	neg	$t0,$t0
	adde	$acc1,$acc1,$poly1
	not	$t0,$t0
	addze	$acc2,$acc2
	li	$t2,0
	adde	$acc3,$acc3,$poly3
	and	$t1,$poly1,$t0
	addze	$ap,$t2			# ap = carry
	and	$t3,$poly3,$t0

	subfc	$acc0,$t0,$acc0		# a -= modulus if a was even
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$ap, $t2,$ap

	srdi	$acc0,$acc0,1
	sldi	$t0,$acc1,63
	srdi	$acc1,$acc1,1
	sldi	$t1,$acc2,63
	srdi	$acc2,$acc2,1
	sldi	$t2,$acc3,63
	srdi	$acc3,$acc3,1
	sldi	$t3,$ap,63
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void	ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
if (1) {
# Frame: 64 reserved bytes at the bottom (presumably the ABI linkage
# area -- confirm against ppc-xlate output), four 32-byte (256-bit)
# stack temporaries, and 12*8 bytes to save r20-r31 at the top.
my $FRAME=64+32*4+12*8;
# $S/$M/$Zsqr/$tmp0 are byte offsets (64,96,128,160) of the temporaries.
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
			# above map() describes stack layout with 4 temporary
			# 256-bit vectors on top.
# Non-volatile copies of the result/input pointers (r20/r21), preserved
# across the helper calls.
my ($rp_real,$ap_real) = map("r$_",(20,21));
|
---|
852 |
|
---|
853 | $code.=<<___;
|
---|
854 | .globl ecp_nistz256_point_double
|
---|
855 | .align 5
|
---|
856 | ecp_nistz256_point_double:
|
---|
857 | stdu $sp,-$FRAME($sp)
|
---|
858 | mflr r0
|
---|
859 | std r20,$FRAME-8*12($sp)
|
---|
860 | std r21,$FRAME-8*11($sp)
|
---|
861 | std r22,$FRAME-8*10($sp)
|
---|
862 | std r23,$FRAME-8*9($sp)
|
---|
863 | std r24,$FRAME-8*8($sp)
|
---|
864 | std r25,$FRAME-8*7($sp)
|
---|
865 | std r26,$FRAME-8*6($sp)
|
---|
866 | std r27,$FRAME-8*5($sp)
|
---|
867 | std r28,$FRAME-8*4($sp)
|
---|
868 | std r29,$FRAME-8*3($sp)
|
---|
869 | std r30,$FRAME-8*2($sp)
|
---|
870 | std r31,$FRAME-8*1($sp)
|
---|
871 |
|
---|
872 | li $poly1,-1
|
---|
873 | srdi $poly1,$poly1,32 # 0x00000000ffffffff
|
---|
874 | li $poly3,1
|
---|
875 | orc $poly3,$poly3,$poly1 # 0xffffffff00000001
|
---|
876 | .Ldouble_shortcut:
|
---|
877 | ld $acc0,32($ap)
|
---|
878 | ld $acc1,40($ap)
|
---|
879 | ld $acc2,48($ap)
|
---|
880 | ld $acc3,56($ap)
|
---|
881 | mr $t0,$acc0
|
---|
882 | mr $t1,$acc1
|
---|
883 | mr $t2,$acc2
|
---|
884 | mr $t3,$acc3
|
---|
885 | ld $a0,64($ap) # forward load for p256_sqr_mont
|
---|
886 | ld $a1,72($ap)
|
---|
887 | ld $a2,80($ap)
|
---|
888 | ld $a3,88($ap)
|
---|
889 | mr $rp_real,$rp
|
---|
890 | mr $ap_real,$ap
|
---|
891 | addi $rp,$sp,$S
|
---|
892 | bl __ecp_nistz256_add # p256_mul_by_2(S, in_y);
|
---|
893 |
|
---|
894 | addi $rp,$sp,$Zsqr
|
---|
895 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z);
|
---|
896 |
|
---|
897 | ld $t0,0($ap_real)
|
---|
898 | ld $t1,8($ap_real)
|
---|
899 | ld $t2,16($ap_real)
|
---|
900 | ld $t3,24($ap_real)
|
---|
901 | mr $a0,$acc0 # put Zsqr aside for p256_sub
|
---|
902 | mr $a1,$acc1
|
---|
903 | mr $a2,$acc2
|
---|
904 | mr $a3,$acc3
|
---|
905 | addi $rp,$sp,$M
|
---|
906 | bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x);
|
---|
907 |
|
---|
908 | addi $bp,$ap_real,0
|
---|
909 | mr $acc0,$a0 # restore Zsqr
|
---|
910 | mr $acc1,$a1
|
---|
911 | mr $acc2,$a2
|
---|
912 | mr $acc3,$a3
|
---|
913 | ld $a0,$S+0($sp) # forward load for p256_sqr_mont
|
---|
914 | ld $a1,$S+8($sp)
|
---|
915 | ld $a2,$S+16($sp)
|
---|
916 | ld $a3,$S+24($sp)
|
---|
917 | addi $rp,$sp,$Zsqr
|
---|
918 | bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr);
|
---|
919 |
|
---|
920 | addi $rp,$sp,$S
|
---|
921 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S);
|
---|
922 |
|
---|
923 | ld $bi,32($ap_real)
|
---|
924 | ld $a0,64($ap_real)
|
---|
925 | ld $a1,72($ap_real)
|
---|
926 | ld $a2,80($ap_real)
|
---|
927 | ld $a3,88($ap_real)
|
---|
928 | addi $bp,$ap_real,32
|
---|
929 | addi $rp,$sp,$tmp0
|
---|
930 | bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y);
|
---|
931 |
|
---|
932 | mr $t0,$acc0
|
---|
933 | mr $t1,$acc1
|
---|
934 | mr $t2,$acc2
|
---|
935 | mr $t3,$acc3
|
---|
936 | ld $a0,$S+0($sp) # forward load for p256_sqr_mont
|
---|
937 | ld $a1,$S+8($sp)
|
---|
938 | ld $a2,$S+16($sp)
|
---|
939 | ld $a3,$S+24($sp)
|
---|
940 | addi $rp,$rp_real,64
|
---|
941 | bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0);
|
---|
942 |
|
---|
943 | addi $rp,$sp,$tmp0
|
---|
944 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S);
|
---|
945 |
|
---|
946 | ld $bi,$Zsqr($sp) # forward load for p256_mul_mont
|
---|
947 | ld $a0,$M+0($sp)
|
---|
948 | ld $a1,$M+8($sp)
|
---|
949 | ld $a2,$M+16($sp)
|
---|
950 | ld $a3,$M+24($sp)
|
---|
951 | addi $rp,$rp_real,32
|
---|
952 | bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0);
|
---|
953 |
|
---|
954 | addi $bp,$sp,$Zsqr
|
---|
955 | addi $rp,$sp,$M
|
---|
956 | bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr);
|
---|
957 |
|
---|
958 | mr $t0,$acc0 # duplicate M
|
---|
959 | mr $t1,$acc1
|
---|
960 | mr $t2,$acc2
|
---|
961 | mr $t3,$acc3
|
---|
962 | mr $a0,$acc0 # put M aside
|
---|
963 | mr $a1,$acc1
|
---|
964 | mr $a2,$acc2
|
---|
965 | mr $a3,$acc3
|
---|
966 | addi $rp,$sp,$M
|
---|
967 | bl __ecp_nistz256_add
|
---|
968 | mr $t0,$a0 # restore M
|
---|
969 | mr $t1,$a1
|
---|
970 | mr $t2,$a2
|
---|
971 | mr $t3,$a3
|
---|
972 | ld $bi,0($ap_real) # forward load for p256_mul_mont
|
---|
973 | ld $a0,$S+0($sp)
|
---|
974 | ld $a1,$S+8($sp)
|
---|
975 | ld $a2,$S+16($sp)
|
---|
976 | ld $a3,$S+24($sp)
|
---|
977 | bl __ecp_nistz256_add # p256_mul_by_3(M, M);
|
---|
978 |
|
---|
979 | addi $bp,$ap_real,0
|
---|
980 | addi $rp,$sp,$S
|
---|
981 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x);
|
---|
982 |
|
---|
983 | mr $t0,$acc0
|
---|
984 | mr $t1,$acc1
|
---|
985 | mr $t2,$acc2
|
---|
986 | mr $t3,$acc3
|
---|
987 | ld $a0,$M+0($sp) # forward load for p256_sqr_mont
|
---|
988 | ld $a1,$M+8($sp)
|
---|
989 | ld $a2,$M+16($sp)
|
---|
990 | ld $a3,$M+24($sp)
|
---|
991 | addi $rp,$sp,$tmp0
|
---|
992 | bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S);
|
---|
993 |
|
---|
994 | addi $rp,$rp_real,0
|
---|
995 | bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M);
|
---|
996 |
|
---|
997 | addi $bp,$sp,$tmp0
|
---|
998 | bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0);
|
---|
999 |
|
---|
1000 | addi $bp,$sp,$S
|
---|
1001 | addi $rp,$sp,$S
|
---|
1002 | bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x);
|
---|
1003 |
|
---|
1004 | ld $bi,$M($sp)
|
---|
1005 | mr $a0,$acc0 # copy S
|
---|
1006 | mr $a1,$acc1
|
---|
1007 | mr $a2,$acc2
|
---|
1008 | mr $a3,$acc3
|
---|
1009 | addi $bp,$sp,$M
|
---|
1010 | bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M);
|
---|
1011 |
|
---|
1012 | addi $bp,$rp_real,32
|
---|
1013 | addi $rp,$rp_real,32
|
---|
1014 | bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y);
|
---|
1015 |
|
---|
1016 | mtlr r0
|
---|
1017 | ld r20,$FRAME-8*12($sp)
|
---|
1018 | ld r21,$FRAME-8*11($sp)
|
---|
1019 | ld r22,$FRAME-8*10($sp)
|
---|
1020 | ld r23,$FRAME-8*9($sp)
|
---|
1021 | ld r24,$FRAME-8*8($sp)
|
---|
1022 | ld r25,$FRAME-8*7($sp)
|
---|
1023 | ld r26,$FRAME-8*6($sp)
|
---|
1024 | ld r27,$FRAME-8*5($sp)
|
---|
1025 | ld r28,$FRAME-8*4($sp)
|
---|
1026 | ld r29,$FRAME-8*3($sp)
|
---|
1027 | ld r30,$FRAME-8*2($sp)
|
---|
1028 | ld r31,$FRAME-8*1($sp)
|
---|
1029 | addi $sp,$sp,$FRAME
|
---|
1030 | blr
|
---|
1031 | .long 0
|
---|
1032 | .byte 0,12,4,0,0x80,12,2,0
|
---|
1033 | .long 0
|
---|
1034 | .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
|
---|
1035 | ___
|
---|
1036 | }
|
---|

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#					const P256_POINT *in2);
if (1) {
my $FRAME = 64 + 32*12 + 16*8;
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add
.align	5
ecp_nistz256_point_add:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	ld	$a0,64($bp)		# in2_z
	ld	$a1,72($bp)
	ld	$a2,80($bp)
	ld	$a3,88($bp)
	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in2infty,$t0,$t2
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty
	addi	$rp,$sp,$Z2sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z2sqr, in2_z);

	ld	$a0,64($ap_real)	# in1_z
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty
	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	ld	$bi,64($bp_real)
	ld	$a0,$Z2sqr+0($sp)
	ld	$a1,$Z2sqr+8($sp)
	ld	$a2,$Z2sqr+16($sp)
	ld	$a3,$Z2sqr+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, Z2sqr, in2_z);

	ld	$bi,64($ap_real)
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,32($ap_real)
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, S1, in1_y);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$sp,$S1
	ld	$bi,$Z2sqr($sp)		# forward load for p256_mul_mont
	ld	$a0,0($ap_real)
	ld	$a1,8($ap_real)
	ld	$a2,16($ap_real)
	ld	$a3,24($ap_real)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, S1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or	$temp,$acc0,$acc2

	addi	$bp,$sp,$Z2sqr
	addi	$rp,$sp,$U1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U1, in1_x, Z2sqr);

	ld	$bi,$Z1sqr($sp)
	ld	$a0,0($bp_real)
	ld	$a1,8($bp_real)
	ld	$a2,16($bp_real)
	ld	$a3,24($bp_real)
	addi	$bp,$sp,$Z1sqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in2_x, Z1sqr);

	addi	$bp,$sp,$U1
	ld	$a0,$R+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, U1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or.	$acc0,$acc0,$acc2
	bne	.Ladd_proceed		# is_equal(U1,U2)?

	and.	$t0,$in1infty,$in2infty
	beq	.Ladd_proceed		# (in1infty || in2infty)?

	cmpldi	$temp,0
	beq	.Ladd_double		# is_equal(S1,S2)?

	xor	$a0,$a0,$a0
	std	$a0,0($rp_real)
	std	$a0,8($rp_real)
	std	$a0,16($rp_real)
	std	$a0,24($rp_real)
	std	$a0,32($rp_real)
	std	$a0,40($rp_real)
	std	$a0,48($rp_real)
	std	$a0,56($rp_real)
	std	$a0,64($rp_real)
	std	$a0,72($rp_real)
	std	$a0,80($rp_real)
	std	$a0,88($rp_real)
	b	.Ladd_done

.align	4
.Ladd_double:
	ld	$bp,0($sp)		# back-link
	mr	$ap,$ap_real
	mr	$rp,$rp_real
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	stdu	$bp,$FRAME-288($sp)	# difference in stack frame sizes
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$bi,64($bp_real)
	ld	$a0,$res_z+0($sp)
	ld	$a1,$res_z+8($sp)
	ld	$a2,$res_z+16($sp)
	ld	$a3,$res_z+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, res_z, in2_z);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,$Hsqr($sp)
	ld	$a0,$U1+0($sp)
	ld	$a1,$U1+8($sp)
	ld	$a2,$U1+16($sp)
	ld	$a3,$U1+24($sp)
	addi	$bp,$sp,$Hsqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, U1, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,$Hcub($sp)		# forward load for p256_mul_mont
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$sp,$Hcub
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S1, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	ld	$t0,$i+32($bp_real)	# in2
	ld	$t1,$i+40($bp_real)
	ld	$t2,$i+48($bp_real)
	ld	$t3,$i+56($bp_real)
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

.Ladd_done:
	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#					const P256_POINT_AFFINE *in2);
if (1) {
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.align	5
ecp_nistz256_point_add_affine:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp

	ld	$a0,64($ap)		# in1_z
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty

	ld	$acc0,0($bp)		# in2_x
	ld	$acc1,8($bp)
	ld	$acc2,16($bp)
	ld	$acc3,24($bp)
	ld	$t0,32($bp)		# in2_y
	ld	$t1,40($bp)
	ld	$t2,48($bp)
	ld	$t3,56($bp)
	or	$acc0,$acc0,$acc1
	or	$acc2,$acc2,$acc3
	or	$acc0,$acc0,$acc2
	or	$t0,$t0,$t1
	or	$t2,$t2,$t3
	or	$t0,$t0,$t2
	or	$in2infty,$acc0,$t0
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty

	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	mr	$a0,$acc0
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	ld	$bi,0($bp_real)
	addi	$bp,$bp_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, Z1sqr, in2_x);

	addi	$bp,$ap_real,0
	ld	$bi,64($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, in1_x);

	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$ap_real,32
	ld	$a0,$H+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, in1_y);

	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$a0,$R+0($sp)
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,0($ap_real)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$ap_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in1_x, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,32($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Hcub+0($sp)
	ld	$a1,$Hcub+8($sp)
	ld	$a2,$Hcub+16($sp)
	ld	$a3,$Hcub+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, in1_y, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
___
$code.=<<___	if ($i==0);
	ld	$t0,32($bp_real)	# in2
	ld	$t1,40($bp_real)
	ld	$t2,48($bp_real)
	ld	$t3,56($bp_real)
___
$code.=<<___	if ($i==32);
	li	$t0,1			# Lone_mont
	not	$t1,$poly1
	li	$t2,-1
	not	$t3,$poly3
___
$code.=<<___;
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
1732 | if (1) {
|
---|
1733 | my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
|
---|
1734 | my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");
|
---|
1735 |
|
---|
1736 | $code.=<<___;
|
---|
1737 | ########################################################################
|
---|
1738 | # void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
|
---|
1739 | # uint64_t b[4]);
|
---|
1740 | .globl ecp_nistz256_ord_mul_mont
|
---|
1741 | .align 5
|
---|
1742 | ecp_nistz256_ord_mul_mont:
|
---|
1743 | stdu $sp,-160($sp)
|
---|
1744 | std r18,48($sp)
|
---|
1745 | std r19,56($sp)
|
---|
1746 | std r20,64($sp)
|
---|
1747 | std r21,72($sp)
|
---|
1748 | std r22,80($sp)
|
---|
1749 | std r23,88($sp)
|
---|
1750 | std r24,96($sp)
|
---|
1751 | std r25,104($sp)
|
---|
1752 | std r26,112($sp)
|
---|
1753 | std r27,120($sp)
|
---|
1754 | std r28,128($sp)
|
---|
1755 | std r29,136($sp)
|
---|
1756 | std r30,144($sp)
|
---|
1757 | std r31,152($sp)
|
---|
1758 |
|
---|
1759 | ld $a0,0($ap)
|
---|
1760 | ld $bi,0($bp)
|
---|
1761 | ld $a1,8($ap)
|
---|
1762 | ld $a2,16($ap)
|
---|
1763 | ld $a3,24($ap)
|
---|
1764 |
|
---|
1765 | lis $ordk,0xccd1
|
---|
1766 | lis $ord0,0xf3b9
|
---|
1767 | lis $ord1,0xbce6
|
---|
1768 | ori $ordk,$ordk,0xc8aa
|
---|
1769 | ori $ord0,$ord0,0xcac2
|
---|
1770 | ori $ord1,$ord1,0xfaad
|
---|
1771 | sldi $ordk,$ordk,32
|
---|
1772 | sldi $ord0,$ord0,32
|
---|
1773 | sldi $ord1,$ord1,32
|
---|
1774 | oris $ordk,$ordk,0xee00
|
---|
1775 | oris $ord0,$ord0,0xfc63
|
---|
1776 | oris $ord1,$ord1,0xa717
|
---|
1777 | ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
|
---|
1778 | ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
|
---|
1779 | ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
|
---|
1780 | li $ord2,-1 # 0xffffffffffffffff
|
---|
1781 | sldi $ord3,$ord2,32 # 0xffffffff00000000
|
---|
1782 | li $zr,0
|
---|
1783 |
|
---|
1784 | mulld $acc0,$a0,$bi # a[0]*b[0]
|
---|
1785 | mulhdu $t0,$a0,$bi
|
---|
1786 |
|
---|
1787 | mulld $acc1,$a1,$bi # a[1]*b[0]
|
---|
1788 | mulhdu $t1,$a1,$bi
|
---|
1789 |
|
---|
1790 | mulld $acc2,$a2,$bi # a[2]*b[0]
|
---|
1791 | mulhdu $t2,$a2,$bi
|
---|
1792 |
|
---|
1793 | mulld $acc3,$a3,$bi # a[3]*b[0]
|
---|
1794 | mulhdu $acc4,$a3,$bi
|
---|
1795 |
|
---|
1796 | mulld $t4,$acc0,$ordk
|
---|
1797 |
|
---|
1798 | addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
|
---|
1799 | adde $acc2,$acc2,$t1
|
---|
1800 | adde $acc3,$acc3,$t2
|
---|
1801 | addze $acc4,$acc4
|
---|
1802 | li $acc5,0
|
---|
1803 | ___
|
---|
# Generate iterations 1..3 of the ecp_nistz256_ord_mul_mont word-by-word
# Montgomery multiplication: each pass folds in b[i] and performs one
# reduction step modulo the P-256 group order (see comment below for the
# 2^n*x-x trick used on the 0xffffffff00000000 word).
for ($i=1;$i<4;$i++) {
	################################################################
	# ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# * abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
	ld	$bi,8*$i($bp)		# b[i]

	sldi	$t0,$t4,32
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	mulld	$t0,$a0,$bi
	addze	$t3,$t3
	mulld	$t1,$a1,$bi

	addc	$acc0,$acc1,$t2
	mulld	$t2,$a2,$bi
	adde	$acc1,$acc2,$t3
	mulld	$t3,$a3,$bi
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	addc	$acc0,$acc0,$t0		# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi
	addze	$acc4,$acc4
	mulld	$t4,$acc0,$ordk
	addc	$acc1,$acc1,$t0		# accumulate high parts
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	addze	$acc5,$zr
___
}
# Tail of ecp_nistz256_ord_mul_mont (final reduction, conditional
# subtraction of the group order, store, register restore and return),
# followed by the prologue and per-iteration squaring body of
# ecp_nistz256_ord_sqr_mont.
$code.=<<___;
	sldi	$t0,$t4,32		# last reduction
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$acc4
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.align	5
ecp_nistz256_ord_sqr_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	mtctr	$bp

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0
	b	.Loop_ord_sqr

.align	5
.Loop_ord_sqr:
	################################################################
	# |  |  |  |  |  |a1*a0|  |
	# |  |  |  |  |a2*a0|  |  |
	# |  |a3*a2|a3*a0|  |  |  |
	# |  |  |  |a2*a1|  |  |  |
	# |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	# |--+--+--+--+--+--+--+--|
	# |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# "can't overflow" below mark carrying into high part of
	# multiplication result, which can't overflow, because it
	# can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zr

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	mulld	$t4,$acc0,$ordk
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
# Generate the four reduction steps of ecp_nistz256_ord_sqr_mont; $t3/$t4
# are swapped each pass so the next Montgomery factor (computed early, while
# there is still multiplier latency to hide) lands in $t4 for the next step.
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$zr,$t4		# can't overflow
___
$code.=<<___	if ($i<3);
	mulld	$t3,$acc0,$ordk
___
$code.=<<___;
	sldi	$t0,$t4,32
	subfc	$acc1,$t4,$acc1
	srdi	$t1,$t4,32
	subfe	$acc2,$t0,$acc2
	subfe	$acc3,$t1,$acc3		# can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
# Tail of ecp_nistz256_ord_sqr_mont: fold in the upper half of the square,
# conditionally subtract the group order, loop `rep` times (ctr was loaded
# from $bp in the prologue), then store the result and restore registers.
$code.=<<___;
	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	addze	$acc4,$zr

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$a0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$a1,$acc1,$t1
	adde	$a2,$acc2,$acc4
	adde	$a3,$acc3,$t3

	bdnz	.Loop_ord_sqr

	std	$a0,0($rp)
	std	$a1,8($rp)
	std	$a2,16($rp)
	std	$a3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }
2117 |
|
---|
########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
$code.=<<___;
########################################################################
# void	ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#				int index);
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
	slwi	$index,$index,2
	add	$out,$out,$index

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void	ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void	ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#				int index);
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
	stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	add	$inp,$inp,$index
	subi	$out,$out,8

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
2379 |
|
---|
# Emit the accumulated assembly: evaluate any `...` (backtick) expressions
# embedded in $code, print each line, and flush/close STDOUT (which was
# redirected through ppc-xlate.pl in the file header).
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush