VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.2/crypto/ec/asm/x25519-ppc64.pl@ 101021

Last change on this file since 101021 was 101021, checked in by vboxsync, 18 months ago

openssl-3.1.2: Applied and adjusted our OpenSSL changes to 3.1.0. bugref:10519

File size: 17.2 KB
Line 
#! /usr/bin/env perl
# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for PPC64.
#
# July 2018.
#
# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
# faster on PPC970/G5. POWER8 on the other hand seems to trip on own
# shoelaces when handling longer carry chains. As base 2^51 has just
# single-carry pairs, it's 25% faster than base 2^64. Since PPC970 is
# pretty old, base 2^64 implementation is not engaged. Comparison to
# compiler-generated code is complicated by the fact that not all
# compilers support 128-bit integers. When compiler doesn't, like xlc,
# this module delivers more than 2x improvement, and when it does,
# from 12% to 30% improvement was measured...
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm translator: first next to this script, then in the
# canonical OpenSSL tree layout. The "assign and -f test" chain short-circuits
# on the first path that actually exists.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything we print through ppc-xlate.pl, which translates the
# perlasm below into the $flavour-specific assembler dialect and writes
# it to $output. Aliasing STDOUT to the pipe lets the rest of the script
# use plain print.  (2-arg open with interpolation is the established
# perlasm driver convention across OpenSSL; kept as-is.)
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
43
# Symbolic names for the PPC64 registers used throughout: the stack
# pointer and the three pointer arguments (result, a-operand, b-operand).
my $sp = "r1";
my ($rp, $ap, $bp) = ("r3", "r4", "r5");
46
47####################################################### base 2^64
if (0) {
# Base 2^64 code path.  Deliberately disabled (see the performance notes
# in the file header): it is only faster on pre-POWER8 parts.  Field
# elements are four 64-bit little-endian limbs; reduction is modulo
# 2^255-19 using the constant 38 = 2*19 (since 2^256 == 38 mod 2^255-19).
#
# Register layout: $bi holds the current b[] word (later the constant 38),
# $a0..$a3 the a[] limbs, $t0..$t3 scratch, $acc0..$acc7 the 512-bit
# intermediate product, $zero a register kept at 0 for addze/adde idioms.
my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
    map("r$_",(6..12,22..31));
my $zero = "r0";
my $FRAME = 16*8;	# stack frame: room for the ten saved non-volatiles r22-r31

# x25519_fe64_mul(r, a, b): prologue saves non-volatiles, then row 0 of the
# schoolbook multiplication (a[0..3] * b[0]).
$code.=<<___;
.text

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function
.align	5
x25519_fe64_mul:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi
	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi
	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi
	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
___
# Rows 1..3 of the schoolbook multiplication: each iteration multiplies
# a[0..3] by b[$i] and accumulates into a window of @acc that slides by
# one limb per row (the shift(@acc) in the loop update).
for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
    my $i=1; $i<4; shift(@acc), $i++) {
my $acc4 = $i==1? $zero : @acc[4];	# row 1 has no prior limb above acc3

$code.=<<___;
	ld	$bi,`8*$i`($bp)
	addc	@acc[1],@acc[1],$t0	# accumulate high parts
	mulld	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulld	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulld	$t2,$a2,$bi
	adde	@acc[4],$acc4,$t3
	mulld	$t3,$a3,$bi
	addc	@acc[1],@acc[1],$t0	# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulhdu	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulhdu	$t2,$a2,$bi
	adde	@acc[4],@acc[4],$t3
	mulhdu	$t3,$a3,$bi
	adde	@acc[5],$zero,$zero
___
}
# Remainder of the section, all in one generated-text chunk:
#  - fe64_mul tail: fold acc4..acc7 down with 38, final carry handled via
#    the subfe/andc mask trick, store, restore non-volatiles, return;
#  - x25519_fe64_sqr: squaring with doubled cross-products (see the
#    in-assembly diagram), sharing the same 38-fold reduction;
#  - x25519_fe64_mul121666: scale by the curve constant 121666;
#  - x25519_fe64_add / x25519_fe64_sub: add/sub with 19,38 mask folds;
#  - x25519_fe64_tobytes: freeze mod 2^255-19 and serialize little-endian.
# NOTE(review): in x25519_fe64_mul121666 the final fold does
# "andc $t0,$t0,$t1" where $t0 holds a stale mulhdu result, unlike the
# sibling routines which mask the constant 38. Section is disabled
# (if (0)), so this is unexercised — verify against upstream before enabling.
$code.=<<___;
	li	$bi,38
	addc	$acc4,$acc4,$t0
	mulld	$t0,$acc4,$bi
	adde	$acc5,$acc5,$t1
	mulld	$t1,$acc5,$bi
	adde	$acc6,$acc6,$t2
	mulld	$t2,$acc6,$bi
	adde	$acc7,$acc7,$t3
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	adde	$acc4,$zero,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function
.align	5
x25519_fe64_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# "can't overflow" below mark carrying into high part of
	# multiplication result, which can't overflow, because it
	# can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zero

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	li	$bi,38
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3

	mulld	$t0,$acc4,$bi
	mulld	$t1,$acc5,$bi
	mulld	$t2,$acc6,$bi
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	addze	$acc4,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function
.align	5
x25519_fe64_mul121666:
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`

	ld	$t0,0($ap)
	ld	$t1,8($ap)
	ld	$bp,16($ap)
	ld	$ap,24($ap)

	mulld	$a0,$t0,$bi
	mulhdu	$t0,$t0,$bi
	mulld	$a1,$t1,$bi
	mulhdu	$t1,$t1,$bi
	mulld	$a2,$bp,$bi
	mulhdu	$bp,$bp,$bi
	mulld	$a3,$ap,$bi
	mulhdu	$ap,$ap,$bi

	addc	$a1,$a1,$t0
	adde	$a2,$a2,$t1
	adde	$a3,$a3,$bp
	addze	$ap, $ap

	mulli	$ap,$ap,38

	addc	$a0,$a0,$ap
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function
.align	5
x25519_fe64_add:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	addc	$a0,$a0,$t0
	adde	$a1,$a1,$t1
	adde	$a2,$a2,$bi
	adde	$a3,$a3,$bp

	li	$t0,38
	subfe	$t1,$t1,$t1		# carry -> ~mask
	andc	$t1,$t0,$t1

	addc	$a0,$a0,$t1
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function
.align	5
x25519_fe64_sub:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	subfc	$a0,$t0,$a0
	subfe	$a1,$t1,$a1
	subfe	$a2,$bi,$a2
	subfe	$a3,$bp,$a3

	li	$t0,38
	subfe	$t1,$t1,$t1		# borrow -> mask
	xor	$zero,$zero,$zero
	and	$t1,$t0,$t1

	subfc	$a0,$t1,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

	subfe	$t1,$t1,$t1		# borrow -> mask
	std	$a1,8($rp)
	and	$t0,$t0,$t1
	std	$a2,16($rp)
	subf	$a0,$t0,$a0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function
.align	5
x25519_fe64_tobytes:
	ld	$a3,24($ap)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)

	sradi	$t0,$a3,63		# most significant bit -> mask
	li	$t1,19
	and	$t0,$t0,$t1
	sldi	$a3,$a3,1
	add	$t0,$t0,$t1		# compare to modulus in the same go
	srdi	$a3,$a3,1		# most significant bit cleared

	addc	$a0,$a0,$t0
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	xor	$zero,$zero,$zero
	sradi	$t0,$a3,63		# most significant bit -> mask
	sldi	$a3,$a3,1
	andc	$t0,$t1,$t0
	srdi	$a3,$a3,1		# most significant bit cleared

	subi	$rp,$rp,1
	subfc	$a0,$t0,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

___
# Serialize the four frozen limbs little-endian, one byte at a time via
# the pre-decremented $rp and stbu's update addressing.
for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
$code.=<<___;
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	stbu	$t0,1($rp)
___
}
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
}
504####################################################### base 2^51
{
# Base 2^51 code path — the one actually emitted.  Field elements are
# five 51-bit limbs, each held in a 64-bit word; products accumulate in
# five 128-bit lo/hi register pairs $h0lo/$h0hi .. $h4lo/$h4hi, which
# keeps every carry a single adde pair (see the file-header rationale).
my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
    map("r$_",(6..12,21..31));
my $mask = "r0";
my $FRAME = 18*8;	# stack frame: room for the eleven saved non-volatiles r21-r31

# x25519_fe51_mul(r, a, b): prologue, then column 0 (a[] * b[0]).  The
# next b[] word is prefetched into $ap, and a[4]*19 is precomputed for the
# wrap-around products of the following column.
$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function
.align	5
x25519_fe51_mul:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*b[0]
	mulhdu	$h0hi,$a0,$bi

	mulld	$h1lo,$a1,$bi		# a[1]*b[0]
	mulhdu	$h1hi,$a1,$bi

	mulld	$h4lo,$a4,$bi		# a[4]*b[0]
	mulhdu	$h4hi,$a4,$bi
	ld	$ap,8($bp)
	mulli	$a4,$a4,19

	mulld	$h2lo,$a2,$bi		# a[2]*b[0]
	mulhdu	$h2hi,$a2,$bi

	mulld	$h3lo,$a3,$bi		# a[3]*b[0]
	mulhdu	$h3hi,$a3,$bi
___
# Columns 1..3.  @a is rotated one position per column (unshift/pop), so
# @a[4] is always the *19-scaled limb that wraps below h0; the next b[]
# word is prefetched and the next wrap limb pre-scaled mid-column.  $ap
# and $bi swap roles each column: one holds the prefetched b[] word, the
# other is reloaded.
for(my @a=($a0,$a1,$a2,$a3,$a4),
    my $i=1; $i<4; $i++) {
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,@a[4],$bi
	mulhdu	$t1,@a[4],$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,@a[0],$bi
	mulhdu	$t1,@a[0],$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,@a[3],$bi
	mulhdu	$t1,@a[3],$bi
	ld	$ap,`8*($i+1)`($bp)
	mulli	@a[3],@a[3],19
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

	mulld	$t0,@a[1],$bi
	mulhdu	$t1,@a[1],$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,@a[2],$bi
	mulhdu	$t1,@a[2],$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
___
	unshift(@a,pop(@a));
}
	($ap,$bi) = ($bi,$ap);
# Final column (b[4]); by now $a1..$a4 hold the *19-scaled limbs, $a0 the
# plain one, matching the rotated order the loop left behind.  Falls
# through into the shared reduction tail .Lfe51_reduce, which is also the
# jump target for fe51_sqr and fe51_mul121666 below: carry-fold each
# 128-bit h back to 51 bits, wrap h4's overflow *19 into h0, and store.
$code.=<<___;
	mulld	$t0,$a1,$bi
	mulhdu	$t1,$a1,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulld	$t0,$a0,$bi
	mulhdu	$t1,$a0,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

.Lfe51_reduce:
	li	$mask,-1
	srdi	$mask,$mask,13		# 0x7ffffffffffff

	srdi	$t0,$h2lo,51
	and	$a2,$h2lo,$mask
	insrdi	$t0,$h2hi,51,0		# h2>>51
	srdi	$t1,$h0lo,51
	and	$a0,$h0lo,$mask
	insrdi	$t1,$h0hi,51,0		# h0>>51
	addc	$h3lo,$h3lo,$t0
	addze	$h3hi,$h3hi
	addc	$h1lo,$h1lo,$t1
	addze	$h1hi,$h1hi

	srdi	$t0,$h3lo,51
	and	$a3,$h3lo,$mask
	insrdi	$t0,$h3hi,51,0		# h3>>51
	srdi	$t1,$h1lo,51
	and	$a1,$h1lo,$mask
	insrdi	$t1,$h1hi,51,0		# h1>>51
	addc	$h4lo,$h4lo,$t0
	addze	$h4hi,$h4hi
	add	$a2,$a2,$t1

	srdi	$t0,$h4lo,51
	and	$a4,$h4lo,$mask
	insrdi	$t0,$h4hi,51,0
	mulli	$t0,$t0,19		# (h4 >> 51) * 19

	add	$a0,$a0,$t0

	srdi	$t1,$a2,51
	and	$a2,$a2,$mask
	add	$a3,$a3,$t1

	srdi	$t0,$a0,51
	and	$a0,$a0,$mask
	add	$a1,$a1,$t0

	std	$a2,16($rp)
	std	$a3,24($rp)
	std	$a4,32($rp)
	std	$a0,0($rp)
	std	$a1,8($rp)

	ld	r21,`$FRAME-8*11`($sp)
	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,11,3,0
	.long	0
.size	x25519_fe51_mul,.-x25519_fe51_mul
___
{
# x25519_fe51_sqr: squaring.  Doubled limbs (a[i]*2) and *19-scaled limbs
# stand in for the symmetric cross-products; register roles are re-aliased
# between heredoc chunks via the Perl-level list swaps below, so the same
# symbolic names track physically different registers as the schedule
# progresses.  Ends by branching into .Lfe51_reduce above.
my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
$code.=<<___;
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function
.align	5
x25519_fe51_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	add	$bi,$a0,$a0		# a[0]*2
	mulli	$t1,$a4,19		# a[4]*19

	mulld	$h0lo,$a0,$a0
	mulhdu	$h0hi,$a0,$a0
	mulld	$h1lo,$a1,$bi
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi
	mulhdu	$h4hi,$a4,$bi
	add	$bi,$a1,$a1		# a[1]*2
___
	($a4,$t1) = ($t1,$a4);
$code.=<<___;
	mulld	$t0,$t1,$a4
	mulhdu	$t1,$t1,$a4
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulli	$bp,$a3,19		# a[3]*19

	mulld	$t0,$a1,$a1
	mulhdu	$t1,$a1,$a1
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	add	$bi,$a3,$a3		# a[3]*2
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
___
	($a3,$t1) = ($bp,$a3);
$code.=<<___;
	mulld	$t0,$t1,$a3
	mulhdu	$t1,$t1,$a3
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	mulld	$t0,$bi,$a4
	mulhdu	$t1,$bi,$a4
	add	$bi,$a2,$a2		# a[2]*2
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a2,$a2
	mulhdu	$t1,$a2,$a2
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
___
}
# x25519_fe51_mul121666: scale every limb by the curve constant 121666
# (built as 65536 + (121666-65536) via lis/ori), then reuse .Lfe51_reduce
# for carry propagation; it also restores the non-volatiles saved here.
$code.=<<___;
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function
.align	5
x25519_fe51_mul121666:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*121666
	mulhdu	$h0hi,$a0,$bi
	mulld	$h1lo,$a1,$bi		# a[1]*121666
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi		# a[2]*121666
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi		# a[3]*121666
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi		# a[4]*121666
	mulhdu	$h4hi,$a4,$bi

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
}
824
# Constant-fold every `...` expression accumulated in $code (frame
# offsets like `$FRAME-8*10`, immediates like `65536>>16`): /e evaluates
# the capture as Perl, /g replaces all occurrences, /m is harmless here.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# STDOUT is aliased to the ppc-xlate.pl pipe (see preamble), so this
# prints the translated assembly; the close check surfaces any buffered
# write or translator failure.
print $code;
close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette