VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/poly1305/asm/poly1305-ppcfp.pl@ 91772

Last change on this file since 91772 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

  • Property svn:executable set to *
File size: 17.2 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements Poly1305 hash for PowerPC FPU.
18#
19# June 2015
20#
21# Numbers are cycles per processed byte with poly1305_blocks alone,
22# and improvement coefficients relative to gcc-generated code.
23#
24# Freescale e300 9.78/+30%
25# PPC74x0 6.92/+50%
26# PPC970 6.03/+80%
27# POWER7 3.50/+30%
28# POWER8 3.75/+10%
29
30$flavour = shift;	# first command-line argument names the target flavour
31
32if ($flavour =~ /64/) {	# flavour name contains "64": 8-byte machine words
33 $SIZE_T =8;	# bytes per general-purpose register slot
34 $LRSAVE =2*$SIZE_T;	# added to the frame size to address the slot where LR is stored
35 $UCMP ="cmpld";	# unsigned (logical) compare mnemonic
36 $STU ="stdu";	# store-with-update mnemonic, used on the stack pointer to open a frame
37 $POP ="ld";	# load one $SIZE_T-wide register
38 $PUSH ="std";	# store one $SIZE_T-wide register
39} elsif ($flavour =~ /32/) {	# flavour name contains "32": 4-byte machine words
40 $SIZE_T =4;
41 $LRSAVE =$SIZE_T;
42 $UCMP ="cmplw";
43 $STU ="stwu";
44 $POP ="lwz";
45 $PUSH ="stw";
46} else { die "nonsense $flavour"; }	# neither "64" nor "32" in the flavour: refuse to generate code
47
48$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;	# 4 when the flavour ends in "le", else 0; XOR-ed into word offsets below to select one half of an 8-byte slot
49
50$LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";	# little-endian 32-bit load: plain lwzx on "le" flavours, byte-reversed lwbrx otherwise
51
52$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part (with trailing separator) of this script's own path
53( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or	# look for the translator next to this script ...
54( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or	# ... then in ../../perlasm relative to it
55die "can't locate ppc-xlate.pl";
56# Pipe STDOUT through the translator; "or" (not "||", which would bind to the command string) lets a failed open reach die.
57open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
58
59$LOCALS=6*$SIZE_T;	# size of the minimal frame opened by poly1305_init_fpu; also the offset of the scratch slots inside the poly1305_blocks_fpu frame
60$FRAME=$LOCALS+6*8+18*8;	# poly1305_blocks_fpu frame: $LOCALS, then 6 eight-byte scratch slots, then 18 slots where f14..f31 are saved
61
62my $sp="r1";	# register used as the stack pointer by all generated routines
63
64my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));	# r3..r6
65my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));	# r7..r12, then r6 again: $i3 shares r6 with $padbit
66
67my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
68 $two0,$two32,$two64,$two96,$two130,$five_two130,
69 $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
70 $s2lo,$s2hi,$s3lo,$s3hi,
71 $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));	# all 32 floating-point registers f0..f31, assigned in the order listed
72# borrowings: the names below get no registers of their own, they alias $c0lo..$c3hi (so $y0/$y1 coincide with $x2/$x3, and $y2/$y3 with $s1lo/$s1hi)
73my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
74my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
75my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);
76
77$code.=<<___;
78.machine "any"
79.text
80
81.globl .poly1305_init_fpu
82.align 6
83.poly1305_init_fpu:
84 $STU $sp,-$LOCALS($sp) # minimal frame
85 mflr $padbit
86 $PUSH $padbit,`$LOCALS+$LRSAVE`($sp)
87
88 bl LPICmeup
89
90 xor r0,r0,r0
91 mtlr $padbit # restore lr
92
93 lfd $two0,8*0($len) # load constants
94 lfd $two32,8*1($len)
95 lfd $two64,8*2($len)
96 lfd $two96,8*3($len)
97 lfd $two130,8*4($len)
98 lfd $five_two130,8*5($len)
99
100 stfd $two0,8*0($ctx) # initial hash value, biased 0
101 stfd $two32,8*1($ctx)
102 stfd $two64,8*2($ctx)
103 stfd $two96,8*3($ctx)
104
105 $UCMP $inp,r0
106 beq- Lno_key
107
108 lfd $h3lo,8*13($len) # new fpscr
109 mffs $h3hi # old fpscr
110
111 stfd $two0,8*4($ctx) # key "template"
112 stfd $two32,8*5($ctx)
113 stfd $two64,8*6($ctx)
114 stfd $two96,8*7($ctx)
115
116 li $in1,4
117 li $in2,8
118 li $in3,12
119 $LWXLE $in0,0,$inp # load key
120 $LWXLE $in1,$in1,$inp
121 $LWXLE $in2,$in2,$inp
122 $LWXLE $in3,$in3,$inp
123
124 lis $i1,0xf000 # 0xf0000000
125 ori $i2,$i1,3 # 0xf0000003
126 andc $in0,$in0,$i1 # &=0x0fffffff
127 andc $in1,$in1,$i2 # &=0x0ffffffc
128 andc $in2,$in2,$i2
129 andc $in3,$in3,$i2
130
131 stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template"
132 stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
133 stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
134 stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
135
136 mtfsf 255,$h3lo # fpscr
137 stfd $two0,8*18($ctx) # copy constants to context
138 stfd $two32,8*19($ctx)
139 stfd $two64,8*20($ctx)
140 stfd $two96,8*21($ctx)
141 stfd $two130,8*22($ctx)
142 stfd $five_two130,8*23($ctx)
143
144 lfd $h0lo,8*4($ctx) # load [biased] key
145 lfd $h1lo,8*5($ctx)
146 lfd $h2lo,8*6($ctx)
147 lfd $h3lo,8*7($ctx)
148
149 fsub $h0lo,$h0lo,$two0 # r0
150 fsub $h1lo,$h1lo,$two32 # r1
151 fsub $h2lo,$h2lo,$two64 # r2
152 fsub $h3lo,$h3lo,$two96 # r3
153
154 lfd $two0,8*6($len) # more constants
155 lfd $two32,8*7($len)
156 lfd $two64,8*8($len)
157 lfd $two96,8*9($len)
158
159 fmul $h1hi,$h1lo,$five_two130 # s1
160 fmul $h2hi,$h2lo,$five_two130 # s2
161 stfd $h3hi,8*15($ctx) # borrow slot for original fpscr
162 fmul $h3hi,$h3lo,$five_two130 # s3
163
164 fadd $h0hi,$h0lo,$two0
165 stfd $h1hi,8*12($ctx) # put aside for now
166 fadd $h1hi,$h1lo,$two32
167 stfd $h2hi,8*13($ctx)
168 fadd $h2hi,$h2lo,$two64
169 stfd $h3hi,8*14($ctx)
170 fadd $h3hi,$h3lo,$two96
171
172 fsub $h0hi,$h0hi,$two0
173 fsub $h1hi,$h1hi,$two32
174 fsub $h2hi,$h2hi,$two64
175 fsub $h3hi,$h3hi,$two96
176
177 lfd $two0,8*10($len) # more constants
178 lfd $two32,8*11($len)
179 lfd $two64,8*12($len)
180
181 fsub $h0lo,$h0lo,$h0hi
182 fsub $h1lo,$h1lo,$h1hi
183 fsub $h2lo,$h2lo,$h2hi
184 fsub $h3lo,$h3lo,$h3hi
185
186 stfd $h0hi,8*5($ctx) # r0hi
187 stfd $h1hi,8*7($ctx) # r1hi
188 stfd $h2hi,8*9($ctx) # r2hi
189 stfd $h3hi,8*11($ctx) # r3hi
190
191 stfd $h0lo,8*4($ctx) # r0lo
192 stfd $h1lo,8*6($ctx) # r1lo
193 stfd $h2lo,8*8($ctx) # r2lo
194 stfd $h3lo,8*10($ctx) # r3lo
195
196 lfd $h1lo,8*12($ctx) # s1
197 lfd $h2lo,8*13($ctx) # s2
198 lfd $h3lo,8*14($ctx) # s3
199 lfd $h0lo,8*15($ctx) # pull original fpscr
200
201 fadd $h1hi,$h1lo,$two0
202 fadd $h2hi,$h2lo,$two32
203 fadd $h3hi,$h3lo,$two64
204
205 fsub $h1hi,$h1hi,$two0
206 fsub $h2hi,$h2hi,$two32
207 fsub $h3hi,$h3hi,$two64
208
209 fsub $h1lo,$h1lo,$h1hi
210 fsub $h2lo,$h2lo,$h2hi
211 fsub $h3lo,$h3lo,$h3hi
212
213 stfd $h1hi,8*13($ctx) # s1hi
214 stfd $h2hi,8*15($ctx) # s2hi
215 stfd $h3hi,8*17($ctx) # s3hi
216
217 stfd $h1lo,8*12($ctx) # s1lo
218 stfd $h2lo,8*14($ctx) # s2lo
219 stfd $h3lo,8*16($ctx) # s3lo
220
221 mtfsf 255,$h0lo # restore fpscr
222Lno_key:
223 xor r3,r3,r3
224 addi $sp,$sp,$LOCALS
225 blr
226 .long 0
227 .byte 0,12,4,1,0x80,0,2,0
228.size .poly1305_init_fpu,.-.poly1305_init_fpu
229
230.globl .poly1305_blocks_fpu
231.align 4
232.poly1305_blocks_fpu:
233 srwi. $len,$len,4
234 beq- Labort
235
236 $STU $sp,-$FRAME($sp)
237 mflr r0
238 stfd f14,`$FRAME-8*18`($sp)
239 stfd f15,`$FRAME-8*17`($sp)
240 stfd f16,`$FRAME-8*16`($sp)
241 stfd f17,`$FRAME-8*15`($sp)
242 stfd f18,`$FRAME-8*14`($sp)
243 stfd f19,`$FRAME-8*13`($sp)
244 stfd f20,`$FRAME-8*12`($sp)
245 stfd f21,`$FRAME-8*11`($sp)
246 stfd f22,`$FRAME-8*10`($sp)
247 stfd f23,`$FRAME-8*9`($sp)
248 stfd f24,`$FRAME-8*8`($sp)
249 stfd f25,`$FRAME-8*7`($sp)
250 stfd f26,`$FRAME-8*6`($sp)
251 stfd f27,`$FRAME-8*5`($sp)
252 stfd f28,`$FRAME-8*4`($sp)
253 stfd f29,`$FRAME-8*3`($sp)
254 stfd f30,`$FRAME-8*2`($sp)
255 stfd f31,`$FRAME-8*1`($sp)
256 $PUSH r0,`$FRAME+$LRSAVE`($sp)
257
258 xor r0,r0,r0
259 li $in3,1
260 mtctr $len
261 neg $len,$len
262 stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
263 stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
264
265 lfd $two0,8*18($ctx) # load constants
266 lfd $two32,8*19($ctx)
267 lfd $two64,8*20($ctx)
268 lfd $two96,8*21($ctx)
269 lfd $two130,8*22($ctx)
270 lfd $five_two130,8*23($ctx)
271
272 lfd $h0lo,8*0($ctx) # load [biased] hash value
273 lfd $h1lo,8*1($ctx)
274 lfd $h2lo,8*2($ctx)
275 lfd $h3lo,8*3($ctx)
276
277 stfd $two0,`$LOCALS+8*0`($sp) # input "template"
278 oris $in3,$padbit,`(1023+52+96)<<4`
279 stfd $two32,`$LOCALS+8*1`($sp)
280 stfd $two64,`$LOCALS+8*2`($sp)
281 stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
282
283 li $i1,4
284 li $i2,8
285 li $i3,12
286 $LWXLE $in0,0,$inp # load input
287 $LWXLE $in1,$i1,$inp
288 $LWXLE $in2,$i2,$inp
289 $LWXLE $in3,$i3,$inp
290 addi $inp,$inp,16
291
292 stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
293 stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
294 stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
295 stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
296
297 mffs $x0 # original fpscr
298 lfd $x1,`$LOCALS+8*4`($sp) # new fpscr
299 lfd $r0lo,8*4($ctx) # load key
300 lfd $r0hi,8*5($ctx)
301 lfd $r1lo,8*6($ctx)
302 lfd $r1hi,8*7($ctx)
303 lfd $r2lo,8*8($ctx)
304 lfd $r2hi,8*9($ctx)
305 lfd $r3lo,8*10($ctx)
306 lfd $r3hi,8*11($ctx)
307 lfd $s1lo,8*12($ctx)
308 lfd $s1hi,8*13($ctx)
309 lfd $s2lo,8*14($ctx)
310 lfd $s2hi,8*15($ctx)
311 lfd $s3lo,8*16($ctx)
312 lfd $s3hi,8*17($ctx)
313
314 stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr
315 mtfsf 255,$x1
316
317 addic $len,$len,1
318 addze r0,r0
319 slwi. r0,r0,4
320 sub $inp,$inp,r0 # conditional rewind
321
322 lfd $x0,`$LOCALS+8*0`($sp)
323 lfd $x1,`$LOCALS+8*1`($sp)
324 lfd $x2,`$LOCALS+8*2`($sp)
325 lfd $x3,`$LOCALS+8*3`($sp)
326
327 fsub $h0lo,$h0lo,$two0 # de-bias hash value
328 $LWXLE $in0,0,$inp # modulo-scheduled input load
329 fsub $h1lo,$h1lo,$two32
330 $LWXLE $in1,$i1,$inp
331 fsub $h2lo,$h2lo,$two64
332 $LWXLE $in2,$i2,$inp
333 fsub $h3lo,$h3lo,$two96
334 $LWXLE $in3,$i3,$inp
335
336 fsub $x0,$x0,$two0 # de-bias input
337 addi $inp,$inp,16
338 fsub $x1,$x1,$two32
339 fsub $x2,$x2,$two64
340 fsub $x3,$x3,$two96
341
342 fadd $x0,$x0,$h0lo # accumulate input
343 stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
344 fadd $x1,$x1,$h1lo
345 stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
346 fadd $x2,$x2,$h2lo
347 stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
348 fadd $x3,$x3,$h3lo
349 stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
350
351 b Lentry
352
353.align 4
354Loop:
355 fsub $y0,$y0,$two0 # de-bias input
356 addic $len,$len,1
357 fsub $y1,$y1,$two32
358 addze r0,r0
359 fsub $y2,$y2,$two64
360 slwi. r0,r0,4
361 fsub $y3,$y3,$two96
362 sub $inp,$inp,r0 # conditional rewind
363
364 fadd $h0lo,$h0lo,$y0 # accumulate input
365 fadd $h0hi,$h0hi,$y1
366 fadd $h2lo,$h2lo,$y2
367 fadd $h2hi,$h2hi,$y3
368
369 ######################################### base 2^48 -> base 2^32
370 fadd $c1lo,$h1lo,$two64
371 $LWXLE $in0,0,$inp # modulo-scheduled input load
372 fadd $c1hi,$h1hi,$two64
373 $LWXLE $in1,$i1,$inp
374 fadd $c3lo,$h3lo,$two130
375 $LWXLE $in2,$i2,$inp
376 fadd $c3hi,$h3hi,$two130
377 $LWXLE $in3,$i3,$inp
378 fadd $c0lo,$h0lo,$two32
379 addi $inp,$inp,16
380 fadd $c0hi,$h0hi,$two32
381 fadd $c2lo,$h2lo,$two96
382 fadd $c2hi,$h2hi,$two96
383
384 fsub $c1lo,$c1lo,$two64
385 stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
386 fsub $c1hi,$c1hi,$two64
387 stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
388 fsub $c3lo,$c3lo,$two130
389 stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
390 fsub $c3hi,$c3hi,$two130
391 stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
392 fsub $c0lo,$c0lo,$two32
393 fsub $c0hi,$c0hi,$two32
394 fsub $c2lo,$c2lo,$two96
395 fsub $c2hi,$c2hi,$two96
396
397 fsub $h1lo,$h1lo,$c1lo
398 fsub $h1hi,$h1hi,$c1hi
399 fsub $h3lo,$h3lo,$c3lo
400 fsub $h3hi,$h3hi,$c3hi
401 fsub $h2lo,$h2lo,$c2lo
402 fsub $h2hi,$h2hi,$c2hi
403 fsub $h0lo,$h0lo,$c0lo
404 fsub $h0hi,$h0hi,$c0hi
405
406 fadd $h1lo,$h1lo,$c0lo
407 fadd $h1hi,$h1hi,$c0hi
408 fadd $h3lo,$h3lo,$c2lo
409 fadd $h3hi,$h3hi,$c2hi
410 fadd $h2lo,$h2lo,$c1lo
411 fadd $h2hi,$h2hi,$c1hi
412 fmadd $h0lo,$c3lo,$five_two130,$h0lo
413 fmadd $h0hi,$c3hi,$five_two130,$h0hi
414
415 fadd $x1,$h1lo,$h1hi
416 lfd $s1lo,8*12($ctx) # reload constants
417 fadd $x3,$h3lo,$h3hi
418 lfd $s1hi,8*13($ctx)
419 fadd $x2,$h2lo,$h2hi
420 lfd $r3lo,8*10($ctx)
421 fadd $x0,$h0lo,$h0hi
422 lfd $r3hi,8*11($ctx)
423Lentry:
424 fmul $h0lo,$s3lo,$x1
425 fmul $h0hi,$s3hi,$x1
426 fmul $h2lo,$r1lo,$x1
427 fmul $h2hi,$r1hi,$x1
428 fmul $h1lo,$r0lo,$x1
429 fmul $h1hi,$r0hi,$x1
430 fmul $h3lo,$r2lo,$x1
431 fmul $h3hi,$r2hi,$x1
432
433 fmadd $h0lo,$s1lo,$x3,$h0lo
434 fmadd $h0hi,$s1hi,$x3,$h0hi
435 fmadd $h2lo,$s3lo,$x3,$h2lo
436 fmadd $h2hi,$s3hi,$x3,$h2hi
437 fmadd $h1lo,$s2lo,$x3,$h1lo
438 fmadd $h1hi,$s2hi,$x3,$h1hi
439 fmadd $h3lo,$r0lo,$x3,$h3lo
440 fmadd $h3hi,$r0hi,$x3,$h3hi
441
442 fmadd $h0lo,$s2lo,$x2,$h0lo
443 fmadd $h0hi,$s2hi,$x2,$h0hi
444 fmadd $h2lo,$r0lo,$x2,$h2lo
445 fmadd $h2hi,$r0hi,$x2,$h2hi
446 fmadd $h1lo,$s3lo,$x2,$h1lo
447 fmadd $h1hi,$s3hi,$x2,$h1hi
448 fmadd $h3lo,$r1lo,$x2,$h3lo
449 fmadd $h3hi,$r1hi,$x2,$h3hi
450
451 fmadd $h0lo,$r0lo,$x0,$h0lo
452 lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input
453 fmadd $h0hi,$r0hi,$x0,$h0hi
454 lfd $y1,`$LOCALS+8*1`($sp)
455 fmadd $h2lo,$r2lo,$x0,$h2lo
456 lfd $y2,`$LOCALS+8*2`($sp)
457 fmadd $h2hi,$r2hi,$x0,$h2hi
458 lfd $y3,`$LOCALS+8*3`($sp)
459 fmadd $h1lo,$r1lo,$x0,$h1lo
460 fmadd $h1hi,$r1hi,$x0,$h1hi
461 fmadd $h3lo,$r3lo,$x0,$h3lo
462 fmadd $h3hi,$r3hi,$x0,$h3hi
463
464 bdnz Loop
465
466 ######################################### base 2^48 -> base 2^32
467 fadd $c0lo,$h0lo,$two32
468 fadd $c0hi,$h0hi,$two32
469 fadd $c2lo,$h2lo,$two96
470 fadd $c2hi,$h2hi,$two96
471 fadd $c1lo,$h1lo,$two64
472 fadd $c1hi,$h1hi,$two64
473 fadd $c3lo,$h3lo,$two130
474 fadd $c3hi,$h3hi,$two130
475
476 fsub $c0lo,$c0lo,$two32
477 fsub $c0hi,$c0hi,$two32
478 fsub $c2lo,$c2lo,$two96
479 fsub $c2hi,$c2hi,$two96
480 fsub $c1lo,$c1lo,$two64
481 fsub $c1hi,$c1hi,$two64
482 fsub $c3lo,$c3lo,$two130
483 fsub $c3hi,$c3hi,$two130
484
485 fsub $h1lo,$h1lo,$c1lo
486 fsub $h1hi,$h1hi,$c1hi
487 fsub $h3lo,$h3lo,$c3lo
488 fsub $h3hi,$h3hi,$c3hi
489 fsub $h2lo,$h2lo,$c2lo
490 fsub $h2hi,$h2hi,$c2hi
491 fsub $h0lo,$h0lo,$c0lo
492 fsub $h0hi,$h0hi,$c0hi
493
494 fadd $h1lo,$h1lo,$c0lo
495 fadd $h1hi,$h1hi,$c0hi
496 fadd $h3lo,$h3lo,$c2lo
497 fadd $h3hi,$h3hi,$c2hi
498 fadd $h2lo,$h2lo,$c1lo
499 fadd $h2hi,$h2hi,$c1hi
500 fmadd $h0lo,$c3lo,$five_two130,$h0lo
501 fmadd $h0hi,$c3hi,$five_two130,$h0hi
502
503 fadd $x1,$h1lo,$h1hi
504 fadd $x3,$h3lo,$h3hi
505 fadd $x2,$h2lo,$h2hi
506 fadd $x0,$h0lo,$h0hi
507
508 lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr
509 fadd $x1,$x1,$two32 # bias
510 fadd $x3,$x3,$two96
511 fadd $x2,$x2,$two64
512 fadd $x0,$x0,$two0
513
514 stfd $x1,8*1($ctx) # store [biased] hash value
515 stfd $x3,8*3($ctx)
516 stfd $x2,8*2($ctx)
517 stfd $x0,8*0($ctx)
518
519 mtfsf 255,$h0lo # restore original fpscr
520 lfd f14,`$FRAME-8*18`($sp)
521 lfd f15,`$FRAME-8*17`($sp)
522 lfd f16,`$FRAME-8*16`($sp)
523 lfd f17,`$FRAME-8*15`($sp)
524 lfd f18,`$FRAME-8*14`($sp)
525 lfd f19,`$FRAME-8*13`($sp)
526 lfd f20,`$FRAME-8*12`($sp)
527 lfd f21,`$FRAME-8*11`($sp)
528 lfd f22,`$FRAME-8*10`($sp)
529 lfd f23,`$FRAME-8*9`($sp)
530 lfd f24,`$FRAME-8*8`($sp)
531 lfd f25,`$FRAME-8*7`($sp)
532 lfd f26,`$FRAME-8*6`($sp)
533 lfd f27,`$FRAME-8*5`($sp)
534 lfd f28,`$FRAME-8*4`($sp)
535 lfd f29,`$FRAME-8*3`($sp)
536 lfd f30,`$FRAME-8*2`($sp)
537 lfd f31,`$FRAME-8*1`($sp)
538 addi $sp,$sp,$FRAME
539Labort:
540 blr
541 .long 0
542 .byte 0,12,4,1,0x80,0,4,0
543.size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
544___
545{	# generator for poly1305_emit_fpu; the bare block keeps its register names and its own $FRAME lexical
546my ($mac,$nonce)=($inp,$len);	# r4 and r5 (named $inp/$len elsewhere in this file) under the names used in this routine
547
548my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3	# $h0..$h4 are r7..r11, $d0..$d3 are r28..r31
549 ) = map("r$_",(7..11,28..31));
550my $mask = "r0";
551my $FRAME = (6+4)*$SIZE_T;	# shadows the file-level $FRAME inside this block; r28..r31 are saved in its top 4 words
552
553$code.=<<___;	# prologue, load of the stored hash words, exponent masking and partial reduction
554.globl .poly1305_emit_fpu
555.align 4
556.poly1305_emit_fpu:
557 $STU $sp,-$FRAME($sp)
558 mflr r0
559 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
560 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
561 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
562 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
563 $PUSH r0,`$FRAME+$LRSAVE`($sp)
564
565 lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash
566 lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
567 lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
568 lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
569 lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
570 lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
571 lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
572 lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
573
574 lis $mask,0xfff0
575 andc $d0,$d0,$mask # mask exponent
576 andc $d1,$d1,$mask
577 andc $d2,$d2,$mask
578 andc $d3,$d3,$mask # can be partially reduced...
579 li $mask,3
580
581 srwi $padbit,$d3,2 # ... so reduce
582 and $h4,$d3,$mask
583 andc $d3,$d3,$mask
584 add $d3,$d3,$padbit
585___
586 if ($SIZE_T==4) {	# 32-bit flavour: 32-bit limbs chained with addc/adde/addze
587$code.=<<___;
588 addc $h0,$h0,$d3
589 adde $h1,$h1,$d0
590 adde $h2,$h2,$d1
591 adde $h3,$h3,$d2
592 addze $h4,$h4
593
594 addic $d0,$h0,5 # compare to modulus
595 addze $d1,$h1
596 addze $d2,$h2
597 addze $d3,$h3
598 addze $mask,$h4
599
600 srwi $mask,$mask,2 # did it carry/borrow?
601 neg $mask,$mask
602 srawi $mask,$mask,31 # mask
603
604 andc $h0,$h0,$mask
605 and $d0,$d0,$mask
606 andc $h1,$h1,$mask
607 and $d1,$d1,$mask
608 or $h0,$h0,$d0
609 lwz $d0,0($nonce) # load nonce
610 andc $h2,$h2,$mask
611 and $d2,$d2,$mask
612 or $h1,$h1,$d1
613 lwz $d1,4($nonce)
614 andc $h3,$h3,$mask
615 and $d3,$d3,$mask
616 or $h2,$h2,$d2
617 lwz $d2,8($nonce)
618 or $h3,$h3,$d3
619 lwz $d3,12($nonce)
620
621 addc $h0,$h0,$d0 # accumulate nonce
622 adde $h1,$h1,$d1
623 adde $h2,$h2,$d2
624 adde $h3,$h3,$d3
625___
626 } else {	# otherwise (64-bit flavour): carries moved with srdi, limbs merged pairwise with insrdi
627$code.=<<___;
628 add $h0,$h0,$d3
629 add $h1,$h1,$d0
630 add $h2,$h2,$d1
631 add $h3,$h3,$d2
632
633 srdi $d0,$h0,32
634 add $h1,$h1,$d0
635 srdi $d1,$h1,32
636 add $h2,$h2,$d1
637 srdi $d2,$h2,32
638 add $h3,$h3,$d2
639 srdi $d3,$h3,32
640 add $h4,$h4,$d3
641
642 insrdi $h0,$h1,32,0
643 insrdi $h2,$h3,32,0
644
645 addic $d0,$h0,5 # compare to modulus
646 addze $d1,$h2
647 addze $d2,$h4
648
649 srdi $mask,$d2,2 # did it carry/borrow?
650 neg $mask,$mask
651 sradi $mask,$mask,63 # mask
652 ld $d2,0($nonce) # load nonce
653 ld $d3,8($nonce)
654
655 andc $h0,$h0,$mask
656 and $d0,$d0,$mask
657 andc $h2,$h2,$mask
658 and $d1,$d1,$mask
659 or $h0,$h0,$d0
660 or $h2,$h2,$d1
661___
662$code.=<<___ if (!$LITTLE_ENDIAN);	# big-endian flavours only: rotate each nonce doubleword by 32 bits
663 rotldi $d2,$d2,32 # flip nonce words
664 rotldi $d3,$d3,32
665___
666$code.=<<___;	# common 64-bit tail: add the nonce, then split back into 32-bit words for the store
667 addc $h0,$h0,$d2 # accumulate nonce
668 adde $h2,$h2,$d3
669
670 srdi $h1,$h0,32
671 srdi $h3,$h2,32
672___
673 }
674$code.=<<___ if ($LITTLE_ENDIAN);	# little-endian flavours: plain word stores
675 stw $h0,0($mac) # write result
676 stw $h1,4($mac)
677 stw $h2,8($mac)
678 stw $h3,12($mac)
679___
680$code.=<<___ if (!$LITTLE_ENDIAN);	# big-endian flavours: byte-reversed word stores (stwbrx)
681 li $d1,4
682 stwbrx $h0,0,$mac # write result
683 li $d2,8
684 stwbrx $h1,$d1,$mac
685 li $d3,12
686 stwbrx $h2,$d2,$mac
687 stwbrx $h3,$d3,$mac
688___
689$code.=<<___;	# epilogue: reload r28..r31, pop the frame, return
690 $POP r28,`$FRAME-$SIZE_T*4`($sp)
691 $POP r29,`$FRAME-$SIZE_T*3`($sp)
692 $POP r30,`$FRAME-$SIZE_T*2`($sp)
693 $POP r31,`$FRAME-$SIZE_T*1`($sp)
694 addi $sp,$sp,$FRAME
695 blr
696 .long 0
697 .byte 0,12,4,1,0x80,4,3,0
698.size .poly1305_emit_fpu,.-.poly1305_emit_fpu
699___
700}
701# Ugly hack here, because PPC assembler syntax seems to vary too
702# much from platform to platform...
703$code.=<<___;
704.align 6
705LPICmeup:
706 mflr r0
707 bcl 20,31,\$+4
708 mflr $len # vvvvvv "distance" between . and 1st data entry
709 addi $len,$len,`64-8` # borrow $len
710 mtlr r0
711 blr
712 .long 0
713 .byte 0,12,0x14,0,0,0,0,0
714 .space `64-9*4`
715
716.quad 0x4330000000000000 # 2^(52+0)
717.quad 0x4530000000000000 # 2^(52+32)
718.quad 0x4730000000000000 # 2^(52+64)
719.quad 0x4930000000000000 # 2^(52+96)
720.quad 0x4b50000000000000 # 2^(52+130)
721
722.quad 0x37f4000000000000 # 5/2^130
723
724.quad 0x4430000000000000 # 2^(52+16+0)
725.quad 0x4630000000000000 # 2^(52+16+32)
726.quad 0x4830000000000000 # 2^(52+16+64)
727.quad 0x4a30000000000000 # 2^(52+16+96)
728.quad 0x3e30000000000000 # 2^(52+16+0-96)
729.quad 0x4030000000000000 # 2^(52+16+32-96)
730.quad 0x4230000000000000 # 2^(52+16+64-96)
731
732.quad 0x0000000000000001 # fpscr: truncate, no exceptions
733.asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
734.align 4
735___
736
737$code =~ s/\`([^\`]*)\`/eval $1/gem;	# replace every `...` span in the accumulated text with the value of the Perl expression inside it
738print $code;	# STDOUT was reopened above as a pipe into $xlate, so the text goes to the translator
739close STDOUT or die "error closing STDOUT: $!";	# check the close so a failure on the pipe is reported
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette