1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # This module implements Poly1305 hash for PowerPC FPU.
|
---|
18 | #
|
---|
19 | # June 2015
|
---|
20 | #
|
---|
21 | # Numbers are cycles per processed byte with poly1305_blocks alone,
|
---|
22 | # and improvement coefficients relative to gcc-generated code.
|
---|
23 | #
|
---|
24 | # Freescale e300 9.78/+30%
|
---|
25 | # PPC74x0 6.92/+50%
|
---|
26 | # PPC970 6.03/+80%
|
---|
27 | # POWER7 3.50/+30%
|
---|
28 | # POWER8 3.75/+10%
|
---|
29 |
|
---|
# The target flavour (for example "linux32", "linux64" or "linux64le") is
# the first command-line argument; it selects the word size and the byte
# order of the generated code.
$flavour = shift;

# Word-size dependent settings: pointer size, offset of the link-register
# save slot above the frame, and the compare/store/load mnemonics of the
# matching width.
if ($flavour =~ /64/) {
    ($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (8, "cmpld", "stdu", "ld", "std");
    $LRSAVE = 2*$SIZE_T;
}
elsif ($flavour =~ /32/) {
    ($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (4, "cmplw", "stwu", "lwz", "stw");
    $LRSAVE = $SIZE_T;
}
else {
    die "nonsense $flavour";
}

# 4 for little-endian flavours, 0 otherwise.  The templates XOR this value
# into word offsets to pick the wanted 32-bit half of an 8-byte slot.
$LITTLE_ENDIAN = 0;
$LITTLE_ENDIAN = 4 if $flavour =~ /le$/;

# Indexed word load used for key and input words: a plain load on
# little-endian targets, a byte-reversed load otherwise.
$LWXLE = "lwbrx";
$LWXLE = "lwzx" if $LITTLE_ENDIAN;
|
---|
51 |
|
---|
# Locate the ppc-xlate.pl translator: first next to this script, then in
# the shared perlasm directory two levels up.  When the script was invoked
# without a directory component the search is relative to the current
# directory, so $dir is set to an empty string explicitly instead of
# reading a capture variable left over from a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator, which
# receives the flavour and the optional output file name (next
# command-line argument).
# The low-precedence "or" is required: with "||" the die attached to the
# always-true command string and a failed open went unnoticed.
my $output = shift;
$output = "" unless defined $output;
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
|
---|
58 |
|
---|
# Stack layout.  $LOCALS is the minimal frame; the full frame used by
# poly1305_blocks_fpu adds six 8-byte scratch slots (input template and
# FPSCR image) plus the 18-register save area for f14-f31.
$LOCALS = 6*$SIZE_T;
$FRAME  = $LOCALS + 6*8 + 18*8;

# Stack pointer.
my $sp = "r1";

# Function arguments as they arrive in r3-r6.
my ($ctx, $inp, $len, $padbit) = qw(r3 r4 r5 r6);

# Integer scratch registers.  Note that $i3 shares r6 with $padbit.
my ($in0, $in1, $in2, $in3, $i1, $i2, $i3) = qw(r7 r8 r9 r10 r11 r12 r6);

# Floating-point register allocation, f0 through f31 in declaration order.
my ($h0lo, $h0hi, $h1lo, $h1hi, $h2lo, $h2hi, $h3lo, $h3hi,
    $two0, $two32, $two64, $two96, $two130, $five_two130,
    $r0lo, $r0hi, $r1lo, $r1hi, $r2lo, $r2hi,
    $s2lo, $s2hi, $s3lo, $s3hi,
    $c0lo, $c0hi, $c1lo, $c1hi, $c2lo, $c2hi, $c3lo, $c3hi)
    = map { "f$_" } 0 .. 31;

# Borrowings: these names alias the carry registers declared above.
my ($r3lo, $r3hi, $s1lo, $s1hi) = ($c0lo, $c0hi, $c1lo, $c1hi);
my ($x0, $x1, $x2, $x3)         = ($c2lo, $c2hi, $c3lo, $c3hi);
my ($y0, $y1, $y2, $y3)         = ($c3lo, $c3hi, $c1lo, $c1hi);
|
---|
77 | $code.=<<___;
|
---|
78 | .machine "any"
|
---|
79 | .text
|
---|
80 |
|
---|
81 | .globl .poly1305_init_fpu
|
---|
82 | .align 6
|
---|
83 | .poly1305_init_fpu:
|
---|
84 | $STU $sp,-$LOCALS($sp) # minimal frame
|
---|
85 | mflr $padbit
|
---|
86 | $PUSH $padbit,`$LOCALS+$LRSAVE`($sp)
|
---|
87 |
|
---|
88 | bl LPICmeup
|
---|
89 |
|
---|
90 | xor r0,r0,r0
|
---|
91 | mtlr $padbit # restore lr
|
---|
92 |
|
---|
93 | lfd $two0,8*0($len) # load constants
|
---|
94 | lfd $two32,8*1($len)
|
---|
95 | lfd $two64,8*2($len)
|
---|
96 | lfd $two96,8*3($len)
|
---|
97 | lfd $two130,8*4($len)
|
---|
98 | lfd $five_two130,8*5($len)
|
---|
99 |
|
---|
100 | stfd $two0,8*0($ctx) # initial hash value, biased 0
|
---|
101 | stfd $two32,8*1($ctx)
|
---|
102 | stfd $two64,8*2($ctx)
|
---|
103 | stfd $two96,8*3($ctx)
|
---|
104 |
|
---|
105 | $UCMP $inp,r0
|
---|
106 | beq- Lno_key
|
---|
107 |
|
---|
108 | lfd $h3lo,8*13($len) # new fpscr
|
---|
109 | mffs $h3hi # old fpscr
|
---|
110 |
|
---|
111 | stfd $two0,8*4($ctx) # key "template"
|
---|
112 | stfd $two32,8*5($ctx)
|
---|
113 | stfd $two64,8*6($ctx)
|
---|
114 | stfd $two96,8*7($ctx)
|
---|
115 |
|
---|
116 | li $in1,4
|
---|
117 | li $in2,8
|
---|
118 | li $in3,12
|
---|
119 | $LWXLE $in0,0,$inp # load key
|
---|
120 | $LWXLE $in1,$in1,$inp
|
---|
121 | $LWXLE $in2,$in2,$inp
|
---|
122 | $LWXLE $in3,$in3,$inp
|
---|
123 |
|
---|
124 | lis $i1,0xf000 # 0xf0000000
|
---|
125 | ori $i2,$i1,3 # 0xf0000003
|
---|
126 | andc $in0,$in0,$i1 # &=0x0fffffff
|
---|
127 | andc $in1,$in1,$i2 # &=0x0ffffffc
|
---|
128 | andc $in2,$in2,$i2
|
---|
129 | andc $in3,$in3,$i2
|
---|
130 |
|
---|
131 | stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template"
|
---|
132 | stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
|
---|
133 | stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
|
---|
134 | stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
|
---|
135 |
|
---|
136 | mtfsf 255,$h3lo # fpscr
|
---|
137 | stfd $two0,8*18($ctx) # copy constants to context
|
---|
138 | stfd $two32,8*19($ctx)
|
---|
139 | stfd $two64,8*20($ctx)
|
---|
140 | stfd $two96,8*21($ctx)
|
---|
141 | stfd $two130,8*22($ctx)
|
---|
142 | stfd $five_two130,8*23($ctx)
|
---|
143 |
|
---|
144 | lfd $h0lo,8*4($ctx) # load [biased] key
|
---|
145 | lfd $h1lo,8*5($ctx)
|
---|
146 | lfd $h2lo,8*6($ctx)
|
---|
147 | lfd $h3lo,8*7($ctx)
|
---|
148 |
|
---|
149 | fsub $h0lo,$h0lo,$two0 # r0
|
---|
150 | fsub $h1lo,$h1lo,$two32 # r1
|
---|
151 | fsub $h2lo,$h2lo,$two64 # r2
|
---|
152 | fsub $h3lo,$h3lo,$two96 # r3
|
---|
153 |
|
---|
154 | lfd $two0,8*6($len) # more constants
|
---|
155 | lfd $two32,8*7($len)
|
---|
156 | lfd $two64,8*8($len)
|
---|
157 | lfd $two96,8*9($len)
|
---|
158 |
|
---|
159 | fmul $h1hi,$h1lo,$five_two130 # s1
|
---|
160 | fmul $h2hi,$h2lo,$five_two130 # s2
|
---|
161 | stfd $h3hi,8*15($ctx) # borrow slot for original fpscr
|
---|
162 | fmul $h3hi,$h3lo,$five_two130 # s3
|
---|
163 |
|
---|
164 | fadd $h0hi,$h0lo,$two0
|
---|
165 | stfd $h1hi,8*12($ctx) # put aside for now
|
---|
166 | fadd $h1hi,$h1lo,$two32
|
---|
167 | stfd $h2hi,8*13($ctx)
|
---|
168 | fadd $h2hi,$h2lo,$two64
|
---|
169 | stfd $h3hi,8*14($ctx)
|
---|
170 | fadd $h3hi,$h3lo,$two96
|
---|
171 |
|
---|
172 | fsub $h0hi,$h0hi,$two0
|
---|
173 | fsub $h1hi,$h1hi,$two32
|
---|
174 | fsub $h2hi,$h2hi,$two64
|
---|
175 | fsub $h3hi,$h3hi,$two96
|
---|
176 |
|
---|
177 | lfd $two0,8*10($len) # more constants
|
---|
178 | lfd $two32,8*11($len)
|
---|
179 | lfd $two64,8*12($len)
|
---|
180 |
|
---|
181 | fsub $h0lo,$h0lo,$h0hi
|
---|
182 | fsub $h1lo,$h1lo,$h1hi
|
---|
183 | fsub $h2lo,$h2lo,$h2hi
|
---|
184 | fsub $h3lo,$h3lo,$h3hi
|
---|
185 |
|
---|
186 | stfd $h0hi,8*5($ctx) # r0hi
|
---|
187 | stfd $h1hi,8*7($ctx) # r1hi
|
---|
188 | stfd $h2hi,8*9($ctx) # r2hi
|
---|
189 | stfd $h3hi,8*11($ctx) # r3hi
|
---|
190 |
|
---|
191 | stfd $h0lo,8*4($ctx) # r0lo
|
---|
192 | stfd $h1lo,8*6($ctx) # r1lo
|
---|
193 | stfd $h2lo,8*8($ctx) # r2lo
|
---|
194 | stfd $h3lo,8*10($ctx) # r3lo
|
---|
195 |
|
---|
196 | lfd $h1lo,8*12($ctx) # s1
|
---|
197 | lfd $h2lo,8*13($ctx) # s2
|
---|
198 | lfd $h3lo,8*14($ctx) # s3
|
---|
199 | lfd $h0lo,8*15($ctx) # pull original fpscr
|
---|
200 |
|
---|
201 | fadd $h1hi,$h1lo,$two0
|
---|
202 | fadd $h2hi,$h2lo,$two32
|
---|
203 | fadd $h3hi,$h3lo,$two64
|
---|
204 |
|
---|
205 | fsub $h1hi,$h1hi,$two0
|
---|
206 | fsub $h2hi,$h2hi,$two32
|
---|
207 | fsub $h3hi,$h3hi,$two64
|
---|
208 |
|
---|
209 | fsub $h1lo,$h1lo,$h1hi
|
---|
210 | fsub $h2lo,$h2lo,$h2hi
|
---|
211 | fsub $h3lo,$h3lo,$h3hi
|
---|
212 |
|
---|
213 | stfd $h1hi,8*13($ctx) # s1hi
|
---|
214 | stfd $h2hi,8*15($ctx) # s2hi
|
---|
215 | stfd $h3hi,8*17($ctx) # s3hi
|
---|
216 |
|
---|
217 | stfd $h1lo,8*12($ctx) # s1lo
|
---|
218 | stfd $h2lo,8*14($ctx) # s2lo
|
---|
219 | stfd $h3lo,8*16($ctx) # s3lo
|
---|
220 |
|
---|
221 | mtfsf 255,$h0lo # restore fpscr
|
---|
222 | Lno_key:
|
---|
223 | xor r3,r3,r3
|
---|
224 | addi $sp,$sp,$LOCALS
|
---|
225 | blr
|
---|
226 | .long 0
|
---|
227 | .byte 0,12,4,1,0x80,0,2,0
|
---|
228 | .size .poly1305_init_fpu,.-.poly1305_init_fpu
|
---|
229 |
|
---|
230 | .globl .poly1305_blocks_fpu
|
---|
231 | .align 4
|
---|
232 | .poly1305_blocks_fpu:
|
---|
233 | srwi. $len,$len,4
|
---|
234 | beq- Labort
|
---|
235 |
|
---|
236 | $STU $sp,-$FRAME($sp)
|
---|
237 | mflr r0
|
---|
238 | stfd f14,`$FRAME-8*18`($sp)
|
---|
239 | stfd f15,`$FRAME-8*17`($sp)
|
---|
240 | stfd f16,`$FRAME-8*16`($sp)
|
---|
241 | stfd f17,`$FRAME-8*15`($sp)
|
---|
242 | stfd f18,`$FRAME-8*14`($sp)
|
---|
243 | stfd f19,`$FRAME-8*13`($sp)
|
---|
244 | stfd f20,`$FRAME-8*12`($sp)
|
---|
245 | stfd f21,`$FRAME-8*11`($sp)
|
---|
246 | stfd f22,`$FRAME-8*10`($sp)
|
---|
247 | stfd f23,`$FRAME-8*9`($sp)
|
---|
248 | stfd f24,`$FRAME-8*8`($sp)
|
---|
249 | stfd f25,`$FRAME-8*7`($sp)
|
---|
250 | stfd f26,`$FRAME-8*6`($sp)
|
---|
251 | stfd f27,`$FRAME-8*5`($sp)
|
---|
252 | stfd f28,`$FRAME-8*4`($sp)
|
---|
253 | stfd f29,`$FRAME-8*3`($sp)
|
---|
254 | stfd f30,`$FRAME-8*2`($sp)
|
---|
255 | stfd f31,`$FRAME-8*1`($sp)
|
---|
256 | $PUSH r0,`$FRAME+$LRSAVE`($sp)
|
---|
257 |
|
---|
258 | xor r0,r0,r0
|
---|
259 | li $in3,1
|
---|
260 | mtctr $len
|
---|
261 | neg $len,$len
|
---|
262 | stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
|
---|
263 | stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
|
---|
264 |
|
---|
265 | lfd $two0,8*18($ctx) # load constants
|
---|
266 | lfd $two32,8*19($ctx)
|
---|
267 | lfd $two64,8*20($ctx)
|
---|
268 | lfd $two96,8*21($ctx)
|
---|
269 | lfd $two130,8*22($ctx)
|
---|
270 | lfd $five_two130,8*23($ctx)
|
---|
271 |
|
---|
272 | lfd $h0lo,8*0($ctx) # load [biased] hash value
|
---|
273 | lfd $h1lo,8*1($ctx)
|
---|
274 | lfd $h2lo,8*2($ctx)
|
---|
275 | lfd $h3lo,8*3($ctx)
|
---|
276 |
|
---|
277 | stfd $two0,`$LOCALS+8*0`($sp) # input "template"
|
---|
278 | oris $in3,$padbit,`(1023+52+96)<<4`
|
---|
279 | stfd $two32,`$LOCALS+8*1`($sp)
|
---|
280 | stfd $two64,`$LOCALS+8*2`($sp)
|
---|
281 | stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
|
---|
282 |
|
---|
283 | li $i1,4
|
---|
284 | li $i2,8
|
---|
285 | li $i3,12
|
---|
286 | $LWXLE $in0,0,$inp # load input
|
---|
287 | $LWXLE $in1,$i1,$inp
|
---|
288 | $LWXLE $in2,$i2,$inp
|
---|
289 | $LWXLE $in3,$i3,$inp
|
---|
290 | addi $inp,$inp,16
|
---|
291 |
|
---|
292 | stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
|
---|
293 | stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
|
---|
294 | stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
|
---|
295 | stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
|
---|
296 |
|
---|
297 | mffs $x0 # original fpscr
|
---|
298 | lfd $x1,`$LOCALS+8*4`($sp) # new fpscr
|
---|
299 | lfd $r0lo,8*4($ctx) # load key
|
---|
300 | lfd $r0hi,8*5($ctx)
|
---|
301 | lfd $r1lo,8*6($ctx)
|
---|
302 | lfd $r1hi,8*7($ctx)
|
---|
303 | lfd $r2lo,8*8($ctx)
|
---|
304 | lfd $r2hi,8*9($ctx)
|
---|
305 | lfd $r3lo,8*10($ctx)
|
---|
306 | lfd $r3hi,8*11($ctx)
|
---|
307 | lfd $s1lo,8*12($ctx)
|
---|
308 | lfd $s1hi,8*13($ctx)
|
---|
309 | lfd $s2lo,8*14($ctx)
|
---|
310 | lfd $s2hi,8*15($ctx)
|
---|
311 | lfd $s3lo,8*16($ctx)
|
---|
312 | lfd $s3hi,8*17($ctx)
|
---|
313 |
|
---|
314 | stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr
|
---|
315 | mtfsf 255,$x1
|
---|
316 |
|
---|
317 | addic $len,$len,1
|
---|
318 | addze r0,r0
|
---|
319 | slwi. r0,r0,4
|
---|
320 | sub $inp,$inp,r0 # conditional rewind
|
---|
321 |
|
---|
322 | lfd $x0,`$LOCALS+8*0`($sp)
|
---|
323 | lfd $x1,`$LOCALS+8*1`($sp)
|
---|
324 | lfd $x2,`$LOCALS+8*2`($sp)
|
---|
325 | lfd $x3,`$LOCALS+8*3`($sp)
|
---|
326 |
|
---|
327 | fsub $h0lo,$h0lo,$two0 # de-bias hash value
|
---|
328 | $LWXLE $in0,0,$inp # modulo-scheduled input load
|
---|
329 | fsub $h1lo,$h1lo,$two32
|
---|
330 | $LWXLE $in1,$i1,$inp
|
---|
331 | fsub $h2lo,$h2lo,$two64
|
---|
332 | $LWXLE $in2,$i2,$inp
|
---|
333 | fsub $h3lo,$h3lo,$two96
|
---|
334 | $LWXLE $in3,$i3,$inp
|
---|
335 |
|
---|
336 | fsub $x0,$x0,$two0 # de-bias input
|
---|
337 | addi $inp,$inp,16
|
---|
338 | fsub $x1,$x1,$two32
|
---|
339 | fsub $x2,$x2,$two64
|
---|
340 | fsub $x3,$x3,$two96
|
---|
341 |
|
---|
342 | fadd $x0,$x0,$h0lo # accumulate input
|
---|
343 | stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
|
---|
344 | fadd $x1,$x1,$h1lo
|
---|
345 | stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
|
---|
346 | fadd $x2,$x2,$h2lo
|
---|
347 | stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
|
---|
348 | fadd $x3,$x3,$h3lo
|
---|
349 | stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
|
---|
350 |
|
---|
351 | b Lentry
|
---|
352 |
|
---|
353 | .align 4
|
---|
354 | Loop:
|
---|
355 | fsub $y0,$y0,$two0 # de-bias input
|
---|
356 | addic $len,$len,1
|
---|
357 | fsub $y1,$y1,$two32
|
---|
358 | addze r0,r0
|
---|
359 | fsub $y2,$y2,$two64
|
---|
360 | slwi. r0,r0,4
|
---|
361 | fsub $y3,$y3,$two96
|
---|
362 | sub $inp,$inp,r0 # conditional rewind
|
---|
363 |
|
---|
364 | fadd $h0lo,$h0lo,$y0 # accumulate input
|
---|
365 | fadd $h0hi,$h0hi,$y1
|
---|
366 | fadd $h2lo,$h2lo,$y2
|
---|
367 | fadd $h2hi,$h2hi,$y3
|
---|
368 |
|
---|
369 | ######################################### base 2^48 -> base 2^32
|
---|
370 | fadd $c1lo,$h1lo,$two64
|
---|
371 | $LWXLE $in0,0,$inp # modulo-scheduled input load
|
---|
372 | fadd $c1hi,$h1hi,$two64
|
---|
373 | $LWXLE $in1,$i1,$inp
|
---|
374 | fadd $c3lo,$h3lo,$two130
|
---|
375 | $LWXLE $in2,$i2,$inp
|
---|
376 | fadd $c3hi,$h3hi,$two130
|
---|
377 | $LWXLE $in3,$i3,$inp
|
---|
378 | fadd $c0lo,$h0lo,$two32
|
---|
379 | addi $inp,$inp,16
|
---|
380 | fadd $c0hi,$h0hi,$two32
|
---|
381 | fadd $c2lo,$h2lo,$two96
|
---|
382 | fadd $c2hi,$h2hi,$two96
|
---|
383 |
|
---|
384 | fsub $c1lo,$c1lo,$two64
|
---|
385 | stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
|
---|
386 | fsub $c1hi,$c1hi,$two64
|
---|
387 | stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
|
---|
388 | fsub $c3lo,$c3lo,$two130
|
---|
389 | stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
|
---|
390 | fsub $c3hi,$c3hi,$two130
|
---|
391 | stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
|
---|
392 | fsub $c0lo,$c0lo,$two32
|
---|
393 | fsub $c0hi,$c0hi,$two32
|
---|
394 | fsub $c2lo,$c2lo,$two96
|
---|
395 | fsub $c2hi,$c2hi,$two96
|
---|
396 |
|
---|
397 | fsub $h1lo,$h1lo,$c1lo
|
---|
398 | fsub $h1hi,$h1hi,$c1hi
|
---|
399 | fsub $h3lo,$h3lo,$c3lo
|
---|
400 | fsub $h3hi,$h3hi,$c3hi
|
---|
401 | fsub $h2lo,$h2lo,$c2lo
|
---|
402 | fsub $h2hi,$h2hi,$c2hi
|
---|
403 | fsub $h0lo,$h0lo,$c0lo
|
---|
404 | fsub $h0hi,$h0hi,$c0hi
|
---|
405 |
|
---|
406 | fadd $h1lo,$h1lo,$c0lo
|
---|
407 | fadd $h1hi,$h1hi,$c0hi
|
---|
408 | fadd $h3lo,$h3lo,$c2lo
|
---|
409 | fadd $h3hi,$h3hi,$c2hi
|
---|
410 | fadd $h2lo,$h2lo,$c1lo
|
---|
411 | fadd $h2hi,$h2hi,$c1hi
|
---|
412 | fmadd $h0lo,$c3lo,$five_two130,$h0lo
|
---|
413 | fmadd $h0hi,$c3hi,$five_two130,$h0hi
|
---|
414 |
|
---|
415 | fadd $x1,$h1lo,$h1hi
|
---|
416 | lfd $s1lo,8*12($ctx) # reload constants
|
---|
417 | fadd $x3,$h3lo,$h3hi
|
---|
418 | lfd $s1hi,8*13($ctx)
|
---|
419 | fadd $x2,$h2lo,$h2hi
|
---|
420 | lfd $r3lo,8*10($ctx)
|
---|
421 | fadd $x0,$h0lo,$h0hi
|
---|
422 | lfd $r3hi,8*11($ctx)
|
---|
423 | Lentry:
|
---|
424 | fmul $h0lo,$s3lo,$x1
|
---|
425 | fmul $h0hi,$s3hi,$x1
|
---|
426 | fmul $h2lo,$r1lo,$x1
|
---|
427 | fmul $h2hi,$r1hi,$x1
|
---|
428 | fmul $h1lo,$r0lo,$x1
|
---|
429 | fmul $h1hi,$r0hi,$x1
|
---|
430 | fmul $h3lo,$r2lo,$x1
|
---|
431 | fmul $h3hi,$r2hi,$x1
|
---|
432 |
|
---|
433 | fmadd $h0lo,$s1lo,$x3,$h0lo
|
---|
434 | fmadd $h0hi,$s1hi,$x3,$h0hi
|
---|
435 | fmadd $h2lo,$s3lo,$x3,$h2lo
|
---|
436 | fmadd $h2hi,$s3hi,$x3,$h2hi
|
---|
437 | fmadd $h1lo,$s2lo,$x3,$h1lo
|
---|
438 | fmadd $h1hi,$s2hi,$x3,$h1hi
|
---|
439 | fmadd $h3lo,$r0lo,$x3,$h3lo
|
---|
440 | fmadd $h3hi,$r0hi,$x3,$h3hi
|
---|
441 |
|
---|
442 | fmadd $h0lo,$s2lo,$x2,$h0lo
|
---|
443 | fmadd $h0hi,$s2hi,$x2,$h0hi
|
---|
444 | fmadd $h2lo,$r0lo,$x2,$h2lo
|
---|
445 | fmadd $h2hi,$r0hi,$x2,$h2hi
|
---|
446 | fmadd $h1lo,$s3lo,$x2,$h1lo
|
---|
447 | fmadd $h1hi,$s3hi,$x2,$h1hi
|
---|
448 | fmadd $h3lo,$r1lo,$x2,$h3lo
|
---|
449 | fmadd $h3hi,$r1hi,$x2,$h3hi
|
---|
450 |
|
---|
451 | fmadd $h0lo,$r0lo,$x0,$h0lo
|
---|
452 | lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input
|
---|
453 | fmadd $h0hi,$r0hi,$x0,$h0hi
|
---|
454 | lfd $y1,`$LOCALS+8*1`($sp)
|
---|
455 | fmadd $h2lo,$r2lo,$x0,$h2lo
|
---|
456 | lfd $y2,`$LOCALS+8*2`($sp)
|
---|
457 | fmadd $h2hi,$r2hi,$x0,$h2hi
|
---|
458 | lfd $y3,`$LOCALS+8*3`($sp)
|
---|
459 | fmadd $h1lo,$r1lo,$x0,$h1lo
|
---|
460 | fmadd $h1hi,$r1hi,$x0,$h1hi
|
---|
461 | fmadd $h3lo,$r3lo,$x0,$h3lo
|
---|
462 | fmadd $h3hi,$r3hi,$x0,$h3hi
|
---|
463 |
|
---|
464 | bdnz Loop
|
---|
465 |
|
---|
466 | ######################################### base 2^48 -> base 2^32
|
---|
467 | fadd $c0lo,$h0lo,$two32
|
---|
468 | fadd $c0hi,$h0hi,$two32
|
---|
469 | fadd $c2lo,$h2lo,$two96
|
---|
470 | fadd $c2hi,$h2hi,$two96
|
---|
471 | fadd $c1lo,$h1lo,$two64
|
---|
472 | fadd $c1hi,$h1hi,$two64
|
---|
473 | fadd $c3lo,$h3lo,$two130
|
---|
474 | fadd $c3hi,$h3hi,$two130
|
---|
475 |
|
---|
476 | fsub $c0lo,$c0lo,$two32
|
---|
477 | fsub $c0hi,$c0hi,$two32
|
---|
478 | fsub $c2lo,$c2lo,$two96
|
---|
479 | fsub $c2hi,$c2hi,$two96
|
---|
480 | fsub $c1lo,$c1lo,$two64
|
---|
481 | fsub $c1hi,$c1hi,$two64
|
---|
482 | fsub $c3lo,$c3lo,$two130
|
---|
483 | fsub $c3hi,$c3hi,$two130
|
---|
484 |
|
---|
485 | fsub $h1lo,$h1lo,$c1lo
|
---|
486 | fsub $h1hi,$h1hi,$c1hi
|
---|
487 | fsub $h3lo,$h3lo,$c3lo
|
---|
488 | fsub $h3hi,$h3hi,$c3hi
|
---|
489 | fsub $h2lo,$h2lo,$c2lo
|
---|
490 | fsub $h2hi,$h2hi,$c2hi
|
---|
491 | fsub $h0lo,$h0lo,$c0lo
|
---|
492 | fsub $h0hi,$h0hi,$c0hi
|
---|
493 |
|
---|
494 | fadd $h1lo,$h1lo,$c0lo
|
---|
495 | fadd $h1hi,$h1hi,$c0hi
|
---|
496 | fadd $h3lo,$h3lo,$c2lo
|
---|
497 | fadd $h3hi,$h3hi,$c2hi
|
---|
498 | fadd $h2lo,$h2lo,$c1lo
|
---|
499 | fadd $h2hi,$h2hi,$c1hi
|
---|
500 | fmadd $h0lo,$c3lo,$five_two130,$h0lo
|
---|
501 | fmadd $h0hi,$c3hi,$five_two130,$h0hi
|
---|
502 |
|
---|
503 | fadd $x1,$h1lo,$h1hi
|
---|
504 | fadd $x3,$h3lo,$h3hi
|
---|
505 | fadd $x2,$h2lo,$h2hi
|
---|
506 | fadd $x0,$h0lo,$h0hi
|
---|
507 |
|
---|
508 | lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr
|
---|
509 | fadd $x1,$x1,$two32 # bias
|
---|
510 | fadd $x3,$x3,$two96
|
---|
511 | fadd $x2,$x2,$two64
|
---|
512 | fadd $x0,$x0,$two0
|
---|
513 |
|
---|
514 | stfd $x1,8*1($ctx) # store [biased] hash value
|
---|
515 | stfd $x3,8*3($ctx)
|
---|
516 | stfd $x2,8*2($ctx)
|
---|
517 | stfd $x0,8*0($ctx)
|
---|
518 |
|
---|
519 | mtfsf 255,$h0lo # restore original fpscr
|
---|
520 | lfd f14,`$FRAME-8*18`($sp)
|
---|
521 | lfd f15,`$FRAME-8*17`($sp)
|
---|
522 | lfd f16,`$FRAME-8*16`($sp)
|
---|
523 | lfd f17,`$FRAME-8*15`($sp)
|
---|
524 | lfd f18,`$FRAME-8*14`($sp)
|
---|
525 | lfd f19,`$FRAME-8*13`($sp)
|
---|
526 | lfd f20,`$FRAME-8*12`($sp)
|
---|
527 | lfd f21,`$FRAME-8*11`($sp)
|
---|
528 | lfd f22,`$FRAME-8*10`($sp)
|
---|
529 | lfd f23,`$FRAME-8*9`($sp)
|
---|
530 | lfd f24,`$FRAME-8*8`($sp)
|
---|
531 | lfd f25,`$FRAME-8*7`($sp)
|
---|
532 | lfd f26,`$FRAME-8*6`($sp)
|
---|
533 | lfd f27,`$FRAME-8*5`($sp)
|
---|
534 | lfd f28,`$FRAME-8*4`($sp)
|
---|
535 | lfd f29,`$FRAME-8*3`($sp)
|
---|
536 | lfd f30,`$FRAME-8*2`($sp)
|
---|
537 | lfd f31,`$FRAME-8*1`($sp)
|
---|
538 | addi $sp,$sp,$FRAME
|
---|
539 | Labort:
|
---|
540 | blr
|
---|
541 | .long 0
|
---|
542 | .byte 0,12,4,1,0x80,0,4,0
|
---|
543 | .size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
|
---|
544 | ___
|
---|
545 | {
|
---|
546 | my ($mac,$nonce)=($inp,$len);
|
---|
547 |
|
---|
548 | my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
|
---|
549 | ) = map("r$_",(7..11,28..31));
|
---|
550 | my $mask = "r0";
|
---|
551 | my $FRAME = (6+4)*$SIZE_T;
|
---|
552 |
|
---|
553 | $code.=<<___;
|
---|
554 | .globl .poly1305_emit_fpu
|
---|
555 | .align 4
|
---|
556 | .poly1305_emit_fpu:
|
---|
557 | $STU $sp,-$FRAME($sp)
|
---|
558 | mflr r0
|
---|
559 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
|
---|
560 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
|
---|
561 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
|
---|
562 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
|
---|
563 | $PUSH r0,`$FRAME+$LRSAVE`($sp)
|
---|
564 |
|
---|
565 | lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash
|
---|
566 | lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
|
---|
567 | lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
|
---|
568 | lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
|
---|
569 | lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
|
---|
570 | lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
|
---|
571 | lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
|
---|
572 | lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
|
---|
573 |
|
---|
574 | lis $mask,0xfff0
|
---|
575 | andc $d0,$d0,$mask # mask exponent
|
---|
576 | andc $d1,$d1,$mask
|
---|
577 | andc $d2,$d2,$mask
|
---|
578 | andc $d3,$d3,$mask # can be partially reduced...
|
---|
579 | li $mask,3
|
---|
580 |
|
---|
581 | srwi $padbit,$d3,2 # ... so reduce
|
---|
582 | and $h4,$d3,$mask
|
---|
583 | andc $d3,$d3,$mask
|
---|
584 | add $d3,$d3,$padbit
|
---|
585 | ___
|
---|
586 | if ($SIZE_T==4) {
|
---|
587 | $code.=<<___;
|
---|
588 | addc $h0,$h0,$d3
|
---|
589 | adde $h1,$h1,$d0
|
---|
590 | adde $h2,$h2,$d1
|
---|
591 | adde $h3,$h3,$d2
|
---|
592 | addze $h4,$h4
|
---|
593 |
|
---|
594 | addic $d0,$h0,5 # compare to modulus
|
---|
595 | addze $d1,$h1
|
---|
596 | addze $d2,$h2
|
---|
597 | addze $d3,$h3
|
---|
598 | addze $mask,$h4
|
---|
599 |
|
---|
600 | srwi $mask,$mask,2 # did it carry/borrow?
|
---|
601 | neg $mask,$mask
|
---|
602 | srawi $mask,$mask,31 # mask
|
---|
603 |
|
---|
604 | andc $h0,$h0,$mask
|
---|
605 | and $d0,$d0,$mask
|
---|
606 | andc $h1,$h1,$mask
|
---|
607 | and $d1,$d1,$mask
|
---|
608 | or $h0,$h0,$d0
|
---|
609 | lwz $d0,0($nonce) # load nonce
|
---|
610 | andc $h2,$h2,$mask
|
---|
611 | and $d2,$d2,$mask
|
---|
612 | or $h1,$h1,$d1
|
---|
613 | lwz $d1,4($nonce)
|
---|
614 | andc $h3,$h3,$mask
|
---|
615 | and $d3,$d3,$mask
|
---|
616 | or $h2,$h2,$d2
|
---|
617 | lwz $d2,8($nonce)
|
---|
618 | or $h3,$h3,$d3
|
---|
619 | lwz $d3,12($nonce)
|
---|
620 |
|
---|
621 | addc $h0,$h0,$d0 # accumulate nonce
|
---|
622 | adde $h1,$h1,$d1
|
---|
623 | adde $h2,$h2,$d2
|
---|
624 | adde $h3,$h3,$d3
|
---|
625 | ___
|
---|
626 | } else {
|
---|
627 | $code.=<<___;
|
---|
628 | add $h0,$h0,$d3
|
---|
629 | add $h1,$h1,$d0
|
---|
630 | add $h2,$h2,$d1
|
---|
631 | add $h3,$h3,$d2
|
---|
632 |
|
---|
633 | srdi $d0,$h0,32
|
---|
634 | add $h1,$h1,$d0
|
---|
635 | srdi $d1,$h1,32
|
---|
636 | add $h2,$h2,$d1
|
---|
637 | srdi $d2,$h2,32
|
---|
638 | add $h3,$h3,$d2
|
---|
639 | srdi $d3,$h3,32
|
---|
640 | add $h4,$h4,$d3
|
---|
641 |
|
---|
642 | insrdi $h0,$h1,32,0
|
---|
643 | insrdi $h2,$h3,32,0
|
---|
644 |
|
---|
645 | addic $d0,$h0,5 # compare to modulus
|
---|
646 | addze $d1,$h2
|
---|
647 | addze $d2,$h4
|
---|
648 |
|
---|
649 | srdi $mask,$d2,2 # did it carry/borrow?
|
---|
650 | neg $mask,$mask
|
---|
651 | sradi $mask,$mask,63 # mask
|
---|
652 | ld $d2,0($nonce) # load nonce
|
---|
653 | ld $d3,8($nonce)
|
---|
654 |
|
---|
655 | andc $h0,$h0,$mask
|
---|
656 | and $d0,$d0,$mask
|
---|
657 | andc $h2,$h2,$mask
|
---|
658 | and $d1,$d1,$mask
|
---|
659 | or $h0,$h0,$d0
|
---|
660 | or $h2,$h2,$d1
|
---|
661 | ___
|
---|
662 | $code.=<<___ if (!$LITTLE_ENDIAN);
|
---|
663 | rotldi $d2,$d2,32 # flip nonce words
|
---|
664 | rotldi $d3,$d3,32
|
---|
665 | ___
|
---|
666 | $code.=<<___;
|
---|
667 | addc $h0,$h0,$d2 # accumulate nonce
|
---|
668 | adde $h2,$h2,$d3
|
---|
669 |
|
---|
670 | srdi $h1,$h0,32
|
---|
671 | srdi $h3,$h2,32
|
---|
672 | ___
|
---|
673 | }
|
---|
674 | $code.=<<___ if ($LITTLE_ENDIAN);
|
---|
675 | stw $h0,0($mac) # write result
|
---|
676 | stw $h1,4($mac)
|
---|
677 | stw $h2,8($mac)
|
---|
678 | stw $h3,12($mac)
|
---|
679 | ___
|
---|
680 | $code.=<<___ if (!$LITTLE_ENDIAN);
|
---|
681 | li $d1,4
|
---|
682 | stwbrx $h0,0,$mac # write result
|
---|
683 | li $d2,8
|
---|
684 | stwbrx $h1,$d1,$mac
|
---|
685 | li $d3,12
|
---|
686 | stwbrx $h2,$d2,$mac
|
---|
687 | stwbrx $h3,$d3,$mac
|
---|
688 | ___
|
---|
689 | $code.=<<___;
|
---|
690 | $POP r28,`$FRAME-$SIZE_T*4`($sp)
|
---|
691 | $POP r29,`$FRAME-$SIZE_T*3`($sp)
|
---|
692 | $POP r30,`$FRAME-$SIZE_T*2`($sp)
|
---|
693 | $POP r31,`$FRAME-$SIZE_T*1`($sp)
|
---|
694 | addi $sp,$sp,$FRAME
|
---|
695 | blr
|
---|
696 | .long 0
|
---|
697 | .byte 0,12,4,1,0x80,4,3,0
|
---|
698 | .size .poly1305_emit_fpu,.-.poly1305_emit_fpu
|
---|
699 | ___
|
---|
700 | }
|
---|
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
|
---|
703 | $code.=<<___;
|
---|
704 | .align 6
|
---|
705 | LPICmeup:
|
---|
706 | mflr r0
|
---|
707 | bcl 20,31,\$+4
|
---|
708 | mflr $len # vvvvvv "distance" between . and 1st data entry
|
---|
709 | addi $len,$len,`64-8` # borrow $len
|
---|
710 | mtlr r0
|
---|
711 | blr
|
---|
712 | .long 0
|
---|
713 | .byte 0,12,0x14,0,0,0,0,0
|
---|
714 | .space `64-9*4`
|
---|
715 |
|
---|
716 | .quad 0x4330000000000000 # 2^(52+0)
|
---|
717 | .quad 0x4530000000000000 # 2^(52+32)
|
---|
718 | .quad 0x4730000000000000 # 2^(52+64)
|
---|
719 | .quad 0x4930000000000000 # 2^(52+96)
|
---|
720 | .quad 0x4b50000000000000 # 2^(52+130)
|
---|
721 |
|
---|
722 | .quad 0x37f4000000000000 # 5/2^130
|
---|
723 |
|
---|
724 | .quad 0x4430000000000000 # 2^(52+16+0)
|
---|
725 | .quad 0x4630000000000000 # 2^(52+16+32)
|
---|
726 | .quad 0x4830000000000000 # 2^(52+16+64)
|
---|
727 | .quad 0x4a30000000000000 # 2^(52+16+96)
|
---|
728 | .quad 0x3e30000000000000 # 2^(52+16+0-96)
|
---|
729 | .quad 0x4030000000000000 # 2^(52+16+32-96)
|
---|
730 | .quad 0x4230000000000000 # 2^(52+16+64-96)
|
---|
731 |
|
---|
732 | .quad 0x0000000000000001 # fpscr: truncate, no exceptions
|
---|
733 | .asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
734 | .align 4
|
---|
735 | ___
|
---|
736 |
|
---|
# Replace every `expression` placeholder in the accumulated assembly with
# the value Perl computes for it, then send the result to STDOUT.
$code =~ s{\`([^\`]*)\`}{eval($1)}gem;
print STDOUT $code;

# Closing the handle is checked so that a failure is reported instead of
# being silently ignored.
close(STDOUT) or die "error closing STDOUT: $!";
|
---|