1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
|
---|
12 | # project. The module is dual licensed under OpenSSL and CRYPTOGAMS
|
---|
13 | # licenses depending on where you obtain it. For further details see
|
---|
14 | # https://github.com/dot-asm/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # This module implements Poly1305 hash for PowerPC.
|
---|
18 | #
|
---|
19 | # June 2015
|
---|
20 | #
|
---|
21 | # Numbers are cycles per processed byte with poly1305_blocks alone,
|
---|
22 | # and improvement coefficients relative to gcc-generated code.
|
---|
23 | #
|
---|
24 | # -m32 -m64
|
---|
25 | #
|
---|
26 | # Freescale e300 14.8/+80% -
|
---|
27 | # PPC74x0 7.60/+60% -
|
---|
28 | # PPC970 7.00/+114% 3.51/+205%
|
---|
29 | # POWER7 3.75/+260% 1.93/+100%
|
---|
30 | # POWER8 - 2.03/+200%
|
---|
31 | # POWER9 - 2.00/+150%
|
---|
32 | #
|
---|
33 | # Do we need floating-point implementation for PPC? Results presented
|
---|
34 | # in poly1305_ieee754.c are tricky to compare to, because they are for
|
---|
35 | # compiler-generated code. On the other hand it's known that floating-
|
---|
36 | # point performance can be dominated by FPU latency, which means that
|
---|
37 | # there is limit even for ideally optimized (and even vectorized) code.
|
---|
38 | # And this limit is estimated to be higher than above -m64 results. Or
|
---|
39 | # in other words floating-point implementation can be meaningful to
|
---|
40 | # consider only in 32-bit application context. We probably have to
|
---|
41 | # recognize that 32-bit builds are getting less popular on high-end
|
---|
42 | # systems and therefore tend to target embedded ones, which might not
|
---|
43 | # even have FPU...
|
---|
44 | #
|
---|
45 | # On side note, Power ISA 2.07 enables vector base 2^26 implementation,
|
---|
46 | # and POWER8 might have capacity to break 1.0 cycle per byte barrier...
|
---|
47 | #
|
---|
48 | # January 2019
|
---|
49 | #
|
---|
50 | # ... Unfortunately not:-( Estimate was a projection of ARM result,
|
---|
51 | # but ARM has vector multiply-n-add instruction, while PowerISA does
|
---|
52 | # not, not one usable in the context. Improvement is ~40% over -m64
|
---|
53 | # result above and is ~1.43 on little-endian systems.
|
---|
54 |
|
---|
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Select ABI-dependent word size and mnemonics: 64-bit flavours operate on
# 8-byte words (std/ld), 32-bit flavours on 4-byte words (stw/lwz).
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;	# link-register save slot offset in caller frame
	$UCMP	="cmpld";	# unsigned compare
	$STU	="stdu";	# store-with-update, used to push the stack frame
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Define endianness based on flavour
# i.e.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Locate the ppc-xlate.pl translator either next to this script or in the
# shared perlasm directory; all emitted code is piped through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$FRAME=24*$SIZE_T;		# stack frame size used by poly1305_blocks

$sp="r1";			# dedicated stack pointer
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));	# ABI argument registers
my ($mac,$nonce)=($inp,$len);	# poly1305_emit reuses arg slots 2 and 3
my $mask = "r0";

$code=<<___;
.machine	"any"
.text
___
|
---|
99 | if ($flavour =~ /64/) {
|
---|
###############################################################################
# base 2^64 implementation

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));

# poly1305_init_int(ctx, key): zero the 130-bit accumulator and store the
# clamped 128-bit multiplier key r at ctx+32.  Returns 0 in r3.
$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	std	r0,0($ctx)		# zero hash value
	std	r0,8($ctx)
	std	r0,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	$UCMP	$inp,r0
	beq-	Lno_key
___
# Key is loaded as two little-endian 64-bit words; on big-endian targets
# this is synthesized from byte-reversed 32-bit loads (lwbrx).
$code.=<<___ if ($LITTLE_ENDIAN);
	ld	$d0,0($inp)		# load key material
	ld	$d1,8($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$h0,4
	lwbrx	$d0,0,$inp		# load key material
	li	$d1,8
	lwbrx	$h0,$h0,$inp
	li	$h1,12
	lwbrx	$d1,$d1,$inp
	lwbrx	$h1,$h1,$inp
	insrdi	$d0,$h0,32,0
	insrdi	$d1,$h1,32,0
___
# Clamp r per RFC 8439: clear top 4 bits of each 32-bit limb and the low
# 2 bits of the upper three limbs.
$code.=<<___;
	lis	$h1,0xfff		# 0x0fff0000
	ori	$h1,$h1,0xfffc		# 0x0ffffffc
	insrdi	$h1,$h1,32,0		# 0x0ffffffc0ffffffc
	ori	$h0,$h1,3		# 0x0ffffffc0fffffff

	and	$d0,$d0,$h0
	and	$d1,$d1,$h1

	std	$d0,32($ctx)		# store key
	std	$d1,40($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
Lpoly1305_blocks:
	srdi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	ld	$r0,32($ctx)		# load key
	ld	$r1,40($ctx)

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	srdi	$s1,$r1,2
	mtctr	$len
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___ if ($LITTLE_ENDIAN);
	ld	$t0,0($inp)		# load input
	ld	$t1,8($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$d0,4
	lwbrx	$t0,0,$inp		# load input
	li	$t1,8
	lwbrx	$d0,$d0,$inp
	li	$d1,12
	lwbrx	$t1,$t1,$inp
	lwbrx	$d1,$d1,$inp
	insrdi	$t0,$d0,32,0
	insrdi	$t1,$d1,32,0
___
# Main loop: h = (h + m + padbit<<128) * r mod 2^130-5, one 16-byte block
# per iteration, h kept as 2x64-bit limbs plus a small h2.
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$t0		# accumulate input
	adde	$h1,$h1,$t1

	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0
	adde	$h2,$h2,$padbit

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	bdnz	Loop

	std	$h0,0($ctx)		# store hash value
	std	$h1,8($ctx)
	std	$h2,16($ctx)

	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,5,4,0
.size	.poly1305_blocks,.-.poly1305_blocks
___
|
---|
255 | {
|
---|
my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12));

# poly1305_emit(ctx, mac, nonce): finalize.  Converts a base 2^26 hash
# (left behind by the VSX path) to base 2^64 if is_base2_26 is set,
# performs the final modular reduction, adds the nonce and writes the
# 16-byte tag byte-by-byte in little-endian order.
$code.=<<___;
.globl	.poly1305_emit
.align	5
.poly1305_emit:
	lwz	$h0,0($ctx)	# load hash value base 2^26
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)
	lwz	r0,24($ctx)	# is_base2_26

	sldi	$h1,$h1,26	# base 2^26 -> base 2^64
	sldi	$t0,$h2,52
	srdi	$h2,$h2,12
	sldi	$h3,$h3,14
	add	$h0,$h0,$h1
	addc	$h0,$h0,$t0
	sldi	$t0,$h4,40
	srdi	$h4,$h4,24
	adde	$h1,$h2,$h3
	addc	$h1,$h1,$t0
	addze	$h2,$h4

	ld	$h3,0($ctx)	# load hash value base 2^64
	ld	$h4,8($ctx)
	ld	$t0,16($ctx)

	neg	r0,r0
	xor	$h0,$h0,$h3	# choose between radixes
	xor	$h1,$h1,$h4
	xor	$h2,$h2,$t0
	and	$h0,$h0,r0
	and	$h1,$h1,r0
	and	$h2,$h2,r0
	xor	$h0,$h0,$h3
	xor	$h1,$h1,$h4
	xor	$h2,$h2,$t0

	addic	$h3,$h0,5	# compare to modulus
	addze	$h4,$h1
	addze	$t0,$h2

	srdi	$t0,$t0,2	# see if it carried/borrowed
	neg	$t0,$t0

	andc	$h0,$h0,$t0
	and	$h3,$h3,$t0
	andc	$h1,$h1,$t0
	and	$h4,$h4,$t0
	or	$h0,$h0,$h3
	or	$h1,$h1,$h4

	lwz	$t0,4($nonce)
	lwz	$h2,12($nonce)
	lwz	$h3,0($nonce)
	lwz	$h4,8($nonce)

	insrdi	$h3,$t0,32,0
	insrdi	$h4,$h2,32,0

	addc	$h0,$h0,$h3	# accumulate nonce
	adde	$h1,$h1,$h4

	addi	$ctx,$mac,-1
	addi	$mac,$mac,7

	stbu	$h0,1($ctx)	# write [little-endian] result
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	stbu	$h1,1($mac)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
|
---|
367 | } } else {
|
---|
368 | ###############################################################################
|
---|
369 | # base 2^32 implementation
|
---|
370 |
|
---|
# Base 2^32 register map: 5 hash limbs, 4 key limbs r0-r3, 3 precomputed
# 5*r multiples s1-s3, temporaries and 4 double-width accumulators.
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
    $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..12,14..31));

# poly1305_init_int(ctx, key): zero the accumulator (5x32-bit limbs) and
# store the clamped key r at ctx+32.  Returns 0 in r3.
$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	stw	r0,0($ctx)		# zero hash value
	stw	r0,4($ctx)
	stw	r0,8($ctx)
	stw	r0,12($ctx)
	stw	r0,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	$UCMP	$inp,r0
	beq-	Lno_key
___
|
---|
# Little-endian key load: four plain 32-bit word loads.  The correct
# PowerPC mnemonic is "lwz" (load word and zero); "lw" is not a PowerPC
# instruction and would fail to assemble (compare the LE input loads in
# the Loop body below, which already use lwz).
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
|
---|
# Big-endian key load: byte-reversed 32-bit loads yield the same
# little-endian limb values as above.
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$h1,4
	lwbrx	$h0,0,$inp		# load key material
	li	$h2,8
	lwbrx	$h1,$h1,$inp
	li	$h3,12
	lwbrx	$h2,$h2,$inp
	lwbrx	$h3,$h3,$inp
___
# Clamp r per RFC 8439 and store; then poly1305_blocks processes the
# input in 16-byte blocks with a 4x4 32-bit schoolbook multiply.
$code.=<<___;
	lis	$mask,0xf000		# 0xf0000000
	li	$r0,-4
	andc	$r0,$r0,$mask		# 0x0ffffffc

	andc	$h0,$h0,$mask
	and	$h1,$h1,$r0
	and	$h2,$h2,$r0
	and	$h3,$h3,$r0

	stw	$h0,32($ctx)		# store key
	stw	$h1,36($ctx)
	stw	$h2,40($ctx)
	stw	$h3,44($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
Lpoly1305_blocks:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$r0,32($ctx)		# load key
	lwz	$r1,36($ctx)
	lwz	$r2,40($ctx)
	lwz	$r3,44($ctx)

	lwz	$h0,0($ctx)		# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	srwi	$s1,$r1,2
	srwi	$s2,$r2,2
	srwi	$s3,$r3,2
	add	$s1,$s1,$r1		# si = ri + ri>>2
	add	$s2,$s2,$r2
	add	$s3,$s3,$r3
	mtctr	$len
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$d0,0($inp)		# load input
	lwz	$d1,4($inp)
	lwz	$d2,8($inp)
	lwz	$d3,12($inp)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
	li	$d1,4
	lwbrx	$d0,0,$inp		# load input
	li	$d2,8
	lwbrx	$d1,$d1,$inp
	li	$d3,12
	lwbrx	$d2,$d2,$inp
	lwbrx	$d3,$d3,$inp
___
# h = (h + m + padbit<<128) * r mod 2^130-5, multiplies interleaved with
# the additions to hide latency; d0-d3/D0-D3 hold the 64-bit partial sums.
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$d0		# accumulate input
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2

	mullw	$d0,$h0,$r0		# h0*r0
	mulhwu	$D0,$h0,$r0

	mullw	$d1,$h0,$r1		# h0*r1
	mulhwu	$D1,$h0,$r1

	mullw	$d2,$h0,$r2		# h0*r2
	mulhwu	$D2,$h0,$r2

	adde	$h3,$h3,$d3
	adde	$h4,$h4,$padbit

	mullw	$d3,$h0,$r3		# h0*r3
	mulhwu	$D3,$h0,$r3

	mullw	$t0,$h1,$s3		# h1*s3
	mulhwu	$t1,$h1,$s3

	mullw	$t2,$h1,$r0		# h1*r0
	mulhwu	$t3,$h1,$r0
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h1,$r1		# h1*r1
	mulhwu	$t1,$h1,$r1
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h1,$r2		# h1*r2
	mulhwu	$t3,$h1,$r2
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h2,$s2		# h2*s2
	mulhwu	$t1,$h2,$s2
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h2,$s3		# h2*s3
	mulhwu	$t3,$h2,$s3
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h2,$r0		# h2*r0
	mulhwu	$t1,$h2,$r0
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h2,$r1		# h2*r1
	mulhwu	$t3,$h2,$r1
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h3,$s1		# h3*s1
	mulhwu	$t1,$h3,$s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3

	mullw	$t2,$h3,$s2		# h3*s2
	mulhwu	$t3,$h3,$s2
	addc	$d0,$d0,$t0
	adde	$D0,$D0,$t1

	mullw	$t0,$h3,$s3		# h3*s3
	mulhwu	$t1,$h3,$s3
	addc	$d1,$d1,$t2
	adde	$D1,$D1,$t3

	mullw	$t2,$h3,$r0		# h3*r0
	mulhwu	$t3,$h3,$r0
	addc	$d2,$d2,$t0
	adde	$D2,$D2,$t1

	mullw	$t0,$h4,$s1		# h4*s1
	addc	$d3,$d3,$t2
	adde	$D3,$D3,$t3
	addc	$d1,$d1,$t0

	mullw	$t1,$h4,$s2		# h4*s2
	addze	$D1,$D1
	addc	$d2,$d2,$t1
	addze	$D2,$D2

	mullw	$t2,$h4,$s3		# h4*s3
	addc	$d3,$d3,$t2
	addze	$D3,$D3

	mullw	$h4,$h4,$r0		# h4*r0

	addc	$h1,$d1,$D0
	adde	$h2,$d2,$D1
	adde	$h3,$d3,$D2
	adde	$h4,$h4,$D3

	andc	$D0,$h4,$mask		# final reduction step
	and	$h4,$h4,$mask
	srwi	$D1,$D0,2
	add	$D0,$D0,$D1
	addc	$h0,$d0,$D0
	addze	$h1,$h1
	addze	$h2,$h2
	addze	$h3,$h3
	addze	$h4,$h4

	bdnz	Loop

	stw	$h0,0($ctx)		# store hash value
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)

	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
.size	.poly1305_blocks,.-.poly1305_blocks
___
|
---|
639 | {
|
---|
my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12));

# poly1305_emit(ctx, mac, nonce), 32-bit variant: converts a base 2^26
# hash (from the VSX path) to base 2^32 if is_base2_26 is set, reduces
# mod 2^130-5, adds the nonce and stores the 16-byte tag little-endian,
# h0/h2 via one pointer and h1/h3 via the other.
$code.=<<___;
.globl	.poly1305_emit
.align	5
.poly1305_emit:
	lwz	r0,24($ctx)	# is_base2_26
	lwz	$h0,0($ctx)	# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)
	cmplwi	r0,0
	beq	Lemit_base2_32

	slwi	$t0,$h1,26	# base 2^26 -> base 2^32
	srwi	$h1,$h1,6
	slwi	$t1,$h2,20
	srwi	$h2,$h2,12
	addc	$h0,$h0,$t0
	slwi	$t0,$h3,14
	srwi	$h3,$h3,18
	adde	$h1,$h1,$t1
	slwi	$t1,$h4,8
	srwi	$h4,$h4,24
	adde	$h2,$h2,$t0
	adde	$h3,$h3,$t1
	addze	$h4,$h4

Lemit_base2_32:
	addic	r0,$h0,5	# compare to modulus
	addze	r0,$h1
	addze	r0,$h2
	addze	r0,$h3
	addze	r0,$h4

	srwi	r0,r0,2		# see if it carried/borrowed
	neg	r0,r0
	andi.	r0,r0,5

	addc	$h0,$h0,r0
	lwz	r0,0($nonce)
	addze	$h1,$h1
	lwz	$t0,4($nonce)
	addze	$h2,$h2
	lwz	$t1,8($nonce)
	addze	$h3,$h3
	lwz	$h4,12($nonce)

	addc	$h0,$h0,r0	# accumulate nonce
	adde	$h1,$h1,$t0
	adde	$h2,$h2,$t1
	adde	$h3,$h3,$h4

	addi	$ctx,$mac,-1
	addi	$mac,$mac,7

	stbu	$h0,1($ctx)	# write [little-endian] result
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	stbu	$h2,1($mac)

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	stbu	$h3,1($mac)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
|
---|
738 | } }
|
---|
739 | {{{
|
---|
740 | ########################################################################
|
---|
741 | # PowerISA 2.07/VSX section #
|
---|
742 | ########################################################################
|
---|
743 |
|
---|
744 | my $LOCALS= 6*$SIZE_T;
|
---|
745 | my $VSXFRAME = $LOCALS + 6*$SIZE_T;
|
---|
746 | $VSXFRAME += 128; # local variables
|
---|
747 | $VSXFRAME += 13*16; # v20-v31 offload
|
---|
748 |
|
---|
749 | my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0;
|
---|
750 |
|
---|
751 | ########################################################################
|
---|
752 | # Layout of opaque area is following:
|
---|
753 | #
|
---|
754 | # unsigned __int32 h[5]; # current hash value base 2^26
|
---|
755 | # unsigned __int32 pad;
|
---|
756 | # unsigned __int32 is_base2_26, pad;
|
---|
757 | # unsigned __int64 r[2]; # key value base 2^64
|
---|
758 | # struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9];
|
---|
759 | #
|
---|
760 | # where r^n are base 2^26 digits of powers of multiplier key. There are
|
---|
761 | # 5 digits, but last four are interleaved with multiples of 5, totalling
|
---|
762 | # in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. Order of
|
---|
763 | # powers is as they appear in register, not memory.
|
---|
764 |
|
---|
765 | my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4));
|
---|
766 | my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9));
|
---|
767 | my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14));
|
---|
768 | my ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2);
|
---|
769 | my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19));
|
---|
770 | my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24));
|
---|
771 | my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31));
|
---|
772 | my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31)));
|
---|
773 | my ($ctx_,$_ctx,$const) = map("r$_",(10..12));
|
---|
774 |
|
---|
775 | if ($flavour =~ /64/) {
|
---|
776 | ###############################################################################
|
---|
777 | # setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms,
|
---|
778 | # but the base 2^26 computational part is same...
|
---|
779 |
|
---|
780 | my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31));
|
---|
781 | my $mask = "r0";
|
---|
782 |
|
---|
783 | $code.=<<___;
|
---|
784 | .globl .poly1305_blocks_vsx
|
---|
785 | .align 5
|
---|
786 | .poly1305_blocks_vsx:
|
---|
787 | lwz r7,24($ctx) # is_base2_26
|
---|
788 | cmpldi $len,128
|
---|
789 | bge __poly1305_blocks_vsx
|
---|
790 |
|
---|
791 | neg r0,r7 # is_base2_26 as mask
|
---|
792 | lwz r7,0($ctx) # load hash base 2^26
|
---|
793 | lwz r8,4($ctx)
|
---|
794 | lwz r9,8($ctx)
|
---|
795 | lwz r10,12($ctx)
|
---|
796 | lwz r11,16($ctx)
|
---|
797 |
|
---|
798 | sldi r8,r8,26 # base 2^26 -> base 2^64
|
---|
799 | sldi r12,r9,52
|
---|
800 | add r7,r7,r8
|
---|
801 | srdi r9,r9,12
|
---|
802 | sldi r10,r10,14
|
---|
803 | addc r7,r7,r12
|
---|
804 | sldi r8,r11,40
|
---|
805 | adde r9,r9,r10
|
---|
806 | srdi r11,r11,24
|
---|
807 | addc r9,r9,r8
|
---|
808 | addze r11,r11
|
---|
809 |
|
---|
810 | ld r8,0($ctx) # load hash base 2^64
|
---|
811 | ld r10,8($ctx)
|
---|
812 | ld r12,16($ctx)
|
---|
813 |
|
---|
814 | xor r7,r7,r8 # select between radixes
|
---|
815 | xor r9,r9,r10
|
---|
816 | xor r11,r11,r12
|
---|
817 | and r7,r7,r0
|
---|
818 | and r9,r9,r0
|
---|
819 | and r11,r11,r0
|
---|
820 | xor r7,r7,r8
|
---|
821 | xor r9,r9,r10
|
---|
822 | xor r11,r11,r12
|
---|
823 |
|
---|
824 | li r0,0
|
---|
825 | std r7,0($ctx) # store hash base 2^64
|
---|
826 | std r9,8($ctx)
|
---|
827 | std r11,16($ctx)
|
---|
828 | stw r0,24($ctx) # clear is_base2_26
|
---|
829 |
|
---|
830 | b Lpoly1305_blocks
|
---|
831 | .long 0
|
---|
832 | .byte 0,12,0x14,0,0,0,4,0
|
---|
833 | .size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx
|
---|
834 |
|
---|
835 | .align 5
|
---|
836 | __poly1305_mul:
|
---|
837 | mulld $d0,$h0,$r0 # h0*r0
|
---|
838 | mulhdu $d1,$h0,$r0
|
---|
839 |
|
---|
840 | mulld $t0,$h1,$s1 # h1*5*r1
|
---|
841 | mulhdu $t1,$h1,$s1
|
---|
842 | addc $d0,$d0,$t0
|
---|
843 | adde $d1,$d1,$t1
|
---|
844 |
|
---|
845 | mulld $t0,$h0,$r1 # h0*r1
|
---|
846 | mulhdu $d2,$h0,$r1
|
---|
847 | addc $d1,$d1,$t0
|
---|
848 | addze $d2,$d2
|
---|
849 |
|
---|
850 | mulld $t0,$h1,$r0 # h1*r0
|
---|
851 | mulhdu $t1,$h1,$r0
|
---|
852 | addc $d1,$d1,$t0
|
---|
853 | adde $d2,$d2,$t1
|
---|
854 |
|
---|
855 | mulld $t0,$h2,$s1 # h2*5*r1
|
---|
856 | mulld $t1,$h2,$r0 # h2*r0
|
---|
857 | addc $d1,$d1,$t0
|
---|
858 | adde $d2,$d2,$t1
|
---|
859 |
|
---|
860 | andc $t0,$d2,$mask # final reduction step
|
---|
861 | and $h2,$d2,$mask
|
---|
862 | srdi $t1,$t0,2
|
---|
863 | add $t0,$t0,$t1
|
---|
864 | addc $h0,$d0,$t0
|
---|
865 | addze $h1,$d1
|
---|
866 | addze $h2,$h2
|
---|
867 |
|
---|
868 | blr
|
---|
869 | .long 0
|
---|
870 | .byte 0,12,0x14,0,0,0,0,0
|
---|
871 | .size __poly1305_mul,.-__poly1305_mul
|
---|
872 |
|
---|
873 | .align 5
|
---|
874 | __poly1305_splat:
|
---|
875 | extrdi $d0,$h0,26,38
|
---|
876 | extrdi $d1,$h0,26,12
|
---|
877 | stw $d0,0x00($t1)
|
---|
878 |
|
---|
879 | extrdi $d2,$h0,12,0
|
---|
880 | slwi $d0,$d1,2
|
---|
881 | stw $d1,0x10($t1)
|
---|
882 | add $d0,$d0,$d1 # * 5
|
---|
883 | stw $d0,0x20($t1)
|
---|
884 |
|
---|
885 | insrdi $d2,$h1,14,38
|
---|
886 | slwi $d0,$d2,2
|
---|
887 | stw $d2,0x30($t1)
|
---|
888 | add $d0,$d0,$d2 # * 5
|
---|
889 | stw $d0,0x40($t1)
|
---|
890 |
|
---|
891 | extrdi $d1,$h1,26,24
|
---|
892 | extrdi $d2,$h1,24,0
|
---|
893 | slwi $d0,$d1,2
|
---|
894 | stw $d1,0x50($t1)
|
---|
895 | add $d0,$d0,$d1 # * 5
|
---|
896 | stw $d0,0x60($t1)
|
---|
897 |
|
---|
898 | insrdi $d2,$h2,3,37
|
---|
899 | slwi $d0,$d2,2
|
---|
900 | stw $d2,0x70($t1)
|
---|
901 | add $d0,$d0,$d2 # * 5
|
---|
902 | stw $d0,0x80($t1)
|
---|
903 |
|
---|
904 | blr
|
---|
905 | .long 0
|
---|
906 | .byte 0,12,0x14,0,0,0,0,0
|
---|
907 | .size __poly1305_splat,.-__poly1305_splat
|
---|
908 |
|
---|
909 | .align 5
|
---|
910 | __poly1305_blocks_vsx:
|
---|
911 | $STU $sp,-$VSXFRAME($sp)
|
---|
912 | mflr r0
|
---|
913 | li r10,`15+$LOCALS+128`
|
---|
914 | li r11,`31+$LOCALS+128`
|
---|
915 | mfspr r12,256
|
---|
916 | stvx v20,r10,$sp
|
---|
917 | addi r10,r10,32
|
---|
918 | stvx v21,r11,$sp
|
---|
919 | addi r11,r11,32
|
---|
920 | stvx v22,r10,$sp
|
---|
921 | addi r10,r10,32
|
---|
922 | stvx v23,r10,$sp
|
---|
923 | addi r10,r10,32
|
---|
924 | stvx v24,r11,$sp
|
---|
925 | addi r11,r11,32
|
---|
926 | stvx v25,r10,$sp
|
---|
927 | addi r10,r10,32
|
---|
928 | stvx v26,r10,$sp
|
---|
929 | addi r10,r10,32
|
---|
930 | stvx v27,r11,$sp
|
---|
931 | addi r11,r11,32
|
---|
932 | stvx v28,r10,$sp
|
---|
933 | addi r10,r10,32
|
---|
934 | stvx v29,r11,$sp
|
---|
935 | addi r11,r11,32
|
---|
936 | stvx v30,r10,$sp
|
---|
937 | stvx v31,r11,$sp
|
---|
938 | stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
|
---|
939 | li r12,-1
|
---|
940 | mtspr 256,r12 # preserve all AltiVec registers
|
---|
941 | $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp)
|
---|
942 | $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp)
|
---|
943 | $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp)
|
---|
944 | $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp)
|
---|
945 | $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp)
|
---|
946 | $PUSH r0,`$VSXFRAME+$LRSAVE`($sp)
|
---|
947 |
|
---|
948 | bl LPICmeup
|
---|
949 |
|
---|
950 | li $x10,0x10
|
---|
951 | li $x20,0x20
|
---|
952 | li $x30,0x30
|
---|
953 | li $x40,0x40
|
---|
954 | li $x50,0x50
|
---|
955 | lvx_u $mask26,$x00,$const
|
---|
956 | lvx_u $_26,$x10,$const
|
---|
957 | lvx_u $_40,$x20,$const
|
---|
958 | lvx_u $I2perm,$x30,$const
|
---|
959 | lvx_u $padbits,$x40,$const
|
---|
960 |
|
---|
961 | cmplwi r7,0 # is_base2_26?
|
---|
962 | bne Lskip_init_vsx
|
---|
963 |
|
---|
964 | ld $r0,32($ctx) # load key base 2^64
|
---|
965 | ld $r1,40($ctx)
|
---|
966 | srdi $s1,$r1,2
|
---|
967 | li $mask,3
|
---|
968 | add $s1,$s1,$r1 # s1 = r1 + r1>>2
|
---|
969 |
|
---|
970 | mr $h0,$r0 # "calculate" r^1
|
---|
971 | mr $h1,$r1
|
---|
972 | li $h2,0
|
---|
973 | addi $t1,$ctx,`48+(12^$BIG_ENDIAN)`
|
---|
974 | bl __poly1305_splat
|
---|
975 |
|
---|
976 | bl __poly1305_mul # calculate r^2
|
---|
977 | addi $t1,$ctx,`48+(4^$BIG_ENDIAN)`
|
---|
978 | bl __poly1305_splat
|
---|
979 |
|
---|
980 | bl __poly1305_mul # calculate r^3
|
---|
981 | addi $t1,$ctx,`48+(8^$BIG_ENDIAN)`
|
---|
982 | bl __poly1305_splat
|
---|
983 |
|
---|
984 | bl __poly1305_mul # calculate r^4
|
---|
985 | addi $t1,$ctx,`48+(0^$BIG_ENDIAN)`
|
---|
986 | bl __poly1305_splat
|
---|
987 |
|
---|
988 | ld $h0,0($ctx) # load hash
|
---|
989 | ld $h1,8($ctx)
|
---|
990 | ld $h2,16($ctx)
|
---|
991 |
|
---|
992 | extrdi $d0,$h0,26,38 # base 2^64 -> base 2^26
|
---|
993 | extrdi $d1,$h0,26,12
|
---|
994 | extrdi $d2,$h0,12,0
|
---|
995 | mtvrwz $H0,$d0
|
---|
996 | insrdi $d2,$h1,14,38
|
---|
997 | mtvrwz $H1,$d1
|
---|
998 | extrdi $d1,$h1,26,24
|
---|
999 | mtvrwz $H2,$d2
|
---|
1000 | extrdi $d2,$h1,24,0
|
---|
1001 | mtvrwz $H3,$d1
|
---|
1002 | insrdi $d2,$h2,3,37
|
---|
1003 | mtvrwz $H4,$d2
|
---|
1004 | ___
|
---|
1005 | } else {
|
---|
1006 | ###############################################################################
|
---|
1007 | # 32-bit initialization
|
---|
1008 |
|
---|
1009 | my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12));
|
---|
1010 | my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4);
|
---|
1011 |
|
---|
1012 | $code.=<<___;
|
---|
1013 | .globl .poly1305_blocks_vsx
|
---|
1014 | .align 5
|
---|
1015 | .poly1305_blocks_vsx:
|
---|
1016 | lwz r7,24($ctx) # is_base2_26
|
---|
1017 | cmplwi $len,128
|
---|
1018 | bge __poly1305_blocks_vsx
|
---|
1019 | cmplwi r7,0
|
---|
1020 | beq Lpoly1305_blocks
|
---|
1021 |
|
---|
1022 | lwz $h0,0($ctx) # load hash
|
---|
1023 | lwz $h1,4($ctx)
|
---|
1024 | lwz $h2,8($ctx)
|
---|
1025 | lwz $h3,12($ctx)
|
---|
1026 | lwz $h4,16($ctx)
|
---|
1027 |
|
---|
1028 | slwi $t0,$h1,26 # base 2^26 -> base 2^32
|
---|
1029 | srwi $h1,$h1,6
|
---|
1030 | slwi $t1,$h2,20
|
---|
1031 | srwi $h2,$h2,12
|
---|
1032 | addc $h0,$h0,$t0
|
---|
1033 | slwi $t0,$h3,14
|
---|
1034 | srwi $h3,$h3,18
|
---|
1035 | adde $h1,$h1,$t1
|
---|
1036 | slwi $t1,$h4,8
|
---|
1037 | srwi $h4,$h4,24
|
---|
1038 | adde $h2,$h2,$t0
|
---|
1039 | li $t0,0
|
---|
1040 | adde $h3,$h3,$t1
|
---|
1041 | addze $h4,$h4
|
---|
1042 |
|
---|
1043 | stw $h0,0($ctx) # store hash base 2^32
|
---|
1044 | stw $h1,4($ctx)
|
---|
1045 | stw $h2,8($ctx)
|
---|
1046 | stw $h3,12($ctx)
|
---|
1047 | stw $h4,16($ctx)
|
---|
1048 | stw $t0,24($ctx) # clear is_base2_26
|
---|
1049 |
|
---|
1050 | b Lpoly1305_blocks
|
---|
1051 | .long 0
|
---|
1052 | .byte 0,12,0x14,0,0,0,4,0
|
---|
1053 | .size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx
|
---|
1054 |
|
---|
1055 | .align 5
|
---|
1056 | __poly1305_mul:
|
---|
1057 | vmulouw $ACC0,$H0,$R0
|
---|
1058 | vmulouw $ACC1,$H1,$R0
|
---|
1059 | vmulouw $ACC2,$H2,$R0
|
---|
1060 | vmulouw $ACC3,$H3,$R0
|
---|
1061 | vmulouw $ACC4,$H4,$R0
|
---|
1062 |
|
---|
1063 | vmulouw $T0,$H4,$S1
|
---|
1064 | vaddudm $ACC0,$ACC0,$T0
|
---|
1065 | vmulouw $T0,$H0,$R1
|
---|
1066 | vaddudm $ACC1,$ACC1,$T0
|
---|
1067 | vmulouw $T0,$H1,$R1
|
---|
1068 | vaddudm $ACC2,$ACC2,$T0
|
---|
1069 | vmulouw $T0,$H2,$R1
|
---|
1070 | vaddudm $ACC3,$ACC3,$T0
|
---|
1071 | vmulouw $T0,$H3,$R1
|
---|
1072 | vaddudm $ACC4,$ACC4,$T0
|
---|
1073 |
|
---|
1074 | vmulouw $T0,$H3,$S2
|
---|
1075 | vaddudm $ACC0,$ACC0,$T0
|
---|
1076 | vmulouw $T0,$H4,$S2
|
---|
1077 | vaddudm $ACC1,$ACC1,$T0
|
---|
1078 | vmulouw $T0,$H0,$R2
|
---|
1079 | vaddudm $ACC2,$ACC2,$T0
|
---|
1080 | vmulouw $T0,$H1,$R2
|
---|
1081 | vaddudm $ACC3,$ACC3,$T0
|
---|
1082 | vmulouw $T0,$H2,$R2
|
---|
1083 | vaddudm $ACC4,$ACC4,$T0
|
---|
1084 |
|
---|
1085 | vmulouw $T0,$H2,$S3
|
---|
1086 | vaddudm $ACC0,$ACC0,$T0
|
---|
1087 | vmulouw $T0,$H3,$S3
|
---|
1088 | vaddudm $ACC1,$ACC1,$T0
|
---|
1089 | vmulouw $T0,$H4,$S3
|
---|
1090 | vaddudm $ACC2,$ACC2,$T0
|
---|
1091 | vmulouw $T0,$H0,$R3
|
---|
1092 | vaddudm $ACC3,$ACC3,$T0
|
---|
1093 | vmulouw $T0,$H1,$R3
|
---|
1094 | vaddudm $ACC4,$ACC4,$T0
|
---|
1095 |
|
---|
1096 | vmulouw $T0,$H1,$S4
|
---|
1097 | vaddudm $ACC0,$ACC0,$T0
|
---|
1098 | vmulouw $T0,$H2,$S4
|
---|
1099 | vaddudm $ACC1,$ACC1,$T0
|
---|
1100 | vmulouw $T0,$H3,$S4
|
---|
1101 | vaddudm $ACC2,$ACC2,$T0
|
---|
1102 | vmulouw $T0,$H4,$S4
|
---|
1103 | vaddudm $ACC3,$ACC3,$T0
|
---|
1104 | vmulouw $T0,$H0,$R4
|
---|
1105 | vaddudm $ACC4,$ACC4,$T0
|
---|
1106 |
|
---|
1107 | ################################################################
|
---|
1108 | # lazy reduction
|
---|
1109 |
|
---|
1110 | vspltisb $T0,2
|
---|
1111 | vsrd $H4,$ACC3,$_26
|
---|
1112 | vsrd $H1,$ACC0,$_26
|
---|
1113 | vand $H3,$ACC3,$mask26
|
---|
1114 | vand $H0,$ACC0,$mask26
|
---|
1115 | vaddudm $H4,$H4,$ACC4 # h3 -> h4
|
---|
1116 | vaddudm $H1,$H1,$ACC1 # h0 -> h1
|
---|
1117 |
|
---|
1118 | vsrd $ACC4,$H4,$_26
|
---|
1119 | vsrd $ACC1,$H1,$_26
|
---|
1120 | vand $H4,$H4,$mask26
|
---|
1121 | vand $H1,$H1,$mask26
|
---|
1122 | vaddudm $H0,$H0,$ACC4
|
---|
1123 | vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
|
---|
1124 |
|
---|
1125 | vsld $ACC4,$ACC4,$T0 # <<2
|
---|
1126 | vsrd $ACC2,$H2,$_26
|
---|
1127 | vand $H2,$H2,$mask26
|
---|
1128 | vaddudm $H0,$H0,$ACC4 # h4 -> h0
|
---|
1129 | vaddudm $H3,$H3,$ACC2 # h2 -> h3
|
---|
1130 |
|
---|
1131 | vsrd $ACC0,$H0,$_26
|
---|
1132 | vsrd $ACC3,$H3,$_26
|
---|
1133 | vand $H0,$H0,$mask26
|
---|
1134 | vand $H3,$H3,$mask26
|
---|
1135 | vaddudm $H1,$H1,$ACC0 # h0 -> h1
|
---|
1136 | vaddudm $H4,$H4,$ACC3 # h3 -> h4
|
---|
1137 |
|
---|
1138 | blr
|
---|
1139 | .long 0
|
---|
1140 | .byte 0,12,0x14,0,0,0,0,0
|
---|
1141 | .size __poly1305_mul,.-__poly1305_mul
|
---|
1142 |
|
---|
1143 | .align 5
|
---|
1144 | __poly1305_blocks_vsx:
|
---|
1145 | $STU $sp,-$VSXFRAME($sp)
|
---|
1146 | mflr r0
|
---|
1147 | li r10,`15+$LOCALS+128`
|
---|
1148 | li r11,`31+$LOCALS+128`
|
---|
1149 | mfspr r12,256
|
---|
1150 | stvx v20,r10,$sp
|
---|
1151 | addi r10,r10,32
|
---|
1152 | stvx v21,r11,$sp
|
---|
1153 | addi r11,r11,32
|
---|
1154 | stvx v22,r10,$sp
|
---|
1155 | addi r10,r10,32
|
---|
1156 | stvx v23,r10,$sp
|
---|
1157 | addi r10,r10,32
|
---|
1158 | stvx v24,r11,$sp
|
---|
1159 | addi r11,r11,32
|
---|
1160 | stvx v25,r10,$sp
|
---|
1161 | addi r10,r10,32
|
---|
1162 | stvx v26,r10,$sp
|
---|
1163 | addi r10,r10,32
|
---|
1164 | stvx v27,r11,$sp
|
---|
1165 | addi r11,r11,32
|
---|
1166 | stvx v28,r10,$sp
|
---|
1167 | addi r10,r10,32
|
---|
1168 | stvx v29,r11,$sp
|
---|
1169 | addi r11,r11,32
|
---|
1170 | stvx v30,r10,$sp
|
---|
1171 | stvx v31,r11,$sp
|
---|
1172 | stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
|
---|
1173 | li r12,-1
|
---|
1174 | mtspr 256,r12 # preserve all AltiVec registers
|
---|
1175 | $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp)
|
---|
1176 | $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp)
|
---|
1177 | $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp)
|
---|
1178 | $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp)
|
---|
1179 | $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp)
|
---|
1180 | $PUSH r0,`$VSXFRAME+$LRSAVE`($sp)
|
---|
1181 |
|
---|
1182 | bl LPICmeup
|
---|
1183 |
|
---|
1184 | li $x10,0x10
|
---|
1185 | li $x20,0x20
|
---|
1186 | li $x30,0x30
|
---|
1187 | li $x40,0x40
|
---|
1188 | li $x50,0x50
|
---|
1189 | lvx_u $mask26,$x00,$const
|
---|
1190 | lvx_u $_26,$x10,$const
|
---|
1191 | lvx_u $_40,$x20,$const
|
---|
1192 | lvx_u $I2perm,$x30,$const
|
---|
1193 | lvx_u $padbits,$x40,$const
|
---|
1194 |
|
---|
1195 | cmplwi r7,0 # is_base2_26?
|
---|
1196 | bne Lskip_init_vsx
|
---|
1197 |
|
---|
1198 | lwz $h1,32($ctx) # load key base 2^32
|
---|
1199 | lwz $h2,36($ctx)
|
---|
1200 | lwz $h3,40($ctx)
|
---|
1201 | lwz $h4,44($ctx)
|
---|
1202 |
|
---|
1203 | extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26
|
---|
1204 | extrwi $h1,$h1,6,0
|
---|
1205 | insrwi $h1,$h2,20,6
|
---|
1206 | extrwi $h2,$h2,12,0
|
---|
1207 | insrwi $h2,$h3,14,6
|
---|
1208 | extrwi $h3,$h3,18,0
|
---|
1209 | insrwi $h3,$h4,8,6
|
---|
1210 | extrwi $h4,$h4,24,0
|
---|
1211 |
|
---|
1212 | mtvrwz $R0,$h0
|
---|
1213 | slwi $h0,$h1,2
|
---|
1214 | mtvrwz $R1,$h1
|
---|
1215 | add $h1,$h1,$h0
|
---|
1216 | mtvrwz $S1,$h1
|
---|
1217 | slwi $h1,$h2,2
|
---|
1218 | mtvrwz $R2,$h2
|
---|
1219 | add $h2,$h2,$h1
|
---|
1220 | mtvrwz $S2,$h2
|
---|
1221 | slwi $h2,$h3,2
|
---|
1222 | mtvrwz $R3,$h3
|
---|
1223 | add $h3,$h3,$h2
|
---|
1224 | mtvrwz $S3,$h3
|
---|
1225 | slwi $h3,$h4,2
|
---|
1226 | mtvrwz $R4,$h4
|
---|
1227 | add $h4,$h4,$h3
|
---|
1228 | mtvrwz $S4,$h4
|
---|
1229 |
|
---|
1230 | vmr $H0,$R0
|
---|
1231 | vmr $H1,$R1
|
---|
1232 | vmr $H2,$R2
|
---|
1233 | vmr $H3,$R3
|
---|
1234 | vmr $H4,$R4
|
---|
1235 |
|
---|
1236 | bl __poly1305_mul # r^1:- * r^1:-
|
---|
1237 |
|
---|
1238 | vpermdi $R0,$H0,$R0,0b00
|
---|
1239 | vpermdi $R1,$H1,$R1,0b00
|
---|
1240 | vpermdi $R2,$H2,$R2,0b00
|
---|
1241 | vpermdi $R3,$H3,$R3,0b00
|
---|
1242 | vpermdi $R4,$H4,$R4,0b00
|
---|
1243 | vpermdi $H0,$H0,$H0,0b00
|
---|
1244 | vpermdi $H1,$H1,$H1,0b00
|
---|
1245 | vpermdi $H2,$H2,$H2,0b00
|
---|
1246 | vpermdi $H3,$H3,$H3,0b00
|
---|
1247 | vpermdi $H4,$H4,$H4,0b00
|
---|
1248 | vsld $S1,$R1,$T0 # <<2
|
---|
1249 | vsld $S2,$R2,$T0
|
---|
1250 | vsld $S3,$R3,$T0
|
---|
1251 | vsld $S4,$R4,$T0
|
---|
1252 | vaddudm $S1,$S1,$R1
|
---|
1253 | vaddudm $S2,$S2,$R2
|
---|
1254 | vaddudm $S3,$S3,$R3
|
---|
1255 | vaddudm $S4,$S4,$R4
|
---|
1256 |
|
---|
1257 | bl __poly1305_mul # r^2:r^2 * r^2:r^1
|
---|
1258 |
|
---|
1259 | addi $h0,$ctx,0x60
|
---|
1260 | lwz $h1,0($ctx) # load hash
|
---|
1261 | lwz $h2,4($ctx)
|
---|
1262 | lwz $h3,8($ctx)
|
---|
1263 | lwz $h4,12($ctx)
|
---|
1264 | lwz $t0,16($ctx)
|
---|
1265 |
|
---|
1266 | vmrgow $R0,$R0,$H0 # r^2:r^4:r^1:r^3
|
---|
1267 | vmrgow $R1,$R1,$H1
|
---|
1268 | vmrgow $R2,$R2,$H2
|
---|
1269 | vmrgow $R3,$R3,$H3
|
---|
1270 | vmrgow $R4,$R4,$H4
|
---|
1271 | vslw $S1,$R1,$T0 # <<2
|
---|
1272 | vslw $S2,$R2,$T0
|
---|
1273 | vslw $S3,$R3,$T0
|
---|
1274 | vslw $S4,$R4,$T0
|
---|
1275 | vadduwm $S1,$S1,$R1
|
---|
1276 | vadduwm $S2,$S2,$R2
|
---|
1277 | vadduwm $S3,$S3,$R3
|
---|
1278 | vadduwm $S4,$S4,$R4
|
---|
1279 |
|
---|
1280 | stvx_u $R0,$x30,$ctx
|
---|
1281 | stvx_u $R1,$x40,$ctx
|
---|
1282 | stvx_u $S1,$x50,$ctx
|
---|
1283 | stvx_u $R2,$x00,$h0
|
---|
1284 | stvx_u $S2,$x10,$h0
|
---|
1285 | stvx_u $R3,$x20,$h0
|
---|
1286 | stvx_u $S3,$x30,$h0
|
---|
1287 | stvx_u $R4,$x40,$h0
|
---|
1288 | stvx_u $S4,$x50,$h0
|
---|
1289 |
|
---|
1290 | extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26
|
---|
1291 | extrwi $h1,$h1,6,0
|
---|
1292 | mtvrwz $H0,$h0
|
---|
1293 | insrwi $h1,$h2,20,6
|
---|
1294 | extrwi $h2,$h2,12,0
|
---|
1295 | mtvrwz $H1,$h1
|
---|
1296 | insrwi $h2,$h3,14,6
|
---|
1297 | extrwi $h3,$h3,18,0
|
---|
1298 | mtvrwz $H2,$h2
|
---|
1299 | insrwi $h3,$h4,8,6
|
---|
1300 | extrwi $h4,$h4,24,0
|
---|
1301 | mtvrwz $H3,$h3
|
---|
1302 | insrwi $h4,$t0,3,5
|
---|
1303 | mtvrwz $H4,$h4
|
---|
1304 | ___
|
---|
1305 | }
|
---|
1306 | $code.=<<___;
|
---|
1307 | li r0,1
|
---|
1308 | stw r0,24($ctx) # set is_base2_26
|
---|
1309 | b Loaded_vsx
|
---|
1310 |
|
---|
1311 | .align 4
|
---|
1312 | Lskip_init_vsx:
|
---|
1313 | li $x10,4
|
---|
1314 | li $x20,8
|
---|
1315 | li $x30,12
|
---|
1316 | li $x40,16
|
---|
1317 | lvwzx_u $H0,$x00,$ctx
|
---|
1318 | lvwzx_u $H1,$x10,$ctx
|
---|
1319 | lvwzx_u $H2,$x20,$ctx
|
---|
1320 | lvwzx_u $H3,$x30,$ctx
|
---|
1321 | lvwzx_u $H4,$x40,$ctx
|
---|
1322 |
|
---|
1323 | Loaded_vsx:
|
---|
1324 | li $x10,0x10
|
---|
1325 | li $x20,0x20
|
---|
1326 | li $x30,0x30
|
---|
1327 | li $x40,0x40
|
---|
1328 | li $x50,0x50
|
---|
1329 | li $x60,0x60
|
---|
1330 | li $x70,0x70
|
---|
1331 | addi $ctx_,$ctx,64 # &ctx->r[1]
|
---|
1332 | addi $_ctx,$sp,`$LOCALS+15` # &ctx->r[1], r^2:r^4 shadow
|
---|
1333 |
|
---|
1334 | vxor $T0,$T0,$T0 # ensure second half is zero
|
---|
1335 | vpermdi $H0,$H0,$T0,0b00
|
---|
1336 | vpermdi $H1,$H1,$T0,0b00
|
---|
1337 | vpermdi $H2,$H2,$T0,0b00
|
---|
1338 | vpermdi $H3,$H3,$T0,0b00
|
---|
1339 | vpermdi $H4,$H4,$T0,0b00
|
---|
1340 |
|
---|
1341 | be?lvx_u $_4,$x50,$const # byte swap mask
|
---|
1342 | lvx_u $T1,$x00,$inp # load first input block
|
---|
1343 | lvx_u $T2,$x10,$inp
|
---|
1344 | lvx_u $T3,$x20,$inp
|
---|
1345 | lvx_u $T4,$x30,$inp
|
---|
1346 | be?vperm $T1,$T1,$T1,$_4
|
---|
1347 | be?vperm $T2,$T2,$T2,$_4
|
---|
1348 | be?vperm $T3,$T3,$T3,$_4
|
---|
1349 | be?vperm $T4,$T4,$T4,$_4
|
---|
1350 |
|
---|
1351 | vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
|
---|
1352 | vspltisb $_4,4
|
---|
1353 | vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
|
---|
1354 | vspltisb $_14,14
|
---|
1355 | vpermdi $I3,$T1,$T2,0b11
|
---|
1356 |
|
---|
1357 | vsrd $I1,$I0,$_26
|
---|
1358 | vsrd $I2,$I2,$_4
|
---|
1359 | vsrd $I4,$I3,$_40
|
---|
1360 | vsrd $I3,$I3,$_14
|
---|
1361 | vand $I0,$I0,$mask26
|
---|
1362 | vand $I1,$I1,$mask26
|
---|
1363 | vand $I2,$I2,$mask26
|
---|
1364 | vand $I3,$I3,$mask26
|
---|
1365 |
|
---|
1366 | vpermdi $T1,$T3,$T4,0b00
|
---|
1367 | vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
|
---|
1368 | vpermdi $T3,$T3,$T4,0b11
|
---|
1369 |
|
---|
1370 | vsrd $T0,$T1,$_26
|
---|
1371 | vsrd $T2,$T2,$_4
|
---|
1372 | vsrd $T4,$T3,$_40
|
---|
1373 | vsrd $T3,$T3,$_14
|
---|
1374 | vand $T1,$T1,$mask26
|
---|
1375 | vand $T0,$T0,$mask26
|
---|
1376 | vand $T2,$T2,$mask26
|
---|
1377 | vand $T3,$T3,$mask26
|
---|
1378 |
|
---|
1379 | # inp[2]:inp[0]:inp[3]:inp[1]
|
---|
1380 | vmrgow $I4,$T4,$I4
|
---|
1381 | vmrgow $I0,$T1,$I0
|
---|
1382 | vmrgow $I1,$T0,$I1
|
---|
1383 | vmrgow $I2,$T2,$I2
|
---|
1384 | vmrgow $I3,$T3,$I3
|
---|
1385 | vor $I4,$I4,$padbits
|
---|
1386 |
|
---|
1387 | lvx_splt $R0,$x30,$ctx # taking lvx_vsplt out of loop
|
---|
1388 | lvx_splt $R1,$x00,$ctx_ # gives ~8% improvement
|
---|
1389 | lvx_splt $S1,$x10,$ctx_
|
---|
1390 | lvx_splt $R2,$x20,$ctx_
|
---|
1391 | lvx_splt $S2,$x30,$ctx_
|
---|
1392 | lvx_splt $T1,$x40,$ctx_
|
---|
1393 | lvx_splt $T2,$x50,$ctx_
|
---|
1394 | lvx_splt $T3,$x60,$ctx_
|
---|
1395 | lvx_splt $T4,$x70,$ctx_
|
---|
1396 | stvx $R1,$x00,$_ctx
|
---|
1397 | stvx $S1,$x10,$_ctx
|
---|
1398 | stvx $R2,$x20,$_ctx
|
---|
1399 | stvx $S2,$x30,$_ctx
|
---|
1400 | stvx $T1,$x40,$_ctx
|
---|
1401 | stvx $T2,$x50,$_ctx
|
---|
1402 | stvx $T3,$x60,$_ctx
|
---|
1403 | stvx $T4,$x70,$_ctx
|
---|
1404 |
|
---|
1405 | addi $inp,$inp,0x40
|
---|
1406 | addi $const,$const,0x50
|
---|
1407 | addi r0,$len,-64
|
---|
1408 | srdi r0,r0,6
|
---|
1409 | mtctr r0
|
---|
1410 | b Loop_vsx
|
---|
1411 |
|
---|
1412 | .align 4
|
---|
1413 | Loop_vsx:
|
---|
1414 | ################################################################
|
---|
1415 | ## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
---|
1416 | ## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
---|
1417 | ## \___________________/
|
---|
1418 | ##
|
---|
1419 | ## Note that we start with inp[2:3]*r^2. This is because it
|
---|
1420 | ## doesn't depend on reduction in previous iteration.
|
---|
1421 | ################################################################
|
---|
1422 | ## d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
|
---|
1423 | ## d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
|
---|
1424 | ## d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
|
---|
1425 | ## d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
|
---|
1426 | ## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
|
---|
1427 |
|
---|
1428 | vmuleuw $ACC0,$I0,$R0
|
---|
1429 | vmuleuw $ACC1,$I0,$R1
|
---|
1430 | vmuleuw $ACC2,$I0,$R2
|
---|
1431 | vmuleuw $ACC3,$I1,$R2
|
---|
1432 |
|
---|
1433 | vmuleuw $T0,$I1,$R0
|
---|
1434 | vaddudm $ACC1,$ACC1,$T0
|
---|
1435 | vmuleuw $T0,$I1,$R1
|
---|
1436 | vaddudm $ACC2,$ACC2,$T0
|
---|
1437 | vmuleuw $ACC4,$I2,$R2
|
---|
1438 | vmuleuw $T0,$I4,$S1
|
---|
1439 | vaddudm $ACC0,$ACC0,$T0
|
---|
1440 | vmuleuw $T0,$I2,$R1
|
---|
1441 | vaddudm $ACC3,$ACC3,$T0
|
---|
1442 | lvx $S3,$x50,$_ctx
|
---|
1443 | vmuleuw $T0,$I3,$R1
|
---|
1444 | vaddudm $ACC4,$ACC4,$T0
|
---|
1445 | lvx $R3,$x40,$_ctx
|
---|
1446 |
|
---|
1447 | vaddudm $H2,$H2,$I2
|
---|
1448 | vaddudm $H0,$H0,$I0
|
---|
1449 | vaddudm $H3,$H3,$I3
|
---|
1450 | vaddudm $H1,$H1,$I1
|
---|
1451 | vaddudm $H4,$H4,$I4
|
---|
1452 |
|
---|
1453 | vmuleuw $T0,$I3,$S2
|
---|
1454 | vaddudm $ACC0,$ACC0,$T0
|
---|
1455 | vmuleuw $T0,$I4,$S2
|
---|
1456 | vaddudm $ACC1,$ACC1,$T0
|
---|
1457 | vmuleuw $T0,$I2,$R0
|
---|
1458 | vaddudm $ACC2,$ACC2,$T0
|
---|
1459 | vmuleuw $T0,$I3,$R0
|
---|
1460 | vaddudm $ACC3,$ACC3,$T0
|
---|
1461 | lvx $S4,$x70,$_ctx
|
---|
1462 | vmuleuw $T0,$I4,$R0
|
---|
1463 | vaddudm $ACC4,$ACC4,$T0
|
---|
1464 | lvx $R4,$x60,$_ctx
|
---|
1465 |
|
---|
1466 | vmuleuw $T0,$I2,$S3
|
---|
1467 | vaddudm $ACC0,$ACC0,$T0
|
---|
1468 | vmuleuw $T0,$I3,$S3
|
---|
1469 | vaddudm $ACC1,$ACC1,$T0
|
---|
1470 | vmuleuw $T0,$I4,$S3
|
---|
1471 | vaddudm $ACC2,$ACC2,$T0
|
---|
1472 | vmuleuw $T0,$I0,$R3
|
---|
1473 | vaddudm $ACC3,$ACC3,$T0
|
---|
1474 | vmuleuw $T0,$I1,$R3
|
---|
1475 | vaddudm $ACC4,$ACC4,$T0
|
---|
1476 |
|
---|
1477 | be?lvx_u $_4,$x00,$const # byte swap mask
|
---|
1478 | lvx_u $T1,$x00,$inp # load next input block
|
---|
1479 | lvx_u $T2,$x10,$inp
|
---|
1480 | lvx_u $T3,$x20,$inp
|
---|
1481 | lvx_u $T4,$x30,$inp
|
---|
1482 | be?vperm $T1,$T1,$T1,$_4
|
---|
1483 | be?vperm $T2,$T2,$T2,$_4
|
---|
1484 | be?vperm $T3,$T3,$T3,$_4
|
---|
1485 | be?vperm $T4,$T4,$T4,$_4
|
---|
1486 |
|
---|
1487 | vmuleuw $T0,$I1,$S4
|
---|
1488 | vaddudm $ACC0,$ACC0,$T0
|
---|
1489 | vmuleuw $T0,$I2,$S4
|
---|
1490 | vaddudm $ACC1,$ACC1,$T0
|
---|
1491 | vmuleuw $T0,$I3,$S4
|
---|
1492 | vaddudm $ACC2,$ACC2,$T0
|
---|
1493 | vmuleuw $T0,$I4,$S4
|
---|
1494 | vaddudm $ACC3,$ACC3,$T0
|
---|
1495 | vmuleuw $T0,$I0,$R4
|
---|
1496 | vaddudm $ACC4,$ACC4,$T0
|
---|
1497 |
|
---|
1498 | vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
|
---|
1499 | vspltisb $_4,4
|
---|
1500 | vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
|
---|
1501 | vpermdi $I3,$T1,$T2,0b11
|
---|
1502 |
|
---|
1503 | # (hash + inp[0:1]) * r^4
|
---|
1504 | vmulouw $T0,$H0,$R0
|
---|
1505 | vaddudm $ACC0,$ACC0,$T0
|
---|
1506 | vmulouw $T0,$H1,$R0
|
---|
1507 | vaddudm $ACC1,$ACC1,$T0
|
---|
1508 | vmulouw $T0,$H2,$R0
|
---|
1509 | vaddudm $ACC2,$ACC2,$T0
|
---|
1510 | vmulouw $T0,$H3,$R0
|
---|
1511 | vaddudm $ACC3,$ACC3,$T0
|
---|
1512 | vmulouw $T0,$H4,$R0
|
---|
1513 | vaddudm $ACC4,$ACC4,$T0
|
---|
1514 |
|
---|
1515 | vpermdi $T1,$T3,$T4,0b00
|
---|
1516 | vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
|
---|
1517 | vpermdi $T3,$T3,$T4,0b11
|
---|
1518 |
|
---|
1519 | vmulouw $T0,$H2,$S3
|
---|
1520 | vaddudm $ACC0,$ACC0,$T0
|
---|
1521 | vmulouw $T0,$H3,$S3
|
---|
1522 | vaddudm $ACC1,$ACC1,$T0
|
---|
1523 | vmulouw $T0,$H4,$S3
|
---|
1524 | vaddudm $ACC2,$ACC2,$T0
|
---|
1525 | vmulouw $T0,$H0,$R3
|
---|
1526 | vaddudm $ACC3,$ACC3,$T0
|
---|
1527 | lvx $S1,$x10,$_ctx
|
---|
1528 | vmulouw $T0,$H1,$R3
|
---|
1529 | vaddudm $ACC4,$ACC4,$T0
|
---|
1530 | lvx $R1,$x00,$_ctx
|
---|
1531 |
|
---|
1532 | vsrd $I1,$I0,$_26
|
---|
1533 | vsrd $I2,$I2,$_4
|
---|
1534 | vsrd $I4,$I3,$_40
|
---|
1535 | vsrd $I3,$I3,$_14
|
---|
1536 |
|
---|
1537 | vmulouw $T0,$H1,$S4
|
---|
1538 | vaddudm $ACC0,$ACC0,$T0
|
---|
1539 | vmulouw $T0,$H2,$S4
|
---|
1540 | vaddudm $ACC1,$ACC1,$T0
|
---|
1541 | vmulouw $T0,$H3,$S4
|
---|
1542 | vaddudm $ACC2,$ACC2,$T0
|
---|
1543 | vmulouw $T0,$H4,$S4
|
---|
1544 | vaddudm $ACC3,$ACC3,$T0
|
---|
1545 | lvx $S2,$x30,$_ctx
|
---|
1546 | vmulouw $T0,$H0,$R4
|
---|
1547 | vaddudm $ACC4,$ACC4,$T0
|
---|
1548 | lvx $R2,$x20,$_ctx
|
---|
1549 |
|
---|
1550 | vand $I0,$I0,$mask26
|
---|
1551 | vand $I1,$I1,$mask26
|
---|
1552 | vand $I2,$I2,$mask26
|
---|
1553 | vand $I3,$I3,$mask26
|
---|
1554 |
|
---|
1555 | vmulouw $T0,$H4,$S1
|
---|
1556 | vaddudm $ACC0,$ACC0,$T0
|
---|
1557 | vmulouw $T0,$H0,$R1
|
---|
1558 | vaddudm $ACC1,$ACC1,$T0
|
---|
1559 | vmulouw $T0,$H1,$R1
|
---|
1560 | vaddudm $ACC2,$ACC2,$T0
|
---|
1561 | vmulouw $T0,$H2,$R1
|
---|
1562 | vaddudm $ACC3,$ACC3,$T0
|
---|
1563 | vmulouw $T0,$H3,$R1
|
---|
1564 | vaddudm $ACC4,$ACC4,$T0
|
---|
1565 |
|
---|
1566 | vsrd $T2,$T2,$_4
|
---|
1567 | vsrd $_4,$T1,$_26
|
---|
1568 | vsrd $T4,$T3,$_40
|
---|
1569 | vsrd $T3,$T3,$_14
|
---|
1570 |
|
---|
1571 | vmulouw $T0,$H3,$S2
|
---|
1572 | vaddudm $ACC0,$ACC0,$T0
|
---|
1573 | vmulouw $T0,$H4,$S2
|
---|
1574 | vaddudm $ACC1,$ACC1,$T0
|
---|
1575 | vmulouw $T0,$H0,$R2
|
---|
1576 | vaddudm $ACC2,$ACC2,$T0
|
---|
1577 | vmulouw $T0,$H1,$R2
|
---|
1578 | vaddudm $ACC3,$ACC3,$T0
|
---|
1579 | vmulouw $T0,$H2,$R2
|
---|
1580 | vaddudm $ACC4,$ACC4,$T0
|
---|
1581 |
|
---|
1582 | vand $T1,$T1,$mask26
|
---|
1583 | vand $_4,$_4,$mask26
|
---|
1584 | vand $T2,$T2,$mask26
|
---|
1585 | vand $T3,$T3,$mask26
|
---|
1586 |
|
---|
1587 | ################################################################
|
---|
1588 | # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
---|
1589 | # and P. Schwabe
|
---|
1590 |
|
---|
1591 | vspltisb $T0,2
|
---|
1592 | vsrd $H4,$ACC3,$_26
|
---|
1593 | vsrd $H1,$ACC0,$_26
|
---|
1594 | vand $H3,$ACC3,$mask26
|
---|
1595 | vand $H0,$ACC0,$mask26
|
---|
1596 | vaddudm $H4,$H4,$ACC4 # h3 -> h4
|
---|
1597 | vaddudm $H1,$H1,$ACC1 # h0 -> h1
|
---|
1598 |
|
---|
1599 | vmrgow $I4,$T4,$I4
|
---|
1600 | vmrgow $I0,$T1,$I0
|
---|
1601 | vmrgow $I1,$_4,$I1
|
---|
1602 | vmrgow $I2,$T2,$I2
|
---|
1603 | vmrgow $I3,$T3,$I3
|
---|
1604 | vor $I4,$I4,$padbits
|
---|
1605 |
|
---|
1606 | vsrd $ACC4,$H4,$_26
|
---|
1607 | vsrd $ACC1,$H1,$_26
|
---|
1608 | vand $H4,$H4,$mask26
|
---|
1609 | vand $H1,$H1,$mask26
|
---|
1610 | vaddudm $H0,$H0,$ACC4
|
---|
1611 | vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
|
---|
1612 |
|
---|
1613 | vsld $ACC4,$ACC4,$T0 # <<2
|
---|
1614 | vsrd $ACC2,$H2,$_26
|
---|
1615 | vand $H2,$H2,$mask26
|
---|
1616 | vaddudm $H0,$H0,$ACC4 # h4 -> h0
|
---|
1617 | vaddudm $H3,$H3,$ACC2 # h2 -> h3
|
---|
1618 |
|
---|
1619 | vsrd $ACC0,$H0,$_26
|
---|
1620 | vsrd $ACC3,$H3,$_26
|
---|
1621 | vand $H0,$H0,$mask26
|
---|
1622 | vand $H3,$H3,$mask26
|
---|
1623 | vaddudm $H1,$H1,$ACC0 # h0 -> h1
|
---|
1624 | vaddudm $H4,$H4,$ACC3 # h3 -> h4
|
---|
1625 |
|
---|
1626 | addi $inp,$inp,0x40
|
---|
1627 | bdnz Loop_vsx
|
---|
1628 |
|
---|
1629 | neg $len,$len
|
---|
1630 | andi. $len,$len,0x30
|
---|
1631 | sub $inp,$inp,$len
|
---|
1632 |
|
---|
1633 | lvx_u $R0,$x30,$ctx # load all powers
|
---|
1634 | lvx_u $R1,$x00,$ctx_
|
---|
1635 | lvx_u $S1,$x10,$ctx_
|
---|
1636 | lvx_u $R2,$x20,$ctx_
|
---|
1637 | lvx_u $S2,$x30,$ctx_
|
---|
1638 |
|
---|
1639 | Last_vsx:
|
---|
1640 | vmuleuw $ACC0,$I0,$R0
|
---|
1641 | vmuleuw $ACC1,$I1,$R0
|
---|
1642 | vmuleuw $ACC2,$I2,$R0
|
---|
1643 | vmuleuw $ACC3,$I3,$R0
|
---|
1644 | vmuleuw $ACC4,$I4,$R0
|
---|
1645 |
|
---|
1646 | vmuleuw $T0,$I4,$S1
|
---|
1647 | vaddudm $ACC0,$ACC0,$T0
|
---|
1648 | vmuleuw $T0,$I0,$R1
|
---|
1649 | vaddudm $ACC1,$ACC1,$T0
|
---|
1650 | vmuleuw $T0,$I1,$R1
|
---|
1651 | vaddudm $ACC2,$ACC2,$T0
|
---|
1652 | vmuleuw $T0,$I2,$R1
|
---|
1653 | vaddudm $ACC3,$ACC3,$T0
|
---|
1654 | lvx_u $S3,$x50,$ctx_
|
---|
1655 | vmuleuw $T0,$I3,$R1
|
---|
1656 | vaddudm $ACC4,$ACC4,$T0
|
---|
1657 | lvx_u $R3,$x40,$ctx_
|
---|
1658 |
|
---|
1659 | vaddudm $H2,$H2,$I2
|
---|
1660 | vaddudm $H0,$H0,$I0
|
---|
1661 | vaddudm $H3,$H3,$I3
|
---|
1662 | vaddudm $H1,$H1,$I1
|
---|
1663 | vaddudm $H4,$H4,$I4
|
---|
1664 |
|
---|
1665 | vmuleuw $T0,$I3,$S2
|
---|
1666 | vaddudm $ACC0,$ACC0,$T0
|
---|
1667 | vmuleuw $T0,$I4,$S2
|
---|
1668 | vaddudm $ACC1,$ACC1,$T0
|
---|
1669 | vmuleuw $T0,$I0,$R2
|
---|
1670 | vaddudm $ACC2,$ACC2,$T0
|
---|
1671 | vmuleuw $T0,$I1,$R2
|
---|
1672 | vaddudm $ACC3,$ACC3,$T0
|
---|
1673 | lvx_u $S4,$x70,$ctx_
|
---|
1674 | vmuleuw $T0,$I2,$R2
|
---|
1675 | vaddudm $ACC4,$ACC4,$T0
|
---|
1676 | lvx_u $R4,$x60,$ctx_
|
---|
1677 |
|
---|
1678 | vmuleuw $T0,$I2,$S3
|
---|
1679 | vaddudm $ACC0,$ACC0,$T0
|
---|
1680 | vmuleuw $T0,$I3,$S3
|
---|
1681 | vaddudm $ACC1,$ACC1,$T0
|
---|
1682 | vmuleuw $T0,$I4,$S3
|
---|
1683 | vaddudm $ACC2,$ACC2,$T0
|
---|
1684 | vmuleuw $T0,$I0,$R3
|
---|
1685 | vaddudm $ACC3,$ACC3,$T0
|
---|
1686 | vmuleuw $T0,$I1,$R3
|
---|
1687 | vaddudm $ACC4,$ACC4,$T0
|
---|
1688 |
|
---|
1689 | vmuleuw $T0,$I1,$S4
|
---|
1690 | vaddudm $ACC0,$ACC0,$T0
|
---|
1691 | vmuleuw $T0,$I2,$S4
|
---|
1692 | vaddudm $ACC1,$ACC1,$T0
|
---|
1693 | vmuleuw $T0,$I3,$S4
|
---|
1694 | vaddudm $ACC2,$ACC2,$T0
|
---|
1695 | vmuleuw $T0,$I4,$S4
|
---|
1696 | vaddudm $ACC3,$ACC3,$T0
|
---|
1697 | vmuleuw $T0,$I0,$R4
|
---|
1698 | vaddudm $ACC4,$ACC4,$T0
|
---|
1699 |
|
---|
1700 | # (hash + inp[0:1]) * r^4
|
---|
1701 | vmulouw $T0,$H0,$R0
|
---|
1702 | vaddudm $ACC0,$ACC0,$T0
|
---|
1703 | vmulouw $T0,$H1,$R0
|
---|
1704 | vaddudm $ACC1,$ACC1,$T0
|
---|
1705 | vmulouw $T0,$H2,$R0
|
---|
1706 | vaddudm $ACC2,$ACC2,$T0
|
---|
1707 | vmulouw $T0,$H3,$R0
|
---|
1708 | vaddudm $ACC3,$ACC3,$T0
|
---|
1709 | vmulouw $T0,$H4,$R0
|
---|
1710 | vaddudm $ACC4,$ACC4,$T0
|
---|
1711 |
|
---|
1712 | vmulouw $T0,$H2,$S3
|
---|
1713 | vaddudm $ACC0,$ACC0,$T0
|
---|
1714 | vmulouw $T0,$H3,$S3
|
---|
1715 | vaddudm $ACC1,$ACC1,$T0
|
---|
1716 | vmulouw $T0,$H4,$S3
|
---|
1717 | vaddudm $ACC2,$ACC2,$T0
|
---|
1718 | vmulouw $T0,$H0,$R3
|
---|
1719 | vaddudm $ACC3,$ACC3,$T0
|
---|
1720 | lvx_u $S1,$x10,$ctx_
|
---|
1721 | vmulouw $T0,$H1,$R3
|
---|
1722 | vaddudm $ACC4,$ACC4,$T0
|
---|
1723 | lvx_u $R1,$x00,$ctx_
|
---|
1724 |
|
---|
1725 | vmulouw $T0,$H1,$S4
|
---|
1726 | vaddudm $ACC0,$ACC0,$T0
|
---|
1727 | vmulouw $T0,$H2,$S4
|
---|
1728 | vaddudm $ACC1,$ACC1,$T0
|
---|
1729 | vmulouw $T0,$H3,$S4
|
---|
1730 | vaddudm $ACC2,$ACC2,$T0
|
---|
1731 | vmulouw $T0,$H4,$S4
|
---|
1732 | vaddudm $ACC3,$ACC3,$T0
|
---|
1733 | lvx_u $S2,$x30,$ctx_
|
---|
1734 | vmulouw $T0,$H0,$R4
|
---|
1735 | vaddudm $ACC4,$ACC4,$T0
|
---|
1736 | lvx_u $R2,$x20,$ctx_
|
---|
1737 |
|
---|
1738 | vmulouw $T0,$H4,$S1
|
---|
1739 | vaddudm $ACC0,$ACC0,$T0
|
---|
1740 | vmulouw $T0,$H0,$R1
|
---|
1741 | vaddudm $ACC1,$ACC1,$T0
|
---|
1742 | vmulouw $T0,$H1,$R1
|
---|
1743 | vaddudm $ACC2,$ACC2,$T0
|
---|
1744 | vmulouw $T0,$H2,$R1
|
---|
1745 | vaddudm $ACC3,$ACC3,$T0
|
---|
1746 | vmulouw $T0,$H3,$R1
|
---|
1747 | vaddudm $ACC4,$ACC4,$T0
|
---|
1748 |
|
---|
1749 | vmulouw $T0,$H3,$S2
|
---|
1750 | vaddudm $ACC0,$ACC0,$T0
|
---|
1751 | vmulouw $T0,$H4,$S2
|
---|
1752 | vaddudm $ACC1,$ACC1,$T0
|
---|
1753 | vmulouw $T0,$H0,$R2
|
---|
1754 | vaddudm $ACC2,$ACC2,$T0
|
---|
1755 | vmulouw $T0,$H1,$R2
|
---|
1756 | vaddudm $ACC3,$ACC3,$T0
|
---|
1757 | vmulouw $T0,$H2,$R2
|
---|
1758 | vaddudm $ACC4,$ACC4,$T0
|
---|
1759 |
|
---|
1760 | ################################################################
|
---|
1761 | # horizontal addition
|
---|
1762 |
|
---|
1763 | vpermdi $H0,$ACC0,$ACC0,0b10
|
---|
1764 | vpermdi $H1,$ACC1,$ACC1,0b10
|
---|
1765 | vpermdi $H2,$ACC2,$ACC2,0b10
|
---|
1766 | vpermdi $H3,$ACC3,$ACC3,0b10
|
---|
1767 | vpermdi $H4,$ACC4,$ACC4,0b10
|
---|
1768 | vaddudm $ACC0,$ACC0,$H0
|
---|
1769 | vaddudm $ACC1,$ACC1,$H1
|
---|
1770 | vaddudm $ACC2,$ACC2,$H2
|
---|
1771 | vaddudm $ACC3,$ACC3,$H3
|
---|
1772 | vaddudm $ACC4,$ACC4,$H4
|
---|
1773 |
|
---|
1774 | ################################################################
|
---|
1775 | # lazy reduction
|
---|
1776 |
|
---|
1777 | vspltisb $T0,2
|
---|
1778 | vsrd $H4,$ACC3,$_26
|
---|
1779 | vsrd $H1,$ACC0,$_26
|
---|
1780 | vand $H3,$ACC3,$mask26
|
---|
1781 | vand $H0,$ACC0,$mask26
|
---|
1782 | vaddudm $H4,$H4,$ACC4 # h3 -> h4
|
---|
1783 | vaddudm $H1,$H1,$ACC1 # h0 -> h1
|
---|
1784 |
|
---|
1785 | vsrd $ACC4,$H4,$_26
|
---|
1786 | vsrd $ACC1,$H1,$_26
|
---|
1787 | vand $H4,$H4,$mask26
|
---|
1788 | vand $H1,$H1,$mask26
|
---|
1789 | vaddudm $H0,$H0,$ACC4
|
---|
1790 | vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
|
---|
1791 |
|
---|
1792 | vsld $ACC4,$ACC4,$T0 # <<2
|
---|
1793 | vsrd $ACC2,$H2,$_26
|
---|
1794 | vand $H2,$H2,$mask26
|
---|
1795 | vaddudm $H0,$H0,$ACC4 # h4 -> h0
|
---|
1796 | vaddudm $H3,$H3,$ACC2 # h2 -> h3
|
---|
1797 |
|
---|
1798 | vsrd $ACC0,$H0,$_26
|
---|
1799 | vsrd $ACC3,$H3,$_26
|
---|
1800 | vand $H0,$H0,$mask26
|
---|
1801 | vand $H3,$H3,$mask26
|
---|
1802 | vaddudm $H1,$H1,$ACC0 # h0 -> h1
|
---|
1803 | vaddudm $H4,$H4,$ACC3 # h3 -> h4
|
---|
1804 |
|
---|
1805 | beq Ldone_vsx
|
---|
1806 |
|
---|
1807 | add r6,$const,$len
|
---|
1808 |
|
---|
1809 | be?lvx_u $_4,$x00,$const # byte swap mask
|
---|
1810 | lvx_u $T1,$x00,$inp # load last partial input block
|
---|
1811 | lvx_u $T2,$x10,$inp
|
---|
1812 | lvx_u $T3,$x20,$inp
|
---|
1813 | lvx_u $T4,$x30,$inp
|
---|
1814 | be?vperm $T1,$T1,$T1,$_4
|
---|
1815 | be?vperm $T2,$T2,$T2,$_4
|
---|
1816 | be?vperm $T3,$T3,$T3,$_4
|
---|
1817 | be?vperm $T4,$T4,$T4,$_4
|
---|
1818 |
|
---|
1819 | vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
|
---|
1820 | vspltisb $_4,4
|
---|
1821 | vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
|
---|
1822 | vpermdi $I3,$T1,$T2,0b11
|
---|
1823 |
|
---|
1824 | vsrd $I1,$I0,$_26
|
---|
1825 | vsrd $I2,$I2,$_4
|
---|
1826 | vsrd $I4,$I3,$_40
|
---|
1827 | vsrd $I3,$I3,$_14
|
---|
1828 | vand $I0,$I0,$mask26
|
---|
1829 | vand $I1,$I1,$mask26
|
---|
1830 | vand $I2,$I2,$mask26
|
---|
1831 | vand $I3,$I3,$mask26
|
---|
1832 |
|
---|
1833 | vpermdi $T0,$T3,$T4,0b00
|
---|
1834 | vperm $T1,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
|
---|
1835 | vpermdi $T2,$T3,$T4,0b11
|
---|
1836 |
|
---|
1837 | lvx_u $ACC0,$x00,r6
|
---|
1838 | lvx_u $ACC1,$x30,r6
|
---|
1839 |
|
---|
1840 | vsrd $T3,$T0,$_26
|
---|
1841 | vsrd $T1,$T1,$_4
|
---|
1842 | vsrd $T4,$T2,$_40
|
---|
1843 | vsrd $T2,$T2,$_14
|
---|
1844 | vand $T0,$T0,$mask26
|
---|
1845 | vand $T3,$T3,$mask26
|
---|
1846 | vand $T1,$T1,$mask26
|
---|
1847 | vand $T2,$T2,$mask26
|
---|
1848 |
|
---|
1849 | # inp[2]:inp[0]:inp[3]:inp[1]
|
---|
1850 | vmrgow $I4,$T4,$I4
|
---|
1851 | vmrgow $I0,$T0,$I0
|
---|
1852 | vmrgow $I1,$T3,$I1
|
---|
1853 | vmrgow $I2,$T1,$I2
|
---|
1854 | vmrgow $I3,$T2,$I3
|
---|
1855 | vor $I4,$I4,$padbits
|
---|
1856 |
|
---|
1857 | vperm $H0,$H0,$H0,$ACC0 # move hash to right lane
|
---|
1858 | vand $I0,$I0, $ACC1 # mask redundant input lane[s]
|
---|
1859 | vperm $H1,$H1,$H1,$ACC0
|
---|
1860 | vand $I1,$I1, $ACC1
|
---|
1861 | vperm $H2,$H2,$H2,$ACC0
|
---|
1862 | vand $I2,$I2, $ACC1
|
---|
1863 | vperm $H3,$H3,$H3,$ACC0
|
---|
1864 | vand $I3,$I3, $ACC1
|
---|
1865 | vperm $H4,$H4,$H4,$ACC0
|
---|
1866 | vand $I4,$I4, $ACC1
|
---|
1867 |
|
---|
1868 | vaddudm $I0,$I0,$H0 # accumulate hash
|
---|
1869 | vxor $H0,$H0,$H0 # wipe hash value
|
---|
1870 | vaddudm $I1,$I1,$H1
|
---|
1871 | vxor $H1,$H1,$H1
|
---|
1872 | vaddudm $I2,$I2,$H2
|
---|
1873 | vxor $H2,$H2,$H2
|
---|
1874 | vaddudm $I3,$I3,$H3
|
---|
1875 | vxor $H3,$H3,$H3
|
---|
1876 | vaddudm $I4,$I4,$H4
|
---|
1877 | vxor $H4,$H4,$H4
|
---|
1878 |
|
---|
1879 | xor. $len,$len,$len
|
---|
1880 | b Last_vsx
|
---|
1881 |
|
---|
1882 | .align 4
|
---|
1883 | Ldone_vsx:
|
---|
1884 | $POP r0,`$VSXFRAME+$LRSAVE`($sp)
|
---|
1885 | li $x10,4
|
---|
1886 | li $x20,8
|
---|
1887 | li $x30,12
|
---|
1888 | li $x40,16
|
---|
1889 | stvwx_u $H0,$x00,$ctx # store hash
|
---|
1890 | stvwx_u $H1,$x10,$ctx
|
---|
1891 | stvwx_u $H2,$x20,$ctx
|
---|
1892 | stvwx_u $H3,$x30,$ctx
|
---|
1893 | stvwx_u $H4,$x40,$ctx
|
---|
1894 |
|
---|
1895 | lwz r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# pull vrsave
|
---|
1896 | mtlr r0
|
---|
1897 | li r10,`15+$LOCALS+128`
|
---|
1898 | li r11,`31+$LOCALS+128`
|
---|
1899 | mtspr 256,r12 # restore vrsave
|
---|
1900 | lvx v20,r10,$sp
|
---|
1901 | addi r10,r10,32
|
---|
1902 | lvx v21,r10,$sp
|
---|
1903 | addi r10,r10,32
|
---|
1904 | lvx v22,r11,$sp
|
---|
1905 | addi r11,r11,32
|
---|
1906 | lvx v23,r10,$sp
|
---|
1907 | addi r10,r10,32
|
---|
1908 | lvx v24,r11,$sp
|
---|
1909 | addi r11,r11,32
|
---|
1910 | lvx v25,r10,$sp
|
---|
1911 | addi r10,r10,32
|
---|
1912 | lvx v26,r11,$sp
|
---|
1913 | addi r11,r11,32
|
---|
1914 | lvx v27,r10,$sp
|
---|
1915 | addi r10,r10,32
|
---|
1916 | lvx v28,r11,$sp
|
---|
1917 | addi r11,r11,32
|
---|
1918 | lvx v29,r10,$sp
|
---|
1919 | addi r10,r10,32
|
---|
1920 | lvx v30,r11,$sp
|
---|
1921 | lvx v31,r10,$sp
|
---|
1922 | $POP r27,`$VSXFRAME-$SIZE_T*5`($sp)
|
---|
1923 | $POP r28,`$VSXFRAME-$SIZE_T*4`($sp)
|
---|
1924 | $POP r29,`$VSXFRAME-$SIZE_T*3`($sp)
|
---|
1925 | $POP r30,`$VSXFRAME-$SIZE_T*2`($sp)
|
---|
1926 | $POP r31,`$VSXFRAME-$SIZE_T*1`($sp)
|
---|
1927 | addi $sp,$sp,$VSXFRAME
|
---|
1928 | blr
|
---|
1929 | .long 0
|
---|
1930 | .byte 0,12,0x04,1,0x80,5,4,0
|
---|
1931 | .long 0
|
---|
1932 | .size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx
|
---|
1933 |
|
---|
1934 | .align 6
|
---|
1935 | LPICmeup:
|
---|
1936 | mflr r0
|
---|
1937 | bcl 20,31,\$+4
|
---|
1938 | mflr $const # vvvvvv "distance" between . and 1st data entry
|
---|
1939 | addi $const,$const,`64-8`
|
---|
1940 | mtlr r0
|
---|
1941 | blr
|
---|
1942 | .long 0
|
---|
1943 | .byte 0,12,0x14,0,0,0,0,0
|
---|
1944 | .space `64-9*4`
|
---|
1945 |
|
---|
1946 | .quad 0x0000000003ffffff,0x0000000003ffffff # mask26
|
---|
1947 | .quad 0x000000000000001a,0x000000000000001a # _26
|
---|
1948 | .quad 0x0000000000000028,0x0000000000000028 # _40
|
---|
1949 | .quad 0x000000000e0f0001,0x000000001e1f1011 # I2perm
|
---|
1950 | .quad 0x0100000001000000,0x0100000001000000 # padbits
|
---|
1951 | .quad 0x0706050403020100,0x0f0e0d0c0b0a0908 # byte swap for big-endian
|
---|
1952 |
|
---|
1953 | .quad 0x0000000000000000,0x0000000004050607 # magic tail masks
|
---|
1954 | .quad 0x0405060700000000,0x0000000000000000
|
---|
1955 | .quad 0x0000000000000000,0x0405060700000000
|
---|
1956 |
|
---|
1957 | .quad 0xffffffff00000000,0xffffffffffffffff
|
---|
1958 | .quad 0xffffffff00000000,0xffffffff00000000
|
---|
1959 | .quad 0x0000000000000000,0xffffffff00000000
|
---|
1960 | ___
|
---|
1961 | }}}
|
---|
# Append the self-identification string emitted into every generated
# flavour of the module; '\@' keeps the literal '@' from being treated
# as array interpolation inside the interpolating heredoc.
$code.=<<___;
.asciz "Poly1305 for PPC, CRYPTOGAMS by \@dot-asm"
___
|
---|
1965 |
|
---|
# Post-process the accumulated assembly text line by line and emit it
# on STDOUT: expand backtick-quoted Perl expressions, then resolve the
# endian-specific instruction prefixes for the target flavour.
for my $line (split("\n", $code)) {
	# Evaluate every `...` span as Perl (constant arithmetic such as
	# frame offsets) and splice the result into the line.
	$line =~ s/\`([^\`]*)\`/eval($1)/ge;

	# Instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly: keep the matching flavour's lines
	# (strip the prefix) and comment out the other flavour's.
	if ($flavour !~ /le$/) {	# big-endian
		$line =~ s/be\?//	or
		$line =~ s/le\?/#le#/;
	} else {			# little-endian
		$line =~ s/le\?//	or
		$line =~ s/be\?/#be#/;
	}

	print $line, "\n";
}
close STDOUT or die "error closing STDOUT: $!";
|
---|