1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # This module implements Poly1305 hash for SPARCv9, vanilla, as well
|
---|
18 | # as VIS3 and FMA extensions.
|
---|
19 | #
|
---|
20 | # May, August 2015
|
---|
21 | #
|
---|
22 | # Numbers are cycles per processed byte with poly1305_blocks alone.
|
---|
23 | #
|
---|
#			IALU(*)		FMA
#
# UltraSPARC III	12.3(**)
# SPARC T3		7.92
# SPARC T4		1.70(***)	6.55
# SPARC64 X		5.60		3.64
|
---|
30 | #
|
---|
31 | # (*) Comparison to compiler-generated code is really problematic,
|
---|
32 | # because latter's performance varies too much depending on too
|
---|
33 | # many variables. For example, one can measure from 5x to 15x
|
---|
34 | # improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
|
---|
35 | # unfair comparison, because compiler doesn't use VIS3, but
|
---|
36 | # given same initial conditions coefficient varies from 3x to 9x.
|
---|
37 | # (**) Pre-III performance should be even worse; floating-point
|
---|
38 | # performance for UltraSPARC I-IV on the other hand is reported
|
---|
39 | # to be 4.25 for hand-coded assembly, but they are just too old
|
---|
40 | # to care about.
|
---|
41 | # (***) Multi-process benchmark saturates at ~12.5x single-process
|
---|
42 | # result on 8-core processor, or ~21GBps per 2.85GHz socket.
|
---|
43 |
|
---|
# $output is the last argument if it looks like a file (it has an extension)
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# Send the generated assembly to the requested file.  The three-argument
# form of open keeps the file name from ever being parsed for a mode or
# pipe character, and a failure to create the file now stops the script
# with a diagnostic instead of silently printing to the inherited STDOUT.
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

# Register names interpolated into the assembly templates that follow.
my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));	# %i0-%i5
my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));	# %l0-%l7
my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));	# %o0-%o5,%o7 (%o6 is skipped)
my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));			# %g1-%g4
|
---|
53 |
|
---|
# Assembler prologue plus the integer-only code path.
#
# Prologue: pulls in crypto/sparc_arch.h and defines STPTR/SIZE_T as the
# pointer-sized store and pointer width (stx/8 under __arch64__, st/4
# otherwise) and LOCALS as the offset of the first local stack slot.
#
# poly1305_init:
#   Reads OPENSSL_sparcv9cap_P.  If the FMADD bit is set and the VIS3 bit is
#   clear, control continues at .Lpoly1305_init_fma (emitted by a later
#   template).  Otherwise the 24 bytes at ctx+0 are zeroed, and 0 is returned
#   when the key pointer is zero.  The 16-byte key is loaded little-endian
#   (ldxa with ASI 0x88) from an address of any alignment: the pointer is
#   rounded down to 8 bytes and, when it was misaligned, a third doubleword
#   is loaded and the pieces are shifted together.  The key is masked with
#   0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and stored at ctx+32.  Without
#   VIS3 the function returns 0; with VIS3 it stores the addresses of
#   poly1305_blocks_vis3 and poly1305_emit through the pointer in %i2 and
#   returns 1.
#
# poly1305_blocks:
#   Converts the length to a count of 16-byte blocks (srln by 4; srln is
#   presumably a width-dependent shift macro from sparc_arch.h -- confirm)
#   and returns at once when the count is zero.  Key and hash are read as
#   32-bit words; within each 64-bit slot the word at the lower address goes
#   to the higher-numbered limb (r1/h1 before r0/h0), matching the 64-bit
#   stores done by poly1305_init.  s1..s3 are r1..r3 plus themselves shifted
#   right by two.  Each iteration loads one block little-endian (with the
#   same misalignment handling as the key), adds it and the pad bit to
#   h0..h4, forms the products with umul, folds the carries and performs the
#   "final reduction step" on h4 before looping.  The five hash words are
#   written back to ctx+0..ctx+16 on exit.
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
# define STPTR stx
# define SIZE_T 8
#else
# define STPTR st
# define SIZE_T 4
#endif
#define LOCALS (STACK_BIAS+STACK_FRAME)

.section ".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl poly1305_init
.align 32
poly1305_init:
	save %sp,-STACK_FRAME-16,%sp
	nop

	SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
	ld [%g1],%g1

	and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
	cmp %g1,SPARCV9_FMADD
	be .Lpoly1305_init_fma
	nop

	stx %g0,[$ctx+0]
	stx %g0,[$ctx+8] ! zero hash value
	brz,pn $inp,.Lno_key
	stx %g0,[$ctx+16]

	and $inp,7,$shr ! alignment factor
	andn $inp,7,$inp
	sll $shr,3,$shr ! *8
	neg $shr,$shl

	sethi %hi(0x0ffffffc),$t0
	set 8,$h1
	or $t0,%lo(0x0ffffffc),$t0
	set 16,$h2
	sllx $t0,32,$t1
	or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
	or $t1,3,$t0 ! 0x0ffffffc0fffffff

	ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
	brz,pt $shr,.Lkey_aligned
	ldxa [$inp+$h1]0x88,$h1

	ldxa [$inp+$h2]0x88,$h2
	srlx $h0,$shr,$h0
	sllx $h1,$shl,$t2
	srlx $h1,$shr,$h1
	or $t2,$h0,$h0
	sllx $h2,$shl,$h2
	or $h2,$h1,$h1

.Lkey_aligned:
	and $t0,$h0,$h0
	and $t1,$h1,$h1
	stx $h0,[$ctx+32+0] ! store key
	stx $h1,[$ctx+32+8]

	andcc %g1,SPARCV9_VIS3,%g0
	be .Lno_key
	nop

1: call .+8
	add %o7,poly1305_blocks_vis3-1b,%o7

	add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
	STPTR %o7,[%i2]
	STPTR %o5,[%i2+SIZE_T]

	ret
	restore %g0,1,%o0 ! return 1

.Lno_key:
	ret
	restore %g0,%g0,%o0 ! return 0
.type poly1305_init,#function
.size poly1305_init,.-poly1305_init

.globl poly1305_blocks
.align 32
poly1305_blocks:
	save %sp,-STACK_FRAME,%sp
	srln $len,4,$len

	brz,pn $len,.Lno_data
	nop

	ld [$ctx+32+0],$r1 ! load key
	ld [$ctx+32+4],$r0
	ld [$ctx+32+8],$r3
	ld [$ctx+32+12],$r2

	ld [$ctx+0],$h1 ! load hash value
	ld [$ctx+4],$h0
	ld [$ctx+8],$h3
	ld [$ctx+12],$h2
	ld [$ctx+16],$h4

	and $inp,7,$shr ! alignment factor
	andn $inp,7,$inp
	set 8,$d1
	sll $shr,3,$shr ! *8
	set 16,$d2
	neg $shr,$shl

	srl $r1,2,$s1
	srl $r2,2,$s2
	add $r1,$s1,$s1
	srl $r3,2,$s3
	add $r2,$s2,$s2
	add $r3,$s3,$s3

.Loop:
	ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
	brz,pt $shr,.Linp_aligned
	ldxa [$inp+$d1]0x88,$d1

	ldxa [$inp+$d2]0x88,$d2
	srlx $d0,$shr,$d0
	sllx $d1,$shl,$t1
	srlx $d1,$shr,$d1
	or $t1,$d0,$d0
	sllx $d2,$shl,$d2
	or $d2,$d1,$d1

.Linp_aligned:
	srlx $d0,32,$t0
	addcc $d0,$h0,$h0 ! accumulate input
	srlx $d1,32,$t1
	addccc $t0,$h1,$h1
	addccc $d1,$h2,$h2
	addccc $t1,$h3,$h3
	addc $padbit,$h4,$h4

	umul $r0,$h0,$d0
	umul $r1,$h0,$d1
	umul $r2,$h0,$d2
	umul $r3,$h0,$d3
	sub $len,1,$len
	add $inp,16,$inp

	umul $s3,$h1,$t0
	umul $r0,$h1,$t1
	umul $r1,$h1,$t2
	add $t0,$d0,$d0
	add $t1,$d1,$d1
	umul $r2,$h1,$t0
	add $t2,$d2,$d2
	add $t0,$d3,$d3

	umul $s2,$h2,$t1
	umul $s3,$h2,$t2
	umul $r0,$h2,$t0
	add $t1,$d0,$d0
	add $t2,$d1,$d1
	umul $r1,$h2,$t1
	add $t0,$d2,$d2
	add $t1,$d3,$d3

	umul $s1,$h3,$t2
	umul $s2,$h3,$t0
	umul $s3,$h3,$t1
	add $t2,$d0,$d0
	add $t0,$d1,$d1
	umul $r0,$h3,$t2
	add $t1,$d2,$d2
	add $t2,$d3,$d3

	umul $s1,$h4,$t0
	umul $s2,$h4,$t1
	umul $s3,$h4,$t2
	umul $r0,$h4,$h4
	add $t0,$d1,$d1
	add $t1,$d2,$d2
	srlx $d0,32,$h1
	add $t2,$d3,$d3
	srlx $d1,32,$h2

	addcc $d1,$h1,$h1
	srlx $d2,32,$h3
	set 8,$d1
	addccc $d2,$h2,$h2
	srlx $d3,32,$t0
	set 16,$d2
	addccc $d3,$h3,$h3
	addc $t0,$h4,$h4

	srl $h4,2,$t0 ! final reduction step
	andn $h4,3,$t1
	and $h4,3,$h4
	add $t1,$t0,$t0

	addcc $t0,$d0,$h0
	addccc %g0,$h1,$h1
	addccc %g0,$h2,$h2
	addccc %g0,$h3,$h3
	brnz,pt $len,.Loop
	addc %g0,$h4,$h4

	st $h1,[$ctx+0] ! store hash value
	st $h0,[$ctx+4]
	st $h3,[$ctx+8]
	st $h2,[$ctx+12]
	st $h4,[$ctx+16]

.Lno_data:
	ret
	restore
.type poly1305_blocks,#function
.size poly1305_blocks,.-poly1305_blocks
___
|
---|
280 | ########################################################################
|
---|
281 | # VIS3 has umulxhi and addxc...
|
---|
{
# poly1305_blocks_vis3: block function whose address poly1305_init hands out
# when the VIS3 capability bit is set.  The hash is kept as two 64-bit limbs
# plus a 32-bit top word (ldx/ldx/ld at ctx+0, +8, +16) and the key as two
# 64-bit limbs at ctx+32; S1 is R1 plus R1 shifted right by two.
#
# Names local to this block: H*/R*/S1/T1 live in %o0-%o5,%o7 and D*/T0 in
# %g1-%g4.  The template also uses the outer-scope $r1 and $r2 (%l1, %l2)
# purely as index registers holding 8 and 16 for the input loads, and it
# exits early through .Lno_data, the label emitted inside poly1305_blocks.
# The "b .Loop_vis3" immediately before the .Loop_vis3 label only carries
# the add in its delay slot.
my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));

$code.=<<___;
.align 32
poly1305_blocks_vis3:
	save %sp,-STACK_FRAME,%sp
	srln $len,4,$len

	brz,pn $len,.Lno_data
	nop

	ldx [$ctx+32+0],$R0 ! load key
	ldx [$ctx+32+8],$R1

	ldx [$ctx+0],$H0 ! load hash value
	ldx [$ctx+8],$H1
	ld [$ctx+16],$H2

	and $inp,7,$shr ! alignment factor
	andn $inp,7,$inp
	set 8,$r1
	sll $shr,3,$shr ! *8
	set 16,$r2
	neg $shr,$shl

	srlx $R1,2,$S1
	b .Loop_vis3
	add $R1,$S1,$S1

.Loop_vis3:
	ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
	brz,pt $shr,.Linp_aligned_vis3
	ldxa [$inp+$r1]0x88,$D1

	ldxa [$inp+$r2]0x88,$D2
	srlx $D0,$shr,$D0
	sllx $D1,$shl,$T1
	srlx $D1,$shr,$D1
	or $T1,$D0,$D0
	sllx $D2,$shl,$D2
	or $D2,$D1,$D1

.Linp_aligned_vis3:
	addcc $D0,$H0,$H0 ! accumulate input
	sub $len,1,$len
	addxccc $D1,$H1,$H1
	add $inp,16,$inp

	mulx $R0,$H0,$D0 ! r0*h0
	addxc $padbit,$H2,$H2
	umulxhi $R0,$H0,$D1
	mulx $S1,$H1,$T0 ! s1*h1
	umulxhi $S1,$H1,$T1
	addcc $T0,$D0,$D0
	mulx $R1,$H0,$T0 ! r1*h0
	addxc $T1,$D1,$D1
	umulxhi $R1,$H0,$D2
	addcc $T0,$D1,$D1
	mulx $R0,$H1,$T0 ! r0*h1
	addxc %g0,$D2,$D2
	umulxhi $R0,$H1,$T1
	addcc $T0,$D1,$D1
	mulx $S1,$H2,$T0 ! s1*h2
	addxc $T1,$D2,$D2
	mulx $R0,$H2,$T1 ! r0*h2
	addcc $T0,$D1,$D1
	addxc $T1,$D2,$D2

	srlx $D2,2,$T0 ! final reduction step
	andn $D2,3,$T1
	and $D2,3,$H2
	add $T1,$T0,$T0

	addcc $T0,$D0,$H0
	addxccc %g0,$D1,$H1
	brnz,pt $len,.Loop_vis3
	addxc %g0,$H2,$H2

	stx $H0,[$ctx+0] ! store hash value
	stx $H1,[$ctx+8]
	st $H2,[$ctx+16]

	ret
	restore
.type poly1305_blocks_vis3,#function
.size poly1305_blocks_vis3,.-poly1305_blocks_vis3
___
}
|
---|
# poly1305_emit: produces the 16-byte tag for the integer and VIS3 paths
# (poly1305_init stores its address next to poly1305_blocks_vis3).
#
# The second and third argument registers are reused under new names: $mac
# is the output buffer (%i1) and $nonce the 16-byte nonce (%i2).  The code
# loads the five 32-bit hash words, computes hash+5 with carry propagation
# and tests bit 2 of the resulting top word; when it is set the four low
# words of hash+5 replace the hash words (movnz).  The four 32-bit nonce
# words are then added with carry and the result is written one byte at a
# time, least significant byte first, so the store works for any alignment
# of the output buffer.
my ($mac,$nonce) = ($inp,$len);

$code.=<<___;
.globl poly1305_emit
.align 32
poly1305_emit:
	save %sp,-STACK_FRAME,%sp

	ld [$ctx+0],$h1 ! load hash value
	ld [$ctx+4],$h0
	ld [$ctx+8],$h3
	ld [$ctx+12],$h2
	ld [$ctx+16],$h4

	addcc $h0,5,$r0 ! compare to modulus
	addccc $h1,0,$r1
	addccc $h2,0,$r2
	addccc $h3,0,$r3
	addc $h4,0,$h4
	andcc $h4,4,%g0 ! did it carry/borrow?

	movnz %icc,$r0,$h0
	ld [$nonce+0],$r0 ! load nonce
	movnz %icc,$r1,$h1
	ld [$nonce+4],$r1
	movnz %icc,$r2,$h2
	ld [$nonce+8],$r2
	movnz %icc,$r3,$h3
	ld [$nonce+12],$r3

	addcc $r0,$h0,$h0 ! accumulate nonce
	addccc $r1,$h1,$h1
	addccc $r2,$h2,$h2
	addc $r3,$h3,$h3

	srl $h0,8,$r0
	stb $h0,[$mac+0] ! store little-endian result
	srl $h0,16,$r1
	stb $r0,[$mac+1]
	srl $h0,24,$r2
	stb $r1,[$mac+2]
	stb $r2,[$mac+3]

	srl $h1,8,$r0
	stb $h1,[$mac+4]
	srl $h1,16,$r1
	stb $r0,[$mac+5]
	srl $h1,24,$r2
	stb $r1,[$mac+6]
	stb $r2,[$mac+7]

	srl $h2,8,$r0
	stb $h2,[$mac+8]
	srl $h2,16,$r1
	stb $r0,[$mac+9]
	srl $h2,24,$r2
	stb $r1,[$mac+10]
	stb $r2,[$mac+11]

	srl $h3,8,$r0
	stb $h3,[$mac+12]
	srl $h3,16,$r1
	stb $r0,[$mac+13]
	srl $h3,24,$r2
	stb $r1,[$mac+14]
	stb $r2,[$mac+15]

	ret
	restore
.type poly1305_emit,#function
.size poly1305_emit,.-poly1305_emit
___
|
---|
444 |
|
---|
445 | {
|
---|
# Floating-point (FMA) code path: register names for the template below.
my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));	# incoming arguments %i0-%i3
my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));	# integer scratch for input words
# Eight names are generated but only the first four (%l0-%l3) are bound.
my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
my $i2=$step;	# same register as $step under a second name

# 32 double-precision register names: %f0, %f2, ... %f62.
my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
$two0,$two32,$two64,$two96,$two130,$five_two130,
$r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
$s2lo,$s2hi,$s3lo,$s3hi,
$c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
# borrowings
# r3/s1 share registers with the c0/c1 carry temporaries, which is why the
# loop reloads them from the context after each carry pass; x* and y* (the
# de-biased input limbs) likewise sit on top of c1/c2/c3.
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);

# poly1305_init_fma (also entered at .Lpoly1305_init_fma from poly1305_init):
#   Loads constants relative to .Lconsts_fma (the table itself is emitted
#   outside this template) and stores the first four at ctx+8*0..8*3 as the
#   initial "biased 0" hash.  Returns 0 when the key pointer is zero.
#   Otherwise it saves %fsr in the local stack slot, loads a new %fsr from
#   the constant table, writes the same four constants to ctx+8*4..8*7 as a
#   template and overwrites the second 32-bit word of each with one masked
#   32-bit key word, so that reloading the slots as doubles and subtracting
#   the constants yields r0..r3.  s1..s3 are r1..r3 times $five_two130.
#   Every value is split into a hi and a lo part by adding and subtracting a
#   further constant.  Context layout written here, in doublewords:
#     4..11   r0lo r0hi r1lo r1hi r2lo r2hi r3lo r3hi
#     12..17  s1lo s1hi s2lo s2hi s3lo s3hi
#   Finally %fsr is restored, the addresses of poly1305_blocks_fma and
#   poly1305_emit_fma are stored through the pointer in %i2, and 1 is
#   returned.
#
# poly1305_blocks_fma:
#   Returns via .Labort when the block count (length shifted right by 4) is
#   zero.  The stack frame holds a four-doubleword input template at
#   LOCALS+8*0..8*3 and the saved %fsr at LOCALS+8*4; the pad bit is OR-ed
#   into the first word of the fourth template slot.  Input is loaded
#   little-endian one block ahead of its use ("modulo-scheduled"); $step is
#   forced to 0 by movrz once $len reaches zero so the pointer stops
#   advancing.  Each pass multiplies the accumulated limbs by the key with
#   fmuld/fmaddd, and the carry pass marked "base 2^48 -> base 2^32"
#   renormalises the limbs.  On exit the limbs are re-biased and stored to
#   ctx+8*0..8*3 after %fsr has been restored.
#   SIZE_T_CC in the loop branch is not defined in this file; presumably it
#   comes from sparc_arch.h -- confirm.
$code.=<<___;
.align 32
poly1305_init_fma:
	save %sp,-STACK_FRAME-16,%sp
	nop

.Lpoly1305_init_fma:
1: call .+8
	add %o7,.Lconsts_fma-1b,%o7

	ldd [%o7+8*0],$two0 ! load constants
	ldd [%o7+8*1],$two32
	ldd [%o7+8*2],$two64
	ldd [%o7+8*3],$two96
	ldd [%o7+8*5],$five_two130

	std $two0,[$ctx+8*0] ! initial hash value, biased 0
	std $two32,[$ctx+8*1]
	std $two64,[$ctx+8*2]
	std $two96,[$ctx+8*3]

	brz,pn $inp,.Lno_key_fma
	nop

	stx %fsr,[%sp+LOCALS] ! save original %fsr
	ldx [%o7+8*6],%fsr ! load new %fsr

	std $two0,[$ctx+8*4] ! key "template"
	std $two32,[$ctx+8*5]
	std $two64,[$ctx+8*6]
	std $two96,[$ctx+8*7]

	and $inp,7,$shr
	andn $inp,7,$inp ! align pointer
	mov 8,$i1
	sll $shr,3,$shr
	mov 16,$i2
	neg $shr,$shl

	ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
	ldxa [$inp+$i1]0x88,$in2

	brz $shr,.Lkey_aligned_fma
	sethi %hi(0xf0000000),$i1 ! 0xf0000000

	ldxa [$inp+$i2]0x88,$in4

	srlx $in0,$shr,$in0 ! align data
	sllx $in2,$shl,$in1
	srlx $in2,$shr,$in2
	or $in1,$in0,$in0
	sllx $in4,$shl,$in3
	or $in3,$in2,$in2

.Lkey_aligned_fma:
	or $i1,3,$i2 ! 0xf0000003
	srlx $in0,32,$in1
	andn $in0,$i1,$in0 ! &=0x0fffffff
	andn $in1,$i2,$in1 ! &=0x0ffffffc
	srlx $in2,32,$in3
	andn $in2,$i2,$in2
	andn $in3,$i2,$in3

	st $in0,[$ctx+`8*4+4`] ! fill "template"
	st $in1,[$ctx+`8*5+4`]
	st $in2,[$ctx+`8*6+4`]
	st $in3,[$ctx+`8*7+4`]

	ldd [$ctx+8*4],$h0lo ! load [biased] key
	ldd [$ctx+8*5],$h1lo
	ldd [$ctx+8*6],$h2lo
	ldd [$ctx+8*7],$h3lo

	fsubd $h0lo,$two0, $h0lo ! r0
	ldd [%o7+8*7],$two0 ! more constants
	fsubd $h1lo,$two32,$h1lo ! r1
	ldd [%o7+8*8],$two32
	fsubd $h2lo,$two64,$h2lo ! r2
	ldd [%o7+8*9],$two64
	fsubd $h3lo,$two96,$h3lo ! r3
	ldd [%o7+8*10],$two96

	fmuld $five_two130,$h1lo,$s1lo ! s1
	fmuld $five_two130,$h2lo,$s2lo ! s2
	fmuld $five_two130,$h3lo,$s3lo ! s3

	faddd $h0lo,$two0, $h0hi
	faddd $h1lo,$two32,$h1hi
	faddd $h2lo,$two64,$h2hi
	faddd $h3lo,$two96,$h3hi

	fsubd $h0hi,$two0, $h0hi
	ldd [%o7+8*11],$two0 ! more constants
	fsubd $h1hi,$two32,$h1hi
	ldd [%o7+8*12],$two32
	fsubd $h2hi,$two64,$h2hi
	ldd [%o7+8*13],$two64
	fsubd $h3hi,$two96,$h3hi

	fsubd $h0lo,$h0hi,$h0lo
	std $h0hi,[$ctx+8*5] ! r0hi
	fsubd $h1lo,$h1hi,$h1lo
	std $h1hi,[$ctx+8*7] ! r1hi
	fsubd $h2lo,$h2hi,$h2lo
	std $h2hi,[$ctx+8*9] ! r2hi
	fsubd $h3lo,$h3hi,$h3lo
	std $h3hi,[$ctx+8*11] ! r3hi

	faddd $s1lo,$two0, $s1hi
	faddd $s2lo,$two32,$s2hi
	faddd $s3lo,$two64,$s3hi

	fsubd $s1hi,$two0, $s1hi
	fsubd $s2hi,$two32,$s2hi
	fsubd $s3hi,$two64,$s3hi

	fsubd $s1lo,$s1hi,$s1lo
	fsubd $s2lo,$s2hi,$s2lo
	fsubd $s3lo,$s3hi,$s3lo

	ldx [%sp+LOCALS],%fsr ! restore %fsr

	std $h0lo,[$ctx+8*4] ! r0lo
	std $h1lo,[$ctx+8*6] ! r1lo
	std $h2lo,[$ctx+8*8] ! r2lo
	std $h3lo,[$ctx+8*10] ! r3lo

	std $s1hi,[$ctx+8*13]
	std $s2hi,[$ctx+8*15]
	std $s3hi,[$ctx+8*17]

	std $s1lo,[$ctx+8*12]
	std $s2lo,[$ctx+8*14]
	std $s3lo,[$ctx+8*16]

	add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
	add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
	STPTR %o0,[%i2]
	STPTR %o1,[%i2+SIZE_T]

	ret
	restore %g0,1,%o0 ! return 1

.Lno_key_fma:
	ret
	restore %g0,%g0,%o0 ! return 0
.type poly1305_init_fma,#function
.size poly1305_init_fma,.-poly1305_init_fma

.align 32
poly1305_blocks_fma:
	save %sp,-STACK_FRAME-48,%sp
	srln $len,4,$len

	brz,pn $len,.Labort
	sub $len,1,$len

1: call .+8
	add %o7,.Lconsts_fma-1b,%o7

	ldd [%o7+8*0],$two0 ! load constants
	ldd [%o7+8*1],$two32
	ldd [%o7+8*2],$two64
	ldd [%o7+8*3],$two96
	ldd [%o7+8*4],$two130
	ldd [%o7+8*5],$five_two130

	ldd [$ctx+8*0],$h0lo ! load [biased] hash value
	ldd [$ctx+8*1],$h1lo
	ldd [$ctx+8*2],$h2lo
	ldd [$ctx+8*3],$h3lo

	std $two0,[%sp+LOCALS+8*0] ! input "template"
	sethi %hi((1023+52+96)<<20),$in3
	std $two32,[%sp+LOCALS+8*1]
	or $padbit,$in3,$in3
	std $two64,[%sp+LOCALS+8*2]
	st $in3,[%sp+LOCALS+8*3]

	and $inp,7,$shr
	andn $inp,7,$inp ! align pointer
	mov 8,$i1
	sll $shr,3,$shr
	mov 16,$step
	neg $shr,$shl

	ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
	brz $shr,.Linp_aligned_fma
	ldxa [$inp+$i1]0x88,$in2

	ldxa [$inp+$step]0x88,$in4
	add $inp,8,$inp

	srlx $in0,$shr,$in0 ! align data
	sllx $in2,$shl,$in1
	srlx $in2,$shr,$in2
	or $in1,$in0,$in0
	sllx $in4,$shl,$in3
	srlx $in4,$shr,$in4 ! pre-shift
	or $in3,$in2,$in2

.Linp_aligned_fma:
	srlx $in0,32,$in1
	movrz $len,0,$step
	srlx $in2,32,$in3
	add $step,$inp,$inp ! conditional advance

	st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
	st $in1,[%sp+LOCALS+8*1+4]
	st $in2,[%sp+LOCALS+8*2+4]
	st $in3,[%sp+LOCALS+8*3+4]

	ldd [$ctx+8*4],$r0lo ! load key
	ldd [$ctx+8*5],$r0hi
	ldd [$ctx+8*6],$r1lo
	ldd [$ctx+8*7],$r1hi
	ldd [$ctx+8*8],$r2lo
	ldd [$ctx+8*9],$r2hi
	ldd [$ctx+8*10],$r3lo
	ldd [$ctx+8*11],$r3hi
	ldd [$ctx+8*12],$s1lo
	ldd [$ctx+8*13],$s1hi
	ldd [$ctx+8*14],$s2lo
	ldd [$ctx+8*15],$s2hi
	ldd [$ctx+8*16],$s3lo
	ldd [$ctx+8*17],$s3hi

	stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
	ldx [%o7+8*6],%fsr ! load new %fsr

	subcc $len,1,$len
	movrz $len,0,$step

	ldd [%sp+LOCALS+8*0],$x0 ! load biased input
	ldd [%sp+LOCALS+8*1],$x1
	ldd [%sp+LOCALS+8*2],$x2
	ldd [%sp+LOCALS+8*3],$x3

	fsubd $h0lo,$two0, $h0lo ! de-bias hash value
	fsubd $h1lo,$two32,$h1lo
	ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
	fsubd $h2lo,$two64,$h2lo
	fsubd $h3lo,$two96,$h3lo
	ldxa [$inp+$i1]0x88,$in2

	fsubd $x0,$two0, $x0 ! de-bias input
	fsubd $x1,$two32,$x1
	fsubd $x2,$two64,$x2
	fsubd $x3,$two96,$x3

	brz $shr,.Linp_aligned_fma2
	add $step,$inp,$inp ! conditional advance

	sllx $in0,$shl,$in1 ! align data
	srlx $in0,$shr,$in3
	or $in1,$in4,$in0
	sllx $in2,$shl,$in1
	srlx $in2,$shr,$in4 ! pre-shift
	or $in3,$in1,$in2
.Linp_aligned_fma2:
	srlx $in0,32,$in1
	srlx $in2,32,$in3

	faddd $h0lo,$x0,$x0 ! accumulate input
	stw $in0,[%sp+LOCALS+8*0+4]
	faddd $h1lo,$x1,$x1
	stw $in1,[%sp+LOCALS+8*1+4]
	faddd $h2lo,$x2,$x2
	stw $in2,[%sp+LOCALS+8*2+4]
	faddd $h3lo,$x3,$x3
	stw $in3,[%sp+LOCALS+8*3+4]

	b .Lentry_fma
	nop

.align 16
.Loop_fma:
	ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
	ldxa [$inp+$i1]0x88,$in2
	movrz $len,0,$step

	faddd $y0,$h0lo,$h0lo ! accumulate input
	faddd $y1,$h0hi,$h0hi
	faddd $y2,$h2lo,$h2lo
	faddd $y3,$h2hi,$h2hi

	brz,pn $shr,.Linp_aligned_fma3
	add $step,$inp,$inp ! conditional advance

	sllx $in0,$shl,$in1 ! align data
	srlx $in0,$shr,$in3
	or $in1,$in4,$in0
	sllx $in2,$shl,$in1
	srlx $in2,$shr,$in4 ! pre-shift
	or $in3,$in1,$in2

.Linp_aligned_fma3:
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd $two64,$h1lo,$c1lo
	srlx $in0,32,$in1
	faddd $two64,$h1hi,$c1hi
	srlx $in2,32,$in3
	faddd $two130,$h3lo,$c3lo
	st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
	faddd $two130,$h3hi,$c3hi
	st $in1,[%sp+LOCALS+8*1+4]
	faddd $two32,$h0lo,$c0lo
	st $in2,[%sp+LOCALS+8*2+4]
	faddd $two32,$h0hi,$c0hi
	st $in3,[%sp+LOCALS+8*3+4]
	faddd $two96,$h2lo,$c2lo
	faddd $two96,$h2hi,$c2hi

	fsubd $c1lo,$two64,$c1lo
	fsubd $c1hi,$two64,$c1hi
	fsubd $c3lo,$two130,$c3lo
	fsubd $c3hi,$two130,$c3hi
	fsubd $c0lo,$two32,$c0lo
	fsubd $c0hi,$two32,$c0hi
	fsubd $c2lo,$two96,$c2lo
	fsubd $c2hi,$two96,$c2hi

	fsubd $h1lo,$c1lo,$h1lo
	fsubd $h1hi,$c1hi,$h1hi
	fsubd $h3lo,$c3lo,$h3lo
	fsubd $h3hi,$c3hi,$h3hi
	fsubd $h2lo,$c2lo,$h2lo
	fsubd $h2hi,$c2hi,$h2hi
	fsubd $h0lo,$c0lo,$h0lo
	fsubd $h0hi,$c0hi,$h0hi

	faddd $h1lo,$c0lo,$h1lo
	faddd $h1hi,$c0hi,$h1hi
	faddd $h3lo,$c2lo,$h3lo
	faddd $h3hi,$c2hi,$h3hi
	faddd $h2lo,$c1lo,$h2lo
	faddd $h2hi,$c1hi,$h2hi
	fmaddd $five_two130,$c3lo,$h0lo,$h0lo
	fmaddd $five_two130,$c3hi,$h0hi,$h0hi

	faddd $h1lo,$h1hi,$x1
	ldd [$ctx+8*12],$s1lo ! reload constants
	faddd $h3lo,$h3hi,$x3
	ldd [$ctx+8*13],$s1hi
	faddd $h2lo,$h2hi,$x2
	ldd [$ctx+8*10],$r3lo
	faddd $h0lo,$h0hi,$x0
	ldd [$ctx+8*11],$r3hi

.Lentry_fma:
	fmuld $x1,$s3lo,$h0lo
	fmuld $x1,$s3hi,$h0hi
	fmuld $x1,$r1lo,$h2lo
	fmuld $x1,$r1hi,$h2hi
	fmuld $x1,$r0lo,$h1lo
	fmuld $x1,$r0hi,$h1hi
	fmuld $x1,$r2lo,$h3lo
	fmuld $x1,$r2hi,$h3hi

	fmaddd $x3,$s1lo,$h0lo,$h0lo
	fmaddd $x3,$s1hi,$h0hi,$h0hi
	fmaddd $x3,$s3lo,$h2lo,$h2lo
	fmaddd $x3,$s3hi,$h2hi,$h2hi
	fmaddd $x3,$s2lo,$h1lo,$h1lo
	fmaddd $x3,$s2hi,$h1hi,$h1hi
	fmaddd $x3,$r0lo,$h3lo,$h3lo
	fmaddd $x3,$r0hi,$h3hi,$h3hi

	fmaddd $x2,$s2lo,$h0lo,$h0lo
	fmaddd $x2,$s2hi,$h0hi,$h0hi
	fmaddd $x2,$r0lo,$h2lo,$h2lo
	fmaddd $x2,$r0hi,$h2hi,$h2hi
	fmaddd $x2,$s3lo,$h1lo,$h1lo
	ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
	fmaddd $x2,$s3hi,$h1hi,$h1hi
	ldd [%sp+LOCALS+8*1],$y1
	fmaddd $x2,$r1lo,$h3lo,$h3lo
	ldd [%sp+LOCALS+8*2],$y2
	fmaddd $x2,$r1hi,$h3hi,$h3hi
	ldd [%sp+LOCALS+8*3],$y3

	fmaddd $x0,$r0lo,$h0lo,$h0lo
	fsubd $y0,$two0, $y0 ! de-bias input
	fmaddd $x0,$r0hi,$h0hi,$h0hi
	fsubd $y1,$two32,$y1
	fmaddd $x0,$r2lo,$h2lo,$h2lo
	fsubd $y2,$two64,$y2
	fmaddd $x0,$r2hi,$h2hi,$h2hi
	fsubd $y3,$two96,$y3
	fmaddd $x0,$r1lo,$h1lo,$h1lo
	fmaddd $x0,$r1hi,$h1hi,$h1hi
	fmaddd $x0,$r3lo,$h3lo,$h3lo
	fmaddd $x0,$r3hi,$h3hi,$h3hi

	bcc SIZE_T_CC,.Loop_fma
	subcc $len,1,$len

	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd $h0lo,$two32,$c0lo
	faddd $h0hi,$two32,$c0hi
	faddd $h2lo,$two96,$c2lo
	faddd $h2hi,$two96,$c2hi
	faddd $h1lo,$two64,$c1lo
	faddd $h1hi,$two64,$c1hi
	faddd $h3lo,$two130,$c3lo
	faddd $h3hi,$two130,$c3hi

	fsubd $c0lo,$two32,$c0lo
	fsubd $c0hi,$two32,$c0hi
	fsubd $c2lo,$two96,$c2lo
	fsubd $c2hi,$two96,$c2hi
	fsubd $c1lo,$two64,$c1lo
	fsubd $c1hi,$two64,$c1hi
	fsubd $c3lo,$two130,$c3lo
	fsubd $c3hi,$two130,$c3hi

	fsubd $h1lo,$c1lo,$h1lo
	fsubd $h1hi,$c1hi,$h1hi
	fsubd $h3lo,$c3lo,$h3lo
	fsubd $h3hi,$c3hi,$h3hi
	fsubd $h2lo,$c2lo,$h2lo
	fsubd $h2hi,$c2hi,$h2hi
	fsubd $h0lo,$c0lo,$h0lo
	fsubd $h0hi,$c0hi,$h0hi

	faddd $h1lo,$c0lo,$h1lo
	faddd $h1hi,$c0hi,$h1hi
	faddd $h3lo,$c2lo,$h3lo
	faddd $h3hi,$c2hi,$h3hi
	faddd $h2lo,$c1lo,$h2lo
	faddd $h2hi,$c1hi,$h2hi
	fmaddd $five_two130,$c3lo,$h0lo,$h0lo
	fmaddd $five_two130,$c3hi,$h0hi,$h0hi

	faddd $h1lo,$h1hi,$x1
	faddd $h3lo,$h3hi,$x3
	faddd $h2lo,$h2hi,$x2
	faddd $h0lo,$h0hi,$x0

	faddd $x1,$two32,$x1 ! bias
	faddd $x3,$two96,$x3
	faddd $x2,$two64,$x2
	faddd $x0,$two0, $x0

	ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr

	std $x1,[$ctx+8*1] ! store [biased] hash value
	std $x3,[$ctx+8*3]
	std $x2,[$ctx+8*2]
	std $x0,[$ctx+8*0]

.Labort:
	ret
	restore
.type poly1305_blocks_fma,#function
.size poly1305_blocks_fma,.-poly1305_blocks_fma
___
|
---|
918 | {
|
---|
# Emits the poly1305_emit_fma routine into $code.
# $mac and $nonce are new names for the registers held in $inp and $len.
# $ctx, $inp, $len and $padbit are not declared in this block; $padbit is
# used below purely as a scratch register.
919 | my ($mac,$nonce)=($inp,$len);
|
---|
920 |
|
---|
# Register map produced by the list assignment below:
#   $h0-$h4 => %l0-%l4, $d0 => %l5, $d1-$d3 => %o0-%o2, $mask => %o3
# (the right-hand list also yields %o4, which is left unassigned).
921 | my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
|
---|
922 | ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
|
---|
923 |
|
---|
# Append the routine's assembly text.  Following the assembly's own
# annotations, it: loads the four stored 64-bit hash values from $ctx as
# pairs of 32-bit words, masks the exponent bits out of the first word of
# each pair, applies the partial reduction, compares the value to the
# modulus by adding 5 and selects between the two candidates with a mask,
# adds the four nonce words, and stores the 16 result bytes to $mac one
# byte at a time in little-endian order.
924 | $code.=<<___;
|
---|
925 | .align 32
|
---|
926 | poly1305_emit_fma:
|
---|
927 | save %sp,-STACK_FRAME,%sp
|
---|
928 |
|
---|
929 | ld [$ctx+8*0+0],$d0 ! load hash
|
---|
930 | ld [$ctx+8*0+4],$h0
|
---|
931 | ld [$ctx+8*1+0],$d1
|
---|
932 | ld [$ctx+8*1+4],$h1
|
---|
933 | ld [$ctx+8*2+0],$d2
|
---|
934 | ld [$ctx+8*2+4],$h2
|
---|
935 | ld [$ctx+8*3+0],$d3
|
---|
936 | ld [$ctx+8*3+4],$h3
|
---|
937 |
|
---|
938 | sethi %hi(0xfff00000),$mask
|
---|
939 | andn $d0,$mask,$d0 ! mask exponent
|
---|
940 | andn $d1,$mask,$d1
|
---|
941 | andn $d2,$mask,$d2
|
---|
942 | andn $d3,$mask,$d3 ! can be partially reduced...
|
---|
943 | mov 3,$mask
|
---|
944 |
|
---|
945 | srl $d3,2,$padbit ! ... so reduce
|
---|
946 | and $d3,$mask,$h4
|
---|
947 | andn $d3,$mask,$d3
|
---|
948 | add $padbit,$d3,$d3
|
---|
949 |
|
---|
950 | addcc $d3,$h0,$h0
|
---|
951 | addccc $d0,$h1,$h1
|
---|
952 | addccc $d1,$h2,$h2
|
---|
953 | addccc $d2,$h3,$h3
|
---|
954 | addc %g0,$h4,$h4
|
---|
955 |
|
---|
956 | addcc $h0,5,$d0 ! compare to modulus
|
---|
957 | addccc $h1,0,$d1
|
---|
958 | addccc $h2,0,$d2
|
---|
959 | addccc $h3,0,$d3
|
---|
960 | addc $h4,0,$mask
|
---|
961 |
|
---|
962 | srl $mask,2,$mask ! did it carry/borrow?
|
---|
963 | neg $mask,$mask
|
---|
964 | sra $mask,31,$mask ! mask
|
---|
965 |
|
---|
966 | andn $h0,$mask,$h0
|
---|
967 | and $d0,$mask,$d0
|
---|
968 | andn $h1,$mask,$h1
|
---|
969 | and $d1,$mask,$d1
|
---|
970 | or $d0,$h0,$h0
|
---|
971 | ld [$nonce+0],$d0 ! load nonce
|
---|
972 | andn $h2,$mask,$h2
|
---|
973 | and $d2,$mask,$d2
|
---|
974 | or $d1,$h1,$h1
|
---|
975 | ld [$nonce+4],$d1
|
---|
976 | andn $h3,$mask,$h3
|
---|
977 | and $d3,$mask,$d3
|
---|
978 | or $d2,$h2,$h2
|
---|
979 | ld [$nonce+8],$d2
|
---|
980 | or $d3,$h3,$h3
|
---|
981 | ld [$nonce+12],$d3
|
---|
982 |
|
---|
983 | addcc $d0,$h0,$h0 ! accumulate nonce
|
---|
984 | addccc $d1,$h1,$h1
|
---|
985 | addccc $d2,$h2,$h2
|
---|
986 | addc $d3,$h3,$h3
|
---|
987 |
|
---|
988 | stb $h0,[$mac+0] ! write little-endian result
|
---|
989 | srl $h0,8,$h0
|
---|
990 | stb $h1,[$mac+4]
|
---|
991 | srl $h1,8,$h1
|
---|
992 | stb $h2,[$mac+8]
|
---|
993 | srl $h2,8,$h2
|
---|
994 | stb $h3,[$mac+12]
|
---|
995 | srl $h3,8,$h3
|
---|
996 |
|
---|
997 | stb $h0,[$mac+1]
|
---|
998 | srl $h0,8,$h0
|
---|
999 | stb $h1,[$mac+5]
|
---|
1000 | srl $h1,8,$h1
|
---|
1001 | stb $h2,[$mac+9]
|
---|
1002 | srl $h2,8,$h2
|
---|
1003 | stb $h3,[$mac+13]
|
---|
1004 | srl $h3,8,$h3
|
---|
1005 |
|
---|
1006 | stb $h0,[$mac+2]
|
---|
1007 | srl $h0,8,$h0
|
---|
1008 | stb $h1,[$mac+6]
|
---|
1009 | srl $h1,8,$h1
|
---|
1010 | stb $h2,[$mac+10]
|
---|
1011 | srl $h2,8,$h2
|
---|
1012 | stb $h3,[$mac+14]
|
---|
1013 | srl $h3,8,$h3
|
---|
1014 |
|
---|
1015 | stb $h0,[$mac+3]
|
---|
1016 | stb $h1,[$mac+7]
|
---|
1017 | stb $h2,[$mac+11]
|
---|
1018 | stb $h3,[$mac+15]
|
---|
1019 |
|
---|
1020 | ret
|
---|
1021 | restore
|
---|
1022 | .type poly1305_emit_fma,#function
|
---|
1023 | .size poly1305_emit_fma,.-poly1305_emit_fma
|
---|
1024 | ___
|
---|
1025 | }
|
---|
1026 |
|
---|
# Append the .Lconsts_fma constant table (64-byte aligned) to $code.
# Each double is written as a pair of 32-bit words; per the inline
# annotations the entries are powers of two, the value 5/2^130 and one
# fsr setting ("truncate, no exceptions").  The table is followed by the
# module's identification string and a 4-byte realignment.
1027 | $code.=<<___;
|
---|
1028 | .align 64
|
---|
1029 | .Lconsts_fma:
|
---|
1030 | .word 0x43300000,0x00000000 ! 2^(52+0)
|
---|
1031 | .word 0x45300000,0x00000000 ! 2^(52+32)
|
---|
1032 | .word 0x47300000,0x00000000 ! 2^(52+64)
|
---|
1033 | .word 0x49300000,0x00000000 ! 2^(52+96)
|
---|
1034 | .word 0x4b500000,0x00000000 ! 2^(52+130)
|
---|
1035 |
|
---|
1036 | .word 0x37f40000,0x00000000 ! 5/2^130
|
---|
1037 | .word 0,1<<30 ! fsr: truncate, no exceptions
|
---|
1038 |
|
---|
1039 | .word 0x44300000,0x00000000 ! 2^(52+16+0)
|
---|
1040 | .word 0x46300000,0x00000000 ! 2^(52+16+32)
|
---|
1041 | .word 0x48300000,0x00000000 ! 2^(52+16+64)
|
---|
1042 | .word 0x4a300000,0x00000000 ! 2^(52+16+96)
|
---|
1043 | .word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
|
---|
1044 | .word 0x40300000,0x00000000 ! 2^(52+16+32-96)
|
---|
1045 | .word 0x42300000,0x00000000 ! 2^(52+16+64-96)
|
---|
1046 | .asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
1047 | .align 4
|
---|
1048 | ___
|
---|
1049 | }
|
---|
1050 | |
---|
1051 |
|
---|
1052 | # Purpose of these subroutines is to explicitly encode VIS instructions,
|
---|
1053 | # so that one can compile the module without having to specify VIS
|
---|
1054 | # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
|
---|
1055 | # The idea is to keep open the option of producing a "universal" binary and
|
---|
1056 | # let the programmer detect at run-time whether the current CPU is VIS capable.
|
---|
# unvis3(MNEMONIC, RS1, RS2, RD)
#
# Hand-assemble one three-operand VIS3 integer instruction so that the
# module can be built by an assembler without VIS3 support.
#
#   MNEMONIC      instruction name; only "addxc", "addxccc" and "umulxhi"
#                 are encoded
#   RS1, RS2, RD  integer register names of the form %g0-%g7, %o0-%o7,
#                 %l0-%l7 or %i0-%i7
#
# Returns a ".word 0x........ !<original text>" line holding the encoded
# instruction.  If the mnemonic is not in the table, or any operand is not
# exactly a valid integer register name, the original instruction text
# ("MNEMONIC\tRS1,RS2,RD") is returned unchanged so the assembler sees it
# as written.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # First register number of each window group.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf = ( "addxc"   => 0x011,
                   "addxccc" => 0x013,
                   "umulxhi" => 0x016 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref if (!defined($opf));

    my @regno;
    foreach my $reg ($rs1,$rs2,$rd) {
        # Anchored, and limited to digits 0-7: a name such as %o8 must not
        # be folded into the neighbouring register group.
        return $ref if ($reg !~ /\A%([goli])([0-7])\z/);
        push @regno, $bias{$1}+$2;
    }
    my ($n1,$n2,$nd) = @regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$nd<<25|$n1<<14|$opf<<5|$n2,
                   $ref;
}
|
---|
1080 |
|
---|
# unfma(MNEMONIC, RS1, RS2, RS3, RD)
#
# Hand-assemble one four-operand fused multiply-add/subtract instruction
# so that the module can be built by an assembler without FMA support.
#
#   MNEMONIC           "fmadds", "fmaddd", "fmsubs" or "fmsubd"
#   RS1, RS2, RS3, RD  floating-point register names of the form %fN
#
# Returns a ".word 0x........ !<original text>" line holding the encoded
# instruction.  The original instruction text is returned unchanged when
# the mnemonic is unknown, when an operand is not exactly %fN with N in
# 0..63, or when N is 32 or above and odd.
sub unfma {
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
    my %fmaopf = ( "fmadds" => 0x1,
                   "fmaddd" => 0x2,
                   "fmsubs" => 0x5,
                   "fmsubd" => 0x6 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my $opf = $fmaopf{$mnemonic};
    return $ref if (!defined($opf));

    my @regno;
    foreach my $reg ($rs1,$rs2,$rs3,$rd) {
        # Anchored so that e.g. %f100 is not silently read as %f10.
        return $ref if ($reg !~ /\A%f([0-9]{1,2})\z/);
        my $num = $1 + 0;
        # Numbers above 63 do not fit the register field and would wrap
        # onto an unrelated register.
        return $ref if ($num > 63);
        if ($num >= 32) {
            return $ref if ($num & 1);
            # re-encode for upper double register addressing
            $num = ($num | $num>>5) & 31;
        }
        push @regno, $num;
    }
    my ($n1,$n2,$n3,$nd) = @regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b80000|$nd<<25|$n1<<14|$n3<<9|$opf<<5|$n2,
                   $ref;
}
|
---|
1109 |
|
---|
# Post-process the generated assembly one line at a time and write the
# result to standard output.
foreach (split("\n",$code)) {
    # Replace every `...` span with the value of the enclosed Perl
    # expression.
    s/\`([^\`]*)\`/eval $1/ge;

    # Hand-encode VIS3 integer instructions; only when this line contained
    # none of them, try the fused multiply-add instructions instead.
    unless (s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/unvis3($1,$2,$3,$4)/ge) {
        s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/unfma($1,$2,$3,$4,$5)/ge;
    }

    print "$_\n";
}

# Buffered write errors only surface at close, so a failure here is fatal.
close STDOUT or die "error closing STDOUT: $!";
|
---|