1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # This module implements Poly1305 hash for SPARCv9, vanilla, as well
|
---|
18 | # as VIS3 and FMA extensions.
|
---|
19 | #
|
---|
20 | # May, August 2015
|
---|
21 | #
|
---|
22 | # Numbers are cycles per processed byte with poly1305_blocks alone.
|
---|
23 | #
|
---|
24 | #                   IALU(*)      FMA
|
---|
25 | #
|
---|
26 | # UltraSPARC III    12.3(**)
|
---|
27 | # SPARC T3          7.92
|
---|
28 | # SPARC T4          1.70(***)    6.55
|
---|
29 | # SPARC64 X         5.60         3.64
|
---|
30 | #
|
---|
31 | # (*) Comparison to compiler-generated code is really problematic,
|
---|
32 | # because latter's performance varies too much depending on too
|
---|
33 | # many variables. For example, one can measure from 5x to 15x
|
---|
34 | # improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
|
---|
35 | # unfair comparison, because compiler doesn't use VIS3, but
|
---|
36 | # given same initial conditions coefficient varies from 3x to 9x.
|
---|
37 | # (**) Pre-III performance should be even worse; floating-point
|
---|
38 | # performance for UltraSPARC I-IV on the other hand is reported
|
---|
39 | # to be 4.25 for hand-coded assembly, but they are just too old
|
---|
40 | # to care about.
|
---|
41 | # (***) Multi-process benchmark saturates at ~12.5x single-process
|
---|
42 | # result on 8-core processor, or ~21GBps per 2.85GHz socket.
|
---|
43 |
|
---|
# The last command-line argument names the output file.  Any arguments
# before it are left in @ARGV untouched.
my $output = pop;

# Redirect STDOUT to the output file exactly once.  When no file name is
# given, the generated code keeps going to the inherited STDOUT.  A file
# that cannot be opened is a hard error rather than a silently missing
# output.
if (defined $output && length $output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

# Symbolic names for the SPARC registers used by the integer code paths.
# They are interpolated into the assembly heredocs below.
#   %i0-%i5    : incoming arguments plus the two alignment shift counts
#   %l0-%l7    : key words r0..r3, the derived s1..s3, and hash word h4
#   %o0-%o5,%o7: hash words h0..h3 and temporaries t0..t2 (%o6 is skipped)
#   %g1-%g4    : product accumulators d0..d3
my ($ctx, $inp, $len, $padbit, $shl, $shr)   = map("%i$_", (0..5));
my ($r0, $r1, $r2, $r3, $s1, $s2, $s3, $h4)  = map("%l$_", (0..7));
my ($h0, $h1, $h2, $h3, $t0, $t1, $t2)       = map("%o$_", (0..5, 7));
my ($d0, $d1, $d2, $d3)                      = map("%g$_", (1..4));
54 |
|
---|
# Assembler preamble plus the baseline (32-bit multiply) code path.
# Perl interpolates the register aliases declared earlier in this file
# into the text below.
#
# poly1305_init:
#   - masks the first word of OPENSSL_sparcv9cap_P with
#     SPARCV9_FMADD|SPARCV9_VIS3 and branches to .Lpoly1305_init_fma when
#     the result is exactly SPARCV9_FMADD (FMA bit set, VIS3 bit clear);
#   - otherwise zeroes three 64-bit hash words at ctx+0/+8/+16 and
#     returns 0 if the key pointer is zero;
#   - loads the key little-endian (ASI 0x88) from the 8-byte-aligned
#     address, merging in a third word when the pointer is misaligned,
#     masks it with 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and stores
#     the two words at ctx+32 and ctx+40;
#   - when the VIS3 bit is set, writes the addresses of
#     poly1305_blocks_vis3 and poly1305_emit to [%i2] and [%i2+SIZE_T]
#     and returns 1; otherwise returns 0.
#
# poly1305_blocks:
#   - the block count is len shifted right by 4 (srln is presumably a
#     macro from sparc_arch.h - confirm); returns at once when it is 0;
#   - key and hash are read as 32-bit words, with the +4 word of each
#     64-bit slot treated as the less significant one;
#   - s1..s3 are r1..r3 + (r1..r3 >> 2);
#   - each iteration loads 16 input bytes little-endian, adds them and
#     padbit to the hash with carry, accumulates the umul products, then
#     adds (h4>>2)+(h4&~3) back into the low word and keeps h4&3
#     ("final reduction step");
#   - the five hash words are stored back to ctx+0..+16 on exit.
#
# NOTE(review): the instruction written directly after each branch,
# call or ret occupies the SPARC delay slot, so statement order in this
# text must not be rearranged.
55 | $code.=<<___;
|
---|
56 | #include "sparc_arch.h"
|
---|
57 |
|
---|
58 | #ifdef __arch64__
|
---|
59 | .register %g2,#scratch
|
---|
60 | .register %g3,#scratch
|
---|
61 | # define STPTR stx
|
---|
62 | # define SIZE_T 8
|
---|
63 | #else
|
---|
64 | # define STPTR st
|
---|
65 | # define SIZE_T 4
|
---|
66 | #endif
|
---|
67 | #define LOCALS (STACK_BIAS+STACK_FRAME)
|
---|
68 |
|
---|
69 | .section ".text",#alloc,#execinstr
|
---|
70 |
|
---|
71 | #ifdef __PIC__
|
---|
72 | SPARC_PIC_THUNK(%g1)
|
---|
73 | #endif
|
---|
74 |
|
---|
75 | .globl poly1305_init
|
---|
76 | .align 32
|
---|
77 | poly1305_init:
|
---|
78 | save %sp,-STACK_FRAME-16,%sp
|
---|
79 | nop
|
---|
80 |
|
---|
81 | SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
|
---|
82 | ld [%g1],%g1
|
---|
83 |
|
---|
84 | and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
|
---|
85 | cmp %g1,SPARCV9_FMADD
|
---|
86 | be .Lpoly1305_init_fma
|
---|
87 | nop
|
---|
88 |
|
---|
89 | stx %g0,[$ctx+0]
|
---|
90 | stx %g0,[$ctx+8] ! zero hash value
|
---|
91 | brz,pn $inp,.Lno_key
|
---|
92 | stx %g0,[$ctx+16]
|
---|
93 |
|
---|
94 | and $inp,7,$shr ! alignment factor
|
---|
95 | andn $inp,7,$inp
|
---|
96 | sll $shr,3,$shr ! *8
|
---|
97 | neg $shr,$shl
|
---|
98 |
|
---|
99 | sethi %hi(0x0ffffffc),$t0
|
---|
100 | set 8,$h1
|
---|
101 | or $t0,%lo(0x0ffffffc),$t0
|
---|
102 | set 16,$h2
|
---|
103 | sllx $t0,32,$t1
|
---|
104 | or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
|
---|
105 | or $t1,3,$t0 ! 0x0ffffffc0fffffff
|
---|
106 |
|
---|
107 | ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
|
---|
108 | brz,pt $shr,.Lkey_aligned
|
---|
109 | ldxa [$inp+$h1]0x88,$h1
|
---|
110 |
|
---|
111 | ldxa [$inp+$h2]0x88,$h2
|
---|
112 | srlx $h0,$shr,$h0
|
---|
113 | sllx $h1,$shl,$t2
|
---|
114 | srlx $h1,$shr,$h1
|
---|
115 | or $t2,$h0,$h0
|
---|
116 | sllx $h2,$shl,$h2
|
---|
117 | or $h2,$h1,$h1
|
---|
118 |
|
---|
119 | .Lkey_aligned:
|
---|
120 | and $t0,$h0,$h0
|
---|
121 | and $t1,$h1,$h1
|
---|
122 | stx $h0,[$ctx+32+0] ! store key
|
---|
123 | stx $h1,[$ctx+32+8]
|
---|
124 |
|
---|
125 | andcc %g1,SPARCV9_VIS3,%g0
|
---|
126 | be .Lno_key
|
---|
127 | nop
|
---|
128 |
|
---|
129 | 1: call .+8
|
---|
130 | add %o7,poly1305_blocks_vis3-1b,%o7
|
---|
131 |
|
---|
132 | add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
|
---|
133 | STPTR %o7,[%i2]
|
---|
134 | STPTR %o5,[%i2+SIZE_T]
|
---|
135 |
|
---|
136 | ret
|
---|
137 | restore %g0,1,%o0 ! return 1
|
---|
138 |
|
---|
139 | .Lno_key:
|
---|
140 | ret
|
---|
141 | restore %g0,%g0,%o0 ! return 0
|
---|
142 | .type poly1305_init,#function
|
---|
143 | .size poly1305_init,.-poly1305_init
|
---|
144 |
|
---|
145 | .globl poly1305_blocks
|
---|
146 | .align 32
|
---|
147 | poly1305_blocks:
|
---|
148 | save %sp,-STACK_FRAME,%sp
|
---|
149 | srln $len,4,$len
|
---|
150 |
|
---|
151 | brz,pn $len,.Lno_data
|
---|
152 | nop
|
---|
153 |
|
---|
154 | ld [$ctx+32+0],$r1 ! load key
|
---|
155 | ld [$ctx+32+4],$r0
|
---|
156 | ld [$ctx+32+8],$r3
|
---|
157 | ld [$ctx+32+12],$r2
|
---|
158 |
|
---|
159 | ld [$ctx+0],$h1 ! load hash value
|
---|
160 | ld [$ctx+4],$h0
|
---|
161 | ld [$ctx+8],$h3
|
---|
162 | ld [$ctx+12],$h2
|
---|
163 | ld [$ctx+16],$h4
|
---|
164 |
|
---|
165 | and $inp,7,$shr ! alignment factor
|
---|
166 | andn $inp,7,$inp
|
---|
167 | set 8,$d1
|
---|
168 | sll $shr,3,$shr ! *8
|
---|
169 | set 16,$d2
|
---|
170 | neg $shr,$shl
|
---|
171 |
|
---|
172 | srl $r1,2,$s1
|
---|
173 | srl $r2,2,$s2
|
---|
174 | add $r1,$s1,$s1
|
---|
175 | srl $r3,2,$s3
|
---|
176 | add $r2,$s2,$s2
|
---|
177 | add $r3,$s3,$s3
|
---|
178 |
|
---|
179 | .Loop:
|
---|
180 | ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
|
---|
181 | brz,pt $shr,.Linp_aligned
|
---|
182 | ldxa [$inp+$d1]0x88,$d1
|
---|
183 |
|
---|
184 | ldxa [$inp+$d2]0x88,$d2
|
---|
185 | srlx $d0,$shr,$d0
|
---|
186 | sllx $d1,$shl,$t1
|
---|
187 | srlx $d1,$shr,$d1
|
---|
188 | or $t1,$d0,$d0
|
---|
189 | sllx $d2,$shl,$d2
|
---|
190 | or $d2,$d1,$d1
|
---|
191 |
|
---|
192 | .Linp_aligned:
|
---|
193 | srlx $d0,32,$t0
|
---|
194 | addcc $d0,$h0,$h0 ! accumulate input
|
---|
195 | srlx $d1,32,$t1
|
---|
196 | addccc $t0,$h1,$h1
|
---|
197 | addccc $d1,$h2,$h2
|
---|
198 | addccc $t1,$h3,$h3
|
---|
199 | addc $padbit,$h4,$h4
|
---|
200 |
|
---|
201 | umul $r0,$h0,$d0
|
---|
202 | umul $r1,$h0,$d1
|
---|
203 | umul $r2,$h0,$d2
|
---|
204 | umul $r3,$h0,$d3
|
---|
205 | sub $len,1,$len
|
---|
206 | add $inp,16,$inp
|
---|
207 |
|
---|
208 | umul $s3,$h1,$t0
|
---|
209 | umul $r0,$h1,$t1
|
---|
210 | umul $r1,$h1,$t2
|
---|
211 | add $t0,$d0,$d0
|
---|
212 | add $t1,$d1,$d1
|
---|
213 | umul $r2,$h1,$t0
|
---|
214 | add $t2,$d2,$d2
|
---|
215 | add $t0,$d3,$d3
|
---|
216 |
|
---|
217 | umul $s2,$h2,$t1
|
---|
218 | umul $s3,$h2,$t2
|
---|
219 | umul $r0,$h2,$t0
|
---|
220 | add $t1,$d0,$d0
|
---|
221 | add $t2,$d1,$d1
|
---|
222 | umul $r1,$h2,$t1
|
---|
223 | add $t0,$d2,$d2
|
---|
224 | add $t1,$d3,$d3
|
---|
225 |
|
---|
226 | umul $s1,$h3,$t2
|
---|
227 | umul $s2,$h3,$t0
|
---|
228 | umul $s3,$h3,$t1
|
---|
229 | add $t2,$d0,$d0
|
---|
230 | add $t0,$d1,$d1
|
---|
231 | umul $r0,$h3,$t2
|
---|
232 | add $t1,$d2,$d2
|
---|
233 | add $t2,$d3,$d3
|
---|
234 |
|
---|
235 | umul $s1,$h4,$t0
|
---|
236 | umul $s2,$h4,$t1
|
---|
237 | umul $s3,$h4,$t2
|
---|
238 | umul $r0,$h4,$h4
|
---|
239 | add $t0,$d1,$d1
|
---|
240 | add $t1,$d2,$d2
|
---|
241 | srlx $d0,32,$h1
|
---|
242 | add $t2,$d3,$d3
|
---|
243 | srlx $d1,32,$h2
|
---|
244 |
|
---|
245 | addcc $d1,$h1,$h1
|
---|
246 | srlx $d2,32,$h3
|
---|
247 | set 8,$d1
|
---|
248 | addccc $d2,$h2,$h2
|
---|
249 | srlx $d3,32,$t0
|
---|
250 | set 16,$d2
|
---|
251 | addccc $d3,$h3,$h3
|
---|
252 | addc $t0,$h4,$h4
|
---|
253 |
|
---|
254 | srl $h4,2,$t0 ! final reduction step
|
---|
255 | andn $h4,3,$t1
|
---|
256 | and $h4,3,$h4
|
---|
257 | add $t1,$t0,$t0
|
---|
258 |
|
---|
259 | addcc $t0,$d0,$h0
|
---|
260 | addccc %g0,$h1,$h1
|
---|
261 | addccc %g0,$h2,$h2
|
---|
262 | addccc %g0,$h3,$h3
|
---|
263 | brnz,pt $len,.Loop
|
---|
264 | addc %g0,$h4,$h4
|
---|
265 |
|
---|
266 | st $h1,[$ctx+0] ! store hash value
|
---|
267 | st $h0,[$ctx+4]
|
---|
268 | st $h3,[$ctx+8]
|
---|
269 | st $h2,[$ctx+12]
|
---|
270 | st $h4,[$ctx+16]
|
---|
271 |
|
---|
272 | .Lno_data:
|
---|
273 | ret
|
---|
274 | restore
|
---|
275 | .type poly1305_blocks,#function
|
---|
276 | .size poly1305_blocks,.-poly1305_blocks
|
---|
277 | ___
|
---|
278 | ########################################################################
|
---|
279 | # VIS3 has umulxhi and addxc...
|
---|
# This scope emits poly1305_blocks_vis3, a 64-bit-limb variant of
# poly1305_blocks.  The symbol gets no .globl directive here;
# poly1305_init computes its address and stores it through %i2 when the
# SPARCV9_VIS3 bit is set.
#
# The upper-case aliases declared below name the same %o and %g
# registers as the outer lower-case ones, but with 64-bit roles:
#   $H0,$H1,$H2 : hash, loaded as two 64-bit words from ctx+0/+8 and one
#                 32-bit word from ctx+16
#   $R0,$R1     : key, two 64-bit words from ctx+32/+40
#   $S1         : $R1 + ($R1 >> 2)
#   $D0..$D2    : input words, then the three-word product
#   $T0,$T1     : temporaries
# The outer $r1/$r2 (%l1/%l2) are reused as the constant load offsets
# 8 and 16, and $inp/$len/$padbit/$shl/$shr keep their outer meaning.
#
# A zero block count exits through .Lno_data, the label defined inside
# poly1305_blocks.  Each iteration adds 16 little-endian input bytes and
# padbit to the hash, forms the product from mulx/umulxhi pairs, then
# adds (D2>>2)+(D2&~3) into the low word and keeps D2&3 as the new top
# word ("final reduction step").
280 | {
|
---|
281 | my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
|
---|
282 | my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
|
---|
283 |
|
---|
284 | $code.=<<___;
|
---|
285 | .align 32
|
---|
286 | poly1305_blocks_vis3:
|
---|
287 | save %sp,-STACK_FRAME,%sp
|
---|
288 | srln $len,4,$len
|
---|
289 |
|
---|
290 | brz,pn $len,.Lno_data
|
---|
291 | nop
|
---|
292 |
|
---|
293 | ldx [$ctx+32+0],$R0 ! load key
|
---|
294 | ldx [$ctx+32+8],$R1
|
---|
295 |
|
---|
296 | ldx [$ctx+0],$H0 ! load hash value
|
---|
297 | ldx [$ctx+8],$H1
|
---|
298 | ld [$ctx+16],$H2
|
---|
299 |
|
---|
300 | and $inp,7,$shr ! alignment factor
|
---|
301 | andn $inp,7,$inp
|
---|
302 | set 8,$r1
|
---|
303 | sll $shr,3,$shr ! *8
|
---|
304 | set 16,$r2
|
---|
305 | neg $shr,$shl
|
---|
306 |
|
---|
307 | srlx $R1,2,$S1
|
---|
308 | b .Loop_vis3
|
---|
309 | add $R1,$S1,$S1
|
---|
310 |
|
---|
311 | .Loop_vis3:
|
---|
312 | ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
|
---|
313 | brz,pt $shr,.Linp_aligned_vis3
|
---|
314 | ldxa [$inp+$r1]0x88,$D1
|
---|
315 |
|
---|
316 | ldxa [$inp+$r2]0x88,$D2
|
---|
317 | srlx $D0,$shr,$D0
|
---|
318 | sllx $D1,$shl,$T1
|
---|
319 | srlx $D1,$shr,$D1
|
---|
320 | or $T1,$D0,$D0
|
---|
321 | sllx $D2,$shl,$D2
|
---|
322 | or $D2,$D1,$D1
|
---|
323 |
|
---|
324 | .Linp_aligned_vis3:
|
---|
325 | addcc $D0,$H0,$H0 ! accumulate input
|
---|
326 | sub $len,1,$len
|
---|
327 | addxccc $D1,$H1,$H1
|
---|
328 | add $inp,16,$inp
|
---|
329 |
|
---|
330 | mulx $R0,$H0,$D0 ! r0*h0
|
---|
331 | addxc $padbit,$H2,$H2
|
---|
332 | umulxhi $R0,$H0,$D1
|
---|
333 | mulx $S1,$H1,$T0 ! s1*h1
|
---|
334 | umulxhi $S1,$H1,$T1
|
---|
335 | addcc $T0,$D0,$D0
|
---|
336 | mulx $R1,$H0,$T0 ! r1*h0
|
---|
337 | addxc $T1,$D1,$D1
|
---|
338 | umulxhi $R1,$H0,$D2
|
---|
339 | addcc $T0,$D1,$D1
|
---|
340 | mulx $R0,$H1,$T0 ! r0*h1
|
---|
341 | addxc %g0,$D2,$D2
|
---|
342 | umulxhi $R0,$H1,$T1
|
---|
343 | addcc $T0,$D1,$D1
|
---|
344 | mulx $S1,$H2,$T0 ! s1*h2
|
---|
345 | addxc $T1,$D2,$D2
|
---|
346 | mulx $R0,$H2,$T1 ! r0*h2
|
---|
347 | addcc $T0,$D1,$D1
|
---|
348 | addxc $T1,$D2,$D2
|
---|
349 |
|
---|
350 | srlx $D2,2,$T0 ! final reduction step
|
---|
351 | andn $D2,3,$T1
|
---|
352 | and $D2,3,$H2
|
---|
353 | add $T1,$T0,$T0
|
---|
354 |
|
---|
355 | addcc $T0,$D0,$H0
|
---|
356 | addxccc %g0,$D1,$H1
|
---|
357 | brnz,pt $len,.Loop_vis3
|
---|
358 | addxc %g0,$H2,$H2
|
---|
359 |
|
---|
360 | stx $H0,[$ctx+0] ! store hash value
|
---|
361 | stx $H1,[$ctx+8]
|
---|
362 | st $H2,[$ctx+16]
|
---|
363 |
|
---|
364 | ret
|
---|
365 | restore
|
---|
366 | .type poly1305_blocks_vis3,#function
|
---|
367 | .size poly1305_blocks_vis3,.-poly1305_blocks_vis3
|
---|
368 | ___
|
---|
369 | }
|
---|
# poly1305_emit: final reduction, nonce addition and tag output for the
# integer code paths.
#
# $mac and $nonce are new names for $inp and $len, i.e. the second and
# third incoming argument registers (%i1 and %i2).
#
# The emitted routine:
#   - loads five 32-bit hash words from ctx+0..+16, taking the +4 word of
#     each 64-bit slot as the less significant one;
#   - computes hash+5 with carry propagation into $r0..$r3/$h4 and tests
#     bit 2 of the resulting top word; when it is set, movnz replaces
#     $h0..$h3 with the hash+5 words;
#   - loads four 32-bit words from $nonce (each load is placed after the
#     movnz that consumes the same $rN register) and adds them with carry;
#   - writes the 16 result bytes to $mac with byte stores, least
#     significant byte of each word first.
370 | my ($mac,$nonce) = ($inp,$len);
|
---|
371 |
|
---|
372 | $code.=<<___;
|
---|
373 | .globl poly1305_emit
|
---|
374 | .align 32
|
---|
375 | poly1305_emit:
|
---|
376 | save %sp,-STACK_FRAME,%sp
|
---|
377 |
|
---|
378 | ld [$ctx+0],$h1 ! load hash value
|
---|
379 | ld [$ctx+4],$h0
|
---|
380 | ld [$ctx+8],$h3
|
---|
381 | ld [$ctx+12],$h2
|
---|
382 | ld [$ctx+16],$h4
|
---|
383 |
|
---|
384 | addcc $h0,5,$r0 ! compare to modulus
|
---|
385 | addccc $h1,0,$r1
|
---|
386 | addccc $h2,0,$r2
|
---|
387 | addccc $h3,0,$r3
|
---|
388 | addc $h4,0,$h4
|
---|
389 | andcc $h4,4,%g0 ! did it carry/borrow?
|
---|
390 |
|
---|
391 | movnz %icc,$r0,$h0
|
---|
392 | ld [$nonce+0],$r0 ! load nonce
|
---|
393 | movnz %icc,$r1,$h1
|
---|
394 | ld [$nonce+4],$r1
|
---|
395 | movnz %icc,$r2,$h2
|
---|
396 | ld [$nonce+8],$r2
|
---|
397 | movnz %icc,$r3,$h3
|
---|
398 | ld [$nonce+12],$r3
|
---|
399 |
|
---|
400 | addcc $r0,$h0,$h0 ! accumulate nonce
|
---|
401 | addccc $r1,$h1,$h1
|
---|
402 | addccc $r2,$h2,$h2
|
---|
403 | addc $r3,$h3,$h3
|
---|
404 |
|
---|
405 | srl $h0,8,$r0
|
---|
406 | stb $h0,[$mac+0] ! store little-endian result
|
---|
407 | srl $h0,16,$r1
|
---|
408 | stb $r0,[$mac+1]
|
---|
409 | srl $h0,24,$r2
|
---|
410 | stb $r1,[$mac+2]
|
---|
411 | stb $r2,[$mac+3]
|
---|
412 |
|
---|
413 | srl $h1,8,$r0
|
---|
414 | stb $h1,[$mac+4]
|
---|
415 | srl $h1,16,$r1
|
---|
416 | stb $r0,[$mac+5]
|
---|
417 | srl $h1,24,$r2
|
---|
418 | stb $r1,[$mac+6]
|
---|
419 | stb $r2,[$mac+7]
|
---|
420 |
|
---|
421 | srl $h2,8,$r0
|
---|
422 | stb $h2,[$mac+8]
|
---|
423 | srl $h2,16,$r1
|
---|
424 | stb $r0,[$mac+9]
|
---|
425 | srl $h2,24,$r2
|
---|
426 | stb $r1,[$mac+10]
|
---|
427 | stb $r2,[$mac+11]
|
---|
428 |
|
---|
429 | srl $h3,8,$r0
|
---|
430 | stb $h3,[$mac+12]
|
---|
431 | srl $h3,16,$r1
|
---|
432 | stb $r0,[$mac+13]
|
---|
433 | srl $h3,24,$r2
|
---|
434 | stb $r1,[$mac+14]
|
---|
435 | stb $r2,[$mac+15]
|
---|
436 |
|
---|
437 | ret
|
---|
438 | restore
|
---|
439 | .type poly1305_emit,#function
|
---|
440 | .size poly1305_emit,.-poly1305_emit
|
---|
441 | ___
|
---|
442 |
|
---|
443 | {
|
---|
444 | my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
|
---|
445 | my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
|
---|
446 | my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
|
---|
447 | my $i2=$step;
|
---|
448 |
|
---|
449 | my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
|
---|
450 | $two0,$two32,$two64,$two96,$two130,$five_two130,
|
---|
451 | $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
|
---|
452 | $s2lo,$s2hi,$s3lo,$s3hi,
|
---|
453 | $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
|
---|
454 | # borrowings
|
---|
455 | my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
|
---|
456 | my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
|
---|
457 | my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
|
---|
458 |
|
---|
459 | $code.=<<___;
|
---|
460 | .align 32
|
---|
461 | poly1305_init_fma:
|
---|
462 | save %sp,-STACK_FRAME-16,%sp
|
---|
463 | nop
|
---|
464 |
|
---|
465 | .Lpoly1305_init_fma:
|
---|
466 | 1: call .+8
|
---|
467 | add %o7,.Lconsts_fma-1b,%o7
|
---|
468 |
|
---|
469 | ldd [%o7+8*0],$two0 ! load constants
|
---|
470 | ldd [%o7+8*1],$two32
|
---|
471 | ldd [%o7+8*2],$two64
|
---|
472 | ldd [%o7+8*3],$two96
|
---|
473 | ldd [%o7+8*5],$five_two130
|
---|
474 |
|
---|
475 | std $two0,[$ctx+8*0] ! initial hash value, biased 0
|
---|
476 | std $two32,[$ctx+8*1]
|
---|
477 | std $two64,[$ctx+8*2]
|
---|
478 | std $two96,[$ctx+8*3]
|
---|
479 |
|
---|
480 | brz,pn $inp,.Lno_key_fma
|
---|
481 | nop
|
---|
482 |
|
---|
483 | stx %fsr,[%sp+LOCALS] ! save original %fsr
|
---|
484 | ldx [%o7+8*6],%fsr ! load new %fsr
|
---|
485 |
|
---|
486 | std $two0,[$ctx+8*4] ! key "template"
|
---|
487 | std $two32,[$ctx+8*5]
|
---|
488 | std $two64,[$ctx+8*6]
|
---|
489 | std $two96,[$ctx+8*7]
|
---|
490 |
|
---|
491 | and $inp,7,$shr
|
---|
492 | andn $inp,7,$inp ! align pointer
|
---|
493 | mov 8,$i1
|
---|
494 | sll $shr,3,$shr
|
---|
495 | mov 16,$i2
|
---|
496 | neg $shr,$shl
|
---|
497 |
|
---|
498 | ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
|
---|
499 | ldxa [$inp+$i1]0x88,$in2
|
---|
500 |
|
---|
501 | brz $shr,.Lkey_aligned_fma
|
---|
502 | sethi %hi(0xf0000000),$i1 ! 0xf0000000
|
---|
503 |
|
---|
504 | ldxa [$inp+$i2]0x88,$in4
|
---|
505 |
|
---|
506 | srlx $in0,$shr,$in0 ! align data
|
---|
507 | sllx $in2,$shl,$in1
|
---|
508 | srlx $in2,$shr,$in2
|
---|
509 | or $in1,$in0,$in0
|
---|
510 | sllx $in4,$shl,$in3
|
---|
511 | or $in3,$in2,$in2
|
---|
512 |
|
---|
513 | .Lkey_aligned_fma:
|
---|
514 | or $i1,3,$i2 ! 0xf0000003
|
---|
515 | srlx $in0,32,$in1
|
---|
516 | andn $in0,$i1,$in0 ! &=0x0fffffff
|
---|
517 | andn $in1,$i2,$in1 ! &=0x0ffffffc
|
---|
518 | srlx $in2,32,$in3
|
---|
519 | andn $in2,$i2,$in2
|
---|
520 | andn $in3,$i2,$in3
|
---|
521 |
|
---|
522 | st $in0,[$ctx+`8*4+4`] ! fill "template"
|
---|
523 | st $in1,[$ctx+`8*5+4`]
|
---|
524 | st $in2,[$ctx+`8*6+4`]
|
---|
525 | st $in3,[$ctx+`8*7+4`]
|
---|
526 |
|
---|
527 | ldd [$ctx+8*4],$h0lo ! load [biased] key
|
---|
528 | ldd [$ctx+8*5],$h1lo
|
---|
529 | ldd [$ctx+8*6],$h2lo
|
---|
530 | ldd [$ctx+8*7],$h3lo
|
---|
531 |
|
---|
532 | fsubd $h0lo,$two0, $h0lo ! r0
|
---|
533 | ldd [%o7+8*7],$two0 ! more constants
|
---|
534 | fsubd $h1lo,$two32,$h1lo ! r1
|
---|
535 | ldd [%o7+8*8],$two32
|
---|
536 | fsubd $h2lo,$two64,$h2lo ! r2
|
---|
537 | ldd [%o7+8*9],$two64
|
---|
538 | fsubd $h3lo,$two96,$h3lo ! r3
|
---|
539 | ldd [%o7+8*10],$two96
|
---|
540 |
|
---|
541 | fmuld $five_two130,$h1lo,$s1lo ! s1
|
---|
542 | fmuld $five_two130,$h2lo,$s2lo ! s2
|
---|
543 | fmuld $five_two130,$h3lo,$s3lo ! s3
|
---|
544 |
|
---|
545 | faddd $h0lo,$two0, $h0hi
|
---|
546 | faddd $h1lo,$two32,$h1hi
|
---|
547 | faddd $h2lo,$two64,$h2hi
|
---|
548 | faddd $h3lo,$two96,$h3hi
|
---|
549 |
|
---|
550 | fsubd $h0hi,$two0, $h0hi
|
---|
551 | ldd [%o7+8*11],$two0 ! more constants
|
---|
552 | fsubd $h1hi,$two32,$h1hi
|
---|
553 | ldd [%o7+8*12],$two32
|
---|
554 | fsubd $h2hi,$two64,$h2hi
|
---|
555 | ldd [%o7+8*13],$two64
|
---|
556 | fsubd $h3hi,$two96,$h3hi
|
---|
557 |
|
---|
558 | fsubd $h0lo,$h0hi,$h0lo
|
---|
559 | std $h0hi,[$ctx+8*5] ! r0hi
|
---|
560 | fsubd $h1lo,$h1hi,$h1lo
|
---|
561 | std $h1hi,[$ctx+8*7] ! r1hi
|
---|
562 | fsubd $h2lo,$h2hi,$h2lo
|
---|
563 | std $h2hi,[$ctx+8*9] ! r2hi
|
---|
564 | fsubd $h3lo,$h3hi,$h3lo
|
---|
565 | std $h3hi,[$ctx+8*11] ! r3hi
|
---|
566 |
|
---|
567 | faddd $s1lo,$two0, $s1hi
|
---|
568 | faddd $s2lo,$two32,$s2hi
|
---|
569 | faddd $s3lo,$two64,$s3hi
|
---|
570 |
|
---|
571 | fsubd $s1hi,$two0, $s1hi
|
---|
572 | fsubd $s2hi,$two32,$s2hi
|
---|
573 | fsubd $s3hi,$two64,$s3hi
|
---|
574 |
|
---|
575 | fsubd $s1lo,$s1hi,$s1lo
|
---|
576 | fsubd $s2lo,$s2hi,$s2lo
|
---|
577 | fsubd $s3lo,$s3hi,$s3lo
|
---|
578 |
|
---|
579 | ldx [%sp+LOCALS],%fsr ! restore %fsr
|
---|
580 |
|
---|
581 | std $h0lo,[$ctx+8*4] ! r0lo
|
---|
582 | std $h1lo,[$ctx+8*6] ! r1lo
|
---|
583 | std $h2lo,[$ctx+8*8] ! r2lo
|
---|
584 | std $h3lo,[$ctx+8*10] ! r3lo
|
---|
585 |
|
---|
586 | std $s1hi,[$ctx+8*13]
|
---|
587 | std $s2hi,[$ctx+8*15]
|
---|
588 | std $s3hi,[$ctx+8*17]
|
---|
589 |
|
---|
590 | std $s1lo,[$ctx+8*12]
|
---|
591 | std $s2lo,[$ctx+8*14]
|
---|
592 | std $s3lo,[$ctx+8*16]
|
---|
593 |
|
---|
594 | add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
|
---|
595 | add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
|
---|
596 | STPTR %o0,[%i2]
|
---|
597 | STPTR %o1,[%i2+SIZE_T]
|
---|
598 |
|
---|
599 | ret
|
---|
600 | restore %g0,1,%o0 ! return 1
|
---|
601 |
|
---|
602 | .Lno_key_fma:
|
---|
603 | ret
|
---|
604 | restore %g0,%g0,%o0 ! return 0
|
---|
605 | .type poly1305_init_fma,#function
|
---|
606 | .size poly1305_init_fma,.-poly1305_init_fma
|
---|
607 |
|
---|
608 | .align 32
|
---|
609 | poly1305_blocks_fma:
|
---|
610 | save %sp,-STACK_FRAME-48,%sp
|
---|
611 | srln $len,4,$len
|
---|
612 |
|
---|
613 | brz,pn $len,.Labort
|
---|
614 | sub $len,1,$len
|
---|
615 |
|
---|
616 | 1: call .+8
|
---|
617 | add %o7,.Lconsts_fma-1b,%o7
|
---|
618 |
|
---|
619 | ldd [%o7+8*0],$two0 ! load constants
|
---|
620 | ldd [%o7+8*1],$two32
|
---|
621 | ldd [%o7+8*2],$two64
|
---|
622 | ldd [%o7+8*3],$two96
|
---|
623 | ldd [%o7+8*4],$two130
|
---|
624 | ldd [%o7+8*5],$five_two130
|
---|
625 |
|
---|
626 | ldd [$ctx+8*0],$h0lo ! load [biased] hash value
|
---|
627 | ldd [$ctx+8*1],$h1lo
|
---|
628 | ldd [$ctx+8*2],$h2lo
|
---|
629 | ldd [$ctx+8*3],$h3lo
|
---|
630 |
|
---|
631 | std $two0,[%sp+LOCALS+8*0] ! input "template"
|
---|
632 | sethi %hi((1023+52+96)<<20),$in3
|
---|
633 | std $two32,[%sp+LOCALS+8*1]
|
---|
634 | or $padbit,$in3,$in3
|
---|
635 | std $two64,[%sp+LOCALS+8*2]
|
---|
636 | st $in3,[%sp+LOCALS+8*3]
|
---|
637 |
|
---|
638 | and $inp,7,$shr
|
---|
639 | andn $inp,7,$inp ! align pointer
|
---|
640 | mov 8,$i1
|
---|
641 | sll $shr,3,$shr
|
---|
642 | mov 16,$step
|
---|
643 | neg $shr,$shl
|
---|
644 |
|
---|
645 | ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
|
---|
646 | brz $shr,.Linp_aligned_fma
|
---|
647 | ldxa [$inp+$i1]0x88,$in2
|
---|
648 |
|
---|
649 | ldxa [$inp+$step]0x88,$in4
|
---|
650 | add $inp,8,$inp
|
---|
651 |
|
---|
652 | srlx $in0,$shr,$in0 ! align data
|
---|
653 | sllx $in2,$shl,$in1
|
---|
654 | srlx $in2,$shr,$in2
|
---|
655 | or $in1,$in0,$in0
|
---|
656 | sllx $in4,$shl,$in3
|
---|
657 | srlx $in4,$shr,$in4 ! pre-shift
|
---|
658 | or $in3,$in2,$in2
|
---|
659 |
|
---|
660 | .Linp_aligned_fma:
|
---|
661 | srlx $in0,32,$in1
|
---|
662 | movrz $len,0,$step
|
---|
663 | srlx $in2,32,$in3
|
---|
664 | add $step,$inp,$inp ! conditional advance
|
---|
665 |
|
---|
666 | st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
|
---|
667 | st $in1,[%sp+LOCALS+8*1+4]
|
---|
668 | st $in2,[%sp+LOCALS+8*2+4]
|
---|
669 | st $in3,[%sp+LOCALS+8*3+4]
|
---|
670 |
|
---|
671 | ldd [$ctx+8*4],$r0lo ! load key
|
---|
672 | ldd [$ctx+8*5],$r0hi
|
---|
673 | ldd [$ctx+8*6],$r1lo
|
---|
674 | ldd [$ctx+8*7],$r1hi
|
---|
675 | ldd [$ctx+8*8],$r2lo
|
---|
676 | ldd [$ctx+8*9],$r2hi
|
---|
677 | ldd [$ctx+8*10],$r3lo
|
---|
678 | ldd [$ctx+8*11],$r3hi
|
---|
679 | ldd [$ctx+8*12],$s1lo
|
---|
680 | ldd [$ctx+8*13],$s1hi
|
---|
681 | ldd [$ctx+8*14],$s2lo
|
---|
682 | ldd [$ctx+8*15],$s2hi
|
---|
683 | ldd [$ctx+8*16],$s3lo
|
---|
684 | ldd [$ctx+8*17],$s3hi
|
---|
685 |
|
---|
686 | stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
|
---|
687 | ldx [%o7+8*6],%fsr ! load new %fsr
|
---|
688 |
|
---|
689 | subcc $len,1,$len
|
---|
690 | movrz $len,0,$step
|
---|
691 |
|
---|
692 | ldd [%sp+LOCALS+8*0],$x0 ! load biased input
|
---|
693 | ldd [%sp+LOCALS+8*1],$x1
|
---|
694 | ldd [%sp+LOCALS+8*2],$x2
|
---|
695 | ldd [%sp+LOCALS+8*3],$x3
|
---|
696 |
|
---|
697 | fsubd $h0lo,$two0, $h0lo ! de-bias hash value
|
---|
698 | fsubd $h1lo,$two32,$h1lo
|
---|
699 | ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
|
---|
700 | fsubd $h2lo,$two64,$h2lo
|
---|
701 | fsubd $h3lo,$two96,$h3lo
|
---|
702 | ldxa [$inp+$i1]0x88,$in2
|
---|
703 |
|
---|
704 | fsubd $x0,$two0, $x0 ! de-bias input
|
---|
705 | fsubd $x1,$two32,$x1
|
---|
706 | fsubd $x2,$two64,$x2
|
---|
707 | fsubd $x3,$two96,$x3
|
---|
708 |
|
---|
709 | brz $shr,.Linp_aligned_fma2
|
---|
710 | add $step,$inp,$inp ! conditional advance
|
---|
711 |
|
---|
712 | sllx $in0,$shl,$in1 ! align data
|
---|
713 | srlx $in0,$shr,$in3
|
---|
714 | or $in1,$in4,$in0
|
---|
715 | sllx $in2,$shl,$in1
|
---|
716 | srlx $in2,$shr,$in4 ! pre-shift
|
---|
717 | or $in3,$in1,$in2
|
---|
718 | .Linp_aligned_fma2:
|
---|
719 | srlx $in0,32,$in1
|
---|
720 | srlx $in2,32,$in3
|
---|
721 |
|
---|
722 | faddd $h0lo,$x0,$x0 ! accumulate input
|
---|
723 | stw $in0,[%sp+LOCALS+8*0+4]
|
---|
724 | faddd $h1lo,$x1,$x1
|
---|
725 | stw $in1,[%sp+LOCALS+8*1+4]
|
---|
726 | faddd $h2lo,$x2,$x2
|
---|
727 | stw $in2,[%sp+LOCALS+8*2+4]
|
---|
728 | faddd $h3lo,$x3,$x3
|
---|
729 | stw $in3,[%sp+LOCALS+8*3+4]
|
---|
730 |
|
---|
731 | b .Lentry_fma
|
---|
732 | nop
|
---|
733 |
|
---|
734 | .align 16
|
---|
735 | .Loop_fma:
|
---|
736 | ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
|
---|
737 | ldxa [$inp+$i1]0x88,$in2
|
---|
738 | movrz $len,0,$step
|
---|
739 |
|
---|
740 | faddd $y0,$h0lo,$h0lo ! accumulate input
|
---|
741 | faddd $y1,$h0hi,$h0hi
|
---|
742 | faddd $y2,$h2lo,$h2lo
|
---|
743 | faddd $y3,$h2hi,$h2hi
|
---|
744 |
|
---|
745 | brz,pn $shr,.Linp_aligned_fma3
|
---|
746 | add $step,$inp,$inp ! conditional advance
|
---|
747 |
|
---|
748 | sllx $in0,$shl,$in1 ! align data
|
---|
749 | srlx $in0,$shr,$in3
|
---|
750 | or $in1,$in4,$in0
|
---|
751 | sllx $in2,$shl,$in1
|
---|
752 | srlx $in2,$shr,$in4 ! pre-shift
|
---|
753 | or $in3,$in1,$in2
|
---|
754 |
|
---|
755 | .Linp_aligned_fma3:
|
---|
756 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
|
---|
757 | faddd $two64,$h1lo,$c1lo
|
---|
758 | srlx $in0,32,$in1
|
---|
759 | faddd $two64,$h1hi,$c1hi
|
---|
760 | srlx $in2,32,$in3
|
---|
761 | faddd $two130,$h3lo,$c3lo
|
---|
762 | st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
|
---|
763 | faddd $two130,$h3hi,$c3hi
|
---|
764 | st $in1,[%sp+LOCALS+8*1+4]
|
---|
765 | faddd $two32,$h0lo,$c0lo
|
---|
766 | st $in2,[%sp+LOCALS+8*2+4]
|
---|
767 | faddd $two32,$h0hi,$c0hi
|
---|
768 | st $in3,[%sp+LOCALS+8*3+4]
|
---|
769 | faddd $two96,$h2lo,$c2lo
|
---|
770 | faddd $two96,$h2hi,$c2hi
|
---|
771 |
|
---|
772 | fsubd $c1lo,$two64,$c1lo
|
---|
773 | fsubd $c1hi,$two64,$c1hi
|
---|
774 | fsubd $c3lo,$two130,$c3lo
|
---|
775 | fsubd $c3hi,$two130,$c3hi
|
---|
776 | fsubd $c0lo,$two32,$c0lo
|
---|
777 | fsubd $c0hi,$two32,$c0hi
|
---|
778 | fsubd $c2lo,$two96,$c2lo
|
---|
779 | fsubd $c2hi,$two96,$c2hi
|
---|
780 |
|
---|
781 | fsubd $h1lo,$c1lo,$h1lo
|
---|
782 | fsubd $h1hi,$c1hi,$h1hi
|
---|
783 | fsubd $h3lo,$c3lo,$h3lo
|
---|
784 | fsubd $h3hi,$c3hi,$h3hi
|
---|
785 | fsubd $h2lo,$c2lo,$h2lo
|
---|
786 | fsubd $h2hi,$c2hi,$h2hi
|
---|
787 | fsubd $h0lo,$c0lo,$h0lo
|
---|
788 | fsubd $h0hi,$c0hi,$h0hi
|
---|
789 |
|
---|
790 | faddd $h1lo,$c0lo,$h1lo
|
---|
791 | faddd $h1hi,$c0hi,$h1hi
|
---|
792 | faddd $h3lo,$c2lo,$h3lo
|
---|
793 | faddd $h3hi,$c2hi,$h3hi
|
---|
794 | faddd $h2lo,$c1lo,$h2lo
|
---|
795 | faddd $h2hi,$c1hi,$h2hi
|
---|
796 | fmaddd $five_two130,$c3lo,$h0lo,$h0lo
|
---|
797 | fmaddd $five_two130,$c3hi,$h0hi,$h0hi
|
---|
798 |
|
---|
799 | faddd $h1lo,$h1hi,$x1
|
---|
800 | ldd [$ctx+8*12],$s1lo ! reload constants
|
---|
801 | faddd $h3lo,$h3hi,$x3
|
---|
802 | ldd [$ctx+8*13],$s1hi
|
---|
803 | faddd $h2lo,$h2hi,$x2
|
---|
804 | ldd [$ctx+8*10],$r3lo
|
---|
805 | faddd $h0lo,$h0hi,$x0
|
---|
806 | ldd [$ctx+8*11],$r3hi
|
---|
807 |
|
---|
808 | .Lentry_fma:
|
---|
809 | fmuld $x1,$s3lo,$h0lo
|
---|
810 | fmuld $x1,$s3hi,$h0hi
|
---|
811 | fmuld $x1,$r1lo,$h2lo
|
---|
812 | fmuld $x1,$r1hi,$h2hi
|
---|
813 | fmuld $x1,$r0lo,$h1lo
|
---|
814 | fmuld $x1,$r0hi,$h1hi
|
---|
815 | fmuld $x1,$r2lo,$h3lo
|
---|
816 | fmuld $x1,$r2hi,$h3hi
|
---|
817 |
|
---|
818 | fmaddd $x3,$s1lo,$h0lo,$h0lo
|
---|
819 | fmaddd $x3,$s1hi,$h0hi,$h0hi
|
---|
820 | fmaddd $x3,$s3lo,$h2lo,$h2lo
|
---|
821 | fmaddd $x3,$s3hi,$h2hi,$h2hi
|
---|
822 | fmaddd $x3,$s2lo,$h1lo,$h1lo
|
---|
823 | fmaddd $x3,$s2hi,$h1hi,$h1hi
|
---|
824 | fmaddd $x3,$r0lo,$h3lo,$h3lo
|
---|
825 | fmaddd $x3,$r0hi,$h3hi,$h3hi
|
---|
826 |
|
---|
827 | fmaddd $x2,$s2lo,$h0lo,$h0lo
|
---|
828 | fmaddd $x2,$s2hi,$h0hi,$h0hi
|
---|
829 | fmaddd $x2,$r0lo,$h2lo,$h2lo
|
---|
830 | fmaddd $x2,$r0hi,$h2hi,$h2hi
|
---|
831 | fmaddd $x2,$s3lo,$h1lo,$h1lo
|
---|
832 | ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
|
---|
833 | fmaddd $x2,$s3hi,$h1hi,$h1hi
|
---|
834 | ldd [%sp+LOCALS+8*1],$y1
|
---|
835 | fmaddd $x2,$r1lo,$h3lo,$h3lo
|
---|
836 | ldd [%sp+LOCALS+8*2],$y2
|
---|
837 | fmaddd $x2,$r1hi,$h3hi,$h3hi
|
---|
838 | ldd [%sp+LOCALS+8*3],$y3
|
---|
839 |
|
---|
840 | fmaddd $x0,$r0lo,$h0lo,$h0lo
|
---|
841 | fsubd $y0,$two0, $y0 ! de-bias input
|
---|
842 | fmaddd $x0,$r0hi,$h0hi,$h0hi
|
---|
843 | fsubd $y1,$two32,$y1
|
---|
844 | fmaddd $x0,$r2lo,$h2lo,$h2lo
|
---|
845 | fsubd $y2,$two64,$y2
|
---|
846 | fmaddd $x0,$r2hi,$h2hi,$h2hi
|
---|
847 | fsubd $y3,$two96,$y3
|
---|
848 | fmaddd $x0,$r1lo,$h1lo,$h1lo
|
---|
849 | fmaddd $x0,$r1hi,$h1hi,$h1hi
|
---|
850 | fmaddd $x0,$r3lo,$h3lo,$h3lo
|
---|
851 | fmaddd $x0,$r3hi,$h3hi,$h3hi
|
---|
852 |
|
---|
853 | bcc SIZE_T_CC,.Loop_fma
|
---|
854 | subcc $len,1,$len
|
---|
855 |
|
---|
856 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
|
---|
857 | faddd $h0lo,$two32,$c0lo
|
---|
858 | faddd $h0hi,$two32,$c0hi
|
---|
859 | faddd $h2lo,$two96,$c2lo
|
---|
860 | faddd $h2hi,$two96,$c2hi
|
---|
861 | faddd $h1lo,$two64,$c1lo
|
---|
862 | faddd $h1hi,$two64,$c1hi
|
---|
863 | faddd $h3lo,$two130,$c3lo
|
---|
864 | faddd $h3hi,$two130,$c3hi
|
---|
865 |
|
---|
866 | fsubd $c0lo,$two32,$c0lo
|
---|
867 | fsubd $c0hi,$two32,$c0hi
|
---|
868 | fsubd $c2lo,$two96,$c2lo
|
---|
869 | fsubd $c2hi,$two96,$c2hi
|
---|
870 | fsubd $c1lo,$two64,$c1lo
|
---|
871 | fsubd $c1hi,$two64,$c1hi
|
---|
872 | fsubd $c3lo,$two130,$c3lo
|
---|
873 | fsubd $c3hi,$two130,$c3hi
|
---|
874 |
|
---|
875 | fsubd $h1lo,$c1lo,$h1lo
|
---|
876 | fsubd $h1hi,$c1hi,$h1hi
|
---|
877 | fsubd $h3lo,$c3lo,$h3lo
|
---|
878 | fsubd $h3hi,$c3hi,$h3hi
|
---|
879 | fsubd $h2lo,$c2lo,$h2lo
|
---|
880 | fsubd $h2hi,$c2hi,$h2hi
|
---|
881 | fsubd $h0lo,$c0lo,$h0lo
|
---|
882 | fsubd $h0hi,$c0hi,$h0hi
|
---|
883 |
|
---|
884 | faddd $h1lo,$c0lo,$h1lo
|
---|
885 | faddd $h1hi,$c0hi,$h1hi
|
---|
886 | faddd $h3lo,$c2lo,$h3lo
|
---|
887 | faddd $h3hi,$c2hi,$h3hi
|
---|
888 | faddd $h2lo,$c1lo,$h2lo
|
---|
889 | faddd $h2hi,$c1hi,$h2hi
|
---|
890 | fmaddd $five_two130,$c3lo,$h0lo,$h0lo
|
---|
891 | fmaddd $five_two130,$c3hi,$h0hi,$h0hi
|
---|
892 |
|
---|
893 | faddd $h1lo,$h1hi,$x1
|
---|
894 | faddd $h3lo,$h3hi,$x3
|
---|
895 | faddd $h2lo,$h2hi,$x2
|
---|
896 | faddd $h0lo,$h0hi,$x0
|
---|
897 |
|
---|
898 | faddd $x1,$two32,$x1 ! bias
|
---|
899 | faddd $x3,$two96,$x3
|
---|
900 | faddd $x2,$two64,$x2
|
---|
901 | faddd $x0,$two0, $x0
|
---|
902 |
|
---|
903 | ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
|
---|
904 |
|
---|
905 | std $x1,[$ctx+8*1] ! store [biased] hash value
|
---|
906 | std $x3,[$ctx+8*3]
|
---|
907 | std $x2,[$ctx+8*2]
|
---|
908 | std $x0,[$ctx+8*0]
|
---|
909 |
|
---|
910 | .Labort:
|
---|
911 | ret
|
---|
912 | restore
|
---|
913 | .type poly1305_blocks_fma,#function
|
---|
914 | .size poly1305_blocks_fma,.-poly1305_blocks_fma
|
---|
915 | ___
|
---|
{
# Emitter for poly1305_emit_fma.  $mac and $nonce simply reuse the
# registers that the enclosing scope keeps in $inp and $len.
my $mac   = $inp;
my $nonce = $len;

# Register allocation: accumulator words live in %l0-%l4, the temporaries
# in %l5 and %o0-%o2, and the scratch mask in %o3.
my ($h0,$h1,$h2,$h3,$h4) = map { "%l$_" } 0..4;
my $d0                   = "%l5";
my ($d1,$d2,$d3,$mask)   = map { "%o$_" } 0..3;

# The generated routine loads the four 64-bit hash slots from the context
# as 32-bit halves, masks off the exponent bits, folds the partially
# reduced top word back in, selects between h and h+5 with a computed
# mask, adds the nonce and stores the 16-byte result byte by byte in
# little-endian order.
$code .= <<___;
.align 32
poly1305_emit_fma:
save %sp,-STACK_FRAME,%sp

ld [$ctx+8*0+0],$d0 ! load hash
ld [$ctx+8*0+4],$h0
ld [$ctx+8*1+0],$d1
ld [$ctx+8*1+4],$h1
ld [$ctx+8*2+0],$d2
ld [$ctx+8*2+4],$h2
ld [$ctx+8*3+0],$d3
ld [$ctx+8*3+4],$h3

sethi %hi(0xfff00000),$mask
andn $d0,$mask,$d0 ! mask exponent
andn $d1,$mask,$d1
andn $d2,$mask,$d2
andn $d3,$mask,$d3 ! can be partially reduced...
mov 3,$mask

srl $d3,2,$padbit ! ... so reduce
and $d3,$mask,$h4
andn $d3,$mask,$d3
add $padbit,$d3,$d3

addcc $d3,$h0,$h0
addccc $d0,$h1,$h1
addccc $d1,$h2,$h2
addccc $d2,$h3,$h3
addc %g0,$h4,$h4

addcc $h0,5,$d0 ! compare to modulus
addccc $h1,0,$d1
addccc $h2,0,$d2
addccc $h3,0,$d3
addc $h4,0,$mask

srl $mask,2,$mask ! did it carry/borrow?
neg $mask,$mask
sra $mask,31,$mask ! mask

andn $h0,$mask,$h0
and $d0,$mask,$d0
andn $h1,$mask,$h1
and $d1,$mask,$d1
or $d0,$h0,$h0
ld [$nonce+0],$d0 ! load nonce
andn $h2,$mask,$h2
and $d2,$mask,$d2
or $d1,$h1,$h1
ld [$nonce+4],$d1
andn $h3,$mask,$h3
and $d3,$mask,$d3
or $d2,$h2,$h2
ld [$nonce+8],$d2
or $d3,$h3,$h3
ld [$nonce+12],$d3

addcc $d0,$h0,$h0 ! accumulate nonce
addccc $d1,$h1,$h1
addccc $d2,$h2,$h2
addc $d3,$h3,$h3

stb $h0,[$mac+0] ! write little-endian result
srl $h0,8,$h0
stb $h1,[$mac+4]
srl $h1,8,$h1
stb $h2,[$mac+8]
srl $h2,8,$h2
stb $h3,[$mac+12]
srl $h3,8,$h3

stb $h0,[$mac+1]
srl $h0,8,$h0
stb $h1,[$mac+5]
srl $h1,8,$h1
stb $h2,[$mac+9]
srl $h2,8,$h2
stb $h3,[$mac+13]
srl $h3,8,$h3

stb $h0,[$mac+2]
srl $h0,8,$h0
stb $h1,[$mac+6]
srl $h1,8,$h1
stb $h2,[$mac+10]
srl $h2,8,$h2
stb $h3,[$mac+14]
srl $h3,8,$h3

stb $h0,[$mac+3]
stb $h1,[$mac+7]
stb $h2,[$mac+11]
stb $h3,[$mac+15]

ret
restore
.type poly1305_emit_fma,#function
.size poly1305_emit_fma,.-poly1305_emit_fma
___
}
|
---|
1024 |
|
---|
# Constant pool referenced by the FMA code path.  Each two-word entry is
# one 64-bit value written as its high and low 32-bit halves; the exponent
# fields correspond to the powers of two named in the trailing comments.
# The "fsr" entry is the floating-point state register image (round
# toward zero, exceptions disabled), and the block ends with the module's
# identification string.
$code .= <<"___";
.align 64
.Lconsts_fma:
.word 0x43300000,0x00000000 ! 2^(52+0)
.word 0x45300000,0x00000000 ! 2^(52+32)
.word 0x47300000,0x00000000 ! 2^(52+64)
.word 0x49300000,0x00000000 ! 2^(52+96)
.word 0x4b500000,0x00000000 ! 2^(52+130)

.word 0x37f40000,0x00000000 ! 5/2^130
.word 0,1<<30 ! fsr: truncate, no exceptions

.word 0x44300000,0x00000000 ! 2^(52+16+0)
.word 0x46300000,0x00000000 ! 2^(52+16+32)
.word 0x48300000,0x00000000 ! 2^(52+16+64)
.word 0x4a300000,0x00000000 ! 2^(52+16+96)
.word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
.word 0x40300000,0x00000000 ! 2^(52+16+32-96)
.word 0x42300000,0x00000000 ! 2^(52+16+64-96)
.asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
|
---|
1047 | }
|
---|
1048 | |
---|
1049 |
|
---|
# The purpose of these subroutines is to encode VIS instructions explicitly,
# so that the module can be compiled without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary and
# to let the programmer detect at run-time whether the current CPU is VIS
# capable.
|
---|
# unvis3($mnemonic, $rs1, $rs2, $rd)
#
# Hand-assemble one three-operand VIS3 integer instruction into a raw
# ".word" directive, keeping the original text as a trailing "!" comment.
#
#   $mnemonic        - "addxc", "addxccc" or "umulxhi"
#   $rs1, $rs2, $rd  - integer register names of the form %g0-%g7,
#                      %o0-%o7, %l0-%l7 or %i0-%i7
#
# Returns the ".word" line, or the untouched "mnemonic\trs1,rs2,rd" text
# when the mnemonic is not in the table or an operand is not a valid
# integer register name, so that the assembler sees (and diagnoses) the
# original instruction.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;

    # Register-window offsets: %gN -> N, %oN -> 8+N, %lN -> 16+N, %iN -> 24+N.
    my %bias   = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf = ( "addxc"   => 0x011,
                   "addxccc" => 0x013,
                   "umulxhi" => 0x016 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";
    my $opf = $visopf{$mnemonic};
    return $ref unless $opf;

    # Translate each register name into its 5-bit number in place.  Only
    # digits 0-7 exist within a bank; accepting 8 or 9 would spill into the
    # next bank and silently encode a different register.
    foreach ($rs1, $rs2, $rd) {
        return $ref unless /\A%([goli])([0-7])\z/;
        $_ = $bias{$1} + $2;
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                   $ref;
}
|
---|
1078 |
|
---|
# unfma($mnemonic, $rs1, $rs2, $rs3, $rd)
#
# Hand-assemble one four-operand fused multiply-add instruction into a raw
# ".word" directive, keeping the original text as a trailing "!" comment.
#
#   $mnemonic              - "fmadds", "fmaddd", "fmsubs" or "fmsubd"
#   $rs1,$rs2,$rs3,$rd     - floating-point register names %f0 .. %f63
#
# Returns the ".word" line, or the untouched "mnemonic\trs1,rs2,rs3,rd"
# text when the mnemonic is not in the table or an operand cannot be
# encoded (not a %fN name, N above 63, or an odd N in the upper half).
sub unfma {
    my ($mnemonic, $rs1, $rs2, $rs3, $rd) = @_;

    my %fmaopf = ( "fmadds" => 0x1,
                   "fmaddd" => 0x2,
                   "fmsubs" => 0x5,
                   "fmsubd" => 0x6 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
    my $opf = $fmaopf{$mnemonic};
    return $ref unless $opf;

    # Translate each register name into its 5-bit field value in place.
    foreach ($rs1, $rs2, $rs3, $rd) {
        # Anchored so that e.g. "%f100" is rejected instead of being read
        # as "%f10".
        return $ref unless /\A%f([0-9]{1,2})\z/;
        my $num = $1;

        # Numbers above 63 would otherwise be folded onto a different,
        # valid register by the masking below.
        return $ref if $num > 63;

        if ($num >= 32) {
            return $ref if $num & 1;
            # re-encode for upper double register addressing
            $num = ($num | $num>>5) & 31;
        }
        $_ = $num;
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                   $ref;
}
|
---|
1107 |
|
---|
# Post-process the accumulated assembly text line by line and print it.
foreach (split("\n",$code)) {
    # Replace every `...` span with the result of evaluating its contents
    # as a Perl expression.
    s/\`([^\`]*)\`/eval $1/ge;

    # Replace VIS3 and FMA mnemonics with hand-encoded .word directives.
    # The FMA pattern covers fmsub as well as fmadd, matching the opcode
    # table in unfma(); the second substitution is only attempted when the
    # first did not match.
    s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
        unvis3($1,$2,$3,$4)
     /ge or
    s/\b(fm(?:add|sub)[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
        unfma($1,$2,$3,$4,$5)
     /ge;

    print $_,"\n";
}

# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";
|
---|