VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.2/crypto/poly1305/asm/poly1305-sparcv9.pl@ 101021

Last change on this file since 101021 was 101021, checked in by vboxsync, 15 months ago

openssl-3.1.2: Applied and adjusted our OpenSSL changes to 3.1.0. bugref:10519

  • Property svn:executable set to *
File size: 23.9 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements Poly1305 hash for SPARCv9, vanilla, as well
18# as VIS3 and FMA extensions.
19#
20# May, August 2015
21#
22# Numbers are cycles per processed byte with poly1305_blocks alone.
23#
24#                     IALU(*)         FMA
25#
26# UltraSPARC III      12.3(**)
27# SPARC T3            7.92
28# SPARC T4            1.70(***)       6.55
29# SPARC64 X           5.60            3.64
30#
31# (*) Comparison to compiler-generated code is really problematic,
32# because latter's performance varies too much depending on too
33# many variables. For example, one can measure from 5x to 15x
34# improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
35# unfair comparison, because compiler doesn't use VIS3, but
36# given same initial conditions coefficient varies from 3x to 9x.
37# (**) Pre-III performance should be even worse; floating-point
38# performance for UltraSPARC I-IV on the other hand is reported
39# to be 4.25 for hand-coded assembly, but they are just too old
40# to care about.
41# (***) Multi-process benchmark saturates at ~12.5x single-process
42# result on 8-core processor, or ~21GBps per 2.85GHz socket.
43
44# $output is the last argument if it looks like a file (it has an extension)
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# Redirect STDOUT to the requested file.  The three-argument form keeps
# characters such as a leading '>' or '|' in the file name from being read
# as part of the open mode, and a failed open now aborts the run instead of
# silently leaving the generated assembly on the inherited STDOUT.
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
48
# Symbolic names for the SPARC integer registers used by the assembly
# templates that follow; the heredocs interpolate these variables.
#   %i0-%i5      : $ctx, $inp, $len, $padbit, $shl, $shr
#   %l0-%l7      : $r0-$r3 (key words), $s1-$s3, $h4
#   %o0-%o5, %o7 : $h0-$h3, $t0-$t2  (%o6 is deliberately left out of the
#                  list -- presumably because it is the stack pointer)
#   %g1-%g4      : $d0-$d3
49my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));
50my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));
51my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));
52my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));
53
54$code.=<<___;
55#ifndef __ASSEMBLER__
56# define __ASSEMBLER__ 1
57#endif
58#include "crypto/sparc_arch.h"
59
60#ifdef __arch64__
61.register %g2,#scratch
62.register %g3,#scratch
63# define STPTR stx
64# define SIZE_T 8
65#else
66# define STPTR st
67# define SIZE_T 4
68#endif
69#define LOCALS (STACK_BIAS+STACK_FRAME)
70
71.section ".text",#alloc,#execinstr
72
73#ifdef __PIC__
74SPARC_PIC_THUNK(%g1)
75#endif
76
77.globl poly1305_init
78.align 32
79poly1305_init:
80 save %sp,-STACK_FRAME-16,%sp
81 nop
82
83 SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
84 ld [%g1],%g1
85
86 and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
87 cmp %g1,SPARCV9_FMADD
88 be .Lpoly1305_init_fma
89 nop
90
91 stx %g0,[$ctx+0]
92 stx %g0,[$ctx+8] ! zero hash value
93 brz,pn $inp,.Lno_key
94 stx %g0,[$ctx+16]
95
96 and $inp,7,$shr ! alignment factor
97 andn $inp,7,$inp
98 sll $shr,3,$shr ! *8
99 neg $shr,$shl
100
101 sethi %hi(0x0ffffffc),$t0
102 set 8,$h1
103 or $t0,%lo(0x0ffffffc),$t0
104 set 16,$h2
105 sllx $t0,32,$t1
106 or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
107 or $t1,3,$t0 ! 0x0ffffffc0fffffff
108
109 ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
110 brz,pt $shr,.Lkey_aligned
111 ldxa [$inp+$h1]0x88,$h1
112
113 ldxa [$inp+$h2]0x88,$h2
114 srlx $h0,$shr,$h0
115 sllx $h1,$shl,$t2
116 srlx $h1,$shr,$h1
117 or $t2,$h0,$h0
118 sllx $h2,$shl,$h2
119 or $h2,$h1,$h1
120
121.Lkey_aligned:
122 and $t0,$h0,$h0
123 and $t1,$h1,$h1
124 stx $h0,[$ctx+32+0] ! store key
125 stx $h1,[$ctx+32+8]
126
127 andcc %g1,SPARCV9_VIS3,%g0
128 be .Lno_key
129 nop
130
1311: call .+8
132 add %o7,poly1305_blocks_vis3-1b,%o7
133
134 add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
135 STPTR %o7,[%i2]
136 STPTR %o5,[%i2+SIZE_T]
137
138 ret
139 restore %g0,1,%o0 ! return 1
140
141.Lno_key:
142 ret
143 restore %g0,%g0,%o0 ! return 0
144.type poly1305_init,#function
145.size poly1305_init,.-poly1305_init
146
147.globl poly1305_blocks
148.align 32
149poly1305_blocks:
150 save %sp,-STACK_FRAME,%sp
151 srln $len,4,$len
152
153 brz,pn $len,.Lno_data
154 nop
155
156 ld [$ctx+32+0],$r1 ! load key
157 ld [$ctx+32+4],$r0
158 ld [$ctx+32+8],$r3
159 ld [$ctx+32+12],$r2
160
161 ld [$ctx+0],$h1 ! load hash value
162 ld [$ctx+4],$h0
163 ld [$ctx+8],$h3
164 ld [$ctx+12],$h2
165 ld [$ctx+16],$h4
166
167 and $inp,7,$shr ! alignment factor
168 andn $inp,7,$inp
169 set 8,$d1
170 sll $shr,3,$shr ! *8
171 set 16,$d2
172 neg $shr,$shl
173
174 srl $r1,2,$s1
175 srl $r2,2,$s2
176 add $r1,$s1,$s1
177 srl $r3,2,$s3
178 add $r2,$s2,$s2
179 add $r3,$s3,$s3
180
181.Loop:
182 ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
183 brz,pt $shr,.Linp_aligned
184 ldxa [$inp+$d1]0x88,$d1
185
186 ldxa [$inp+$d2]0x88,$d2
187 srlx $d0,$shr,$d0
188 sllx $d1,$shl,$t1
189 srlx $d1,$shr,$d1
190 or $t1,$d0,$d0
191 sllx $d2,$shl,$d2
192 or $d2,$d1,$d1
193
194.Linp_aligned:
195 srlx $d0,32,$t0
196 addcc $d0,$h0,$h0 ! accumulate input
197 srlx $d1,32,$t1
198 addccc $t0,$h1,$h1
199 addccc $d1,$h2,$h2
200 addccc $t1,$h3,$h3
201 addc $padbit,$h4,$h4
202
203 umul $r0,$h0,$d0
204 umul $r1,$h0,$d1
205 umul $r2,$h0,$d2
206 umul $r3,$h0,$d3
207 sub $len,1,$len
208 add $inp,16,$inp
209
210 umul $s3,$h1,$t0
211 umul $r0,$h1,$t1
212 umul $r1,$h1,$t2
213 add $t0,$d0,$d0
214 add $t1,$d1,$d1
215 umul $r2,$h1,$t0
216 add $t2,$d2,$d2
217 add $t0,$d3,$d3
218
219 umul $s2,$h2,$t1
220 umul $s3,$h2,$t2
221 umul $r0,$h2,$t0
222 add $t1,$d0,$d0
223 add $t2,$d1,$d1
224 umul $r1,$h2,$t1
225 add $t0,$d2,$d2
226 add $t1,$d3,$d3
227
228 umul $s1,$h3,$t2
229 umul $s2,$h3,$t0
230 umul $s3,$h3,$t1
231 add $t2,$d0,$d0
232 add $t0,$d1,$d1
233 umul $r0,$h3,$t2
234 add $t1,$d2,$d2
235 add $t2,$d3,$d3
236
237 umul $s1,$h4,$t0
238 umul $s2,$h4,$t1
239 umul $s3,$h4,$t2
240 umul $r0,$h4,$h4
241 add $t0,$d1,$d1
242 add $t1,$d2,$d2
243 srlx $d0,32,$h1
244 add $t2,$d3,$d3
245 srlx $d1,32,$h2
246
247 addcc $d1,$h1,$h1
248 srlx $d2,32,$h3
249 set 8,$d1
250 addccc $d2,$h2,$h2
251 srlx $d3,32,$t0
252 set 16,$d2
253 addccc $d3,$h3,$h3
254 addc $t0,$h4,$h4
255
256 srl $h4,2,$t0 ! final reduction step
257 andn $h4,3,$t1
258 and $h4,3,$h4
259 add $t1,$t0,$t0
260
261 addcc $t0,$d0,$h0
262 addccc %g0,$h1,$h1
263 addccc %g0,$h2,$h2
264 addccc %g0,$h3,$h3
265 brnz,pt $len,.Loop
266 addc %g0,$h4,$h4
267
268 st $h1,[$ctx+0] ! store hash value
269 st $h0,[$ctx+4]
270 st $h3,[$ctx+8]
271 st $h2,[$ctx+12]
272 st $h4,[$ctx+16]
273
274.Lno_data:
275 ret
276 restore
277.type poly1305_blocks,#function
278.size poly1305_blocks,.-poly1305_blocks
279___
280########################################################################
281# VIS3 has umulxhi and addxc...
282{
# Register names private to the VIS3 template (upper-case to keep them apart
# from the lower-case names of the enclosing scope):
#   %o0-%o2 : $H0, $H1, $H2   %o3-%o4 : $R0, $R1   %o5 : $S1   %o7 : $T1
#   %g1-%g3 : $D0, $D1, $D2   %g4     : $T0
283my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
284my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
285
286$code.=<<___;
287.align 32
288poly1305_blocks_vis3:
289 save %sp,-STACK_FRAME,%sp
290 srln $len,4,$len
291
292 brz,pn $len,.Lno_data
293 nop
294
295 ldx [$ctx+32+0],$R0 ! load key
296 ldx [$ctx+32+8],$R1
297
298 ldx [$ctx+0],$H0 ! load hash value
299 ldx [$ctx+8],$H1
300 ld [$ctx+16],$H2
301
302 and $inp,7,$shr ! alignment factor
303 andn $inp,7,$inp
304 set 8,$r1
305 sll $shr,3,$shr ! *8
306 set 16,$r2
307 neg $shr,$shl
308
309 srlx $R1,2,$S1
310 b .Loop_vis3
311 add $R1,$S1,$S1
312
313.Loop_vis3:
314 ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
315 brz,pt $shr,.Linp_aligned_vis3
316 ldxa [$inp+$r1]0x88,$D1
317
318 ldxa [$inp+$r2]0x88,$D2
319 srlx $D0,$shr,$D0
320 sllx $D1,$shl,$T1
321 srlx $D1,$shr,$D1
322 or $T1,$D0,$D0
323 sllx $D2,$shl,$D2
324 or $D2,$D1,$D1
325
326.Linp_aligned_vis3:
327 addcc $D0,$H0,$H0 ! accumulate input
328 sub $len,1,$len
329 addxccc $D1,$H1,$H1
330 add $inp,16,$inp
331
332 mulx $R0,$H0,$D0 ! r0*h0
333 addxc $padbit,$H2,$H2
334 umulxhi $R0,$H0,$D1
335 mulx $S1,$H1,$T0 ! s1*h1
336 umulxhi $S1,$H1,$T1
337 addcc $T0,$D0,$D0
338 mulx $R1,$H0,$T0 ! r1*h0
339 addxc $T1,$D1,$D1
340 umulxhi $R1,$H0,$D2
341 addcc $T0,$D1,$D1
342 mulx $R0,$H1,$T0 ! r0*h1
343 addxc %g0,$D2,$D2
344 umulxhi $R0,$H1,$T1
345 addcc $T0,$D1,$D1
346 mulx $S1,$H2,$T0 ! s1*h2
347 addxc $T1,$D2,$D2
348 mulx $R0,$H2,$T1 ! r0*h2
349 addcc $T0,$D1,$D1
350 addxc $T1,$D2,$D2
351
352 srlx $D2,2,$T0 ! final reduction step
353 andn $D2,3,$T1
354 and $D2,3,$H2
355 add $T1,$T0,$T0
356
357 addcc $T0,$D0,$H0
358 addxccc %g0,$D1,$H1
359 brnz,pt $len,.Loop_vis3
360 addxc %g0,$H2,$H2
361
362 stx $H0,[$ctx+0] ! store hash value
363 stx $H1,[$ctx+8]
364 st $H2,[$ctx+16]
365
366 ret
367 restore
368.type poly1305_blocks_vis3,#function
369.size poly1305_blocks_vis3,.-poly1305_blocks_vis3
370___
371}
# The emit template takes its arguments in the same incoming registers as
# $inp and $len, so $mac and $nonce are simply second names for those two
# registers.
372my ($mac,$nonce) = ($inp,$len);
373
374$code.=<<___;
375.globl poly1305_emit
376.align 32
377poly1305_emit:
378 save %sp,-STACK_FRAME,%sp
379
380 ld [$ctx+0],$h1 ! load hash value
381 ld [$ctx+4],$h0
382 ld [$ctx+8],$h3
383 ld [$ctx+12],$h2
384 ld [$ctx+16],$h4
385
386 addcc $h0,5,$r0 ! compare to modulus
387 addccc $h1,0,$r1
388 addccc $h2,0,$r2
389 addccc $h3,0,$r3
390 addc $h4,0,$h4
391 andcc $h4,4,%g0 ! did it carry/borrow?
392
393 movnz %icc,$r0,$h0
394 ld [$nonce+0],$r0 ! load nonce
395 movnz %icc,$r1,$h1
396 ld [$nonce+4],$r1
397 movnz %icc,$r2,$h2
398 ld [$nonce+8],$r2
399 movnz %icc,$r3,$h3
400 ld [$nonce+12],$r3
401
402 addcc $r0,$h0,$h0 ! accumulate nonce
403 addccc $r1,$h1,$h1
404 addccc $r2,$h2,$h2
405 addc $r3,$h3,$h3
406
407 srl $h0,8,$r0
408 stb $h0,[$mac+0] ! store little-endian result
409 srl $h0,16,$r1
410 stb $r0,[$mac+1]
411 srl $h0,24,$r2
412 stb $r1,[$mac+2]
413 stb $r2,[$mac+3]
414
415 srl $h1,8,$r0
416 stb $h1,[$mac+4]
417 srl $h1,16,$r1
418 stb $r0,[$mac+5]
419 srl $h1,24,$r2
420 stb $r1,[$mac+6]
421 stb $r2,[$mac+7]
422
423 srl $h2,8,$r0
424 stb $h2,[$mac+8]
425 srl $h2,16,$r1
426 stb $r0,[$mac+9]
427 srl $h2,24,$r2
428 stb $r1,[$mac+10]
429 stb $r2,[$mac+11]
430
431 srl $h3,8,$r0
432 stb $h3,[$mac+12]
433 srl $h3,16,$r1
434 stb $r0,[$mac+13]
435 srl $h3,24,$r2
436 stb $r1,[$mac+14]
437 stb $r2,[$mac+15]
438
439 ret
440 restore
441.type poly1305_emit,#function
442.size poly1305_emit,.-poly1305_emit
443___
444
445{
# Register names for the FMA templates.
#   %i0-%i3 : $ctx, $inp, $len, $padbit
#   %o0-%o4 : $in0-$in4
#   %l0-%l3 : $i1, $step, $shr, $shl   (the map yields %l0-%l7, but only the
#             first four names are bound; the remaining four are discarded)
446my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
447my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
448my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
# $i2 is a second name for the same register as $step (%l1).
449my $i2=$step;
450
# Thirty-two even-numbered floating-point register names, %f0, %f2, ... %f62
# ("%f".2*$_ for $_ in 0..31), bound in order to the 32 variables below.
451my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
452 $two0,$two32,$two64,$two96,$two130,$five_two130,
453 $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
454 $s2lo,$s2hi,$s3lo,$s3hi,
455 $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
456# borrowings
# The names below introduce no new registers: each one is an alias of a
# $c* register bound above.  Note that $y2/$y3 map to $c3hi/$c3lo, i.e. in
# swapped order relative to $x2/$x3.
457my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
458my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
459my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
460
461$code.=<<___;
462.align 32
463poly1305_init_fma:
464 save %sp,-STACK_FRAME-16,%sp
465 nop
466
467.Lpoly1305_init_fma:
4681: call .+8
469 add %o7,.Lconsts_fma-1b,%o7
470
471 ldd [%o7+8*0],$two0 ! load constants
472 ldd [%o7+8*1],$two32
473 ldd [%o7+8*2],$two64
474 ldd [%o7+8*3],$two96
475 ldd [%o7+8*5],$five_two130
476
477 std $two0,[$ctx+8*0] ! initial hash value, biased 0
478 std $two32,[$ctx+8*1]
479 std $two64,[$ctx+8*2]
480 std $two96,[$ctx+8*3]
481
482 brz,pn $inp,.Lno_key_fma
483 nop
484
485 stx %fsr,[%sp+LOCALS] ! save original %fsr
486 ldx [%o7+8*6],%fsr ! load new %fsr
487
488 std $two0,[$ctx+8*4] ! key "template"
489 std $two32,[$ctx+8*5]
490 std $two64,[$ctx+8*6]
491 std $two96,[$ctx+8*7]
492
493 and $inp,7,$shr
494 andn $inp,7,$inp ! align pointer
495 mov 8,$i1
496 sll $shr,3,$shr
497 mov 16,$i2
498 neg $shr,$shl
499
500 ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
501 ldxa [$inp+$i1]0x88,$in2
502
503 brz $shr,.Lkey_aligned_fma
504 sethi %hi(0xf0000000),$i1 ! 0xf0000000
505
506 ldxa [$inp+$i2]0x88,$in4
507
508 srlx $in0,$shr,$in0 ! align data
509 sllx $in2,$shl,$in1
510 srlx $in2,$shr,$in2
511 or $in1,$in0,$in0
512 sllx $in4,$shl,$in3
513 or $in3,$in2,$in2
514
515.Lkey_aligned_fma:
516 or $i1,3,$i2 ! 0xf0000003
517 srlx $in0,32,$in1
518 andn $in0,$i1,$in0 ! &=0x0fffffff
519 andn $in1,$i2,$in1 ! &=0x0ffffffc
520 srlx $in2,32,$in3
521 andn $in2,$i2,$in2
522 andn $in3,$i2,$in3
523
524 st $in0,[$ctx+`8*4+4`] ! fill "template"
525 st $in1,[$ctx+`8*5+4`]
526 st $in2,[$ctx+`8*6+4`]
527 st $in3,[$ctx+`8*7+4`]
528
529 ldd [$ctx+8*4],$h0lo ! load [biased] key
530 ldd [$ctx+8*5],$h1lo
531 ldd [$ctx+8*6],$h2lo
532 ldd [$ctx+8*7],$h3lo
533
534 fsubd $h0lo,$two0, $h0lo ! r0
535 ldd [%o7+8*7],$two0 ! more constants
536 fsubd $h1lo,$two32,$h1lo ! r1
537 ldd [%o7+8*8],$two32
538 fsubd $h2lo,$two64,$h2lo ! r2
539 ldd [%o7+8*9],$two64
540 fsubd $h3lo,$two96,$h3lo ! r3
541 ldd [%o7+8*10],$two96
542
543 fmuld $five_two130,$h1lo,$s1lo ! s1
544 fmuld $five_two130,$h2lo,$s2lo ! s2
545 fmuld $five_two130,$h3lo,$s3lo ! s3
546
547 faddd $h0lo,$two0, $h0hi
548 faddd $h1lo,$two32,$h1hi
549 faddd $h2lo,$two64,$h2hi
550 faddd $h3lo,$two96,$h3hi
551
552 fsubd $h0hi,$two0, $h0hi
553 ldd [%o7+8*11],$two0 ! more constants
554 fsubd $h1hi,$two32,$h1hi
555 ldd [%o7+8*12],$two32
556 fsubd $h2hi,$two64,$h2hi
557 ldd [%o7+8*13],$two64
558 fsubd $h3hi,$two96,$h3hi
559
560 fsubd $h0lo,$h0hi,$h0lo
561 std $h0hi,[$ctx+8*5] ! r0hi
562 fsubd $h1lo,$h1hi,$h1lo
563 std $h1hi,[$ctx+8*7] ! r1hi
564 fsubd $h2lo,$h2hi,$h2lo
565 std $h2hi,[$ctx+8*9] ! r2hi
566 fsubd $h3lo,$h3hi,$h3lo
567 std $h3hi,[$ctx+8*11] ! r3hi
568
569 faddd $s1lo,$two0, $s1hi
570 faddd $s2lo,$two32,$s2hi
571 faddd $s3lo,$two64,$s3hi
572
573 fsubd $s1hi,$two0, $s1hi
574 fsubd $s2hi,$two32,$s2hi
575 fsubd $s3hi,$two64,$s3hi
576
577 fsubd $s1lo,$s1hi,$s1lo
578 fsubd $s2lo,$s2hi,$s2lo
579 fsubd $s3lo,$s3hi,$s3lo
580
581 ldx [%sp+LOCALS],%fsr ! restore %fsr
582
583 std $h0lo,[$ctx+8*4] ! r0lo
584 std $h1lo,[$ctx+8*6] ! r1lo
585 std $h2lo,[$ctx+8*8] ! r2lo
586 std $h3lo,[$ctx+8*10] ! r3lo
587
588 std $s1hi,[$ctx+8*13]
589 std $s2hi,[$ctx+8*15]
590 std $s3hi,[$ctx+8*17]
591
592 std $s1lo,[$ctx+8*12]
593 std $s2lo,[$ctx+8*14]
594 std $s3lo,[$ctx+8*16]
595
596 add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
597 add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
598 STPTR %o0,[%i2]
599 STPTR %o1,[%i2+SIZE_T]
600
601 ret
602 restore %g0,1,%o0 ! return 1
603
604.Lno_key_fma:
605 ret
606 restore %g0,%g0,%o0 ! return 0
607.type poly1305_init_fma,#function
608.size poly1305_init_fma,.-poly1305_init_fma
609
610.align 32
611poly1305_blocks_fma:
612 save %sp,-STACK_FRAME-48,%sp
613 srln $len,4,$len
614
615 brz,pn $len,.Labort
616 sub $len,1,$len
617
6181: call .+8
619 add %o7,.Lconsts_fma-1b,%o7
620
621 ldd [%o7+8*0],$two0 ! load constants
622 ldd [%o7+8*1],$two32
623 ldd [%o7+8*2],$two64
624 ldd [%o7+8*3],$two96
625 ldd [%o7+8*4],$two130
626 ldd [%o7+8*5],$five_two130
627
628 ldd [$ctx+8*0],$h0lo ! load [biased] hash value
629 ldd [$ctx+8*1],$h1lo
630 ldd [$ctx+8*2],$h2lo
631 ldd [$ctx+8*3],$h3lo
632
633 std $two0,[%sp+LOCALS+8*0] ! input "template"
634 sethi %hi((1023+52+96)<<20),$in3
635 std $two32,[%sp+LOCALS+8*1]
636 or $padbit,$in3,$in3
637 std $two64,[%sp+LOCALS+8*2]
638 st $in3,[%sp+LOCALS+8*3]
639
640 and $inp,7,$shr
641 andn $inp,7,$inp ! align pointer
642 mov 8,$i1
643 sll $shr,3,$shr
644 mov 16,$step
645 neg $shr,$shl
646
647 ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
648 brz $shr,.Linp_aligned_fma
649 ldxa [$inp+$i1]0x88,$in2
650
651 ldxa [$inp+$step]0x88,$in4
652 add $inp,8,$inp
653
654 srlx $in0,$shr,$in0 ! align data
655 sllx $in2,$shl,$in1
656 srlx $in2,$shr,$in2
657 or $in1,$in0,$in0
658 sllx $in4,$shl,$in3
659 srlx $in4,$shr,$in4 ! pre-shift
660 or $in3,$in2,$in2
661
662.Linp_aligned_fma:
663 srlx $in0,32,$in1
664 movrz $len,0,$step
665 srlx $in2,32,$in3
666 add $step,$inp,$inp ! conditional advance
667
668 st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
669 st $in1,[%sp+LOCALS+8*1+4]
670 st $in2,[%sp+LOCALS+8*2+4]
671 st $in3,[%sp+LOCALS+8*3+4]
672
673 ldd [$ctx+8*4],$r0lo ! load key
674 ldd [$ctx+8*5],$r0hi
675 ldd [$ctx+8*6],$r1lo
676 ldd [$ctx+8*7],$r1hi
677 ldd [$ctx+8*8],$r2lo
678 ldd [$ctx+8*9],$r2hi
679 ldd [$ctx+8*10],$r3lo
680 ldd [$ctx+8*11],$r3hi
681 ldd [$ctx+8*12],$s1lo
682 ldd [$ctx+8*13],$s1hi
683 ldd [$ctx+8*14],$s2lo
684 ldd [$ctx+8*15],$s2hi
685 ldd [$ctx+8*16],$s3lo
686 ldd [$ctx+8*17],$s3hi
687
688 stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
689 ldx [%o7+8*6],%fsr ! load new %fsr
690
691 subcc $len,1,$len
692 movrz $len,0,$step
693
694 ldd [%sp+LOCALS+8*0],$x0 ! load biased input
695 ldd [%sp+LOCALS+8*1],$x1
696 ldd [%sp+LOCALS+8*2],$x2
697 ldd [%sp+LOCALS+8*3],$x3
698
699 fsubd $h0lo,$two0, $h0lo ! de-bias hash value
700 fsubd $h1lo,$two32,$h1lo
701 ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
702 fsubd $h2lo,$two64,$h2lo
703 fsubd $h3lo,$two96,$h3lo
704 ldxa [$inp+$i1]0x88,$in2
705
706 fsubd $x0,$two0, $x0 ! de-bias input
707 fsubd $x1,$two32,$x1
708 fsubd $x2,$two64,$x2
709 fsubd $x3,$two96,$x3
710
711 brz $shr,.Linp_aligned_fma2
712 add $step,$inp,$inp ! conditional advance
713
714 sllx $in0,$shl,$in1 ! align data
715 srlx $in0,$shr,$in3
716 or $in1,$in4,$in0
717 sllx $in2,$shl,$in1
718 srlx $in2,$shr,$in4 ! pre-shift
719 or $in3,$in1,$in2
720.Linp_aligned_fma2:
721 srlx $in0,32,$in1
722 srlx $in2,32,$in3
723
724 faddd $h0lo,$x0,$x0 ! accumulate input
725 stw $in0,[%sp+LOCALS+8*0+4]
726 faddd $h1lo,$x1,$x1
727 stw $in1,[%sp+LOCALS+8*1+4]
728 faddd $h2lo,$x2,$x2
729 stw $in2,[%sp+LOCALS+8*2+4]
730 faddd $h3lo,$x3,$x3
731 stw $in3,[%sp+LOCALS+8*3+4]
732
733 b .Lentry_fma
734 nop
735
736.align 16
737.Loop_fma:
738 ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
739 ldxa [$inp+$i1]0x88,$in2
740 movrz $len,0,$step
741
742 faddd $y0,$h0lo,$h0lo ! accumulate input
743 faddd $y1,$h0hi,$h0hi
744 faddd $y2,$h2lo,$h2lo
745 faddd $y3,$h2hi,$h2hi
746
747 brz,pn $shr,.Linp_aligned_fma3
748 add $step,$inp,$inp ! conditional advance
749
750 sllx $in0,$shl,$in1 ! align data
751 srlx $in0,$shr,$in3
752 or $in1,$in4,$in0
753 sllx $in2,$shl,$in1
754 srlx $in2,$shr,$in4 ! pre-shift
755 or $in3,$in1,$in2
756
757.Linp_aligned_fma3:
758 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
759 faddd $two64,$h1lo,$c1lo
760 srlx $in0,32,$in1
761 faddd $two64,$h1hi,$c1hi
762 srlx $in2,32,$in3
763 faddd $two130,$h3lo,$c3lo
764 st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
765 faddd $two130,$h3hi,$c3hi
766 st $in1,[%sp+LOCALS+8*1+4]
767 faddd $two32,$h0lo,$c0lo
768 st $in2,[%sp+LOCALS+8*2+4]
769 faddd $two32,$h0hi,$c0hi
770 st $in3,[%sp+LOCALS+8*3+4]
771 faddd $two96,$h2lo,$c2lo
772 faddd $two96,$h2hi,$c2hi
773
774 fsubd $c1lo,$two64,$c1lo
775 fsubd $c1hi,$two64,$c1hi
776 fsubd $c3lo,$two130,$c3lo
777 fsubd $c3hi,$two130,$c3hi
778 fsubd $c0lo,$two32,$c0lo
779 fsubd $c0hi,$two32,$c0hi
780 fsubd $c2lo,$two96,$c2lo
781 fsubd $c2hi,$two96,$c2hi
782
783 fsubd $h1lo,$c1lo,$h1lo
784 fsubd $h1hi,$c1hi,$h1hi
785 fsubd $h3lo,$c3lo,$h3lo
786 fsubd $h3hi,$c3hi,$h3hi
787 fsubd $h2lo,$c2lo,$h2lo
788 fsubd $h2hi,$c2hi,$h2hi
789 fsubd $h0lo,$c0lo,$h0lo
790 fsubd $h0hi,$c0hi,$h0hi
791
792 faddd $h1lo,$c0lo,$h1lo
793 faddd $h1hi,$c0hi,$h1hi
794 faddd $h3lo,$c2lo,$h3lo
795 faddd $h3hi,$c2hi,$h3hi
796 faddd $h2lo,$c1lo,$h2lo
797 faddd $h2hi,$c1hi,$h2hi
798 fmaddd $five_two130,$c3lo,$h0lo,$h0lo
799 fmaddd $five_two130,$c3hi,$h0hi,$h0hi
800
801 faddd $h1lo,$h1hi,$x1
802 ldd [$ctx+8*12],$s1lo ! reload constants
803 faddd $h3lo,$h3hi,$x3
804 ldd [$ctx+8*13],$s1hi
805 faddd $h2lo,$h2hi,$x2
806 ldd [$ctx+8*10],$r3lo
807 faddd $h0lo,$h0hi,$x0
808 ldd [$ctx+8*11],$r3hi
809
810.Lentry_fma:
811 fmuld $x1,$s3lo,$h0lo
812 fmuld $x1,$s3hi,$h0hi
813 fmuld $x1,$r1lo,$h2lo
814 fmuld $x1,$r1hi,$h2hi
815 fmuld $x1,$r0lo,$h1lo
816 fmuld $x1,$r0hi,$h1hi
817 fmuld $x1,$r2lo,$h3lo
818 fmuld $x1,$r2hi,$h3hi
819
820 fmaddd $x3,$s1lo,$h0lo,$h0lo
821 fmaddd $x3,$s1hi,$h0hi,$h0hi
822 fmaddd $x3,$s3lo,$h2lo,$h2lo
823 fmaddd $x3,$s3hi,$h2hi,$h2hi
824 fmaddd $x3,$s2lo,$h1lo,$h1lo
825 fmaddd $x3,$s2hi,$h1hi,$h1hi
826 fmaddd $x3,$r0lo,$h3lo,$h3lo
827 fmaddd $x3,$r0hi,$h3hi,$h3hi
828
829 fmaddd $x2,$s2lo,$h0lo,$h0lo
830 fmaddd $x2,$s2hi,$h0hi,$h0hi
831 fmaddd $x2,$r0lo,$h2lo,$h2lo
832 fmaddd $x2,$r0hi,$h2hi,$h2hi
833 fmaddd $x2,$s3lo,$h1lo,$h1lo
834 ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
835 fmaddd $x2,$s3hi,$h1hi,$h1hi
836 ldd [%sp+LOCALS+8*1],$y1
837 fmaddd $x2,$r1lo,$h3lo,$h3lo
838 ldd [%sp+LOCALS+8*2],$y2
839 fmaddd $x2,$r1hi,$h3hi,$h3hi
840 ldd [%sp+LOCALS+8*3],$y3
841
842 fmaddd $x0,$r0lo,$h0lo,$h0lo
843 fsubd $y0,$two0, $y0 ! de-bias input
844 fmaddd $x0,$r0hi,$h0hi,$h0hi
845 fsubd $y1,$two32,$y1
846 fmaddd $x0,$r2lo,$h2lo,$h2lo
847 fsubd $y2,$two64,$y2
848 fmaddd $x0,$r2hi,$h2hi,$h2hi
849 fsubd $y3,$two96,$y3
850 fmaddd $x0,$r1lo,$h1lo,$h1lo
851 fmaddd $x0,$r1hi,$h1hi,$h1hi
852 fmaddd $x0,$r3lo,$h3lo,$h3lo
853 fmaddd $x0,$r3hi,$h3hi,$h3hi
854
855 bcc SIZE_T_CC,.Loop_fma
856 subcc $len,1,$len
857
858 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
859 faddd $h0lo,$two32,$c0lo
860 faddd $h0hi,$two32,$c0hi
861 faddd $h2lo,$two96,$c2lo
862 faddd $h2hi,$two96,$c2hi
863 faddd $h1lo,$two64,$c1lo
864 faddd $h1hi,$two64,$c1hi
865 faddd $h3lo,$two130,$c3lo
866 faddd $h3hi,$two130,$c3hi
867
868 fsubd $c0lo,$two32,$c0lo
869 fsubd $c0hi,$two32,$c0hi
870 fsubd $c2lo,$two96,$c2lo
871 fsubd $c2hi,$two96,$c2hi
872 fsubd $c1lo,$two64,$c1lo
873 fsubd $c1hi,$two64,$c1hi
874 fsubd $c3lo,$two130,$c3lo
875 fsubd $c3hi,$two130,$c3hi
876
877 fsubd $h1lo,$c1lo,$h1lo
878 fsubd $h1hi,$c1hi,$h1hi
879 fsubd $h3lo,$c3lo,$h3lo
880 fsubd $h3hi,$c3hi,$h3hi
881 fsubd $h2lo,$c2lo,$h2lo
882 fsubd $h2hi,$c2hi,$h2hi
883 fsubd $h0lo,$c0lo,$h0lo
884 fsubd $h0hi,$c0hi,$h0hi
885
886 faddd $h1lo,$c0lo,$h1lo
887 faddd $h1hi,$c0hi,$h1hi
888 faddd $h3lo,$c2lo,$h3lo
889 faddd $h3hi,$c2hi,$h3hi
890 faddd $h2lo,$c1lo,$h2lo
891 faddd $h2hi,$c1hi,$h2hi
892 fmaddd $five_two130,$c3lo,$h0lo,$h0lo
893 fmaddd $five_two130,$c3hi,$h0hi,$h0hi
894
895 faddd $h1lo,$h1hi,$x1
896 faddd $h3lo,$h3hi,$x3
897 faddd $h2lo,$h2hi,$x2
898 faddd $h0lo,$h0hi,$x0
899
900 faddd $x1,$two32,$x1 ! bias
901 faddd $x3,$two96,$x3
902 faddd $x2,$two64,$x2
903 faddd $x0,$two0, $x0
904
905 ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
906
907 std $x1,[$ctx+8*1] ! store [biased] hash value
908 std $x3,[$ctx+8*3]
909 std $x2,[$ctx+8*2]
910 std $x0,[$ctx+8*0]
911
912.Labort:
913 ret
914 restore
915.type poly1305_blocks_fma,#function
916.size poly1305_blocks_fma,.-poly1305_blocks_fma
917___
918{
# $mac and $nonce are second names for the registers held in $inp and $len.
919my ($mac,$nonce)=($inp,$len);
920
# Integer registers for the emit_fma template.  The right-hand side yields
# eleven names (%l0-%l5 followed by %o0-%o4) for ten variables, so:
#   $h0-$h4 : %l0-%l4    $d0 : %l5    $d1-$d3 : %o0-%o2    $mask : %o3
# and the last generated name (%o4) is discarded.
921my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
922 ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
923
924$code.=<<___;
925.align 32
926poly1305_emit_fma:
927 save %sp,-STACK_FRAME,%sp
928
929 ld [$ctx+8*0+0],$d0 ! load hash
930 ld [$ctx+8*0+4],$h0
931 ld [$ctx+8*1+0],$d1
932 ld [$ctx+8*1+4],$h1
933 ld [$ctx+8*2+0],$d2
934 ld [$ctx+8*2+4],$h2
935 ld [$ctx+8*3+0],$d3
936 ld [$ctx+8*3+4],$h3
937
938 sethi %hi(0xfff00000),$mask
939 andn $d0,$mask,$d0 ! mask exponent
940 andn $d1,$mask,$d1
941 andn $d2,$mask,$d2
942 andn $d3,$mask,$d3 ! can be partially reduced...
943 mov 3,$mask
944
945 srl $d3,2,$padbit ! ... so reduce
946 and $d3,$mask,$h4
947 andn $d3,$mask,$d3
948 add $padbit,$d3,$d3
949
950 addcc $d3,$h0,$h0
951 addccc $d0,$h1,$h1
952 addccc $d1,$h2,$h2
953 addccc $d2,$h3,$h3
954 addc %g0,$h4,$h4
955
956 addcc $h0,5,$d0 ! compare to modulus
957 addccc $h1,0,$d1
958 addccc $h2,0,$d2
959 addccc $h3,0,$d3
960 addc $h4,0,$mask
961
962 srl $mask,2,$mask ! did it carry/borrow?
963 neg $mask,$mask
964 sra $mask,31,$mask ! mask
965
966 andn $h0,$mask,$h0
967 and $d0,$mask,$d0
968 andn $h1,$mask,$h1
969 and $d1,$mask,$d1
970 or $d0,$h0,$h0
971 ld [$nonce+0],$d0 ! load nonce
972 andn $h2,$mask,$h2
973 and $d2,$mask,$d2
974 or $d1,$h1,$h1
975 ld [$nonce+4],$d1
976 andn $h3,$mask,$h3
977 and $d3,$mask,$d3
978 or $d2,$h2,$h2
979 ld [$nonce+8],$d2
980 or $d3,$h3,$h3
981 ld [$nonce+12],$d3
982
983 addcc $d0,$h0,$h0 ! accumulate nonce
984 addccc $d1,$h1,$h1
985 addccc $d2,$h2,$h2
986 addc $d3,$h3,$h3
987
988 stb $h0,[$mac+0] ! write little-endian result
989 srl $h0,8,$h0
990 stb $h1,[$mac+4]
991 srl $h1,8,$h1
992 stb $h2,[$mac+8]
993 srl $h2,8,$h2
994 stb $h3,[$mac+12]
995 srl $h3,8,$h3
996
997 stb $h0,[$mac+1]
998 srl $h0,8,$h0
999 stb $h1,[$mac+5]
1000 srl $h1,8,$h1
1001 stb $h2,[$mac+9]
1002 srl $h2,8,$h2
1003 stb $h3,[$mac+13]
1004 srl $h3,8,$h3
1005
1006 stb $h0,[$mac+2]
1007 srl $h0,8,$h0
1008 stb $h1,[$mac+6]
1009 srl $h1,8,$h1
1010 stb $h2,[$mac+10]
1011 srl $h2,8,$h2
1012 stb $h3,[$mac+14]
1013 srl $h3,8,$h3
1014
1015 stb $h0,[$mac+3]
1016 stb $h1,[$mac+7]
1017 stb $h2,[$mac+11]
1018 stb $h3,[$mac+15]
1019
1020 ret
1021 restore
1022.type poly1305_emit_fma,#function
1023.size poly1305_emit_fma,.-poly1305_emit_fma
1024___
1025}
1026
1027$code.=<<___;
1028.align 64
1029.Lconsts_fma:
1030.word 0x43300000,0x00000000 ! 2^(52+0)
1031.word 0x45300000,0x00000000 ! 2^(52+32)
1032.word 0x47300000,0x00000000 ! 2^(52+64)
1033.word 0x49300000,0x00000000 ! 2^(52+96)
1034.word 0x4b500000,0x00000000 ! 2^(52+130)
1035
1036.word 0x37f40000,0x00000000 ! 5/2^130
1037.word 0,1<<30 ! fsr: truncate, no exceptions
1038
1039.word 0x44300000,0x00000000 ! 2^(52+16+0)
1040.word 0x46300000,0x00000000 ! 2^(52+16+32)
1041.word 0x48300000,0x00000000 ! 2^(52+16+64)
1042.word 0x4a300000,0x00000000 ! 2^(52+16+96)
1043.word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
1044.word 0x40300000,0x00000000 ! 2^(52+16+32-96)
1045.word 0x42300000,0x00000000 ! 2^(52+16+64-96)
1046.asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
1047.align 4
1048___
1049}
1050
1051
1052# Purpose of these subroutines is to explicitly encode VIS instructions,
1053# so that one can compile the module without having to specify VIS
1054# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1055# Idea is to reserve for option to produce "universal" binary and let
1056# programmer detect if current CPU is VIS capable at run-time.
# unvis3($mnemonic, $rs1, $rs2, $rd)
#
# Hand-encode a three-operand VIS3 integer instruction (addxc, addxccc or
# umulxhi) as a raw ".word" directive, with the textual form appended as an
# assembler comment.
#
# Returns the plain "mnemonic\trs1,rs2,rd" text unchanged when the mnemonic
# is not in the opcode table or when any operand is not exactly one of
# %g0-%g7, %o0-%o7, %l0-%l7, %i0-%i7.  Operand names are matched in full and
# the register digit is limited to 0-7, so a malformed name such as "%o9" is
# handed to the assembler as text instead of being encoded as some other
# register.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;

    # Offset of each register window group within the 5-bit register number.
    my %bias   = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf = ( "addxc"   => 0x011,
                   "addxccc" => 0x013,
                   "umulxhi" => 0x016 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic}
        or return $ref;

    my @regno;
    foreach my $reg ($rs1, $rs2, $rd) {
        return $ref unless $reg =~ /\A%([goli])([0-7])\z/;
        push @regno, $bias{$1} + $2;
    }
    my ($n1, $n2, $nd) = @regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000 | $nd<<25 | $n1<<14 | $opf<<5 | $n2,
                   $ref;
}
1080
# unfma($mnemonic, $rs1, $rs2, $rs3, $rd)
#
# Hand-encode a four-operand fused multiply-add/subtract instruction
# (fmadds, fmaddd, fmsubs, fmsubd) as a raw ".word" directive, with the
# textual form appended as an assembler comment.
#
# Returns the plain "mnemonic\trs1,rs2,rs3,rd" text unchanged when the
# mnemonic is not in the opcode table or when an operand cannot be encoded:
# not of the form %fN, N above 63, or an odd N in the upper half (32 and up).
# Operand names are matched in full, so "%f100" is no longer mistaken for
# "%f10" and out-of-range numbers no longer wrap onto other registers.
sub unfma {
    my ($mnemonic, $rs1, $rs2, $rs3, $rd) = @_;

    my %fmaopf = ( "fmadds" => 0x1,
                   "fmaddd" => 0x2,
                   "fmsubs" => 0x5,
                   "fmsubd" => 0x6 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my $opf = $fmaopf{$mnemonic}
        or return $ref;

    my @regno;
    foreach my $reg ($rs1, $rs2, $rs3, $rd) {
        return $ref unless $reg =~ /\A%f([0-9]+)\z/;
        my $n = $1;
        return $ref if $n > 63;
        if ($n >= 32) {
            return $ref if $n & 1;
            # re-encode for upper double register addressing
            $n = ($n | $n>>5) & 31;
        }
        push @regno, $n;
    }
    my ($n1, $n2, $n3, $nd) = @regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b80000 | $nd<<25 | $n1<<14 | $n3<<9 | $opf<<5 | $n2,
                   $ref;
}
1109
# Post-process the accumulated template one line at a time and print it.
#
#  1. Text between backticks is evaluated as a Perl expression and replaced
#     by its value (the templates use this for offset arithmetic such as
#     `8*4+4`).  Only template text written in this file is evaluated.
#  2. VIS3 integer instructions are handed to unvis3() and fused
#     multiply-add/subtract instructions to unfma(), which turn them into
#     ".word" directives where they can.  The second pattern covers every
#     mnemonic in unfma()'s opcode table (fmadd and fmsub, single and
#     double), not just fmadd.  As before, the second substitution is only
#     attempted when the first one did not match.
foreach my $line (split("\n",$code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    $line =~ s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
               unvis3($1,$2,$3,$4)
              /ge
        or
    $line =~ s/\b(fm(?:add|sub)[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
               unfma($1,$2,$3,$4,$5)
              /ge;

    print $line,"\n";
}

# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette