#! /usr/bin/env perl
# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2016
#
# Initial support for Fujitsu SPARC64 X/X+ comprises minimally
# required key setup and single-block procedures.
#
# April 2016
#
# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
# that the parallelizable nature of CBC decrypt and CTR is not utilized
# yet. CBC encrypt, on the other hand, is as good as it can possibly
# get, processing one byte in 4.1 cycles with a 128-bit key on SPARC64 X.
# This is ~6x faster than the pure software implementation...
#
# July 2016
#
# Switch from faligndata to fshiftorx, which makes it possible to omit
# alignaddr instructions and improves single-block and short-input
# performance with misaligned data.

$output = pop and open STDOUT,">$output";
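# A minimal invocation sketch (assuming the usual perlasm convention of
# the output file being the last command-line argument; the file name
# here is illustrative):
#
#   perl aesfx-sparcv9.pl aesfx-sparcv9.S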

{
my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#define LOCALS (STACK_BIAS+STACK_FRAME)

.text

.globl aes_fx_encrypt
.align 32
aes_fx_encrypt:
 and $inp, 7, $tmp ! is input aligned?
 andn $inp, 7, $inp
 ldd [$key + 0], %f6 ! round[0]
 ldd [$key + 8], %f8
 mov %o7, %g1
 ld [$key + 240], $rounds

1: call .+8
 add %o7, .Linp_align-1b, %o7

 sll $tmp, 3, $tmp
 ldd [$inp + 0], %f0 ! load input
 brz,pt $tmp, .Lenc_inp_aligned
 ldd [$inp + 8], %f2

 ldd [%o7 + $tmp], %f14 ! shift left params
 ldd [$inp + 16], %f4
 fshiftorx %f0, %f2, %f14, %f0
 fshiftorx %f2, %f4, %f14, %f2

.Lenc_inp_aligned:
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 fxor %f0, %f6, %f0 ! ^=round[0]
 fxor %f2, %f8, %f2
 ldd [$key + 32], %f6 ! round[2]
 ldd [$key + 40], %f8
 add $key, 32, $key
 sub $rounds, 4, $rounds

.Loop_enc:
 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$key + 16], %f10
 ldd [$key + 24], %f12
 add $key, 32, $key

 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2
 ldd [$key + 0], %f6
 ldd [$key + 8], %f8

 brnz,a $rounds, .Loop_enc
 sub $rounds, 2, $rounds

 andcc $out, 7, $tmp ! is output aligned?
 andn $out, 7, $out
 mov 0xff, $mask
 srl $mask, $tmp, $mask
 add %o7, 64, %o7
 sll $tmp, 3, $tmp

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [%o7 + $tmp], %f14 ! shift right params

 fmovd %f0, %f4
 faesenclx %f2, %f6, %f0
 faesenclx %f4, %f8, %f2

 bnz,pn %icc, .Lenc_out_unaligned
 mov %g1, %o7

 std %f0, [$out + 0]
 retl
 std %f2, [$out + 8]

.align 16
.Lenc_out_unaligned:
 add $out, 16, $inp
 orn %g0, $mask, $tmp
 fshiftorx %f0, %f0, %f14, %f4
 fshiftorx %f0, %f2, %f14, %f6
 fshiftorx %f2, %f2, %f14, %f8

 stda %f4, [$out + $mask]0xc0 ! partial store
 std %f6, [$out + 8]
 stda %f8, [$inp + $tmp]0xc0 ! partial store
 retl
 nop
.type aes_fx_encrypt,#function
.size aes_fx_encrypt,.-aes_fx_encrypt

.globl aes_fx_decrypt
.align 32
aes_fx_decrypt:
 and $inp, 7, $tmp ! is input aligned?
 andn $inp, 7, $inp
 ldd [$key + 0], %f6 ! round[0]
 ldd [$key + 8], %f8
 mov %o7, %g1
 ld [$key + 240], $rounds

1: call .+8
 add %o7, .Linp_align-1b, %o7

 sll $tmp, 3, $tmp
 ldd [$inp + 0], %f0 ! load input
 brz,pt $tmp, .Ldec_inp_aligned
 ldd [$inp + 8], %f2

 ldd [%o7 + $tmp], %f14 ! shift left params
 ldd [$inp + 16], %f4
 fshiftorx %f0, %f2, %f14, %f0
 fshiftorx %f2, %f4, %f14, %f2

.Ldec_inp_aligned:
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 fxor %f0, %f6, %f0 ! ^=round[0]
 fxor %f2, %f8, %f2
 ldd [$key + 32], %f6 ! round[2]
 ldd [$key + 40], %f8
 add $key, 32, $key
 sub $rounds, 4, $rounds

.Loop_dec:
 fmovd %f0, %f4
 faesdecx %f2, %f10, %f0
 faesdecx %f4, %f12, %f2
 ldd [$key + 16], %f10
 ldd [$key + 24], %f12
 add $key, 32, $key

 fmovd %f0, %f4
 faesdecx %f2, %f6, %f0
 faesdecx %f4, %f8, %f2
 ldd [$key + 0], %f6
 ldd [$key + 8], %f8

 brnz,a $rounds, .Loop_dec
 sub $rounds, 2, $rounds

 andcc $out, 7, $tmp ! is output aligned?
 andn $out, 7, $out
 mov 0xff, $mask
 srl $mask, $tmp, $mask
 add %o7, 64, %o7
 sll $tmp, 3, $tmp

 fmovd %f0, %f4
 faesdecx %f2, %f10, %f0
 faesdecx %f4, %f12, %f2
 ldd [%o7 + $tmp], %f14 ! shift right params

 fmovd %f0, %f4
 faesdeclx %f2, %f6, %f0
 faesdeclx %f4, %f8, %f2

 bnz,pn %icc, .Ldec_out_unaligned
 mov %g1, %o7

 std %f0, [$out + 0]
 retl
 std %f2, [$out + 8]

.align 16
.Ldec_out_unaligned:
 add $out, 16, $inp
 orn %g0, $mask, $tmp
 fshiftorx %f0, %f0, %f14, %f4
 fshiftorx %f0, %f2, %f14, %f6
 fshiftorx %f2, %f2, %f14, %f8

 stda %f4, [$out + $mask]0xc0 ! partial store
 std %f6, [$out + 8]
 stda %f8, [$inp + $tmp]0xc0 ! partial store
 retl
 nop
.type aes_fx_decrypt,#function
.size aes_fx_decrypt,.-aes_fx_decrypt
___
}
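# Each AES round above is realized as a pair of faesencx/faesdecx
# operations over the two 64-bit halves of the state held in %f0/%f2,
# with "fmovd %f0, %f4" preserving the half that the first operation of
# the pair is about to overwrite.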
{
my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
$code.=<<___;
.globl aes_fx_set_decrypt_key
.align 32
aes_fx_set_decrypt_key:
 b .Lset_encrypt_key
 mov -1, $inc
 retl
 nop
.type aes_fx_set_decrypt_key,#function
.size aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key

.globl aes_fx_set_encrypt_key
.align 32
aes_fx_set_encrypt_key:
 mov 1, $inc
 nop
.Lset_encrypt_key:
 and $inp, 7, $tmp
 andn $inp, 7, $inp
 sll $tmp, 3, $tmp
 mov %o7, %g1

1: call .+8
 add %o7, .Linp_align-1b, %o7

 ldd [%o7 + $tmp], %f10 ! shift left params
 mov %g1, %o7

 cmp $bits, 192
 ldd [$inp + 0], %f0
 bl,pt %icc, .L128
 ldd [$inp + 8], %f2

 be,pt %icc, .L192
 ldd [$inp + 16], %f4
 brz,pt $tmp, .L256aligned
 ldd [$inp + 24], %f6

 ldd [$inp + 32], %f8
 fshiftorx %f0, %f2, %f10, %f0
 fshiftorx %f2, %f4, %f10, %f2
 fshiftorx %f4, %f6, %f10, %f4
 fshiftorx %f6, %f8, %f10, %f6

.L256aligned:
 mov 14, $bits
 and $inc, `14*16`, $tmp
 st $bits, [$out + 240] ! store rounds
 add $out, $tmp, $out ! start or end of key schedule
 sllx $inc, 4, $inc ! 16 or -16
___
for ($i=0; $i<6; $i++) {
    $code.=<<___;
 std %f0, [$out + 0]
 faeskeyx %f6, `0x10+$i`, %f0
 std %f2, [$out + 8]
 add $out, $inc, $out
 faeskeyx %f0, 0x00, %f2
 std %f4, [$out + 0]
 faeskeyx %f2, 0x01, %f4
 std %f6, [$out + 8]
 add $out, $inc, $out
 faeskeyx %f4, 0x00, %f6
___
}
$code.=<<___;
 std %f0, [$out + 0]
 faeskeyx %f6, `0x10+$i`, %f0
 std %f2, [$out + 8]
 add $out, $inc, $out
 faeskeyx %f0, 0x00, %f2
 std %f4,[$out + 0]
 std %f6,[$out + 8]
 add $out, $inc, $out
 std %f0,[$out + 0]
 std %f2,[$out + 8]
 retl
 xor %o0, %o0, %o0 ! return 0

.align 16
.L192:
 brz,pt $tmp, .L192aligned
 nop

 ldd [$inp + 24], %f6
 fshiftorx %f0, %f2, %f10, %f0
 fshiftorx %f2, %f4, %f10, %f2
 fshiftorx %f4, %f6, %f10, %f4

.L192aligned:
 mov 12, $bits
 and $inc, `12*16`, $tmp
 st $bits, [$out + 240] ! store rounds
 add $out, $tmp, $out ! start or end of key schedule
 sllx $inc, 4, $inc ! 16 or -16
___
for ($i=0; $i<8; $i+=2) {
    $code.=<<___;
 std %f0, [$out + 0]
 faeskeyx %f4, `0x10+$i`, %f0
 std %f2, [$out + 8]
 add $out, $inc, $out
 faeskeyx %f0, 0x00, %f2
 std %f4, [$out + 0]
 faeskeyx %f2, 0x00, %f4
 std %f0, [$out + 8]
 add $out, $inc, $out
 faeskeyx %f4, `0x10+$i+1`, %f0
 std %f2, [$out + 0]
 faeskeyx %f0, 0x00, %f2
 std %f4, [$out + 8]
 add $out, $inc, $out
___
$code.=<<___ if ($i<6);
 faeskeyx %f2, 0x00, %f4
___
}
$code.=<<___;
 std %f0, [$out + 0]
 std %f2, [$out + 8]
 retl
 xor %o0, %o0, %o0 ! return 0

.align 16
.L128:
 brz,pt $tmp, .L128aligned
 nop

 ldd [$inp + 16], %f4
 fshiftorx %f0, %f2, %f10, %f0
 fshiftorx %f2, %f4, %f10, %f2

.L128aligned:
 mov 10, $bits
 and $inc, `10*16`, $tmp
 st $bits, [$out + 240] ! store rounds
 add $out, $tmp, $out ! start or end of key schedule
 sllx $inc, 4, $inc ! 16 or -16
___
for ($i=0; $i<10; $i++) {
    $code.=<<___;
 std %f0, [$out + 0]
 faeskeyx %f2, `0x10+$i`, %f0
 std %f2, [$out + 8]
 add $out, $inc, $out
 faeskeyx %f0, 0x00, %f2
___
}
$code.=<<___;
 std %f0, [$out + 0]
 std %f2, [$out + 8]
 retl
 xor %o0, %o0, %o0 ! return 0
.type aes_fx_set_encrypt_key,#function
.size aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
___
}
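# A note on the shared key-setup path above: $inc is 1 for
# aes_fx_set_encrypt_key and -1 for aes_fx_set_decrypt_key. Since
# two's-complement -1 has all bits set, "and $inc, `14*16`, $tmp" yields
# 0 or 14*16, so $out starts at either the first or the last round key,
# and "sllx $inc, 4, $inc" turns the direction flag into a +16/-16 byte
# stride; the decrypt schedule is the encrypt schedule written out in
# reverse order.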
{
my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
 = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign,$oalign);

$code.=<<___;
.globl aes_fx_cbc_encrypt
.align 32
aes_fx_cbc_encrypt:
 save %sp, -STACK_FRAME-16, %sp
 srln $len, 4, $len
 and $inp, 7, $ialign
 andn $inp, 7, $inp
 brz,pn $len, .Lcbc_no_data
 sll $ialign, 3, $ileft

1: call .+8
 add %o7, .Linp_align-1b, %o7

 ld [$key + 240], $rounds
 and $out, 7, $oalign
 ld [$ivp + 0], %f0 ! load ivec
 andn $out, 7, $out
 ld [$ivp + 4], %f1
 sll $oalign, 3, $mask
 ld [$ivp + 8], %f2
 ld [$ivp + 12], %f3

 sll $rounds, 4, $rounds
 add $rounds, $key, $end
 ldd [$key + 0], $r0hi ! round[0]
 ldd [$key + 8], $r0lo

 add $inp, 16, $inp
 sub $len, 1, $len
 ldd [$end + 0], $rlhi ! round[last]
 ldd [$end + 8], $rllo

 mov 16, $inc
 movrz $len, 0, $inc
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 ldd [%o7 + $ileft], $fshift ! shift left params
 add %o7, 64, %o7
 ldd [$inp - 16], $in0 ! load input
 ldd [$inp - 8], $in1
 ldda [$inp]0x82, $intail ! non-faulting load
 brz $dir, .Lcbc_decrypt
 add $inp, $inc, $inp ! inp+=16

 fxor $r0hi, %f0, %f0 ! ivec^=round[0]
 fxor $r0lo, %f2, %f2
 fshiftorx $in0, $in1, $fshift, $in0
 fshiftorx $in1, $intail, $fshift, $in1
 nop

.Loop_cbc_enc:
 fxor $in0, %f0, %f0 ! inp^ivec^round[0]
 fxor $in1, %f2, %f2
 ldd [$key + 32], %f6 ! round[2]
 ldd [$key + 40], %f8
 add $key, 32, $end
 sub $rounds, 16*6, $inner

.Lcbc_enc:
 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$end + 16], %f10
 ldd [$end + 24], %f12
 add $end, 32, $end

 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2
 ldd [$end + 0], %f6
 ldd [$end + 8], %f8

 brnz,a $inner, .Lcbc_enc
 sub $inner, 16*2, $inner

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$end + 16], %f10 ! round[last-1]
 ldd [$end + 24], %f12

 movrz $len, 0, $inc
 fmovd $intail, $in0
 ldd [$inp - 8], $in1 ! load next input block
 ldda [$inp]0x82, $intail ! non-faulting load
 add $inp, $inc, $inp ! inp+=16

 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2

 fshiftorx $in0, $in1, $fshift, $in0
 fshiftorx $in1, $intail, $fshift, $in1

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 fxor $r0hi, $in0, $in0 ! inp^=round[0]
 fxor $r0lo, $in1, $in1

 fmovd %f0, %f4
 faesenclx %f2, $rlhi, %f0
 faesenclx %f4, $rllo, %f2

 brnz,pn $oalign, .Lcbc_enc_unaligned_out
 nop

 std %f0, [$out + 0]
 std %f2, [$out + 8]
 add $out, 16, $out

 brnz,a $len, .Loop_cbc_enc
 sub $len, 1, $len

 st %f0, [$ivp + 0] ! output ivec
 st %f1, [$ivp + 4]
 st %f2, [$ivp + 8]
 st %f3, [$ivp + 12]

.Lcbc_no_data:
 ret
 restore

.align 32
.Lcbc_enc_unaligned_out:
 ldd [%o7 + $mask], $fshift ! shift right params
 mov 0xff, $mask
 srl $mask, $oalign, $mask
 sub %g0, $ileft, $iright

 fshiftorx %f0, %f0, $fshift, %f6
 fshiftorx %f0, %f2, $fshift, %f8

 stda %f6, [$out + $mask]0xc0 ! partial store
 orn %g0, $mask, $mask
 std %f8, [$out + 8]
 add $out, 16, $out
 brz $len, .Lcbc_enc_unaligned_out_done
 sub $len, 1, $len
 b .Loop_cbc_enc_unaligned_out
 nop

.align 32
.Loop_cbc_enc_unaligned_out:
 fmovd %f2, $outhead
 fxor $in0, %f0, %f0 ! inp^ivec^round[0]
 fxor $in1, %f2, %f2
 ldd [$key + 32], %f6 ! round[2]
 ldd [$key + 40], %f8

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$key + 48], %f10 ! round[3]
 ldd [$key + 56], %f12

 ldx [$inp - 16], %o0
 ldx [$inp - 8], %o1
 brz $ileft, .Lcbc_enc_aligned_inp
 movrz $len, 0, $inc

 ldx [$inp], %o2
 sllx %o0, $ileft, %o0
 srlx %o1, $iright, %g1
 sllx %o1, $ileft, %o1
 or %g1, %o0, %o0
 srlx %o2, $iright, %o2
 or %o2, %o1, %o1

.Lcbc_enc_aligned_inp:
 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2
 ldd [$key + 64], %f6 ! round[4]
 ldd [$key + 72], %f8
 add $key, 64, $end
 sub $rounds, 16*8, $inner

 stx %o0, [%sp + LOCALS + 0]
 stx %o1, [%sp + LOCALS + 8]
 add $inp, $inc, $inp ! inp+=16
 nop

.Lcbc_enc_unaligned:
 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$end + 16], %f10
 ldd [$end + 24], %f12
 add $end, 32, $end

 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2
 ldd [$end + 0], %f6
 ldd [$end + 8], %f8

 brnz,a $inner, .Lcbc_enc_unaligned
 sub $inner, 16*2, $inner

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$end + 16], %f10 ! round[last-1]
 ldd [$end + 24], %f12

 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2

 ldd [%sp + LOCALS + 0], $in0
 ldd [%sp + LOCALS + 8], $in1

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 fxor $r0hi, $in0, $in0 ! inp^=round[0]
 fxor $r0lo, $in1, $in1

 fmovd %f0, %f4
 faesenclx %f2, $rlhi, %f0
 faesenclx %f4, $rllo, %f2

 fshiftorx $outhead, %f0, $fshift, %f6
 fshiftorx %f0, %f2, $fshift, %f8
 std %f6, [$out + 0]
 std %f8, [$out + 8]
 add $out, 16, $out

 brnz,a $len, .Loop_cbc_enc_unaligned_out
 sub $len, 1, $len

.Lcbc_enc_unaligned_out_done:
 fshiftorx %f2, %f2, $fshift, %f8
 stda %f8, [$out + $mask]0xc0 ! partial store

 st %f0, [$ivp + 0] ! output ivec
 st %f1, [$ivp + 4]
 st %f2, [$ivp + 8]
 st %f3, [$ivp + 12]

 ret
 restore

.align 32
.Lcbc_decrypt:
 fshiftorx $in0, $in1, $fshift, $in0
 fshiftorx $in1, $intail, $fshift, $in1
 fmovd %f0, $iv0
 fmovd %f2, $iv1

.Loop_cbc_dec:
 fxor $in0, $r0hi, %f0 ! inp^round[0]
 fxor $in1, $r0lo, %f2
 ldd [$key + 32], %f6 ! round[2]
 ldd [$key + 40], %f8
 add $key, 32, $end
 sub $rounds, 16*6, $inner

.Lcbc_dec:
 fmovd %f0, %f4
 faesdecx %f2, %f10, %f0
 faesdecx %f4, %f12, %f2
 ldd [$end + 16], %f10
 ldd [$end + 24], %f12
 add $end, 32, $end

 fmovd %f0, %f4
 faesdecx %f2, %f6, %f0
 faesdecx %f4, %f8, %f2
 ldd [$end + 0], %f6
 ldd [$end + 8], %f8

 brnz,a $inner, .Lcbc_dec
 sub $inner, 16*2, $inner

 fmovd %f0, %f4
 faesdecx %f2, %f10, %f0
 faesdecx %f4, %f12, %f2
 ldd [$end + 16], %f10 ! round[last-1]
 ldd [$end + 24], %f12

 fmovd %f0, %f4
 faesdecx %f2, %f6, %f0
 faesdecx %f4, %f8, %f2
 fxor $iv0, $rlhi, %f6 ! ivec^round[last]
 fxor $iv1, $rllo, %f8
 fmovd $in0, $iv0
 fmovd $in1, $iv1

 movrz $len, 0, $inc
 fmovd $intail, $in0
 ldd [$inp - 8], $in1 ! load next input block
 ldda [$inp]0x82, $intail ! non-faulting load
 add $inp, $inc, $inp ! inp+=16

 fmovd %f0, %f4
 faesdecx %f2, %f10, %f0
 faesdecx %f4, %f12, %f2
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 fshiftorx $in0, $in1, $fshift, $in0
 fshiftorx $in1, $intail, $fshift, $in1

 fmovd %f0, %f4
 faesdeclx %f2, %f6, %f0
 faesdeclx %f4, %f8, %f2

 brnz,pn $oalign, .Lcbc_dec_unaligned_out
 nop

 std %f0, [$out + 0]
 std %f2, [$out + 8]
 add $out, 16, $out

 brnz,a $len, .Loop_cbc_dec
 sub $len, 1, $len

 st $iv0, [$ivp + 0] ! output ivec
 st $iv0#lo, [$ivp + 4]
 st $iv1, [$ivp + 8]
 st $iv1#lo, [$ivp + 12]

 ret
 restore

.align 32
.Lcbc_dec_unaligned_out:
 ldd [%o7 + $mask], $fshift ! shift right params
 mov 0xff, $mask
 srl $mask, $oalign, $mask
 sub %g0, $ileft, $iright

 fshiftorx %f0, %f0, $fshift, %f6
 fshiftorx %f0, %f2, $fshift, %f8

 stda %f6, [$out + $mask]0xc0 ! partial store
 orn %g0, $mask, $mask
 std %f8, [$out + 8]
 add $out, 16, $out
 brz $len, .Lcbc_dec_unaligned_out_done
 sub $len, 1, $len
 b .Loop_cbc_dec_unaligned_out
 nop

.align 32
.Loop_cbc_dec_unaligned_out:
 fmovd %f2, $outhead
 fxor $in0, $r0hi, %f0 ! inp^round[0]
 fxor $in1, $r0lo, %f2
 ldd [$key + 32], %f6 ! round[2]
 ldd [$key + 40], %f8

 fmovd %f0, %f4
 faesdecx %f2, %f10, %f0
 faesdecx %f4, %f12, %f2
 ldd [$key + 48], %f10 ! round[3]
 ldd [$key + 56], %f12

 ldx [$inp - 16], %o0
 ldx [$inp - 8], %o1
 brz $ileft, .Lcbc_dec_aligned_inp
 movrz $len, 0, $inc

 ldx [$inp], %o2
 sllx %o0, $ileft, %o0
 srlx %o1, $iright, %g1
 sllx %o1, $ileft, %o1
 or %g1, %o0, %o0
 srlx %o2, $iright, %o2
 or %o2, %o1, %o1

.Lcbc_dec_aligned_inp:
 fmovd %f0, %f4
 faesdecx %f2, %f6, %f0
 faesdecx %f4, %f8, %f2
 ldd [$key + 64], %f6 ! round[4]
 ldd [$key + 72], %f8
 add $key, 64, $end
 sub $rounds, 16*8, $inner

 stx %o0, [%sp + LOCALS + 0]
 stx %o1, [%sp + LOCALS + 8]
 add $inp, $inc, $inp ! inp+=16
 nop

.Lcbc_dec_unaligned:
 fmovd %f0, %f4
 faesdecx %f2, %f10, %f0
 faesdecx %f4, %f12, %f2
 ldd [$end + 16], %f10
 ldd [$end + 24], %f12
 add $end, 32, $end

 fmovd %f0, %f4
 faesdecx %f2, %f6, %f0
 faesdecx %f4, %f8, %f2
 ldd [$end + 0], %f6
 ldd [$end + 8], %f8

 brnz,a $inner, .Lcbc_dec_unaligned
 sub $inner, 16*2, $inner

 fmovd %f0, %f4
 faesdecx %f2, %f10, %f0
 faesdecx %f4, %f12, %f2
 ldd [$end + 16], %f10 ! round[last-1]
 ldd [$end + 24], %f12

 fmovd %f0, %f4
 faesdecx %f2, %f6, %f0
 faesdecx %f4, %f8, %f2

 fxor $iv0, $rlhi, %f6 ! ivec^round[last]
 fxor $iv1, $rllo, %f8
 fmovd $in0, $iv0
 fmovd $in1, $iv1
 ldd [%sp + LOCALS + 0], $in0
 ldd [%sp + LOCALS + 8], $in1

 fmovd %f0, %f4
 faesdecx %f2, %f10, %f0
 faesdecx %f4, %f12, %f2
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 fmovd %f0, %f4
 faesdeclx %f2, %f6, %f0
 faesdeclx %f4, %f8, %f2

 fshiftorx $outhead, %f0, $fshift, %f6
 fshiftorx %f0, %f2, $fshift, %f8
 std %f6, [$out + 0]
 std %f8, [$out + 8]
 add $out, 16, $out

 brnz,a $len, .Loop_cbc_dec_unaligned_out
 sub $len, 1, $len

.Lcbc_dec_unaligned_out_done:
 fshiftorx %f2, %f2, $fshift, %f8
 stda %f8, [$out + $mask]0xc0 ! partial store

 st $iv0, [$ivp + 0] ! output ivec
 st $iv0#lo, [$ivp + 4]
 st $iv1, [$ivp + 8]
 st $iv1#lo, [$ivp + 12]

 ret
 restore
.type aes_fx_cbc_encrypt,#function
.size aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
___
}
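# The CBC loops above software-pipeline the input stream: the block at
# [$inp] is prefetched with a no-fault ASI load ("ldda [$inp]0x82") while
# the current block is being processed, and "movrz $len, 0, $inc" stops
# advancing the pointer on the final block, so the one-ahead load cannot
# trap even when it would cross into an unmapped page.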
{
my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
 = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign, $oalign);
my $one = "%f14";

$code.=<<___;
.globl aes_fx_ctr32_encrypt_blocks
.align 32
aes_fx_ctr32_encrypt_blocks:
 save %sp, -STACK_FRAME-16, %sp
 srln $len, 0, $len
 and $inp, 7, $ialign
 andn $inp, 7, $inp
 brz,pn $len, .Lctr32_no_data
 sll $ialign, 3, $ileft

.Lpic: call .+8
 add %o7, .Linp_align - .Lpic, %o7

 ld [$key + 240], $rounds
 and $out, 7, $oalign
 ld [$ivp + 0], $ctr0 ! load counter
 andn $out, 7, $out
 ld [$ivp + 4], $ctr0#lo
 sll $oalign, 3, $mask
 ld [$ivp + 8], $ctr1
 ld [$ivp + 12], $ctr1#lo
 ldd [%o7 + 128], $one

 sll $rounds, 4, $rounds
 add $rounds, $key, $end
 ldd [$key + 0], $r0hi ! round[0]
 ldd [$key + 8], $r0lo

 add $inp, 16, $inp
 sub $len, 1, $len
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 mov 16, $inc
 movrz $len, 0, $inc
 ldd [$end + 0], $rlhi ! round[last]
 ldd [$end + 8], $rllo

 ldd [%o7 + $ileft], $fshift ! shift left params
 add %o7, 64, %o7
 ldd [$inp - 16], $in0 ! load input
 ldd [$inp - 8], $in1
 ldda [$inp]0x82, $intail ! non-faulting load
 add $inp, $inc, $inp ! inp+=16

 fshiftorx $in0, $in1, $fshift, $in0
 fshiftorx $in1, $intail, $fshift, $in1

.Loop_ctr32:
 fxor $ctr0, $r0hi, %f0 ! counter^round[0]
 fxor $ctr1, $r0lo, %f2
 ldd [$key + 32], %f6 ! round[2]
 ldd [$key + 40], %f8
 add $key, 32, $end
 sub $rounds, 16*6, $inner

.Lctr32_enc:
 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$end + 16], %f10
 ldd [$end + 24], %f12
 add $end, 32, $end

 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2
 ldd [$end + 0], %f6
 ldd [$end + 8], %f8

 brnz,a $inner, .Lctr32_enc
 sub $inner, 16*2, $inner

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$end + 16], %f10 ! round[last-1]
 ldd [$end + 24], %f12

 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2
 fxor $in0, $rlhi, %f6 ! inp^round[last]
 fxor $in1, $rllo, %f8

 movrz $len, 0, $inc
 fmovd $intail, $in0
 ldd [$inp - 8], $in1 ! load next input block
 ldda [$inp]0x82, $intail ! non-faulting load
 add $inp, $inc, $inp ! inp+=16

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 fshiftorx $in0, $in1, $fshift, $in0
 fshiftorx $in1, $intail, $fshift, $in1
 fpadd32 $ctr1, $one, $ctr1 ! increment counter

 fmovd %f0, %f4
 faesenclx %f2, %f6, %f0
 faesenclx %f4, %f8, %f2

 brnz,pn $oalign, .Lctr32_unaligned_out
 nop

 std %f0, [$out + 0]
 std %f2, [$out + 8]
 add $out, 16, $out

 brnz,a $len, .Loop_ctr32
 sub $len, 1, $len

.Lctr32_no_data:
 ret
 restore

.align 32
.Lctr32_unaligned_out:
 ldd [%o7 + $mask], $fshift ! shift right params
 mov 0xff, $mask
 srl $mask, $oalign, $mask
 sub %g0, $ileft, $iright

 fshiftorx %f0, %f0, $fshift, %f6
 fshiftorx %f0, %f2, $fshift, %f8

 stda %f6, [$out + $mask]0xc0 ! partial store
 orn %g0, $mask, $mask
 std %f8, [$out + 8]
 add $out, 16, $out
 brz $len, .Lctr32_unaligned_out_done
 sub $len, 1, $len
 b .Loop_ctr32_unaligned_out
 nop

.align 32
.Loop_ctr32_unaligned_out:
 fmovd %f2, $outhead
 fxor $ctr0, $r0hi, %f0 ! counter^round[0]
 fxor $ctr1, $r0lo, %f2
 ldd [$key + 32], %f6 ! round[2]
 ldd [$key + 40], %f8

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$key + 48], %f10 ! round[3]
 ldd [$key + 56], %f12

 ldx [$inp - 16], %o0
 ldx [$inp - 8], %o1
 brz $ileft, .Lctr32_aligned_inp
 movrz $len, 0, $inc

 ldx [$inp], %o2
 sllx %o0, $ileft, %o0
 srlx %o1, $iright, %g1
 sllx %o1, $ileft, %o1
 or %g1, %o0, %o0
 srlx %o2, $iright, %o2
 or %o2, %o1, %o1

.Lctr32_aligned_inp:
 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2
 ldd [$key + 64], %f6 ! round[4]
 ldd [$key + 72], %f8
 add $key, 64, $end
 sub $rounds, 16*8, $inner

 stx %o0, [%sp + LOCALS + 0]
 stx %o1, [%sp + LOCALS + 8]
 add $inp, $inc, $inp ! inp+=16
 nop

.Lctr32_enc_unaligned:
 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$end + 16], %f10
 ldd [$end + 24], %f12
 add $end, 32, $end

 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2
 ldd [$end + 0], %f6
 ldd [$end + 8], %f8

 brnz,a $inner, .Lctr32_enc_unaligned
 sub $inner, 16*2, $inner

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$end + 16], %f10 ! round[last-1]
 ldd [$end + 24], %f12
 fpadd32 $ctr1, $one, $ctr1 ! increment counter

 fmovd %f0, %f4
 faesencx %f2, %f6, %f0
 faesencx %f4, %f8, %f2
 fxor $in0, $rlhi, %f6 ! inp^round[last]
 fxor $in1, $rllo, %f8
 ldd [%sp + LOCALS + 0], $in0
 ldd [%sp + LOCALS + 8], $in1

 fmovd %f0, %f4
 faesencx %f2, %f10, %f0
 faesencx %f4, %f12, %f2
 ldd [$key + 16], %f10 ! round[1]
 ldd [$key + 24], %f12

 fmovd %f0, %f4
 faesenclx %f2, %f6, %f0
 faesenclx %f4, %f8, %f2

 fshiftorx $outhead, %f0, $fshift, %f6
 fshiftorx %f0, %f2, $fshift, %f8
 std %f6, [$out + 0]
 std %f8, [$out + 8]
 add $out, 16, $out

 brnz,a $len, .Loop_ctr32_unaligned_out
 sub $len, 1, $len

.Lctr32_unaligned_out_done:
 fshiftorx %f2, %f2, $fshift, %f8
 stda %f8, [$out + $mask]0xc0 ! partial store

 ret
 restore
.type aes_fx_ctr32_encrypt_blocks,#function
.size aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks

.align 32
.Linp_align: ! fshiftorx parameters for left shift toward %rs1
 .byte 0, 0, 64, 0, 0, 64, 0, -64
 .byte 0, 0, 56, 8, 0, 56, 8, -56
 .byte 0, 0, 48, 16, 0, 48, 16, -48
 .byte 0, 0, 40, 24, 0, 40, 24, -40
 .byte 0, 0, 32, 32, 0, 32, 32, -32
 .byte 0, 0, 24, 40, 0, 24, 40, -24
 .byte 0, 0, 16, 48, 0, 16, 48, -16
 .byte 0, 0, 8, 56, 0, 8, 56, -8
.Lout_align: ! fshiftorx parameters for right shift toward %rs2
 .byte 0, 0, 0, 64, 0, 0, 64, 0
 .byte 0, 0, 8, 56, 0, 8, 56, -8
 .byte 0, 0, 16, 48, 0, 16, 48, -16
 .byte 0, 0, 24, 40, 0, 24, 40, -24
 .byte 0, 0, 32, 32, 0, 32, 32, -32
 .byte 0, 0, 40, 24, 0, 40, 24, -40
 .byte 0, 0, 48, 16, 0, 48, 16, -48
 .byte 0, 0, 56, 8, 0, 56, 8, -56
.Lone:
 .word 0, 1
.asciz "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
}
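# The .Linp_align/.Lout_align tables above provide one 8-byte fshiftorx
# parameter record per possible byte misalignment (0-7); callers index
# them with the misalignment scaled by 8 ("sll $tmp, 3, $tmp"). .Lone,
# at offset 128 from .Linp_align, is the counter increment that the CTR
# entry code picks up with "ldd [%o7 + 128], $one".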
# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that the module can be compiled without specifying VIS extensions on
# the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. The idea is to
# keep open the option of producing a "universal" binary and to let the
# programmer detect at run-time whether the current CPU is VIS-capable.
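# For instance, unvis() below rewrites "fpadd32 %f16,%f14,%f16" as a raw
# .word with the operands packed into the FPop bit fields. A one-off
# illustration (hypothetical driver, not part of this script; the hex
# value follows from the formula in unvis()):
#
#   print unvis("fpadd32", "%f16", "%f14", "%f16"), "\n";
#   # roughly prints: .word 0xa1b40a4e !fpadd32 %f16,%f14,%f16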
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "faligndata" => 0x048,
               "bshuffle"   => 0x04c,
               "fpadd32"    => 0x052,
               "fxor"       => 0x06c,
               "fsrc2"      => 0x078 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "alignaddr"  => 0x018,
               "bmask"      => 0x019,
               "alignaddrl" => 0x01a );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%([goli])([0-9])/);
            $_=$bias{$1}+$2;
        }

        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

sub unfx {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = ( "faesencx"  => 0x90,
               "faesdecx"  => 0x91,
               "faesenclx" => 0x92,
               "faesdeclx" => 0x93,
               "faeskeyx"  => 0x94 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        $rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
        $rs2 = oct($rs2) if ($rs2 =~ /^0/);

        foreach ($rs1,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}
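# Note that for faeskeyx the second operand is often an immediate
# round-constant selector (the `0x10+$i` values in the key-setup loops
# above) rather than a register; the oct() conversion in unfx() turns
# such literals into numbers before they are packed into the low bits
# of the encoding.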

sub unfx3src {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = ( "fshiftorx" => 0x0b );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rs3,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

# Post-process the accumulated $code: evaluate backticked constant
# expressions, translate the %fN#lo notation, and replace FX/VIS
# mnemonics with raw encodings via the helpers above.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;                  # expand `...` constant expressions

    s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;    # %fN#lo -> %f(N+1), the odd half

    s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
     &unfx($1,$2,$3,$4)
    /ge or
    s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
     &unfx3src($1,$2,$3,$4,$5)
    /ge or
    s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
     &unvis($1,$2,$3,$4)
    /ge or
    s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
     &unvis3($1,$2,$3,$4)
    /ge;
    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";