source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/perlasm/sparcv9_modes.pl@91772

Last change on this file since 91772 was 91772, checked in by vboxsync, 3 years ago:
openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# Specific modes implementations for SPARC Architecture 2011. There
# is a T4 dependency, though: an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather monocultural,
# we assume that a processor capable of executing crypto instructions
# can handle the ASI in question as well. This means that we ought to
# keep our eyes open when new processors emerge...
#
# As for the above-mentioned ASI: it's the so-called "block-initializing
# store", which cancels the "read" in "read-update-write" on cache lines.
# This is a "cooperative" optimization, as it reduces overall pressure
# on the memory interface. The benefits can't be observed/quantified with
# the usual benchmarks; on the contrary, you may notice that single-thread
# performance for parallelizable modes is ~1.5% worse for the largest
# block sizes [though a few percent better for shorter ones]. All of
# this is based on suggestions from David Miller.

$::bias="STACK_BIAS";
$::frame="STACK_FRAME";
$::size_t_cc="SIZE_T_CC";

sub asm_init {  # to be called with @ARGV as argument
    for (@_)            { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
    if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
    else                { $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}

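# Minimal caller sketch (an assumption about how the algorithm-specific
# t4 scripts drive this module, shown for orientation only; it is not
# part of the file itself):
#
#   $::evp = 1;                             # generate EVP-style entry points
#   require "sparcv9_modes.pl";
#   &asm_init(@ARGV);                       # pick 32-/64-bit ABI constants
#   &alg_cbc_encrypt_implement("aes", 128); # emits aes128_t4_cbc_encrypt
#   &emit_assembler();                      # encode VIS opcodes, print $::code
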
# unified interface
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));

sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl ${alg}${bits}_t4_cbc_encrypt
.align 32
${alg}${bits}_t4_cbc_encrypt:
        save %sp, -$::frame, %sp
        cmp $len, 0
        be,pn $::size_t_cc, .L${bits}_cbc_enc_abort
        srln $len, 0, $len ! needed on v8+, "nop" on v9
        sub $inp, $out, $blk_init ! $inp!=$out
___
$::code.=<<___ if (!$::evp);
        andcc $ivec, 7, $ivoff
        alignaddr $ivec, %g0, $ivec

        ldd [$ivec + 0], %f0 ! load ivec
        bz,pt %icc, 1f
        ldd [$ivec + 8], %f2
        ldd [$ivec + 16], %f4
        faligndata %f0, %f2, %f0
        faligndata %f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
        ld [$ivec + 0], %f0
        ld [$ivec + 4], %f1
        ld [$ivec + 8], %f2
        ld [$ivec + 12], %f3
___
$::code.=<<___;
        prefetch [$inp], 20
        prefetch [$inp + 63], 20
        call _${alg}${bits}_load_enckey
        and $inp, 7, $ileft
        andn $inp, 7, $inp
        sll $ileft, 3, $ileft
        mov 64, $iright
        mov 0xff, $omask
        sub $iright, $ileft, $iright
        and $out, 7, $ooff
        cmp $len, 127
        movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
        movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
        brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
        srl $omask, $ooff, $omask

        alignaddrl $out, %g0, $out
        srlx $len, 4, $len
        prefetch [$out], 22

.L${bits}_cbc_enc_loop:
        ldx [$inp + 0], %o0
        brz,pt $ileft, 4f
        ldx [$inp + 8], %o1

        ldx [$inp + 16], %o2
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        sllx %o1, $ileft, %o1
        or %g1, %o0, %o0
        srlx %o2, $iright, %o2
        or %o2, %o1, %o1
4:
        xor %g4, %o0, %o0 ! ^= rk[0]
        xor %g5, %o1, %o1
        movxtod %o0, %f12
        movxtod %o1, %f14

        fxor %f12, %f0, %f0 ! ^= ivec
        fxor %f14, %f2, %f2
        prefetch [$out + 63], 22
        prefetch [$inp + 16+63], 20
        call _${alg}${bits}_encrypt_1x
        add $inp, 16, $inp

        brnz,pn $ooff, 2f
        sub $len, 1, $len

        std %f0, [$out + 0]
        std %f2, [$out + 8]
        brnz,pt $len, .L${bits}_cbc_enc_loop
        add $out, 16, $out
___
$::code.=<<___ if ($::evp);
        st %f0, [$ivec + 0]
        st %f1, [$ivec + 4]
        st %f2, [$ivec + 8]
        st %f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
        brnz,pn $ivoff, 3f
        nop

        std %f0, [$ivec + 0] ! write out ivec
        std %f2, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_enc_abort:
        ret
        restore

.align 16
2:      ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
                             ! and ~3x deterioration
                             ! in inp==out case
        faligndata %f0, %f0, %f4 ! handle unaligned output
        faligndata %f0, %f2, %f6
        faligndata %f2, %f2, %f8

        stda %f4, [$out + $omask]0xc0 ! partial store
        std %f6, [$out + 8]
        add $out, 16, $out
        orn %g0, $omask, $omask
        stda %f8, [$out + $omask]0xc0 ! partial store

        brnz,pt $len, .L${bits}_cbc_enc_loop+4
        orn %g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
        st %f0, [$ivec + 0]
        st %f1, [$ivec + 4]
        st %f2, [$ivec + 8]
        st %f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
        brnz,pn $ivoff, 3f
        nop

        std %f0, [$ivec + 0] ! write out ivec
        std %f2, [$ivec + 8]
        ret
        restore

.align 16
3:      alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
        mov 0xff, $omask
        srl $omask, $ivoff, $omask
        faligndata %f0, %f0, %f4
        faligndata %f0, %f2, %f6
        faligndata %f2, %f2, %f8
        stda %f4, [$ivec + $omask]0xc0
        std %f6, [$ivec + 8]
        add $ivec, 16, $ivec
        orn %g0, $omask, $omask
        stda %f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
        ret
        restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}cbc_enc_blk:
        add $out, $len, $blk_init
        and $blk_init, 63, $blk_init ! tail
        sub $len, $blk_init, $len
        add $blk_init, 15, $blk_init ! round up to 16n
        srlx $len, 4, $len
        srl $blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
        ldx [$inp + 0], %o0
        brz,pt $ileft, 5f
        ldx [$inp + 8], %o1

        ldx [$inp + 16], %o2
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        sllx %o1, $ileft, %o1
        or %g1, %o0, %o0
        srlx %o2, $iright, %o2
        or %o2, %o1, %o1
5:
        xor %g4, %o0, %o0 ! ^= rk[0]
        xor %g5, %o1, %o1
        movxtod %o0, %f12
        movxtod %o1, %f14

        fxor %f12, %f0, %f0 ! ^= ivec
        fxor %f14, %f2, %f2
        prefetch [$inp + 16+63], 20
        call _${alg}${bits}_encrypt_1x
        add $inp, 16, $inp
        sub $len, 1, $len

        stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        brnz,pt $len, .L${bits}_cbc_enc_blk_loop
        add $out, 8, $out

        membar #StoreLoad|#StoreStore
        brnz,pt $blk_init, .L${bits}_cbc_enc_loop
        mov $blk_init, $len
___
$::code.=<<___ if ($::evp);
        st %f0, [$ivec + 0]
        st %f1, [$ivec + 4]
        st %f2, [$ivec + 8]
        st %f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
        brnz,pn $ivoff, 3b
        nop

        std %f0, [$ivec + 0] ! write out ivec
        std %f2, [$ivec + 8]
___
$::code.=<<___;
        ret
        restore
.type ${alg}${bits}_t4_cbc_encrypt,#function
.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}

sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl ${alg}${bits}_t4_cbc_decrypt
.align 32
${alg}${bits}_t4_cbc_decrypt:
        save %sp, -$::frame, %sp
        cmp $len, 0
        be,pn $::size_t_cc, .L${bits}_cbc_dec_abort
        srln $len, 0, $len ! needed on v8+, "nop" on v9
        sub $inp, $out, $blk_init ! $inp!=$out
___
$::code.=<<___ if (!$::evp);
        andcc $ivec, 7, $ivoff
        alignaddr $ivec, %g0, $ivec

        ldd [$ivec + 0], %f12 ! load ivec
        bz,pt %icc, 1f
        ldd [$ivec + 8], %f14
        ldd [$ivec + 16], %f0
        faligndata %f12, %f14, %f12
        faligndata %f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
        ld [$ivec + 0], %f12 ! load ivec
        ld [$ivec + 4], %f13
        ld [$ivec + 8], %f14
        ld [$ivec + 12], %f15
___
$::code.=<<___;
        prefetch [$inp], 20
        prefetch [$inp + 63], 20
        call _${alg}${bits}_load_deckey
        and $inp, 7, $ileft
        andn $inp, 7, $inp
        sll $ileft, 3, $ileft
        mov 64, $iright
        mov 0xff, $omask
        sub $iright, $ileft, $iright
        and $out, 7, $ooff
        cmp $len, 255
        movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
        movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
        brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
        srl $omask, $ooff, $omask

        andcc $len, 16, %g0 ! is number of blocks even?
        srlx $len, 4, $len
        alignaddrl $out, %g0, $out
        bz %icc, .L${bits}_cbc_dec_loop2x
        prefetch [$out], 22
.L${bits}_cbc_dec_loop:
        ldx [$inp + 0], %o0
        brz,pt $ileft, 4f
        ldx [$inp + 8], %o1

        ldx [$inp + 16], %o2
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        sllx %o1, $ileft, %o1
        or %g1, %o0, %o0
        srlx %o2, $iright, %o2
        or %o2, %o1, %o1
4:
        xor %g4, %o0, %o2 ! ^= rk[0]
        xor %g5, %o1, %o3
        movxtod %o2, %f0
        movxtod %o3, %f2

        prefetch [$out + 63], 22
        prefetch [$inp + 16+63], 20
        call _${alg}${bits}_decrypt_1x
        add $inp, 16, $inp

        fxor %f12, %f0, %f0 ! ^= ivec
        fxor %f14, %f2, %f2
        movxtod %o0, %f12
        movxtod %o1, %f14

        brnz,pn $ooff, 2f
        sub $len, 1, $len

        std %f0, [$out + 0]
        std %f2, [$out + 8]
        brnz,pt $len, .L${bits}_cbc_dec_loop2x
        add $out, 16, $out
___
$::code.=<<___ if ($::evp);
        st %f12, [$ivec + 0]
        st %f13, [$ivec + 4]
        st %f14, [$ivec + 8]
        st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
        brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
        nop

        std %f12, [$ivec + 0] ! write out ivec
        std %f14, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_dec_abort:
        ret
        restore

.align 16
2:      ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
                             ! and ~3x deterioration
                             ! in inp==out case
        faligndata %f0, %f0, %f4 ! handle unaligned output
        faligndata %f0, %f2, %f6
        faligndata %f2, %f2, %f8

        stda %f4, [$out + $omask]0xc0 ! partial store
        std %f6, [$out + 8]
        add $out, 16, $out
        orn %g0, $omask, $omask
        stda %f8, [$out + $omask]0xc0 ! partial store

        brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
        orn %g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
        st %f12, [$ivec + 0]
        st %f13, [$ivec + 4]
        st %f14, [$ivec + 8]
        st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
        brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
        nop

        std %f12, [$ivec + 0] ! write out ivec
        std %f14, [$ivec + 8]
___
$::code.=<<___;
        ret
        restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_cbc_dec_loop2x:
        ldx [$inp + 0], %o0
        ldx [$inp + 8], %o1
        ldx [$inp + 16], %o2
        brz,pt $ileft, 4f
        ldx [$inp + 24], %o3

        ldx [$inp + 32], %o4
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        or %g1, %o0, %o0
        sllx %o1, $ileft, %o1
        srlx %o2, $iright, %g1
        or %g1, %o1, %o1
        sllx %o2, $ileft, %o2
        srlx %o3, $iright, %g1
        or %g1, %o2, %o2
        sllx %o3, $ileft, %o3
        srlx %o4, $iright, %o4
        or %o4, %o3, %o3
4:
        xor %g4, %o0, %o4 ! ^= rk[0]
        xor %g5, %o1, %o5
        movxtod %o4, %f0
        movxtod %o5, %f2
        xor %g4, %o2, %o4
        xor %g5, %o3, %o5
        movxtod %o4, %f4
        movxtod %o5, %f6

        prefetch [$out + 63], 22
        prefetch [$inp + 32+63], 20
        call _${alg}${bits}_decrypt_2x
        add $inp, 32, $inp

        movxtod %o0, %f8
        movxtod %o1, %f10
        fxor %f12, %f0, %f0 ! ^= ivec
        fxor %f14, %f2, %f2
        movxtod %o2, %f12
        movxtod %o3, %f14
        fxor %f8, %f4, %f4
        fxor %f10, %f6, %f6

        brnz,pn $ooff, 2f
        sub $len, 2, $len

        std %f0, [$out + 0]
        std %f2, [$out + 8]
        std %f4, [$out + 16]
        std %f6, [$out + 24]
        brnz,pt $len, .L${bits}_cbc_dec_loop2x
        add $out, 32, $out
___
$::code.=<<___ if ($::evp);
        st %f12, [$ivec + 0]
        st %f13, [$ivec + 4]
        st %f14, [$ivec + 8]
        st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
        brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
        nop

        std %f12, [$ivec + 0] ! write out ivec
        std %f14, [$ivec + 8]
___
$::code.=<<___;
        ret
        restore

.align 16
2:      ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
                             ! and ~3x deterioration
                             ! in inp==out case
        faligndata %f0, %f0, %f8 ! handle unaligned output
        faligndata %f0, %f2, %f0
        faligndata %f2, %f4, %f2
        faligndata %f4, %f6, %f4
        faligndata %f6, %f6, %f6
        stda %f8, [$out + $omask]0xc0 ! partial store
        std %f0, [$out + 8]
        std %f2, [$out + 16]
        std %f4, [$out + 24]
        add $out, 32, $out
        orn %g0, $omask, $omask
        stda %f6, [$out + $omask]0xc0 ! partial store

        brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
        orn %g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
        st %f12, [$ivec + 0]
        st %f13, [$ivec + 4]
        st %f14, [$ivec + 8]
        st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
        brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
        nop

        std %f12, [$ivec + 0] ! write out ivec
        std %f14, [$ivec + 8]
        ret
        restore

.align 16
.L${bits}_cbc_dec_unaligned_ivec:
        alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
        mov 0xff, $omask
        srl $omask, $ivoff, $omask
        faligndata %f12, %f12, %f0
        faligndata %f12, %f14, %f2
        faligndata %f14, %f14, %f4
        stda %f0, [$ivec + $omask]0xc0
        std %f2, [$ivec + 8]
        add $ivec, 16, $ivec
        orn %g0, $omask, $omask
        stda %f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
        ret
        restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}cbc_dec_blk:
        add $out, $len, $blk_init
        and $blk_init, 63, $blk_init ! tail
        sub $len, $blk_init, $len
        add $blk_init, 15, $blk_init ! round up to 16n
        srlx $len, 4, $len
        srl $blk_init, 4, $blk_init
        sub $len, 1, $len
        add $blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
        ldx [$inp + 0], %o0
        ldx [$inp + 8], %o1
        ldx [$inp + 16], %o2
        brz,pt $ileft, 5f
        ldx [$inp + 24], %o3

        ldx [$inp + 32], %o4
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        or %g1, %o0, %o0
        sllx %o1, $ileft, %o1
        srlx %o2, $iright, %g1
        or %g1, %o1, %o1
        sllx %o2, $ileft, %o2
        srlx %o3, $iright, %g1
        or %g1, %o2, %o2
        sllx %o3, $ileft, %o3
        srlx %o4, $iright, %o4
        or %o4, %o3, %o3
5:
        xor %g4, %o0, %o4 ! ^= rk[0]
        xor %g5, %o1, %o5
        movxtod %o4, %f0
        movxtod %o5, %f2
        xor %g4, %o2, %o4
        xor %g5, %o3, %o5
        movxtod %o4, %f4
        movxtod %o5, %f6

        prefetch [$inp + 32+63], 20
        call _${alg}${bits}_decrypt_2x
        add $inp, 32, $inp
        subcc $len, 2, $len

        movxtod %o0, %f8
        movxtod %o1, %f10
        fxor %f12, %f0, %f0 ! ^= ivec
        fxor %f14, %f2, %f2
        movxtod %o2, %f12
        movxtod %o3, %f14
        fxor %f8, %f4, %f4
        fxor %f10, %f6, %f6

        stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
        add $out, 8, $out

        add $blk_init, $len, $len
        andcc $len, 1, %g0 ! is number of blocks even?
        membar #StoreLoad|#StoreStore
        bnz,pt %icc, .L${bits}_cbc_dec_loop
        srl $len, 0, $len
        brnz,pn $len, .L${bits}_cbc_dec_loop2x
        nop
___
$::code.=<<___ if ($::evp);
        st %f12, [$ivec + 0] ! write out ivec
        st %f13, [$ivec + 4]
        st %f14, [$ivec + 8]
        st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
        brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
        nop

        std %f12, [$ivec + 0] ! write out ivec
        std %f14, [$ivec + 8]
___
$::code.=<<___;
        ret
        restore
.type ${alg}${bits}_t4_cbc_decrypt,#function
.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}

sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl ${alg}${bits}_t4_ctr32_encrypt
.align 32
${alg}${bits}_t4_ctr32_encrypt:
        save %sp, -$::frame, %sp
        srln $len, 0, $len ! needed on v8+, "nop" on v9

        prefetch [$inp], 20
        prefetch [$inp + 63], 20
        call _${alg}${bits}_load_enckey
        sllx $len, 4, $len

        ld [$ivec + 0], %l4 ! counter
        ld [$ivec + 4], %l5
        ld [$ivec + 8], %l6
        ld [$ivec + 12], %l7

        sllx %l4, 32, %o5
        or %l5, %o5, %o5
        sllx %l6, 32, %g1
        xor %o5, %g4, %g4 ! ^= rk[0]
        xor %g1, %g5, %g5
        movxtod %g4, %f14 ! most significant 64 bits

        sub $inp, $out, $blk_init ! $inp!=$out
        and $inp, 7, $ileft
        andn $inp, 7, $inp
        sll $ileft, 3, $ileft
        mov 64, $iright
        mov 0xff, $omask
        sub $iright, $ileft, $iright
        and $out, 7, $ooff
        cmp $len, 255
        movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
        movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
        brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
        srl $omask, $ooff, $omask

        andcc $len, 16, %g0 ! is number of blocks even?
        alignaddrl $out, %g0, $out
        bz %icc, .L${bits}_ctr32_loop2x
        srlx $len, 4, $len
.L${bits}_ctr32_loop:
        ldx [$inp + 0], %o0
        brz,pt $ileft, 4f
        ldx [$inp + 8], %o1

        ldx [$inp + 16], %o2
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        sllx %o1, $ileft, %o1
        or %g1, %o0, %o0
        srlx %o2, $iright, %o2
        or %o2, %o1, %o1
4:
        xor %g5, %l7, %g1 ! ^= rk[0]
        add %l7, 1, %l7
        movxtod %g1, %f2
        srl %l7, 0, %l7 ! clruw
        prefetch [$out + 63], 22
        prefetch [$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
        aes_eround01 %f16, %f14, %f2, %f4
        aes_eround23 %f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
        camellia_f %f16, %f2, %f14, %f2
        camellia_f %f18, %f14, %f2, %f0
___
$::code.=<<___;
        call _${alg}${bits}_encrypt_1x+8
        add $inp, 16, $inp

        movxtod %o0, %f10
        movxtod %o1, %f12
        fxor %f10, %f0, %f0 ! ^= inp
        fxor %f12, %f2, %f2

        brnz,pn $ooff, 2f
        sub $len, 1, $len

        std %f0, [$out + 0]
        std %f2, [$out + 8]
        brnz,pt $len, .L${bits}_ctr32_loop2x
        add $out, 16, $out

        ret
        restore

.align 16
2:      ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
                             ! and ~3x deterioration
                             ! in inp==out case
        faligndata %f0, %f0, %f4 ! handle unaligned output
        faligndata %f0, %f2, %f6
        faligndata %f2, %f2, %f8
        stda %f4, [$out + $omask]0xc0 ! partial store
        std %f6, [$out + 8]
        add $out, 16, $out
        orn %g0, $omask, $omask
        stda %f8, [$out + $omask]0xc0 ! partial store

        brnz,pt $len, .L${bits}_ctr32_loop2x+4
        orn %g0, $omask, $omask

        ret
        restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_ctr32_loop2x:
        ldx [$inp + 0], %o0
        ldx [$inp + 8], %o1
        ldx [$inp + 16], %o2
        brz,pt $ileft, 4f
        ldx [$inp + 24], %o3

        ldx [$inp + 32], %o4
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        or %g1, %o0, %o0
        sllx %o1, $ileft, %o1
        srlx %o2, $iright, %g1
        or %g1, %o1, %o1
        sllx %o2, $ileft, %o2
        srlx %o3, $iright, %g1
        or %g1, %o2, %o2
        sllx %o3, $ileft, %o3
        srlx %o4, $iright, %o4
        or %o4, %o3, %o3
4:
        xor %g5, %l7, %g1 ! ^= rk[0]
        add %l7, 1, %l7
        movxtod %g1, %f2
        srl %l7, 0, %l7 ! clruw
        xor %g5, %l7, %g1
        add %l7, 1, %l7
        movxtod %g1, %f6
        srl %l7, 0, %l7 ! clruw
        prefetch [$out + 63], 22
        prefetch [$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
        aes_eround01 %f16, %f14, %f2, %f8
        aes_eround23 %f18, %f14, %f2, %f2
        aes_eround01 %f16, %f14, %f6, %f10
        aes_eround23 %f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
        camellia_f %f16, %f2, %f14, %f2
        camellia_f %f16, %f6, %f14, %f6
        camellia_f %f18, %f14, %f2, %f0
        camellia_f %f18, %f14, %f6, %f4
___
$::code.=<<___;
        call _${alg}${bits}_encrypt_2x+16
        add $inp, 32, $inp

        movxtod %o0, %f8
        movxtod %o1, %f10
        movxtod %o2, %f12
        fxor %f8, %f0, %f0 ! ^= inp
        movxtod %o3, %f8
        fxor %f10, %f2, %f2
        fxor %f12, %f4, %f4
        fxor %f8, %f6, %f6

        brnz,pn $ooff, 2f
        sub $len, 2, $len

        std %f0, [$out + 0]
        std %f2, [$out + 8]
        std %f4, [$out + 16]
        std %f6, [$out + 24]
        brnz,pt $len, .L${bits}_ctr32_loop2x
        add $out, 32, $out

        ret
        restore

.align 16
2:      ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
                             ! and ~3x deterioration
                             ! in inp==out case
        faligndata %f0, %f0, %f8 ! handle unaligned output
        faligndata %f0, %f2, %f0
        faligndata %f2, %f4, %f2
        faligndata %f4, %f6, %f4
        faligndata %f6, %f6, %f6

        stda %f8, [$out + $omask]0xc0 ! partial store
        std %f0, [$out + 8]
        std %f2, [$out + 16]
        std %f4, [$out + 24]
        add $out, 32, $out
        orn %g0, $omask, $omask
        stda %f6, [$out + $omask]0xc0 ! partial store

        brnz,pt $len, .L${bits}_ctr32_loop2x+4
        orn %g0, $omask, $omask

        ret
        restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_ctr32_blk:
        add $out, $len, $blk_init
        and $blk_init, 63, $blk_init ! tail
        sub $len, $blk_init, $len
        add $blk_init, 15, $blk_init ! round up to 16n
        srlx $len, 4, $len
        srl $blk_init, 4, $blk_init
        sub $len, 1, $len
        add $blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
        ldx [$inp + 0], %o0
        ldx [$inp + 8], %o1
        ldx [$inp + 16], %o2
        brz,pt $ileft, 5f
        ldx [$inp + 24], %o3

        ldx [$inp + 32], %o4
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        or %g1, %o0, %o0
        sllx %o1, $ileft, %o1
        srlx %o2, $iright, %g1
        or %g1, %o1, %o1
        sllx %o2, $ileft, %o2
        srlx %o3, $iright, %g1
        or %g1, %o2, %o2
        sllx %o3, $ileft, %o3
        srlx %o4, $iright, %o4
        or %o4, %o3, %o3
5:
        xor %g5, %l7, %g1 ! ^= rk[0]
        add %l7, 1, %l7
        movxtod %g1, %f2
        srl %l7, 0, %l7 ! clruw
        xor %g5, %l7, %g1
        add %l7, 1, %l7
        movxtod %g1, %f6
        srl %l7, 0, %l7 ! clruw
        prefetch [$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
        aes_eround01 %f16, %f14, %f2, %f8
        aes_eround23 %f18, %f14, %f2, %f2
        aes_eround01 %f16, %f14, %f6, %f10
        aes_eround23 %f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
        camellia_f %f16, %f2, %f14, %f2
        camellia_f %f16, %f6, %f14, %f6
        camellia_f %f18, %f14, %f2, %f0
        camellia_f %f18, %f14, %f6, %f4
___
$::code.=<<___;
        call _${alg}${bits}_encrypt_2x+16
        add $inp, 32, $inp
        subcc $len, 2, $len

        movxtod %o0, %f8
        movxtod %o1, %f10
        movxtod %o2, %f12
        fxor %f8, %f0, %f0 ! ^= inp
        movxtod %o3, %f8
        fxor %f10, %f2, %f2
        fxor %f12, %f4, %f4
        fxor %f8, %f6, %f6

        stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
        add $out, 8, $out

        add $blk_init, $len, $len
        andcc $len, 1, %g0 ! is number of blocks even?
        membar #StoreLoad|#StoreStore
        bnz,pt %icc, .L${bits}_ctr32_loop
        srl $len, 0, $len
        brnz,pn $len, .L${bits}_ctr32_loop2x
        nop

        ret
        restore
.type ${alg}${bits}_t4_ctr32_encrypt,#function
.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}

sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl ${alg}${bits}_t4_xts_${dir}crypt
.align 32
${alg}${bits}_t4_xts_${dir}crypt:
        save %sp, -$::frame-16, %sp
        srln $len, 0, $len ! needed on v8+, "nop" on v9

        mov $ivec, %o0
        add %fp, $::bias-16, %o1
        call ${alg}_t4_encrypt
        mov $key2, %o2

        add %fp, $::bias-16, %l7
        ldxa [%l7]0x88, %g2
        add %fp, $::bias-8, %l7
        ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak

        sethi %hi(0x76543210), %l7
        or %l7, %lo(0x76543210), %l7
        bmask %l7, %g0, %g0 ! byte swap mask

        prefetch [$inp], 20
        prefetch [$inp + 63], 20
        call _${alg}${bits}_load_${dir}ckey
        and $len, 15, $rem
        and $len, -16, $len
___
$code.=<<___ if ($dir eq "de");
        mov 0, %l7
        movrnz $rem, 16, %l7
        sub $len, %l7, $len
___
$code.=<<___;

        sub $inp, $out, $blk_init ! $inp!=$out
        and $inp, 7, $ileft
        andn $inp, 7, $inp
        sll $ileft, 3, $ileft
        mov 64, $iright
        mov 0xff, $omask
        sub $iright, $ileft, $iright
        and $out, 7, $ooff
        cmp $len, 255
        movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
        movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
        brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
        srl $omask, $ooff, $omask

        andcc $len, 16, %g0 ! is number of blocks even?
___
$code.=<<___ if ($dir eq "de");
        brz,pn $len, .L${bits}_xts_${dir}steal
___
$code.=<<___;
        alignaddrl $out, %g0, $out
        bz %icc, .L${bits}_xts_${dir}loop2x
        srlx $len, 4, $len
.L${bits}_xts_${dir}loop:
        ldx [$inp + 0], %o0
        brz,pt $ileft, 4f
        ldx [$inp + 8], %o1

        ldx [$inp + 16], %o2
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        sllx %o1, $ileft, %o1
        or %g1, %o0, %o0
        srlx %o2, $iright, %o2
        or %o2, %o1, %o1
4:
        movxtod %g2, %f12
        movxtod %g3, %f14
        bshuffle %f12, %f12, %f12
        bshuffle %f14, %f14, %f14

        xor %g4, %o0, %o0 ! ^= rk[0]
        xor %g5, %o1, %o1
        movxtod %o0, %f0
        movxtod %o1, %f2

        fxor %f12, %f0, %f0 ! ^= tweak[0]
        fxor %f14, %f2, %f2

        prefetch [$out + 63], 22
        prefetch [$inp + 16+63], 20
        call _${alg}${bits}_${dir}crypt_1x
        add $inp, 16, $inp

        fxor %f12, %f0, %f0 ! ^= tweak[0]
        fxor %f14, %f2, %f2

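        ! Multiply the tweak by x in GF(2^128): shift the 128-bit value
        ! %g3:%g2 left by one bit (addcc/addxc propagate the carry) and,
        ! if the top bit fell out, fold it back in by XORing the
        ! reduction polynomial 0x87 into the low byte.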
        srax %g3, 63, %l7 ! next tweak value
        addcc %g2, %g2, %g2
        and %l7, 0x87, %l7
        addxc %g3, %g3, %g3
        xor %l7, %g2, %g2

        brnz,pn $ooff, 2f
        sub $len, 1, $len

        std %f0, [$out + 0]
        std %f2, [$out + 8]
        brnz,pt $len, .L${bits}_xts_${dir}loop2x
        add $out, 16, $out

        brnz,pn $rem, .L${bits}_xts_${dir}steal
        nop

        ret
        restore

.align 16
2:      ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
                             ! and ~3x deterioration
                             ! in inp==out case
        faligndata %f0, %f0, %f4 ! handle unaligned output
        faligndata %f0, %f2, %f6
        faligndata %f2, %f2, %f8
        stda %f4, [$out + $omask]0xc0 ! partial store
        std %f6, [$out + 8]
        add $out, 16, $out
        orn %g0, $omask, $omask
        stda %f8, [$out + $omask]0xc0 ! partial store

        brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
        orn %g0, $omask, $omask

        brnz,pn $rem, .L${bits}_xts_${dir}steal
        nop

        ret
        restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_xts_${dir}loop2x:
        ldx [$inp + 0], %o0
        ldx [$inp + 8], %o1
        ldx [$inp + 16], %o2
        brz,pt $ileft, 4f
        ldx [$inp + 24], %o3

        ldx [$inp + 32], %o4
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        or %g1, %o0, %o0
        sllx %o1, $ileft, %o1
        srlx %o2, $iright, %g1
        or %g1, %o1, %o1
        sllx %o2, $ileft, %o2
        srlx %o3, $iright, %g1
        or %g1, %o2, %o2
        sllx %o3, $ileft, %o3
        srlx %o4, $iright, %o4
        or %o4, %o3, %o3
4:
        movxtod %g2, %f12
        movxtod %g3, %f14
        bshuffle %f12, %f12, %f12
        bshuffle %f14, %f14, %f14

        srax %g3, 63, %l7 ! next tweak value
        addcc %g2, %g2, %g2
        and %l7, 0x87, %l7
        addxc %g3, %g3, %g3
        xor %l7, %g2, %g2

        movxtod %g2, %f8
        movxtod %g3, %f10
        bshuffle %f8, %f8, %f8
        bshuffle %f10, %f10, %f10

        xor %g4, %o0, %o0 ! ^= rk[0]
        xor %g5, %o1, %o1
        xor %g4, %o2, %o2 ! ^= rk[0]
        xor %g5, %o3, %o3
        movxtod %o0, %f0
        movxtod %o1, %f2
        movxtod %o2, %f4
        movxtod %o3, %f6

        fxor %f12, %f0, %f0 ! ^= tweak[0]
        fxor %f14, %f2, %f2
        fxor %f8, %f4, %f4 ! ^= tweak[0]
        fxor %f10, %f6, %f6

        prefetch [$out + 63], 22
        prefetch [$inp + 32+63], 20
        call _${alg}${bits}_${dir}crypt_2x
        add $inp, 32, $inp

        movxtod %g2, %f8
        movxtod %g3, %f10

        srax %g3, 63, %l7 ! next tweak value
        addcc %g2, %g2, %g2
        and %l7, 0x87, %l7
        addxc %g3, %g3, %g3
        xor %l7, %g2, %g2

        bshuffle %f8, %f8, %f8
        bshuffle %f10, %f10, %f10

        fxor %f12, %f0, %f0 ! ^= tweak[0]
        fxor %f14, %f2, %f2
        fxor %f8, %f4, %f4
        fxor %f10, %f6, %f6

        brnz,pn $ooff, 2f
        sub $len, 2, $len

        std %f0, [$out + 0]
        std %f2, [$out + 8]
        std %f4, [$out + 16]
        std %f6, [$out + 24]
        brnz,pt $len, .L${bits}_xts_${dir}loop2x
        add $out, 32, $out

        fsrc2 %f4, %f0
        fsrc2 %f6, %f2
        brnz,pn $rem, .L${bits}_xts_${dir}steal
        nop

        ret
        restore

.align 16
2:      ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
                             ! and ~3x deterioration
                             ! in inp==out case
        faligndata %f0, %f0, %f8 ! handle unaligned output
        faligndata %f0, %f2, %f10
        faligndata %f2, %f4, %f12
        faligndata %f4, %f6, %f14
        faligndata %f6, %f6, %f0

        stda %f8, [$out + $omask]0xc0 ! partial store
        std %f10, [$out + 8]
        std %f12, [$out + 16]
        std %f14, [$out + 24]
        add $out, 32, $out
        orn %g0, $omask, $omask
        stda %f0, [$out + $omask]0xc0 ! partial store

        brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
        orn %g0, $omask, $omask

        fsrc2 %f4, %f0
        fsrc2 %f6, %f2
        brnz,pn $rem, .L${bits}_xts_${dir}steal
        nop

        ret
        restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_xts_${dir}blk:
        add $out, $len, $blk_init
        and $blk_init, 63, $blk_init ! tail
        sub $len, $blk_init, $len
        add $blk_init, 15, $blk_init ! round up to 16n
        srlx $len, 4, $len
        srl $blk_init, 4, $blk_init
        sub $len, 1, $len
        add $blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
        ldx [$inp + 0], %o0
        ldx [$inp + 8], %o1
        ldx [$inp + 16], %o2
        brz,pt $ileft, 5f
        ldx [$inp + 24], %o3

        ldx [$inp + 32], %o4
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        or %g1, %o0, %o0
        sllx %o1, $ileft, %o1
        srlx %o2, $iright, %g1
        or %g1, %o1, %o1
        sllx %o2, $ileft, %o2
        srlx %o3, $iright, %g1
        or %g1, %o2, %o2
        sllx %o3, $ileft, %o3
        srlx %o4, $iright, %o4
        or %o4, %o3, %o3
5:
        movxtod %g2, %f12
        movxtod %g3, %f14
        bshuffle %f12, %f12, %f12
        bshuffle %f14, %f14, %f14

        srax %g3, 63, %l7 ! next tweak value
        addcc %g2, %g2, %g2
        and %l7, 0x87, %l7
        addxc %g3, %g3, %g3
        xor %l7, %g2, %g2

        movxtod %g2, %f8
        movxtod %g3, %f10
        bshuffle %f8, %f8, %f8
        bshuffle %f10, %f10, %f10

        xor %g4, %o0, %o0 ! ^= rk[0]
        xor %g5, %o1, %o1
        xor %g4, %o2, %o2 ! ^= rk[0]
        xor %g5, %o3, %o3
        movxtod %o0, %f0
        movxtod %o1, %f2
        movxtod %o2, %f4
        movxtod %o3, %f6

        fxor %f12, %f0, %f0 ! ^= tweak[0]
        fxor %f14, %f2, %f2
        fxor %f8, %f4, %f4 ! ^= tweak[0]
        fxor %f10, %f6, %f6

        prefetch [$inp + 32+63], 20
        call _${alg}${bits}_${dir}crypt_2x
        add $inp, 32, $inp

        movxtod %g2, %f8
        movxtod %g3, %f10

        srax %g3, 63, %l7 ! next tweak value
        addcc %g2, %g2, %g2
        and %l7, 0x87, %l7
        addxc %g3, %g3, %g3
        xor %l7, %g2, %g2

        bshuffle %f8, %f8, %f8
        bshuffle %f10, %f10, %f10

        fxor %f12, %f0, %f0 ! ^= tweak[0]
        fxor %f14, %f2, %f2
        fxor %f8, %f4, %f4
        fxor %f10, %f6, %f6

        subcc $len, 2, $len
        stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        add $out, 8, $out
        stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
        bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
        add $out, 8, $out

        add $blk_init, $len, $len
        andcc $len, 1, %g0 ! is number of blocks even?
        membar #StoreLoad|#StoreStore
        bnz,pt %icc, .L${bits}_xts_${dir}loop
        srl $len, 0, $len
        brnz,pn $len, .L${bits}_xts_${dir}loop2x
        nop

        fsrc2 %f4, %f0
        fsrc2 %f6, %f2
        brnz,pn $rem, .L${bits}_xts_${dir}steal
        nop

        ret
        restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$code.=<<___ if ($dir eq "en");
.align 32
.L${bits}_xts_${dir}steal:
        std %f0, [%fp + $::bias-16] ! copy of output
        std %f2, [%fp + $::bias-8]

        srl $ileft, 3, $ileft
        add %fp, $::bias-16, %l7
        add $inp, $ileft, $inp ! original $inp+$len&-15
        add $out, $ooff, $out ! original $out+$len&-15
        mov 0, $ileft
        nop ! align

.L${bits}_xts_${dir}stealing:
        ldub [$inp + $ileft], %o0
        ldub [%l7 + $ileft], %o1
        dec $rem
        stb %o0, [%l7 + $ileft]
        stb %o1, [$out + $ileft]
        brnz $rem, .L${bits}_xts_${dir}stealing
        inc $ileft

        mov %l7, $inp
        sub $out, 16, $out
        mov 0, $ileft
        sub $out, $ooff, $out
        ba .L${bits}_xts_${dir}loop ! one more time
        mov 1, $len ! $rem is 0
___
$code.=<<___ if ($dir eq "de");
.align 32
.L${bits}_xts_${dir}steal:
        ldx [$inp + 0], %o0
        brz,pt $ileft, 8f
        ldx [$inp + 8], %o1

        ldx [$inp + 16], %o2
        sllx %o0, $ileft, %o0
        srlx %o1, $iright, %g1
        sllx %o1, $ileft, %o1
        or %g1, %o0, %o0
        srlx %o2, $iright, %o2
        or %o2, %o1, %o1
8:
        srax %g3, 63, %l7 ! next tweak value
        addcc %g2, %g2, %o2
        and %l7, 0x87, %l7
        addxc %g3, %g3, %o3
        xor %l7, %o2, %o2

        movxtod %o2, %f12
        movxtod %o3, %f14
        bshuffle %f12, %f12, %f12
        bshuffle %f14, %f14, %f14

        xor %g4, %o0, %o0 ! ^= rk[0]
        xor %g5, %o1, %o1
        movxtod %o0, %f0
        movxtod %o1, %f2

        fxor %f12, %f0, %f0 ! ^= tweak[0]
        fxor %f14, %f2, %f2

        call _${alg}${bits}_${dir}crypt_1x
        add $inp, 16, $inp

        fxor %f12, %f0, %f0 ! ^= tweak[0]
        fxor %f14, %f2, %f2

        std %f0, [%fp + $::bias-16]
        std %f2, [%fp + $::bias-8]

        srl $ileft, 3, $ileft
        add %fp, $::bias-16, %l7
        add $inp, $ileft, $inp ! original $inp+$len&-15
        add $out, $ooff, $out ! original $out+$len&-15
        mov 0, $ileft
        add $out, 16, $out
        nop ! align

.L${bits}_xts_${dir}stealing:
        ldub [$inp + $ileft], %o0
        ldub [%l7 + $ileft], %o1
        dec $rem
        stb %o0, [%l7 + $ileft]
        stb %o1, [$out + $ileft]
        brnz $rem, .L${bits}_xts_${dir}stealing
        inc $ileft

        mov %l7, $inp
        sub $out, 16, $out
        mov 0, $ileft
        sub $out, $ooff, $out
        ba .L${bits}_xts_${dir}loop ! one more time
        mov 1, $len ! $rem is 0
___
$code.=<<___;
        ret
        restore
.type ${alg}${bits}_t4_xts_${dir}crypt,#function
.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}

# The purpose of these subroutines is to encode VIS instructions
# explicitly, so that the module can be compiled without specifying VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to preserve the option of producing a "universal" binary,
# letting the programmer detect at run-time whether the current CPU is
# VIS-capable.
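# Worked example (illustration only, derived from the encoding formula
# in unvis() below): "faligndata %f0,%f2,%f4" has opf=0x048, rs1=0,
# rs2=2, rd=4, so unvis() emits
#     .word 0x89b00902 !faligndata %f0,%f2,%f4
# because 0x81b00000|4<<25|0<<14|0x048<<5|2 == 0x89b00902.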
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "faligndata" => 0x048,
               "bshuffle"   => 0x04c,
               "fnot2"      => 0x066,
               "fxor"       => 0x06c,
               "fsrc2"      => 0x078 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc"      => 0x011,
               "addxccc"    => 0x013,
               "umulxhi"    => 0x016,
               "alignaddr"  => 0x018,
               "bmask"      => 0x019,
               "alignaddrl" => 0x01a );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%([goli])([0-9])/);
            $_=$bias{$1}+$2;
        }

        return sprintf ".word\t0x%08x !%s",
                        0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

sub unaes_round {       # 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = ( "aes_eround01"   => 0,
               "aes_eround23"   => 1,
               "aes_dround01"   => 2,
               "aes_dround23"   => 3,
               "aes_eround01_l" => 4,
               "aes_eround23_l" => 5,
               "aes_dround01_l" => 6,
               "aes_dround23_l" => 7,
               "aes_kexpand1"   => 8 );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

sub unaes_kexpand {     # 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = ( "aes_kexpand0" => 0x130,
               "aes_kexpand2" => 0x131 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

sub uncamellia_f {      # 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
        $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

sub uncamellia3 {       # 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = ( "camellia_fl"  => 0x13c,
                "camellia_fli" => 0x13d );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}

sub unmovxtox {         # 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = ( "movdtox"  => 0x110,
                "movstouw" => 0x111,
                "movstosw" => 0x113,
                "movxtod"  => 0x118,
                "movwtos"  => 0x119 );

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
        foreach ($rs,$rd) {
            return $ref if (!/%([fgoli])([0-9]{1,2})/);
            $_=$bias{$1}+$2;
            if ($2>=32) {
                return $ref if ($2&1);
                # re-encode for upper double register addressing
                $_=($2|$2>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                        2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
                        $ref;
    } else {
        return $ref;
    }
}

sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = ( "des_round"   => 0b1001,
               "des_ip"      => 0b100110100,
               "des_iip"     => 0b100110101,
               "des_kexpand" => 0b100110110 );

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) { # 4-arg
        if ($mnemonic eq "des_round") {
            foreach (@args[0..3]) {
                return $ref if (!/%f([0-9]{1,2})/);
                $_=$1;
                if ($1>=32) {
                    return $ref if ($1&1);
                    # re-encode for upper double register addressing
                    $_=($1|$1>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
                    $ref;
        } elsif ($mnemonic eq "des_kexpand") { # 3-arg
            foreach (@args[0..2]) {
                return $ref if (!/(%f)?([0-9]{1,2})/);
                $_=$2;
                if ($2>=32) {
                    return $ref if ($2&1);
                    # re-encode for upper double register addressing
                    $_=($2|$2>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
                    $ref;
        } else { # 2-arg
            foreach (@args[0..1]) {
                return $ref if (!/%f([0-9]{1,2})/);
                $_=$1;
                if ($1>=32) {
                    return $ref if ($1&1);
                    # re-encode for upper double register addressing
                    $_=($1|$1>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
                    $ref;
        }
    } else {
        return $ref;
    }
}


sub emit_assembler {
    foreach (split("\n",$::code)) {
        s/\`([^\`]*)\`/eval $1/ge;

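        # Rewrite two-operand VIS moves such as "fsrc2 %fX,%fY" into the
        # three-operand form "fsrc2 %f0,%fX,%fY" (rs1 is unused for
        # fsrc2/fnot2), so that unvis() below can encode them uniformly.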
        s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

        s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &unaes_round($1,$2,$3,$4,$5)
        /geo or
        s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unaes_kexpand($1,$2,$3,$4)
        /geo or
        s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &uncamellia_f($1,$2,$3,$4,$5)
        /geo or
        s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &uncamellia3($1,$2,$3,$4)
        /geo or
        s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
                &undes($1,$2,$3,$4,$5)
        /geo or
        s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
                &unmovxtox($1,$2,$3)
        /geo or
        s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
                &unmovxtox($1,$2,$3)
        /geo or
        s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unvis($1,$2,$3,$4)
        /geo or
        s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                &unvis3($1,$2,$3,$4)
        /geo;

        print $_,"\n";
    }
}

1;