VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/camellia/asm/cmll-x86_64.pl@ 94082

Last change on this file since 94082 was 94082, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

File size: 26.7 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12#
13# This module may be used under the terms of either the GNU General
14# Public License version 2 or later, the GNU Lesser General Public
15# License version 2.1 or later, the Mozilla Public License version
16# 1.1 or the BSD License. The exact terms of either license are
17# distributed along with this module. For further details see
18# http://www.openssl.org/~appro/camellia/.
19# ====================================================================
20
21# Performance in cycles per processed byte (less is better) in
22# 'openssl speed ...' benchmark:
23#
24#                           AMD64   Core2   EM64T
25# -evp camellia-128-ecb     16.7    21.0    22.7
26# + over gcc 3.4.6          +25%    +5%     0%
27#
28# camellia-128-cbc          15.7    20.4    21.1
29#
30# 128-bit key setup         128     216     205     cycles/key
31# + over gcc 3.4.6          +54%    +39%    +15%
32#
33# Numbers in "+" rows represent performance improvement over compiler
34# generated code. Key setup timings are impressive on AMD and Core2
35# thanks to 64-bit operations being covertly deployed. Improvement on
36# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37# apparently emulates some of 64-bit operations in [32-bit] microcode.
38
39# $output is the last argument if it looks like a file (it has an extension)
40# $flavour is the first argument if it doesn't look like a file
41$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
42$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
43
# Select the Win64 variant of the generated code when the flavour names
# nasm/masm/mingw64 or when the output file name ends in ".asm".
44$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
45
# Take the directory part of this script's own path ($0) and look for the
# x86_64-xlate.pl translator first beside the script, then in
# ../../perlasm/ relative to it; give up if neither file exists.
46$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
48( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
49die "can't locate x86_64-xlate.pl";
50
# Run the translator under the same perl interpreter ($^X), passing it the
# flavour and output file name, and alias STDOUT to the pipe so that the
# final "print $code" is fed through the translator.
51open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
52 or die "can't call $xlate: $!";
53*STDOUT=*OUT;
54
# hi(REG): return the high-byte alias (%ah, %bh, %ch, %dh) of a legacy
# register given by its 32- or 64-bit name (%eax/%rax .. %edx/%rdx).
# An operand that does not match is returned unchanged.
# No prototype: the sub takes one argument, so the former empty prototype
# "()" was wrong and only worked because callers use the &hi(...) form,
# which bypasses prototype checking.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }
# lo(REG): return the low-byte alias of a general-purpose register given by
# its 32- or 64-bit name.  An operand that matches none of the patterns is
# returned unchanged.
# No prototype: the sub takes one argument, so the former empty prototype
# "()" was wrong and only worked because callers use the &lo(...) form,
# which bypasses prototype checking.
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;	# %eax/%rax .. %edx/%rdx -> %al .. %dl
    $r =~ s/%[er]([sd]i)/%${1}l/;	# %esi/%rsi, %edi/%rdi   -> %sil, %dil
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;	# %rN/%rNd               -> %rNb
    $r;
}
59
60$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
61@S=("%r8d","%r9d","%r10d","%r11d");
62$i0="%esi";
63$i1="%edi";
64$Tbl="%rbp"; # size optimization
65$inp="%r12";
66$out="%r13";
67$key="%r14";
68$keyend="%r15";
69$arg0d=$win64?"%ecx":"%edi";
70
71# const unsigned int Camellia_SBOX[4][256];
72# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
73# and [2][] - with [3][]. This is done to minimize code size.
74$SBOX1_1110=0; # Camellia_SBOX[0]
75$SBOX4_4404=4; # Camellia_SBOX[1]
76$SBOX2_0222=2048; # Camellia_SBOX[2]
77$SBOX3_3033=2052; # Camellia_SBOX[3]
78
# Camellia_Feistel(ROUND [, SEED])
#
# Appends the instruction sequence for one Feistel round to $code.
#
#   ROUND  round index; its parity selects which half of @S is XORed with
#          the key words held in $t0/$t1 and which half receives the round
#          output (even: S[0],S[1] in / S[2],S[3] out; odd: the reverse).
#   SEED   optional (default 0) base displacement used when prefetching the
#          next round's key words from ($key).  A negative SEED also flips
#          the per-round stride from +8 to -8, which is how the decrypt
#          loop in this file walks the key schedule backwards.
#
# On entry $t0/$t1 must already hold the current round's key words; on exit
# they hold the prefetched words for round ROUND+1.  $t2, $t3, $i0 and $i1
# are used as scratch.  The back-quoted expressions are left in the emitted
# text and evaluated by the substitution at the end of the script.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=@S[map { ($j+$_)%4 } 0..3];

$code.=<<___;
 xor $s0,$t0 # t0^=key[0]
 xor $s1,$t1 # t1^=key[1]
 movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
 movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
 mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
 mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
 movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
 shr \$16,$t0
 movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
 xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
 shr \$16,$t1
 xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
 movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
 movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
 xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
 xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
 movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
 movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
 xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
 xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
 mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
 mov `$seed+($i+1)*$scale+4`($key),$t0
 xor $t3,$t2 # t2^=t3
 ror \$8,$t3 # t3=RightRotate(t3,8)
 xor $t2,$s2
 xor $t2,$s3
 xor $t3,$s3
___
}
116
117# void Camellia_EncryptBlock_Rounds(
118# int grandRounds,
119# const Byte plaintext[],
120# const KEY_TABLE_TYPE keyTable,
121# Byte ciphertext[])
122$code=<<___;
123.text
124
125# V1.x API
126.globl Camellia_EncryptBlock
127.type Camellia_EncryptBlock,\@abi-omnipotent
128.align 16
129Camellia_EncryptBlock:
130.cfi_startproc
131 movl \$128,%eax
132 subl $arg0d,%eax
133 movl \$3,$arg0d
134 adcl \$0,$arg0d # keyBitLength==128?3:4
135 jmp .Lenc_rounds
136.cfi_endproc
137.size Camellia_EncryptBlock,.-Camellia_EncryptBlock
138# V2
139.globl Camellia_EncryptBlock_Rounds
140.type Camellia_EncryptBlock_Rounds,\@function,4
141.align 16
142.Lenc_rounds:
143Camellia_EncryptBlock_Rounds:
144.cfi_startproc
145 push %rbx
146.cfi_push %rbx
147 push %rbp
148.cfi_push %rbp
149 push %r13
150.cfi_push %r13
151 push %r14
152.cfi_push %r14
153 push %r15
154.cfi_push %r15
155.Lenc_prologue:
156
157 #mov %rsi,$inp # put away arguments
158 mov %rcx,$out
159 mov %rdx,$key
160
161 shl \$6,%edi # process grandRounds
162 lea .LCamellia_SBOX(%rip),$Tbl
163 lea ($key,%rdi),$keyend
164
165 mov 0(%rsi),@S[0] # load plaintext
166 mov 4(%rsi),@S[1]
167 mov 8(%rsi),@S[2]
168 bswap @S[0]
169 mov 12(%rsi),@S[3]
170 bswap @S[1]
171 bswap @S[2]
172 bswap @S[3]
173
174 call _x86_64_Camellia_encrypt
175
176 bswap @S[0]
177 bswap @S[1]
178 bswap @S[2]
179 mov @S[0],0($out)
180 bswap @S[3]
181 mov @S[1],4($out)
182 mov @S[2],8($out)
183 mov @S[3],12($out)
184
185 mov 0(%rsp),%r15
186.cfi_restore %r15
187 mov 8(%rsp),%r14
188.cfi_restore %r14
189 mov 16(%rsp),%r13
190.cfi_restore %r13
191 mov 24(%rsp),%rbp
192.cfi_restore %rbp
193 mov 32(%rsp),%rbx
194.cfi_restore %rbx
195 lea 40(%rsp),%rsp
196.cfi_adjust_cfa_offset -40
197.Lenc_epilogue:
198 ret
199.cfi_endproc
200.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
201
202.type _x86_64_Camellia_encrypt,\@abi-omnipotent
203.align 16
204_x86_64_Camellia_encrypt:
205.cfi_startproc
206 xor 0($key),@S[1]
207 xor 4($key),@S[0] # ^=key[0-3]
208 xor 8($key),@S[3]
209 xor 12($key),@S[2]
210.align 16
211.Leloop:
212 mov 16($key),$t1 # prefetch key[4-5]
213 mov 20($key),$t0
214
215___
216 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
217$code.=<<___;
218 lea 16*4($key),$key
219 cmp $keyend,$key
220 mov 8($key),$t3 # prefetch key[2-3]
221 mov 12($key),$t2
222 je .Ledone
223
224 and @S[0],$t0
225 or @S[3],$t3
226 rol \$1,$t0
227 xor $t3,@S[2] # s2^=s3|key[3];
228 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
229 and @S[2],$t2
230 or @S[1],$t1
231 rol \$1,$t2
232 xor $t1,@S[0] # s0^=s1|key[1];
233 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
234 jmp .Leloop
235
236.align 16
237.Ledone:
238 xor @S[2],$t0 # SwapHalf
239 xor @S[3],$t1
240 xor @S[0],$t2
241 xor @S[1],$t3
242
243 mov $t0,@S[0]
244 mov $t1,@S[1]
245 mov $t2,@S[2]
246 mov $t3,@S[3]
247
248 .byte 0xf3,0xc3 # rep ret
249.cfi_endproc
250.size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
251
252# V1.x API
253.globl Camellia_DecryptBlock
254.type Camellia_DecryptBlock,\@abi-omnipotent
255.align 16
256Camellia_DecryptBlock:
257.cfi_startproc
258 movl \$128,%eax
259 subl $arg0d,%eax
260 movl \$3,$arg0d
261 adcl \$0,$arg0d # keyBitLength==128?3:4
262 jmp .Ldec_rounds
263.cfi_endproc
264.size Camellia_DecryptBlock,.-Camellia_DecryptBlock
265# V2
266.globl Camellia_DecryptBlock_Rounds
267.type Camellia_DecryptBlock_Rounds,\@function,4
268.align 16
269.Ldec_rounds:
270Camellia_DecryptBlock_Rounds:
271.cfi_startproc
272 push %rbx
273.cfi_push %rbx
274 push %rbp
275.cfi_push %rbp
276 push %r13
277.cfi_push %r13
278 push %r14
279.cfi_push %r14
280 push %r15
281.cfi_push %r15
282.Ldec_prologue:
283
284 #mov %rsi,$inp # put away arguments
285 mov %rcx,$out
286 mov %rdx,$keyend
287
288 shl \$6,%edi # process grandRounds
289 lea .LCamellia_SBOX(%rip),$Tbl
290 lea ($keyend,%rdi),$key
291
292 mov 0(%rsi),@S[0] # load plaintext
293 mov 4(%rsi),@S[1]
294 mov 8(%rsi),@S[2]
295 bswap @S[0]
296 mov 12(%rsi),@S[3]
297 bswap @S[1]
298 bswap @S[2]
299 bswap @S[3]
300
301 call _x86_64_Camellia_decrypt
302
303 bswap @S[0]
304 bswap @S[1]
305 bswap @S[2]
306 mov @S[0],0($out)
307 bswap @S[3]
308 mov @S[1],4($out)
309 mov @S[2],8($out)
310 mov @S[3],12($out)
311
312 mov 0(%rsp),%r15
313.cfi_restore %r15
314 mov 8(%rsp),%r14
315.cfi_restore %r14
316 mov 16(%rsp),%r13
317.cfi_restore %r13
318 mov 24(%rsp),%rbp
319.cfi_restore %rbp
320 mov 32(%rsp),%rbx
321.cfi_restore %rbx
322 lea 40(%rsp),%rsp
323.cfi_adjust_cfa_offset -40
324.Ldec_epilogue:
325 ret
326.cfi_endproc
327.size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
328
329.type _x86_64_Camellia_decrypt,\@abi-omnipotent
330.align 16
331_x86_64_Camellia_decrypt:
332.cfi_startproc
333 xor 0($key),@S[1]
334 xor 4($key),@S[0] # ^=key[0-3]
335 xor 8($key),@S[3]
336 xor 12($key),@S[2]
337.align 16
338.Ldloop:
339 mov -8($key),$t1 # prefetch key[4-5]
340 mov -4($key),$t0
341
342___
343 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
344$code.=<<___;
345 lea -16*4($key),$key
346 cmp $keyend,$key
347 mov 0($key),$t3 # prefetch key[2-3]
348 mov 4($key),$t2
349 je .Lddone
350
351 and @S[0],$t0
352 or @S[3],$t3
353 rol \$1,$t0
354 xor $t3,@S[2] # s2^=s3|key[3];
355 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
356 and @S[2],$t2
357 or @S[1],$t1
358 rol \$1,$t2
359 xor $t1,@S[0] # s0^=s1|key[1];
360 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
361
362 jmp .Ldloop
363
364.align 16
365.Lddone:
366 xor @S[2],$t2
367 xor @S[3],$t3
368 xor @S[0],$t0
369 xor @S[1],$t1
370
371 mov $t2,@S[0] # SwapHalf
372 mov $t3,@S[1]
373 mov $t0,@S[2]
374 mov $t1,@S[3]
375
376 .byte 0xf3,0xc3 # rep ret
377.cfi_endproc
378.size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
379___
380
# _saveround(RND, KEY, [BIAS,] REG...)
#
# Appends stores of key material to $code.  The target address is
# BIAS+RND*8 bytes from the base register KEY.
#
#   RND   slot index; each slot is 8 bytes wide.
#   KEY   base register holding the key-table pointer.
#   BIAS  optional integer displacement (default 0).  It is recognised by
#         being an integer literal; every register operand passed by the
#         callers in this file starts with '%', so the two cannot be
#         confused.  The previous int()-based test treated an explicit
#         bias of 0 as a register operand.
#   REG   either four 32-bit registers, stored pairwise swapped (REG[1] at
#         +0, REG[0] at +4, REG[3] at +8, REG[2] at +12), or one or two
#         registers stored at +0 and +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

    if (@T==4) {
	$code.=<<___;
 mov $T[1],`$bias+$rnd*8+0`($key)
 mov $T[0],`$bias+$rnd*8+4`($key)
 mov $T[3],`$bias+$rnd*8+8`($key)
 mov $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.=" mov $T[0],`$bias+$rnd*8+0`($key)\n";
	$code.=" mov $T[1],`$bias+$rnd*8+8`($key)\n" if (@T>=2);
    }
}
397
# _loadround(RND, KEY, [BIAS,] REG [, REG2])
#
# Appends loads of key material to $code: REG is loaded from
# BIAS+RND*8 bytes off the base register KEY and, when given, REG2 from
# 8 bytes further on.
#
#   BIAS  optional integer displacement (default 0).  It is recognised by
#         being an integer literal; every register operand passed by the
#         callers in this file starts with '%'.  The previous int()-based
#         test treated an explicit bias of 0 as a register operand.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

$code.=" mov `$bias+$rnd*8+0`($key),$T[0]\n";
$code.=" mov `$bias+$rnd*8+8`($key),$T[1]\n" if (@T>=2);
}
405
406# shld is very slow on Intel EM64T family. Even on AMD it limits
407# instruction decode rate [because it's VectorPath] and consequently
408# performance...
# __rotl128(R0, R1, ROT): shld-based variant of the 128-bit left rotate.
# Appends code that saves R0 in %r11, shifts R0 left by ROT taking the
# vacated bits from R1, then shifts R1 left by ROT taking them from the
# saved copy.  Nothing is emitted when ROT is 0.  %r11 is clobbered.
sub __rotl128 {
    my ($r0,$r1,$bits)=@_;

    return unless $bits;

    $code.=join("",
	" mov $r0,%r11\n",
	" shld \$$bits,$r1,$r0\n",
	" shld \$$bits,%r11,$r1\n");
}
420
421# ... Implementing 128-bit rotate without shld gives 80% better
422# performance on EM64T, +15% on AMD64 and only ~7% degradation on
423# Core2. This is therefore preferred.
# _rotl128(R0, R1, ROT): 128-bit left rotate built from plain shifts.
# Appends code that shifts both registers left by ROT and ORs into each
# the bits shifted out of the other (obtained by shifting copies right by
# 64-ROT).  Nothing is emitted when ROT is 0.  %r9 and %r11 are clobbered.
# The back-quoted "64-ROT" expression is left in the emitted text and
# evaluated by the substitution at the end of the script.
sub _rotl128 {
    my ($r0,$r1,$bits)=@_;

    return unless $bits;

    my @insn=(
	"mov $r0,%r11",
	"shl \$$bits,$r0",
	"mov $r1,%r9",
	"shr \$`64-$bits`,%r9",
	"shr \$`64-$bits`,%r11",
	"or %r9,$r0",
	"shl \$$bits,$r1",
	"or %r11,$r1",
    );
    $code.=" $_\n" for @insn;
}
440
441{ my $step=0;
442
443$code.=<<___;
444.globl Camellia_Ekeygen
445.type Camellia_Ekeygen,\@function,3
446.align 16
447Camellia_Ekeygen:
448.cfi_startproc
449 push %rbx
450.cfi_push %rbx
451 push %rbp
452.cfi_push %rbp
453 push %r13
454.cfi_push %r13
455 push %r14
456.cfi_push %r14
457 push %r15
458.cfi_push %r15
459.Lkey_prologue:
460
461 mov %edi,${keyend}d # put away arguments, keyBitLength
462 mov %rdx,$out # keyTable
463
464 mov 0(%rsi),@S[0] # load 0-127 bits
465 mov 4(%rsi),@S[1]
466 mov 8(%rsi),@S[2]
467 mov 12(%rsi),@S[3]
468
469 bswap @S[0]
470 bswap @S[1]
471 bswap @S[2]
472 bswap @S[3]
473___
474 &_saveround (0,$out,@S); # KL<<<0
475$code.=<<___;
476 cmp \$128,$keyend # check keyBitLength
477 je .L1st128
478
479 mov 16(%rsi),@S[0] # load 128-191 bits
480 mov 20(%rsi),@S[1]
481 cmp \$192,$keyend
482 je .L1st192
483 mov 24(%rsi),@S[2] # load 192-255 bits
484 mov 28(%rsi),@S[3]
485 jmp .L1st256
486.L1st192:
487 mov @S[0],@S[2]
488 mov @S[1],@S[3]
489 not @S[2]
490 not @S[3]
491.L1st256:
492 bswap @S[0]
493 bswap @S[1]
494 bswap @S[2]
495 bswap @S[3]
496___
497 &_saveround (4,$out,@S); # temp storage for KR!
498$code.=<<___;
499 xor 0($out),@S[1] # KR^KL
500 xor 4($out),@S[0]
501 xor 8($out),@S[3]
502 xor 12($out),@S[2]
503
504.L1st128:
505 lea .LCamellia_SIGMA(%rip),$key
506 lea .LCamellia_SBOX(%rip),$Tbl
507
508 mov 0($key),$t1
509 mov 4($key),$t0
510___
511 &Camellia_Feistel($step++);
512 &Camellia_Feistel($step++);
513$code.=<<___;
514 xor 0($out),@S[1] # ^KL
515 xor 4($out),@S[0]
516 xor 8($out),@S[3]
517 xor 12($out),@S[2]
518___
519 &Camellia_Feistel($step++);
520 &Camellia_Feistel($step++);
521$code.=<<___;
522 cmp \$128,$keyend
523 jne .L2nd256
524
525 lea 128($out),$out # size optimization
526 shl \$32,%r8 # @S[0]||
527 shl \$32,%r10 # @S[2]||
528 or %r9,%r8 # ||@S[1]
529 or %r11,%r10 # ||@S[3]
530___
531 &_loadround (0,$out,-128,"%rax","%rbx"); # KL
532 &_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0
533 &_rotl128 ("%rax","%rbx",15);
534 &_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15
535 &_rotl128 ("%r8","%r10",15);
536 &_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15
537 &_rotl128 ("%r8","%r10",15); # 15+15=30
538 &_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30
539 &_rotl128 ("%rax","%rbx",30); # 15+30=45
540 &_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45
541 &_rotl128 ("%r8","%r10",15); # 30+15=45
542 &_saveround (12,$out,-128,"%r8"); # KA<<<45
543 &_rotl128 ("%rax","%rbx",15); # 45+15=60
544 &_saveround (13,$out,-128,"%rbx"); # KL<<<60
545 &_rotl128 ("%r8","%r10",15); # 45+15=60
546 &_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60
547 &_rotl128 ("%rax","%rbx",17); # 60+17=77
548 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77
549 &_rotl128 ("%rax","%rbx",17); # 77+17=94
550 &_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94
551 &_rotl128 ("%r8","%r10",34); # 60+34=94
552 &_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94
553 &_rotl128 ("%rax","%rbx",17); # 94+17=111
554 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111
555 &_rotl128 ("%r8","%r10",17); # 94+17=111
556 &_saveround (24,$out,-128,"%r8","%r10"); # KA<<<111
557$code.=<<___;
558 mov \$3,%eax
559 jmp .Ldone
560.align 16
561.L2nd256:
562___
563 &_saveround (6,$out,@S); # temp storage for KA!
564$code.=<<___;
565 xor `4*8+0`($out),@S[1] # KA^KR
566 xor `4*8+4`($out),@S[0]
567 xor `5*8+0`($out),@S[3]
568 xor `5*8+4`($out),@S[2]
569___
570 &Camellia_Feistel($step++);
571 &Camellia_Feistel($step++);
572
573 &_loadround (0,$out,"%rax","%rbx"); # KL
574 &_loadround (4,$out,"%rcx","%rdx"); # KR
575 &_loadround (6,$out,"%r14","%r15"); # KA
576$code.=<<___;
577 lea 128($out),$out # size optimization
578 shl \$32,%r8 # @S[0]||
579 shl \$32,%r10 # @S[2]||
580 or %r9,%r8 # ||@S[1]
581 or %r11,%r10 # ||@S[3]
582___
583 &_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0
584 &_rotl128 ("%rcx","%rdx",15);
585 &_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15
586 &_rotl128 ("%r14","%r15",15);
587 &_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15
588 &_rotl128 ("%rcx","%rdx",15); # 15+15=30
589 &_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30
590 &_rotl128 ("%r8","%r10",30);
591 &_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30
592 &_rotl128 ("%rax","%rbx",45);
593 &_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45
594 &_rotl128 ("%r14","%r15",30); # 15+30=45
595 &_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45
596 &_rotl128 ("%rax","%rbx",15); # 45+15=60
597 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60
598 &_rotl128 ("%rcx","%rdx",30); # 30+30=60
599 &_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60
600 &_rotl128 ("%r8","%r10",30); # 30+30=60
601 &_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60
602 &_rotl128 ("%rax","%rbx",17); # 60+17=77
603 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77
604 &_rotl128 ("%r14","%r15",32); # 45+32=77
605 &_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77
606 &_rotl128 ("%rcx","%rdx",34); # 60+34=94
607 &_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94
608 &_rotl128 ("%r14","%r15",17); # 77+17=94
609 &_saveround (28,$out,-128,"%r14","%r15"); # KA<<<94
610 &_rotl128 ("%rax","%rbx",34); # 77+34=111
611 &_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111
612 &_rotl128 ("%r8","%r10",51); # 60+51=111
613 &_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111
614$code.=<<___;
615 mov \$4,%eax
616.Ldone:
617 mov 0(%rsp),%r15
618.cfi_restore %r15
619 mov 8(%rsp),%r14
620.cfi_restore %r14
621 mov 16(%rsp),%r13
622.cfi_restore %r13
623 mov 24(%rsp),%rbp
624.cfi_restore %rbp
625 mov 32(%rsp),%rbx
626.cfi_restore %rbx
627 lea 40(%rsp),%rsp
628.cfi_adjust_cfa_offset -40
629.Lkey_epilogue:
630 ret
631.cfi_endproc
632.size Camellia_Ekeygen,.-Camellia_Ekeygen
633___
634}
635
636@SBOX=(
637112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
638 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
639134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
640166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
641139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
642223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
643 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
644254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
645170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
646 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
647135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
648 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
649233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
650120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
651114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
652 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
653
# S1110(IDX): format the table word that carries SBOX[IDX] in its three
# upper bytes and zero in the lowest byte, as an "0x%08x" literal.
sub S1110 {
    my $s=$SBOX[shift];
    my $word=($s<<24)|($s<<16)|($s<<8);
    return sprintf("0x%08x",$word);
}
# S4404(IDX): rotate the 8-bit index left by one bit, look the result up in
# @SBOX, and format the word that carries that value in bytes 3, 2 and 0
# (byte 1 is zero) as an "0x%08x" literal.
sub S4404 {
    my $idx=shift;
    my $rotated=(($idx<<1)|($idx>>7))&0xff;
    my $s=$SBOX[$rotated];
    return sprintf("0x%08x",($s<<24)|($s<<16)|$s);
}
# S0222(IDX): take SBOX[IDX], rotate it left by one bit within 8 bits, and
# format the word that carries the result in its three lower bytes (the
# top byte is zero) as an "0x%08x" literal.
sub S0222 {
    my $s=$SBOX[shift];
    my $rotated=(($s<<1)|($s>>7))&0xff;
    return sprintf("0x%08x",($rotated<<16)|($rotated<<8)|$rotated);
}
# S3033(IDX): take SBOX[IDX], rotate it right by one bit within 8 bits, and
# format the word that carries the result in bytes 3, 1 and 0 (byte 2 is
# zero) as an "0x%08x" literal.
sub S3033 {
    my $s=$SBOX[shift];
    my $rotated=(($s>>1)|($s<<7))&0xff;
    return sprintf("0x%08x",($rotated<<24)|($rotated<<8)|$rotated);
}
658
659$code.=<<___;
660.align 64
661.LCamellia_SIGMA:
662.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
663.long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
664.long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
665.long 0, 0, 0, 0
666.LCamellia_SBOX:
667___
668# tables are interleaved, remember?
669sub data_word { $code.=".long\t".join(',',@_)."\n"; }
670for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
671for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
672
673# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
674# size_t length, const CAMELLIA_KEY *key,
675# unsigned char *ivp,const int enc);
676{
677$_key="0(%rsp)";
678$_end="8(%rsp)"; # inp+len&~15
679$_res="16(%rsp)"; # len&15
680$ivec="24(%rsp)";
681$_ivp="40(%rsp)";
682$_rsp="48(%rsp)";
683
684$code.=<<___;
685.globl Camellia_cbc_encrypt
686.type Camellia_cbc_encrypt,\@function,6
687.align 16
688Camellia_cbc_encrypt:
689.cfi_startproc
690 endbranch
691 cmp \$0,%rdx
692 je .Lcbc_abort
693 push %rbx
694.cfi_push %rbx
695 push %rbp
696.cfi_push %rbp
697 push %r12
698.cfi_push %r12
699 push %r13
700.cfi_push %r13
701 push %r14
702.cfi_push %r14
703 push %r15
704.cfi_push %r15
705.Lcbc_prologue:
706
707 mov %rsp,%rbp
708.cfi_def_cfa_register %rbp
709 sub \$64,%rsp
710 and \$-64,%rsp
711
712 # place stack frame just "above mod 1024" the key schedule,
713 # this ensures that cache associativity suffices
714 lea -64-63(%rcx),%r10
715 sub %rsp,%r10
716 neg %r10
717 and \$0x3C0,%r10
718 sub %r10,%rsp
719 #add \$8,%rsp # 8 is reserved for callee's ra
720
721 mov %rdi,$inp # inp argument
722 mov %rsi,$out # out argument
723 mov %r8,%rbx # ivp argument
724 mov %rcx,$key # key argument
725 mov 272(%rcx),${keyend}d # grandRounds
726
727 mov %r8,$_ivp
728 mov %rbp,$_rsp
729.cfi_cfa_expression $_rsp,deref,+56
730
731.Lcbc_body:
732 lea .LCamellia_SBOX(%rip),$Tbl
733
734 mov \$32,%ecx
735.align 4
736.Lcbc_prefetch_sbox:
737 mov 0($Tbl),%rax
738 mov 32($Tbl),%rsi
739 mov 64($Tbl),%rdi
740 mov 96($Tbl),%r11
741 lea 128($Tbl),$Tbl
742 loop .Lcbc_prefetch_sbox
743 sub \$4096,$Tbl
744 shl \$6,$keyend
745 mov %rdx,%rcx # len argument
746 lea ($key,$keyend),$keyend
747
748 cmp \$0,%r9d # enc argument
749 je .LCBC_DECRYPT
750
751 and \$-16,%rdx
752 and \$15,%rcx # length residue
753 lea ($inp,%rdx),%rdx
754 mov $key,$_key
755 mov %rdx,$_end
756 mov %rcx,$_res
757
758 cmp $inp,%rdx
759 mov 0(%rbx),@S[0] # load IV
760 mov 4(%rbx),@S[1]
761 mov 8(%rbx),@S[2]
762 mov 12(%rbx),@S[3]
763 je .Lcbc_enc_tail
764 jmp .Lcbc_eloop
765
766.align 16
767.Lcbc_eloop:
768 xor 0($inp),@S[0]
769 xor 4($inp),@S[1]
770 xor 8($inp),@S[2]
771 bswap @S[0]
772 xor 12($inp),@S[3]
773 bswap @S[1]
774 bswap @S[2]
775 bswap @S[3]
776
777 call _x86_64_Camellia_encrypt
778
779 mov $_key,$key # "rewind" the key
780 bswap @S[0]
781 mov $_end,%rdx
782 bswap @S[1]
783 mov $_res,%rcx
784 bswap @S[2]
785 mov @S[0],0($out)
786 bswap @S[3]
787 mov @S[1],4($out)
788 mov @S[2],8($out)
789 lea 16($inp),$inp
790 mov @S[3],12($out)
791 cmp %rdx,$inp
792 lea 16($out),$out
793 jne .Lcbc_eloop
794
795 cmp \$0,%rcx
796 jne .Lcbc_enc_tail
797
798 mov $_ivp,$out
799 mov @S[0],0($out) # write out IV residue
800 mov @S[1],4($out)
801 mov @S[2],8($out)
802 mov @S[3],12($out)
803 jmp .Lcbc_done
804
805.align 16
806.Lcbc_enc_tail:
807 xor %rax,%rax
808 mov %rax,0+$ivec
809 mov %rax,8+$ivec
810 mov %rax,$_res
811
812.Lcbc_enc_pushf:
813 pushfq
814 cld
815 mov $inp,%rsi
816 lea 8+$ivec,%rdi
817 .long 0x9066A4F3 # rep movsb
818 popfq
819.Lcbc_enc_popf:
820
821 lea $ivec,$inp
822 lea 16+$ivec,%rax
823 mov %rax,$_end
824 jmp .Lcbc_eloop # one more time
825
826.align 16
827.LCBC_DECRYPT:
828 xchg $key,$keyend
829 add \$15,%rdx
830 and \$15,%rcx # length residue
831 and \$-16,%rdx
832 mov $key,$_key
833 lea ($inp,%rdx),%rdx
834 mov %rdx,$_end
835 mov %rcx,$_res
836
837 mov (%rbx),%rax # load IV
838 mov 8(%rbx),%rbx
839 jmp .Lcbc_dloop
840.align 16
841.Lcbc_dloop:
842 mov 0($inp),@S[0]
843 mov 4($inp),@S[1]
844 mov 8($inp),@S[2]
845 bswap @S[0]
846 mov 12($inp),@S[3]
847 bswap @S[1]
848 mov %rax,0+$ivec # save IV to temporary storage
849 bswap @S[2]
850 mov %rbx,8+$ivec
851 bswap @S[3]
852
853 call _x86_64_Camellia_decrypt
854
855 mov $_key,$key # "rewind" the key
856 mov $_end,%rdx
857 mov $_res,%rcx
858
859 bswap @S[0]
860 mov ($inp),%rax # load IV for next iteration
861 bswap @S[1]
862 mov 8($inp),%rbx
863 bswap @S[2]
864 xor 0+$ivec,@S[0]
865 bswap @S[3]
866 xor 4+$ivec,@S[1]
867 xor 8+$ivec,@S[2]
868 lea 16($inp),$inp
869 xor 12+$ivec,@S[3]
870 cmp %rdx,$inp
871 je .Lcbc_ddone
872
873 mov @S[0],0($out)
874 mov @S[1],4($out)
875 mov @S[2],8($out)
876 mov @S[3],12($out)
877
878 lea 16($out),$out
879 jmp .Lcbc_dloop
880
881.align 16
882.Lcbc_ddone:
883 mov $_ivp,%rdx
884 cmp \$0,%rcx
885 jne .Lcbc_dec_tail
886
887 mov @S[0],0($out)
888 mov @S[1],4($out)
889 mov @S[2],8($out)
890 mov @S[3],12($out)
891
892 mov %rax,(%rdx) # write out IV residue
893 mov %rbx,8(%rdx)
894 jmp .Lcbc_done
895.align 16
896.Lcbc_dec_tail:
897 mov @S[0],0+$ivec
898 mov @S[1],4+$ivec
899 mov @S[2],8+$ivec
900 mov @S[3],12+$ivec
901
902.Lcbc_dec_pushf:
903 pushfq
904 cld
905 lea 8+$ivec,%rsi
906 lea ($out),%rdi
907 .long 0x9066A4F3 # rep movsb
908 popfq
909.Lcbc_dec_popf:
910
911 mov %rax,(%rdx) # write out IV residue
912 mov %rbx,8(%rdx)
913 jmp .Lcbc_done
914
915.align 16
916.Lcbc_done:
917 mov $_rsp,%rcx
918.cfi_def_cfa %rcx,56
919 mov 0(%rcx),%r15
920.cfi_restore %r15
921 mov 8(%rcx),%r14
922.cfi_restore %r14
923 mov 16(%rcx),%r13
924.cfi_restore %r13
925 mov 24(%rcx),%r12
926.cfi_restore %r12
927 mov 32(%rcx),%rbp
928.cfi_restore %rbp
929 mov 40(%rcx),%rbx
930.cfi_restore %rbx
931 lea 48(%rcx),%rsp
932.cfi_def_cfa %rsp,8
933.Lcbc_abort:
934 ret
935.cfi_endproc
936.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
937
938.asciz "Camellia for x86_64 by <appro\@openssl.org>"
939___
940}
941
942# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
943# CONTEXT *context,DISPATCHER_CONTEXT *disp)
944if ($win64) {
945$rec="%rcx";
946$frame="%rdx";
947$context="%r8";
948$disp="%r9";
949
950$code.=<<___;
951.extern __imp_RtlVirtualUnwind
952.type common_se_handler,\@abi-omnipotent
953.align 16
954common_se_handler:
955 push %rsi
956 push %rdi
957 push %rbx
958 push %rbp
959 push %r12
960 push %r13
961 push %r14
962 push %r15
963 pushfq
964 lea -64(%rsp),%rsp
965
966 mov 120($context),%rax # pull context->Rax
967 mov 248($context),%rbx # pull context->Rip
968
969 mov 8($disp),%rsi # disp->ImageBase
970 mov 56($disp),%r11 # disp->HandlerData
971
972 mov 0(%r11),%r10d # HandlerData[0]
973 lea (%rsi,%r10),%r10 # prologue label
974 cmp %r10,%rbx # context->Rip<prologue label
975 jb .Lin_prologue
976
977 mov 152($context),%rax # pull context->Rsp
978
979 mov 4(%r11),%r10d # HandlerData[1]
980 lea (%rsi,%r10),%r10 # epilogue label
981 cmp %r10,%rbx # context->Rip>=epilogue label
982 jae .Lin_prologue
983
984 lea 40(%rax),%rax
985 mov -8(%rax),%rbx
986 mov -16(%rax),%rbp
987 mov -24(%rax),%r13
988 mov -32(%rax),%r14
989 mov -40(%rax),%r15
990 mov %rbx,144($context) # restore context->Rbx
991 mov %rbp,160($context) # restore context->Rbp
992 mov %r13,224($context) # restore context->R13
993 mov %r14,232($context) # restore context->R14
994 mov %r15,240($context) # restore context->R15
995
996.Lin_prologue:
997 mov 8(%rax),%rdi
998 mov 16(%rax),%rsi
999 mov %rax,152($context) # restore context->Rsp
1000 mov %rsi,168($context) # restore context->Rsi
1001 mov %rdi,176($context) # restore context->Rdi
1002
1003 jmp .Lcommon_seh_exit
1004.size common_se_handler,.-common_se_handler
1005
1006.type cbc_se_handler,\@abi-omnipotent
1007.align 16
1008cbc_se_handler:
1009 push %rsi
1010 push %rdi
1011 push %rbx
1012 push %rbp
1013 push %r12
1014 push %r13
1015 push %r14
1016 push %r15
1017 pushfq
1018 lea -64(%rsp),%rsp
1019
1020 mov 120($context),%rax # pull context->Rax
1021 mov 248($context),%rbx # pull context->Rip
1022
1023 lea .Lcbc_prologue(%rip),%r10
1024 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
1025 jb .Lin_cbc_prologue
1026
1027 lea .Lcbc_body(%rip),%r10
1028 cmp %r10,%rbx # context->Rip<.Lcbc_body
1029 jb .Lin_cbc_frame_setup
1030
1031 mov 152($context),%rax # pull context->Rsp
1032
1033 lea .Lcbc_abort(%rip),%r10
1034 cmp %r10,%rbx # context->Rip>=.Lcbc_abort
1035 jae .Lin_cbc_prologue
1036
1037 # handle pushf/popf in Camellia_cbc_encrypt
1038 lea .Lcbc_enc_pushf(%rip),%r10
1039 cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf
1040 jbe .Lin_cbc_no_flag
1041 lea 8(%rax),%rax
1042 lea .Lcbc_enc_popf(%rip),%r10
1043 cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf
1044 jb .Lin_cbc_no_flag
1045 lea -8(%rax),%rax
1046 lea .Lcbc_dec_pushf(%rip),%r10
1047 cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf
1048 jbe .Lin_cbc_no_flag
1049 lea 8(%rax),%rax
1050 lea .Lcbc_dec_popf(%rip),%r10
1051 cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf
1052 jb .Lin_cbc_no_flag
1053 lea -8(%rax),%rax
1054
1055.Lin_cbc_no_flag:
1056 mov 48(%rax),%rax # $_rsp
1057 lea 48(%rax),%rax
1058
1059.Lin_cbc_frame_setup:
1060 mov -8(%rax),%rbx
1061 mov -16(%rax),%rbp
1062 mov -24(%rax),%r12
1063 mov -32(%rax),%r13
1064 mov -40(%rax),%r14
1065 mov -48(%rax),%r15
1066 mov %rbx,144($context) # restore context->Rbx
1067 mov %rbp,160($context) # restore context->Rbp
1068 mov %r12,216($context) # restore context->R12
1069 mov %r13,224($context) # restore context->R13
1070 mov %r14,232($context) # restore context->R14
1071 mov %r15,240($context) # restore context->R15
1072
1073.Lin_cbc_prologue:
1074 mov 8(%rax),%rdi
1075 mov 16(%rax),%rsi
1076 mov %rax,152($context) # restore context->Rsp
1077 mov %rsi,168($context) # restore context->Rsi
1078 mov %rdi,176($context) # restore context->Rdi
1079
1080.align 4
1081.Lcommon_seh_exit:
1082
1083 mov 40($disp),%rdi # disp->ContextRecord
1084 mov $context,%rsi # context
1085 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1086 .long 0xa548f3fc # cld; rep movsq
1087
1088 mov $disp,%rsi
1089 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1090 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1091 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1092 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1093 mov 40(%rsi),%r10 # disp->ContextRecord
1094 lea 56(%rsi),%r11 # &disp->HandlerData
1095 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1096 mov %r10,32(%rsp) # arg5
1097 mov %r11,40(%rsp) # arg6
1098 mov %r12,48(%rsp) # arg7
1099 mov %rcx,56(%rsp) # arg8, (NULL)
1100 call *__imp_RtlVirtualUnwind(%rip)
1101
1102 mov \$1,%eax # ExceptionContinueSearch
1103 lea 64(%rsp),%rsp
1104 popfq
1105 pop %r15
1106 pop %r14
1107 pop %r13
1108 pop %r12
1109 pop %rbp
1110 pop %rbx
1111 pop %rdi
1112 pop %rsi
1113 ret
1114.size cbc_se_handler,.-cbc_se_handler
1115
1116.section .pdata
1117.align 4
1118 .rva .LSEH_begin_Camellia_EncryptBlock_Rounds
1119 .rva .LSEH_end_Camellia_EncryptBlock_Rounds
1120 .rva .LSEH_info_Camellia_EncryptBlock_Rounds
1121
1122 .rva .LSEH_begin_Camellia_DecryptBlock_Rounds
1123 .rva .LSEH_end_Camellia_DecryptBlock_Rounds
1124 .rva .LSEH_info_Camellia_DecryptBlock_Rounds
1125
1126 .rva .LSEH_begin_Camellia_Ekeygen
1127 .rva .LSEH_end_Camellia_Ekeygen
1128 .rva .LSEH_info_Camellia_Ekeygen
1129
1130 .rva .LSEH_begin_Camellia_cbc_encrypt
1131 .rva .LSEH_end_Camellia_cbc_encrypt
1132 .rva .LSEH_info_Camellia_cbc_encrypt
1133
1134.section .xdata
1135.align 8
1136.LSEH_info_Camellia_EncryptBlock_Rounds:
1137 .byte 9,0,0,0
1138 .rva common_se_handler
1139 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
1140.LSEH_info_Camellia_DecryptBlock_Rounds:
1141 .byte 9,0,0,0
1142 .rva common_se_handler
1143 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
1144.LSEH_info_Camellia_Ekeygen:
1145 .byte 9,0,0,0
1146 .rva common_se_handler
1147 .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[]
1148.LSEH_info_Camellia_cbc_encrypt:
1149 .byte 9,0,0,0
1150 .rva cbc_se_handler
1151___
1152}
1153
1154$code =~ s/\`([^\`]*)\`/eval $1/gem;
1155print $code;
1156close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette