#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# November 2014
#
# ChaCha20 for x86_64.
#
# December 2016
#
# Add AVX512F code path.
#
# December 2017
#
# Add AVX512VL code path.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3		NxAVX(v)
#
# P4		9.48/+99%	-		-
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42		1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31		1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)	0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40		2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
#
# (i)	compared to older gcc 3.x, one can observe a >2x improvement on
#	most platforms;
# (ii)	2xSSSE3 is the code path optimized specifically for the 128-byte
#	inputs used by chacha20_poly1305_tls_cipher; results are EVP-free;
# (iii)	this is not an optimal result for Atom because of MSROM
#	limitations: SSE2 can do better, but the gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes the 4xXOP code path, which delivers
#	2.20 and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever applies best;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in a single thread, the corresponding capability is suppressed;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
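
# A typical invocation (illustrative example) passes the output flavour
# first and the output file second, e.g.
#
#	perl chacha-x86_64.pl elf chacha-x86_64.S
#
# A single argument containing a dot is treated as the output file with
# the flavour left undefined, which is what the check above handles.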

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
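
# Any otherwise-undefined instruction call such as &rol("%eax",16) is
# caught by AUTOLOAD above and appended to $code as "rol	$16,%eax":
# numeric arguments become immediates (gaining a '$' prefix) and operands
# come out in AT&T source,destination order, so the destination is the
# first Perl argument but the last assembler operand.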

@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider the order in which the variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# The 'a', 'b' and 'd' values are permanently allocated in registers,
	# @x[0..7,12..15], while the 'c' values are maintained in memory. If
	# you look at the 'c' column, you'll notice that the pair of 'c's
	# live at the end of one round is the same pair needed at the start
	# of the next, so 'c' values only have to be reloaded once per round,
	# in the middle. This is why you'll see a bunch of 'c' stores and
	# loads in the middle, but none at the beginning or end.

	# Normally instructions would be interleaved to favour in-order
	# execution. Out-of-order cores generally manage it gracefully, but
	# not this time, for some reason. As in-order execution cores are a
	# dying breed and old Atom is the only one around, instructions are
	# left uninterleaved. Besides, Atom is better off executing the
	# 1xSSSE3 code anyway...

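	# For reference, the textbook ChaCha20 quarter-round on (a,b,c,d) is:
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<= 8;
	#	c += d; b ^= c; b <<<= 7;
	#
	# Each add/xor/rol triplet below is one line of this sequence,
	# applied to two quarter-rounds at a time (Q1/Q2, then Q3/Q4 after
	# the 'c' reload).
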
	(
	"&add	(@x[$a0],@x[$b0])",	# Q1
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],16)",
	"&add	(@x[$a1],@x[$b1])",	# Q2
	"&xor	(@x[$d1],@x[$a1])",
	"&rol	(@x[$d1],16)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],12)",
	"&add	($xc_,@x[$d1])",
	"&xor	(@x[$b1],$xc_)",
	"&rol	(@x[$b1],12)",

	"&add	(@x[$a0],@x[$b0])",
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],8)",
	"&add	(@x[$a1],@x[$b1])",
	"&xor	(@x[$d1],@x[$a1])",
	"&rol	(@x[$d1],8)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],7)",
	"&add	($xc_,@x[$d1])",
	"&xor	(@x[$b1],$xc_)",
	"&rol	(@x[$b1],7)",

	"&mov	(\"4*$c0(%rsp)\",$xc)",	# reload pair of 'c's
	"&mov	(\"4*$c1(%rsp)\",$xc_)",
	"&mov	($xc,\"4*$c2(%rsp)\")",
	"&mov	($xc_,\"4*$c3(%rsp)\")",

	"&add	(@x[$a2],@x[$b2])",	# Q3
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],16)",
	"&add	(@x[$a3],@x[$b3])",	# Q4
	"&xor	(@x[$d3],@x[$a3])",
	"&rol	(@x[$d3],16)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],12)",
	"&add	($xc_,@x[$d3])",
	"&xor	(@x[$b3],$xc_)",
	"&rol	(@x[$b3],12)",

	"&add	(@x[$a2],@x[$b2])",
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],8)",
	"&add	(@x[$a3],@x[$b3])",
	"&xor	(@x[$d3],@x[$a3])",
	"&rol	(@x[$d3],8)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],7)",
	"&add	($xc_,@x[$d3])",
	"&xor	(@x[$b3],$xc_)",
	"&rol	(@x[$b3],7)"
	);
}

########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
$code.=<<___;
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmp	\$0,$len
	je	.Lno_data
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
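	# The 64-bit load above picks up dwords 1 and 2 of OPENSSL_ia32cap_P:
	# bit 48 of %r10 is CPUID.7.EBX bit 16 (AVX512F), the sign bit is
	# CPUID.7.EBX bit 31 (AVX512VL), and bit 41-32=9 of %r10d is
	# CPUID.1.ECX bit 9 (SSSE3), which is what the branches below test.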
___
$code.=<<___	if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	jc	.LChaCha20_avx512
	test	%r10,%r10		# check for AVX512VL
	js	.LChaCha20_avx512vl
___
$code.=<<___;
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:

	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	($key),%xmm1
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	jmp	.Loop_outer

.align	32
.Loop_outer:
	mov	\$0x61707865,@x[0]	# 'expa'
	mov	\$0x3320646e,@x[1]	# 'nd 3'
	mov	\$0x79622d32,@x[2]	# '2-by'
	mov	\$0x6b206574,@x[3]	# 'te k'
	mov	4*4(%rsp),@x[4]
	mov	4*5(%rsp),@x[5]
	mov	4*6(%rsp),@x[6]
	mov	4*7(%rsp),@x[7]
	movd	%xmm3,@x[12]
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	\$10,%ebp
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	mov	%rsi,%rdi
	shr	\$32,%rdi		# "@x[9]"
	jmp	.Loop

.align	32
.Loop:
___
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND	(0, 5,10,15)) { eval; }
	&dec	("%ebp");
	&jnz	(".Loop");

$code.=<<___;
	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	@t[0],4*8(%rsp)
	mov	64(%rsp),%rbp		# load len
	movdqa	%xmm2,%xmm1
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out

	add	\$0x61707865,@x[0]	# 'expa'
	add	\$0x3320646e,@x[1]	# 'nd 3'
	add	\$0x79622d32,@x[2]	# '2-by'
	add	\$0x6b206574,@x[3]	# 'te k'
	add	4*4(%rsp),@x[4]
	add	4*5(%rsp),@x[5]
	add	4*6(%rsp),@x[6]
	add	4*7(%rsp),@x[7]
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1

	cmp	\$64,%rbp
	jb	.Ltail

	xor	4*0($inp),@x[0]		# xor with input
	xor	4*1($inp),@x[1]
	xor	4*2($inp),@x[2]
	xor	4*3($inp),@x[3]
	xor	4*4($inp),@x[4]
	xor	4*5($inp),@x[5]
	xor	4*6($inp),@x[6]
	xor	4*7($inp),@x[7]
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)

	mov	@x[0],4*0($out)		# write output
	mov	@x[1],4*1($out)
	mov	@x[2],4*2($out)
	mov	@x[3],4*3($out)
	mov	@x[4],4*4($out)
	mov	@x[5],4*5($out)
	mov	@x[6],4*6($out)
	mov	@x[7],4*7($out)
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64

	sub	\$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	mov	@x[0],4*0(%rsp)
	mov	@x[1],4*1(%rsp)
	xor	%rbx,%rbx
	mov	@x[2],4*2(%rsp)
	mov	@x[3],4*3(%rsp)
	mov	@x[4],4*4(%rsp)
	mov	@x[5],4*5(%rsp)
	mov	@x[6],4*6(%rsp)
	mov	@x[7],4*7(%rsp)
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)

.Loop_tail:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
	dec	%rbp
	jnz	.Loop_tail

.Ldone:
	lea	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

########################################################################
# SSSE3 code path that handles shorter lengths
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot16);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&pslld	($t,12);
	&por	($b,$t);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot24);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&pslld	($t,7);
	&por	($b,$t);
}
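
# SSE has no vector rotate instruction, so the rotates by 12 and 7 above
# are emulated with a pslld/psrld pair merged by por, while the rotates
# by 16 and 24 are done more cheaply as pshufb byte shuffles through the
# .Lrot16/.Lrot24 tables.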

my $xframe = $win64 ? 32+8 : 8;

$code.=<<___;
.type	ChaCha20_ssse3,\@function,5
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
___
$code.=<<___	if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
___
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	je	.LChaCha20_128
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lssse3_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),$d
	movdqa	0x00(%rsp),$a
	movdqa	0x10(%rsp),$b
	movdqa	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	mov	\$10,$counter
	movdqa	$d,0x30(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
___
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_ssse3");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d

	cmp	\$64,$len
	jb	.Ltail_ssse3

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	lea	0x40($inp),$inp		# inp+=64
	pxor	$t,$c
	pxor	$t1,$d

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	lea	0x40($out),$out		# out+=64

	sub	\$64,$len
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	xor	$counter,$counter

.Loop_tail_ssse3:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
___
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
___
}

########################################################################
# SSSE3 code path that handles 128-byte inputs
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));

sub SSSE3ROUND_2x {
	&paddd	($a,$b);
	&pxor	($d,$a);
	&paddd	($a1,$b1);
	&pxor	($d1,$a1);
	&pshufb	($d,$rot16);
	&pshufb	($d1,$rot16);

	&paddd	($c,$d);
	&paddd	($c1,$d1);
	&pxor	($b,$c);
	&pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&movdqa	($t1,$b1);
	&pslld	($t,12);
	&psrld	($b1,20);
	&por	($b,$t);
	&pslld	($t1,12);
	&por	($b1,$t1);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&paddd	($a1,$b1);
	&pxor	($d1,$a1);
	&pshufb	($d,$rot24);
	&pshufb	($d1,$rot24);

	&paddd	($c,$d);
	&paddd	($c1,$d1);
	&pxor	($b,$c);
	&pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&movdqa	($t1,$b1);
	&pslld	($t,7);
	&psrld	($b1,25);
	&por	($b,$t);
	&pslld	($t1,7);
	&por	($b1,$t1);
}
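
# The instructions for the two 64-byte blocks are interleaved: the
# $a/$b/$c/$d stream and the $a1/$b1/$c1/$d1 stream alternate so that
# their dependency chains overlap and hide each other's latency.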

my $xframe = $win64 ? 0x68 : 8;

$code.=<<___;
.type	ChaCha20_128,\@function,5
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x68(%r9)
	movaps	%xmm7,-0x58(%r9)
	movaps	%xmm8,-0x48(%r9)
	movaps	%xmm9,-0x38(%r9)
	movaps	%xmm10,-0x28(%r9)
	movaps	%xmm11,-0x18(%r9)
.L128_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lone(%rip),$d1
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,$a1
	movdqa	$a,0x00(%rsp)
	movdqa	$b,$b1
	movdqa	$b,0x10(%rsp)
	movdqa	$c,$c1
	movdqa	$c,0x20(%rsp)
	paddd	$d,$d1
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_128

.align	32
.Loop_128:
___
	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b00111001);
	&pshufd	($d1,$d1,0b10010011);

	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b10010011);
	&pshufd	($d1,$d1,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_128");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	paddd	.Lone(%rip),$d1
	paddd	0x00(%rsp),$a1
	paddd	0x10(%rsp),$b1
	paddd	0x20(%rsp),$c1
	paddd	0x30(%rsp),$d1

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	pxor	$t,$c
	movdqu	0x40($inp),$t
	pxor	$t1,$d
	movdqu	0x50($inp),$t1
	pxor	$t,$a1
	movdqu	0x60($inp),$t
	pxor	$t1,$b1
	movdqu	0x70($inp),$t1
	pxor	$t,$c1
	pxor	$t1,$d1

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	movdqu	$a1,0x40($out)
	movdqu	$b1,0x50($out)
	movdqu	$c1,0x60($out)
	movdqu	$d1,0x70($out)
___
$code.=<<___	if ($win64);
	movaps	-0x68(%r9),%xmm6
	movaps	-0x58(%r9),%xmm7
	movaps	-0x48(%r9),%xmm8
	movaps	-0x38(%r9),%xmm9
	movaps	-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
___
}

########################################################################
# SSSE3 code path that handles longer messages.
{
# assign variables to favor Atom front-end
my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

sub SSSE3_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider the order in which the variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# The 'a', 'b' and 'd' values are permanently allocated in registers,
	# @x[0..7,12..15], while the 'c' values are maintained in memory. If
	# you look at the 'c' column, you'll notice that the pair of 'c's
	# live at the end of one round is the same pair needed at the start
	# of the next, so 'c' values only have to be reloaded once per round,
	# in the middle. This is why you'll see a bunch of 'c' stores and
	# loads in the middle, but none at the beginning or end.

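	# In this 4x variant the rotates by 16 and 24 are byte shuffles
	# through the .Lrot16/.Lrot24 tables (addressed via %r10 and %r11),
	# while the rotates by 12 and 7 are emulated with pslld/psrld plus
	# por.
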
	(
	"&paddd	(@x[$a0],@x[$b0])",	# Q1
	"&paddd	(@x[$a1],@x[$b1])",	# Q2
	"&pxor	(@x[$d0],@x[$a0])",
	"&pxor	(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t1)",
	"&pshufb	(@x[$d1],$t1)",

	"&paddd	($xc,@x[$d0])",
	"&paddd	($xc_,@x[$d1])",
	"&pxor	(@x[$b0],$xc)",
	"&pxor	(@x[$b1],$xc_)",
	"&movdqa	($t0,@x[$b0])",
	"&pslld	(@x[$b0],12)",
	"&psrld	($t0,20)",
	"&movdqa	($t1,@x[$b1])",
	"&pslld	(@x[$b1],12)",
	"&por	(@x[$b0],$t0)",
	"&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	"&por	(@x[$b1],$t1)",

	"&paddd	(@x[$a0],@x[$b0])",
	"&paddd	(@x[$a1],@x[$b1])",
	"&pxor	(@x[$d0],@x[$a0])",
	"&pxor	(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t0)",
	"&pshufb	(@x[$d1],$t0)",

	"&paddd	($xc,@x[$d0])",
	"&paddd	($xc_,@x[$d1])",
	"&pxor	(@x[$b0],$xc)",
	"&pxor	(@x[$b1],$xc_)",
	"&movdqa	($t1,@x[$b0])",
	"&pslld	(@x[$b0],7)",
	"&psrld	($t1,25)",
	"&movdqa	($t0,@x[$b1])",
	"&pslld	(@x[$b1],7)",
	"&por	(@x[$b0],$t1)",
	"&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	"&por	(@x[$b1],$t0)",

	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	"&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
	"&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",

	"&paddd	(@x[$a2],@x[$b2])",	# Q3
	"&paddd	(@x[$a3],@x[$b3])",	# Q4
	"&pxor	(@x[$d2],@x[$a2])",
	"&pxor	(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t1)",
	"&pshufb	(@x[$d3],$t1)",

	"&paddd	($xc,@x[$d2])",
	"&paddd	($xc_,@x[$d3])",
	"&pxor	(@x[$b2],$xc)",
	"&pxor	(@x[$b3],$xc_)",
	"&movdqa	($t0,@x[$b2])",
	"&pslld	(@x[$b2],12)",
	"&psrld	($t0,20)",
	"&movdqa	($t1,@x[$b3])",
	"&pslld	(@x[$b3],12)",
	"&por	(@x[$b2],$t0)",
	"&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	"&por	(@x[$b3],$t1)",

	"&paddd	(@x[$a2],@x[$b2])",
	"&paddd	(@x[$a3],@x[$b3])",
	"&pxor	(@x[$d2],@x[$a2])",
	"&pxor	(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t0)",
	"&pshufb	(@x[$d3],$t0)",

	"&paddd	($xc,@x[$d2])",
	"&paddd	($xc_,@x[$d3])",
	"&pxor	(@x[$b2],$xc)",
	"&pxor	(@x[$b3],$xc_)",
	"&movdqa	($t1,@x[$b2])",
	"&pslld	(@x[$b2],7)",
	"&psrld	($t1,25)",
	"&movdqa	($t0,@x[$b3])",
	"&pslld	(@x[$b3],7)",
	"&por	(@x[$b2],$t1)",
	"&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	"&por	(@x[$b3],$t0)"
	);
}

my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_4x,\@function,5
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	mov	%r10,%r11
___
$code.=<<___	if ($avx>1);
	shr	\$32,%r10		# OPENSSL_ia32cap_P+8
	test	\$`1<<5`,%r10		# test AVX2
	jnz	.LChaCha20_8x
___
$code.=<<___;
	cmp	\$192,$len
	ja	.Lproceed4x

	and	\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
	cmp	\$`1<<22`,%r11		# check for MOVBE without XSAVE
	je	.Ldo_sse3_after_all	# to detect Atom

.Lproceed4x:
	sub	\$0x140+$xframe,%rsp
___
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# ...
	# +0x40		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x140
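	#
	# "Smashed by lanes" means each 32-bit state word is broadcast into
	# all four dwords of an xmm register (the pshufd selectors 0x00,
	# 0x55, 0xaa, 0xff below), so lane N of every vector carries the
	# state of block N and four blocks are processed in parallel.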
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L4x_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$xa3	# key[0]
	movdqu	($key),$xb3		# key[1]
	movdqu	16($key),$xt3		# key[2]
	movdqu	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	lea	.Lrot16(%rip),%r10
	lea	.Lrot24(%rip),%r11

	pshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	pshufd	\$0x55,$xa3,$xa1
	movdqa	$xa0,0x40(%rsp)		# ... and offload
	pshufd	\$0xaa,$xa3,$xa2
	movdqa	$xa1,0x50(%rsp)
	pshufd	\$0xff,$xa3,$xa3
	movdqa	$xa2,0x60(%rsp)
	movdqa	$xa3,0x70(%rsp)

	pshufd	\$0x00,$xb3,$xb0
	pshufd	\$0x55,$xb3,$xb1
	movdqa	$xb0,0x80-0x100(%rcx)
	pshufd	\$0xaa,$xb3,$xb2
	movdqa	$xb1,0x90-0x100(%rcx)
	pshufd	\$0xff,$xb3,$xb3
	movdqa	$xb2,0xa0-0x100(%rcx)
	movdqa	$xb3,0xb0-0x100(%rcx)

	pshufd	\$0x00,$xt3,$xt0	# "$xc0"
	pshufd	\$0x55,$xt3,$xt1	# "$xc1"
	movdqa	$xt0,0xc0-0x100(%rcx)
	pshufd	\$0xaa,$xt3,$xt2	# "$xc2"
	movdqa	$xt1,0xd0-0x100(%rcx)
	pshufd	\$0xff,$xt3,$xt3	# "$xc3"
	movdqa	$xt2,0xe0-0x100(%rcx)
	movdqa	$xt3,0xf0-0x100(%rcx)

	pshufd	\$0x00,$xd3,$xd0
	pshufd	\$0x55,$xd3,$xd1
	paddd	.Linc(%rip),$xd0	# don't save counters yet
	pshufd	\$0xaa,$xd3,$xd2
	movdqa	$xd1,0x110-0x100(%rcx)
	pshufd	\$0xff,$xd3,$xd3
	movdqa	$xd2,0x120-0x100(%rcx)
	movdqa	$xd3,0x130-0x100(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	0x40(%rsp),$xa0		# re-load smashed key
	movdqa	0x50(%rsp),$xa1
	movdqa	0x60(%rsp),$xa2
	movdqa	0x70(%rsp),$xa3
	movdqa	0x80-0x100(%rcx),$xb0
	movdqa	0x90-0x100(%rcx),$xb1
	movdqa	0xa0-0x100(%rcx),$xb2
	movdqa	0xb0-0x100(%rcx),$xb3
	movdqa	0xc0-0x100(%rcx),$xt0	# "$xc0"
	movdqa	0xd0-0x100(%rcx),$xt1	# "$xc1"
	movdqa	0xe0-0x100(%rcx),$xt2	# "$xc2"
	movdqa	0xf0-0x100(%rcx),$xt3	# "$xc3"
	movdqa	0x100-0x100(%rcx),$xd0
	movdqa	0x110-0x100(%rcx),$xd1
	movdqa	0x120-0x100(%rcx),$xd2
	movdqa	0x130-0x100(%rcx),$xd3
	paddd	.Lfour(%rip),$xd0	# next SIMD counters

.Loop_enter4x:
	movdqa	$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
	movdqa	$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
	movdqa	(%r10),$xt3		# .Lrot16(%rip)
	mov	\$10,%eax
	movdqa	$xd0,0x100-0x100(%rcx)	# save SIMD counters
	jmp	.Loop4x

.align	32
.Loop4x:
___
	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop4x

	paddd	0x40(%rsp),$xa0		# accumulate key material
	paddd	0x50(%rsp),$xa1
	paddd	0x60(%rsp),$xa2
	paddd	0x70(%rsp),$xa3

	movdqa	$xa0,$xt2		# "de-interlace" data
	punpckldq	$xa1,$xa0
	movdqa	$xa2,$xt3
	punpckldq	$xa3,$xa2
	punpckhdq	$xa1,$xt2
	punpckhdq	$xa3,$xt3
	movdqa	$xa0,$xa1
	punpcklqdq	$xa2,$xa0	# "a0"
	movdqa	$xt2,$xa3
	punpcklqdq	$xt3,$xt2	# "a2"
	punpckhqdq	$xa2,$xa1	# "a1"
	punpckhqdq	$xt3,$xa3	# "a3"
___
($xa2,$xt2)=($xt2,$xa2);
$code.=<<___;
	paddd	0x80-0x100(%rcx),$xb0
	paddd	0x90-0x100(%rcx),$xb1
	paddd	0xa0-0x100(%rcx),$xb2
	paddd	0xb0-0x100(%rcx),$xb3

	movdqa	$xa0,0x00(%rsp)		# offload $xaN
	movdqa	$xa1,0x10(%rsp)
	movdqa	0x20(%rsp),$xa0		# "xc2"
	movdqa	0x30(%rsp),$xa1		# "xc3"

	movdqa	$xb0,$xt2
	punpckldq	$xb1,$xb0
	movdqa	$xb2,$xt3
	punpckldq	$xb3,$xb2
	punpckhdq	$xb1,$xt2
	punpckhdq	$xb3,$xt3
	movdqa	$xb0,$xb1
	punpcklqdq	$xb2,$xb0	# "b0"
	movdqa	$xt2,$xb3
	punpcklqdq	$xt3,$xt2	# "b2"
	punpckhqdq	$xb2,$xb1	# "b1"
	punpckhqdq	$xt3,$xb3	# "b3"
___
($xb2,$xt2)=($xt2,$xb2);
my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	paddd	0xc0-0x100(%rcx),$xc0
	paddd	0xd0-0x100(%rcx),$xc1
	paddd	0xe0-0x100(%rcx),$xc2
	paddd	0xf0-0x100(%rcx),$xc3

	movdqa	$xa2,0x20(%rsp)		# keep offloading $xaN
	movdqa	$xa3,0x30(%rsp)

	movdqa	$xc0,$xt2
	punpckldq	$xc1,$xc0
	movdqa	$xc2,$xt3
	punpckldq	$xc3,$xc2
	punpckhdq	$xc1,$xt2
	punpckhdq	$xc3,$xt3
	movdqa	$xc0,$xc1
	punpcklqdq	$xc2,$xc0	# "c0"
	movdqa	$xt2,$xc3
	punpcklqdq	$xt3,$xt2	# "c2"
	punpckhqdq	$xc2,$xc1	# "c1"
	punpckhqdq	$xt3,$xc3	# "c3"
___
($xc2,$xt2)=($xt2,$xc2);
($xt0,$xt1)=($xa2,$xa3);		# use $xaN as temporary
$code.=<<___;
	paddd	0x100-0x100(%rcx),$xd0
	paddd	0x110-0x100(%rcx),$xd1
	paddd	0x120-0x100(%rcx),$xd2
	paddd	0x130-0x100(%rcx),$xd3

	movdqa	$xd0,$xt2
	punpckldq	$xd1,$xd0
	movdqa	$xd2,$xt3
	punpckldq	$xd3,$xd2
	punpckhdq	$xd1,$xt2
	punpckhdq	$xd3,$xt3
	movdqa	$xd0,$xd1
	punpcklqdq	$xd2,$xd0	# "d0"
	movdqa	$xt2,$xd3
	punpcklqdq	$xt3,$xt2	# "d2"
	punpckhqdq	$xd2,$xd1	# "d1"
	punpckhqdq	$xt3,$xd3	# "d3"
___
($xd2,$xt2)=($xt2,$xd2);
$code.=<<___;
	cmp	\$64*4,$len
	jb	.Ltail4x

	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3

	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	pxor	$xb2,$xt1
	pxor	$xc2,$xt2
	pxor	$xd2,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# inp+=64*4
	pxor	0x30(%rsp),$xt0
	pxor	$xb3,$xt1
	pxor	$xc3,$xt2
	pxor	$xd3,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# out+=64*4

	sub	\$64*4,$len
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmp	\$192,$len
	jae	.L192_or_more4x
	cmp	\$128,$len
	jae	.L128_or_more4x
	cmp	\$64,$len
	jae	.L64_or_more4x

	#movdqa	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	xor	%r10,%r10
	#movdqa	$xt0,0x00(%rsp)
	movdqa	$xb0,0x10(%rsp)
	movdqa	$xc0,0x20(%rsp)
	movdqa	$xd0,0x30(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	je	.Ldone4x

	movdqa	0x10(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*1
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb1,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*1
	movdqa	$xc1,0x20(%rsp)
	sub	\$64,$len		# len-=64*1
	movdqa	$xd1,0x30(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	je	.Ldone4x

	movdqa	0x20(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x80($inp),$inp		# inp+=64*2
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb2,0x10(%rsp)
	lea	0x80($out),$out		# out+=64*2
	movdqa	$xc2,0x20(%rsp)
	sub	\$128,$len		# len-=64*2
	movdqa	$xd2,0x30(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3

	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3

	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	pxor	$xb2,$xt1
	pxor	$xc2,$xt2
	pxor	$xd2,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	je	.Ldone4x

	movdqa	0x30(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*3
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb3,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*3
	movdqa	$xc3,0x20(%rsp)
	sub	\$192,$len		# len-=64*3
	movdqa	$xd3,0x30(%rsp)

.Loop_tail4x:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail4x

.Ldone4x:
___
$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
___
}

########################################################################
# XOP code path that handles all lengths.
if ($avx) {
# There is some "anomaly" observed depending on instruction size or
# alignment. If you look closely at the code below you'll notice that
# the argument order sometimes varies. The order affects instruction
# encoding by making it larger, and such fiddling gives a 5% performance
# improvement. This is on an FX-4100...

my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);

sub XOP_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);

	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],16)",
	"&vprotd	(@x[$d1],@x[$d1],16)",
	"&vprotd	(@x[$d2],@x[$d2],16)",
	"&vprotd	(@x[$d3],@x[$d3],16)",

	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor	(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],12)",
	"&vprotd	(@x[$b1],@x[$b1],12)",
	"&vprotd	(@x[$b2],@x[$b2],12)",
	"&vprotd	(@x[$b3],@x[$b3],12)",

	"&vpaddd	(@x[$a0],@x[$b0],@x[$a0])",	# flip
	"&vpaddd	(@x[$a1],@x[$b1],@x[$a1])",	# flip
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],8)",
	"&vprotd	(@x[$d1],@x[$d1],8)",
	"&vprotd	(@x[$d2],@x[$d2],8)",
	"&vprotd	(@x[$d3],@x[$d3],8)",

	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor	(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],7)",
	"&vprotd	(@x[$b1],@x[$b1],7)",
	"&vprotd	(@x[$b2],@x[$b2],7)",
	"&vprotd	(@x[$b3],@x[$b3],7)"
	);
}
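
# XOP's vprotd is a native vector rotate, so unlike the SSSE3 paths above
# this one needs neither pshufb tables nor pslld/psrld pairs; all four
# rotate amounts (16, 12, 8 and 7) use the same instruction.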

my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_4xop,\@function,5
.align	32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$0x140+$xframe,%rsp
___
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# ...
	# +0x40		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x140
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L4xop_body:
___
$code.=<<___;
	vzeroupper

	vmovdqa	.Lsigma(%rip),$xa3	# key[0]
	vmovdqu	($key),$xb3		# key[1]
	vmovdqu	16($key),$xt3		# key[2]
	vmovdqu	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization

	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vmovdqa	$xa0,0x40(%rsp)		# ... and offload
	vpshufd	\$0xaa,$xa3,$xa2
	vmovdqa	$xa1,0x50(%rsp)
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa	$xa2,0x60(%rsp)
	vmovdqa	$xa3,0x70(%rsp)

	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vmovdqa	$xb0,0x80-0x100(%rcx)
	vpshufd	\$0xaa,$xb3,$xb2
	vmovdqa	$xb1,0x90-0x100(%rcx)
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa	$xb2,0xa0-0x100(%rcx)
	vmovdqa	$xb3,0xb0-0x100(%rcx)

	vpshufd	\$0x00,$xt3,$xt0	# "$xc0"
	vpshufd	\$0x55,$xt3,$xt1	# "$xc1"
	vmovdqa	$xt0,0xc0-0x100(%rcx)
	vpshufd	\$0xaa,$xt3,$xt2	# "$xc2"
	vmovdqa	$xt1,0xd0-0x100(%rcx)
	vpshufd	\$0xff,$xt3,$xt3	# "$xc3"
	vmovdqa	$xt2,0xe0-0x100(%rcx)
	vmovdqa	$xt3,0xf0-0x100(%rcx)

	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpaddd	.Linc(%rip),$xd0,$xd0	# don't save counters yet
	vpshufd	\$0xaa,$xd3,$xd2
	vmovdqa	$xd1,0x110-0x100(%rcx)
	vpshufd	\$0xff,$xd3,$xd3
	vmovdqa	$xd2,0x120-0x100(%rcx)
	vmovdqa	$xd3,0x130-0x100(%rcx)

	jmp	.Loop_enter4xop

.align	32
.Loop_outer4xop:
	vmovdqa	0x40(%rsp),$xa0		# re-load smashed key
	vmovdqa	0x50(%rsp),$xa1
	vmovdqa	0x60(%rsp),$xa2
	vmovdqa	0x70(%rsp),$xa3
	vmovdqa	0x80-0x100(%rcx),$xb0
	vmovdqa	0x90-0x100(%rcx),$xb1
	vmovdqa	0xa0-0x100(%rcx),$xb2
	vmovdqa	0xb0-0x100(%rcx),$xb3
	vmovdqa	0xc0-0x100(%rcx),$xt0	# "$xc0"
	vmovdqa	0xd0-0x100(%rcx),$xt1	# "$xc1"
	vmovdqa	0xe0-0x100(%rcx),$xt2	# "$xc2"
	vmovdqa	0xf0-0x100(%rcx),$xt3	# "$xc3"
	vmovdqa	0x100-0x100(%rcx),$xd0
	vmovdqa	0x110-0x100(%rcx),$xd1
	vmovdqa	0x120-0x100(%rcx),$xd2
	vmovdqa	0x130-0x100(%rcx),$xd3
	vpaddd	.Lfour(%rip),$xd0,$xd0	# next SIMD counters

.Loop_enter4xop:
	mov	\$10,%eax
	vmovdqa	$xd0,0x100-0x100(%rcx)	# save SIMD counters
	jmp	.Loop4xop

.align	32
.Loop4xop:
___
	foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop4xop

	vpaddd	0x40(%rsp),$xa0,$xa0	# accumulate key material
	vpaddd	0x50(%rsp),$xa1,$xa1
	vpaddd	0x60(%rsp),$xa2,$xa2
	vpaddd	0x70(%rsp),$xa3,$xa3

	vmovdqa	$xt2,0x20(%rsp)		# offload $xc2,3
	vmovdqa	$xt3,0x30(%rsp)

	vpunpckldq	$xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1	# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3	# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0	# "a3"
___
($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd	0x80-0x100(%rcx),$xb0,$xb0
	vpaddd	0x90-0x100(%rcx),$xb1,$xb1
	vpaddd	0xa0-0x100(%rcx),$xb2,$xb2
	vpaddd	0xb0-0x100(%rcx),$xb3,$xb3

	vmovdqa	$xa0,0x00(%rsp)		# offload $xa0,1
	vmovdqa	$xa1,0x10(%rsp)
	vmovdqa	0x20(%rsp),$xa0		# "xc2"
	vmovdqa	0x30(%rsp),$xa1		# "xc3"

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1	# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3	# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0	# "b3"
___
($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	vpaddd	0xc0-0x100(%rcx),$xc0,$xc0
	vpaddd	0xd0-0x100(%rcx),$xc1,$xc1
	vpaddd	0xe0-0x100(%rcx),$xc2,$xc2
	vpaddd	0xf0-0x100(%rcx),$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1	# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3	# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0	# "c3"
___
($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd	0x100-0x100(%rcx),$xd0,$xd0
	vpaddd	0x110-0x100(%rcx),$xd1,$xd1
	vpaddd	0x120-0x100(%rcx),$xd2,$xd2
	vpaddd	0x130-0x100(%rcx),$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1	# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3	# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0	# "d3"
___
($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
($xa0,$xa1)=($xt2,$xt3);
$code.=<<___;
	vmovdqa	0x00(%rsp),$xa0		# restore $xa0,1
	vmovdqa	0x10(%rsp),$xa1

	cmp	\$64*4,$len
	jb	.Ltail4xop

	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x10($inp),$xb2,$xb2
	vpxor	0x20($inp),$xc2,$xc2
	vpxor	0x30($inp),$xd2,$xd2
	vpxor	0x40($inp),$xa3,$xa3
	vpxor	0x50($inp),$xb3,$xb3
	vpxor	0x60($inp),$xc3,$xc3
	vpxor	0x70($inp),$xd3,$xd3
	lea	0x80($inp),$inp		# inp+=64*4

	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	lea	0x80($out),$out		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x10($out)
	vmovdqu	$xc2,0x20($out)
|
---|
1663 | vmovdqu $xd2,0x30($out)
|
---|
1664 | vmovdqu $xa3,0x40($out)
|
---|
1665 | vmovdqu $xb3,0x50($out)
|
---|
1666 | vmovdqu $xc3,0x60($out)
|
---|
1667 | vmovdqu $xd3,0x70($out)
|
---|
1668 | lea 0x80($out),$out # out+=64*4
|
---|
1669 |
|
---|
1670 | sub \$64*4,$len
|
---|
1671 | jnz .Loop_outer4xop
|
---|
1672 |
|
---|
1673 | jmp .Ldone4xop
|
---|
1674 |
|
---|
1675 | .align 32
|
---|
1676 | .Ltail4xop:
|
---|
1677 | cmp \$192,$len
|
---|
1678 | jae .L192_or_more4xop
|
---|
1679 | cmp \$128,$len
|
---|
1680 | jae .L128_or_more4xop
|
---|
1681 | cmp \$64,$len
|
---|
1682 | jae .L64_or_more4xop
|
---|
1683 |
|
---|
1684 | xor %r10,%r10
|
---|
1685 | vmovdqa $xa0,0x00(%rsp)
|
---|
1686 | vmovdqa $xb0,0x10(%rsp)
|
---|
1687 | vmovdqa $xc0,0x20(%rsp)
|
---|
1688 | vmovdqa $xd0,0x30(%rsp)
|
---|
1689 | jmp .Loop_tail4xop
|
---|
1690 |
|
---|
1691 | .align 32
|
---|
1692 | .L64_or_more4xop:
|
---|
1693 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
1694 | vpxor 0x10($inp),$xb0,$xb0
|
---|
1695 | vpxor 0x20($inp),$xc0,$xc0
|
---|
1696 | vpxor 0x30($inp),$xd0,$xd0
|
---|
1697 | vmovdqu $xa0,0x00($out)
|
---|
1698 | vmovdqu $xb0,0x10($out)
|
---|
1699 | vmovdqu $xc0,0x20($out)
|
---|
1700 | vmovdqu $xd0,0x30($out)
|
---|
1701 | je .Ldone4xop
|
---|
1702 |
|
---|
1703 | lea 0x40($inp),$inp # inp+=64*1
|
---|
1704 | vmovdqa $xa1,0x00(%rsp)
|
---|
1705 | xor %r10,%r10
|
---|
1706 | vmovdqa $xb1,0x10(%rsp)
|
---|
1707 | lea 0x40($out),$out # out+=64*1
|
---|
1708 | vmovdqa $xc1,0x20(%rsp)
|
---|
1709 | sub \$64,$len # len-=64*1
|
---|
1710 | vmovdqa $xd1,0x30(%rsp)
|
---|
1711 | jmp .Loop_tail4xop
|
---|
1712 |
|
---|
1713 | .align 32
|
---|
1714 | .L128_or_more4xop:
|
---|
1715 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
1716 | vpxor 0x10($inp),$xb0,$xb0
|
---|
1717 | vpxor 0x20($inp),$xc0,$xc0
|
---|
1718 | vpxor 0x30($inp),$xd0,$xd0
|
---|
1719 | vpxor 0x40($inp),$xa1,$xa1
|
---|
1720 | vpxor 0x50($inp),$xb1,$xb1
|
---|
1721 | vpxor 0x60($inp),$xc1,$xc1
|
---|
1722 | vpxor 0x70($inp),$xd1,$xd1
|
---|
1723 |
|
---|
1724 | vmovdqu $xa0,0x00($out)
|
---|
1725 | vmovdqu $xb0,0x10($out)
|
---|
1726 | vmovdqu $xc0,0x20($out)
|
---|
1727 | vmovdqu $xd0,0x30($out)
|
---|
1728 | vmovdqu $xa1,0x40($out)
|
---|
1729 | vmovdqu $xb1,0x50($out)
|
---|
1730 | vmovdqu $xc1,0x60($out)
|
---|
1731 | vmovdqu $xd1,0x70($out)
|
---|
1732 | je .Ldone4xop
|
---|
1733 |
|
---|
1734 | lea 0x80($inp),$inp # inp+=64*2
|
---|
1735 | vmovdqa $xa2,0x00(%rsp)
|
---|
1736 | xor %r10,%r10
|
---|
1737 | vmovdqa $xb2,0x10(%rsp)
|
---|
1738 | lea 0x80($out),$out # out+=64*2
|
---|
1739 | vmovdqa $xc2,0x20(%rsp)
|
---|
1740 | sub \$128,$len # len-=64*2
|
---|
1741 | vmovdqa $xd2,0x30(%rsp)
|
---|
1742 | jmp .Loop_tail4xop
|
---|
1743 |
|
---|
1744 | .align 32
|
---|
1745 | .L192_or_more4xop:
|
---|
1746 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
1747 | vpxor 0x10($inp),$xb0,$xb0
|
---|
1748 | vpxor 0x20($inp),$xc0,$xc0
|
---|
1749 | vpxor 0x30($inp),$xd0,$xd0
|
---|
1750 | vpxor 0x40($inp),$xa1,$xa1
|
---|
1751 | vpxor 0x50($inp),$xb1,$xb1
|
---|
1752 | vpxor 0x60($inp),$xc1,$xc1
|
---|
1753 | vpxor 0x70($inp),$xd1,$xd1
|
---|
1754 | lea 0x80($inp),$inp # size optimization
|
---|
1755 | vpxor 0x00($inp),$xa2,$xa2
|
---|
1756 | vpxor 0x10($inp),$xb2,$xb2
|
---|
1757 | vpxor 0x20($inp),$xc2,$xc2
|
---|
1758 | vpxor 0x30($inp),$xd2,$xd2
|
---|
1759 |
|
---|
1760 | vmovdqu $xa0,0x00($out)
|
---|
1761 | vmovdqu $xb0,0x10($out)
|
---|
1762 | vmovdqu $xc0,0x20($out)
|
---|
1763 | vmovdqu $xd0,0x30($out)
|
---|
1764 | vmovdqu $xa1,0x40($out)
|
---|
1765 | vmovdqu $xb1,0x50($out)
|
---|
1766 | vmovdqu $xc1,0x60($out)
|
---|
1767 | vmovdqu $xd1,0x70($out)
|
---|
1768 | lea 0x80($out),$out # size optimization
|
---|
1769 | vmovdqu $xa2,0x00($out)
|
---|
1770 | vmovdqu $xb2,0x10($out)
|
---|
1771 | vmovdqu $xc2,0x20($out)
|
---|
1772 | vmovdqu $xd2,0x30($out)
|
---|
1773 | je .Ldone4xop
|
---|
1774 |
|
---|
1775 | lea 0x40($inp),$inp # inp+=64*3
|
---|
1776 | vmovdqa $xa3,0x00(%rsp)
|
---|
1777 | xor %r10,%r10
|
---|
1778 | vmovdqa $xb3,0x10(%rsp)
|
---|
1779 | lea 0x40($out),$out # out+=64*3
|
---|
1780 | vmovdqa $xc3,0x20(%rsp)
|
---|
1781 | sub \$192,$len # len-=64*3
|
---|
1782 | vmovdqa $xd3,0x30(%rsp)
|
---|
1783 |
|
---|
1784 | .Loop_tail4xop:
|
---|
1785 | movzb ($inp,%r10),%eax
|
---|
1786 | movzb (%rsp,%r10),%ecx
|
---|
1787 | lea 1(%r10),%r10
|
---|
1788 | xor %ecx,%eax
|
---|
1789 | mov %al,-1($out,%r10)
|
---|
1790 | dec $len
|
---|
1791 | jnz .Loop_tail4xop
|
---|
1792 |
|
---|
1793 | .Ldone4xop:
|
---|
1794 | vzeroupper
|
---|
1795 | ___
|
---|
1796 | $code.=<<___ if ($win64);
|
---|
1797 | movaps -0xa8(%r9),%xmm6
|
---|
1798 | movaps -0x98(%r9),%xmm7
|
---|
1799 | movaps -0x88(%r9),%xmm8
|
---|
1800 | movaps -0x78(%r9),%xmm9
|
---|
1801 | movaps -0x68(%r9),%xmm10
|
---|
1802 | movaps -0x58(%r9),%xmm11
|
---|
1803 | movaps -0x48(%r9),%xmm12
|
---|
1804 | movaps -0x38(%r9),%xmm13
|
---|
1805 | movaps -0x28(%r9),%xmm14
|
---|
1806 | movaps -0x18(%r9),%xmm15
|
---|
1807 | ___
|
---|
1808 | $code.=<<___;
|
---|
1809 | lea (%r9),%rsp
|
---|
1810 | .cfi_def_cfa_register %rsp
|
---|
1811 | .L4xop_epilogue:
|
---|
1812 | ret
|
---|
1813 | .cfi_endproc
|
---|
1814 | .size ChaCha20_4xop,.-ChaCha20_4xop
|
---|
1815 | ___
|
---|
1816 | }
|
---|
1817 |
|
---|
1818 | ########################################################################
|
---|
1819 | # AVX2 code path
|
---|
1820 | if ($avx>1) {
|
---|
1821 | my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
|
---|
1822 | $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
|
---|
1823 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
|
---|
1824 | "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
|
---|
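# ("%nox" entries are placeholders: the four 'c' rows of the state have no
#  dedicated %ymm register in this code path and live on the stack instead;
#  see the scheduling note inside AVX2_lane_ROUND below.)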
1825 |
|
---|
1826 | sub AVX2_lane_ROUND {
|
---|
1827 | my ($a0,$b0,$c0,$d0)=@_;
|
---|
1828 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
|
---|
1829 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
|
---|
1830 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
|
---|
1831 | my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
|
---|
1832 | my @x=map("\"$_\"",@xx);
|
---|
1833 |
|
---|
1834 | # Consider the order in which variables are addressed by their
|
---|
1835 | # index:
|
---|
1836 | #
|
---|
1837 | # a b c d
|
---|
1838 | #
|
---|
1839 | # 0 4 8 12 < even round
|
---|
1840 | # 1 5 9 13
|
---|
1841 | # 2 6 10 14
|
---|
1842 | # 3 7 11 15
|
---|
1843 | # 0 5 10 15 < odd round
|
---|
1844 | # 1 6 11 12
|
---|
1845 | # 2 7 8 13
|
---|
1846 | # 3 4 9 14
|
---|
1847 | #
|
---|
1848 | # 'a', 'b' and 'd's are permanently allocated in registers,
|
---|
1849 | # @x[0..7,12..15], while 'c's are maintained in memory. If
|
---|
1850 | # you look at the 'c' column, you'll notice that each pair of 'c's
|
---|
1851 | # stays invariant between rounds. This means that we only have to
|
---|
1852 | # reload them once per round, in the middle. This is why you'll see
|
---|
1853 | # a bunch of 'c' stores and loads in the middle, but none at
|
---|
1854 | # the beginning or end.
|
---|
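#
# The quarter-rounds are therefore emitted in two interleaved pairs, Q1/Q2
# first and Q3/Q4 second, with the 'c' stores and loads placed between the
# two halves so that the memory traffic overlaps independent arithmetic.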
1855 |
|
---|
1856 | (
|
---|
1857 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
|
---|
1858 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
|
---|
1859 | "&vpshufb (@x[$d0],@x[$d0],$t1)",
|
---|
1860 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
|
---|
1861 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
|
---|
1862 | "&vpshufb (@x[$d1],@x[$d1],$t1)",
|
---|
1863 |
|
---|
1864 | "&vpaddd ($xc,$xc,@x[$d0])",
|
---|
1865 | "&vpxor (@x[$b0],$xc,@x[$b0])",
|
---|
1866 | "&vpslld ($t0,@x[$b0],12)",
|
---|
1867 | "&vpsrld (@x[$b0],@x[$b0],20)",
|
---|
1868 | "&vpor (@x[$b0],$t0,@x[$b0])",
|
---|
1869 | "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
|
---|
1870 | "&vpaddd ($xc_,$xc_,@x[$d1])",
|
---|
1871 | "&vpxor (@x[$b1],$xc_,@x[$b1])",
|
---|
1872 | "&vpslld ($t1,@x[$b1],12)",
|
---|
1873 | "&vpsrld (@x[$b1],@x[$b1],20)",
|
---|
1874 | "&vpor (@x[$b1],$t1,@x[$b1])",
|
---|
1875 |
|
---|
1876 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
|
---|
1877 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
|
---|
1878 | "&vpshufb (@x[$d0],@x[$d0],$t0)",
|
---|
1879 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
|
---|
1880 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
|
---|
1881 | "&vpshufb (@x[$d1],@x[$d1],$t0)",
|
---|
1882 |
|
---|
1883 | "&vpaddd ($xc,$xc,@x[$d0])",
|
---|
1884 | "&vpxor (@x[$b0],$xc,@x[$b0])",
|
---|
1885 | "&vpslld ($t1,@x[$b0],7)",
|
---|
1886 | "&vpsrld (@x[$b0],@x[$b0],25)",
|
---|
1887 | "&vpor (@x[$b0],$t1,@x[$b0])",
|
---|
1888 | "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
|
---|
1889 | "&vpaddd ($xc_,$xc_,@x[$d1])",
|
---|
1890 | "&vpxor (@x[$b1],$xc_,@x[$b1])",
|
---|
1891 | "&vpslld ($t0,@x[$b1],7)",
|
---|
1892 | "&vpsrld (@x[$b1],@x[$b1],25)",
|
---|
1893 | "&vpor (@x[$b1],$t0,@x[$b1])",
|
---|
1894 |
|
---|
1895 | "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
|
---|
1896 | "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
|
---|
1897 | "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
|
---|
1898 | "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
|
---|
1899 |
|
---|
1900 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
|
---|
1901 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
|
---|
1902 | "&vpshufb (@x[$d2],@x[$d2],$t1)",
|
---|
1903 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
|
---|
1904 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
|
---|
1905 | "&vpshufb (@x[$d3],@x[$d3],$t1)",
|
---|
1906 |
|
---|
1907 | "&vpaddd ($xc,$xc,@x[$d2])",
|
---|
1908 | "&vpxor (@x[$b2],$xc,@x[$b2])",
|
---|
1909 | "&vpslld ($t0,@x[$b2],12)",
|
---|
1910 | "&vpsrld (@x[$b2],@x[$b2],20)",
|
---|
1911 | "&vpor (@x[$b2],$t0,@x[$b2])",
|
---|
1912 | "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
|
---|
1913 | "&vpaddd ($xc_,$xc_,@x[$d3])",
|
---|
1914 | "&vpxor (@x[$b3],$xc_,@x[$b3])",
|
---|
1915 | "&vpslld ($t1,@x[$b3],12)",
|
---|
1916 | "&vpsrld (@x[$b3],@x[$b3],20)",
|
---|
1917 | "&vpor (@x[$b3],$t1,@x[$b3])",
|
---|
1918 |
|
---|
1919 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
|
---|
1920 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
|
---|
1921 | "&vpshufb (@x[$d2],@x[$d2],$t0)",
|
---|
1922 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
|
---|
1923 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
|
---|
1924 | "&vpshufb (@x[$d3],@x[$d3],$t0)",
|
---|
1925 |
|
---|
1926 | "&vpaddd ($xc,$xc,@x[$d2])",
|
---|
1927 | "&vpxor (@x[$b2],$xc,@x[$b2])",
|
---|
1928 | "&vpslld ($t1,@x[$b2],7)",
|
---|
1929 | "&vpsrld (@x[$b2],@x[$b2],25)",
|
---|
1930 | "&vpor (@x[$b2],$t1,@x[$b2])",
|
---|
1931 | "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
|
---|
1932 | "&vpaddd ($xc_,$xc_,@x[$d3])",
|
---|
1933 | "&vpxor (@x[$b3],$xc_,@x[$b3])",
|
---|
1934 | "&vpslld ($t0,@x[$b3],7)",
|
---|
1935 | "&vpsrld (@x[$b3],@x[$b3],25)",
|
---|
1936 | "&vpor (@x[$b3],$t0,@x[$b3])"
|
---|
1937 | );
|
---|
1938 | }
|
---|
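# For reference, a minimal scalar sketch of the quarter-round that each group
# of vpaddd/vpxor/rotate instructions above performs, four or eight lanes at a
# time (illustrative only, never called by this module; assumes 32-bit
# wrap-around arithmetic):
sub chacha_quarter_round_ref {
    my ($a,$b,$c,$d) = @_;
    my $rotl = sub { my ($v,$n) = @_; (($v << $n) | ($v >> (32-$n))) & 0xffffffff };
    $a = ($a + $b) & 0xffffffff;	$d = $rotl->($d ^ $a, 16);
    $c = ($c + $d) & 0xffffffff;	$b = $rotl->($b ^ $c, 12);
    $a = ($a + $b) & 0xffffffff;	$d = $rotl->($d ^ $a,  8);
    $c = ($c + $d) & 0xffffffff;	$b = $rotl->($b ^ $c,  7);
    return ($a,$b,$c,$d);
}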
1939 |
|
---|
1940 | my $xframe = $win64 ? 0xa8 : 8;
|
---|
1941 |
|
---|
1942 | $code.=<<___;
|
---|
1943 | .type ChaCha20_8x,\@function,5
|
---|
1944 | .align 32
|
---|
1945 | ChaCha20_8x:
|
---|
1946 | .cfi_startproc
|
---|
1947 | .LChaCha20_8x:
|
---|
1948 | mov %rsp,%r9 # frame register
|
---|
1949 | .cfi_def_cfa_register %r9
|
---|
1950 | sub \$0x280+$xframe,%rsp
|
---|
1951 | and \$-32,%rsp
|
---|
1952 | ___
|
---|
1953 | $code.=<<___ if ($win64);
|
---|
1954 | movaps %xmm6,-0xa8(%r9)
|
---|
1955 | movaps %xmm7,-0x98(%r9)
|
---|
1956 | movaps %xmm8,-0x88(%r9)
|
---|
1957 | movaps %xmm9,-0x78(%r9)
|
---|
1958 | movaps %xmm10,-0x68(%r9)
|
---|
1959 | movaps %xmm11,-0x58(%r9)
|
---|
1960 | movaps %xmm12,-0x48(%r9)
|
---|
1961 | movaps %xmm13,-0x38(%r9)
|
---|
1962 | movaps %xmm14,-0x28(%r9)
|
---|
1963 | movaps %xmm15,-0x18(%r9)
|
---|
1964 | .L8x_body:
|
---|
1965 | ___
|
---|
1966 | $code.=<<___;
|
---|
1967 | vzeroupper
|
---|
1968 |
|
---|
1969 | ################ stack layout
|
---|
1970 | # +0x00 SIMD equivalent of @x[8-12]
|
---|
1971 | # ...
|
---|
1972 | # +0x80 constant copy of key[0-2] smashed by lanes
|
---|
1973 | # ...
|
---|
1974 | # +0x200 SIMD counters (with nonce smashed by lanes)
|
---|
1975 | # ...
|
---|
1976 | # +0x280
|
---|
1977 |
|
---|
1978 | vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
|
---|
1979 | vbroadcasti128 ($key),$xb3 # key[1]
|
---|
1980 | vbroadcasti128 16($key),$xt3 # key[2]
|
---|
1981 | vbroadcasti128 ($counter),$xd3 # key[3]
|
---|
1982 | lea 0x100(%rsp),%rcx # size optimization
|
---|
1983 | lea 0x200(%rsp),%rax # size optimization
|
---|
1984 | lea .Lrot16(%rip),%r10
|
---|
1985 | lea .Lrot24(%rip),%r11
|
---|
1986 |
|
---|
1987 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
|
---|
1988 | vpshufd \$0x55,$xa3,$xa1
|
---|
1989 | vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
|
---|
1990 | vpshufd \$0xaa,$xa3,$xa2
|
---|
1991 | vmovdqa $xa1,0xa0-0x100(%rcx)
|
---|
1992 | vpshufd \$0xff,$xa3,$xa3
|
---|
1993 | vmovdqa $xa2,0xc0-0x100(%rcx)
|
---|
1994 | vmovdqa $xa3,0xe0-0x100(%rcx)
|
---|
1995 |
|
---|
1996 | vpshufd \$0x00,$xb3,$xb0
|
---|
1997 | vpshufd \$0x55,$xb3,$xb1
|
---|
1998 | vmovdqa $xb0,0x100-0x100(%rcx)
|
---|
1999 | vpshufd \$0xaa,$xb3,$xb2
|
---|
2000 | vmovdqa $xb1,0x120-0x100(%rcx)
|
---|
2001 | vpshufd \$0xff,$xb3,$xb3
|
---|
2002 | vmovdqa $xb2,0x140-0x100(%rcx)
|
---|
2003 | vmovdqa $xb3,0x160-0x100(%rcx)
|
---|
2004 |
|
---|
2005 | vpshufd \$0x00,$xt3,$xt0 # "xc0"
|
---|
2006 | vpshufd \$0x55,$xt3,$xt1 # "xc1"
|
---|
2007 | vmovdqa $xt0,0x180-0x200(%rax)
|
---|
2008 | vpshufd \$0xaa,$xt3,$xt2 # "xc2"
|
---|
2009 | vmovdqa $xt1,0x1a0-0x200(%rax)
|
---|
2010 | vpshufd \$0xff,$xt3,$xt3 # "xc3"
|
---|
2011 | vmovdqa $xt2,0x1c0-0x200(%rax)
|
---|
2012 | vmovdqa $xt3,0x1e0-0x200(%rax)
|
---|
2013 |
|
---|
2014 | vpshufd \$0x00,$xd3,$xd0
|
---|
2015 | vpshufd \$0x55,$xd3,$xd1
|
---|
2016 | vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
|
---|
2017 | vpshufd \$0xaa,$xd3,$xd2
|
---|
2018 | vmovdqa $xd1,0x220-0x200(%rax)
|
---|
2019 | vpshufd \$0xff,$xd3,$xd3
|
---|
2020 | vmovdqa $xd2,0x240-0x200(%rax)
|
---|
2021 | vmovdqa $xd3,0x260-0x200(%rax)
|
---|
2022 |
|
---|
2023 | jmp .Loop_enter8x
|
---|
2024 |
|
---|
2025 | .align 32
|
---|
2026 | .Loop_outer8x:
|
---|
2027 | vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
|
---|
2028 | vmovdqa 0xa0-0x100(%rcx),$xa1
|
---|
2029 | vmovdqa 0xc0-0x100(%rcx),$xa2
|
---|
2030 | vmovdqa 0xe0-0x100(%rcx),$xa3
|
---|
2031 | vmovdqa 0x100-0x100(%rcx),$xb0
|
---|
2032 | vmovdqa 0x120-0x100(%rcx),$xb1
|
---|
2033 | vmovdqa 0x140-0x100(%rcx),$xb2
|
---|
2034 | vmovdqa 0x160-0x100(%rcx),$xb3
|
---|
2035 | vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
|
---|
2036 | vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
|
---|
2037 | vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
|
---|
2038 | vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
|
---|
2039 | vmovdqa 0x200-0x200(%rax),$xd0
|
---|
2040 | vmovdqa 0x220-0x200(%rax),$xd1
|
---|
2041 | vmovdqa 0x240-0x200(%rax),$xd2
|
---|
2042 | vmovdqa 0x260-0x200(%rax),$xd3
|
---|
2043 | vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
|
---|
2044 |
|
---|
2045 | .Loop_enter8x:
|
---|
2046 | vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
|
---|
2047 | vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
|
---|
2048 | vbroadcasti128 (%r10),$xt3
|
---|
2049 | vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
|
---|
2050 | mov \$10,%eax
|
---|
2051 | jmp .Loop8x
|
---|
2052 |
|
---|
2053 | .align 32
|
---|
2054 | .Loop8x:
|
---|
2055 | ___
|
---|
2056 | foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
|
---|
2057 | foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
|
---|
2058 | $code.=<<___;
|
---|
2059 | dec %eax
|
---|
2060 | jnz .Loop8x
|
---|
2061 |
|
---|
2062 | lea 0x200(%rsp),%rax # size optimization
|
---|
2063 | vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
|
---|
2064 | vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
|
---|
2065 | vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
|
---|
2066 | vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
|
---|
2067 |
|
---|
2068 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
|
---|
2069 | vpunpckldq $xa3,$xa2,$xt3
|
---|
2070 | vpunpckhdq $xa1,$xa0,$xa0
|
---|
2071 | vpunpckhdq $xa3,$xa2,$xa2
|
---|
2072 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
|
---|
2073 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
|
---|
2074 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
|
---|
2075 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
|
---|
2076 | ___
|
---|
2077 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
|
---|
2078 | $code.=<<___;
|
---|
2079 | vpaddd 0x100-0x100(%rcx),$xb0,$xb0
|
---|
2080 | vpaddd 0x120-0x100(%rcx),$xb1,$xb1
|
---|
2081 | vpaddd 0x140-0x100(%rcx),$xb2,$xb2
|
---|
2082 | vpaddd 0x160-0x100(%rcx),$xb3,$xb3
|
---|
2083 |
|
---|
2084 | vpunpckldq $xb1,$xb0,$xt2
|
---|
2085 | vpunpckldq $xb3,$xb2,$xt3
|
---|
2086 | vpunpckhdq $xb1,$xb0,$xb0
|
---|
2087 | vpunpckhdq $xb3,$xb2,$xb2
|
---|
2088 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
|
---|
2089 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
|
---|
2090 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
|
---|
2091 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
|
---|
2092 | ___
|
---|
2093 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
|
---|
2094 | $code.=<<___;
|
---|
2095 | vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
|
---|
2096 | vperm2i128 \$0x31,$xb0,$xa0,$xb0
|
---|
2097 | vperm2i128 \$0x20,$xb1,$xa1,$xa0
|
---|
2098 | vperm2i128 \$0x31,$xb1,$xa1,$xb1
|
---|
2099 | vperm2i128 \$0x20,$xb2,$xa2,$xa1
|
---|
2100 | vperm2i128 \$0x31,$xb2,$xa2,$xb2
|
---|
2101 | vperm2i128 \$0x20,$xb3,$xa3,$xa2
|
---|
2102 | vperm2i128 \$0x31,$xb3,$xa3,$xb3
|
---|
2103 | ___
|
---|
2104 | ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
|
---|
2105 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
|
---|
2106 | $code.=<<___;
|
---|
2107 | vmovdqa $xa0,0x00(%rsp) # offload $xaN
|
---|
2108 | vmovdqa $xa1,0x20(%rsp)
|
---|
2109 | vmovdqa 0x40(%rsp),$xc2 # $xa0
|
---|
2110 | vmovdqa 0x60(%rsp),$xc3 # $xa1
|
---|
2111 |
|
---|
2112 | vpaddd 0x180-0x200(%rax),$xc0,$xc0
|
---|
2113 | vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
|
---|
2114 | vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
|
---|
2115 | vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
|
---|
2116 |
|
---|
2117 | vpunpckldq $xc1,$xc0,$xt2
|
---|
2118 | vpunpckldq $xc3,$xc2,$xt3
|
---|
2119 | vpunpckhdq $xc1,$xc0,$xc0
|
---|
2120 | vpunpckhdq $xc3,$xc2,$xc2
|
---|
2121 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
|
---|
2122 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
|
---|
2123 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
|
---|
2124 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
|
---|
2125 | ___
|
---|
2126 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
|
---|
2127 | $code.=<<___;
|
---|
2128 | vpaddd 0x200-0x200(%rax),$xd0,$xd0
|
---|
2129 | vpaddd 0x220-0x200(%rax),$xd1,$xd1
|
---|
2130 | vpaddd 0x240-0x200(%rax),$xd2,$xd2
|
---|
2131 | vpaddd 0x260-0x200(%rax),$xd3,$xd3
|
---|
2132 |
|
---|
2133 | vpunpckldq $xd1,$xd0,$xt2
|
---|
2134 | vpunpckldq $xd3,$xd2,$xt3
|
---|
2135 | vpunpckhdq $xd1,$xd0,$xd0
|
---|
2136 | vpunpckhdq $xd3,$xd2,$xd2
|
---|
2137 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
|
---|
2138 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
|
---|
2139 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
|
---|
2140 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
|
---|
2141 | ___
|
---|
2142 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
|
---|
2143 | $code.=<<___;
|
---|
2144 | vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
|
---|
2145 | vperm2i128 \$0x31,$xd0,$xc0,$xd0
|
---|
2146 | vperm2i128 \$0x20,$xd1,$xc1,$xc0
|
---|
2147 | vperm2i128 \$0x31,$xd1,$xc1,$xd1
|
---|
2148 | vperm2i128 \$0x20,$xd2,$xc2,$xc1
|
---|
2149 | vperm2i128 \$0x31,$xd2,$xc2,$xd2
|
---|
2150 | vperm2i128 \$0x20,$xd3,$xc3,$xc2
|
---|
2151 | vperm2i128 \$0x31,$xd3,$xc3,$xd3
|
---|
2152 | ___
|
---|
2153 | ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
|
---|
2154 | ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
|
---|
2155 | ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
|
---|
2156 | ($xa0,$xa1)=($xt2,$xt3);
|
---|
2157 | $code.=<<___;
|
---|
2158 | vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
|
---|
2159 | vmovdqa 0x20(%rsp),$xa1
|
---|
2160 |
|
---|
2161 | cmp \$64*8,$len
|
---|
2162 | jb .Ltail8x
|
---|
2163 |
|
---|
2164 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
2165 | vpxor 0x20($inp),$xb0,$xb0
|
---|
2166 | vpxor 0x40($inp),$xc0,$xc0
|
---|
2167 | vpxor 0x60($inp),$xd0,$xd0
|
---|
2168 | lea 0x80($inp),$inp # size optimization
|
---|
2169 | vmovdqu $xa0,0x00($out)
|
---|
2170 | vmovdqu $xb0,0x20($out)
|
---|
2171 | vmovdqu $xc0,0x40($out)
|
---|
2172 | vmovdqu $xd0,0x60($out)
|
---|
2173 | lea 0x80($out),$out # size optimization
|
---|
2174 |
|
---|
2175 | vpxor 0x00($inp),$xa1,$xa1
|
---|
2176 | vpxor 0x20($inp),$xb1,$xb1
|
---|
2177 | vpxor 0x40($inp),$xc1,$xc1
|
---|
2178 | vpxor 0x60($inp),$xd1,$xd1
|
---|
2179 | lea 0x80($inp),$inp # size optimization
|
---|
2180 | vmovdqu $xa1,0x00($out)
|
---|
2181 | vmovdqu $xb1,0x20($out)
|
---|
2182 | vmovdqu $xc1,0x40($out)
|
---|
2183 | vmovdqu $xd1,0x60($out)
|
---|
2184 | lea 0x80($out),$out # size optimization
|
---|
2185 |
|
---|
2186 | vpxor 0x00($inp),$xa2,$xa2
|
---|
2187 | vpxor 0x20($inp),$xb2,$xb2
|
---|
2188 | vpxor 0x40($inp),$xc2,$xc2
|
---|
2189 | vpxor 0x60($inp),$xd2,$xd2
|
---|
2190 | lea 0x80($inp),$inp # size optimization
|
---|
2191 | vmovdqu $xa2,0x00($out)
|
---|
2192 | vmovdqu $xb2,0x20($out)
|
---|
2193 | vmovdqu $xc2,0x40($out)
|
---|
2194 | vmovdqu $xd2,0x60($out)
|
---|
2195 | lea 0x80($out),$out # size optimization
|
---|
2196 |
|
---|
2197 | vpxor 0x00($inp),$xa3,$xa3
|
---|
2198 | vpxor 0x20($inp),$xb3,$xb3
|
---|
2199 | vpxor 0x40($inp),$xc3,$xc3
|
---|
2200 | vpxor 0x60($inp),$xd3,$xd3
|
---|
2201 | lea 0x80($inp),$inp # size optimization
|
---|
2202 | vmovdqu $xa3,0x00($out)
|
---|
2203 | vmovdqu $xb3,0x20($out)
|
---|
2204 | vmovdqu $xc3,0x40($out)
|
---|
2205 | vmovdqu $xd3,0x60($out)
|
---|
2206 | lea 0x80($out),$out # size optimization
|
---|
2207 |
|
---|
2208 | sub \$64*8,$len
|
---|
2209 | jnz .Loop_outer8x
|
---|
2210 |
|
---|
2211 | jmp .Ldone8x
|
---|
2212 |
|
---|
2213 | .Ltail8x:
|
---|
2214 | cmp \$448,$len
|
---|
2215 | jae .L448_or_more8x
|
---|
2216 | cmp \$384,$len
|
---|
2217 | jae .L384_or_more8x
|
---|
2218 | cmp \$320,$len
|
---|
2219 | jae .L320_or_more8x
|
---|
2220 | cmp \$256,$len
|
---|
2221 | jae .L256_or_more8x
|
---|
2222 | cmp \$192,$len
|
---|
2223 | jae .L192_or_more8x
|
---|
2224 | cmp \$128,$len
|
---|
2225 | jae .L128_or_more8x
|
---|
2226 | cmp \$64,$len
|
---|
2227 | jae .L64_or_more8x
|
---|
2228 |
|
---|
2229 | xor %r10,%r10
|
---|
2230 | vmovdqa $xa0,0x00(%rsp)
|
---|
2231 | vmovdqa $xb0,0x20(%rsp)
|
---|
2232 | jmp .Loop_tail8x
|
---|
2233 |
|
---|
2234 | .align 32
|
---|
2235 | .L64_or_more8x:
|
---|
2236 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
2237 | vpxor 0x20($inp),$xb0,$xb0
|
---|
2238 | vmovdqu $xa0,0x00($out)
|
---|
2239 | vmovdqu $xb0,0x20($out)
|
---|
2240 | je .Ldone8x
|
---|
2241 |
|
---|
2242 | lea 0x40($inp),$inp # inp+=64*1
|
---|
2243 | xor %r10,%r10
|
---|
2244 | vmovdqa $xc0,0x00(%rsp)
|
---|
2245 | lea 0x40($out),$out # out+=64*1
|
---|
2246 | sub \$64,$len # len-=64*1
|
---|
2247 | vmovdqa $xd0,0x20(%rsp)
|
---|
2248 | jmp .Loop_tail8x
|
---|
2249 |
|
---|
2250 | .align 32
|
---|
2251 | .L128_or_more8x:
|
---|
2252 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
2253 | vpxor 0x20($inp),$xb0,$xb0
|
---|
2254 | vpxor 0x40($inp),$xc0,$xc0
|
---|
2255 | vpxor 0x60($inp),$xd0,$xd0
|
---|
2256 | vmovdqu $xa0,0x00($out)
|
---|
2257 | vmovdqu $xb0,0x20($out)
|
---|
2258 | vmovdqu $xc0,0x40($out)
|
---|
2259 | vmovdqu $xd0,0x60($out)
|
---|
2260 | je .Ldone8x
|
---|
2261 |
|
---|
2262 | lea 0x80($inp),$inp # inp+=64*2
|
---|
2263 | xor %r10,%r10
|
---|
2264 | vmovdqa $xa1,0x00(%rsp)
|
---|
2265 | lea 0x80($out),$out # out+=64*2
|
---|
2266 | sub \$128,$len # len-=64*2
|
---|
2267 | vmovdqa $xb1,0x20(%rsp)
|
---|
2268 | jmp .Loop_tail8x
|
---|
2269 |
|
---|
2270 | .align 32
|
---|
2271 | .L192_or_more8x:
|
---|
2272 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
2273 | vpxor 0x20($inp),$xb0,$xb0
|
---|
2274 | vpxor 0x40($inp),$xc0,$xc0
|
---|
2275 | vpxor 0x60($inp),$xd0,$xd0
|
---|
2276 | vpxor 0x80($inp),$xa1,$xa1
|
---|
2277 | vpxor 0xa0($inp),$xb1,$xb1
|
---|
2278 | vmovdqu $xa0,0x00($out)
|
---|
2279 | vmovdqu $xb0,0x20($out)
|
---|
2280 | vmovdqu $xc0,0x40($out)
|
---|
2281 | vmovdqu $xd0,0x60($out)
|
---|
2282 | vmovdqu $xa1,0x80($out)
|
---|
2283 | vmovdqu $xb1,0xa0($out)
|
---|
2284 | je .Ldone8x
|
---|
2285 |
|
---|
2286 | lea 0xc0($inp),$inp # inp+=64*3
|
---|
2287 | xor %r10,%r10
|
---|
2288 | vmovdqa $xc1,0x00(%rsp)
|
---|
2289 | lea 0xc0($out),$out # out+=64*3
|
---|
2290 | sub \$192,$len # len-=64*3
|
---|
2291 | vmovdqa $xd1,0x20(%rsp)
|
---|
2292 | jmp .Loop_tail8x
|
---|
2293 |
|
---|
2294 | .align 32
|
---|
2295 | .L256_or_more8x:
|
---|
2296 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
2297 | vpxor 0x20($inp),$xb0,$xb0
|
---|
2298 | vpxor 0x40($inp),$xc0,$xc0
|
---|
2299 | vpxor 0x60($inp),$xd0,$xd0
|
---|
2300 | vpxor 0x80($inp),$xa1,$xa1
|
---|
2301 | vpxor 0xa0($inp),$xb1,$xb1
|
---|
2302 | vpxor 0xc0($inp),$xc1,$xc1
|
---|
2303 | vpxor 0xe0($inp),$xd1,$xd1
|
---|
2304 | vmovdqu $xa0,0x00($out)
|
---|
2305 | vmovdqu $xb0,0x20($out)
|
---|
2306 | vmovdqu $xc0,0x40($out)
|
---|
2307 | vmovdqu $xd0,0x60($out)
|
---|
2308 | vmovdqu $xa1,0x80($out)
|
---|
2309 | vmovdqu $xb1,0xa0($out)
|
---|
2310 | vmovdqu $xc1,0xc0($out)
|
---|
2311 | vmovdqu $xd1,0xe0($out)
|
---|
2312 | je .Ldone8x
|
---|
2313 |
|
---|
2314 | lea 0x100($inp),$inp # inp+=64*4
|
---|
2315 | xor %r10,%r10
|
---|
2316 | vmovdqa $xa2,0x00(%rsp)
|
---|
2317 | lea 0x100($out),$out # out+=64*4
|
---|
2318 | sub \$256,$len # len-=64*4
|
---|
2319 | vmovdqa $xb2,0x20(%rsp)
|
---|
2320 | jmp .Loop_tail8x
|
---|
2321 |
|
---|
2322 | .align 32
|
---|
2323 | .L320_or_more8x:
|
---|
2324 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
2325 | vpxor 0x20($inp),$xb0,$xb0
|
---|
2326 | vpxor 0x40($inp),$xc0,$xc0
|
---|
2327 | vpxor 0x60($inp),$xd0,$xd0
|
---|
2328 | vpxor 0x80($inp),$xa1,$xa1
|
---|
2329 | vpxor 0xa0($inp),$xb1,$xb1
|
---|
2330 | vpxor 0xc0($inp),$xc1,$xc1
|
---|
2331 | vpxor 0xe0($inp),$xd1,$xd1
|
---|
2332 | vpxor 0x100($inp),$xa2,$xa2
|
---|
2333 | vpxor 0x120($inp),$xb2,$xb2
|
---|
2334 | vmovdqu $xa0,0x00($out)
|
---|
2335 | vmovdqu $xb0,0x20($out)
|
---|
2336 | vmovdqu $xc0,0x40($out)
|
---|
2337 | vmovdqu $xd0,0x60($out)
|
---|
2338 | vmovdqu $xa1,0x80($out)
|
---|
2339 | vmovdqu $xb1,0xa0($out)
|
---|
2340 | vmovdqu $xc1,0xc0($out)
|
---|
2341 | vmovdqu $xd1,0xe0($out)
|
---|
2342 | vmovdqu $xa2,0x100($out)
|
---|
2343 | vmovdqu $xb2,0x120($out)
|
---|
2344 | je .Ldone8x
|
---|
2345 |
|
---|
2346 | lea 0x140($inp),$inp # inp+=64*5
|
---|
2347 | xor %r10,%r10
|
---|
2348 | vmovdqa $xc2,0x00(%rsp)
|
---|
2349 | lea 0x140($out),$out # out+=64*5
|
---|
2350 | sub \$320,$len # len-=64*5
|
---|
2351 | vmovdqa $xd2,0x20(%rsp)
|
---|
2352 | jmp .Loop_tail8x
|
---|
2353 |
|
---|
2354 | .align 32
|
---|
2355 | .L384_or_more8x:
|
---|
2356 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
2357 | vpxor 0x20($inp),$xb0,$xb0
|
---|
2358 | vpxor 0x40($inp),$xc0,$xc0
|
---|
2359 | vpxor 0x60($inp),$xd0,$xd0
|
---|
2360 | vpxor 0x80($inp),$xa1,$xa1
|
---|
2361 | vpxor 0xa0($inp),$xb1,$xb1
|
---|
2362 | vpxor 0xc0($inp),$xc1,$xc1
|
---|
2363 | vpxor 0xe0($inp),$xd1,$xd1
|
---|
2364 | vpxor 0x100($inp),$xa2,$xa2
|
---|
2365 | vpxor 0x120($inp),$xb2,$xb2
|
---|
2366 | vpxor 0x140($inp),$xc2,$xc2
|
---|
2367 | vpxor 0x160($inp),$xd2,$xd2
|
---|
2368 | vmovdqu $xa0,0x00($out)
|
---|
2369 | vmovdqu $xb0,0x20($out)
|
---|
2370 | vmovdqu $xc0,0x40($out)
|
---|
2371 | vmovdqu $xd0,0x60($out)
|
---|
2372 | vmovdqu $xa1,0x80($out)
|
---|
2373 | vmovdqu $xb1,0xa0($out)
|
---|
2374 | vmovdqu $xc1,0xc0($out)
|
---|
2375 | vmovdqu $xd1,0xe0($out)
|
---|
2376 | vmovdqu $xa2,0x100($out)
|
---|
2377 | vmovdqu $xb2,0x120($out)
|
---|
2378 | vmovdqu $xc2,0x140($out)
|
---|
2379 | vmovdqu $xd2,0x160($out)
|
---|
2380 | je .Ldone8x
|
---|
2381 |
|
---|
2382 | lea 0x180($inp),$inp # inp+=64*6
|
---|
2383 | xor %r10,%r10
|
---|
2384 | vmovdqa $xa3,0x00(%rsp)
|
---|
2385 | lea 0x180($out),$out # out+=64*6
|
---|
2386 | sub \$384,$len # len-=64*6
|
---|
2387 | vmovdqa $xb3,0x20(%rsp)
|
---|
2388 | jmp .Loop_tail8x
|
---|
2389 |
|
---|
2390 | .align 32
|
---|
2391 | .L448_or_more8x:
|
---|
2392 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
2393 | vpxor 0x20($inp),$xb0,$xb0
|
---|
2394 | vpxor 0x40($inp),$xc0,$xc0
|
---|
2395 | vpxor 0x60($inp),$xd0,$xd0
|
---|
2396 | vpxor 0x80($inp),$xa1,$xa1
|
---|
2397 | vpxor 0xa0($inp),$xb1,$xb1
|
---|
2398 | vpxor 0xc0($inp),$xc1,$xc1
|
---|
2399 | vpxor 0xe0($inp),$xd1,$xd1
|
---|
2400 | vpxor 0x100($inp),$xa2,$xa2
|
---|
2401 | vpxor 0x120($inp),$xb2,$xb2
|
---|
2402 | vpxor 0x140($inp),$xc2,$xc2
|
---|
2403 | vpxor 0x160($inp),$xd2,$xd2
|
---|
2404 | vpxor 0x180($inp),$xa3,$xa3
|
---|
2405 | vpxor 0x1a0($inp),$xb3,$xb3
|
---|
2406 | vmovdqu $xa0,0x00($out)
|
---|
2407 | vmovdqu $xb0,0x20($out)
|
---|
2408 | vmovdqu $xc0,0x40($out)
|
---|
2409 | vmovdqu $xd0,0x60($out)
|
---|
2410 | vmovdqu $xa1,0x80($out)
|
---|
2411 | vmovdqu $xb1,0xa0($out)
|
---|
2412 | vmovdqu $xc1,0xc0($out)
|
---|
2413 | vmovdqu $xd1,0xe0($out)
|
---|
2414 | vmovdqu $xa2,0x100($out)
|
---|
2415 | vmovdqu $xb2,0x120($out)
|
---|
2416 | vmovdqu $xc2,0x140($out)
|
---|
2417 | vmovdqu $xd2,0x160($out)
|
---|
2418 | vmovdqu $xa3,0x180($out)
|
---|
2419 | vmovdqu $xb3,0x1a0($out)
|
---|
2420 | je .Ldone8x
|
---|
2421 |
|
---|
2422 | lea 0x1c0($inp),$inp # inp+=64*7
|
---|
2423 | xor %r10,%r10
|
---|
2424 | vmovdqa $xc3,0x00(%rsp)
|
---|
2425 | lea 0x1c0($out),$out # out+=64*7
|
---|
2426 | sub \$448,$len # len-=64*7
|
---|
2427 | vmovdqa $xd3,0x20(%rsp)
|
---|
2428 |
|
---|
2429 | .Loop_tail8x:
|
---|
2430 | movzb ($inp,%r10),%eax
|
---|
2431 | movzb (%rsp,%r10),%ecx
|
---|
2432 | lea 1(%r10),%r10
|
---|
2433 | xor %ecx,%eax
|
---|
2434 | mov %al,-1($out,%r10)
|
---|
2435 | dec $len
|
---|
2436 | jnz .Loop_tail8x
|
---|
2437 |
|
---|
2438 | .Ldone8x:
|
---|
2439 | vzeroall
|
---|
2440 | ___
|
---|
2441 | $code.=<<___ if ($win64);
|
---|
2442 | movaps -0xa8(%r9),%xmm6
|
---|
2443 | movaps -0x98(%r9),%xmm7
|
---|
2444 | movaps -0x88(%r9),%xmm8
|
---|
2445 | movaps -0x78(%r9),%xmm9
|
---|
2446 | movaps -0x68(%r9),%xmm10
|
---|
2447 | movaps -0x58(%r9),%xmm11
|
---|
2448 | movaps -0x48(%r9),%xmm12
|
---|
2449 | movaps -0x38(%r9),%xmm13
|
---|
2450 | movaps -0x28(%r9),%xmm14
|
---|
2451 | movaps -0x18(%r9),%xmm15
|
---|
2452 | ___
|
---|
2453 | $code.=<<___;
|
---|
2454 | lea (%r9),%rsp
|
---|
2455 | .cfi_def_cfa_register %rsp
|
---|
2456 | .L8x_epilogue:
|
---|
2457 | ret
|
---|
2458 | .cfi_endproc
|
---|
2459 | .size ChaCha20_8x,.-ChaCha20_8x
|
---|
2460 | ___
|
---|
2461 | }
|
---|
2462 |
|
---|
2463 | ########################################################################
|
---|
2464 | # AVX512 code paths
|
---|
2465 | if ($avx>2) {
|
---|
2466 | # This one handles shorter inputs...
|
---|
2467 |
|
---|
2468 | my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
|
---|
2469 | my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
|
---|
2470 |
|
---|
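# vpxor has only a VEX encoding and so cannot address %zmm registers or
# registers %xmm16+/%ymm16+; those operands require the EVEX-encoded vpxord.
# The helper below picks whichever form is legal, preferring the shorter
# vpxor encoding when it can (hence "size optimization").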
2471 | sub vpxord() # size optimization
|
---|
2472 | { my $opcode = "vpxor"; # adhere to vpxor when possible
|
---|
2473 |
|
---|
2474 | foreach (@_) {
|
---|
2475 | if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
|
---|
2476 | $opcode = "vpxord";
|
---|
2477 | last;
|
---|
2478 | }
|
---|
2479 | }
|
---|
2480 |
|
---|
2481 | $code .= "\t$opcode\t".join(',',reverse @_)."\n";
|
---|
2482 | }
|
---|
2483 |
|
---|
2484 | sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
|
---|
2485 | &vpaddd ($a,$a,$b);
|
---|
2486 | &vpxord ($d,$d,$a);
|
---|
2487 | &vprold ($d,$d,16);
|
---|
2488 |
|
---|
2489 | &vpaddd ($c,$c,$d);
|
---|
2490 | &vpxord ($b,$b,$c);
|
---|
2491 | &vprold ($b,$b,12);
|
---|
2492 |
|
---|
2493 | &vpaddd ($a,$a,$b);
|
---|
2494 | &vpxord ($d,$d,$a);
|
---|
2495 | &vprold ($d,$d,8);
|
---|
2496 |
|
---|
2497 | &vpaddd ($c,$c,$d);
|
---|
2498 | &vpxord ($b,$b,$c);
|
---|
2499 | &vprold ($b,$b,7);
|
---|
2500 | }
|
---|
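# Note that vprold is a native AVX-512 rotate, so each xor-and-rotate step is
# two instructions here, whereas the SSE/AVX2 paths need a shift/shift/or
# sequence (or a vpshufb against .Lrot16/.Lrot24 for the 16- and 8-bit
# rotates).  The caller below alternates AVX512ROUND with vpshufd lane
# rotations of $b, $c and $d to switch between column and diagonal rounds
# within a single register set.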
2501 |
|
---|
2502 | my $xframe = $win64 ? 32+8 : 8;
|
---|
2503 |
|
---|
2504 | $code.=<<___;
|
---|
2505 | .type ChaCha20_avx512,\@function,5
|
---|
2506 | .align 32
|
---|
2507 | ChaCha20_avx512:
|
---|
2508 | .cfi_startproc
|
---|
2509 | .LChaCha20_avx512:
|
---|
2510 | mov %rsp,%r9 # frame pointer
|
---|
2511 | .cfi_def_cfa_register %r9
|
---|
2512 | cmp \$512,$len
|
---|
2513 | ja .LChaCha20_16x
|
---|
2514 |
|
---|
2515 | sub \$64+$xframe,%rsp
|
---|
2516 | ___
|
---|
2517 | $code.=<<___ if ($win64);
|
---|
2518 | movaps %xmm6,-0x28(%r9)
|
---|
2519 | movaps %xmm7,-0x18(%r9)
|
---|
2520 | .Lavx512_body:
|
---|
2521 | ___
|
---|
2522 | $code.=<<___;
|
---|
2523 | vbroadcasti32x4 .Lsigma(%rip),$a
|
---|
2524 | vbroadcasti32x4 ($key),$b
|
---|
2525 | vbroadcasti32x4 16($key),$c
|
---|
2526 | vbroadcasti32x4 ($counter),$d
|
---|
2527 |
|
---|
2528 | vmovdqa32 $a,$a_
|
---|
2529 | vmovdqa32 $b,$b_
|
---|
2530 | vmovdqa32 $c,$c_
|
---|
2531 | vpaddd .Lzeroz(%rip),$d,$d
|
---|
2532 | vmovdqa32 .Lfourz(%rip),$fourz
|
---|
2533 | mov \$10,$counter # reuse $counter
|
---|
2534 | vmovdqa32 $d,$d_
|
---|
2535 | jmp .Loop_avx512
|
---|
2536 |
|
---|
2537 | .align 16
|
---|
2538 | .Loop_outer_avx512:
|
---|
2539 | vmovdqa32 $a_,$a
|
---|
2540 | vmovdqa32 $b_,$b
|
---|
2541 | vmovdqa32 $c_,$c
|
---|
2542 | vpaddd $fourz,$d_,$d
|
---|
2543 | mov \$10,$counter
|
---|
2544 | vmovdqa32 $d,$d_
|
---|
2545 | jmp .Loop_avx512
|
---|
2546 |
|
---|
2547 | .align 32
|
---|
2548 | .Loop_avx512:
|
---|
2549 | ___
|
---|
2550 | &AVX512ROUND();
|
---|
2551 | &vpshufd ($c,$c,0b01001110);
|
---|
2552 | &vpshufd ($b,$b,0b00111001);
|
---|
2553 | &vpshufd ($d,$d,0b10010011);
|
---|
2554 |
|
---|
2555 | &AVX512ROUND();
|
---|
2556 | &vpshufd ($c,$c,0b01001110);
|
---|
2557 | &vpshufd ($b,$b,0b10010011);
|
---|
2558 | &vpshufd ($d,$d,0b00111001);
|
---|
2559 |
|
---|
2560 | &dec ($counter);
|
---|
2561 | &jnz (".Loop_avx512");
|
---|
2562 |
|
---|
2563 | $code.=<<___;
|
---|
2564 | vpaddd $a_,$a,$a
|
---|
2565 | vpaddd $b_,$b,$b
|
---|
2566 | vpaddd $c_,$c,$c
|
---|
2567 | vpaddd $d_,$d,$d
|
---|
2568 |
|
---|
2569 | sub \$64,$len
|
---|
2570 | jb .Ltail64_avx512
|
---|
2571 |
|
---|
2572 | vpxor 0x00($inp),%x#$a,$t0 # xor with input
|
---|
2573 | vpxor 0x10($inp),%x#$b,$t1
|
---|
2574 | vpxor 0x20($inp),%x#$c,$t2
|
---|
2575 | vpxor 0x30($inp),%x#$d,$t3
|
---|
2576 | lea 0x40($inp),$inp # inp+=64
|
---|
2577 |
|
---|
2578 | vmovdqu $t0,0x00($out) # write output
|
---|
2579 | vmovdqu $t1,0x10($out)
|
---|
2580 | vmovdqu $t2,0x20($out)
|
---|
2581 | vmovdqu $t3,0x30($out)
|
---|
2582 | lea 0x40($out),$out # out+=64
|
---|
2583 |
|
---|
2584 | jz .Ldone_avx512
|
---|
2585 |
|
---|
2586 | vextracti32x4 \$1,$a,$t0
|
---|
2587 | vextracti32x4 \$1,$b,$t1
|
---|
2588 | vextracti32x4 \$1,$c,$t2
|
---|
2589 | vextracti32x4 \$1,$d,$t3
|
---|
2590 |
|
---|
2591 | sub \$64,$len
|
---|
2592 | jb .Ltail_avx512
|
---|
2593 |
|
---|
2594 | vpxor 0x00($inp),$t0,$t0 # xor with input
|
---|
2595 | vpxor 0x10($inp),$t1,$t1
|
---|
2596 | vpxor 0x20($inp),$t2,$t2
|
---|
2597 | vpxor 0x30($inp),$t3,$t3
|
---|
2598 | lea 0x40($inp),$inp # inp+=64
|
---|
2599 |
|
---|
2600 | vmovdqu $t0,0x00($out) # write output
|
---|
2601 | vmovdqu $t1,0x10($out)
|
---|
2602 | vmovdqu $t2,0x20($out)
|
---|
2603 | vmovdqu $t3,0x30($out)
|
---|
2604 | lea 0x40($out),$out # out+=64
|
---|
2605 |
|
---|
2606 | jz .Ldone_avx512
|
---|
2607 |
|
---|
2608 | vextracti32x4 \$2,$a,$t0
|
---|
2609 | vextracti32x4 \$2,$b,$t1
|
---|
2610 | vextracti32x4 \$2,$c,$t2
|
---|
2611 | vextracti32x4 \$2,$d,$t3
|
---|
2612 |
|
---|
2613 | sub \$64,$len
|
---|
2614 | jb .Ltail_avx512
|
---|
2615 |
|
---|
2616 | vpxor 0x00($inp),$t0,$t0 # xor with input
|
---|
2617 | vpxor 0x10($inp),$t1,$t1
|
---|
2618 | vpxor 0x20($inp),$t2,$t2
|
---|
2619 | vpxor 0x30($inp),$t3,$t3
|
---|
2620 | lea 0x40($inp),$inp # inp+=64
|
---|
2621 |
|
---|
2622 | vmovdqu $t0,0x00($out) # write output
|
---|
2623 | vmovdqu $t1,0x10($out)
|
---|
2624 | vmovdqu $t2,0x20($out)
|
---|
2625 | vmovdqu $t3,0x30($out)
|
---|
2626 | lea 0x40($out),$out # out+=64
|
---|
2627 |
|
---|
2628 | jz .Ldone_avx512
|
---|
2629 |
|
---|
2630 | vextracti32x4 \$3,$a,$t0
|
---|
2631 | vextracti32x4 \$3,$b,$t1
|
---|
2632 | vextracti32x4 \$3,$c,$t2
|
---|
2633 | vextracti32x4 \$3,$d,$t3
|
---|
2634 |
|
---|
2635 | sub \$64,$len
|
---|
2636 | jb .Ltail_avx512
|
---|
2637 |
|
---|
2638 | vpxor 0x00($inp),$t0,$t0 # xor with input
|
---|
2639 | vpxor 0x10($inp),$t1,$t1
|
---|
2640 | vpxor 0x20($inp),$t2,$t2
|
---|
2641 | vpxor 0x30($inp),$t3,$t3
|
---|
2642 | lea 0x40($inp),$inp # inp+=64
|
---|
2643 |
|
---|
2644 | vmovdqu $t0,0x00($out) # write output
|
---|
2645 | vmovdqu $t1,0x10($out)
|
---|
2646 | vmovdqu $t2,0x20($out)
|
---|
2647 | vmovdqu $t3,0x30($out)
|
---|
2648 | lea 0x40($out),$out # out+=64
|
---|
2649 |
|
---|
2650 | jnz .Loop_outer_avx512
|
---|
2651 |
|
---|
2652 | jmp .Ldone_avx512
|
---|
2653 |
|
---|
2654 | .align 16
|
---|
2655 | .Ltail64_avx512:
|
---|
2656 | vmovdqa %x#$a,0x00(%rsp)
|
---|
2657 | vmovdqa %x#$b,0x10(%rsp)
|
---|
2658 | vmovdqa %x#$c,0x20(%rsp)
|
---|
2659 | vmovdqa %x#$d,0x30(%rsp)
|
---|
2660 | add \$64,$len
|
---|
2661 | jmp .Loop_tail_avx512
|
---|
2662 |
|
---|
2663 | .align 16
|
---|
2664 | .Ltail_avx512:
|
---|
2665 | vmovdqa $t0,0x00(%rsp)
|
---|
2666 | vmovdqa $t1,0x10(%rsp)
|
---|
2667 | vmovdqa $t2,0x20(%rsp)
|
---|
2668 | vmovdqa $t3,0x30(%rsp)
|
---|
2669 | add \$64,$len
|
---|
2670 |
|
---|
2671 | .Loop_tail_avx512:
|
---|
2672 | movzb ($inp,$counter),%eax
|
---|
2673 | movzb (%rsp,$counter),%ecx
|
---|
2674 | lea 1($counter),$counter
|
---|
2675 | xor %ecx,%eax
|
---|
2676 | mov %al,-1($out,$counter)
|
---|
2677 | dec $len
|
---|
2678 | jnz .Loop_tail_avx512
|
---|
2679 |
|
---|
2680 | vmovdqu32 $a_,0x00(%rsp)
|
---|
2681 |
|
---|
2682 | .Ldone_avx512:
|
---|
2683 | vzeroall
|
---|
2684 | ___
|
---|
2685 | $code.=<<___ if ($win64);
|
---|
2686 | movaps -0x28(%r9),%xmm6
|
---|
2687 | movaps -0x18(%r9),%xmm7
|
---|
2688 | ___
|
---|
2689 | $code.=<<___;
|
---|
2690 | lea (%r9),%rsp
|
---|
2691 | .cfi_def_cfa_register %rsp
|
---|
2692 | .Lavx512_epilogue:
|
---|
2693 | ret
|
---|
2694 | .cfi_endproc
|
---|
2695 | .size ChaCha20_avx512,.-ChaCha20_avx512
|
---|
2696 | ___
|
---|
2697 |
|
---|
2698 | map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
|
---|
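# The substitution above retargets the same register variables from %zmm to
# %ymm, so the AVX512VL routine that follows reuses the code shape of
# ChaCha20_avx512 with 256-bit vectors: two 64-byte blocks per iteration,
# which is why the counter increment comes from .Ltwoy rather than .Lfourz.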
2699 |
|
---|
2700 | $code.=<<___;
|
---|
2701 | .type ChaCha20_avx512vl,\@function,5
|
---|
2702 | .align 32
|
---|
2703 | ChaCha20_avx512vl:
|
---|
2704 | .cfi_startproc
|
---|
2705 | .LChaCha20_avx512vl:
|
---|
2706 | mov %rsp,%r9 # frame pointer
|
---|
2707 | .cfi_def_cfa_register %r9
|
---|
2708 | cmp \$128,$len
|
---|
2709 | ja .LChaCha20_8xvl
|
---|
2710 |
|
---|
2711 | sub \$64+$xframe,%rsp
|
---|
2712 | ___
|
---|
2713 | $code.=<<___ if ($win64);
|
---|
2714 | movaps %xmm6,-0x28(%r9)
|
---|
2715 | movaps %xmm7,-0x18(%r9)
|
---|
2716 | .Lavx512vl_body:
|
---|
2717 | ___
|
---|
2718 | $code.=<<___;
|
---|
2719 | vbroadcasti128 .Lsigma(%rip),$a
|
---|
2720 | vbroadcasti128 ($key),$b
|
---|
2721 | vbroadcasti128 16($key),$c
|
---|
2722 | vbroadcasti128 ($counter),$d
|
---|
2723 |
|
---|
2724 | vmovdqa32 $a,$a_
|
---|
2725 | vmovdqa32 $b,$b_
|
---|
2726 | vmovdqa32 $c,$c_
|
---|
2727 | vpaddd .Lzeroz(%rip),$d,$d
|
---|
2728 | vmovdqa32 .Ltwoy(%rip),$fourz
|
---|
2729 | mov \$10,$counter # reuse $counter
|
---|
2730 | vmovdqa32 $d,$d_
|
---|
2731 | jmp .Loop_avx512vl
|
---|
2732 |
|
---|
2733 | .align 16
|
---|
2734 | .Loop_outer_avx512vl:
|
---|
2735 | vmovdqa32 $c_,$c
|
---|
2736 | vpaddd $fourz,$d_,$d
|
---|
2737 | mov \$10,$counter
|
---|
2738 | vmovdqa32 $d,$d_
|
---|
2739 | jmp .Loop_avx512vl
|
---|
2740 |
|
---|
2741 | .align 32
|
---|
2742 | .Loop_avx512vl:
|
---|
2743 | ___
|
---|
2744 | &AVX512ROUND();
|
---|
2745 | &vpshufd ($c,$c,0b01001110);
|
---|
2746 | &vpshufd ($b,$b,0b00111001);
|
---|
2747 | &vpshufd ($d,$d,0b10010011);
|
---|
2748 |
|
---|
2749 | &AVX512ROUND();
|
---|
2750 | &vpshufd ($c,$c,0b01001110);
|
---|
2751 | &vpshufd ($b,$b,0b10010011);
|
---|
2752 | &vpshufd ($d,$d,0b00111001);
|
---|
2753 |
|
---|
2754 | &dec ($counter);
|
---|
2755 | &jnz (".Loop_avx512vl");
|
---|
2756 |
|
---|
2757 | $code.=<<___;
|
---|
2758 | vpaddd $a_,$a,$a
|
---|
2759 | vpaddd $b_,$b,$b
|
---|
2760 | vpaddd $c_,$c,$c
|
---|
2761 | vpaddd $d_,$d,$d
|
---|
2762 |
|
---|
2763 | sub \$64,$len
|
---|
2764 | jb .Ltail64_avx512vl
|
---|
2765 |
|
---|
2766 | vpxor 0x00($inp),%x#$a,$t0 # xor with input
|
---|
2767 | vpxor 0x10($inp),%x#$b,$t1
|
---|
2768 | vpxor 0x20($inp),%x#$c,$t2
|
---|
2769 | vpxor 0x30($inp),%x#$d,$t3
|
---|
2770 | lea 0x40($inp),$inp # inp+=64
|
---|
2771 |
|
---|
2772 | vmovdqu $t0,0x00($out) # write output
|
---|
2773 | vmovdqu $t1,0x10($out)
|
---|
2774 | vmovdqu $t2,0x20($out)
|
---|
2775 | vmovdqu $t3,0x30($out)
|
---|
2776 | lea 0x40($out),$out # out+=64
|
---|
2777 |
|
---|
2778 | jz .Ldone_avx512vl
|
---|
2779 |
|
---|
2780 | vextracti128 \$1,$a,$t0
|
---|
2781 | vextracti128 \$1,$b,$t1
|
---|
2782 | vextracti128 \$1,$c,$t2
|
---|
2783 | vextracti128 \$1,$d,$t3
|
---|
2784 |
|
---|
2785 | sub \$64,$len
|
---|
2786 | jb .Ltail_avx512vl
|
---|
2787 |
|
---|
2788 | vpxor 0x00($inp),$t0,$t0 # xor with input
|
---|
2789 | vpxor 0x10($inp),$t1,$t1
|
---|
2790 | vpxor 0x20($inp),$t2,$t2
|
---|
2791 | vpxor 0x30($inp),$t3,$t3
|
---|
2792 | lea 0x40($inp),$inp # inp+=64
|
---|
2793 |
|
---|
2794 | vmovdqu $t0,0x00($out) # write output
|
---|
2795 | vmovdqu $t1,0x10($out)
|
---|
2796 | vmovdqu $t2,0x20($out)
|
---|
2797 | vmovdqu $t3,0x30($out)
|
---|
2798 | lea 0x40($out),$out # out+=64
|
---|
2799 |
|
---|
2800 | vmovdqa32 $a_,$a
|
---|
2801 | vmovdqa32 $b_,$b
|
---|
2802 | jnz .Loop_outer_avx512vl
|
---|
2803 |
|
---|
2804 | jmp .Ldone_avx512vl
|
---|
2805 |
|
---|
2806 | .align 16
|
---|
2807 | .Ltail64_avx512vl:
|
---|
2808 | vmovdqa %x#$a,0x00(%rsp)
|
---|
2809 | vmovdqa %x#$b,0x10(%rsp)
|
---|
2810 | vmovdqa %x#$c,0x20(%rsp)
|
---|
2811 | vmovdqa %x#$d,0x30(%rsp)
|
---|
2812 | add \$64,$len
|
---|
2813 | jmp .Loop_tail_avx512vl
|
---|
2814 |
|
---|
2815 | .align 16
|
---|
2816 | .Ltail_avx512vl:
|
---|
2817 | vmovdqa $t0,0x00(%rsp)
|
---|
2818 | vmovdqa $t1,0x10(%rsp)
|
---|
2819 | vmovdqa $t2,0x20(%rsp)
|
---|
2820 | vmovdqa $t3,0x30(%rsp)
|
---|
2821 | add \$64,$len
|
---|
2822 |
|
---|
2823 | .Loop_tail_avx512vl:
|
---|
2824 | movzb ($inp,$counter),%eax
|
---|
2825 | movzb (%rsp,$counter),%ecx
|
---|
2826 | lea 1($counter),$counter
|
---|
2827 | xor %ecx,%eax
|
---|
2828 | mov %al,-1($out,$counter)
|
---|
2829 | dec $len
|
---|
2830 | jnz .Loop_tail_avx512vl
|
---|
2831 |
|
---|
2832 | vmovdqu32 $a_,0x00(%rsp)
|
---|
2833 | vmovdqu32 $a_,0x20(%rsp)
|
---|
2834 |
|
---|
2835 | .Ldone_avx512vl:
|
---|
2836 | vzeroall
|
---|
2837 | ___
|
---|
2838 | $code.=<<___ if ($win64);
|
---|
2839 | movaps -0x28(%r9),%xmm6
|
---|
2840 | movaps -0x18(%r9),%xmm7
|
---|
2841 | ___
|
---|
2842 | $code.=<<___;
|
---|
2843 | lea (%r9),%rsp
|
---|
2844 | .cfi_def_cfa_register %rsp
|
---|
2845 | .Lavx512vl_epilogue:
|
---|
2846 | ret
|
---|
2847 | .cfi_endproc
|
---|
2848 | .size ChaCha20_avx512vl,.-ChaCha20_avx512vl
|
---|
2849 | ___
|
---|
2850 | }
|
---|
2851 | if ($avx>2) {
|
---|
2852 | # This one handles longer inputs...
|
---|
2853 |
|
---|
2854 | my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
|
---|
2855 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
|
---|
2856 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
|
---|
2857 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
|
---|
2858 | my @key=map("%zmm$_",(16..31));
|
---|
2859 | my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
|
---|
2860 |
|
---|
2861 | sub AVX512_lane_ROUND {
|
---|
2862 | my ($a0,$b0,$c0,$d0)=@_;
|
---|
2863 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
|
---|
2864 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
|
---|
2865 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
|
---|
2866 | my @x=map("\"$_\"",@xx);
|
---|
2867 |
|
---|
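# Unlike the AVX2 path, nothing is spilled here: all sixteen state rows live
# in %zmm0-15, the per-lane copies of key, counter and nonce are parked in
# %zmm16-31 (@key), and the four quarter-rounds are emitted fully interleaved
# using vpxord/vprold.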
2868 | (
|
---|
2869 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
|
---|
2870 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
|
---|
2871 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
|
---|
2872 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
|
---|
2873 | "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
|
---|
2874 | "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
|
---|
2875 | "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
|
---|
2876 | "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
|
---|
2877 | "&vprold (@x[$d0],@x[$d0],16)",
|
---|
2878 | "&vprold (@x[$d1],@x[$d1],16)",
|
---|
2879 | "&vprold (@x[$d2],@x[$d2],16)",
|
---|
2880 | "&vprold (@x[$d3],@x[$d3],16)",
|
---|
2881 |
|
---|
2882 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
|
---|
2883 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
|
---|
2884 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
|
---|
2885 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
|
---|
2886 | "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
|
---|
2887 | "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
|
---|
2888 | "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
|
---|
2889 | "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
|
---|
2890 | "&vprold (@x[$b0],@x[$b0],12)",
|
---|
2891 | "&vprold (@x[$b1],@x[$b1],12)",
|
---|
2892 | "&vprold (@x[$b2],@x[$b2],12)",
|
---|
2893 | "&vprold (@x[$b3],@x[$b3],12)",
|
---|
2894 |
|
---|
2895 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
|
---|
2896 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
|
---|
2897 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
|
---|
2898 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
|
---|
2899 | "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
|
---|
2900 | "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
|
---|
2901 | "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
|
---|
2902 | "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
|
---|
2903 | "&vprold (@x[$d0],@x[$d0],8)",
|
---|
2904 | "&vprold (@x[$d1],@x[$d1],8)",
|
---|
2905 | "&vprold (@x[$d2],@x[$d2],8)",
|
---|
2906 | "&vprold (@x[$d3],@x[$d3],8)",
|
---|
2907 |
|
---|
2908 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
|
---|
2909 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
|
---|
2910 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
|
---|
2911 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
|
---|
2912 | "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
|
---|
2913 | "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
|
---|
2914 | "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
|
---|
2915 | "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
|
---|
2916 | "&vprold (@x[$b0],@x[$b0],7)",
|
---|
2917 | "&vprold (@x[$b1],@x[$b1],7)",
|
---|
2918 | "&vprold (@x[$b2],@x[$b2],7)",
|
---|
2919 | "&vprold (@x[$b3],@x[$b3],7)"
|
---|
2920 | );
|
---|
2921 | }
|
---|
2922 |
|
---|
2923 | my $xframe = $win64 ? 0xa8 : 8;
|
---|
2924 |
|
---|
2925 | $code.=<<___;
|
---|
2926 | .type ChaCha20_16x,\@function,5
|
---|
2927 | .align 32
|
---|
2928 | ChaCha20_16x:
|
---|
2929 | .cfi_startproc
|
---|
2930 | .LChaCha20_16x:
|
---|
2931 | mov %rsp,%r9 # frame register
|
---|
2932 | .cfi_def_cfa_register %r9
|
---|
2933 | sub \$64+$xframe,%rsp
|
---|
2934 | and \$-64,%rsp
|
---|
2935 | ___
|
---|
2936 | $code.=<<___ if ($win64);
|
---|
2937 | movaps %xmm6,-0xa8(%r9)
|
---|
2938 | movaps %xmm7,-0x98(%r9)
|
---|
2939 | movaps %xmm8,-0x88(%r9)
|
---|
2940 | movaps %xmm9,-0x78(%r9)
|
---|
2941 | movaps %xmm10,-0x68(%r9)
|
---|
2942 | movaps %xmm11,-0x58(%r9)
|
---|
2943 | movaps %xmm12,-0x48(%r9)
|
---|
2944 | movaps %xmm13,-0x38(%r9)
|
---|
2945 | movaps %xmm14,-0x28(%r9)
|
---|
2946 | movaps %xmm15,-0x18(%r9)
|
---|
2947 | .L16x_body:
|
---|
2948 | ___
|
---|
2949 | $code.=<<___;
|
---|
2950 | vzeroupper
|
---|
2951 |
|
---|
2952 | lea .Lsigma(%rip),%r10
|
---|
2953 | vbroadcasti32x4 (%r10),$xa3 # key[0]
|
---|
2954 | vbroadcasti32x4 ($key),$xb3 # key[1]
|
---|
2955 | vbroadcasti32x4 16($key),$xc3 # key[2]
|
---|
2956 | vbroadcasti32x4 ($counter),$xd3 # key[3]
|
---|
2957 |
|
---|
2958 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
|
---|
2959 | vpshufd \$0x55,$xa3,$xa1
|
---|
2960 | vpshufd \$0xaa,$xa3,$xa2
|
---|
2961 | vpshufd \$0xff,$xa3,$xa3
|
---|
2962 | vmovdqa64 $xa0,@key[0]
|
---|
2963 | vmovdqa64 $xa1,@key[1]
|
---|
2964 | vmovdqa64 $xa2,@key[2]
|
---|
2965 | vmovdqa64 $xa3,@key[3]
|
---|
2966 |
|
---|
2967 | vpshufd \$0x00,$xb3,$xb0
|
---|
2968 | vpshufd \$0x55,$xb3,$xb1
|
---|
2969 | vpshufd \$0xaa,$xb3,$xb2
|
---|
2970 | vpshufd \$0xff,$xb3,$xb3
|
---|
2971 | vmovdqa64 $xb0,@key[4]
|
---|
2972 | vmovdqa64 $xb1,@key[5]
|
---|
2973 | vmovdqa64 $xb2,@key[6]
|
---|
2974 | vmovdqa64 $xb3,@key[7]
|
---|
2975 |
|
---|
2976 | vpshufd \$0x00,$xc3,$xc0
|
---|
2977 | vpshufd \$0x55,$xc3,$xc1
|
---|
2978 | vpshufd \$0xaa,$xc3,$xc2
|
---|
2979 | vpshufd \$0xff,$xc3,$xc3
|
---|
2980 | vmovdqa64 $xc0,@key[8]
|
---|
2981 | vmovdqa64 $xc1,@key[9]
|
---|
2982 | vmovdqa64 $xc2,@key[10]
|
---|
2983 | vmovdqa64 $xc3,@key[11]
|
---|
2984 |
|
---|
2985 | vpshufd \$0x00,$xd3,$xd0
|
---|
2986 | vpshufd \$0x55,$xd3,$xd1
|
---|
2987 | vpshufd \$0xaa,$xd3,$xd2
|
---|
2988 | vpshufd \$0xff,$xd3,$xd3
|
---|
2989 | vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
|
---|
2990 | vmovdqa64 $xd0,@key[12]
|
---|
2991 | vmovdqa64 $xd1,@key[13]
|
---|
2992 | vmovdqa64 $xd2,@key[14]
|
---|
2993 | vmovdqa64 $xd3,@key[15]
|
---|
2994 |
|
---|
2995 | mov \$10,%eax
|
---|
2996 | jmp .Loop16x
|
---|
2997 |
|
---|
2998 | .align 32
|
---|
2999 | .Loop_outer16x:
|
---|
3000 | vpbroadcastd 0(%r10),$xa0 # reload key
|
---|
3001 | vpbroadcastd 4(%r10),$xa1
|
---|
3002 | vpbroadcastd 8(%r10),$xa2
|
---|
3003 | vpbroadcastd 12(%r10),$xa3
|
---|
3004 | vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
|
---|
3005 | vmovdqa64 @key[4],$xb0
|
---|
3006 | vmovdqa64 @key[5],$xb1
|
---|
3007 | vmovdqa64 @key[6],$xb2
|
---|
3008 | vmovdqa64 @key[7],$xb3
|
---|
3009 | vmovdqa64 @key[8],$xc0
|
---|
3010 | vmovdqa64 @key[9],$xc1
|
---|
3011 | vmovdqa64 @key[10],$xc2
|
---|
3012 | vmovdqa64 @key[11],$xc3
|
---|
3013 | vmovdqa64 @key[12],$xd0
|
---|
3014 | vmovdqa64 @key[13],$xd1
|
---|
3015 | vmovdqa64 @key[14],$xd2
|
---|
3016 | vmovdqa64 @key[15],$xd3
|
---|
3017 |
|
---|
3018 | vmovdqa64 $xa0,@key[0]
|
---|
3019 | vmovdqa64 $xa1,@key[1]
|
---|
3020 | vmovdqa64 $xa2,@key[2]
|
---|
3021 | vmovdqa64 $xa3,@key[3]
|
---|
3022 |
|
---|
3023 | mov \$10,%eax
|
---|
3024 | jmp .Loop16x
|
---|
3025 |
|
---|
3026 | .align 32
|
---|
3027 | .Loop16x:
|
---|
3028 | ___
|
---|
3029 | foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
|
---|
3030 | foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
|
---|
3031 | $code.=<<___;
|
---|
3032 | dec %eax
|
---|
3033 | jnz .Loop16x
|
---|
3034 |
|
---|
3035 | vpaddd @key[0],$xa0,$xa0 # accumulate key
|
---|
3036 | vpaddd @key[1],$xa1,$xa1
|
---|
3037 | vpaddd @key[2],$xa2,$xa2
|
---|
3038 | vpaddd @key[3],$xa3,$xa3
|
---|
3039 |
|
---|
3040 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
|
---|
3041 | vpunpckldq $xa3,$xa2,$xt3
|
---|
3042 | vpunpckhdq $xa1,$xa0,$xa0
|
---|
3043 | vpunpckhdq $xa3,$xa2,$xa2
|
---|
3044 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
|
---|
3045 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
|
---|
3046 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
|
---|
3047 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
|
---|
3048 | ___
|
---|
3049 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd		@key[4],$xb0,$xb0
	vpaddd		@key[5],$xb1,$xb1
	vpaddd		@key[6],$xb2,$xb2
	vpaddd		@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
$code.=<<___;
	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
	vshufi32x4	\$0xee,$xb3,$xa3,$xb3
___
($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
$code.=<<___;
	vpaddd		@key[8],$xc0,$xc0
	vpaddd		@key[9],$xc1,$xc1
	vpaddd		@key[10],$xc2,$xc2
	vpaddd		@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd		@key[12],$xd0,$xd0
	vpaddd		@key[13],$xd1,$xd1
	vpaddd		@key[14],$xd2,$xd2
	vpaddd		@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
$code.=<<___;
	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
	vshufi32x4	\$0xee,$xd3,$xc3,$xd3
___
($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
$code.=<<___;
	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
	vshufi32x4	\$0x88,$xd0,$xb0,$xc0
	vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
	vshufi32x4	\$0x88,$xd1,$xb1,$xc1
	vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
	vshufi32x4	\$0x88,$xd2,$xb2,$xc2
	vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
	vshufi32x4	\$0x88,$xd3,$xb3,$xc3
	vshufi32x4	\$0xdd,$xd3,$xb3,$xd3
___
($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);

($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
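# At this point every %zmm register holds one complete 64-byte keystream
# block, and the renaming above lists the 16 blocks in input order
# ($xa0,$xb0,$xc0,$xd0, $xa1,...). With at least 64*16 bytes of input left,
# they are all XORed and stored in one pass below; otherwise the tail code
# consumes them one block at a time.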
$code.=<<___;
	cmp		\$64*16,$len
	jb		.Ltail16x

	vpxord		0x00($inp),$xa0,$xa0	# xor with input
	vpxord		0x40($inp),$xb0,$xb0
	vpxord		0x80($inp),$xc0,$xc0
	vpxord		0xc0($inp),$xd0,$xd0
	vmovdqu32	$xa0,0x00($out)
	vmovdqu32	$xb0,0x40($out)
	vmovdqu32	$xc0,0x80($out)
	vmovdqu32	$xd0,0xc0($out)

	vpxord		0x100($inp),$xa1,$xa1
	vpxord		0x140($inp),$xb1,$xb1
	vpxord		0x180($inp),$xc1,$xc1
	vpxord		0x1c0($inp),$xd1,$xd1
	vmovdqu32	$xa1,0x100($out)
	vmovdqu32	$xb1,0x140($out)
	vmovdqu32	$xc1,0x180($out)
	vmovdqu32	$xd1,0x1c0($out)

	vpxord		0x200($inp),$xa2,$xa2
	vpxord		0x240($inp),$xb2,$xb2
	vpxord		0x280($inp),$xc2,$xc2
	vpxord		0x2c0($inp),$xd2,$xd2
	vmovdqu32	$xa2,0x200($out)
	vmovdqu32	$xb2,0x240($out)
	vmovdqu32	$xc2,0x280($out)
	vmovdqu32	$xd2,0x2c0($out)

	vpxord		0x300($inp),$xa3,$xa3
	vpxord		0x340($inp),$xb3,$xb3
	vpxord		0x380($inp),$xc3,$xc3
	vpxord		0x3c0($inp),$xd3,$xd3
	lea		0x400($inp),$inp
	vmovdqu32	$xa3,0x300($out)
	vmovdqu32	$xb3,0x340($out)
	vmovdqu32	$xc3,0x380($out)
	vmovdqu32	$xd3,0x3c0($out)
	lea		0x400($out),$out

	sub		\$64*16,$len
	jnz		.Loop_outer16x

	jmp		.Ldone16x

.align	32
.Ltail16x:
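	# The tail turns $out into the out-minus-in displacement, so that
	# ($out,$inp) addresses the output while only $inp advances. Whole
	# 64-byte blocks are XORed and stored one at a time; the final
	# partial block is handled byte by byte in .Loop_tail16x.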
	xor		%r10,%r10
	sub		$inp,$out
	cmp		\$64*1,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa0,$xa0	# xor with input
	vmovdqu32	$xa0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb0,$xa0
	lea		64($inp),$inp

	cmp		\$64*2,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb0,$xb0
	vmovdqu32	$xb0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc0,$xa0
	lea		64($inp),$inp

	cmp		\$64*3,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc0,$xc0
	vmovdqu32	$xc0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd0,$xa0
	lea		64($inp),$inp

	cmp		\$64*4,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd0,$xd0
	vmovdqu32	$xd0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa1,$xa0
	lea		64($inp),$inp

	cmp		\$64*5,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa1,$xa1
	vmovdqu32	$xa1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb1,$xa0
	lea		64($inp),$inp

	cmp		\$64*6,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb1,$xb1
	vmovdqu32	$xb1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc1,$xa0
	lea		64($inp),$inp

	cmp		\$64*7,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc1,$xc1
	vmovdqu32	$xc1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd1,$xa0
	lea		64($inp),$inp

	cmp		\$64*8,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd1,$xd1
	vmovdqu32	$xd1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa2,$xa0
	lea		64($inp),$inp

	cmp		\$64*9,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa2,$xa2
	vmovdqu32	$xa2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb2,$xa0
	lea		64($inp),$inp

	cmp		\$64*10,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb2,$xb2
	vmovdqu32	$xb2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc2,$xa0
	lea		64($inp),$inp

	cmp		\$64*11,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc2,$xc2
	vmovdqu32	$xc2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd2,$xa0
	lea		64($inp),$inp

	cmp		\$64*12,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd2,$xd2
	vmovdqu32	$xd2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa3,$xa0
	lea		64($inp),$inp

	cmp		\$64*13,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa3,$xa3
	vmovdqu32	$xa3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb3,$xa0
	lea		64($inp),$inp

	cmp		\$64*14,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb3,$xb3
	vmovdqu32	$xb3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc3,$xa0
	lea		64($inp),$inp

	cmp		\$64*15,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc3,$xc3
	vmovdqu32	$xc3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd3,$xa0
	lea		64($inp),$inp

.Less_than_64_16x:
	vmovdqa32	$xa0,0x00(%rsp)
	lea		($out,$inp),$out
	and		\$63,$len

.Loop_tail16x:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail16x

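	# wipe the keystream block that was stashed on the stack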
	vpxord		$xa0,$xa0,$xa0
	vmovdqa32	$xa0,0(%rsp)

.Ldone16x:
	vzeroall
___
$code.=<<___ if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L16x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_16x,.-ChaCha20_16x
___

# switch to %ymm domain
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
@key=map("%ymm$_",(16..31));
($xt0,$xt1,$xt2,$xt3)=@key[0..3];
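# ChaCha20_8xvl is the AVX512VL counterpart of ChaCha20_16x: the same code
# shape operates on %ymm registers, so each pass produces 8 blocks of
# keystream instead of 16.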

$code.=<<___;
.type	ChaCha20_8xvl,\@function,5
.align	32
ChaCha20_8xvl:
.cfi_startproc
.LChaCha20_8xvl:
	mov		%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub		\$64+$xframe,%rsp
	and		\$-64,%rsp
___
$code.=<<___ if ($win64);
	movaps		%xmm6,-0xa8(%r9)
	movaps		%xmm7,-0x98(%r9)
	movaps		%xmm8,-0x88(%r9)
	movaps		%xmm9,-0x78(%r9)
	movaps		%xmm10,-0x68(%r9)
	movaps		%xmm11,-0x58(%r9)
	movaps		%xmm12,-0x48(%r9)
	movaps		%xmm13,-0x38(%r9)
	movaps		%xmm14,-0x28(%r9)
	movaps		%xmm15,-0x18(%r9)
.L8xvl_body:
___
$code.=<<___;
	vzeroupper

	lea		.Lsigma(%rip),%r10
	vbroadcasti128	(%r10),$xa3		# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xc3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]

	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd		\$0x55,$xa3,$xa1
	vpshufd		\$0xaa,$xa3,$xa2
	vpshufd		\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd		\$0x00,$xb3,$xb0
	vpshufd		\$0x55,$xb3,$xb1
	vpshufd		\$0xaa,$xb3,$xb2
	vpshufd		\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd		\$0x00,$xc3,$xc0
	vpshufd		\$0x55,$xc3,$xc1
	vpshufd		\$0xaa,$xc3,$xc2
	vpshufd		\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd		\$0x00,$xd3,$xd0
	vpshufd		\$0x55,$xd3,$xd1
	vpshufd		\$0xaa,$xd3,$xd2
	vpshufd		\$0xff,$xd3,$xd3
	vpaddd		.Lincy(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]

	mov		\$10,%eax
	jmp		.Loop8xvl

.align	32
.Loop_outer8xvl:
	#vpbroadcastd	0(%r10),$xa0		# reload key
	#vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd		.Leight(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	mov		\$10,%eax
	jmp		.Loop8xvl

.align	32
.Loop8xvl:
___
foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec		%eax
	jnz		.Loop8xvl

	vpaddd		@key[0],$xa0,$xa0	# accumulate key
	vpaddd		@key[1],$xa1,$xa1
	vpaddd		@key[2],$xa2,$xa2
	vpaddd		@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd		@key[4],$xb0,$xb0
	vpaddd		@key[5],$xb1,$xb1
	vpaddd		@key[6],$xb2,$xb2
	vpaddd		@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
$code.=<<___;
	vshufi32x4	\$0,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$3,$xb0,$xa0,$xb0
	vshufi32x4	\$0,$xb1,$xa1,$xa0
	vshufi32x4	\$3,$xb1,$xa1,$xb1
	vshufi32x4	\$0,$xb2,$xa2,$xa1
	vshufi32x4	\$3,$xb2,$xa2,$xb2
	vshufi32x4	\$0,$xb3,$xa3,$xa2
	vshufi32x4	\$3,$xb3,$xa3,$xb3
___
($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
$code.=<<___;
	vpaddd		@key[8],$xc0,$xc0
	vpaddd		@key[9],$xc1,$xc1
	vpaddd		@key[10],$xc2,$xc2
	vpaddd		@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd		@key[12],$xd0,$xd0
	vpaddd		@key[13],$xd1,$xd1
	vpaddd		@key[14],$xd2,$xd2
	vpaddd		@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
$code.=<<___;
	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xd0,$xc0,$xd0
	vperm2i128	\$0x20,$xd1,$xc1,$xc0
	vperm2i128	\$0x31,$xd1,$xc1,$xd1
	vperm2i128	\$0x20,$xd2,$xc2,$xc1
	vperm2i128	\$0x31,$xd2,$xc2,$xd2
	vperm2i128	\$0x20,$xd3,$xc3,$xc2
	vperm2i128	\$0x31,$xd3,$xc3,$xd3
___
($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
$code.=<<___;
	cmp		\$64*8,$len
	jb		.Ltail8xvl

	mov		\$0x80,%eax		# size optimization
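	# %ymm16-%ymm31 exist only in the EVEX encoding, hence the
	# vpxord/vmovdqu32 forms for registers in that range, while the
	# shorter VEX vpxor/vmovdqu forms are used for registers below
	# %ymm16. %rax=0x80 advances both pointers by 128 bytes per group.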
	vpxord		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vpxor		0x40($inp),$xc0,$xc0
	vpxor		0x60($inp),$xd0,$xd0
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa0,0x00($out)
	vmovdqu		$xb0,0x20($out)
	vmovdqu		$xc0,0x40($out)
	vmovdqu		$xd0,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpxor		0x00($inp),$xa1,$xa1
	vpxor		0x20($inp),$xb1,$xb1
	vpxor		0x40($inp),$xc1,$xc1
	vpxor		0x60($inp),$xd1,$xd1
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu		$xa1,0x00($out)
	vmovdqu		$xb1,0x20($out)
	vmovdqu		$xc1,0x40($out)
	vmovdqu		$xd1,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpxord		0x00($inp),$xa2,$xa2
	vpxor		0x20($inp),$xb2,$xb2
	vpxor		0x40($inp),$xc2,$xc2
	vpxor		0x60($inp),$xd2,$xd2
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa2,0x00($out)
	vmovdqu		$xb2,0x20($out)
	vmovdqu		$xc2,0x40($out)
	vmovdqu		$xd2,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpxor		0x00($inp),$xa3,$xa3
	vpxor		0x20($inp),$xb3,$xb3
	vpxor		0x40($inp),$xc3,$xc3
	vpxor		0x60($inp),$xd3,$xd3
	lea		($inp,%rax),$inp	# size optimization
	vmovdqu		$xa3,0x00($out)
	vmovdqu		$xb3,0x20($out)
	vmovdqu		$xc3,0x40($out)
	vmovdqu		$xd3,0x60($out)
	lea		($out,%rax),$out	# size optimization

	vpbroadcastd	0(%r10),%ymm0		# reload key
	vpbroadcastd	4(%r10),%ymm1

	sub		\$64*8,$len
	jnz		.Loop_outer8xvl

	jmp		.Ldone8xvl

.align	32
.Ltail8xvl:
	vmovdqa64	$xa0,%ymm8		# size optimization
___
$xa0 = "%ymm8";
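# $xa0 lives in the %ymm16+ range at this point; copying it to %ymm8 (and
# re-pointing the Perl variable) lets the tail code below use the shorter
# VEX-encoded vpxor/vmovdqu/vmovdqa forms on it.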
$code.=<<___;
	xor		%r10,%r10
	sub		$inp,$out
	cmp		\$64*1,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xa0,$xa0	# xor with input
	vpxor		0x20($inp),$xb0,$xb0
	vmovdqu		$xa0,0x00($out,$inp)
	vmovdqu		$xb0,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xc0,$xa0
	vmovdqa		$xd0,$xb0
	lea		64($inp),$inp

	cmp		\$64*2,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xc0,$xc0
	vpxor		0x20($inp),$xd0,$xd0
	vmovdqu		$xc0,0x00($out,$inp)
	vmovdqu		$xd0,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xa1,$xa0
	vmovdqa		$xb1,$xb0
	lea		64($inp),$inp

	cmp		\$64*3,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xa1,$xa1
	vpxor		0x20($inp),$xb1,$xb1
	vmovdqu		$xa1,0x00($out,$inp)
	vmovdqu		$xb1,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xc1,$xa0
	vmovdqa		$xd1,$xb0
	lea		64($inp),$inp

	cmp		\$64*4,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xc1,$xc1
	vpxor		0x20($inp),$xd1,$xd1
	vmovdqu		$xc1,0x00($out,$inp)
	vmovdqu		$xd1,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa32	$xa2,$xa0
	vmovdqa		$xb2,$xb0
	lea		64($inp),$inp

	cmp		\$64*5,$len
	jb		.Less_than_64_8xvl
	vpxord		0x00($inp),$xa2,$xa2
	vpxor		0x20($inp),$xb2,$xb2
	vmovdqu32	$xa2,0x00($out,$inp)
	vmovdqu		$xb2,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xc2,$xa0
	vmovdqa		$xd2,$xb0
	lea		64($inp),$inp

	cmp		\$64*6,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xc2,$xc2
	vpxor		0x20($inp),$xd2,$xd2
	vmovdqu		$xc2,0x00($out,$inp)
	vmovdqu		$xd2,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xa3,$xa0
	vmovdqa		$xb3,$xb0
	lea		64($inp),$inp

	cmp		\$64*7,$len
	jb		.Less_than_64_8xvl
	vpxor		0x00($inp),$xa3,$xa3
	vpxor		0x20($inp),$xb3,$xb3
	vmovdqu		$xa3,0x00($out,$inp)
	vmovdqu		$xb3,0x20($out,$inp)
	je		.Ldone8xvl
	vmovdqa		$xc3,$xa0
	vmovdqa		$xd3,$xb0
	lea		64($inp),$inp

.Less_than_64_8xvl:
	vmovdqa		$xa0,0x00(%rsp)
	vmovdqa		$xb0,0x20(%rsp)
	lea		($out,$inp),$out
	and		\$63,$len

.Loop_tail8xvl:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail8xvl

	vpxor		$xa0,$xa0,$xa0
	vmovdqa		$xa0,0x00(%rsp)
	vmovdqa		$xa0,0x20(%rsp)

.Ldone8xvl:
	vzeroall
___
$code.=<<___ if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8xvl_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_8xvl,.-ChaCha20_8xvl
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
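# se_handler serves the scalar ChaCha20_ctr32 path and restores the saved
# general-purpose registers; simd_handler serves the SIMD paths and copies the
# %xmm6-%xmm15 save area back into the CONTEXT record, using the prologue and
# epilogue labels and the save-area size passed via HandlerData[]. The
# .pdata/.xdata tables below associate each function with its handler.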
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	lea	.Lctr32_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lno_data(%rip),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lno_data
	jae	.Lcommon_seh_tail

	lea	64+24+48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.type	simd_handler,\@abi-omnipotent
.align	16
simd_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	mov	8(%r11),%ecx		# HandlerData[2]
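	# HandlerData[] holds {prologue RVA, epilogue RVA, size of the XMM
	# save area}; the rep movsq below copies the saved %xmm6-%xmm15 from
	# the stack frame (addressed relative to the saved %r9) back into
	# context->Xmm6 onwards.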
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	neg	%rcx
	lea	-8(%rax,%rcx),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	neg	%ecx
	shr	\$3,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	simd_handler,.-simd_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_ChaCha20_ctr32
	.rva	.LSEH_end_ChaCha20_ctr32
	.rva	.LSEH_info_ChaCha20_ctr32

	.rva	.LSEH_begin_ChaCha20_ssse3
	.rva	.LSEH_end_ChaCha20_ssse3
	.rva	.LSEH_info_ChaCha20_ssse3

	.rva	.LSEH_begin_ChaCha20_128
	.rva	.LSEH_end_ChaCha20_128
	.rva	.LSEH_info_ChaCha20_128

	.rva	.LSEH_begin_ChaCha20_4x
	.rva	.LSEH_end_ChaCha20_4x
	.rva	.LSEH_info_ChaCha20_4x
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_ChaCha20_4xop
	.rva	.LSEH_end_ChaCha20_4xop
	.rva	.LSEH_info_ChaCha20_4xop
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_ChaCha20_8x
	.rva	.LSEH_end_ChaCha20_8x
	.rva	.LSEH_info_ChaCha20_8x
___
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_ChaCha20_avx512
	.rva	.LSEH_end_ChaCha20_avx512
	.rva	.LSEH_info_ChaCha20_avx512

	.rva	.LSEH_begin_ChaCha20_avx512vl
	.rva	.LSEH_end_ChaCha20_avx512vl
	.rva	.LSEH_info_ChaCha20_avx512vl

	.rva	.LSEH_begin_ChaCha20_16x
	.rva	.LSEH_end_ChaCha20_16x
	.rva	.LSEH_info_ChaCha20_16x

	.rva	.LSEH_begin_ChaCha20_8xvl
	.rva	.LSEH_end_ChaCha20_8xvl
	.rva	.LSEH_info_ChaCha20_8xvl
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_ChaCha20_ctr32:
	.byte	9,0,0,0
	.rva	se_handler

.LSEH_info_ChaCha20_ssse3:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.Lssse3_body,.Lssse3_epilogue
	.long	0x20,0

.LSEH_info_ChaCha20_128:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L128_body,.L128_epilogue
	.long	0x60,0

.LSEH_info_ChaCha20_4x:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L4x_body,.L4x_epilogue
	.long	0xa0,0
___
$code.=<<___ if ($avx);
.LSEH_info_ChaCha20_4xop:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L4xop_body,.L4xop_epilogue		# HandlerData[]
	.long	0xa0,0
___
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L8x_body,.L8x_epilogue			# HandlerData[]
	.long	0xa0,0
___
$code.=<<___ if ($avx>2);
.LSEH_info_ChaCha20_avx512:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.Lavx512_body,.Lavx512_epilogue		# HandlerData[]
	.long	0x20,0

.LSEH_info_ChaCha20_avx512vl:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.Lavx512vl_body,.Lavx512vl_epilogue	# HandlerData[]
	.long	0x20,0

.LSEH_info_ChaCha20_16x:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L16x_body,.L16x_epilogue		# HandlerData[]
	.long	0xa0,0

.LSEH_info_ChaCha20_8xvl:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L8xvl_body,.L8xvl_epilogue		# HandlerData[]
	.long	0xa0,0
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/%x#%[yz]/%x/g;	# "down-shift"
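	# (the substitution above rewrites "%x#%ymmN"/"%x#%zmmN" as "%xmmN",
	# so one code template can refer to the %xmm alias of a wider register)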

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";