#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# November 2014
#
# ChaCha20 for x86_64.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    8xAVX2
#
# P4		9.48/+99%	-/22.7(ii)	-
# Core2		7.83/+55%	7.90/8.08	4.35
# Westmere	7.19/+50%	5.60/6.70	3.00
# Sandy Bridge	8.31/+42%	5.45/6.76	2.72
# Ivy Bridge	6.71/+46%	5.40/6.49	2.41
# Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
# Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
# Goldmont	10.6/+17%	5.10/-		3.28
# Sledgehammer	7.28/+52%	-/14.2(ii)	-
# Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
# VIA Nano	10.5/+46%	6.72/8.60	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	as can be seen, SSE2 performance is too low on legacy
#	processors; NxSSE2 results are naturally better, but not
#	impressively better than IALU ones, which is why you won't
#	find SSE2 code below;
# (iii)	this is not an optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but the gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes the 4xXOP code path, which delivers 2.20;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

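# $avx, as probed above, reflects assembler capabilities only: non-zero
# means AVX/XOP instructions can be assembled, >1 means AVX2 can, and the
# corresponding code paths below are emitted conditionally. Everything
# appended to $code is piped through x86_64-xlate.pl, which translates
# the "perlasm" below into the assembler dialect selected by $flavour.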
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
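# For reference, the corresponding C prototype is along the lines of
#
#	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#			    size_t len, const unsigned int key[8],
#			    const unsigned int counter[4]);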

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
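# .Lrot16 and .Lrot24 below are pshufb masks: applied to a register of
# 32-bit lanes they rotate every lane left by 16 and by 8 bits (i.e.
# right by 24), standing in for the scalar rol in the SIMD code paths.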
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
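# For example, &add(@x[0],@x[4]) below expands, with @x[0]="%eax" and
# @x[4]="%r8d", to "\tadd\t%r8d,%eax\n", i.e. the first Perl argument is
# the destination; a numeric last argument, as in &rol(@x[12],16), is
# turned into an immediate operand, yielding "\trol\t$16,%r12d\n".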

@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider the order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of
	# 'c's is invariant between rounds. This means that we have to
	# reload them only once per round, in the middle. This is why
	# you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.

	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order execution
	# cores are a dying breed and old Atom is the only one around,
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...
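	# For reference, each of Q1-Q4 below is the standard ChaCha
	# quarter-round (all rotates are left rotates):
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<= 8;
	#	c += d; b ^= c; b <<<= 7;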

	(
	"&add	(@x[$a0],@x[$b0])",	# Q1
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],16)",
	"&add	(@x[$a1],@x[$b1])",	# Q2
	"&xor	(@x[$d1],@x[$a1])",
	"&rol	(@x[$d1],16)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],12)",
	"&add	($xc_,@x[$d1])",
	"&xor	(@x[$b1],$xc_)",
	"&rol	(@x[$b1],12)",

	"&add	(@x[$a0],@x[$b0])",
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],8)",
	"&add	(@x[$a1],@x[$b1])",
	"&xor	(@x[$d1],@x[$a1])",
	"&rol	(@x[$d1],8)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],7)",
	"&add	($xc_,@x[$d1])",
	"&xor	(@x[$b1],$xc_)",
	"&rol	(@x[$b1],7)",

	"&mov	(\"4*$c0(%rsp)\",$xc)",		# reload pair of 'c's
	"&mov	(\"4*$c1(%rsp)\",$xc_)",
	"&mov	($xc,\"4*$c2(%rsp)\")",
	"&mov	($xc_,\"4*$c3(%rsp)\")",

	"&add	(@x[$a2],@x[$b2])",	# Q3
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],16)",
	"&add	(@x[$a3],@x[$b3])",	# Q4
	"&xor	(@x[$d3],@x[$a3])",
	"&rol	(@x[$d3],16)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],12)",
	"&add	($xc_,@x[$d3])",
	"&xor	(@x[$b3],$xc_)",
	"&rol	(@x[$b3],12)",

	"&add	(@x[$a2],@x[$b2])",
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],8)",
	"&add	(@x[$a3],@x[$b3])",
	"&xor	(@x[$d3],@x[$a3])",
	"&rol	(@x[$d3],8)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],7)",
	"&add	($xc_,@x[$d3])",
	"&xor	(@x[$b3],$xc_)",
	"&rol	(@x[$b3],7)"
	);
}

########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
$code.=<<___;
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
.align	64
ChaCha20_ctr32:
	cmp	\$0,$len
	je	.Lno_data
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
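	# bit 41 of OPENSSL_ia32cap_P, tested below, is the SSSE3 feature
	# flag (bit 9 of CPUID.1:ECX)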
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$64+24,%rsp

	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	($key),%xmm1
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	jmp	.Loop_outer

.align	32
.Loop_outer:
	mov	\$0x61707865,@x[0]	# 'expa'
	mov	\$0x3320646e,@x[1]	# 'nd 3'
	mov	\$0x79622d32,@x[2]	# '2-by'
	mov	\$0x6b206574,@x[3]	# 'te k'
	mov	4*4(%rsp),@x[4]
	mov	4*5(%rsp),@x[5]
	mov	4*6(%rsp),@x[6]
	mov	4*7(%rsp),@x[7]
	movd	%xmm3,@x[12]
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	\$10,%ebp
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	mov	%rsi,%rdi
	shr	\$32,%rdi		# "@x[9]"
	jmp	.Loop

.align	32
.Loop:
___
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND	(0, 5,10,15)) { eval; }
	&dec	("%ebp");
	&jnz	(".Loop");

$code.=<<___;
	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	@t[0],4*8(%rsp)
	mov	64(%rsp),%rbp		# load len
	movdqa	%xmm2,%xmm1
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out

	add	\$0x61707865,@x[0]	# 'expa'
	add	\$0x3320646e,@x[1]	# 'nd 3'
	add	\$0x79622d32,@x[2]	# '2-by'
	add	\$0x6b206574,@x[3]	# 'te k'
	add	4*4(%rsp),@x[4]
	add	4*5(%rsp),@x[5]
	add	4*6(%rsp),@x[6]
	add	4*7(%rsp),@x[7]
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1

	cmp	\$64,%rbp
	jb	.Ltail

	xor	4*0($inp),@x[0]		# xor with input
	xor	4*1($inp),@x[1]
	xor	4*2($inp),@x[2]
	xor	4*3($inp),@x[3]
	xor	4*4($inp),@x[4]
	xor	4*5($inp),@x[5]
	xor	4*6($inp),@x[6]
	xor	4*7($inp),@x[7]
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)

	mov	@x[0],4*0($out)		# write output
	mov	@x[1],4*1($out)
	mov	@x[2],4*2($out)
	mov	@x[3],4*3($out)
	mov	@x[4],4*4($out)
	mov	@x[5],4*5($out)
	mov	@x[6],4*6($out)
	mov	@x[7],4*7($out)
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64

	sub	\$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	mov	@x[0],4*0(%rsp)
	mov	@x[1],4*1(%rsp)
	xor	%rbx,%rbx
	mov	@x[2],4*2(%rsp)
	mov	@x[3],4*3(%rsp)
	mov	@x[4],4*4(%rsp)
	mov	@x[5],4*5(%rsp)
	mov	@x[6],4*6(%rsp)
	mov	@x[7],4*7(%rsp)
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)

.Loop_tail:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
	dec	%rbp
	jnz	.Loop_tail

.Ldone:
	add	\$64+24,%rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
.Lno_data:
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

########################################################################
# SSSE3 code path that handles shorter lengths
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot16);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&pslld	($t,12);
	&por	($b,$t);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot24);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&pslld	($t,7);
	&por	($b,$t);
}
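# Note that the 16- and 8-bit rotates above are single pshufb's
# (byte-granular shuffles), while the 12- and 7-bit rotates have to be
# composed from pslld/psrld/por.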

my $xframe = $win64 ? 32+32+8 : 24;

$code.=<<___;
.type	ChaCha20_ssse3,\@function,5
.align	32
ChaCha20_ssse3:
.LChaCha20_ssse3:
___
$code.=<<___ if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
___
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	sub	\$64+$xframe,%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+32(%rsp)
	movaps	%xmm7,64+48(%rsp)
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	mov	\$10,%ebp
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),$d
	movdqa	0x00(%rsp),$a
	movdqa	0x10(%rsp),$b
	movdqa	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	mov	\$10,%ebp
	movdqa	$d,0x30(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
___
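	# Each iteration below is one column round followed by one diagonal
	# round: the first pshufd triplet rotates the b/c/d rows so that the
	# same SSSE3ROUND code operates on the diagonals, the second one
	# rotates them back.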
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	("%ebp");
	&jnz	(".Loop_ssse3");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d

	cmp	\$64,$len
	jb	.Ltail_ssse3

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	lea	0x40($inp),$inp		# inp+=64
	pxor	$t,$c
	pxor	$t1,$d

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	lea	0x40($out),$out		# out+=64

	sub	\$64,$len
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	xor	%rbx,%rbx

.Loop_tail_ssse3:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%ecx
	lea	1(%rbx),%rbx
	xor	%ecx,%eax
	mov	%al,-1($out,%rbx)
	dec	$len
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
___
$code.=<<___ if ($win64);
	movaps	64+32(%rsp),%xmm6
	movaps	64+48(%rsp),%xmm7
___
$code.=<<___;
	add	\$64+$xframe,%rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
___
}

########################################################################
# SSSE3 code path that handles longer messages.
{
# assign variables to favor Atom front-end
my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

sub SSSE3_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider the order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of
	# 'c's is invariant between rounds. This means that we have to
	# reload them only once per round, in the middle. This is why
	# you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.

	(
	"&paddd	(@x[$a0],@x[$b0])",	# Q1
	"&paddd	(@x[$a1],@x[$b1])",	# Q2
	"&pxor	(@x[$d0],@x[$a0])",
	"&pxor	(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t1)",
	"&pshufb	(@x[$d1],$t1)",

	"&paddd	($xc,@x[$d0])",
	"&paddd	($xc_,@x[$d1])",
	"&pxor	(@x[$b0],$xc)",
	"&pxor	(@x[$b1],$xc_)",
	"&movdqa	($t0,@x[$b0])",
	"&pslld	(@x[$b0],12)",
	"&psrld	($t0,20)",
	"&movdqa	($t1,@x[$b1])",
	"&pslld	(@x[$b1],12)",
	"&por	(@x[$b0],$t0)",
	"&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	"&por	(@x[$b1],$t1)",

	"&paddd	(@x[$a0],@x[$b0])",
	"&paddd	(@x[$a1],@x[$b1])",
	"&pxor	(@x[$d0],@x[$a0])",
	"&pxor	(@x[$d1],@x[$a1])",
	"&pshufb	(@x[$d0],$t0)",
	"&pshufb	(@x[$d1],$t0)",

	"&paddd	($xc,@x[$d0])",
	"&paddd	($xc_,@x[$d1])",
	"&pxor	(@x[$b0],$xc)",
	"&pxor	(@x[$b1],$xc_)",
	"&movdqa	($t1,@x[$b0])",
	"&pslld	(@x[$b0],7)",
	"&psrld	($t1,25)",
	"&movdqa	($t0,@x[$b1])",
	"&pslld	(@x[$b1],7)",
	"&por	(@x[$b0],$t1)",
	"&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	"&por	(@x[$b1],$t0)",

	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	"&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
	"&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",

	"&paddd	(@x[$a2],@x[$b2])",	# Q3
	"&paddd	(@x[$a3],@x[$b3])",	# Q4
	"&pxor	(@x[$d2],@x[$a2])",
	"&pxor	(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t1)",
	"&pshufb	(@x[$d3],$t1)",

	"&paddd	($xc,@x[$d2])",
	"&paddd	($xc_,@x[$d3])",
	"&pxor	(@x[$b2],$xc)",
	"&pxor	(@x[$b3],$xc_)",
	"&movdqa	($t0,@x[$b2])",
	"&pslld	(@x[$b2],12)",
	"&psrld	($t0,20)",
	"&movdqa	($t1,@x[$b3])",
	"&pslld	(@x[$b3],12)",
	"&por	(@x[$b2],$t0)",
	"&psrld	($t1,20)",
	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	"&por	(@x[$b3],$t1)",

	"&paddd	(@x[$a2],@x[$b2])",
	"&paddd	(@x[$a3],@x[$b3])",
	"&pxor	(@x[$d2],@x[$a2])",
	"&pxor	(@x[$d3],@x[$a3])",
	"&pshufb	(@x[$d2],$t0)",
	"&pshufb	(@x[$d3],$t0)",

	"&paddd	($xc,@x[$d2])",
	"&paddd	($xc_,@x[$d3])",
	"&pxor	(@x[$b2],$xc)",
	"&pxor	(@x[$b3],$xc_)",
	"&movdqa	($t1,@x[$b2])",
	"&pslld	(@x[$b2],7)",
	"&psrld	($t1,25)",
	"&movdqa	($t0,@x[$b3])",
	"&pslld	(@x[$b3],7)",
	"&por	(@x[$b2],$t1)",
	"&psrld	($t0,25)",
	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	"&por	(@x[$b3],$t0)"
	);
}

my $xframe = $win64 ? 0xa0 : 0;

$code.=<<___;
.type	ChaCha20_4x,\@function,5
.align	32
ChaCha20_4x:
.LChaCha20_4x:
	mov	%r10,%r11
___
$code.=<<___ if ($avx>1);
	shr	\$32,%r10		# OPENSSL_ia32cap_P+8
	test	\$`1<<5`,%r10		# test AVX2
	jnz	.LChaCha20_8x
___
$code.=<<___;
	cmp	\$192,$len
	ja	.Lproceed4x

	and	\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
	cmp	\$`1<<22`,%r11		# check for MOVBE without XSAVE
	je	.Ldo_sse3_after_all	# to detect Atom

.Lproceed4x:
	lea	-0x78(%rsp),%r11
	sub	\$0x148+$xframe,%rsp
___
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# ...
	# +0x40		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x140
$code.=<<___ if ($win64);
	movaps	%xmm6,-0x30(%r11)
	movaps	%xmm7,-0x20(%r11)
	movaps	%xmm8,-0x10(%r11)
	movaps	%xmm9,0x00(%r11)
	movaps	%xmm10,0x10(%r11)
	movaps	%xmm11,0x20(%r11)
	movaps	%xmm12,0x30(%r11)
	movaps	%xmm13,0x40(%r11)
	movaps	%xmm14,0x50(%r11)
	movaps	%xmm15,0x60(%r11)
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$xa3	# key[0]
	movdqu	($key),$xb3		# key[1]
	movdqu	16($key),$xt3		# key[2]
	movdqu	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	lea	.Lrot16(%rip),%r10
	lea	.Lrot24(%rip),%r11

	pshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	pshufd	\$0x55,$xa3,$xa1
	movdqa	$xa0,0x40(%rsp)		# ... and offload
	pshufd	\$0xaa,$xa3,$xa2
	movdqa	$xa1,0x50(%rsp)
	pshufd	\$0xff,$xa3,$xa3
	movdqa	$xa2,0x60(%rsp)
	movdqa	$xa3,0x70(%rsp)

	pshufd	\$0x00,$xb3,$xb0
	pshufd	\$0x55,$xb3,$xb1
	movdqa	$xb0,0x80-0x100(%rcx)
	pshufd	\$0xaa,$xb3,$xb2
	movdqa	$xb1,0x90-0x100(%rcx)
	pshufd	\$0xff,$xb3,$xb3
	movdqa	$xb2,0xa0-0x100(%rcx)
	movdqa	$xb3,0xb0-0x100(%rcx)

	pshufd	\$0x00,$xt3,$xt0	# "$xc0"
	pshufd	\$0x55,$xt3,$xt1	# "$xc1"
	movdqa	$xt0,0xc0-0x100(%rcx)
	pshufd	\$0xaa,$xt3,$xt2	# "$xc2"
	movdqa	$xt1,0xd0-0x100(%rcx)
	pshufd	\$0xff,$xt3,$xt3	# "$xc3"
	movdqa	$xt2,0xe0-0x100(%rcx)
	movdqa	$xt3,0xf0-0x100(%rcx)

	pshufd	\$0x00,$xd3,$xd0
	pshufd	\$0x55,$xd3,$xd1
	paddd	.Linc(%rip),$xd0	# don't save counters yet
	pshufd	\$0xaa,$xd3,$xd2
	movdqa	$xd1,0x110-0x100(%rcx)
	pshufd	\$0xff,$xd3,$xd3
	movdqa	$xd2,0x120-0x100(%rcx)
	movdqa	$xd3,0x130-0x100(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	0x40(%rsp),$xa0		# re-load smashed key
	movdqa	0x50(%rsp),$xa1
	movdqa	0x60(%rsp),$xa2
	movdqa	0x70(%rsp),$xa3
	movdqa	0x80-0x100(%rcx),$xb0
	movdqa	0x90-0x100(%rcx),$xb1
	movdqa	0xa0-0x100(%rcx),$xb2
	movdqa	0xb0-0x100(%rcx),$xb3
	movdqa	0xc0-0x100(%rcx),$xt0	# "$xc0"
	movdqa	0xd0-0x100(%rcx),$xt1	# "$xc1"
	movdqa	0xe0-0x100(%rcx),$xt2	# "$xc2"
	movdqa	0xf0-0x100(%rcx),$xt3	# "$xc3"
	movdqa	0x100-0x100(%rcx),$xd0
	movdqa	0x110-0x100(%rcx),$xd1
	movdqa	0x120-0x100(%rcx),$xd2
	movdqa	0x130-0x100(%rcx),$xd3
	paddd	.Lfour(%rip),$xd0	# next SIMD counters

.Loop_enter4x:
	movdqa	$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
	movdqa	$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
	movdqa	(%r10),$xt3		# .Lrot16(%rip)
	mov	\$10,%eax
	movdqa	$xd0,0x100-0x100(%rcx)	# save SIMD counters
	jmp	.Loop4x

.align	32
.Loop4x:
___
	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop4x

	paddd	0x40(%rsp),$xa0		# accumulate key material
	paddd	0x50(%rsp),$xa1
	paddd	0x60(%rsp),$xa2
	paddd	0x70(%rsp),$xa3

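	# At this point each register holds the same state word from all
	# four blocks (one 32-bit lane per block); the punpck sequences
	# below transpose groups of four registers so that each ends up
	# holding 16 contiguous bytes of a single 64-byte output block.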
827 | movdqa $xa0,$xt2 # "de-interlace" data
|
---|
828 | punpckldq $xa1,$xa0
|
---|
829 | movdqa $xa2,$xt3
|
---|
830 | punpckldq $xa3,$xa2
|
---|
831 | punpckhdq $xa1,$xt2
|
---|
832 | punpckhdq $xa3,$xt3
|
---|
833 | movdqa $xa0,$xa1
|
---|
834 | punpcklqdq $xa2,$xa0 # "a0"
|
---|
835 | movdqa $xt2,$xa3
|
---|
836 | punpcklqdq $xt3,$xt2 # "a2"
|
---|
837 | punpckhqdq $xa2,$xa1 # "a1"
|
---|
838 | punpckhqdq $xt3,$xa3 # "a3"
|
---|
839 | ___
|
---|
840 | ($xa2,$xt2)=($xt2,$xa2);
|
---|
841 | $code.=<<___;
|
---|
842 | paddd 0x80-0x100(%rcx),$xb0
|
---|
843 | paddd 0x90-0x100(%rcx),$xb1
|
---|
844 | paddd 0xa0-0x100(%rcx),$xb2
|
---|
845 | paddd 0xb0-0x100(%rcx),$xb3
|
---|
846 |
|
---|
847 | movdqa $xa0,0x00(%rsp) # offload $xaN
|
---|
848 | movdqa $xa1,0x10(%rsp)
|
---|
849 | movdqa 0x20(%rsp),$xa0 # "xc2"
|
---|
850 | movdqa 0x30(%rsp),$xa1 # "xc3"
|
---|
851 |
|
---|
852 | movdqa $xb0,$xt2
|
---|
853 | punpckldq $xb1,$xb0
|
---|
854 | movdqa $xb2,$xt3
|
---|
855 | punpckldq $xb3,$xb2
|
---|
856 | punpckhdq $xb1,$xt2
|
---|
857 | punpckhdq $xb3,$xt3
|
---|
858 | movdqa $xb0,$xb1
|
---|
859 | punpcklqdq $xb2,$xb0 # "b0"
|
---|
860 | movdqa $xt2,$xb3
|
---|
861 | punpcklqdq $xt3,$xt2 # "b2"
|
---|
862 | punpckhqdq $xb2,$xb1 # "b1"
|
---|
863 | punpckhqdq $xt3,$xb3 # "b3"
|
---|
864 | ___
|
---|
865 | ($xb2,$xt2)=($xt2,$xb2);
|
---|
866 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
|
---|
867 | $code.=<<___;
|
---|
868 | paddd 0xc0-0x100(%rcx),$xc0
|
---|
869 | paddd 0xd0-0x100(%rcx),$xc1
|
---|
870 | paddd 0xe0-0x100(%rcx),$xc2
|
---|
871 | paddd 0xf0-0x100(%rcx),$xc3
|
---|
872 |
|
---|
873 | movdqa $xa2,0x20(%rsp) # keep offloading $xaN
|
---|
874 | movdqa $xa3,0x30(%rsp)
|
---|
875 |
|
---|
876 | movdqa $xc0,$xt2
|
---|
877 | punpckldq $xc1,$xc0
|
---|
878 | movdqa $xc2,$xt3
|
---|
879 | punpckldq $xc3,$xc2
|
---|
880 | punpckhdq $xc1,$xt2
|
---|
881 | punpckhdq $xc3,$xt3
|
---|
882 | movdqa $xc0,$xc1
|
---|
883 | punpcklqdq $xc2,$xc0 # "c0"
|
---|
884 | movdqa $xt2,$xc3
|
---|
885 | punpcklqdq $xt3,$xt2 # "c2"
|
---|
886 | punpckhqdq $xc2,$xc1 # "c1"
|
---|
887 | punpckhqdq $xt3,$xc3 # "c3"
|
---|
888 | ___
|
---|
889 | ($xc2,$xt2)=($xt2,$xc2);
|
---|
890 | ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
|
---|
891 | $code.=<<___;
|
---|
892 | paddd 0x100-0x100(%rcx),$xd0
|
---|
893 | paddd 0x110-0x100(%rcx),$xd1
|
---|
894 | paddd 0x120-0x100(%rcx),$xd2
|
---|
895 | paddd 0x130-0x100(%rcx),$xd3
|
---|
896 |
|
---|
897 | movdqa $xd0,$xt2
|
---|
898 | punpckldq $xd1,$xd0
|
---|
899 | movdqa $xd2,$xt3
|
---|
900 | punpckldq $xd3,$xd2
|
---|
901 | punpckhdq $xd1,$xt2
|
---|
902 | punpckhdq $xd3,$xt3
|
---|
903 | movdqa $xd0,$xd1
|
---|
904 | punpcklqdq $xd2,$xd0 # "d0"
|
---|
905 | movdqa $xt2,$xd3
|
---|
906 | punpcklqdq $xt3,$xt2 # "d2"
|
---|
907 | punpckhqdq $xd2,$xd1 # "d1"
|
---|
908 | punpckhqdq $xt3,$xd3 # "d3"
|
---|
909 | ___
|
---|
910 | ($xd2,$xt2)=($xt2,$xd2);
|
---|
911 | $code.=<<___;
|
---|
912 | cmp \$64*4,$len
|
---|
913 | jb .Ltail4x
|
---|
914 |
|
---|
915 | movdqu 0x00($inp),$xt0 # xor with input
|
---|
916 | movdqu 0x10($inp),$xt1
|
---|
917 | movdqu 0x20($inp),$xt2
|
---|
918 | movdqu 0x30($inp),$xt3
|
---|
919 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
|
---|
920 | pxor $xb0,$xt1
|
---|
921 | pxor $xc0,$xt2
|
---|
922 | pxor $xd0,$xt3
|
---|
923 |
|
---|
924 | movdqu $xt0,0x00($out)
|
---|
925 | movdqu 0x40($inp),$xt0
|
---|
926 | movdqu $xt1,0x10($out)
|
---|
927 | movdqu 0x50($inp),$xt1
|
---|
928 | movdqu $xt2,0x20($out)
|
---|
929 | movdqu 0x60($inp),$xt2
|
---|
930 | movdqu $xt3,0x30($out)
|
---|
931 | movdqu 0x70($inp),$xt3
|
---|
932 | lea 0x80($inp),$inp # size optimization
|
---|
933 | pxor 0x10(%rsp),$xt0
|
---|
934 | pxor $xb1,$xt1
|
---|
935 | pxor $xc1,$xt2
|
---|
936 | pxor $xd1,$xt3
|
---|
937 |
|
---|
938 | movdqu $xt0,0x40($out)
|
---|
939 | movdqu 0x00($inp),$xt0
|
---|
940 | movdqu $xt1,0x50($out)
|
---|
941 | movdqu 0x10($inp),$xt1
|
---|
942 | movdqu $xt2,0x60($out)
|
---|
943 | movdqu 0x20($inp),$xt2
|
---|
944 | movdqu $xt3,0x70($out)
|
---|
945 | lea 0x80($out),$out # size optimization
|
---|
946 | movdqu 0x30($inp),$xt3
|
---|
947 | pxor 0x20(%rsp),$xt0
|
---|
948 | pxor $xb2,$xt1
|
---|
949 | pxor $xc2,$xt2
|
---|
950 | pxor $xd2,$xt3
|
---|
951 |
|
---|
952 | movdqu $xt0,0x00($out)
|
---|
953 | movdqu 0x40($inp),$xt0
|
---|
954 | movdqu $xt1,0x10($out)
|
---|
955 | movdqu 0x50($inp),$xt1
|
---|
956 | movdqu $xt2,0x20($out)
|
---|
957 | movdqu 0x60($inp),$xt2
|
---|
958 | movdqu $xt3,0x30($out)
|
---|
959 | movdqu 0x70($inp),$xt3
|
---|
960 | lea 0x80($inp),$inp # inp+=64*4
|
---|
961 | pxor 0x30(%rsp),$xt0
|
---|
962 | pxor $xb3,$xt1
|
---|
963 | pxor $xc3,$xt2
|
---|
964 | pxor $xd3,$xt3
|
---|
965 | movdqu $xt0,0x40($out)
|
---|
966 | movdqu $xt1,0x50($out)
|
---|
967 | movdqu $xt2,0x60($out)
|
---|
968 | movdqu $xt3,0x70($out)
|
---|
969 | lea 0x80($out),$out # out+=64*4
|
---|
970 |
|
---|
971 | sub \$64*4,$len
|
---|
972 | jnz .Loop_outer4x
|
---|
973 |
|
---|
974 | jmp .Ldone4x
|
---|
975 |
|
---|
976 | .Ltail4x:
|
---|
977 | cmp \$192,$len
|
---|
978 | jae .L192_or_more4x
|
---|
979 | cmp \$128,$len
|
---|
980 | jae .L128_or_more4x
|
---|
981 | cmp \$64,$len
|
---|
982 | jae .L64_or_more4x
|
---|
983 |
|
---|
984 | #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
|
---|
985 | xor %r10,%r10
|
---|
986 | #movdqa $xt0,0x00(%rsp)
|
---|
987 | movdqa $xb0,0x10(%rsp)
|
---|
988 | movdqa $xc0,0x20(%rsp)
|
---|
989 | movdqa $xd0,0x30(%rsp)
|
---|
990 | jmp .Loop_tail4x
|
---|
991 |
|
---|
992 | .align 32
|
---|
993 | .L64_or_more4x:
|
---|
994 | movdqu 0x00($inp),$xt0 # xor with input
|
---|
995 | movdqu 0x10($inp),$xt1
|
---|
996 | movdqu 0x20($inp),$xt2
|
---|
997 | movdqu 0x30($inp),$xt3
|
---|
998 | pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember?
|
---|
999 | pxor $xb0,$xt1
|
---|
1000 | pxor $xc0,$xt2
|
---|
1001 | pxor $xd0,$xt3
|
---|
1002 | movdqu $xt0,0x00($out)
|
---|
1003 | movdqu $xt1,0x10($out)
|
---|
1004 | movdqu $xt2,0x20($out)
|
---|
1005 | movdqu $xt3,0x30($out)
|
---|
1006 | je .Ldone4x
|
---|
1007 |
|
---|
1008 | movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
|
---|
1009 | lea 0x40($inp),$inp # inp+=64*1
|
---|
1010 | xor %r10,%r10
|
---|
1011 | movdqa $xt0,0x00(%rsp)
|
---|
1012 | movdqa $xb1,0x10(%rsp)
|
---|
1013 | lea 0x40($out),$out # out+=64*1
|
---|
1014 | movdqa $xc1,0x20(%rsp)
|
---|
1015 | sub \$64,$len # len-=64*1
|
---|
1016 | movdqa $xd1,0x30(%rsp)
|
---|
1017 | jmp .Loop_tail4x
|
---|
1018 |
|
---|
1019 | .align 32
|
---|
1020 | .L128_or_more4x:
|
---|
1021 | movdqu 0x00($inp),$xt0 # xor with input
|
---|
1022 | movdqu 0x10($inp),$xt1
|
---|
1023 | movdqu 0x20($inp),$xt2
|
---|
1024 | movdqu 0x30($inp),$xt3
|
---|
1025 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
|
---|
1026 | pxor $xb0,$xt1
|
---|
1027 | pxor $xc0,$xt2
|
---|
1028 | pxor $xd0,$xt3
|
---|
1029 |
|
---|
1030 | movdqu $xt0,0x00($out)
|
---|
1031 | movdqu 0x40($inp),$xt0
|
---|
1032 | movdqu $xt1,0x10($out)
|
---|
1033 | movdqu 0x50($inp),$xt1
|
---|
1034 | movdqu $xt2,0x20($out)
|
---|
1035 | movdqu 0x60($inp),$xt2
|
---|
1036 | movdqu $xt3,0x30($out)
|
---|
1037 | movdqu 0x70($inp),$xt3
|
---|
1038 | pxor 0x10(%rsp),$xt0
|
---|
1039 | pxor $xb1,$xt1
|
---|
1040 | pxor $xc1,$xt2
|
---|
1041 | pxor $xd1,$xt3
|
---|
1042 | movdqu $xt0,0x40($out)
|
---|
1043 | movdqu $xt1,0x50($out)
|
---|
1044 | movdqu $xt2,0x60($out)
|
---|
1045 | movdqu $xt3,0x70($out)
|
---|
1046 | je .Ldone4x
|
---|
1047 |
|
---|
1048 | movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
|
---|
1049 | lea 0x80($inp),$inp # inp+=64*2
|
---|
1050 | xor %r10,%r10
|
---|
1051 | movdqa $xt0,0x00(%rsp)
|
---|
1052 | movdqa $xb2,0x10(%rsp)
|
---|
1053 | lea 0x80($out),$out # out+=64*2
|
---|
1054 | movdqa $xc2,0x20(%rsp)
|
---|
1055 | sub \$128,$len # len-=64*2
|
---|
1056 | movdqa $xd2,0x30(%rsp)
|
---|
1057 | jmp .Loop_tail4x
|
---|
1058 |
|
---|
1059 | .align 32
|
---|
1060 | .L192_or_more4x:
|
---|
1061 | movdqu 0x00($inp),$xt0 # xor with input
|
---|
1062 | movdqu 0x10($inp),$xt1
|
---|
1063 | movdqu 0x20($inp),$xt2
|
---|
1064 | movdqu 0x30($inp),$xt3
|
---|
1065 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
|
---|
1066 | pxor $xb0,$xt1
|
---|
1067 | pxor $xc0,$xt2
|
---|
1068 | pxor $xd0,$xt3
|
---|
1069 |
|
---|
1070 | movdqu $xt0,0x00($out)
|
---|
1071 | movdqu 0x40($inp),$xt0
|
---|
1072 | movdqu $xt1,0x10($out)
|
---|
1073 | movdqu 0x50($inp),$xt1
|
---|
1074 | movdqu $xt2,0x20($out)
|
---|
1075 | movdqu 0x60($inp),$xt2
|
---|
1076 | movdqu $xt3,0x30($out)
|
---|
1077 | movdqu 0x70($inp),$xt3
|
---|
1078 | lea 0x80($inp),$inp # size optimization
|
---|
1079 | pxor 0x10(%rsp),$xt0
|
---|
1080 | pxor $xb1,$xt1
|
---|
1081 | pxor $xc1,$xt2
|
---|
1082 | pxor $xd1,$xt3
|
---|
1083 |
|
---|
1084 | movdqu $xt0,0x40($out)
|
---|
1085 | movdqu 0x00($inp),$xt0
|
---|
1086 | movdqu $xt1,0x50($out)
|
---|
1087 | movdqu 0x10($inp),$xt1
|
---|
1088 | movdqu $xt2,0x60($out)
|
---|
1089 | movdqu 0x20($inp),$xt2
|
---|
1090 | movdqu $xt3,0x70($out)
|
---|
1091 | lea 0x80($out),$out # size optimization
|
---|
1092 | movdqu 0x30($inp),$xt3
|
---|
1093 | pxor 0x20(%rsp),$xt0
|
---|
1094 | pxor $xb2,$xt1
|
---|
1095 | pxor $xc2,$xt2
|
---|
1096 | pxor $xd2,$xt3
|
---|
1097 | movdqu $xt0,0x00($out)
|
---|
1098 | movdqu $xt1,0x10($out)
|
---|
1099 | movdqu $xt2,0x20($out)
|
---|
1100 | movdqu $xt3,0x30($out)
|
---|
1101 | je .Ldone4x
|
---|
1102 |
|
---|
1103 | movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
|
---|
1104 | lea 0x40($inp),$inp # inp+=64*3
|
---|
1105 | xor %r10,%r10
|
---|
1106 | movdqa $xt0,0x00(%rsp)
|
---|
1107 | movdqa $xb3,0x10(%rsp)
|
---|
1108 | lea 0x40($out),$out # out+=64*3
|
---|
1109 | movdqa $xc3,0x20(%rsp)
|
---|
1110 | sub \$192,$len # len-=64*3
|
---|
1111 | movdqa $xd3,0x30(%rsp)
|
---|
1112 |
|
---|
1113 | .Loop_tail4x:
|
---|
1114 | movzb ($inp,%r10),%eax
|
---|
1115 | movzb (%rsp,%r10),%ecx
|
---|
1116 | lea 1(%r10),%r10
|
---|
1117 | xor %ecx,%eax
|
---|
1118 | mov %al,-1($out,%r10)
|
---|
1119 | dec $len
|
---|
1120 | jnz .Loop_tail4x
|
---|
1121 |
|
---|
1122 | .Ldone4x:
|
---|
1123 | ___
|
---|
1124 | $code.=<<___ if ($win64);
|
---|
1125 | lea 0x140+0x30(%rsp),%r11
|
---|
1126 | movaps -0x30(%r11),%xmm6
|
---|
1127 | movaps -0x20(%r11),%xmm7
|
---|
1128 | movaps -0x10(%r11),%xmm8
|
---|
1129 | movaps 0x00(%r11),%xmm9
|
---|
1130 | movaps 0x10(%r11),%xmm10
|
---|
1131 | movaps 0x20(%r11),%xmm11
|
---|
1132 | movaps 0x30(%r11),%xmm12
|
---|
1133 | movaps 0x40(%r11),%xmm13
|
---|
1134 | movaps 0x50(%r11),%xmm14
|
---|
1135 | movaps 0x60(%r11),%xmm15
|
---|
1136 | ___
|
---|
1137 | $code.=<<___;
|
---|
1138 | add \$0x148+$xframe,%rsp
|
---|
1139 | ret
|
---|
1140 | .size ChaCha20_4x,.-ChaCha20_4x
|
---|
1141 | ___
|
---|
1142 | }
|
---|
1143 |
|
---|
1144 | ########################################################################
|
---|
1145 | # XOP code path that handles all lengths.
|
---|
1146 | if ($avx) {
|
---|
1147 | # There is some "anomaly" observed depending on instructions' size or
|
---|
1148 | # alignment. If you look closely at below code you'll notice that
|
---|
1149 | # sometimes argument order varies. The order affects instruction
|
---|
1150 | # encoding by making it larger, and such fiddling gives 5% performance
|
---|
1151 | # improvement. This is on FX-4100...
|
---|
1152 |
|
---|
1153 | my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
|
---|
1154 | $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
|
---|
1155 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
|
---|
1156 | $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
|
---|
1157 |
|
---|
1158 | sub XOP_lane_ROUND {
|
---|
1159 | my ($a0,$b0,$c0,$d0)=@_;
|
---|
1160 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
|
---|
1161 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
|
---|
1162 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
|
---|
1163 | my @x=map("\"$_\"",@xx);
|
---|
1164 |
|
---|
1165 | (
|
---|
1166 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
|
---|
1167 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
|
---|
1168 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
|
---|
1169 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
|
---|
1170 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
|
---|
1171 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
|
---|
1172 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
|
---|
1173 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
|
---|
1174 | "&vprotd (@x[$d0],@x[$d0],16)",
|
---|
1175 | "&vprotd (@x[$d1],@x[$d1],16)",
|
---|
1176 | "&vprotd (@x[$d2],@x[$d2],16)",
|
---|
1177 | "&vprotd (@x[$d3],@x[$d3],16)",
|
---|
1178 |
|
---|
1179 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
|
---|
1180 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
|
---|
1181 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
|
---|
1182 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
|
---|
1183 | "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
|
---|
1184 | "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
|
---|
1185 | "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
|
---|
1186 | "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
|
---|
1187 | "&vprotd (@x[$b0],@x[$b0],12)",
|
---|
1188 | "&vprotd (@x[$b1],@x[$b1],12)",
|
---|
1189 | "&vprotd (@x[$b2],@x[$b2],12)",
|
---|
1190 | "&vprotd (@x[$b3],@x[$b3],12)",
|
---|
1191 |
|
---|
1192 | "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
|
---|
1193 | "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
|
---|
1194 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
|
---|
1195 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
|
---|
1196 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
|
---|
1197 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
|
---|
1198 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
|
---|
1199 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
|
---|
1200 | "&vprotd (@x[$d0],@x[$d0],8)",
|
---|
1201 | "&vprotd (@x[$d1],@x[$d1],8)",
|
---|
1202 | "&vprotd (@x[$d2],@x[$d2],8)",
|
---|
1203 | "&vprotd (@x[$d3],@x[$d3],8)",
|
---|
1204 |
|
---|
1205 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
|
---|
1206 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
|
---|
1207 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
|
---|
1208 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
|
---|
1209 | "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
|
---|
1210 | "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
|
---|
1211 | "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
|
---|
1212 | "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
|
---|
1213 | "&vprotd (@x[$b0],@x[$b0],7)",
|
---|
1214 | "&vprotd (@x[$b1],@x[$b1],7)",
|
---|
1215 | "&vprotd (@x[$b2],@x[$b2],7)",
|
---|
1216 | "&vprotd (@x[$b3],@x[$b3],7)"
|
---|
1217 | );
|
---|
1218 | }
|
---|
1219 |
|
---|
1220 | my $xframe = $win64 ? 0xa0 : 0;
|
---|
1221 |
|
---|
1222 | $code.=<<___;
|
---|
1223 | .type ChaCha20_4xop,\@function,5
|
---|
1224 | .align 32
|
---|
1225 | ChaCha20_4xop:
|
---|
1226 | .LChaCha20_4xop:
|
---|
1227 | lea -0x78(%rsp),%r11
|
---|
1228 | sub \$0x148+$xframe,%rsp
|
---|
1229 | ___
|
---|
1230 | ################ stack layout
|
---|
1231 | # +0x00 SIMD equivalent of @x[8-12]
|
---|
1232 | # ...
|
---|
1233 | # +0x40 constant copy of key[0-2] smashed by lanes
|
---|
1234 | # ...
|
---|
1235 | # +0x100 SIMD counters (with nonce smashed by lanes)
|
---|
1236 | # ...
|
---|
1237 | # +0x140
|
---|
1238 | $code.=<<___ if ($win64);
|
---|
1239 | movaps %xmm6,-0x30(%r11)
|
---|
1240 | movaps %xmm7,-0x20(%r11)
|
---|
1241 | movaps %xmm8,-0x10(%r11)
|
---|
1242 | movaps %xmm9,0x00(%r11)
|
---|
1243 | movaps %xmm10,0x10(%r11)
|
---|
1244 | movaps %xmm11,0x20(%r11)
|
---|
1245 | movaps %xmm12,0x30(%r11)
|
---|
1246 | movaps %xmm13,0x40(%r11)
|
---|
1247 | movaps %xmm14,0x50(%r11)
|
---|
1248 | movaps %xmm15,0x60(%r11)
|
---|
1249 | ___
|
---|
1250 | $code.=<<___;
|
---|
1251 | vzeroupper
|
---|
1252 |
|
---|
1253 | vmovdqa .Lsigma(%rip),$xa3 # key[0]
|
---|
1254 | vmovdqu ($key),$xb3 # key[1]
|
---|
1255 | vmovdqu 16($key),$xt3 # key[2]
|
---|
1256 | vmovdqu ($counter),$xd3 # key[3]
|
---|
1257 | lea 0x100(%rsp),%rcx # size optimization
|
---|
1258 |
|
---|
1259 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
|
---|
1260 | vpshufd \$0x55,$xa3,$xa1
|
---|
1261 | vmovdqa $xa0,0x40(%rsp) # ... and offload
|
---|
1262 | vpshufd \$0xaa,$xa3,$xa2
|
---|
1263 | vmovdqa $xa1,0x50(%rsp)
|
---|
1264 | vpshufd \$0xff,$xa3,$xa3
|
---|
1265 | vmovdqa $xa2,0x60(%rsp)
|
---|
1266 | vmovdqa $xa3,0x70(%rsp)
|
---|
1267 |
|
---|
1268 | vpshufd \$0x00,$xb3,$xb0
|
---|
1269 | vpshufd \$0x55,$xb3,$xb1
|
---|
1270 | vmovdqa $xb0,0x80-0x100(%rcx)
|
---|
1271 | vpshufd \$0xaa,$xb3,$xb2
|
---|
1272 | vmovdqa $xb1,0x90-0x100(%rcx)
|
---|
1273 | vpshufd \$0xff,$xb3,$xb3
|
---|
1274 | vmovdqa $xb2,0xa0-0x100(%rcx)
|
---|
1275 | vmovdqa $xb3,0xb0-0x100(%rcx)
|
---|
1276 |
|
---|
1277 | vpshufd \$0x00,$xt3,$xt0 # "$xc0"
|
---|
1278 | vpshufd \$0x55,$xt3,$xt1 # "$xc1"
|
---|
1279 | vmovdqa $xt0,0xc0-0x100(%rcx)
|
---|
1280 | vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
|
---|
1281 | vmovdqa $xt1,0xd0-0x100(%rcx)
|
---|
1282 | vpshufd \$0xff,$xt3,$xt3 # "$xc3"
|
---|
1283 | vmovdqa $xt2,0xe0-0x100(%rcx)
|
---|
1284 | vmovdqa $xt3,0xf0-0x100(%rcx)
|
---|
1285 |
|
---|
1286 | vpshufd \$0x00,$xd3,$xd0
|
---|
1287 | vpshufd \$0x55,$xd3,$xd1
|
---|
1288 | vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
|
---|
1289 | vpshufd \$0xaa,$xd3,$xd2
|
---|
1290 | vmovdqa $xd1,0x110-0x100(%rcx)
|
---|
1291 | vpshufd \$0xff,$xd3,$xd3
|
---|
1292 | vmovdqa $xd2,0x120-0x100(%rcx)
|
---|
1293 | vmovdqa $xd3,0x130-0x100(%rcx)
|
---|
1294 |
|
---|
1295 | jmp .Loop_enter4xop
|
---|
1296 |
|
---|
1297 | .align 32
|
---|
1298 | .Loop_outer4xop:
|
---|
1299 | vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
|
---|
1300 | vmovdqa 0x50(%rsp),$xa1
|
---|
1301 | vmovdqa 0x60(%rsp),$xa2
|
---|
1302 | vmovdqa 0x70(%rsp),$xa3
|
---|
1303 | vmovdqa 0x80-0x100(%rcx),$xb0
|
---|
1304 | vmovdqa 0x90-0x100(%rcx),$xb1
|
---|
1305 | vmovdqa 0xa0-0x100(%rcx),$xb2
|
---|
1306 | vmovdqa 0xb0-0x100(%rcx),$xb3
|
---|
1307 | vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
|
---|
1308 | vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
|
---|
1309 | vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
|
---|
1310 | vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
|
---|
1311 | vmovdqa 0x100-0x100(%rcx),$xd0
|
---|
1312 | vmovdqa 0x110-0x100(%rcx),$xd1
|
---|
1313 | vmovdqa 0x120-0x100(%rcx),$xd2
|
---|
1314 | vmovdqa 0x130-0x100(%rcx),$xd3
|
---|
1315 | vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
|
---|
1316 |
|
---|
1317 | .Loop_enter4xop:
|
---|
1318 | mov \$10,%eax
|
---|
1319 | vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
|
---|
1320 | jmp .Loop4xop
|
---|
1321 |
|
---|
1322 | .align 32
|
---|
1323 | .Loop4xop:
|
---|
1324 | ___
|
---|
1325 | foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
|
---|
1326 | foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
|
---|
1327 | $code.=<<___;
|
---|
1328 | dec %eax
|
---|
1329 | jnz .Loop4xop
|
---|
1330 |
|
---|
1331 | vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
|
---|
1332 | vpaddd 0x50(%rsp),$xa1,$xa1
|
---|
1333 | vpaddd 0x60(%rsp),$xa2,$xa2
|
---|
1334 | vpaddd 0x70(%rsp),$xa3,$xa3
|
---|
1335 |
|
---|
1336 | vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
|
---|
1337 | vmovdqa $xt3,0x30(%rsp)
|
---|
1338 |
|
---|
1339 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
|
---|
1340 | vpunpckldq $xa3,$xa2,$xt3
|
---|
1341 | vpunpckhdq $xa1,$xa0,$xa0
|
---|
1342 | vpunpckhdq $xa3,$xa2,$xa2
|
---|
1343 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
|
---|
1344 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
|
---|
1345 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
|
---|
1346 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
|
---|
1347 | ___
|
---|
1348 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
|
---|
1349 | $code.=<<___;
|
---|
1350 | vpaddd 0x80-0x100(%rcx),$xb0,$xb0
|
---|
1351 | vpaddd 0x90-0x100(%rcx),$xb1,$xb1
|
---|
1352 | vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
|
---|
1353 | vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
|
---|
1354 |
|
---|
1355 | vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
|
---|
1356 | vmovdqa $xa1,0x10(%rsp)
|
---|
1357 | vmovdqa 0x20(%rsp),$xa0 # "xc2"
|
---|
1358 | vmovdqa 0x30(%rsp),$xa1 # "xc3"
|
---|
1359 |
|
---|
1360 | vpunpckldq $xb1,$xb0,$xt2
|
---|
1361 | vpunpckldq $xb3,$xb2,$xt3
|
---|
1362 | vpunpckhdq $xb1,$xb0,$xb0
|
---|
1363 | vpunpckhdq $xb3,$xb2,$xb2
|
---|
1364 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
|
---|
1365 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
|
---|
1366 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
|
---|
1367 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
|
---|
1368 | ___
|
---|
1369 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
|
---|
1370 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
|
---|
1371 | $code.=<<___;
|
---|
1372 | vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
|
---|
1373 | vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
|
---|
1374 | vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
|
---|
1375 | vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
|
---|
1376 |
|
---|
1377 | vpunpckldq $xc1,$xc0,$xt2
|
---|
1378 | vpunpckldq $xc3,$xc2,$xt3
|
---|
1379 | vpunpckhdq $xc1,$xc0,$xc0
|
---|
1380 | vpunpckhdq $xc3,$xc2,$xc2
|
---|
1381 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
|
---|
1382 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
|
---|
1383 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
|
---|
1384 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
|
---|
1385 | ___
|
---|
1386 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
|
---|
1387 | $code.=<<___;
|
---|
1388 | vpaddd 0x100-0x100(%rcx),$xd0,$xd0
|
---|
1389 | vpaddd 0x110-0x100(%rcx),$xd1,$xd1
|
---|
1390 | vpaddd 0x120-0x100(%rcx),$xd2,$xd2
|
---|
1391 | vpaddd 0x130-0x100(%rcx),$xd3,$xd3
|
---|
1392 |
|
---|
1393 | vpunpckldq $xd1,$xd0,$xt2
|
---|
1394 | vpunpckldq $xd3,$xd2,$xt3
|
---|
1395 | vpunpckhdq $xd1,$xd0,$xd0
|
---|
1396 | vpunpckhdq $xd3,$xd2,$xd2
|
---|
1397 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
|
---|
1398 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
|
---|
1399 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
|
---|
1400 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
|
---|
1401 | ___
|
---|
1402 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
|
---|
1403 | ($xa0,$xa1)=($xt2,$xt3);
|
---|
1404 | $code.=<<___;
|
---|
1405 | vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
|
---|
1406 | vmovdqa 0x10(%rsp),$xa1
|
---|
1407 |
|
---|
1408 | cmp \$64*4,$len
|
---|
1409 | jb .Ltail4xop
|
---|
1410 |
|
---|
1411 | vpxor 0x00($inp),$xa0,$xa0 # xor with input
|
---|
1412 | vpxor 0x10($inp),$xb0,$xb0
|
---|
1413 | vpxor 0x20($inp),$xc0,$xc0
|
---|
1414 | vpxor 0x30($inp),$xd0,$xd0
|
---|
1415 | vpxor 0x40($inp),$xa1,$xa1
|
---|
1416 | vpxor 0x50($inp),$xb1,$xb1
|
---|
1417 | vpxor 0x60($inp),$xc1,$xc1
|
---|
1418 | vpxor 0x70($inp),$xd1,$xd1
|
---|
1419 | lea 0x80($inp),$inp # size optimization
|
---|
1420 | vpxor 0x00($inp),$xa2,$xa2
|
---|
1421 | vpxor 0x10($inp),$xb2,$xb2
|
---|
1422 | vpxor 0x20($inp),$xc2,$xc2
|
---|
1423 | vpxor 0x30($inp),$xd2,$xd2
|
---|
1424 | vpxor 0x40($inp),$xa3,$xa3
|
---|
1425 | vpxor 0x50($inp),$xb3,$xb3
|
---|
1426 | vpxor 0x60($inp),$xc3,$xc3
|
---|
1427 | vpxor 0x70($inp),$xd3,$xd3
|
---|
1428 | lea 0x80($inp),$inp # inp+=64*4
|
---|
1429 |
|
---|
1430 | vmovdqu $xa0,0x00($out)
|
---|
1431 | vmovdqu $xb0,0x10($out)
|
---|
1432 | vmovdqu $xc0,0x20($out)
|
---|
1433 | vmovdqu $xd0,0x30($out)
|
---|
1434 | vmovdqu $xa1,0x40($out)
|
---|
1435 | vmovdqu $xb1,0x50($out)
|
---|
1436 | vmovdqu $xc1,0x60($out)
|
---|
1437 | vmovdqu $xd1,0x70($out)
|
---|
1438 | lea 0x80($out),$out # size optimization
|
---|
1439 | vmovdqu $xa2,0x00($out)
|
---|
1440 | vmovdqu $xb2,0x10($out)
|
---|
1441 | vmovdqu $xc2,0x20($out)
|
---|
1442 | vmovdqu $xd2,0x30($out)
|
---|
1443 | vmovdqu $xa3,0x40($out)
|
---|
1444 | vmovdqu $xb3,0x50($out)
|
---|
1445 | vmovdqu $xc3,0x60($out)
|
---|
1446 | vmovdqu $xd3,0x70($out)
|
---|
1447 | lea 0x80($out),$out # out+=64*4
|
---|
1448 |
|
---|
1449 | sub \$64*4,$len
|
---|
1450 | jnz .Loop_outer4xop
|
---|
1451 |
|
---|
1452 | jmp .Ldone4xop
|
---|
1453 |
|
---|
1454 | .align 32
|
---|
1455 | .Ltail4xop:
|
---|
1456 | cmp \$192,$len
|
---|
	jae	.L192_or_more4xop
	cmp	\$128,$len
	jae	.L128_or_more4xop
	cmp	\$64,$len
	jae	.L64_or_more4xop

	xor	%r10,%r10
	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xb0,0x10(%rsp)
	vmovdqa	$xc0,0x20(%rsp)
	vmovdqa	$xd0,0x30(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L64_or_more4xop:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	je	.Ldone4xop

	lea	0x40($inp),$inp		# inp+=64*1
	vmovdqa	$xa1,0x00(%rsp)
	xor	%r10,%r10
	vmovdqa	$xb1,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*1
	vmovdqa	$xc1,0x20(%rsp)
	sub	\$64,$len		# len-=64*1
	vmovdqa	$xd1,0x30(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L128_or_more4xop:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1

	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	je	.Ldone4xop

	lea	0x80($inp),$inp		# inp+=64*2
	vmovdqa	$xa2,0x00(%rsp)
	xor	%r10,%r10
	vmovdqa	$xb2,0x10(%rsp)
	lea	0x80($out),$out		# out+=64*2
	vmovdqa	$xc2,0x20(%rsp)
	sub	\$128,$len		# len-=64*2
	vmovdqa	$xd2,0x30(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L192_or_more4xop:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x10($inp),$xb2,$xb2
	vpxor	0x20($inp),$xc2,$xc2
	vpxor	0x30($inp),$xd2,$xd2

	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	lea	0x80($out),$out		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x10($out)
	vmovdqu	$xc2,0x20($out)
	vmovdqu	$xd2,0x30($out)
	je	.Ldone4xop

	lea	0x40($inp),$inp		# inp+=64*3
	vmovdqa	$xa3,0x00(%rsp)
	xor	%r10,%r10
	vmovdqa	$xb3,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*3
	vmovdqa	$xc3,0x20(%rsp)
	sub	\$192,$len		# len-=64*3
	vmovdqa	$xd3,0x30(%rsp)

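	# Byte-wise tail: the code above spilled the next 64 bytes of
	# keystream to the scratch area at (%rsp) and cleared %r10; xor
	# the remaining (<64) input bytes with it one byte at a time.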
.Loop_tail4xop:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail4xop

.Ldone4xop:
	vzeroupper
___
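# On WIN64, %xmm6-%xmm15 are non-volatile; they were saved above the local
# frame in the prologue and are restored here before the frame is released.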
$code.=<<___ if ($win64);
	lea	0x140+0x30(%rsp),%r11
	movaps	-0x30(%r11),%xmm6
	movaps	-0x20(%r11),%xmm7
	movaps	-0x10(%r11),%xmm8
	movaps	0x00(%r11),%xmm9
	movaps	0x10(%r11),%xmm10
	movaps	0x20(%r11),%xmm11
	movaps	0x30(%r11),%xmm12
	movaps	0x40(%r11),%xmm13
	movaps	0x50(%r11),%xmm14
	movaps	0x60(%r11),%xmm15
___
$code.=<<___;
	add	\$0x148+$xframe,%rsp
	ret
.size	ChaCha20_4xop,.-ChaCha20_4xop
___
}

########################################################################
# AVX2 code path
if ($avx>1) {
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider the order in which the variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of 'c's
	# stays invariant between rounds. This means that we only have to
	# reload them once per round, in the middle. This is why you'll
	# see a bunch of 'c' stores and loads in the middle, but none at
	# the beginning or end.
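	# For example, the even-round call AVX2_lane_ROUND(0, 4, 8,12)
	# derives the remaining column quarter-rounds (1,5,9,13),
	# (2,6,10,14) and (3,7,11,15) via ($_&~3)+(($_+1)&3), which keeps
	# each index in its group of four while stepping within the group;
	# the odd-round call (0, 5,10,15) likewise yields the diagonals
	# (1,6,11,12), (2,7,8,13) and (3,4,9,14).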

	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t1)",
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpxor		(@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb	(@x[$d1],@x[$d1],$t1)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t0,@x[$b0],12)",
	"&vpsrld	(@x[$b0],@x[$b0],20)",
	"&vpor		(@x[$b0],$t0,@x[$b0])",
	"&vbroadcasti128($t0,'(%r11)')",	# .Lrot24(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d1])",
	"&vpxor		(@x[$b1],$xc_,@x[$b1])",
	"&vpslld	($t1,@x[$b1],12)",
	"&vpsrld	(@x[$b1],@x[$b1],20)",
	"&vpor		(@x[$b1],$t1,@x[$b1])",

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t0)",
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
	"&vpxor		(@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb	(@x[$d1],@x[$d1],$t0)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t1,@x[$b0],7)",
	"&vpsrld	(@x[$b0],@x[$b0],25)",
	"&vpor		(@x[$b0],$t1,@x[$b0])",
	"&vbroadcasti128($t1,'(%r10)')",	# .Lrot16(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d1])",
	"&vpxor		(@x[$b1],$xc_,@x[$b1])",
	"&vpslld	($t0,@x[$b1],7)",
	"&vpsrld	(@x[$b1],@x[$b1],25)",
	"&vpor		(@x[$b1],$t0,@x[$b1])",

	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	"&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
	"&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t1)",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor		(@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb	(@x[$d3],@x[$d3],$t1)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t0,@x[$b2],12)",
	"&vpsrld	(@x[$b2],@x[$b2],20)",
	"&vpor		(@x[$b2],$t0,@x[$b2])",
	"&vbroadcasti128($t0,'(%r11)')",	# .Lrot24(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d3])",
	"&vpxor		(@x[$b3],$xc_,@x[$b3])",
	"&vpslld	($t1,@x[$b3],12)",
	"&vpsrld	(@x[$b3],@x[$b3],20)",
	"&vpor		(@x[$b3],$t1,@x[$b3])",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t0)",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor		(@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb	(@x[$d3],@x[$d3],$t0)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t1,@x[$b2],7)",
	"&vpsrld	(@x[$b2],@x[$b2],25)",
	"&vpor		(@x[$b2],$t1,@x[$b2])",
	"&vbroadcasti128($t1,'(%r10)')",	# .Lrot16(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d3])",
	"&vpxor		(@x[$b3],$xc_,@x[$b3])",
	"&vpslld	($t0,@x[$b3],7)",
	"&vpsrld	(@x[$b3],@x[$b3],25)",
	"&vpor		(@x[$b3],$t0,@x[$b3])"
	);
}
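# For reference, each 32-bit lane of the interleaved code emitted above
# computes the standard ChaCha20 quarter-round (a scalar sketch only, not
# used by this module):
#
#	a += b; d ^= a; d = rotl32(d,16);
#	c += d; b ^= c; b = rotl32(b,12);
#	a += b; d ^= a; d = rotl32(d, 8);
#	c += d; b ^= c; b = rotl32(b, 7);
#
# The 16- and 8-bit rotations are done with vpshufb and the .Lrot16/.Lrot24
# masks, the 12- and 7-bit ones with vpslld/vpsrld/vpor pairs.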

my $xframe = $win64 ? 0xb0 : 8;

$code.=<<___;
.type	ChaCha20_8x,\@function,5
.align	32
ChaCha20_8x:
.LChaCha20_8x:
	mov	%rsp,%r10
	sub	\$0x280+$xframe,%rsp
	and	\$-32,%rsp
___
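# WIN64 ABI: %xmm6-%xmm15 are non-volatile, so stash them above the local
# frame before the code below clobbers them; .Ldone8x restores them.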
$code.=<<___ if ($win64);
	lea	0x290+0x30(%rsp),%r11
	movaps	%xmm6,-0x30(%r11)
	movaps	%xmm7,-0x20(%r11)
	movaps	%xmm8,-0x10(%r11)
	movaps	%xmm9,0x00(%r11)
	movaps	%xmm10,0x10(%r11)
	movaps	%xmm11,0x20(%r11)
	movaps	%xmm12,0x30(%r11)
	movaps	%xmm13,0x40(%r11)
	movaps	%xmm14,0x50(%r11)
	movaps	%xmm15,0x60(%r11)
___
$code.=<<___;
	vzeroupper
	mov	%r10,0x280(%rsp)

	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# ...
	# +0x80		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x200	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x280	saved %rsp

	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xt3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	lea	0x200(%rsp),%rax	# size optimization
	lea	.Lrot16(%rip),%r10
	lea	.Lrot24(%rip),%r11

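	# vbroadcasti128 placed the same 128 bits in both halves of each
	# register, so vpshufd with 0x00/0x55/0xaa/0xff below replicates
	# word 0/1/2/3 across all eight 32-bit lanes; the results are
	# offloaded to the stack so .Loop_outer8x can reload them.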
	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vmovdqa	$xa0,0x80-0x100(%rcx)	# ... and offload
	vpshufd	\$0xaa,$xa3,$xa2
	vmovdqa	$xa1,0xa0-0x100(%rcx)
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa	$xa2,0xc0-0x100(%rcx)
	vmovdqa	$xa3,0xe0-0x100(%rcx)

	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vmovdqa	$xb0,0x100-0x100(%rcx)
	vpshufd	\$0xaa,$xb3,$xb2
	vmovdqa	$xb1,0x120-0x100(%rcx)
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa	$xb2,0x140-0x100(%rcx)
	vmovdqa	$xb3,0x160-0x100(%rcx)

	vpshufd	\$0x00,$xt3,$xt0	# "xc0"
	vpshufd	\$0x55,$xt3,$xt1	# "xc1"
	vmovdqa	$xt0,0x180-0x200(%rax)
	vpshufd	\$0xaa,$xt3,$xt2	# "xc2"
	vmovdqa	$xt1,0x1a0-0x200(%rax)
	vpshufd	\$0xff,$xt3,$xt3	# "xc3"
	vmovdqa	$xt2,0x1c0-0x200(%rax)
	vmovdqa	$xt3,0x1e0-0x200(%rax)

	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpaddd	.Lincy(%rip),$xd0,$xd0	# don't save counters yet
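	# give every lane its own block counter (per-lane increments come
	# from .Lincy); the counters are stored at +0x200 once the code
	# reaches .Loop_enter8x, and .Leight then advances all eight lanes
	# on each pass through .Loop_outer8x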
	vpshufd	\$0xaa,$xd3,$xd2
	vmovdqa	$xd1,0x220-0x200(%rax)
	vpshufd	\$0xff,$xd3,$xd3
	vmovdqa	$xd2,0x240-0x200(%rax)
	vmovdqa	$xd3,0x260-0x200(%rax)

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa	0x80-0x100(%rcx),$xa0	# re-load smashed key
	vmovdqa	0xa0-0x100(%rcx),$xa1
	vmovdqa	0xc0-0x100(%rcx),$xa2
	vmovdqa	0xe0-0x100(%rcx),$xa3
	vmovdqa	0x100-0x100(%rcx),$xb0
	vmovdqa	0x120-0x100(%rcx),$xb1
	vmovdqa	0x140-0x100(%rcx),$xb2
	vmovdqa	0x160-0x100(%rcx),$xb3
	vmovdqa	0x180-0x200(%rax),$xt0	# "xc0"
	vmovdqa	0x1a0-0x200(%rax),$xt1	# "xc1"
	vmovdqa	0x1c0-0x200(%rax),$xt2	# "xc2"
	vmovdqa	0x1e0-0x200(%rax),$xt3	# "xc3"
	vmovdqa	0x200-0x200(%rax),$xd0
	vmovdqa	0x220-0x200(%rax),$xd1
	vmovdqa	0x240-0x200(%rax),$xd2
	vmovdqa	0x260-0x200(%rax),$xd3
	vpaddd	.Leight(%rip),$xd0,$xd0	# next SIMD counters

.Loop_enter8x:
	vmovdqa	$xt2,0x40(%rsp)		# SIMD equivalent of "@x[10]"
	vmovdqa	$xt3,0x60(%rsp)		# SIMD equivalent of "@x[11]"
	vbroadcasti128	(%r10),$xt3
	vmovdqa	$xd0,0x200-0x200(%rax)	# save SIMD counters
	mov	\$10,%eax
	jmp	.Loop8x

.align	32
.Loop8x:
___
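# Each pass through .Loop8x evaluates one even (column) and one odd
# (diagonal) set of quarter-rounds; with %eax preset to 10 this gives the
# 20 rounds of ChaCha20.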
foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop8x

	lea	0x200(%rsp),%rax	# size optimization
	vpaddd	0x80-0x100(%rcx),$xa0,$xa0	# accumulate key
	vpaddd	0xa0-0x100(%rcx),$xa1,$xa1
	vpaddd	0xc0-0x100(%rcx),$xa2,$xa2
	vpaddd	0xe0-0x100(%rcx),$xa3,$xa3

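	# At this point register n of each group holds word n of eight
	# different blocks, one per 32-bit lane. The vpunpck/vperm2i128
	# sequences below transpose that layout so every register ends up
	# holding 32 consecutive bytes of keystream in output order.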
	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd	0x100-0x100(%rcx),$xb0,$xb0
	vpaddd	0x120-0x100(%rcx),$xb1,$xb1
	vpaddd	0x140-0x100(%rcx),$xb2,$xb2
	vpaddd	0x160-0x100(%rcx),$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
$code.=<<___;
	vperm2i128	\$0x20,$xb0,$xa0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xb0,$xa0,$xb0
	vperm2i128	\$0x20,$xb1,$xa1,$xa0
	vperm2i128	\$0x31,$xb1,$xa1,$xb1
	vperm2i128	\$0x20,$xb2,$xa2,$xa1
	vperm2i128	\$0x31,$xb2,$xa2,$xb2
	vperm2i128	\$0x20,$xb3,$xa3,$xa2
	vperm2i128	\$0x31,$xb3,$xa3,$xb3
___
($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	vmovdqa	$xa0,0x00(%rsp)		# offload $xaN
	vmovdqa	$xa1,0x20(%rsp)
	vmovdqa	0x40(%rsp),$xc2		# $xa0
	vmovdqa	0x60(%rsp),$xc3		# $xa1

	vpaddd	0x180-0x200(%rax),$xc0,$xc0
	vpaddd	0x1a0-0x200(%rax),$xc1,$xc1
	vpaddd	0x1c0-0x200(%rax),$xc2,$xc2
	vpaddd	0x1e0-0x200(%rax),$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd	0x200-0x200(%rax),$xd0,$xd0
	vpaddd	0x220-0x200(%rax),$xd1,$xd1
	vpaddd	0x240-0x200(%rax),$xd2,$xd2
	vpaddd	0x260-0x200(%rax),$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
$code.=<<___;
	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xd0,$xc0,$xd0
	vperm2i128	\$0x20,$xd1,$xc1,$xc0
	vperm2i128	\$0x31,$xd1,$xc1,$xd1
	vperm2i128	\$0x20,$xd2,$xc2,$xc1
	vperm2i128	\$0x31,$xd2,$xc2,$xd2
	vperm2i128	\$0x20,$xd3,$xc3,$xc2
	vperm2i128	\$0x31,$xd3,$xc3,$xd3
___
($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
($xa0,$xa1)=($xt2,$xt3);
$code.=<<___;
	vmovdqa	0x00(%rsp),$xa0		# $xaN was offloaded, remember?
	vmovdqa	0x20(%rsp),$xa1

	cmp	\$64*8,$len
	jb	.Ltail8x

	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	lea	0x80($inp),$inp		# size optimization
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	lea	0x80($out),$out		# size optimization

	vpxor	0x00($inp),$xa1,$xa1
	vpxor	0x20($inp),$xb1,$xb1
	vpxor	0x40($inp),$xc1,$xc1
	vpxor	0x60($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vmovdqu	$xa1,0x00($out)
	vmovdqu	$xb1,0x20($out)
	vmovdqu	$xc1,0x40($out)
	vmovdqu	$xd1,0x60($out)
	lea	0x80($out),$out		# size optimization

	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x20($inp),$xb2,$xb2
	vpxor	0x40($inp),$xc2,$xc2
	vpxor	0x60($inp),$xd2,$xd2
	lea	0x80($inp),$inp		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x20($out)
	vmovdqu	$xc2,0x40($out)
	vmovdqu	$xd2,0x60($out)
	lea	0x80($out),$out		# size optimization

	vpxor	0x00($inp),$xa3,$xa3
	vpxor	0x20($inp),$xb3,$xb3
	vpxor	0x40($inp),$xc3,$xc3
	vpxor	0x60($inp),$xd3,$xd3
	lea	0x80($inp),$inp		# size optimization
	vmovdqu	$xa3,0x00($out)
	vmovdqu	$xb3,0x20($out)
	vmovdqu	$xc3,0x40($out)
	vmovdqu	$xd3,0x60($out)
	lea	0x80($out),$out		# size optimization

	sub	\$64*8,$len
	jnz	.Loop_outer8x

	jmp	.Ldone8x

.Ltail8x:
	cmp	\$448,$len
	jae	.L448_or_more8x
	cmp	\$384,$len
	jae	.L384_or_more8x
	cmp	\$320,$len
	jae	.L320_or_more8x
	cmp	\$256,$len
	jae	.L256_or_more8x
	cmp	\$192,$len
	jae	.L192_or_more8x
	cmp	\$128,$len
	jae	.L128_or_more8x
	cmp	\$64,$len
	jae	.L64_or_more8x

	xor	%r10,%r10
	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xb0,0x20(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	je	.Ldone8x

	lea	0x40($inp),$inp		# inp+=64*1
	xor	%r10,%r10
	vmovdqa	$xc0,0x00(%rsp)
	lea	0x40($out),$out		# out+=64*1
	sub	\$64,$len		# len-=64*1
	vmovdqa	$xd0,0x20(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	je	.Ldone8x

	lea	0x80($inp),$inp		# inp+=64*2
	xor	%r10,%r10
	vmovdqa	$xa1,0x00(%rsp)
	lea	0x80($out),$out		# out+=64*2
	sub	\$128,$len		# len-=64*2
	vmovdqa	$xb1,0x20(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	je	.Ldone8x

	lea	0xc0($inp),$inp		# inp+=64*3
	xor	%r10,%r10
	vmovdqa	$xc1,0x00(%rsp)
	lea	0xc0($out),$out		# out+=64*3
	sub	\$192,$len		# len-=64*3
	vmovdqa	$xd1,0x20(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vpxor	0xc0($inp),$xc1,$xc1
	vpxor	0xe0($inp),$xd1,$xd1
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	vmovdqu	$xc1,0xc0($out)
	vmovdqu	$xd1,0xe0($out)
	je	.Ldone8x

	lea	0x100($inp),$inp	# inp+=64*4
	xor	%r10,%r10
	vmovdqa	$xa2,0x00(%rsp)
	lea	0x100($out),$out	# out+=64*4
	sub	\$256,$len		# len-=64*4
	vmovdqa	$xb2,0x20(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vpxor	0xc0($inp),$xc1,$xc1
	vpxor	0xe0($inp),$xd1,$xd1
	vpxor	0x100($inp),$xa2,$xa2
	vpxor	0x120($inp),$xb2,$xb2
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	vmovdqu	$xc1,0xc0($out)
	vmovdqu	$xd1,0xe0($out)
	vmovdqu	$xa2,0x100($out)
	vmovdqu	$xb2,0x120($out)
	je	.Ldone8x

	lea	0x140($inp),$inp	# inp+=64*5
	xor	%r10,%r10
	vmovdqa	$xc2,0x00(%rsp)
	lea	0x140($out),$out	# out+=64*5
	sub	\$320,$len		# len-=64*5
	vmovdqa	$xd2,0x20(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vpxor	0xc0($inp),$xc1,$xc1
	vpxor	0xe0($inp),$xd1,$xd1
	vpxor	0x100($inp),$xa2,$xa2
	vpxor	0x120($inp),$xb2,$xb2
	vpxor	0x140($inp),$xc2,$xc2
	vpxor	0x160($inp),$xd2,$xd2
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	vmovdqu	$xc1,0xc0($out)
	vmovdqu	$xd1,0xe0($out)
	vmovdqu	$xa2,0x100($out)
	vmovdqu	$xb2,0x120($out)
	vmovdqu	$xc2,0x140($out)
	vmovdqu	$xd2,0x160($out)
	je	.Ldone8x

	lea	0x180($inp),$inp	# inp+=64*6
	xor	%r10,%r10
	vmovdqa	$xa3,0x00(%rsp)
	lea	0x180($out),$out	# out+=64*6
	sub	\$384,$len		# len-=64*6
	vmovdqa	$xb3,0x20(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vpxor	0xc0($inp),$xc1,$xc1
	vpxor	0xe0($inp),$xd1,$xd1
	vpxor	0x100($inp),$xa2,$xa2
	vpxor	0x120($inp),$xb2,$xb2
	vpxor	0x140($inp),$xc2,$xc2
	vpxor	0x160($inp),$xd2,$xd2
	vpxor	0x180($inp),$xa3,$xa3
	vpxor	0x1a0($inp),$xb3,$xb3
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	vmovdqu	$xc1,0xc0($out)
	vmovdqu	$xd1,0xe0($out)
	vmovdqu	$xa2,0x100($out)
	vmovdqu	$xb2,0x120($out)
	vmovdqu	$xc2,0x140($out)
	vmovdqu	$xd2,0x160($out)
	vmovdqu	$xa3,0x180($out)
	vmovdqu	$xb3,0x1a0($out)
	je	.Ldone8x

	lea	0x1c0($inp),$inp	# inp+=64*7
	xor	%r10,%r10
	vmovdqa	$xc3,0x00(%rsp)
	lea	0x1c0($out),$out	# out+=64*7
	sub	\$448,$len		# len-=64*7
	vmovdqa	$xd3,0x20(%rsp)

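	# byte-wise tail: xor the remaining (<64) input bytes with the
	# keystream spilled at (%rsp) above, indexed by %r10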
.Loop_tail8x:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
___
$code.=<<___ if ($win64);
	lea	0x290+0x30(%rsp),%r11
	movaps	-0x30(%r11),%xmm6
	movaps	-0x20(%r11),%xmm7
	movaps	-0x10(%r11),%xmm8
	movaps	0x00(%r11),%xmm9
	movaps	0x10(%r11),%xmm10
	movaps	0x20(%r11),%xmm11
	movaps	0x30(%r11),%xmm12
	movaps	0x40(%r11),%xmm13
	movaps	0x50(%r11),%xmm14
	movaps	0x60(%r11),%xmm15
___
$code.=<<___;
	mov	0x280(%rsp),%rsp
	ret
.size	ChaCha20_8x,.-ChaCha20_8x
___
}

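# Output filter: backticked expressions such as 32*($c0-8) in the generated
# text are evaluated as Perl arithmetic before each line is printed.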
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/%x#%y/%x/go;

	print $_,"\n";
}

close STDOUT;