VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.3/crypto/genasm-nasm/chacha-x86_64.S@ 95218

Last change on this file since 95218 was 94083, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: Recreate asm files, bugref:10128

File size: 84.9 KB
Line 
1default rel
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section .text code align=64
6
7
8EXTERN OPENSSL_ia32cap_P
9
; Constant tables shared by the ChaCha20 code paths in this file.
; NOTE(review): in the part of the file reviewed here only $L$one, $L$inc,
; $L$four, $L$rot16, $L$rot24 and $L$sigma are referenced; the other tables
; are presumably consumed by wider-vector paths further down -- confirm
; before pruning anything.
10ALIGN 64
11$L$zero:
12 DD 0,0,0,0
; {1,0,0,0}: added to the 16-byte counter row to advance its low dword by one.
13$L$one:
14 DD 1,0,0,0
; {0,1,2,3}: per-lane offsets added to the broadcast counter word (4-block paths).
15$L$inc:
16 DD 0,1,2,3
; {4,4,4,4}: counter step applied once per outer iteration of the 4-block paths.
17$L$four:
18 DD 4,4,4,4
19$L$incy:
20 DD 0,2,4,6,1,3,5,7
21$L$eight:
22 DD 8,8,8,8,8,8,8,8
; Byte-shuffle mask: within every dword, result bytes = source bytes 2,3,0,1,
; i.e. each 32-bit lane rotated left by 16 when used as a pshufb control.
23$L$rot16:
24DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; Byte-shuffle mask: result bytes = source bytes 3,0,1,2 per dword,
; i.e. each 32-bit lane rotated right by 24 (= left by 8).
25$L$rot24:
26DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
27$L$twoy:
28 DD 2,0,0,0,2,0,0,0
29ALIGN 64
30$L$zeroz:
31 DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
32$L$fourz:
33 DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
34$L$incz:
35 DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
36$L$sixteen:
37 DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
; ASCII "expand 32-byte k" -- state words 0..3. The scalar path uses the same
; 16 bytes as four little-endian immediates (0x61707865, ...).
; The label sits 256 bytes after the ALIGN 64 above, so the movdqa loads of
; it are 16-byte aligned.
38$L$sigma:
39DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
40DB 0
; NUL-terminated ASCII identification string:
; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
41DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
42DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
43DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
44DB 108,46,111,114,103,62,0
;-----------------------------------------------------------------------
; ChaCha20_ctr32 -- exported entry point, Microsoft x64 calling convention.
;
; In (Win64):  rcx = arg1, rdx = arg2, r8 = arg3, r9 = arg4, [40+rsp] = arg5
; The prologue remaps them to the registers the body uses:
;   rdi = output pointer      (written 64 bytes at a time / bytewise in tail)
;   rsi = input pointer       (XORed with the keystream)
;   rdx = length in bytes     (0 => return immediately)
;   rcx = pointer to 32 bytes loaded as state words 4..11
;   r8  = pointer to 16 bytes loaded as state words 12..15
; NOTE(review): upstream OpenSSL declares this as
;   void ChaCha20_ctr32(out, inp, len, const u32 key[8], const u32 counter[4])
; -- the prototype is not visible in this file; confirm against the header.
;
; rdi/rsi are callee-saved on Win64; they are parked in the first two slots
; of the caller-provided shadow space and restored at $L$no_data.
;-----------------------------------------------------------------------
45global ChaCha20_ctr32
46
47ALIGN 64
48ChaCha20_ctr32:
49 mov QWORD[8+rsp],rdi ;WIN64 prologue
50 mov QWORD[16+rsp],rsi
; rax = rsp at entry; the scalar path below overwrites eax without reading it.
51 mov rax,rsp
52$L$SEH_begin_ChaCha20_ctr32:
53 mov rdi,rcx
54 mov rsi,rdx
55 mov rdx,r8
56 mov rcx,r9
57 mov r8,QWORD[40+rsp]
58
59
60
; Zero-length request: nothing to do, skip straight to the register restore.
61 cmp rdx,0
62 je NEAR $L$no_data
; r10 = 64 bits of CPU capability flags starting at OPENSSL_ia32cap_P+4.
; Dispatch order: bit 48 -> $L$ChaCha20_avx512, bit 63 (sign) ->
; $L$ChaCha20_avx512vl, bit 9 -> $L$ChaCha20_ssse3; otherwise fall through
; to the scalar integer path. The targets are internal labels placed after
; the other routines' own prologues, so they inherit this prologue's state
; (rdi/rsi already saved, r10 still holding the capability word).
; NOTE(review): feature meaning of each bit is inferred from the label
; names; see OPENSSL_ia32cap documentation to confirm.
63 mov r10,QWORD[((OPENSSL_ia32cap_P+4))]
64 bt r10,48
65 jc NEAR $L$ChaCha20_avx512
66 test r10,r10
67 js NEAR $L$ChaCha20_avx512vl
68 test r10d,512
69 jnz NEAR $L$ChaCha20_ssse3
70
; Scalar path: save the six callee-saved GPRs it uses.
71 push rbx
72
73 push rbp
74
75 push r12
76
77 push r13
78
79 push r14
80
81 push r15
82
; Frame: [rsp+0..63]  = 64-byte state/keystream block,
;        [rsp+64..87] = saved remaining length, input ptr, output ptr.
; 6 pushes (48) + 88 = 136 bytes; with rsp = 8 (mod 16) at entry this leaves
; rsp 16-byte aligned, which the movdqa stores below require.
83 sub rsp,64+24
84
85$L$ctr32_body:
86
87
; xmm1/xmm2 = the 32 bytes at rcx, xmm3 = the 16 bytes at r8,
; xmm4 = {1,0,0,0} used to step the counter once per block.
88 movdqu xmm1,XMMWORD[rcx]
89 movdqu xmm2,XMMWORD[16+rcx]
90 movdqu xmm3,XMMWORD[r8]
91 movdqa xmm4,XMMWORD[$L$one]
92
93
; Stack copy of state words 4..15 (words 0..3 are immediates in the loop).
94 movdqa XMMWORD[16+rsp],xmm1
95 movdqa XMMWORD[32+rsp],xmm2
96 movdqa XMMWORD[48+rsp],xmm3
; rbp = bytes remaining.
97 mov rbp,rdx
98 jmp NEAR $L$oop_outer
99
; Per-block setup for the scalar path: load the 16 state words into GPRs.
;   words 0..3   -> eax, ebx, ecx, edx   (the "expand 32-byte k" constants)
;   words 4..7   -> r8d..r11d            (from [16+rsp])
;   words 8,9    -> esi, edi             (from the low half of xmm2)
;   words 10,11  -> stay at [40+rsp], [44+rsp] and are swapped in mid-round
;   words 12..15 -> r12d..r15d           (word 12 from xmm3, rest from stack)
; The length and both data pointers are parked on the stack because rbp,
; rsi and rdi are reused as loop counter and state words.
100ALIGN 32
101$L$oop_outer:
102 mov eax,0x61707865
103 mov ebx,0x3320646e
104 mov ecx,0x79622d32
105 mov edx,0x6b206574
106 mov r8d,DWORD[16+rsp]
107 mov r9d,DWORD[20+rsp]
108 mov r10d,DWORD[24+rsp]
109 mov r11d,DWORD[28+rsp]
; Word 12 comes from xmm3 (the live counter), not from the stack copy.
110 movd r12d,xmm3
111 mov r13d,DWORD[52+rsp]
112 mov r14d,DWORD[56+rsp]
113 mov r15d,DWORD[60+rsp]
114
115 mov QWORD[((64+0))+rsp],rbp
; 10 iterations of $L$oop; each iteration does a column pass and a diagonal pass.
116 mov ebp,10
117 mov QWORD[((64+8))+rsp],rsi
; Hand-encoded 66 48 0F 7E D6 = movq rsi,xmm2 (rsi = state words 8 and 9).
118DB 102,72,15,126,214
119 mov QWORD[((64+16))+rsp],rdi
; Split the pair: esi = word 8, edi = word 9.
120 mov rdi,rsi
121 shr rdi,32
122 jmp NEAR $L$oop
123
; Scalar ChaCha double-round loop followed by the 64-byte block output.
;
; Each quarter round on (a,b,c,d) is the sequence
;   a += b; d ^= a; d = rol(d,16);  c += d; b ^= c; b = rol(b,12);
;   a += b; d ^= a; d = rol(d, 8);  c += d; b ^= c; b = rol(b, 7);
; and two quarter rounds are interleaved instruction by instruction.
; Only two "c" words fit in registers (esi, edi); the other pair lives at
; [32..47+rsp] and is swapped in between quarter-round pairs.
124ALIGN 32
125$L$oop:
; Column pass, quarter rounds on words (0,4,8,12) and (1,5,9,13).
126 add eax,r8d
127 xor r12d,eax
128 rol r12d,16
129 add ebx,r9d
130 xor r13d,ebx
131 rol r13d,16
132 add esi,r12d
133 xor r8d,esi
134 rol r8d,12
135 add edi,r13d
136 xor r9d,edi
137 rol r9d,12
138 add eax,r8d
139 xor r12d,eax
140 rol r12d,8
141 add ebx,r9d
142 xor r13d,ebx
143 rol r13d,8
144 add esi,r12d
145 xor r8d,esi
146 rol r8d,7
147 add edi,r13d
148 xor r9d,edi
149 rol r9d,7
; Park words 8,9 and bring words 10,11 into esi, edi.
150 mov DWORD[32+rsp],esi
151 mov DWORD[36+rsp],edi
152 mov esi,DWORD[40+rsp]
153 mov edi,DWORD[44+rsp]
; Column pass, quarter rounds on words (2,6,10,14) and (3,7,11,15).
154 add ecx,r10d
155 xor r14d,ecx
156 rol r14d,16
157 add edx,r11d
158 xor r15d,edx
159 rol r15d,16
160 add esi,r14d
161 xor r10d,esi
162 rol r10d,12
163 add edi,r15d
164 xor r11d,edi
165 rol r11d,12
166 add ecx,r10d
167 xor r14d,ecx
168 rol r14d,8
169 add edx,r11d
170 xor r15d,edx
171 rol r15d,8
172 add esi,r14d
173 xor r10d,esi
174 rol r10d,7
175 add edi,r15d
176 xor r11d,edi
177 rol r11d,7
; Diagonal pass, quarter rounds on words (0,5,10,15) and (1,6,11,12);
; esi, edi still hold words 10,11.
178 add eax,r9d
179 xor r15d,eax
180 rol r15d,16
181 add ebx,r10d
182 xor r12d,ebx
183 rol r12d,16
184 add esi,r15d
185 xor r9d,esi
186 rol r9d,12
187 add edi,r12d
188 xor r10d,edi
189 rol r10d,12
190 add eax,r9d
191 xor r15d,eax
192 rol r15d,8
193 add ebx,r10d
194 xor r12d,ebx
195 rol r12d,8
196 add esi,r15d
197 xor r9d,esi
198 rol r9d,7
199 add edi,r12d
200 xor r10d,edi
201 rol r10d,7
; Park words 10,11 and bring words 8,9 back.
202 mov DWORD[40+rsp],esi
203 mov DWORD[44+rsp],edi
204 mov esi,DWORD[32+rsp]
205 mov edi,DWORD[36+rsp]
; Diagonal pass, quarter rounds on words (2,7,8,13) and (3,4,9,14).
206 add ecx,r11d
207 xor r13d,ecx
208 rol r13d,16
209 add edx,r8d
210 xor r14d,edx
211 rol r14d,16
212 add esi,r13d
213 xor r11d,esi
214 rol r11d,12
215 add edi,r14d
216 xor r8d,edi
217 rol r8d,12
218 add ecx,r11d
219 xor r13d,ecx
220 rol r13d,8
221 add edx,r8d
222 xor r14d,edx
223 rol r14d,8
224 add esi,r13d
225 xor r11d,esi
226 rol r11d,7
227 add edi,r14d
228 xor r8d,edi
229 rol r8d,7
230 dec ebp
231 jnz NEAR $L$oop
; Rounds done. [32..47+rsp] now holds the *worked* words 8..11; the original
; values are still in xmm2. Restore length and data pointers.
232 mov DWORD[36+rsp],edi
233 mov DWORD[32+rsp],esi
234 mov rbp,QWORD[64+rsp]
235 movdqa xmm1,xmm2
236 mov rsi,QWORD[((64+8))+rsp]
; Advance the live counter in xmm3 for the next block; [48+rsp] still holds
; this block's counter and is used for the feed-forward below.
237 paddd xmm3,xmm4
238 mov rdi,QWORD[((64+16))+rsp]
239
; Feed-forward: add the original input state to the worked state.
240 add eax,0x61707865
241 add ebx,0x3320646e
242 add ecx,0x79622d32
243 add edx,0x6b206574
244 add r8d,DWORD[16+rsp]
245 add r9d,DWORD[20+rsp]
246 add r10d,DWORD[24+rsp]
247 add r11d,DWORD[28+rsp]
248 add r12d,DWORD[48+rsp]
249 add r13d,DWORD[52+rsp]
250 add r14d,DWORD[56+rsp]
251 add r15d,DWORD[60+rsp]
; xmm1 = original words 8..11 + worked words 8..11.
252 paddd xmm1,XMMWORD[32+rsp]
253
; Fewer than 64 bytes left: finish bytewise in $L$tail.
254 cmp rbp,64
255 jb NEAR $L$tail
256
; Full block: keystream XOR input.
257 xor eax,DWORD[rsi]
258 xor ebx,DWORD[4+rsi]
259 xor ecx,DWORD[8+rsi]
260 xor edx,DWORD[12+rsi]
261 xor r8d,DWORD[16+rsi]
262 xor r9d,DWORD[20+rsi]
263 xor r10d,DWORD[24+rsi]
264 xor r11d,DWORD[28+rsi]
265 movdqu xmm0,XMMWORD[32+rsi]
266 xor r12d,DWORD[48+rsi]
267 xor r13d,DWORD[52+rsi]
268 xor r14d,DWORD[56+rsi]
269 xor r15d,DWORD[60+rsi]
270 lea rsi,[64+rsi]
271 pxor xmm0,xmm1
272
; Re-arm the stack state for the next block: original words 8..11 and the
; incremented counter word.
273 movdqa XMMWORD[32+rsp],xmm2
274 movd DWORD[48+rsp],xmm3
275
276 mov DWORD[rdi],eax
277 mov DWORD[4+rdi],ebx
278 mov DWORD[8+rdi],ecx
279 mov DWORD[12+rdi],edx
280 mov DWORD[16+rdi],r8d
281 mov DWORD[20+rdi],r9d
282 mov DWORD[24+rdi],r10d
283 mov DWORD[28+rdi],r11d
284 movdqu XMMWORD[32+rdi],xmm0
285 mov DWORD[48+rdi],r12d
286 mov DWORD[52+rdi],r13d
287 mov DWORD[56+rdi],r14d
288 mov DWORD[60+rdi],r15d
289 lea rdi,[64+rdi]
290
; More data -> next block; exactly consumed -> epilogue.
291 sub rbp,64
292 jnz NEAR $L$oop_outer
293
294 jmp NEAR $L$done
295
; Final partial block of the scalar path (1..63 bytes remain in rbp).
; The complete 64-byte keystream block is spilled to [rsp..rsp+63] --
; overwriting the saved input state, which is no longer needed -- and then
; XORed into the output one byte at a time.
; Note: this path does not clear the keystream from the stack before the
; frame is released at $L$done.
296ALIGN 16
297$L$tail:
298 mov DWORD[rsp],eax
299 mov DWORD[4+rsp],ebx
; rbx becomes the byte index (ebx was stored on the line above).
300 xor rbx,rbx
301 mov DWORD[8+rsp],ecx
302 mov DWORD[12+rsp],edx
303 mov DWORD[16+rsp],r8d
304 mov DWORD[20+rsp],r9d
305 mov DWORD[24+rsp],r10d
306 mov DWORD[28+rsp],r11d
307 movdqa XMMWORD[32+rsp],xmm1
308 mov DWORD[48+rsp],r12d
309 mov DWORD[52+rsp],r13d
310 mov DWORD[56+rsp],r14d
311 mov DWORD[60+rsp],r15d
312
; out[i] = in[i] ^ keystream[i], for i = 0 .. remaining-1.
313$L$oop_tail:
314 movzx eax,BYTE[rbx*1+rsi]
315 movzx edx,BYTE[rbx*1+rsp]
316 lea rbx,[1+rbx]
317 xor eax,edx
318 mov BYTE[((-1))+rbx*1+rdi],al
319 dec rbp
320 jnz NEAR $L$oop_tail
321
; Scalar-path epilogue. rsi = rsp + 88 (frame) + 48 (six pushes), i.e. the
; stack pointer as it was before the pushes; the saved GPRs sit just below it.
322$L$done:
323 lea rsi,[((64+24+48))+rsp]
324
325 mov r15,QWORD[((-48))+rsi]
326
327 mov r14,QWORD[((-40))+rsi]
328
329 mov r13,QWORD[((-32))+rsi]
330
331 mov r12,QWORD[((-24))+rsi]
332
333 mov rbp,QWORD[((-16))+rsi]
334
335 mov rbx,QWORD[((-8))+rsi]
336
337 lea rsp,[rsi]
338
; Also the target of the zero-length early exit: reload rdi/rsi from the
; slots the prologue stored them in, then return.
339$L$no_data:
340 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
341 mov rsi,QWORD[16+rsp]
342 DB 0F3h,0C3h ;repret
343
; End-of-function marker; presumably referenced by unwind tables that are
; not part of the portion of the file reviewed here.
344$L$SEH_end_ChaCha20_ctr32:
345
;-----------------------------------------------------------------------
; ChaCha20_ssse3 -- one 64-byte block per iteration using 128-bit SIMD.
;
; No `global` directive for this symbol is visible; the path is reached via
; the internal label $L$ChaCha20_ssse3 from ChaCha20_ctr32's dispatch, with
;   rdi = out, rsi = in, rdx = length (non-zero), rcx -> 32 bytes (state
;   words 4..11), r8 -> 16 bytes (state words 12..15),
;   r10 = capability word loaded by ChaCha20_ctr32.
; NOTE(review): the Win64 prologue at the top label does not load r10, yet
; the code after $L$ChaCha20_ssse3 tests it -- entering through the top
; label looks unsupported; confirm nothing calls it that way.
;-----------------------------------------------------------------------
346ALIGN 32
347ChaCha20_ssse3:
348 mov QWORD[8+rsp],rdi ;WIN64 prologue
349 mov QWORD[16+rsp],rsi
350 mov rax,rsp
351$L$SEH_begin_ChaCha20_ssse3:
352 mov rdi,rcx
353 mov rsi,rdx
354 mov rdx,r8
355 mov rcx,r9
356 mov r8,QWORD[40+rsp]
357
358
359
360$L$ChaCha20_ssse3:
; r9 = frame anchor: the XMM save slots and the final rsp restore are
; addressed relative to it.
361 mov r9,rsp
362
; Capability bit 11 set -> $L$ChaCha20_4xop (presumably the XOP path, going
; by the label name).
363 test r10d,2048
364 jnz NEAR $L$ChaCha20_4xop
; Length dispatch: exactly 128 bytes -> two-block routine, more than 128 ->
; four-block routine, less than 128 -> stay here.
365 cmp rdx,128
366 je NEAR $L$ChaCha20_128
367 ja NEAR $L$ChaCha20_4x
368
; Also entered from the 4x routine (with r9 = rsp already set) when it
; decides the single-block code is preferable.
369$L$do_sse3_after_all:
; 232-byte frame keeps rsp 16-byte aligned (r9 = 8 mod 16 at this point).
; NOTE(review): this path only touches [rsp..rsp+63] and the two save slots
; at r9-40 / r9-24, so the frame is larger than strictly needed.
370 sub rsp,64+168
; xmm6/xmm7 are callee-saved on Win64 and are used for the shuffle masks.
371 movaps XMMWORD[(-40)+r9],xmm6
372 movaps XMMWORD[(-24)+r9],xmm7
373$L$ssse3_body:
; State as four rows: xmm0 = constants, xmm1/xmm2 = words 4..11,
; xmm3 = words 12..15; xmm6/xmm7 = pshufb masks for rotate-by-16 / -by-8.
374 movdqa xmm0,XMMWORD[$L$sigma]
375 movdqu xmm1,XMMWORD[rcx]
376 movdqu xmm2,XMMWORD[16+rcx]
377 movdqu xmm3,XMMWORD[r8]
378 movdqa xmm6,XMMWORD[$L$rot16]
379 movdqa xmm7,XMMWORD[$L$rot24]
380
; Keep a copy of the input state for the feed-forward and for later blocks.
381 movdqa XMMWORD[rsp],xmm0
382 movdqa XMMWORD[16+rsp],xmm1
383 movdqa XMMWORD[32+rsp],xmm2
384 movdqa XMMWORD[48+rsp],xmm3
; r8 is reused as the double-round counter (its pointer value is dead now).
385 mov r8,10
386 jmp NEAR $L$oop_ssse3
387
; Next-block setup for the SSSE3 path: reload the saved state rows, add
; {1,0,0,0} to the counter row, write the new counter row back to the stack
; and reset the double-round counter.
388ALIGN 32
389$L$oop_outer_ssse3:
390 movdqa xmm3,XMMWORD[$L$one]
391 movdqa xmm0,XMMWORD[rsp]
392 movdqa xmm1,XMMWORD[16+rsp]
393 movdqa xmm2,XMMWORD[32+rsp]
394 paddd xmm3,XMMWORD[48+rsp]
395 mov r8,10
396 movdqa XMMWORD[48+rsp],xmm3
397 jmp NEAR $L$oop_ssse3
398
; SSSE3 double-round loop plus 64-byte output.
; Rows: a = xmm0, b = xmm1, c = xmm2, d = xmm3; xmm4 is scratch.
; All four quarter rounds of a pass run in parallel, one per 32-bit lane.
; Rotations by 16 and 8 use pshufb with the masks in xmm6 / xmm7 (emitted as
; raw bytes); rotations by 12 and 7 use a shift-left / shift-right / por triple.
399ALIGN 32
400$L$oop_ssse3:
; ---- column pass ----
401 paddd xmm0,xmm1
402 pxor xmm3,xmm0
; 66 0F 38 00 DE = pshufb xmm3,xmm6 (rotate each lane left by 16).
403DB 102,15,56,0,222
404 paddd xmm2,xmm3
405 pxor xmm1,xmm2
406 movdqa xmm4,xmm1
407 psrld xmm1,20
408 pslld xmm4,12
409 por xmm1,xmm4
410 paddd xmm0,xmm1
411 pxor xmm3,xmm0
; 66 0F 38 00 DF = pshufb xmm3,xmm7 (rotate each lane left by 8).
412DB 102,15,56,0,223
413 paddd xmm2,xmm3
414 pxor xmm1,xmm2
415 movdqa xmm4,xmm1
416 psrld xmm1,25
417 pslld xmm4,7
418 por xmm1,xmm4
; Rotate the lanes of rows b, c, d by one, two and three positions so that
; the next pass operates on the diagonals.
419 pshufd xmm2,xmm2,78
420 pshufd xmm1,xmm1,57
421 pshufd xmm3,xmm3,147
422 nop
; ---- diagonal pass ----
423 paddd xmm0,xmm1
424 pxor xmm3,xmm0
425DB 102,15,56,0,222
426 paddd xmm2,xmm3
427 pxor xmm1,xmm2
428 movdqa xmm4,xmm1
429 psrld xmm1,20
430 pslld xmm4,12
431 por xmm1,xmm4
432 paddd xmm0,xmm1
433 pxor xmm3,xmm0
434DB 102,15,56,0,223
435 paddd xmm2,xmm3
436 pxor xmm1,xmm2
437 movdqa xmm4,xmm1
438 psrld xmm1,25
439 pslld xmm4,7
440 por xmm1,xmm4
; Undo the lane rotation (b and d use the opposite shuffles from above).
441 pshufd xmm2,xmm2,78
442 pshufd xmm1,xmm1,147
443 pshufd xmm3,xmm3,57
444 dec r8
445 jnz NEAR $L$oop_ssse3
; Feed-forward: add the saved input state.
446 paddd xmm0,XMMWORD[rsp]
447 paddd xmm1,XMMWORD[16+rsp]
448 paddd xmm2,XMMWORD[32+rsp]
449 paddd xmm3,XMMWORD[48+rsp]
450
; Fewer than 64 bytes left -> bytewise tail.
451 cmp rdx,64
452 jb NEAR $L$tail_ssse3
453
; Full block: XOR 64 input bytes with the keystream (xmm4/xmm5 as scratch).
454 movdqu xmm4,XMMWORD[rsi]
455 movdqu xmm5,XMMWORD[16+rsi]
456 pxor xmm0,xmm4
457 movdqu xmm4,XMMWORD[32+rsi]
458 pxor xmm1,xmm5
459 movdqu xmm5,XMMWORD[48+rsi]
460 lea rsi,[64+rsi]
461 pxor xmm2,xmm4
462 pxor xmm3,xmm5
463
464 movdqu XMMWORD[rdi],xmm0
465 movdqu XMMWORD[16+rdi],xmm1
466 movdqu XMMWORD[32+rdi],xmm2
467 movdqu XMMWORD[48+rdi],xmm3
468 lea rdi,[64+rdi]
469
470 sub rdx,64
471 jnz NEAR $L$oop_outer_ssse3
472
473 jmp NEAR $L$done_ssse3
474
; Final partial block of the SSSE3 path (1..63 bytes remain in rdx).
; The keystream block replaces the saved state at [rsp..rsp+63] (this is the
; last block, so the state is no longer needed) and is XORed in bytewise.
475ALIGN 16
476$L$tail_ssse3:
477 movdqa XMMWORD[rsp],xmm0
478 movdqa XMMWORD[16+rsp],xmm1
479 movdqa XMMWORD[32+rsp],xmm2
480 movdqa XMMWORD[48+rsp],xmm3
; r8 = byte index.
481 xor r8,r8
482
; out[i] = in[i] ^ keystream[i].
483$L$oop_tail_ssse3:
484 movzx eax,BYTE[r8*1+rsi]
485 movzx ecx,BYTE[r8*1+rsp]
486 lea r8,[1+r8]
487 xor eax,ecx
488 mov BYTE[((-1))+r8*1+rdi],al
489 dec rdx
490 jnz NEAR $L$oop_tail_ssse3
491
; SSSE3-path epilogue: restore the two callee-saved XMM registers, release
; the frame via the anchor in r9, then reload rdi/rsi from the slots the
; prologue stored them in and return.
492$L$done_ssse3:
493 movaps xmm6,XMMWORD[((-40))+r9]
494 movaps xmm7,XMMWORD[((-24))+r9]
495 lea rsp,[r9]
496
497$L$ssse3_epilogue:
498 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
499 mov rsi,QWORD[16+rsp]
500 DB 0F3h,0C3h ;repret
501
502$L$SEH_end_ChaCha20_ssse3:
503
;-----------------------------------------------------------------------
; ChaCha20_128 -- processes exactly 128 bytes (two blocks) in one pass.
;
; Reached through $L$ChaCha20_128 from the SSSE3 routine when rdx == 128.
; There is no outer loop and rdx is not consulted again.
; Register contract at $L$ChaCha20_128 is the same as for the SSSE3 path:
; rdi = out, rsi = in, rcx -> state words 4..11, r8 -> state words 12..15.
;-----------------------------------------------------------------------
504ALIGN 32
505ChaCha20_128:
506 mov QWORD[8+rsp],rdi ;WIN64 prologue
507 mov QWORD[16+rsp],rsi
508 mov rax,rsp
509$L$SEH_begin_ChaCha20_128:
510 mov rdi,rcx
511 mov rsi,rdx
512 mov rdx,r8
513 mov rcx,r9
514 mov r8,QWORD[40+rsp]
515
516
517
518$L$ChaCha20_128:
; r9 = frame anchor for the XMM save area and the final rsp restore.
519 mov r9,rsp
520
; 64 bytes of saved state + 104 bytes covering the xmm6..xmm11 save slots
; (callee-saved on Win64). 168 bytes keeps rsp 16-byte aligned.
521 sub rsp,64+104
522 movaps XMMWORD[(-104)+r9],xmm6
523 movaps XMMWORD[(-88)+r9],xmm7
524 movaps XMMWORD[(-72)+r9],xmm8
525 movaps XMMWORD[(-56)+r9],xmm9
526 movaps XMMWORD[(-40)+r9],xmm10
527 movaps XMMWORD[(-24)+r9],xmm11
528$L$128_body:
; Block 0 rows: a = xmm8,  b = xmm9,  c = xmm2, d = xmm3
; Block 1 rows: a = xmm10, b = xmm11, c = xmm0, d = xmm1
; xmm6 / xmm7 = pshufb masks (rotate by 16 / by 8).
529 movdqa xmm8,XMMWORD[$L$sigma]
530 movdqu xmm9,XMMWORD[rcx]
531 movdqu xmm2,XMMWORD[16+rcx]
532 movdqu xmm3,XMMWORD[r8]
533 movdqa xmm1,XMMWORD[$L$one]
534 movdqa xmm6,XMMWORD[$L$rot16]
535 movdqa xmm7,XMMWORD[$L$rot24]
536
; Duplicate rows a, b, c for block 1 and save the input state on the stack.
537 movdqa xmm10,xmm8
538 movdqa XMMWORD[rsp],xmm8
539 movdqa xmm11,xmm9
540 movdqa XMMWORD[16+rsp],xmm9
541 movdqa xmm0,xmm2
542 movdqa XMMWORD[32+rsp],xmm2
; Block 1's counter row = block 0's counter row + {1,0,0,0}.
543 paddd xmm1,xmm3
544 movdqa XMMWORD[48+rsp],xmm3
; r8 is reused as the double-round counter.
545 mov r8,10
546 jmp NEAR $L$oop_128
547
; Two-block double-round loop, 128-byte output and epilogue.
; The two blocks are processed in lock-step, instructions interleaved:
;   block 0: a = xmm8,  b = xmm9,  c = xmm2, d = xmm3, scratch xmm4
;   block 1: a = xmm10, b = xmm11, c = xmm0, d = xmm1, scratch xmm5
; Raw-byte instructions are pshufb on row d of each block:
;   ...,222 = pshufb xmm3,xmm6   ...,206 = pshufb xmm1,xmm6   (rotate by 16)
;   ...,223 = pshufb xmm3,xmm7   ...,207 = pshufb xmm1,xmm7   (rotate by 8)
548ALIGN 32
549$L$oop_128:
; ---- column pass ----
550 paddd xmm8,xmm9
551 pxor xmm3,xmm8
552 paddd xmm10,xmm11
553 pxor xmm1,xmm10
554DB 102,15,56,0,222
555DB 102,15,56,0,206
556 paddd xmm2,xmm3
557 paddd xmm0,xmm1
558 pxor xmm9,xmm2
559 pxor xmm11,xmm0
; Rotate row b left by 12 (shift pair + por).
560 movdqa xmm4,xmm9
561 psrld xmm9,20
562 movdqa xmm5,xmm11
563 pslld xmm4,12
564 psrld xmm11,20
565 por xmm9,xmm4
566 pslld xmm5,12
567 por xmm11,xmm5
568 paddd xmm8,xmm9
569 pxor xmm3,xmm8
570 paddd xmm10,xmm11
571 pxor xmm1,xmm10
572DB 102,15,56,0,223
573DB 102,15,56,0,207
574 paddd xmm2,xmm3
575 paddd xmm0,xmm1
576 pxor xmm9,xmm2
577 pxor xmm11,xmm0
; Rotate row b left by 7.
578 movdqa xmm4,xmm9
579 psrld xmm9,25
580 movdqa xmm5,xmm11
581 pslld xmm4,7
582 psrld xmm11,25
583 por xmm9,xmm4
584 pslld xmm5,7
585 por xmm11,xmm5
; Rotate the lanes of rows c, b, d so the next pass works on diagonals.
586 pshufd xmm2,xmm2,78
587 pshufd xmm9,xmm9,57
588 pshufd xmm3,xmm3,147
589 pshufd xmm0,xmm0,78
590 pshufd xmm11,xmm11,57
591 pshufd xmm1,xmm1,147
; ---- diagonal pass ----
592 paddd xmm8,xmm9
593 pxor xmm3,xmm8
594 paddd xmm10,xmm11
595 pxor xmm1,xmm10
596DB 102,15,56,0,222
597DB 102,15,56,0,206
598 paddd xmm2,xmm3
599 paddd xmm0,xmm1
600 pxor xmm9,xmm2
601 pxor xmm11,xmm0
602 movdqa xmm4,xmm9
603 psrld xmm9,20
604 movdqa xmm5,xmm11
605 pslld xmm4,12
606 psrld xmm11,20
607 por xmm9,xmm4
608 pslld xmm5,12
609 por xmm11,xmm5
610 paddd xmm8,xmm9
611 pxor xmm3,xmm8
612 paddd xmm10,xmm11
613 pxor xmm1,xmm10
614DB 102,15,56,0,223
615DB 102,15,56,0,207
616 paddd xmm2,xmm3
617 paddd xmm0,xmm1
618 pxor xmm9,xmm2
619 pxor xmm11,xmm0
620 movdqa xmm4,xmm9
621 psrld xmm9,25
622 movdqa xmm5,xmm11
623 pslld xmm4,7
624 psrld xmm11,25
625 por xmm9,xmm4
626 pslld xmm5,7
627 por xmm11,xmm5
; Undo the lane rotation.
628 pshufd xmm2,xmm2,78
629 pshufd xmm9,xmm9,147
630 pshufd xmm3,xmm3,57
631 pshufd xmm0,xmm0,78
632 pshufd xmm11,xmm11,147
633 pshufd xmm1,xmm1,57
634 dec r8
635 jnz NEAR $L$oop_128
; Feed-forward for block 0.
636 paddd xmm8,XMMWORD[rsp]
637 paddd xmm9,XMMWORD[16+rsp]
638 paddd xmm2,XMMWORD[32+rsp]
639 paddd xmm3,XMMWORD[48+rsp]
; Feed-forward for block 1; its counter row is the saved row plus {1,0,0,0},
; added here in two steps.
640 paddd xmm1,XMMWORD[$L$one]
641 paddd xmm10,XMMWORD[rsp]
642 paddd xmm11,XMMWORD[16+rsp]
643 paddd xmm0,XMMWORD[32+rsp]
644 paddd xmm1,XMMWORD[48+rsp]
645
; XOR the 128 input bytes with the two keystream blocks (xmm4/xmm5 scratch).
646 movdqu xmm4,XMMWORD[rsi]
647 movdqu xmm5,XMMWORD[16+rsi]
648 pxor xmm8,xmm4
649 movdqu xmm4,XMMWORD[32+rsi]
650 pxor xmm9,xmm5
651 movdqu xmm5,XMMWORD[48+rsi]
652 pxor xmm2,xmm4
653 movdqu xmm4,XMMWORD[64+rsi]
654 pxor xmm3,xmm5
655 movdqu xmm5,XMMWORD[80+rsi]
656 pxor xmm10,xmm4
657 movdqu xmm4,XMMWORD[96+rsi]
658 pxor xmm11,xmm5
659 movdqu xmm5,XMMWORD[112+rsi]
660 pxor xmm0,xmm4
661 pxor xmm1,xmm5
662
663 movdqu XMMWORD[rdi],xmm8
664 movdqu XMMWORD[16+rdi],xmm9
665 movdqu XMMWORD[32+rdi],xmm2
666 movdqu XMMWORD[48+rdi],xmm3
667 movdqu XMMWORD[64+rdi],xmm10
668 movdqu XMMWORD[80+rdi],xmm11
669 movdqu XMMWORD[96+rdi],xmm0
670 movdqu XMMWORD[112+rdi],xmm1
; Restore callee-saved xmm6..xmm11 and release the frame.
671 movaps xmm6,XMMWORD[((-104))+r9]
672 movaps xmm7,XMMWORD[((-88))+r9]
673 movaps xmm8,XMMWORD[((-72))+r9]
674 movaps xmm9,XMMWORD[((-56))+r9]
675 movaps xmm10,XMMWORD[((-40))+r9]
676 movaps xmm11,XMMWORD[((-24))+r9]
677 lea rsp,[r9]
678
679$L$128_epilogue:
680 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
681 mov rsi,QWORD[16+rsp]
682 DB 0F3h,0C3h ;repret
683
684$L$SEH_end_ChaCha20_128:
685
;-----------------------------------------------------------------------
; ChaCha20_4x -- four 64-byte blocks per iteration using 128-bit SIMD.
;
; Reached through $L$ChaCha20_4x from the SSSE3 routine when rdx > 128, with
;   rdi = out, rsi = in, rdx = length, rcx -> state words 4..11,
;   r8 -> state words 12..15, r10 = capability word from ChaCha20_ctr32.
; Data layout is "one state word per register": lane i of every register
; belongs to block i.
;-----------------------------------------------------------------------
686ALIGN 32
687ChaCha20_4x:
688 mov QWORD[8+rsp],rdi ;WIN64 prologue
689 mov QWORD[16+rsp],rsi
690 mov rax,rsp
691$L$SEH_begin_ChaCha20_4x:
692 mov rdi,rcx
693 mov rsi,rdx
694 mov rdx,r8
695 mov rcx,r9
696 mov r8,QWORD[40+rsp]
697
698
699
700$L$ChaCha20_4x:
; r9 = frame anchor for the XMM save area and the final rsp restore.
701 mov r9,rsp
702
; Bit 5 of the upper capability dword set -> $L$ChaCha20_8x (not in the part
; of the file reviewed here; presumably an AVX2 path, going by the name).
703 mov r11,r10
704 shr r10,32
705 test r10,32
706 jnz NEAR $L$ChaCha20_8x
; More than 192 bytes: always use the 4x code.
707 cmp rdx,192
708 ja NEAR $L$proceed4x
709
; 129..192 bytes: if capability bit 22 is set while bit 26 is clear
; (mask 0x04400000, expected 0x00400000) fall back to the single-block code.
; NOTE(review): upstream describes this as a CPU-model heuristic; which
; cores it selects cannot be confirmed from this file.
710 and r11,71303168
711 cmp r11,4194304
712 je NEAR $L$do_sse3_after_all
713
714$L$proceed4x:
; 0x140 bytes of locals + 168 bytes covering the xmm6..xmm15 save slots
; (callee-saved on Win64). 488 bytes keeps rsp 16-byte aligned.
715 sub rsp,0x140+168
716 movaps XMMWORD[(-168)+r9],xmm6
717 movaps XMMWORD[(-152)+r9],xmm7
718 movaps XMMWORD[(-136)+r9],xmm8
719 movaps XMMWORD[(-120)+r9],xmm9
720 movaps XMMWORD[(-104)+r9],xmm10
721 movaps XMMWORD[(-88)+r9],xmm11
722 movaps XMMWORD[(-72)+r9],xmm12
723 movaps XMMWORD[(-56)+r9],xmm13
724 movaps XMMWORD[(-40)+r9],xmm14
725 movaps XMMWORD[(-24)+r9],xmm15
726$L$4x_body:
727 movdqa xmm11,XMMWORD[$L$sigma]
728 movdqu xmm15,XMMWORD[rcx]
729 movdqu xmm7,XMMWORD[16+rcx]
730 movdqu xmm3,XMMWORD[r8]
; rcx = rsp+256, so [(N-256)+rcx] below addresses rsp+N.
; r10 / r11 = addresses of the rotate-by-16 / rotate-by-8 pshufb masks.
731 lea rcx,[256+rsp]
732 lea r10,[$L$rot16]
733 lea r11,[$L$rot24]
734
; Broadcast state words 0..3 into xmm8..xmm11; saved at rsp+64..rsp+127.
735 pshufd xmm8,xmm11,0x00
736 pshufd xmm9,xmm11,0x55
737 movdqa XMMWORD[64+rsp],xmm8
738 pshufd xmm10,xmm11,0xaa
739 movdqa XMMWORD[80+rsp],xmm9
740 pshufd xmm11,xmm11,0xff
741 movdqa XMMWORD[96+rsp],xmm10
742 movdqa XMMWORD[112+rsp],xmm11
743
; Broadcast state words 4..7 into xmm12..xmm15; saved at rsp+128..rsp+191.
744 pshufd xmm12,xmm15,0x00
745 pshufd xmm13,xmm15,0x55
746 movdqa XMMWORD[(128-256)+rcx],xmm12
747 pshufd xmm14,xmm15,0xaa
748 movdqa XMMWORD[(144-256)+rcx],xmm13
749 pshufd xmm15,xmm15,0xff
750 movdqa XMMWORD[(160-256)+rcx],xmm14
751 movdqa XMMWORD[(176-256)+rcx],xmm15
752
; Broadcast state words 8..11 into xmm4..xmm7; saved at rsp+192..rsp+255.
753 pshufd xmm4,xmm7,0x00
754 pshufd xmm5,xmm7,0x55
755 movdqa XMMWORD[(192-256)+rcx],xmm4
756 pshufd xmm6,xmm7,0xaa
757 movdqa XMMWORD[(208-256)+rcx],xmm5
758 pshufd xmm7,xmm7,0xff
759 movdqa XMMWORD[(224-256)+rcx],xmm6
760 movdqa XMMWORD[(240-256)+rcx],xmm7
761
; Broadcast state words 12..15 into xmm0..xmm3. Word 12 gets {0,1,2,3}
; added so each lane has its own block counter; it is stored at rsp+256 by
; $L$oop_enter4x, the other three go to rsp+272..rsp+319 here.
762 pshufd xmm0,xmm3,0x00
763 pshufd xmm1,xmm3,0x55
764 paddd xmm0,XMMWORD[$L$inc]
765 pshufd xmm2,xmm3,0xaa
766 movdqa XMMWORD[(272-256)+rcx],xmm1
767 pshufd xmm3,xmm3,0xff
768 movdqa XMMWORD[(288-256)+rcx],xmm2
769 movdqa XMMWORD[(304-256)+rcx],xmm3
770
771 jmp NEAR $L$oop_enter4x
772
; Next-iteration setup for the 4x path: reload all sixteen broadcast state
; words (words 0..3 -> xmm8..11, 4..7 -> xmm12..15, 8..11 -> xmm4..7,
; 12..15 -> xmm0..3) and advance the per-lane counters by four.
773ALIGN 32
774$L$oop_outer4x:
775 movdqa xmm8,XMMWORD[64+rsp]
776 movdqa xmm9,XMMWORD[80+rsp]
777 movdqa xmm10,XMMWORD[96+rsp]
778 movdqa xmm11,XMMWORD[112+rsp]
779 movdqa xmm12,XMMWORD[((128-256))+rcx]
780 movdqa xmm13,XMMWORD[((144-256))+rcx]
781 movdqa xmm14,XMMWORD[((160-256))+rcx]
782 movdqa xmm15,XMMWORD[((176-256))+rcx]
783 movdqa xmm4,XMMWORD[((192-256))+rcx]
784 movdqa xmm5,XMMWORD[((208-256))+rcx]
785 movdqa xmm6,XMMWORD[((224-256))+rcx]
786 movdqa xmm7,XMMWORD[((240-256))+rcx]
787 movdqa xmm0,XMMWORD[((256-256))+rcx]
788 movdqa xmm1,XMMWORD[((272-256))+rcx]
789 movdqa xmm2,XMMWORD[((288-256))+rcx]
790 movdqa xmm3,XMMWORD[((304-256))+rcx]
791 paddd xmm0,XMMWORD[$L$four]
792
; First iteration joins here. Words 10 and 11 are spilled to [32+rsp] and
; [48+rsp] so xmm6/xmm7 can serve as scratch and mask registers; xmm7 is
; preloaded with the rotate-by-16 mask, eax counts ten double rounds, and
; the current counter row is recorded at rsp+256.
793$L$oop_enter4x:
794 movdqa XMMWORD[32+rsp],xmm6
795 movdqa XMMWORD[48+rsp],xmm7
796 movdqa xmm7,XMMWORD[r10]
797 mov eax,10
798 movdqa XMMWORD[(256-256)+rcx],xmm0
799 jmp NEAR $L$oop4x
800
; 4x double-round loop. Register map (lane i = block i):
;   words 0..3   xmm8..xmm11      words 4..7   xmm12..xmm15
;   words 12..15 xmm0..xmm3
;   words 8..11  two at a time in xmm4/xmm5, the other pair spilled to
;                [rsp],[16+rsp] (words 8,9) or [32+rsp],[48+rsp] (words 10,11)
;   xmm6/xmm7    alternate between shift scratch and pshufb mask
;                ([r10] = rotate by 16, [r11] = rotate by 8)
; Raw-byte instructions are pshufb with xmm7 (DB ...,199/207/215/223 for
; xmm0/xmm1/xmm2/xmm3) or xmm6 (DB ...,198/206/214/222) as the mask.
; Two quarter rounds are interleaved at a time.
801ALIGN 32
802$L$oop4x:
; Column pass: quarter rounds on words (0,4,8,12) and (1,5,9,13).
803 paddd xmm8,xmm12
804 paddd xmm9,xmm13
805 pxor xmm0,xmm8
806 pxor xmm1,xmm9
807DB 102,15,56,0,199
808DB 102,15,56,0,207
809 paddd xmm4,xmm0
810 paddd xmm5,xmm1
811 pxor xmm12,xmm4
812 pxor xmm13,xmm5
813 movdqa xmm6,xmm12
814 pslld xmm12,12
815 psrld xmm6,20
816 movdqa xmm7,xmm13
817 pslld xmm13,12
818 por xmm12,xmm6
819 psrld xmm7,20
820 movdqa xmm6,XMMWORD[r11]
821 por xmm13,xmm7
822 paddd xmm8,xmm12
823 paddd xmm9,xmm13
824 pxor xmm0,xmm8
825 pxor xmm1,xmm9
826DB 102,15,56,0,198
827DB 102,15,56,0,206
828 paddd xmm4,xmm0
829 paddd xmm5,xmm1
830 pxor xmm12,xmm4
831 pxor xmm13,xmm5
832 movdqa xmm7,xmm12
833 pslld xmm12,7
834 psrld xmm7,25
835 movdqa xmm6,xmm13
836 pslld xmm13,7
837 por xmm12,xmm7
838 psrld xmm6,25
839 movdqa xmm7,XMMWORD[r10]
840 por xmm13,xmm6
; Swap: park words 8,9; fetch words 10,11.
841 movdqa XMMWORD[rsp],xmm4
842 movdqa XMMWORD[16+rsp],xmm5
843 movdqa xmm4,XMMWORD[32+rsp]
844 movdqa xmm5,XMMWORD[48+rsp]
; Column pass: quarter rounds on words (2,6,10,14) and (3,7,11,15).
845 paddd xmm10,xmm14
846 paddd xmm11,xmm15
847 pxor xmm2,xmm10
848 pxor xmm3,xmm11
849DB 102,15,56,0,215
850DB 102,15,56,0,223
851 paddd xmm4,xmm2
852 paddd xmm5,xmm3
853 pxor xmm14,xmm4
854 pxor xmm15,xmm5
855 movdqa xmm6,xmm14
856 pslld xmm14,12
857 psrld xmm6,20
858 movdqa xmm7,xmm15
859 pslld xmm15,12
860 por xmm14,xmm6
861 psrld xmm7,20
862 movdqa xmm6,XMMWORD[r11]
863 por xmm15,xmm7
864 paddd xmm10,xmm14
865 paddd xmm11,xmm15
866 pxor xmm2,xmm10
867 pxor xmm3,xmm11
868DB 102,15,56,0,214
869DB 102,15,56,0,222
870 paddd xmm4,xmm2
871 paddd xmm5,xmm3
872 pxor xmm14,xmm4
873 pxor xmm15,xmm5
874 movdqa xmm7,xmm14
875 pslld xmm14,7
876 psrld xmm7,25
877 movdqa xmm6,xmm15
878 pslld xmm15,7
879 por xmm14,xmm7
880 psrld xmm6,25
881 movdqa xmm7,XMMWORD[r10]
882 por xmm15,xmm6
; Diagonal pass: quarter rounds on words (0,5,10,15) and (1,6,11,12);
; xmm4/xmm5 still hold words 10,11.
883 paddd xmm8,xmm13
884 paddd xmm9,xmm14
885 pxor xmm3,xmm8
886 pxor xmm0,xmm9
887DB 102,15,56,0,223
888DB 102,15,56,0,199
889 paddd xmm4,xmm3
890 paddd xmm5,xmm0
891 pxor xmm13,xmm4
892 pxor xmm14,xmm5
893 movdqa xmm6,xmm13
894 pslld xmm13,12
895 psrld xmm6,20
896 movdqa xmm7,xmm14
897 pslld xmm14,12
898 por xmm13,xmm6
899 psrld xmm7,20
900 movdqa xmm6,XMMWORD[r11]
901 por xmm14,xmm7
902 paddd xmm8,xmm13
903 paddd xmm9,xmm14
904 pxor xmm3,xmm8
905 pxor xmm0,xmm9
906DB 102,15,56,0,222
907DB 102,15,56,0,198
908 paddd xmm4,xmm3
909 paddd xmm5,xmm0
910 pxor xmm13,xmm4
911 pxor xmm14,xmm5
912 movdqa xmm7,xmm13
913 pslld xmm13,7
914 psrld xmm7,25
915 movdqa xmm6,xmm14
916 pslld xmm14,7
917 por xmm13,xmm7
918 psrld xmm6,25
919 movdqa xmm7,XMMWORD[r10]
920 por xmm14,xmm6
; Swap: park words 10,11; fetch words 8,9.
921 movdqa XMMWORD[32+rsp],xmm4
922 movdqa XMMWORD[48+rsp],xmm5
923 movdqa xmm4,XMMWORD[rsp]
924 movdqa xmm5,XMMWORD[16+rsp]
; Diagonal pass: quarter rounds on words (2,7,8,13) and (3,4,9,14).
925 paddd xmm10,xmm15
926 paddd xmm11,xmm12
927 pxor xmm1,xmm10
928 pxor xmm2,xmm11
929DB 102,15,56,0,207
930DB 102,15,56,0,215
931 paddd xmm4,xmm1
932 paddd xmm5,xmm2
933 pxor xmm15,xmm4
934 pxor xmm12,xmm5
935 movdqa xmm6,xmm15
936 pslld xmm15,12
937 psrld xmm6,20
938 movdqa xmm7,xmm12
939 pslld xmm12,12
940 por xmm15,xmm6
941 psrld xmm7,20
942 movdqa xmm6,XMMWORD[r11]
943 por xmm12,xmm7
944 paddd xmm10,xmm15
945 paddd xmm11,xmm12
946 pxor xmm1,xmm10
947 pxor xmm2,xmm11
948DB 102,15,56,0,206
949DB 102,15,56,0,214
950 paddd xmm4,xmm1
951 paddd xmm5,xmm2
952 pxor xmm15,xmm4
953 pxor xmm12,xmm5
954 movdqa xmm7,xmm15
955 pslld xmm15,7
956 psrld xmm7,25
957 movdqa xmm6,xmm12
958 pslld xmm12,7
959 por xmm15,xmm7
960 psrld xmm6,25
961 movdqa xmm7,XMMWORD[r10]
962 por xmm12,xmm6
963 dec eax
964 jnz NEAR $L$oop4x
965
; 4x path, after the rounds: feed-forward, transpose, and 256-byte output.
;
; Each group of four registers (one state word per register, one block per
; lane) gets the saved input words added and is then transposed with
; punpck{l,h}dq + punpck{l,h}qdq so every register holds four consecutive
; words of a single block. Resulting keystream layout (16-byte rows):
;   block 0: [rsp],    xmm12, xmm4,  xmm0
;   block 1: [16+rsp], xmm13, xmm5,  xmm1
;   block 2: [32+rsp], xmm10, xmm14, xmm8
;   block 3: [48+rsp], xmm15, xmm9,  xmm3
; Words 0..3: feed-forward and transpose.
966 paddd xmm8,XMMWORD[64+rsp]
967 paddd xmm9,XMMWORD[80+rsp]
968 paddd xmm10,XMMWORD[96+rsp]
969 paddd xmm11,XMMWORD[112+rsp]
970
971 movdqa xmm6,xmm8
972 punpckldq xmm8,xmm9
973 movdqa xmm7,xmm10
974 punpckldq xmm10,xmm11
975 punpckhdq xmm6,xmm9
976 punpckhdq xmm7,xmm11
977 movdqa xmm9,xmm8
978 punpcklqdq xmm8,xmm10
979 movdqa xmm11,xmm6
980 punpcklqdq xmm6,xmm7
981 punpckhqdq xmm9,xmm10
982 punpckhqdq xmm11,xmm7
; Words 4..7: feed-forward.
983 paddd xmm12,XMMWORD[((128-256))+rcx]
984 paddd xmm13,XMMWORD[((144-256))+rcx]
985 paddd xmm14,XMMWORD[((160-256))+rcx]
986 paddd xmm15,XMMWORD[((176-256))+rcx]
987
; Park the first rows of blocks 0 and 1, and pick up words 10,11 that the
; round loop left at [32+rsp],[48+rsp].
988 movdqa XMMWORD[rsp],xmm8
989 movdqa XMMWORD[16+rsp],xmm9
990 movdqa xmm8,XMMWORD[32+rsp]
991 movdqa xmm9,XMMWORD[48+rsp]
992
; Words 4..7: transpose.
993 movdqa xmm10,xmm12
994 punpckldq xmm12,xmm13
995 movdqa xmm7,xmm14
996 punpckldq xmm14,xmm15
997 punpckhdq xmm10,xmm13
998 punpckhdq xmm7,xmm15
999 movdqa xmm13,xmm12
1000 punpcklqdq xmm12,xmm14
1001 movdqa xmm15,xmm10
1002 punpcklqdq xmm10,xmm7
1003 punpckhqdq xmm13,xmm14
1004 punpckhqdq xmm15,xmm7
; Words 8..11 (now in xmm4, xmm5, xmm8, xmm9): feed-forward.
1005 paddd xmm4,XMMWORD[((192-256))+rcx]
1006 paddd xmm5,XMMWORD[((208-256))+rcx]
1007 paddd xmm8,XMMWORD[((224-256))+rcx]
1008 paddd xmm9,XMMWORD[((240-256))+rcx]
1009
; Park the first rows of blocks 2 and 3.
1010 movdqa XMMWORD[32+rsp],xmm6
1011 movdqa XMMWORD[48+rsp],xmm11
1012
; Words 8..11: transpose.
1013 movdqa xmm14,xmm4
1014 punpckldq xmm4,xmm5
1015 movdqa xmm7,xmm8
1016 punpckldq xmm8,xmm9
1017 punpckhdq xmm14,xmm5
1018 punpckhdq xmm7,xmm9
1019 movdqa xmm5,xmm4
1020 punpcklqdq xmm4,xmm8
1021 movdqa xmm9,xmm14
1022 punpcklqdq xmm14,xmm7
1023 punpckhqdq xmm5,xmm8
1024 punpckhqdq xmm9,xmm7
; Words 12..15: feed-forward (rsp+256 holds this iteration's counters).
1025 paddd xmm0,XMMWORD[((256-256))+rcx]
1026 paddd xmm1,XMMWORD[((272-256))+rcx]
1027 paddd xmm2,XMMWORD[((288-256))+rcx]
1028 paddd xmm3,XMMWORD[((304-256))+rcx]
1029
; Words 12..15: transpose.
1030 movdqa xmm8,xmm0
1031 punpckldq xmm0,xmm1
1032 movdqa xmm7,xmm2
1033 punpckldq xmm2,xmm3
1034 punpckhdq xmm8,xmm1
1035 punpckhdq xmm7,xmm3
1036 movdqa xmm1,xmm0
1037 punpcklqdq xmm0,xmm2
1038 movdqa xmm3,xmm8
1039 punpcklqdq xmm8,xmm7
1040 punpckhqdq xmm1,xmm2
1041 punpckhqdq xmm3,xmm7
; Fewer than 256 bytes left -> $L$tail4x handles the remainder.
1042 cmp rdx,64*4
1043 jb NEAR $L$tail4x
1044
; Full 256 bytes. Loads, XORs and stores are software-pipelined;
; xmm6, xmm11, xmm2, xmm7 are reused as data registers. Block 0:
1045 movdqu xmm6,XMMWORD[rsi]
1046 movdqu xmm11,XMMWORD[16+rsi]
1047 movdqu xmm2,XMMWORD[32+rsi]
1048 movdqu xmm7,XMMWORD[48+rsi]
1049 pxor xmm6,XMMWORD[rsp]
1050 pxor xmm11,xmm12
1051 pxor xmm2,xmm4
1052 pxor xmm7,xmm0
1053
; Store block 0 while loading block 1.
1054 movdqu XMMWORD[rdi],xmm6
1055 movdqu xmm6,XMMWORD[64+rsi]
1056 movdqu XMMWORD[16+rdi],xmm11
1057 movdqu xmm11,XMMWORD[80+rsi]
1058 movdqu XMMWORD[32+rdi],xmm2
1059 movdqu xmm2,XMMWORD[96+rsi]
1060 movdqu XMMWORD[48+rdi],xmm7
1061 movdqu xmm7,XMMWORD[112+rsi]
1062 lea rsi,[128+rsi]
1063 pxor xmm6,XMMWORD[16+rsp]
1064 pxor xmm11,xmm13
1065 pxor xmm2,xmm5
1066 pxor xmm7,xmm1
1067
; Store block 1 while loading block 2 (rsi already advanced by 128).
1068 movdqu XMMWORD[64+rdi],xmm6
1069 movdqu xmm6,XMMWORD[rsi]
1070 movdqu XMMWORD[80+rdi],xmm11
1071 movdqu xmm11,XMMWORD[16+rsi]
1072 movdqu XMMWORD[96+rdi],xmm2
1073 movdqu xmm2,XMMWORD[32+rsi]
1074 movdqu XMMWORD[112+rdi],xmm7
1075 lea rdi,[128+rdi]
1076 movdqu xmm7,XMMWORD[48+rsi]
1077 pxor xmm6,XMMWORD[32+rsp]
1078 pxor xmm11,xmm10
1079 pxor xmm2,xmm14
1080 pxor xmm7,xmm8
1081
; Store block 2 while loading block 3.
1082 movdqu XMMWORD[rdi],xmm6
1083 movdqu xmm6,XMMWORD[64+rsi]
1084 movdqu XMMWORD[16+rdi],xmm11
1085 movdqu xmm11,XMMWORD[80+rsi]
1086 movdqu XMMWORD[32+rdi],xmm2
1087 movdqu xmm2,XMMWORD[96+rsi]
1088 movdqu XMMWORD[48+rdi],xmm7
1089 movdqu xmm7,XMMWORD[112+rsi]
1090 lea rsi,[128+rsi]
1091 pxor xmm6,XMMWORD[48+rsp]
1092 pxor xmm11,xmm15
1093 pxor xmm2,xmm9
1094 pxor xmm7,xmm3
1095 movdqu XMMWORD[64+rdi],xmm6
1096 movdqu XMMWORD[80+rdi],xmm11
1097 movdqu XMMWORD[96+rdi],xmm2
1098 movdqu XMMWORD[112+rdi],xmm7
1099 lea rdi,[128+rdi]
1100
; More data -> next four blocks; exactly consumed -> epilogue.
1101 sub rdx,64*4
1102 jnz NEAR $L$oop_outer4x
1103
1104 jmp NEAR $L$done4x
1105
; 4x tail dispatcher: 1..255 bytes remain in rdx and four keystream blocks
; are available. Branch on how many whole blocks can still be consumed.
; The flags of the last cmp executed here are also consumed by the `je` at
; the end of the whole-block code in whichever $L$..._or_more4x target is taken.
1106$L$tail4x:
1107 cmp rdx,192
1108 jae NEAR $L$192_or_more4x
1109 cmp rdx,128
1110 jae NEAR $L$128_or_more4x
1111 cmp rdx,64
1112 jae NEAR $L$64_or_more4x
1113
1114
; Fewer than 64 bytes: complete block 0's keystream on the stack ([rsp]
; already holds its first row) and finish bytewise. r10 = byte index.
1115 xor r10,r10
1116
1117 movdqa XMMWORD[16+rsp],xmm12
1118 movdqa XMMWORD[32+rsp],xmm4
1119 movdqa XMMWORD[48+rsp],xmm0
1120 jmp NEAR $L$oop_tail4x
1121
; 64..127 bytes remain: emit one whole block from block 0's keystream, then
; stage block 1's keystream at [rsp..rsp+63] for the bytewise loop.
1122ALIGN 32
1123$L$64_or_more4x:
1124 movdqu xmm6,XMMWORD[rsi]
1125 movdqu xmm11,XMMWORD[16+rsi]
1126 movdqu xmm2,XMMWORD[32+rsi]
1127 movdqu xmm7,XMMWORD[48+rsi]
1128 pxor xmm6,XMMWORD[rsp]
1129 pxor xmm11,xmm12
1130 pxor xmm2,xmm4
1131 pxor xmm7,xmm0
1132 movdqu XMMWORD[rdi],xmm6
1133 movdqu XMMWORD[16+rdi],xmm11
1134 movdqu XMMWORD[32+rdi],xmm2
1135 movdqu XMMWORD[48+rdi],xmm7
; ZF still reflects `cmp rdx,64` from $L$tail4x (none of the SSE moves or
; XORs above modify flags): exactly 64 bytes means we are finished.
1136 je NEAR $L$done4x
1137
; Stage block 1 ([16+rsp] is read into xmm6 before being overwritten),
; step the pointers past the emitted block; r10 = byte index.
1138 movdqa xmm6,XMMWORD[16+rsp]
1139 lea rsi,[64+rsi]
1140 xor r10,r10
1141 movdqa XMMWORD[rsp],xmm6
1142 movdqa XMMWORD[16+rsp],xmm13
1143 lea rdi,[64+rdi]
1144 movdqa XMMWORD[32+rsp],xmm5
1145 sub rdx,64
1146 movdqa XMMWORD[48+rsp],xmm1
1147 jmp NEAR $L$oop_tail4x
1148
; 128..191 bytes remain: emit two whole blocks (keystream blocks 0 and 1),
; then stage block 2's keystream at [rsp..rsp+63] for the bytewise loop.
1149ALIGN 32
1150$L$128_or_more4x:
1151 movdqu xmm6,XMMWORD[rsi]
1152 movdqu xmm11,XMMWORD[16+rsi]
1153 movdqu xmm2,XMMWORD[32+rsi]
1154 movdqu xmm7,XMMWORD[48+rsi]
1155 pxor xmm6,XMMWORD[rsp]
1156 pxor xmm11,xmm12
1157 pxor xmm2,xmm4
1158 pxor xmm7,xmm0
1159
1160 movdqu XMMWORD[rdi],xmm6
1161 movdqu xmm6,XMMWORD[64+rsi]
1162 movdqu XMMWORD[16+rdi],xmm11
1163 movdqu xmm11,XMMWORD[80+rsi]
1164 movdqu XMMWORD[32+rdi],xmm2
1165 movdqu xmm2,XMMWORD[96+rsi]
1166 movdqu XMMWORD[48+rdi],xmm7
1167 movdqu xmm7,XMMWORD[112+rsi]
1168 pxor xmm6,XMMWORD[16+rsp]
1169 pxor xmm11,xmm13
1170 pxor xmm2,xmm5
1171 pxor xmm7,xmm1
1172 movdqu XMMWORD[64+rdi],xmm6
1173 movdqu XMMWORD[80+rdi],xmm11
1174 movdqu XMMWORD[96+rdi],xmm2
1175 movdqu XMMWORD[112+rdi],xmm7
; ZF still reflects `cmp rdx,128` from $L$tail4x: exactly 128 bytes -> done.
1176 je NEAR $L$done4x
1177
; Stage block 2 and step the pointers past the 128 emitted bytes.
1178 movdqa xmm6,XMMWORD[32+rsp]
1179 lea rsi,[128+rsi]
1180 xor r10,r10
1181 movdqa XMMWORD[rsp],xmm6
1182 movdqa XMMWORD[16+rsp],xmm10
1183 lea rdi,[128+rdi]
1184 movdqa XMMWORD[32+rsp],xmm14
1185 sub rdx,128
1186 movdqa XMMWORD[48+rsp],xmm8
1187 jmp NEAR $L$oop_tail4x
1188
; 192..255 bytes remain: emit three whole blocks (keystream blocks 0..2),
; stage block 3's keystream at [rsp..rsp+63], and fall into the bytewise loop.
1189ALIGN 32
1190$L$192_or_more4x:
1191 movdqu xmm6,XMMWORD[rsi]
1192 movdqu xmm11,XMMWORD[16+rsi]
1193 movdqu xmm2,XMMWORD[32+rsi]
1194 movdqu xmm7,XMMWORD[48+rsi]
1195 pxor xmm6,XMMWORD[rsp]
1196 pxor xmm11,xmm12
1197 pxor xmm2,xmm4
1198 pxor xmm7,xmm0
1199
1200 movdqu XMMWORD[rdi],xmm6
1201 movdqu xmm6,XMMWORD[64+rsi]
1202 movdqu XMMWORD[16+rdi],xmm11
1203 movdqu xmm11,XMMWORD[80+rsi]
1204 movdqu XMMWORD[32+rdi],xmm2
1205 movdqu xmm2,XMMWORD[96+rsi]
1206 movdqu XMMWORD[48+rdi],xmm7
1207 movdqu xmm7,XMMWORD[112+rsi]
1208 lea rsi,[128+rsi]
1209 pxor xmm6,XMMWORD[16+rsp]
1210 pxor xmm11,xmm13
1211 pxor xmm2,xmm5
1212 pxor xmm7,xmm1
1213
1214 movdqu XMMWORD[64+rdi],xmm6
1215 movdqu xmm6,XMMWORD[rsi]
1216 movdqu XMMWORD[80+rdi],xmm11
1217 movdqu xmm11,XMMWORD[16+rsi]
1218 movdqu XMMWORD[96+rdi],xmm2
1219 movdqu xmm2,XMMWORD[32+rsi]
1220 movdqu XMMWORD[112+rdi],xmm7
1221 lea rdi,[128+rdi]
1222 movdqu xmm7,XMMWORD[48+rsi]
1223 pxor xmm6,XMMWORD[32+rsp]
1224 pxor xmm11,xmm10
1225 pxor xmm2,xmm14
1226 pxor xmm7,xmm8
1227 movdqu XMMWORD[rdi],xmm6
1228 movdqu XMMWORD[16+rdi],xmm11
1229 movdqu XMMWORD[32+rdi],xmm2
1230 movdqu XMMWORD[48+rdi],xmm7
; ZF still reflects `cmp rdx,192` from $L$tail4x (lea and the SSE
; instructions above leave flags untouched): exactly 192 bytes -> done.
1231 je NEAR $L$done4x
1232
; Stage block 3. rsi/rdi were already advanced by 128 above, so +64 here
; puts them 192 bytes past where they started.
1233 movdqa xmm6,XMMWORD[48+rsp]
1234 lea rsi,[64+rsi]
1235 xor r10,r10
1236 movdqa XMMWORD[rsp],xmm6
1237 movdqa XMMWORD[16+rsp],xmm15
1238 lea rdi,[64+rdi]
1239 movdqa XMMWORD[32+rsp],xmm9
1240 sub rdx,192
1241 movdqa XMMWORD[48+rsp],xmm3
1242
; Shared bytewise tail for all 4x remainders:
; out[i] = in[i] ^ keystream[i], with r10 = i and rdx = bytes left (1..63).
1243$L$oop_tail4x:
1244 movzx eax,BYTE[r10*1+rsi]
1245 movzx ecx,BYTE[r10*1+rsp]
1246 lea r10,[1+r10]
1247 xor eax,ecx
1248 mov BYTE[((-1))+r10*1+rdi],al
1249 dec rdx
1250 jnz NEAR $L$oop_tail4x
1251
; 4x-path epilogue: restore callee-saved xmm6..xmm15 from the save area
; below the frame anchor r9, release the frame, then reload rdi/rsi from
; the slots the prologue stored them in and return.
1252$L$done4x:
1253 movaps xmm6,XMMWORD[((-168))+r9]
1254 movaps xmm7,XMMWORD[((-152))+r9]
1255 movaps xmm8,XMMWORD[((-136))+r9]
1256 movaps xmm9,XMMWORD[((-120))+r9]
1257 movaps xmm10,XMMWORD[((-104))+r9]
1258 movaps xmm11,XMMWORD[((-88))+r9]
1259 movaps xmm12,XMMWORD[((-72))+r9]
1260 movaps xmm13,XMMWORD[((-56))+r9]
1261 movaps xmm14,XMMWORD[((-40))+r9]
1262 movaps xmm15,XMMWORD[((-24))+r9]
1263 lea rsp,[r9]
1264
1265$L$4x_epilogue:
1266 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
1267 mov rsi,QWORD[16+rsp]
1268 DB 0F3h,0C3h ;repret
1269
1270$L$SEH_end_ChaCha20_4x:
1271
1272ALIGN 32
1273ChaCha20_4xop:
1274 mov QWORD[8+rsp],rdi ;WIN64 prologue
1275 mov QWORD[16+rsp],rsi
1276 mov rax,rsp
1277$L$SEH_begin_ChaCha20_4xop:
1278 mov rdi,rcx
1279 mov rsi,rdx
1280 mov rdx,r8
1281 mov rcx,r9
1282 mov r8,QWORD[40+rsp]
1283
1284
1285
1286$L$ChaCha20_4xop:
1287 mov r9,rsp
1288
1289 sub rsp,0x140+168
1290 movaps XMMWORD[(-168)+r9],xmm6
1291 movaps XMMWORD[(-152)+r9],xmm7
1292 movaps XMMWORD[(-136)+r9],xmm8
1293 movaps XMMWORD[(-120)+r9],xmm9
1294 movaps XMMWORD[(-104)+r9],xmm10
1295 movaps XMMWORD[(-88)+r9],xmm11
1296 movaps XMMWORD[(-72)+r9],xmm12
1297 movaps XMMWORD[(-56)+r9],xmm13
1298 movaps XMMWORD[(-40)+r9],xmm14
1299 movaps XMMWORD[(-24)+r9],xmm15
1300$L$4xop_body:
1301 vzeroupper
1302
1303 vmovdqa xmm11,XMMWORD[$L$sigma]
1304 vmovdqu xmm3,XMMWORD[rcx]
1305 vmovdqu xmm15,XMMWORD[16+rcx]
1306 vmovdqu xmm7,XMMWORD[r8]
1307 lea rcx,[256+rsp]
1308
1309 vpshufd xmm8,xmm11,0x00
1310 vpshufd xmm9,xmm11,0x55
1311 vmovdqa XMMWORD[64+rsp],xmm8
1312 vpshufd xmm10,xmm11,0xaa
1313 vmovdqa XMMWORD[80+rsp],xmm9
1314 vpshufd xmm11,xmm11,0xff
1315 vmovdqa XMMWORD[96+rsp],xmm10
1316 vmovdqa XMMWORD[112+rsp],xmm11
1317
1318 vpshufd xmm0,xmm3,0x00
1319 vpshufd xmm1,xmm3,0x55
1320 vmovdqa XMMWORD[(128-256)+rcx],xmm0
1321 vpshufd xmm2,xmm3,0xaa
1322 vmovdqa XMMWORD[(144-256)+rcx],xmm1
1323 vpshufd xmm3,xmm3,0xff
1324 vmovdqa XMMWORD[(160-256)+rcx],xmm2
1325 vmovdqa XMMWORD[(176-256)+rcx],xmm3
1326
1327 vpshufd xmm12,xmm15,0x00
1328 vpshufd xmm13,xmm15,0x55
1329 vmovdqa XMMWORD[(192-256)+rcx],xmm12
1330 vpshufd xmm14,xmm15,0xaa
1331 vmovdqa XMMWORD[(208-256)+rcx],xmm13
1332 vpshufd xmm15,xmm15,0xff
1333 vmovdqa XMMWORD[(224-256)+rcx],xmm14
1334 vmovdqa XMMWORD[(240-256)+rcx],xmm15
1335
1336 vpshufd xmm4,xmm7,0x00
1337 vpshufd xmm5,xmm7,0x55
1338 vpaddd xmm4,xmm4,XMMWORD[$L$inc]
1339 vpshufd xmm6,xmm7,0xaa
1340 vmovdqa XMMWORD[(272-256)+rcx],xmm5
1341 vpshufd xmm7,xmm7,0xff
1342 vmovdqa XMMWORD[(288-256)+rcx],xmm6
1343 vmovdqa XMMWORD[(304-256)+rcx],xmm7
1344
1345 jmp NEAR $L$oop_enter4xop
1346
1347ALIGN 32
1348$L$oop_outer4xop:
1349 vmovdqa xmm8,XMMWORD[64+rsp]
1350 vmovdqa xmm9,XMMWORD[80+rsp]
1351 vmovdqa xmm10,XMMWORD[96+rsp]
1352 vmovdqa xmm11,XMMWORD[112+rsp]
1353 vmovdqa xmm0,XMMWORD[((128-256))+rcx]
1354 vmovdqa xmm1,XMMWORD[((144-256))+rcx]
1355 vmovdqa xmm2,XMMWORD[((160-256))+rcx]
1356 vmovdqa xmm3,XMMWORD[((176-256))+rcx]
1357 vmovdqa xmm12,XMMWORD[((192-256))+rcx]
1358 vmovdqa xmm13,XMMWORD[((208-256))+rcx]
1359 vmovdqa xmm14,XMMWORD[((224-256))+rcx]
1360 vmovdqa xmm15,XMMWORD[((240-256))+rcx]
1361 vmovdqa xmm4,XMMWORD[((256-256))+rcx]
1362 vmovdqa xmm5,XMMWORD[((272-256))+rcx]
1363 vmovdqa xmm6,XMMWORD[((288-256))+rcx]
1364 vmovdqa xmm7,XMMWORD[((304-256))+rcx]
1365 vpaddd xmm4,xmm4,XMMWORD[$L$four]
1366
1367$L$oop_enter4xop:
1368 mov eax,10
1369 vmovdqa XMMWORD[(256-256)+rcx],xmm4
1370 jmp NEAR $L$oop4xop
1371
1372ALIGN 32
1373$L$oop4xop:
1374 vpaddd xmm8,xmm8,xmm0
1375 vpaddd xmm9,xmm9,xmm1
1376 vpaddd xmm10,xmm10,xmm2
1377 vpaddd xmm11,xmm11,xmm3
1378 vpxor xmm4,xmm8,xmm4
1379 vpxor xmm5,xmm9,xmm5
1380 vpxor xmm6,xmm10,xmm6
1381 vpxor xmm7,xmm11,xmm7
1382DB 143,232,120,194,228,16
1383DB 143,232,120,194,237,16
1384DB 143,232,120,194,246,16
1385DB 143,232,120,194,255,16
1386 vpaddd xmm12,xmm12,xmm4
1387 vpaddd xmm13,xmm13,xmm5
1388 vpaddd xmm14,xmm14,xmm6
1389 vpaddd xmm15,xmm15,xmm7
1390 vpxor xmm0,xmm12,xmm0
1391 vpxor xmm1,xmm13,xmm1
1392 vpxor xmm2,xmm2,xmm14
1393 vpxor xmm3,xmm3,xmm15
1394DB 143,232,120,194,192,12
1395DB 143,232,120,194,201,12
1396DB 143,232,120,194,210,12
1397DB 143,232,120,194,219,12
1398 vpaddd xmm8,xmm0,xmm8
1399 vpaddd xmm9,xmm1,xmm9
1400 vpaddd xmm10,xmm10,xmm2
1401 vpaddd xmm11,xmm11,xmm3
1402 vpxor xmm4,xmm8,xmm4
1403 vpxor xmm5,xmm9,xmm5
1404 vpxor xmm6,xmm10,xmm6
1405 vpxor xmm7,xmm11,xmm7
1406DB 143,232,120,194,228,8
1407DB 143,232,120,194,237,8
1408DB 143,232,120,194,246,8
1409DB 143,232,120,194,255,8
1410 vpaddd xmm12,xmm12,xmm4
1411 vpaddd xmm13,xmm13,xmm5
1412 vpaddd xmm14,xmm14,xmm6
1413 vpaddd xmm15,xmm15,xmm7
1414 vpxor xmm0,xmm12,xmm0
1415 vpxor xmm1,xmm13,xmm1
1416 vpxor xmm2,xmm2,xmm14
1417 vpxor xmm3,xmm3,xmm15
1418DB 143,232,120,194,192,7
1419DB 143,232,120,194,201,7
1420DB 143,232,120,194,210,7
1421DB 143,232,120,194,219,7
1422 vpaddd xmm8,xmm8,xmm1
1423 vpaddd xmm9,xmm9,xmm2
1424 vpaddd xmm10,xmm10,xmm3
1425 vpaddd xmm11,xmm11,xmm0
1426 vpxor xmm7,xmm8,xmm7
1427 vpxor xmm4,xmm9,xmm4
1428 vpxor xmm5,xmm10,xmm5
1429 vpxor xmm6,xmm11,xmm6
1430DB 143,232,120,194,255,16
1431DB 143,232,120,194,228,16
1432DB 143,232,120,194,237,16
1433DB 143,232,120,194,246,16
1434 vpaddd xmm14,xmm14,xmm7
1435 vpaddd xmm15,xmm15,xmm4
1436 vpaddd xmm12,xmm12,xmm5
1437 vpaddd xmm13,xmm13,xmm6
1438 vpxor xmm1,xmm14,xmm1
1439 vpxor xmm2,xmm15,xmm2
1440 vpxor xmm3,xmm3,xmm12
1441 vpxor xmm0,xmm0,xmm13
1442DB 143,232,120,194,201,12
1443DB 143,232,120,194,210,12
1444DB 143,232,120,194,219,12
1445DB 143,232,120,194,192,12
1446 vpaddd xmm8,xmm1,xmm8
1447 vpaddd xmm9,xmm2,xmm9
1448 vpaddd xmm10,xmm10,xmm3
1449 vpaddd xmm11,xmm11,xmm0
1450 vpxor xmm7,xmm8,xmm7
1451 vpxor xmm4,xmm9,xmm4
1452 vpxor xmm5,xmm10,xmm5
1453 vpxor xmm6,xmm11,xmm6
1454DB 143,232,120,194,255,8
1455DB 143,232,120,194,228,8
1456DB 143,232,120,194,237,8
1457DB 143,232,120,194,246,8
1458 vpaddd xmm14,xmm14,xmm7
1459 vpaddd xmm15,xmm15,xmm4
1460 vpaddd xmm12,xmm12,xmm5
1461 vpaddd xmm13,xmm13,xmm6
1462 vpxor xmm1,xmm14,xmm1
1463 vpxor xmm2,xmm15,xmm2
1464 vpxor xmm3,xmm3,xmm12
1465 vpxor xmm0,xmm0,xmm13
1466DB 143,232,120,194,201,7
1467DB 143,232,120,194,210,7
1468DB 143,232,120,194,219,7
1469DB 143,232,120,194,192,7
1470 dec eax
1471 jnz NEAR $L$oop4xop
1472
1473 vpaddd xmm8,xmm8,XMMWORD[64+rsp]
1474 vpaddd xmm9,xmm9,XMMWORD[80+rsp]
1475 vpaddd xmm10,xmm10,XMMWORD[96+rsp]
1476 vpaddd xmm11,xmm11,XMMWORD[112+rsp]
1477
1478 vmovdqa XMMWORD[32+rsp],xmm14
1479 vmovdqa XMMWORD[48+rsp],xmm15
1480
1481 vpunpckldq xmm14,xmm8,xmm9
1482 vpunpckldq xmm15,xmm10,xmm11
1483 vpunpckhdq xmm8,xmm8,xmm9
1484 vpunpckhdq xmm10,xmm10,xmm11
1485 vpunpcklqdq xmm9,xmm14,xmm15
1486 vpunpckhqdq xmm14,xmm14,xmm15
1487 vpunpcklqdq xmm11,xmm8,xmm10
1488 vpunpckhqdq xmm8,xmm8,xmm10
1489 vpaddd xmm0,xmm0,XMMWORD[((128-256))+rcx]
1490 vpaddd xmm1,xmm1,XMMWORD[((144-256))+rcx]
1491 vpaddd xmm2,xmm2,XMMWORD[((160-256))+rcx]
1492 vpaddd xmm3,xmm3,XMMWORD[((176-256))+rcx]
1493
1494 vmovdqa XMMWORD[rsp],xmm9
1495 vmovdqa XMMWORD[16+rsp],xmm14
1496 vmovdqa xmm9,XMMWORD[32+rsp]
1497 vmovdqa xmm14,XMMWORD[48+rsp]
1498
1499 vpunpckldq xmm10,xmm0,xmm1
1500 vpunpckldq xmm15,xmm2,xmm3
1501 vpunpckhdq xmm0,xmm0,xmm1
1502 vpunpckhdq xmm2,xmm2,xmm3
1503 vpunpcklqdq xmm1,xmm10,xmm15
1504 vpunpckhqdq xmm10,xmm10,xmm15
1505 vpunpcklqdq xmm3,xmm0,xmm2
1506 vpunpckhqdq xmm0,xmm0,xmm2
1507 vpaddd xmm12,xmm12,XMMWORD[((192-256))+rcx]
1508 vpaddd xmm13,xmm13,XMMWORD[((208-256))+rcx]
1509 vpaddd xmm9,xmm9,XMMWORD[((224-256))+rcx]
1510 vpaddd xmm14,xmm14,XMMWORD[((240-256))+rcx]
1511
1512 vpunpckldq xmm2,xmm12,xmm13
1513 vpunpckldq xmm15,xmm9,xmm14
1514 vpunpckhdq xmm12,xmm12,xmm13
1515 vpunpckhdq xmm9,xmm9,xmm14
1516 vpunpcklqdq xmm13,xmm2,xmm15
1517 vpunpckhqdq xmm2,xmm2,xmm15
1518 vpunpcklqdq xmm14,xmm12,xmm9
1519 vpunpckhqdq xmm12,xmm12,xmm9
1520 vpaddd xmm4,xmm4,XMMWORD[((256-256))+rcx]
1521 vpaddd xmm5,xmm5,XMMWORD[((272-256))+rcx]
1522 vpaddd xmm6,xmm6,XMMWORD[((288-256))+rcx]
1523 vpaddd xmm7,xmm7,XMMWORD[((304-256))+rcx]
1524
1525 vpunpckldq xmm9,xmm4,xmm5
1526 vpunpckldq xmm15,xmm6,xmm7
1527 vpunpckhdq xmm4,xmm4,xmm5
1528 vpunpckhdq xmm6,xmm6,xmm7
1529 vpunpcklqdq xmm5,xmm9,xmm15
1530 vpunpckhqdq xmm9,xmm9,xmm15
1531 vpunpcklqdq xmm7,xmm4,xmm6
1532 vpunpckhqdq xmm4,xmm4,xmm6
1533 vmovdqa xmm6,XMMWORD[rsp]
1534 vmovdqa xmm15,XMMWORD[16+rsp]
1535
1536 cmp rdx,64*4
1537 jb NEAR $L$tail4xop
1538
1539 vpxor xmm6,xmm6,XMMWORD[rsi]
1540 vpxor xmm1,xmm1,XMMWORD[16+rsi]
1541 vpxor xmm13,xmm13,XMMWORD[32+rsi]
1542 vpxor xmm5,xmm5,XMMWORD[48+rsi]
1543 vpxor xmm15,xmm15,XMMWORD[64+rsi]
1544 vpxor xmm10,xmm10,XMMWORD[80+rsi]
1545 vpxor xmm2,xmm2,XMMWORD[96+rsi]
1546 vpxor xmm9,xmm9,XMMWORD[112+rsi]
1547 lea rsi,[128+rsi]
1548 vpxor xmm11,xmm11,XMMWORD[rsi]
1549 vpxor xmm3,xmm3,XMMWORD[16+rsi]
1550 vpxor xmm14,xmm14,XMMWORD[32+rsi]
1551 vpxor xmm7,xmm7,XMMWORD[48+rsi]
1552 vpxor xmm8,xmm8,XMMWORD[64+rsi]
1553 vpxor xmm0,xmm0,XMMWORD[80+rsi]
1554 vpxor xmm12,xmm12,XMMWORD[96+rsi]
1555 vpxor xmm4,xmm4,XMMWORD[112+rsi]
1556 lea rsi,[128+rsi]
1557
1558 vmovdqu XMMWORD[rdi],xmm6
1559 vmovdqu XMMWORD[16+rdi],xmm1
1560 vmovdqu XMMWORD[32+rdi],xmm13
1561 vmovdqu XMMWORD[48+rdi],xmm5
1562 vmovdqu XMMWORD[64+rdi],xmm15
1563 vmovdqu XMMWORD[80+rdi],xmm10
1564 vmovdqu XMMWORD[96+rdi],xmm2
1565 vmovdqu XMMWORD[112+rdi],xmm9
1566 lea rdi,[128+rdi]
1567 vmovdqu XMMWORD[rdi],xmm11
1568 vmovdqu XMMWORD[16+rdi],xmm3
1569 vmovdqu XMMWORD[32+rdi],xmm14
1570 vmovdqu XMMWORD[48+rdi],xmm7
1571 vmovdqu XMMWORD[64+rdi],xmm8
1572 vmovdqu XMMWORD[80+rdi],xmm0
1573 vmovdqu XMMWORD[96+rdi],xmm12
1574 vmovdqu XMMWORD[112+rdi],xmm4
1575 lea rdi,[128+rdi]
1576
1577 sub rdx,64*4
1578 jnz NEAR $L$oop_outer4xop
1579
1580 jmp NEAR $L$done4xop
1581
1582ALIGN 32
1583$L$tail4xop:
1584 cmp rdx,192
1585 jae NEAR $L$192_or_more4xop
1586 cmp rdx,128
1587 jae NEAR $L$128_or_more4xop
1588 cmp rdx,64
1589 jae NEAR $L$64_or_more4xop
1590
1591 xor r10,r10
1592 vmovdqa XMMWORD[rsp],xmm6
1593 vmovdqa XMMWORD[16+rsp],xmm1
1594 vmovdqa XMMWORD[32+rsp],xmm13
1595 vmovdqa XMMWORD[48+rsp],xmm5
1596 jmp NEAR $L$oop_tail4xop
1597
1598ALIGN 32
1599$L$64_or_more4xop:
1600 vpxor xmm6,xmm6,XMMWORD[rsi]
1601 vpxor xmm1,xmm1,XMMWORD[16+rsi]
1602 vpxor xmm13,xmm13,XMMWORD[32+rsi]
1603 vpxor xmm5,xmm5,XMMWORD[48+rsi]
1604 vmovdqu XMMWORD[rdi],xmm6
1605 vmovdqu XMMWORD[16+rdi],xmm1
1606 vmovdqu XMMWORD[32+rdi],xmm13
1607 vmovdqu XMMWORD[48+rdi],xmm5
1608 je NEAR $L$done4xop
1609
1610 lea rsi,[64+rsi]
1611 vmovdqa XMMWORD[rsp],xmm15
1612 xor r10,r10
1613 vmovdqa XMMWORD[16+rsp],xmm10
1614 lea rdi,[64+rdi]
1615 vmovdqa XMMWORD[32+rsp],xmm2
1616 sub rdx,64
1617 vmovdqa XMMWORD[48+rsp],xmm9
1618 jmp NEAR $L$oop_tail4xop
1619
1620ALIGN 32
1621$L$128_or_more4xop:
1622 vpxor xmm6,xmm6,XMMWORD[rsi]
1623 vpxor xmm1,xmm1,XMMWORD[16+rsi]
1624 vpxor xmm13,xmm13,XMMWORD[32+rsi]
1625 vpxor xmm5,xmm5,XMMWORD[48+rsi]
1626 vpxor xmm15,xmm15,XMMWORD[64+rsi]
1627 vpxor xmm10,xmm10,XMMWORD[80+rsi]
1628 vpxor xmm2,xmm2,XMMWORD[96+rsi]
1629 vpxor xmm9,xmm9,XMMWORD[112+rsi]
1630
1631 vmovdqu XMMWORD[rdi],xmm6
1632 vmovdqu XMMWORD[16+rdi],xmm1
1633 vmovdqu XMMWORD[32+rdi],xmm13
1634 vmovdqu XMMWORD[48+rdi],xmm5
1635 vmovdqu XMMWORD[64+rdi],xmm15
1636 vmovdqu XMMWORD[80+rdi],xmm10
1637 vmovdqu XMMWORD[96+rdi],xmm2
1638 vmovdqu XMMWORD[112+rdi],xmm9
1639 je NEAR $L$done4xop
1640
1641 lea rsi,[128+rsi]
1642 vmovdqa XMMWORD[rsp],xmm11
1643 xor r10,r10
1644 vmovdqa XMMWORD[16+rsp],xmm3
1645 lea rdi,[128+rdi]
1646 vmovdqa XMMWORD[32+rsp],xmm14
1647 sub rdx,128
1648 vmovdqa XMMWORD[48+rsp],xmm7
1649 jmp NEAR $L$oop_tail4xop
1650
1651ALIGN 32
1652$L$192_or_more4xop:
1653 vpxor xmm6,xmm6,XMMWORD[rsi]
1654 vpxor xmm1,xmm1,XMMWORD[16+rsi]
1655 vpxor xmm13,xmm13,XMMWORD[32+rsi]
1656 vpxor xmm5,xmm5,XMMWORD[48+rsi]
1657 vpxor xmm15,xmm15,XMMWORD[64+rsi]
1658 vpxor xmm10,xmm10,XMMWORD[80+rsi]
1659 vpxor xmm2,xmm2,XMMWORD[96+rsi]
1660 vpxor xmm9,xmm9,XMMWORD[112+rsi]
1661 lea rsi,[128+rsi]
1662 vpxor xmm11,xmm11,XMMWORD[rsi]
1663 vpxor xmm3,xmm3,XMMWORD[16+rsi]
1664 vpxor xmm14,xmm14,XMMWORD[32+rsi]
1665 vpxor xmm7,xmm7,XMMWORD[48+rsi]
1666
1667 vmovdqu XMMWORD[rdi],xmm6
1668 vmovdqu XMMWORD[16+rdi],xmm1
1669 vmovdqu XMMWORD[32+rdi],xmm13
1670 vmovdqu XMMWORD[48+rdi],xmm5
1671 vmovdqu XMMWORD[64+rdi],xmm15
1672 vmovdqu XMMWORD[80+rdi],xmm10
1673 vmovdqu XMMWORD[96+rdi],xmm2
1674 vmovdqu XMMWORD[112+rdi],xmm9
1675 lea rdi,[128+rdi]
1676 vmovdqu XMMWORD[rdi],xmm11
1677 vmovdqu XMMWORD[16+rdi],xmm3
1678 vmovdqu XMMWORD[32+rdi],xmm14
1679 vmovdqu XMMWORD[48+rdi],xmm7
1680 je NEAR $L$done4xop
1681
1682 lea rsi,[64+rsi]
1683 vmovdqa XMMWORD[rsp],xmm8
1684 xor r10,r10
1685 vmovdqa XMMWORD[16+rsp],xmm0
1686 lea rdi,[64+rdi]
1687 vmovdqa XMMWORD[32+rsp],xmm12
1688 sub rdx,192
1689 vmovdqa XMMWORD[48+rsp],xmm4
1690
1691$L$oop_tail4xop:
1692 movzx eax,BYTE[r10*1+rsi]
1693 movzx ecx,BYTE[r10*1+rsp]
1694 lea r10,[1+r10]
1695 xor eax,ecx
1696 mov BYTE[((-1))+r10*1+rdi],al
1697 dec rdx
1698 jnz NEAR $L$oop_tail4xop
1699
1700$L$done4xop:
1701 vzeroupper
1702 movaps xmm6,XMMWORD[((-168))+r9]
1703 movaps xmm7,XMMWORD[((-152))+r9]
1704 movaps xmm8,XMMWORD[((-136))+r9]
1705 movaps xmm9,XMMWORD[((-120))+r9]
1706 movaps xmm10,XMMWORD[((-104))+r9]
1707 movaps xmm11,XMMWORD[((-88))+r9]
1708 movaps xmm12,XMMWORD[((-72))+r9]
1709 movaps xmm13,XMMWORD[((-56))+r9]
1710 movaps xmm14,XMMWORD[((-40))+r9]
1711 movaps xmm15,XMMWORD[((-24))+r9]
1712 lea rsp,[r9]
1713
1714$L$4xop_epilogue:
1715 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
1716 mov rsi,QWORD[16+rsp]
1717 DB 0F3h,0C3h ;repret
1718
1719$L$SEH_end_ChaCha20_4xop:
1720
1721ALIGN 32
;-----------------------------------------------------------------------
; ChaCha20_8x -- AVX2 code path: computes eight 64-byte keystream blocks
; per outer iteration (one block per 32-bit lane of each ymm register)
; and XORs them into the data stream.
;
; ABI:  Microsoft x64 on entry (rcx, rdx, r8, r9, [40+rsp]); the prologue
;       remaps these to rdi, rsi, rdx, rcx, r8 and the epilogue restores
;       rdi/rsi from the caller's home slots.
; Register roles after the remap (as used by the code below):
;       rdi = output pointer          (target of the vmovdqu stores)
;       rsi = input pointer           (source of the vpxor loads)
;       rdx = number of bytes left
;       rcx = pointer to 32 bytes loaded as state rows 1-2
;             (presumably the key -- confirm against caller)
;       r8  = pointer to 16 bytes loaded as state row 3; dword 0 of it is
;             the value that is incremented per block
;       r9  = saved entry rsp (frame anchor for the xmm6-xmm15 save area)
; Clobbers: rax, rcx, r10, r11, flags, all ymm registers (vzeroall on exit;
;           xmm6-xmm15 are restored afterwards).
; NOTE(review): the byte-wise tail loop assumes rdx != 0 on entry; the
; zero-length check visible in this file lives in ChaCha20_ctr32.
;-----------------------------------------------------------------------
1722ChaCha20_8x:
1723 mov QWORD[8+rsp],rdi ;WIN64 prologue
1724 mov QWORD[16+rsp],rsi
1725 mov rax,rsp
1726$L$SEH_begin_ChaCha20_8x:
1727 mov rdi,rcx
1728 mov rsi,rdx
1729 mov rdx,r8
1730 mov rcx,r9
1731 mov r8,QWORD[40+rsp]
1732
1733
1734
; Internal entry label placed after the argument remap.
; NOTE(review): presumably the target of a dispatcher jump made with the
; registers already remapped (the jump itself is not in this part of the file).
1735$L$ChaCha20_8x:
1736 mov r9,rsp
1737
; Reserve the working frame plus room for the xmm save area, then align rsp
; to 32 bytes because the frame is accessed with aligned ymm moves (vmovdqa).
1738 sub rsp,0x280+168
1739 and rsp,-32
; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI; park them just below
; the entry stack pointer (addressed via r9, not the realigned rsp).
1740 movaps XMMWORD[(-168)+r9],xmm6
1741 movaps XMMWORD[(-152)+r9],xmm7
1742 movaps XMMWORD[(-136)+r9],xmm8
1743 movaps XMMWORD[(-120)+r9],xmm9
1744 movaps XMMWORD[(-104)+r9],xmm10
1745 movaps XMMWORD[(-88)+r9],xmm11
1746 movaps XMMWORD[(-72)+r9],xmm12
1747 movaps XMMWORD[(-56)+r9],xmm13
1748 movaps XMMWORD[(-40)+r9],xmm14
1749 movaps XMMWORD[(-24)+r9],xmm15
1750$L$8x_body:
1751 vzeroupper
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
; Load the four 16-byte state rows, each duplicated into both 128-bit halves.
1762 vbroadcasti128 ymm11,XMMWORD[$L$sigma]
1763 vbroadcasti128 ymm3,XMMWORD[rcx]
1764 vbroadcasti128 ymm15,XMMWORD[16+rcx]
1765 vbroadcasti128 ymm7,XMMWORD[r8]
; rcx and rax now become biased frame bases (rsp+256 / rsp+512); every
; displacement used against them below lies in [-128,96], i.e. fits in disp8.
1766 lea rcx,[256+rsp]
1767 lea rax,[512+rsp]
; r10/r11 point at the vpshufb masks: $L$rot16 rotates each dword left by 16
; bits, $L$rot24 rotates each dword left by 8 bits.
1768 lea r10,[$L$rot16]
1769 lea r11,[$L$rot24]
1770
; "Smash" each row: broadcast every dword of a row across a whole ymm register
; so that lane i of all 16 registers together forms the state of block i.
; The broadcast copies are also stored in the frame for reuse per iteration.
; Row 0 (constants) -> ymm8..ymm11
1771 vpshufd ymm8,ymm11,0x00
1772 vpshufd ymm9,ymm11,0x55
1773 vmovdqa YMMWORD[(128-256)+rcx],ymm8
1774 vpshufd ymm10,ymm11,0xaa
1775 vmovdqa YMMWORD[(160-256)+rcx],ymm9
1776 vpshufd ymm11,ymm11,0xff
1777 vmovdqa YMMWORD[(192-256)+rcx],ymm10
1778 vmovdqa YMMWORD[(224-256)+rcx],ymm11
1779
; Row 1 -> ymm0..ymm3
1780 vpshufd ymm0,ymm3,0x00
1781 vpshufd ymm1,ymm3,0x55
1782 vmovdqa YMMWORD[(256-256)+rcx],ymm0
1783 vpshufd ymm2,ymm3,0xaa
1784 vmovdqa YMMWORD[(288-256)+rcx],ymm1
1785 vpshufd ymm3,ymm3,0xff
1786 vmovdqa YMMWORD[(320-256)+rcx],ymm2
1787 vmovdqa YMMWORD[(352-256)+rcx],ymm3
1788
; Row 2 -> ymm12..ymm15
1789 vpshufd ymm12,ymm15,0x00
1790 vpshufd ymm13,ymm15,0x55
1791 vmovdqa YMMWORD[(384-512)+rax],ymm12
1792 vpshufd ymm14,ymm15,0xaa
1793 vmovdqa YMMWORD[(416-512)+rax],ymm13
1794 vpshufd ymm15,ymm15,0xff
1795 vmovdqa YMMWORD[(448-512)+rax],ymm14
1796 vmovdqa YMMWORD[(480-512)+rax],ymm15
1797
; Row 3 -> ymm4..ymm7. Dword 0 gets the per-lane offsets from $L$incy added,
; giving each of the eight lanes its own block number; it is stored to the
; frame at $L$oop_enter8x rather than here.
1798 vpshufd ymm4,ymm7,0x00
1799 vpshufd ymm5,ymm7,0x55
1800 vpaddd ymm4,ymm4,YMMWORD[$L$incy]
1801 vpshufd ymm6,ymm7,0xaa
1802 vmovdqa YMMWORD[(544-512)+rax],ymm5
1803 vpshufd ymm7,ymm7,0xff
1804 vmovdqa YMMWORD[(576-512)+rax],ymm6
1805 vmovdqa YMMWORD[(608-512)+rax],ymm7
1806
1807 jmp NEAR $L$oop_enter8x
1808
1809ALIGN 32
; Outer loop (second and later passes): reload the saved input state and
; advance every lane's block counter by 8.
1810$L$oop_outer8x:
1811 vmovdqa ymm8,YMMWORD[((128-256))+rcx]
1812 vmovdqa ymm9,YMMWORD[((160-256))+rcx]
1813 vmovdqa ymm10,YMMWORD[((192-256))+rcx]
1814 vmovdqa ymm11,YMMWORD[((224-256))+rcx]
1815 vmovdqa ymm0,YMMWORD[((256-256))+rcx]
1816 vmovdqa ymm1,YMMWORD[((288-256))+rcx]
1817 vmovdqa ymm2,YMMWORD[((320-256))+rcx]
1818 vmovdqa ymm3,YMMWORD[((352-256))+rcx]
1819 vmovdqa ymm12,YMMWORD[((384-512))+rax]
1820 vmovdqa ymm13,YMMWORD[((416-512))+rax]
1821 vmovdqa ymm14,YMMWORD[((448-512))+rax]
1822 vmovdqa ymm15,YMMWORD[((480-512))+rax]
1823 vmovdqa ymm4,YMMWORD[((512-512))+rax]
1824 vmovdqa ymm5,YMMWORD[((544-512))+rax]
1825 vmovdqa ymm6,YMMWORD[((576-512))+rax]
1826 vmovdqa ymm7,YMMWORD[((608-512))+rax]
1827 vpaddd ymm4,ymm4,YMMWORD[$L$eight]
1828
1829$L$oop_enter8x:
; Spill the last two row-2 words: ymm14/ymm15 are needed inside the round
; loop as shuffle-mask holders and shift temporaries.
1830 vmovdqa YMMWORD[64+rsp],ymm14
1831 vmovdqa YMMWORD[96+rsp],ymm15
1832 vbroadcasti128 ymm15,XMMWORD[r10]
; Record the counter words used for this pass (added back after the rounds).
1833 vmovdqa YMMWORD[(512-512)+rax],ymm4
; eax is the round-loop counter from here on, so rax stops being a frame
; base until it is recomputed after the loop.
1834 mov eax,10
1835 jmp NEAR $L$oop8x
1836
1837ALIGN 32
; Round loop: 10 iterations, each a column round followed by a diagonal round.
; Register allocation: a = ymm8-ymm11, b = ymm0-ymm3, d = ymm4-ymm7,
; c = ymm12/ymm13 (only two of the four c words are resident at a time; the
; other pair lives at [rsp]/[32+rsp] or [64+rsp]/[96+rsp] and is swapped in
; mid-round). ymm14/ymm15 alternate between holding a vpshufb mask (reloaded
; via r10/r11) and being the temporary for the shift-based 12- and 7-bit
; rotates. The instruction streams of two quarter-rounds are interleaved.
1838$L$oop8x:
; --- column round, quarter-rounds on words 0 and 1 ---
1839 vpaddd ymm8,ymm8,ymm0
1840 vpxor ymm4,ymm8,ymm4
1841 vpshufb ymm4,ymm4,ymm15
1842 vpaddd ymm9,ymm9,ymm1
1843 vpxor ymm5,ymm9,ymm5
1844 vpshufb ymm5,ymm5,ymm15
1845 vpaddd ymm12,ymm12,ymm4
1846 vpxor ymm0,ymm12,ymm0
1847 vpslld ymm14,ymm0,12
1848 vpsrld ymm0,ymm0,20
1849 vpor ymm0,ymm14,ymm0
1850 vbroadcasti128 ymm14,XMMWORD[r11]
1851 vpaddd ymm13,ymm13,ymm5
1852 vpxor ymm1,ymm13,ymm1
1853 vpslld ymm15,ymm1,12
1854 vpsrld ymm1,ymm1,20
1855 vpor ymm1,ymm15,ymm1
1856 vpaddd ymm8,ymm8,ymm0
1857 vpxor ymm4,ymm8,ymm4
1858 vpshufb ymm4,ymm4,ymm14
1859 vpaddd ymm9,ymm9,ymm1
1860 vpxor ymm5,ymm9,ymm5
1861 vpshufb ymm5,ymm5,ymm14
1862 vpaddd ymm12,ymm12,ymm4
1863 vpxor ymm0,ymm12,ymm0
1864 vpslld ymm15,ymm0,7
1865 vpsrld ymm0,ymm0,25
1866 vpor ymm0,ymm15,ymm0
1867 vbroadcasti128 ymm15,XMMWORD[r10]
1868 vpaddd ymm13,ymm13,ymm5
1869 vpxor ymm1,ymm13,ymm1
1870 vpslld ymm14,ymm1,7
1871 vpsrld ymm1,ymm1,25
1872 vpor ymm1,ymm14,ymm1
; Swap the resident c pair: park c0/c1, bring in c2/c3.
1873 vmovdqa YMMWORD[rsp],ymm12
1874 vmovdqa YMMWORD[32+rsp],ymm13
1875 vmovdqa ymm12,YMMWORD[64+rsp]
1876 vmovdqa ymm13,YMMWORD[96+rsp]
; --- column round, quarter-rounds on words 2 and 3 ---
1877 vpaddd ymm10,ymm10,ymm2
1878 vpxor ymm6,ymm10,ymm6
1879 vpshufb ymm6,ymm6,ymm15
1880 vpaddd ymm11,ymm11,ymm3
1881 vpxor ymm7,ymm11,ymm7
1882 vpshufb ymm7,ymm7,ymm15
1883 vpaddd ymm12,ymm12,ymm6
1884 vpxor ymm2,ymm12,ymm2
1885 vpslld ymm14,ymm2,12
1886 vpsrld ymm2,ymm2,20
1887 vpor ymm2,ymm14,ymm2
1888 vbroadcasti128 ymm14,XMMWORD[r11]
1889 vpaddd ymm13,ymm13,ymm7
1890 vpxor ymm3,ymm13,ymm3
1891 vpslld ymm15,ymm3,12
1892 vpsrld ymm3,ymm3,20
1893 vpor ymm3,ymm15,ymm3
1894 vpaddd ymm10,ymm10,ymm2
1895 vpxor ymm6,ymm10,ymm6
1896 vpshufb ymm6,ymm6,ymm14
1897 vpaddd ymm11,ymm11,ymm3
1898 vpxor ymm7,ymm11,ymm7
1899 vpshufb ymm7,ymm7,ymm14
1900 vpaddd ymm12,ymm12,ymm6
1901 vpxor ymm2,ymm12,ymm2
1902 vpslld ymm15,ymm2,7
1903 vpsrld ymm2,ymm2,25
1904 vpor ymm2,ymm15,ymm2
1905 vbroadcasti128 ymm15,XMMWORD[r10]
1906 vpaddd ymm13,ymm13,ymm7
1907 vpxor ymm3,ymm13,ymm3
1908 vpslld ymm14,ymm3,7
1909 vpsrld ymm3,ymm3,25
1910 vpor ymm3,ymm14,ymm3
; --- diagonal round: same quarter-round, but a_i is paired with b_(i+1),
; d_(i+3) and the matching c word; first the pairs using c2/c3 ---
1911 vpaddd ymm8,ymm8,ymm1
1912 vpxor ymm7,ymm8,ymm7
1913 vpshufb ymm7,ymm7,ymm15
1914 vpaddd ymm9,ymm9,ymm2
1915 vpxor ymm4,ymm9,ymm4
1916 vpshufb ymm4,ymm4,ymm15
1917 vpaddd ymm12,ymm12,ymm7
1918 vpxor ymm1,ymm12,ymm1
1919 vpslld ymm14,ymm1,12
1920 vpsrld ymm1,ymm1,20
1921 vpor ymm1,ymm14,ymm1
1922 vbroadcasti128 ymm14,XMMWORD[r11]
1923 vpaddd ymm13,ymm13,ymm4
1924 vpxor ymm2,ymm13,ymm2
1925 vpslld ymm15,ymm2,12
1926 vpsrld ymm2,ymm2,20
1927 vpor ymm2,ymm15,ymm2
1928 vpaddd ymm8,ymm8,ymm1
1929 vpxor ymm7,ymm8,ymm7
1930 vpshufb ymm7,ymm7,ymm14
1931 vpaddd ymm9,ymm9,ymm2
1932 vpxor ymm4,ymm9,ymm4
1933 vpshufb ymm4,ymm4,ymm14
1934 vpaddd ymm12,ymm12,ymm7
1935 vpxor ymm1,ymm12,ymm1
1936 vpslld ymm15,ymm1,7
1937 vpsrld ymm1,ymm1,25
1938 vpor ymm1,ymm15,ymm1
1939 vbroadcasti128 ymm15,XMMWORD[r10]
1940 vpaddd ymm13,ymm13,ymm4
1941 vpxor ymm2,ymm13,ymm2
1942 vpslld ymm14,ymm2,7
1943 vpsrld ymm2,ymm2,25
1944 vpor ymm2,ymm14,ymm2
; Swap the resident c pair back: park c2/c3, bring in c0/c1.
1945 vmovdqa YMMWORD[64+rsp],ymm12
1946 vmovdqa YMMWORD[96+rsp],ymm13
1947 vmovdqa ymm12,YMMWORD[rsp]
1948 vmovdqa ymm13,YMMWORD[32+rsp]
; --- diagonal round, remaining two quarter-rounds (using c0/c1) ---
1949 vpaddd ymm10,ymm10,ymm3
1950 vpxor ymm5,ymm10,ymm5
1951 vpshufb ymm5,ymm5,ymm15
1952 vpaddd ymm11,ymm11,ymm0
1953 vpxor ymm6,ymm11,ymm6
1954 vpshufb ymm6,ymm6,ymm15
1955 vpaddd ymm12,ymm12,ymm5
1956 vpxor ymm3,ymm12,ymm3
1957 vpslld ymm14,ymm3,12
1958 vpsrld ymm3,ymm3,20
1959 vpor ymm3,ymm14,ymm3
1960 vbroadcasti128 ymm14,XMMWORD[r11]
1961 vpaddd ymm13,ymm13,ymm6
1962 vpxor ymm0,ymm13,ymm0
1963 vpslld ymm15,ymm0,12
1964 vpsrld ymm0,ymm0,20
1965 vpor ymm0,ymm15,ymm0
1966 vpaddd ymm10,ymm10,ymm3
1967 vpxor ymm5,ymm10,ymm5
1968 vpshufb ymm5,ymm5,ymm14
1969 vpaddd ymm11,ymm11,ymm0
1970 vpxor ymm6,ymm11,ymm6
1971 vpshufb ymm6,ymm6,ymm14
1972 vpaddd ymm12,ymm12,ymm5
1973 vpxor ymm3,ymm12,ymm3
1974 vpslld ymm15,ymm3,7
1975 vpsrld ymm3,ymm3,25
1976 vpor ymm3,ymm15,ymm3
1977 vbroadcasti128 ymm15,XMMWORD[r10]
1978 vpaddd ymm13,ymm13,ymm6
1979 vpxor ymm0,ymm13,ymm0
1980 vpslld ymm14,ymm0,7
1981 vpsrld ymm0,ymm0,25
1982 vpor ymm0,ymm14,ymm0
1983 dec eax
1984 jnz NEAR $L$oop8x
1985
; Rounds done. rax was consumed as the loop counter, so rebuild the frame base.
1986 lea rax,[512+rsp]
; Feed-forward: add the saved input state to the round output, then transpose
; so registers hold contiguous keystream bytes instead of one state word for
; eight blocks. Row 0:
1987 vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx]
1988 vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx]
1989 vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx]
1990 vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx]
1991
1992 vpunpckldq ymm14,ymm8,ymm9
1993 vpunpckldq ymm15,ymm10,ymm11
1994 vpunpckhdq ymm8,ymm8,ymm9
1995 vpunpckhdq ymm10,ymm10,ymm11
1996 vpunpcklqdq ymm9,ymm14,ymm15
1997 vpunpckhqdq ymm14,ymm14,ymm15
1998 vpunpcklqdq ymm11,ymm8,ymm10
1999 vpunpckhqdq ymm8,ymm8,ymm10
; Row 1:
2000 vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx]
2001 vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx]
2002 vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx]
2003 vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx]
2004
2005 vpunpckldq ymm10,ymm0,ymm1
2006 vpunpckldq ymm15,ymm2,ymm3
2007 vpunpckhdq ymm0,ymm0,ymm1
2008 vpunpckhdq ymm2,ymm2,ymm3
2009 vpunpcklqdq ymm1,ymm10,ymm15
2010 vpunpckhqdq ymm10,ymm10,ymm15
2011 vpunpcklqdq ymm3,ymm0,ymm2
2012 vpunpckhqdq ymm0,ymm0,ymm2
; Combine 128-bit halves of rows 0 and 1: imm 0x20 pairs the low halves,
; 0x31 pairs the high halves, yielding the first 32 bytes of each block.
2013 vperm2i128 ymm15,ymm9,ymm1,0x20
2014 vperm2i128 ymm1,ymm9,ymm1,0x31
2015 vperm2i128 ymm9,ymm14,ymm10,0x20
2016 vperm2i128 ymm10,ymm14,ymm10,0x31
2017 vperm2i128 ymm14,ymm11,ymm3,0x20
2018 vperm2i128 ymm3,ymm11,ymm3,0x31
2019 vperm2i128 ymm11,ymm8,ymm0,0x20
2020 vperm2i128 ymm0,ymm8,ymm0,0x31
; Out of registers: park two finished results and fetch the spilled c2/c3.
2021 vmovdqa YMMWORD[rsp],ymm15
2022 vmovdqa YMMWORD[32+rsp],ymm9
2023 vmovdqa ymm15,YMMWORD[64+rsp]
2024 vmovdqa ymm9,YMMWORD[96+rsp]
2025
; Row 2 feed-forward and transpose:
2026 vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax]
2027 vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax]
2028 vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax]
2029 vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax]
2030
2031 vpunpckldq ymm2,ymm12,ymm13
2032 vpunpckldq ymm8,ymm15,ymm9
2033 vpunpckhdq ymm12,ymm12,ymm13
2034 vpunpckhdq ymm15,ymm15,ymm9
2035 vpunpcklqdq ymm13,ymm2,ymm8
2036 vpunpckhqdq ymm2,ymm2,ymm8
2037 vpunpcklqdq ymm9,ymm12,ymm15
2038 vpunpckhqdq ymm12,ymm12,ymm15
; Row 3 feed-forward (includes this pass's counter words) and transpose:
2039 vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax]
2040 vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax]
2041 vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax]
2042 vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax]
2043
2044 vpunpckldq ymm15,ymm4,ymm5
2045 vpunpckldq ymm8,ymm6,ymm7
2046 vpunpckhdq ymm4,ymm4,ymm5
2047 vpunpckhdq ymm6,ymm6,ymm7
2048 vpunpcklqdq ymm5,ymm15,ymm8
2049 vpunpckhqdq ymm15,ymm15,ymm8
2050 vpunpcklqdq ymm7,ymm4,ymm6
2051 vpunpckhqdq ymm4,ymm4,ymm6
; Combine halves of rows 2 and 3: the second 32 bytes of each block.
2052 vperm2i128 ymm8,ymm13,ymm5,0x20
2053 vperm2i128 ymm5,ymm13,ymm5,0x31
2054 vperm2i128 ymm13,ymm2,ymm15,0x20
2055 vperm2i128 ymm15,ymm2,ymm15,0x31
2056 vperm2i128 ymm2,ymm9,ymm7,0x20
2057 vperm2i128 ymm7,ymm9,ymm7,0x31
2058 vperm2i128 ymm9,ymm12,ymm4,0x20
2059 vperm2i128 ymm4,ymm12,ymm4,0x31
; Recover the two parked results. Keystream register order for the 512 bytes
; (as consumed by the XOR sequences below) is now:
; ymm6,ymm8, ymm1,ymm5, ymm12,ymm13, ymm10,ymm15,
; ymm14,ymm2, ymm3,ymm7, ymm11,ymm9, ymm0,ymm4.
2060 vmovdqa ymm6,YMMWORD[rsp]
2061 vmovdqa ymm12,YMMWORD[32+rsp]
2062
; Fewer than 512 bytes left -> partial handling in $L$tail8x.
2063 cmp rdx,64*8
2064 jb NEAR $L$tail8x
2065
; Full 512-byte chunk: out = in XOR keystream, 128 bytes at a time.
2066 vpxor ymm6,ymm6,YMMWORD[rsi]
2067 vpxor ymm8,ymm8,YMMWORD[32+rsi]
2068 vpxor ymm1,ymm1,YMMWORD[64+rsi]
2069 vpxor ymm5,ymm5,YMMWORD[96+rsi]
2070 lea rsi,[128+rsi]
2071 vmovdqu YMMWORD[rdi],ymm6
2072 vmovdqu YMMWORD[32+rdi],ymm8
2073 vmovdqu YMMWORD[64+rdi],ymm1
2074 vmovdqu YMMWORD[96+rdi],ymm5
2075 lea rdi,[128+rdi]
2076
2077 vpxor ymm12,ymm12,YMMWORD[rsi]
2078 vpxor ymm13,ymm13,YMMWORD[32+rsi]
2079 vpxor ymm10,ymm10,YMMWORD[64+rsi]
2080 vpxor ymm15,ymm15,YMMWORD[96+rsi]
2081 lea rsi,[128+rsi]
2082 vmovdqu YMMWORD[rdi],ymm12
2083 vmovdqu YMMWORD[32+rdi],ymm13
2084 vmovdqu YMMWORD[64+rdi],ymm10
2085 vmovdqu YMMWORD[96+rdi],ymm15
2086 lea rdi,[128+rdi]
2087
2088 vpxor ymm14,ymm14,YMMWORD[rsi]
2089 vpxor ymm2,ymm2,YMMWORD[32+rsi]
2090 vpxor ymm3,ymm3,YMMWORD[64+rsi]
2091 vpxor ymm7,ymm7,YMMWORD[96+rsi]
2092 lea rsi,[128+rsi]
2093 vmovdqu YMMWORD[rdi],ymm14
2094 vmovdqu YMMWORD[32+rdi],ymm2
2095 vmovdqu YMMWORD[64+rdi],ymm3
2096 vmovdqu YMMWORD[96+rdi],ymm7
2097 lea rdi,[128+rdi]
2098
2099 vpxor ymm11,ymm11,YMMWORD[rsi]
2100 vpxor ymm9,ymm9,YMMWORD[32+rsi]
2101 vpxor ymm0,ymm0,YMMWORD[64+rsi]
2102 vpxor ymm4,ymm4,YMMWORD[96+rsi]
2103 lea rsi,[128+rsi]
2104 vmovdqu YMMWORD[rdi],ymm11
2105 vmovdqu YMMWORD[32+rdi],ymm9
2106 vmovdqu YMMWORD[64+rdi],ymm0
2107 vmovdqu YMMWORD[96+rdi],ymm4
2108 lea rdi,[128+rdi]
2109
2110 sub rdx,64*8
2111 jnz NEAR $L$oop_outer8x
2112
2113 jmp NEAR $L$done8x
2114
; Tail dispatch (rdx < 512): pick the handler for the number of whole
; 64-byte blocks that remain. Each handler XORs those whole blocks, then
; stages the next block's 64 keystream bytes at [rsp] for the byte loop.
2115$L$tail8x:
2116 cmp rdx,448
2117 jae NEAR $L$448_or_more8x
2118 cmp rdx,384
2119 jae NEAR $L$384_or_more8x
2120 cmp rdx,320
2121 jae NEAR $L$320_or_more8x
2122 cmp rdx,256
2123 jae NEAR $L$256_or_more8x
2124 cmp rdx,192
2125 jae NEAR $L$192_or_more8x
2126 cmp rdx,128
2127 jae NEAR $L$128_or_more8x
2128 cmp rdx,64
2129 jae NEAR $L$64_or_more8x
2130
; Less than one block left: stage block 0 keystream; r10 = byte index 0.
2131 xor r10,r10
2132 vmovdqa YMMWORD[rsp],ymm6
2133 vmovdqa YMMWORD[32+rsp],ymm8
2134 jmp NEAR $L$oop_tail8x
2135
2136ALIGN 32
; In every *_or_more8x handler the flags of the cmp that selected it are still
; live (the vector ops and lea between do not modify flags), so the "je" tests
; whether rdx equals the threshold exactly, i.e. nothing is left afterwards.
2137$L$64_or_more8x:
2138 vpxor ymm6,ymm6,YMMWORD[rsi]
2139 vpxor ymm8,ymm8,YMMWORD[32+rsi]
2140 vmovdqu YMMWORD[rdi],ymm6
2141 vmovdqu YMMWORD[32+rdi],ymm8
2142 je NEAR $L$done8x
2143
2144 lea rsi,[64+rsi]
2145 xor r10,r10
2146 vmovdqa YMMWORD[rsp],ymm1
2147 lea rdi,[64+rdi]
2148 sub rdx,64
2149 vmovdqa YMMWORD[32+rsp],ymm5
2150 jmp NEAR $L$oop_tail8x
2151
2152ALIGN 32
2153$L$128_or_more8x:
2154 vpxor ymm6,ymm6,YMMWORD[rsi]
2155 vpxor ymm8,ymm8,YMMWORD[32+rsi]
2156 vpxor ymm1,ymm1,YMMWORD[64+rsi]
2157 vpxor ymm5,ymm5,YMMWORD[96+rsi]
2158 vmovdqu YMMWORD[rdi],ymm6
2159 vmovdqu YMMWORD[32+rdi],ymm8
2160 vmovdqu YMMWORD[64+rdi],ymm1
2161 vmovdqu YMMWORD[96+rdi],ymm5
2162 je NEAR $L$done8x
2163
2164 lea rsi,[128+rsi]
2165 xor r10,r10
2166 vmovdqa YMMWORD[rsp],ymm12
2167 lea rdi,[128+rdi]
2168 sub rdx,128
2169 vmovdqa YMMWORD[32+rsp],ymm13
2170 jmp NEAR $L$oop_tail8x
2171
2172ALIGN 32
2173$L$192_or_more8x:
2174 vpxor ymm6,ymm6,YMMWORD[rsi]
2175 vpxor ymm8,ymm8,YMMWORD[32+rsi]
2176 vpxor ymm1,ymm1,YMMWORD[64+rsi]
2177 vpxor ymm5,ymm5,YMMWORD[96+rsi]
2178 vpxor ymm12,ymm12,YMMWORD[128+rsi]
2179 vpxor ymm13,ymm13,YMMWORD[160+rsi]
2180 vmovdqu YMMWORD[rdi],ymm6
2181 vmovdqu YMMWORD[32+rdi],ymm8
2182 vmovdqu YMMWORD[64+rdi],ymm1
2183 vmovdqu YMMWORD[96+rdi],ymm5
2184 vmovdqu YMMWORD[128+rdi],ymm12
2185 vmovdqu YMMWORD[160+rdi],ymm13
2186 je NEAR $L$done8x
2187
2188 lea rsi,[192+rsi]
2189 xor r10,r10
2190 vmovdqa YMMWORD[rsp],ymm10
2191 lea rdi,[192+rdi]
2192 sub rdx,192
2193 vmovdqa YMMWORD[32+rsp],ymm15
2194 jmp NEAR $L$oop_tail8x
2195
2196ALIGN 32
2197$L$256_or_more8x:
2198 vpxor ymm6,ymm6,YMMWORD[rsi]
2199 vpxor ymm8,ymm8,YMMWORD[32+rsi]
2200 vpxor ymm1,ymm1,YMMWORD[64+rsi]
2201 vpxor ymm5,ymm5,YMMWORD[96+rsi]
2202 vpxor ymm12,ymm12,YMMWORD[128+rsi]
2203 vpxor ymm13,ymm13,YMMWORD[160+rsi]
2204 vpxor ymm10,ymm10,YMMWORD[192+rsi]
2205 vpxor ymm15,ymm15,YMMWORD[224+rsi]
2206 vmovdqu YMMWORD[rdi],ymm6
2207 vmovdqu YMMWORD[32+rdi],ymm8
2208 vmovdqu YMMWORD[64+rdi],ymm1
2209 vmovdqu YMMWORD[96+rdi],ymm5
2210 vmovdqu YMMWORD[128+rdi],ymm12
2211 vmovdqu YMMWORD[160+rdi],ymm13
2212 vmovdqu YMMWORD[192+rdi],ymm10
2213 vmovdqu YMMWORD[224+rdi],ymm15
2214 je NEAR $L$done8x
2215
2216 lea rsi,[256+rsi]
2217 xor r10,r10
2218 vmovdqa YMMWORD[rsp],ymm14
2219 lea rdi,[256+rdi]
2220 sub rdx,256
2221 vmovdqa YMMWORD[32+rsp],ymm2
2222 jmp NEAR $L$oop_tail8x
2223
2224ALIGN 32
2225$L$320_or_more8x:
2226 vpxor ymm6,ymm6,YMMWORD[rsi]
2227 vpxor ymm8,ymm8,YMMWORD[32+rsi]
2228 vpxor ymm1,ymm1,YMMWORD[64+rsi]
2229 vpxor ymm5,ymm5,YMMWORD[96+rsi]
2230 vpxor ymm12,ymm12,YMMWORD[128+rsi]
2231 vpxor ymm13,ymm13,YMMWORD[160+rsi]
2232 vpxor ymm10,ymm10,YMMWORD[192+rsi]
2233 vpxor ymm15,ymm15,YMMWORD[224+rsi]
2234 vpxor ymm14,ymm14,YMMWORD[256+rsi]
2235 vpxor ymm2,ymm2,YMMWORD[288+rsi]
2236 vmovdqu YMMWORD[rdi],ymm6
2237 vmovdqu YMMWORD[32+rdi],ymm8
2238 vmovdqu YMMWORD[64+rdi],ymm1
2239 vmovdqu YMMWORD[96+rdi],ymm5
2240 vmovdqu YMMWORD[128+rdi],ymm12
2241 vmovdqu YMMWORD[160+rdi],ymm13
2242 vmovdqu YMMWORD[192+rdi],ymm10
2243 vmovdqu YMMWORD[224+rdi],ymm15
2244 vmovdqu YMMWORD[256+rdi],ymm14
2245 vmovdqu YMMWORD[288+rdi],ymm2
2246 je NEAR $L$done8x
2247
2248 lea rsi,[320+rsi]
2249 xor r10,r10
2250 vmovdqa YMMWORD[rsp],ymm3
2251 lea rdi,[320+rdi]
2252 sub rdx,320
2253 vmovdqa YMMWORD[32+rsp],ymm7
2254 jmp NEAR $L$oop_tail8x
2255
2256ALIGN 32
2257$L$384_or_more8x:
2258 vpxor ymm6,ymm6,YMMWORD[rsi]
2259 vpxor ymm8,ymm8,YMMWORD[32+rsi]
2260 vpxor ymm1,ymm1,YMMWORD[64+rsi]
2261 vpxor ymm5,ymm5,YMMWORD[96+rsi]
2262 vpxor ymm12,ymm12,YMMWORD[128+rsi]
2263 vpxor ymm13,ymm13,YMMWORD[160+rsi]
2264 vpxor ymm10,ymm10,YMMWORD[192+rsi]
2265 vpxor ymm15,ymm15,YMMWORD[224+rsi]
2266 vpxor ymm14,ymm14,YMMWORD[256+rsi]
2267 vpxor ymm2,ymm2,YMMWORD[288+rsi]
2268 vpxor ymm3,ymm3,YMMWORD[320+rsi]
2269 vpxor ymm7,ymm7,YMMWORD[352+rsi]
2270 vmovdqu YMMWORD[rdi],ymm6
2271 vmovdqu YMMWORD[32+rdi],ymm8
2272 vmovdqu YMMWORD[64+rdi],ymm1
2273 vmovdqu YMMWORD[96+rdi],ymm5
2274 vmovdqu YMMWORD[128+rdi],ymm12
2275 vmovdqu YMMWORD[160+rdi],ymm13
2276 vmovdqu YMMWORD[192+rdi],ymm10
2277 vmovdqu YMMWORD[224+rdi],ymm15
2278 vmovdqu YMMWORD[256+rdi],ymm14
2279 vmovdqu YMMWORD[288+rdi],ymm2
2280 vmovdqu YMMWORD[320+rdi],ymm3
2281 vmovdqu YMMWORD[352+rdi],ymm7
2282 je NEAR $L$done8x
2283
2284 lea rsi,[384+rsi]
2285 xor r10,r10
2286 vmovdqa YMMWORD[rsp],ymm11
2287 lea rdi,[384+rdi]
2288 sub rdx,384
2289 vmovdqa YMMWORD[32+rsp],ymm9
2290 jmp NEAR $L$oop_tail8x
2291
2292ALIGN 32
2293$L$448_or_more8x:
2294 vpxor ymm6,ymm6,YMMWORD[rsi]
2295 vpxor ymm8,ymm8,YMMWORD[32+rsi]
2296 vpxor ymm1,ymm1,YMMWORD[64+rsi]
2297 vpxor ymm5,ymm5,YMMWORD[96+rsi]
2298 vpxor ymm12,ymm12,YMMWORD[128+rsi]
2299 vpxor ymm13,ymm13,YMMWORD[160+rsi]
2300 vpxor ymm10,ymm10,YMMWORD[192+rsi]
2301 vpxor ymm15,ymm15,YMMWORD[224+rsi]
2302 vpxor ymm14,ymm14,YMMWORD[256+rsi]
2303 vpxor ymm2,ymm2,YMMWORD[288+rsi]
2304 vpxor ymm3,ymm3,YMMWORD[320+rsi]
2305 vpxor ymm7,ymm7,YMMWORD[352+rsi]
2306 vpxor ymm11,ymm11,YMMWORD[384+rsi]
2307 vpxor ymm9,ymm9,YMMWORD[416+rsi]
2308 vmovdqu YMMWORD[rdi],ymm6
2309 vmovdqu YMMWORD[32+rdi],ymm8
2310 vmovdqu YMMWORD[64+rdi],ymm1
2311 vmovdqu YMMWORD[96+rdi],ymm5
2312 vmovdqu YMMWORD[128+rdi],ymm12
2313 vmovdqu YMMWORD[160+rdi],ymm13
2314 vmovdqu YMMWORD[192+rdi],ymm10
2315 vmovdqu YMMWORD[224+rdi],ymm15
2316 vmovdqu YMMWORD[256+rdi],ymm14
2317 vmovdqu YMMWORD[288+rdi],ymm2
2318 vmovdqu YMMWORD[320+rdi],ymm3
2319 vmovdqu YMMWORD[352+rdi],ymm7
2320 vmovdqu YMMWORD[384+rdi],ymm11
2321 vmovdqu YMMWORD[416+rdi],ymm9
2322 je NEAR $L$done8x
2323
2324 lea rsi,[448+rsi]
2325 xor r10,r10
2326 vmovdqa YMMWORD[rsp],ymm0
2327 lea rdi,[448+rdi]
2328 sub rdx,448
2329 vmovdqa YMMWORD[32+rsp],ymm4
2330
; Byte-wise tail: out[i] = in[i] XOR keystream[i] for the final rdx (< 64)
; bytes; keystream is read from the 64 bytes staged at [rsp], r10 is the index.
; eax/ecx are scratch here (rcx is no longer needed as a frame base).
2331$L$oop_tail8x:
2332 movzx eax,BYTE[r10*1+rsi]
2333 movzx ecx,BYTE[r10*1+rsp]
2334 lea r10,[1+r10]
2335 xor eax,ecx
2336 mov BYTE[((-1))+r10*1+rdi],al
2337 dec rdx
2338 jnz NEAR $L$oop_tail8x
2339
; Common exit: clear every ymm register, restore the callee-saved xmm6-xmm15
; from the save area below r9, and drop the frame by reloading rsp from r9.
2340$L$done8x:
2341 vzeroall
2342 movaps xmm6,XMMWORD[((-168))+r9]
2343 movaps xmm7,XMMWORD[((-152))+r9]
2344 movaps xmm8,XMMWORD[((-136))+r9]
2345 movaps xmm9,XMMWORD[((-120))+r9]
2346 movaps xmm10,XMMWORD[((-104))+r9]
2347 movaps xmm11,XMMWORD[((-88))+r9]
2348 movaps xmm12,XMMWORD[((-72))+r9]
2349 movaps xmm13,XMMWORD[((-56))+r9]
2350 movaps xmm14,XMMWORD[((-40))+r9]
2351 movaps xmm15,XMMWORD[((-24))+r9]
2352 lea rsp,[r9]
2353
2354$L$8x_epilogue:
2355 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2356 mov rsi,QWORD[16+rsp]
2357 DB 0F3h,0C3h ;repret
2358
2359$L$SEH_end_ChaCha20_8x:
2360
2361ALIGN 32
;-----------------------------------------------------------------------
; ChaCha20_avx512 -- AVX-512 code path for inputs of at most 512 bytes
; (longer inputs branch to $L$ChaCha20_16x, defined elsewhere in the file).
; Four blocks are computed per pass, one per 128-bit lane of zmm0-zmm3,
; each lane holding one row of a conventional 4x4 state.
;
; ABI:  Microsoft x64 on entry; the prologue remaps rcx, rdx, r8, r9,
;       [40+rsp] to rdi, rsi, rdx, rcx, r8. ChaCha20_ctr32 also jumps to
;       the inner label $L$ChaCha20_avx512 after doing the same remap.
; Register roles after the remap (as used by the code below):
;       rdi = output pointer, rsi = input pointer, rdx = bytes left,
;       rcx = pointer to 32 bytes loaded as state rows 1-2
;             (presumably the key -- confirm against caller),
;       r8  = pointer to 16 bytes loaded as state row 3; afterwards reused
;             as round counter and then as tail byte index,
;       r9  = saved entry rsp (anchor of the xmm6-xmm15 save area).
;       zmm16-zmm18 = pristine rows 0-2, zmm19 = row 3 used for the
;       current pass, zmm20 = per-pass counter increment ($L$fourz).
; Clobbers: rax, rcx, r8, flags, vector registers (vzeroall on exit;
;           xmm6-xmm15 are restored afterwards).
;-----------------------------------------------------------------------
2362ChaCha20_avx512:
2363 mov QWORD[8+rsp],rdi ;WIN64 prologue
2364 mov QWORD[16+rsp],rsi
2365 mov rax,rsp
2366$L$SEH_begin_ChaCha20_avx512:
2367 mov rdi,rcx
2368 mov rsi,rdx
2369 mov rdx,r8
2370 mov rcx,r9
2371 mov r8,QWORD[40+rsp]
2372
2373
2374
2375$L$ChaCha20_avx512:
2376 mov r9,rsp
2377
; More than 512 bytes: hand over to the 16x implementation.
2378 cmp rdx,512
2379 ja NEAR $L$ChaCha20_16x
2380
; 64 bytes of scratch at [rsp] (keystream staging for the tail) plus the
; save area for the callee-saved xmm6-xmm15, addressed relative to r9.
2381 sub rsp,64+168
2382 movaps XMMWORD[(-168)+r9],xmm6
2383 movaps XMMWORD[(-152)+r9],xmm7
2384 movaps XMMWORD[(-136)+r9],xmm8
2385 movaps XMMWORD[(-120)+r9],xmm9
2386 movaps XMMWORD[(-104)+r9],xmm10
2387 movaps XMMWORD[(-88)+r9],xmm11
2388 movaps XMMWORD[(-72)+r9],xmm12
2389 movaps XMMWORD[(-56)+r9],xmm13
2390 movaps XMMWORD[(-40)+r9],xmm14
2391 movaps XMMWORD[(-24)+r9],xmm15
2392$L$avx512_body:
; Replicate each 16-byte state row into all four 128-bit lanes.
2393 vbroadcasti32x4 zmm0,ZMMWORD[$L$sigma]
2394 vbroadcasti32x4 zmm1,ZMMWORD[rcx]
2395 vbroadcasti32x4 zmm2,ZMMWORD[16+rcx]
2396 vbroadcasti32x4 zmm3,ZMMWORD[r8]
2397
; Keep pristine copies for the feed-forward. $L$zeroz adds 0,1,2,3 to dword 0
; of the four lanes of row 3, so the lanes become four consecutive blocks.
2398 vmovdqa32 zmm16,zmm0
2399 vmovdqa32 zmm17,zmm1
2400 vmovdqa32 zmm18,zmm2
2401 vpaddd zmm3,zmm3,ZMMWORD[$L$zeroz]
2402 vmovdqa32 zmm20,ZMMWORD[$L$fourz]
; r8 (pointer already consumed) becomes the round counter: 10 iterations.
2403 mov r8,10
2404 vmovdqa32 zmm19,zmm3
2405 jmp NEAR $L$oop_avx512
2406
2407ALIGN 16
; Next pass of four blocks: restore rows 0-2, advance row 3's dword 0 by 4
; in every lane, and remember the new row 3 in zmm19.
2408$L$oop_outer_avx512:
2409 vmovdqa32 zmm0,zmm16
2410 vmovdqa32 zmm1,zmm17
2411 vmovdqa32 zmm2,zmm18
2412 vpaddd zmm3,zmm19,zmm20
2413 mov r8,10
2414 vmovdqa32 zmm19,zmm3
2415 jmp NEAR $L$oop_avx512
2416
2417ALIGN 32
; One iteration = column round + diagonal round; rows are a=zmm0, b=zmm1,
; c=zmm2, d=zmm3. vprold performs the 16/12/8/7-bit left rotates directly.
2418$L$oop_avx512:
2419 vpaddd zmm0,zmm0,zmm1
2420 vpxord zmm3,zmm3,zmm0
2421 vprold zmm3,zmm3,16
2422 vpaddd zmm2,zmm2,zmm3
2423 vpxord zmm1,zmm1,zmm2
2424 vprold zmm1,zmm1,12
2425 vpaddd zmm0,zmm0,zmm1
2426 vpxord zmm3,zmm3,zmm0
2427 vprold zmm3,zmm3,8
2428 vpaddd zmm2,zmm2,zmm3
2429 vpxord zmm1,zmm1,zmm2
2430 vprold zmm1,zmm1,7
; Rotate the dwords of rows c, b, d within each lane (by 2, 1, 3 positions)
; so the diagonals line up as columns for the second half.
2431 vpshufd zmm2,zmm2,78
2432 vpshufd zmm1,zmm1,57
2433 vpshufd zmm3,zmm3,147
2434 vpaddd zmm0,zmm0,zmm1
2435 vpxord zmm3,zmm3,zmm0
2436 vprold zmm3,zmm3,16
2437 vpaddd zmm2,zmm2,zmm3
2438 vpxord zmm1,zmm1,zmm2
2439 vprold zmm1,zmm1,12
2440 vpaddd zmm0,zmm0,zmm1
2441 vpxord zmm3,zmm3,zmm0
2442 vprold zmm3,zmm3,8
2443 vpaddd zmm2,zmm2,zmm3
2444 vpxord zmm1,zmm1,zmm2
2445 vprold zmm1,zmm1,7
; Undo the lane rotation (b and d use the inverse shuffles of the ones above).
2446 vpshufd zmm2,zmm2,78
2447 vpshufd zmm1,zmm1,147
2448 vpshufd zmm3,zmm3,57
2449 dec r8
2450 jnz NEAR $L$oop_avx512
; r8 == 0 from here on; the tail loop relies on that for its byte index.
; Feed-forward: add the input state of this pass.
2451 vpaddd zmm0,zmm0,zmm16
2452 vpaddd zmm1,zmm1,zmm17
2453 vpaddd zmm2,zmm2,zmm18
2454 vpaddd zmm3,zmm3,zmm19
2455
; Block 0 (lane 0 = the xmm views of zmm0-zmm3). The flags from each
; "sub rdx,64" stay live across the following vector ops and lea, so the
; later jz/jnz still test the result of that subtraction.
2456 sub rdx,64
2457 jb NEAR $L$tail64_avx512
2458
2459 vpxor xmm4,xmm0,XMMWORD[rsi]
2460 vpxor xmm5,xmm1,XMMWORD[16+rsi]
2461 vpxor xmm6,xmm2,XMMWORD[32+rsi]
2462 vpxor xmm7,xmm3,XMMWORD[48+rsi]
2463 lea rsi,[64+rsi]
2464
2465 vmovdqu XMMWORD[rdi],xmm4
2466 vmovdqu XMMWORD[16+rdi],xmm5
2467 vmovdqu XMMWORD[32+rdi],xmm6
2468 vmovdqu XMMWORD[48+rdi],xmm7
2469 lea rdi,[64+rdi]
2470
2471 jz NEAR $L$done_avx512
2472
; Block 1: pull lane 1 of each row into xmm4-xmm7.
2473 vextracti32x4 xmm4,zmm0,1
2474 vextracti32x4 xmm5,zmm1,1
2475 vextracti32x4 xmm6,zmm2,1
2476 vextracti32x4 xmm7,zmm3,1
2477
2478 sub rdx,64
2479 jb NEAR $L$tail_avx512
2480
2481 vpxor xmm4,xmm4,XMMWORD[rsi]
2482 vpxor xmm5,xmm5,XMMWORD[16+rsi]
2483 vpxor xmm6,xmm6,XMMWORD[32+rsi]
2484 vpxor xmm7,xmm7,XMMWORD[48+rsi]
2485 lea rsi,[64+rsi]
2486
2487 vmovdqu XMMWORD[rdi],xmm4
2488 vmovdqu XMMWORD[16+rdi],xmm5
2489 vmovdqu XMMWORD[32+rdi],xmm6
2490 vmovdqu XMMWORD[48+rdi],xmm7
2491 lea rdi,[64+rdi]
2492
2493 jz NEAR $L$done_avx512
2494
; Block 2: lane 2.
2495 vextracti32x4 xmm4,zmm0,2
2496 vextracti32x4 xmm5,zmm1,2
2497 vextracti32x4 xmm6,zmm2,2
2498 vextracti32x4 xmm7,zmm3,2
2499
2500 sub rdx,64
2501 jb NEAR $L$tail_avx512
2502
2503 vpxor xmm4,xmm4,XMMWORD[rsi]
2504 vpxor xmm5,xmm5,XMMWORD[16+rsi]
2505 vpxor xmm6,xmm6,XMMWORD[32+rsi]
2506 vpxor xmm7,xmm7,XMMWORD[48+rsi]
2507 lea rsi,[64+rsi]
2508
2509 vmovdqu XMMWORD[rdi],xmm4
2510 vmovdqu XMMWORD[16+rdi],xmm5
2511 vmovdqu XMMWORD[32+rdi],xmm6
2512 vmovdqu XMMWORD[48+rdi],xmm7
2513 lea rdi,[64+rdi]
2514
2515 jz NEAR $L$done_avx512
2516
; Block 3: lane 3.
2517 vextracti32x4 xmm4,zmm0,3
2518 vextracti32x4 xmm5,zmm1,3
2519 vextracti32x4 xmm6,zmm2,3
2520 vextracti32x4 xmm7,zmm3,3
2521
2522 sub rdx,64
2523 jb NEAR $L$tail_avx512
2524
2525 vpxor xmm4,xmm4,XMMWORD[rsi]
2526 vpxor xmm5,xmm5,XMMWORD[16+rsi]
2527 vpxor xmm6,xmm6,XMMWORD[32+rsi]
2528 vpxor xmm7,xmm7,XMMWORD[48+rsi]
2529 lea rsi,[64+rsi]
2530
2531 vmovdqu XMMWORD[rdi],xmm4
2532 vmovdqu XMMWORD[16+rdi],xmm5
2533 vmovdqu XMMWORD[32+rdi],xmm6
2534 vmovdqu XMMWORD[48+rdi],xmm7
2535 lea rdi,[64+rdi]
2536
; All four blocks consumed and data remains: compute the next four.
2537 jnz NEAR $L$oop_outer_avx512
2538
2539 jmp NEAR $L$done_avx512
2540
2541ALIGN 16
; Partial first block: stage lane 0 keystream (still in xmm0-xmm3) at [rsp].
; "add rdx,64" undoes the subtraction that went negative, restoring the
; remaining byte count.
2542$L$tail64_avx512:
2543 vmovdqa XMMWORD[rsp],xmm0
2544 vmovdqa XMMWORD[16+rsp],xmm1
2545 vmovdqa XMMWORD[32+rsp],xmm2
2546 vmovdqa XMMWORD[48+rsp],xmm3
2547 add rdx,64
2548 jmp NEAR $L$oop_tail_avx512
2549
2550ALIGN 16
; Partial later block: its keystream was already extracted into xmm4-xmm7.
2551$L$tail_avx512:
2552 vmovdqa XMMWORD[rsp],xmm4
2553 vmovdqa XMMWORD[16+rsp],xmm5
2554 vmovdqa XMMWORD[32+rsp],xmm6
2555 vmovdqa XMMWORD[48+rsp],xmm7
2556 add rdx,64
2557
; Byte-wise tail: out[i] = in[i] XOR keystream[i] for the last rdx bytes.
; r8 is the index; it is not initialised here because the round loop above
; always leaves it at zero. eax/ecx are scratch.
2558$L$oop_tail_avx512:
2559 movzx eax,BYTE[r8*1+rsi]
2560 movzx ecx,BYTE[r8*1+rsp]
2561 lea r8,[1+r8]
2562 xor eax,ecx
2563 mov BYTE[((-1))+r8*1+rdi],al
2564 dec rdx
2565 jnz NEAR $L$oop_tail_avx512
2566
; Overwrite the 64-byte staging area with zmm16 (the broadcast $L$sigma
; constants) -- presumably so no keystream bytes are left on the stack.
2567 vmovdqu32 ZMMWORD[rsp],zmm16
2568
; Common exit: clear the vector registers, restore callee-saved xmm6-xmm15
; from the save area below r9, and release the frame via r9.
2569$L$done_avx512:
2570 vzeroall
2571 movaps xmm6,XMMWORD[((-168))+r9]
2572 movaps xmm7,XMMWORD[((-152))+r9]
2573 movaps xmm8,XMMWORD[((-136))+r9]
2574 movaps xmm9,XMMWORD[((-120))+r9]
2575 movaps xmm10,XMMWORD[((-104))+r9]
2576 movaps xmm11,XMMWORD[((-88))+r9]
2577 movaps xmm12,XMMWORD[((-72))+r9]
2578 movaps xmm13,XMMWORD[((-56))+r9]
2579 movaps xmm14,XMMWORD[((-40))+r9]
2580 movaps xmm15,XMMWORD[((-24))+r9]
2581 lea rsp,[r9]
2582
2583$L$avx512_epilogue:
2584 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2585 mov rsi,QWORD[16+rsp]
2586 DB 0F3h,0C3h ;repret
2587
2588$L$SEH_end_ChaCha20_avx512:
2589
2590ALIGN 32
;-----------------------------------------------------------------------
; ChaCha20_avx512vl -- 256-bit (ymm) ChaCha20 path handling two 64-byte
; blocks per outer iteration, one per 128-bit lane.
;
; Two entry points:
;   ChaCha20_avx512vl      Win64 entry: saves rdi/rsi in the caller's home
;                          slots and remaps rcx,rdx,r8,r9,[40+rsp] into
;                          rdi,rsi,rdx,rcx,r8.
;   $L$ChaCha20_avx512vl   internal entry with registers already remapped
;                          (ChaCha20_ctr32 branches here).
;
; Register roles after the remap, as used below:
;   rdi = output pointer (vmovdqu stores), rsi = input pointer (vpxor loads)
;   rdx = remaining byte count
;   rcx = pointer to 32 bytes loaded into state rows 1 and 2
;         (presumably the key -- confirm against the C caller)
;   r8  = pointer to 16 bytes loaded into state row 3 (counter block);
;         afterwards reused as the round-loop counter / tail byte index
;   r9  = original rsp (frame pointer for the xmm6-xmm15 save area)
; Inputs longer than 128 bytes are forwarded to $L$ChaCha20_8xvl.
;-----------------------------------------------------------------------
2591ChaCha20_avx512vl:
2592 mov QWORD[8+rsp],rdi ;WIN64 prologue
2593 mov QWORD[16+rsp],rsi
2594 mov rax,rsp
2595$L$SEH_begin_ChaCha20_avx512vl:
2596 mov rdi,rcx
2597 mov rsi,rdx
2598 mov rdx,r8
2599 mov rcx,r9
2600 mov r8,QWORD[40+rsp]
2601
2602
2603
2604$L$ChaCha20_avx512vl:
; r9 keeps the entry rsp; the epilogue restores rsp from it.
2605 mov r9,rsp
2606
; More than 128 bytes: use the 8-blocks-at-a-time routine instead.
2607 cmp rdx,128
2608 ja NEAR $L$ChaCha20_8xvl
2609
; Frame: 64 bytes of scratch at [rsp] for a tail keystream block, plus
; 168 bytes below r9 holding xmm6-xmm15 (callee-saved on Win64).
2610 sub rsp,64+168
2611 movaps XMMWORD[(-168)+r9],xmm6
2612 movaps XMMWORD[(-152)+r9],xmm7
2613 movaps XMMWORD[(-136)+r9],xmm8
2614 movaps XMMWORD[(-120)+r9],xmm9
2615 movaps XMMWORD[(-104)+r9],xmm10
2616 movaps XMMWORD[(-88)+r9],xmm11
2617 movaps XMMWORD[(-72)+r9],xmm12
2618 movaps XMMWORD[(-56)+r9],xmm13
2619 movaps XMMWORD[(-40)+r9],xmm14
2620 movaps XMMWORD[(-24)+r9],xmm15
2621$L$avx512vl_body:
; Load the four 16-byte state rows, duplicated into both 128-bit lanes:
; ymm0 = $L$sigma constants, ymm1/ymm2 = [rcx]/[16+rcx], ymm3 = [r8].
2622 vbroadcasti128 ymm0,XMMWORD[$L$sigma]
2623 vbroadcasti128 ymm1,XMMWORD[rcx]
2624 vbroadcasti128 ymm2,XMMWORD[16+rcx]
2625 vbroadcasti128 ymm3,XMMWORD[r8]
2626
; ymm16-ymm19 keep the pre-round state for the final feed-forward add.
2627 vmovdqa32 ymm16,ymm0
2628 vmovdqa32 ymm17,ymm1
2629 vmovdqa32 ymm18,ymm2
; First 8 dwords of $L$zeroz are 0,0,0,0,1,0,0,0: the high lane gets
; counter+1 so the two lanes produce consecutive blocks.
2630 vpaddd ymm3,ymm3,YMMWORD[$L$zeroz]
; $L$twoy = 2,0,0,0,2,0,0,0: per-outer-iteration counter step.
2631 vmovdqa32 ymm20,YMMWORD[$L$twoy]
; r8 is free now (row 3 already loaded): 10 iterations of the double round.
2632 mov r8,10
2633 vmovdqa32 ymm19,ymm3
2634 jmp NEAR $L$oop_avx512vl
2635
2636ALIGN 16
; Outer loop: entered with ymm0/ymm1 already reset (done before the jnz
; that branches here); reset row 2, advance both counters by 2.
2637$L$oop_outer_avx512vl:
2638 vmovdqa32 ymm2,ymm18
2639 vpaddd ymm3,ymm19,ymm20
2640 mov r8,10
2641 vmovdqa32 ymm19,ymm3
2642 jmp NEAR $L$oop_avx512vl
2643
2644ALIGN 32
; One iteration = two quarter-round passes over the rows
; (a=ymm0, b=ymm1, c=ymm2, d=ymm3), rotations 16/12/8/7.
2645$L$oop_avx512vl:
2646 vpaddd ymm0,ymm0,ymm1
2647 vpxor ymm3,ymm3,ymm0
2648 vprold ymm3,ymm3,16
2649 vpaddd ymm2,ymm2,ymm3
2650 vpxor ymm1,ymm1,ymm2
2651 vprold ymm1,ymm1,12
2652 vpaddd ymm0,ymm0,ymm1
2653 vpxor ymm3,ymm3,ymm0
2654 vprold ymm3,ymm3,8
2655 vpaddd ymm2,ymm2,ymm3
2656 vpxor ymm1,ymm1,ymm2
2657 vprold ymm1,ymm1,7
; Rotate the dwords of rows b/c/d inside each 128-bit lane (by 1/2/3
; positions) so the second pass operates on the diagonals.
2658 vpshufd ymm2,ymm2,78
2659 vpshufd ymm1,ymm1,57
2660 vpshufd ymm3,ymm3,147
2661 vpaddd ymm0,ymm0,ymm1
2662 vpxor ymm3,ymm3,ymm0
2663 vprold ymm3,ymm3,16
2664 vpaddd ymm2,ymm2,ymm3
2665 vpxor ymm1,ymm1,ymm2
2666 vprold ymm1,ymm1,12
2667 vpaddd ymm0,ymm0,ymm1
2668 vpxor ymm3,ymm3,ymm0
2669 vprold ymm3,ymm3,8
2670 vpaddd ymm2,ymm2,ymm3
2671 vpxor ymm1,ymm1,ymm2
2672 vprold ymm1,ymm1,7
; Inverse dword rotation: rows return to column order.
2673 vpshufd ymm2,ymm2,78
2674 vpshufd ymm1,ymm1,147
2675 vpshufd ymm3,ymm3,57
2676 dec r8
2677 jnz NEAR $L$oop_avx512vl
; Feed-forward: add the saved input state. r8 is 0 from here on and is
; used as the byte index in the tail loop.
2678 vpaddd ymm0,ymm0,ymm16
2679 vpaddd ymm1,ymm1,ymm17
2680 vpaddd ymm2,ymm2,ymm18
2681 vpaddd ymm3,ymm3,ymm19
2682
; First block (low lanes). Flags from this sub survive the lea/vector
; instructions below and drive the jz.
2683 sub rdx,64
2684 jb NEAR $L$tail64_avx512vl
2685
2686 vpxor xmm4,xmm0,XMMWORD[rsi]
2687 vpxor xmm5,xmm1,XMMWORD[16+rsi]
2688 vpxor xmm6,xmm2,XMMWORD[32+rsi]
2689 vpxor xmm7,xmm3,XMMWORD[48+rsi]
2690 lea rsi,[64+rsi]
2691
2692 vmovdqu XMMWORD[rdi],xmm4
2693 vmovdqu XMMWORD[16+rdi],xmm5
2694 vmovdqu XMMWORD[32+rdi],xmm6
2695 vmovdqu XMMWORD[48+rdi],xmm7
2696 lea rdi,[64+rdi]
2697
; Exactly 64 bytes were left: finished.
2698 jz NEAR $L$done_avx512vl
2699
; Second block: keystream is in the high 128-bit lanes.
2700 vextracti128 xmm4,ymm0,1
2701 vextracti128 xmm5,ymm1,1
2702 vextracti128 xmm6,ymm2,1
2703 vextracti128 xmm7,ymm3,1
2704
2705 sub rdx,64
2706 jb NEAR $L$tail_avx512vl
2707
2708 vpxor xmm4,xmm4,XMMWORD[rsi]
2709 vpxor xmm5,xmm5,XMMWORD[16+rsi]
2710 vpxor xmm6,xmm6,XMMWORD[32+rsi]
2711 vpxor xmm7,xmm7,XMMWORD[48+rsi]
2712 lea rsi,[64+rsi]
2713
2714 vmovdqu XMMWORD[rdi],xmm4
2715 vmovdqu XMMWORD[16+rdi],xmm5
2716 vmovdqu XMMWORD[32+rdi],xmm6
2717 vmovdqu XMMWORD[48+rdi],xmm7
2718 lea rdi,[64+rdi]
2719
; Reset rows 0/1 for the next outer iteration; these moves do not touch
; the flags set by the sub above, which the jnz still tests.
2720 vmovdqa32 ymm0,ymm16
2721 vmovdqa32 ymm1,ymm17
2722 jnz NEAR $L$oop_outer_avx512vl
2723
2724 jmp NEAR $L$done_avx512vl
2725
2726ALIGN 16
; Partial first block: spill the low-lane keystream to the stack scratch
; and undo the subtraction so rdx is the remaining byte count again.
2727$L$tail64_avx512vl:
2728 vmovdqa XMMWORD[rsp],xmm0
2729 vmovdqa XMMWORD[16+rsp],xmm1
2730 vmovdqa XMMWORD[32+rsp],xmm2
2731 vmovdqa XMMWORD[48+rsp],xmm3
2732 add rdx,64
2733 jmp NEAR $L$oop_tail_avx512vl
2734
2735ALIGN 16
; Partial second block: same, with the high-lane keystream in xmm4-xmm7.
2736$L$tail_avx512vl:
2737 vmovdqa XMMWORD[rsp],xmm4
2738 vmovdqa XMMWORD[16+rsp],xmm5
2739 vmovdqa XMMWORD[32+rsp],xmm6
2740 vmovdqa XMMWORD[48+rsp],xmm7
2741 add rdx,64
2742
; Byte-wise XOR of the remaining rdx bytes; r8 (0 on entry) is the index.
2743$L$oop_tail_avx512vl:
2744 movzx eax,BYTE[r8*1+rsi]
2745 movzx ecx,BYTE[r8*1+rsp]
2746 lea r8,[1+r8]
2747 xor eax,ecx
2748 mov BYTE[((-1))+r8*1+rdi],al
2749 dec rdx
2750 jnz NEAR $L$oop_tail_avx512vl
2751
; Overwrite the 64-byte stack copy of the keystream with ymm16 (the saved
; constant row) so no keystream is left on the stack.
2752 vmovdqu32 YMMWORD[rsp],ymm16
2753 vmovdqu32 YMMWORD[32+rsp],ymm16
2754
2755$L$done_avx512vl:
; Clear the vector registers, then restore the Win64 callee-saved
; xmm6-xmm15 and the original stack pointer.
2756 vzeroall
2757 movaps xmm6,XMMWORD[((-168))+r9]
2758 movaps xmm7,XMMWORD[((-152))+r9]
2759 movaps xmm8,XMMWORD[((-136))+r9]
2760 movaps xmm9,XMMWORD[((-120))+r9]
2761 movaps xmm10,XMMWORD[((-104))+r9]
2762 movaps xmm11,XMMWORD[((-88))+r9]
2763 movaps xmm12,XMMWORD[((-72))+r9]
2764 movaps xmm13,XMMWORD[((-56))+r9]
2765 movaps xmm14,XMMWORD[((-40))+r9]
2766 movaps xmm15,XMMWORD[((-24))+r9]
2767 lea rsp,[r9]
2768
2769$L$avx512vl_epilogue:
2770 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2771 mov rsi,QWORD[16+rsp]
2772 DB 0F3h,0C3h ;repret
2773
2774$L$SEH_end_ChaCha20_avx512vl:
2775
2776ALIGN 32
;-----------------------------------------------------------------------
; ChaCha20_16x -- 512-bit (zmm) ChaCha20 path computing 16 blocks in
; parallel; this part is the entry sequence and state setup.
;
; ChaCha20_16x is the Win64 entry (saves rdi/rsi in the home slots and
; remaps rcx,rdx,r8,r9,[40+rsp] to rdi,rsi,rdx,rcx,r8);
; $L$ChaCha20_16x is the internal entry with registers already remapped.
;
; Layout built here: each of zmm0-zmm15 holds ONE 32-bit state word
; replicated across all 16 dword lanes, so lane i carries block i.
;   zmm0-3   <- words of $L$sigma       zmm4-7   <- words of [rcx]
;   zmm8-11  <- words of [16+rcx]       zmm12-15 <- words of [r8]
; zmm16-zmm31 keep a copy of that initial state for the feed-forward add.
; r9 = entry rsp, r10 = address of $L$sigma, eax = round-loop counter.
;-----------------------------------------------------------------------
2777ChaCha20_16x:
2778 mov QWORD[8+rsp],rdi ;WIN64 prologue
2779 mov QWORD[16+rsp],rsi
2780 mov rax,rsp
2781$L$SEH_begin_ChaCha20_16x:
2782 mov rdi,rcx
2783 mov rsi,rdx
2784 mov rdx,r8
2785 mov rcx,r9
2786 mov r8,QWORD[40+rsp]
2787
2788
2789
2790$L$ChaCha20_16x:
2791 mov r9,rsp
2792
; 168 bytes below r9 for xmm6-xmm15 plus a 64-byte scratch block at [rsp];
; rsp is rounded down to a 64-byte boundary because the tail code stores
; to [rsp] with the aligned vmovdqa32.
2793 sub rsp,64+168
2794 and rsp,-64
2795 movaps XMMWORD[(-168)+r9],xmm6
2796 movaps XMMWORD[(-152)+r9],xmm7
2797 movaps XMMWORD[(-136)+r9],xmm8
2798 movaps XMMWORD[(-120)+r9],xmm9
2799 movaps XMMWORD[(-104)+r9],xmm10
2800 movaps XMMWORD[(-88)+r9],xmm11
2801 movaps XMMWORD[(-72)+r9],xmm12
2802 movaps XMMWORD[(-56)+r9],xmm13
2803 movaps XMMWORD[(-40)+r9],xmm14
2804 movaps XMMWORD[(-24)+r9],xmm15
2805$L$16x_body:
2806 vzeroupper
2807
; Replicate each 16-byte state row into all four 128-bit lanes.
2808 lea r10,[$L$sigma]
2809 vbroadcasti32x4 zmm3,ZMMWORD[r10]
2810 vbroadcasti32x4 zmm7,ZMMWORD[rcx]
2811 vbroadcasti32x4 zmm11,ZMMWORD[16+rcx]
2812 vbroadcasti32x4 zmm15,ZMMWORD[r8]
2813
; vpshufd 0x00/0x55/0xaa/0xff splats dword 0/1/2/3 of every lane:
; one register per state word.
2814 vpshufd zmm0,zmm3,0x00
2815 vpshufd zmm1,zmm3,0x55
2816 vpshufd zmm2,zmm3,0xaa
2817 vpshufd zmm3,zmm3,0xff
2818 vmovdqa64 zmm16,zmm0
2819 vmovdqa64 zmm17,zmm1
2820 vmovdqa64 zmm18,zmm2
2821 vmovdqa64 zmm19,zmm3
2822
2823 vpshufd zmm4,zmm7,0x00
2824 vpshufd zmm5,zmm7,0x55
2825 vpshufd zmm6,zmm7,0xaa
2826 vpshufd zmm7,zmm7,0xff
2827 vmovdqa64 zmm20,zmm4
2828 vmovdqa64 zmm21,zmm5
2829 vmovdqa64 zmm22,zmm6
2830 vmovdqa64 zmm23,zmm7
2831
2832 vpshufd zmm8,zmm11,0x00
2833 vpshufd zmm9,zmm11,0x55
2834 vpshufd zmm10,zmm11,0xaa
2835 vpshufd zmm11,zmm11,0xff
2836 vmovdqa64 zmm24,zmm8
2837 vmovdqa64 zmm25,zmm9
2838 vmovdqa64 zmm26,zmm10
2839 vmovdqa64 zmm27,zmm11
2840
2841 vpshufd zmm12,zmm15,0x00
2842 vpshufd zmm13,zmm15,0x55
2843 vpshufd zmm14,zmm15,0xaa
2844 vpshufd zmm15,zmm15,0xff
; $L$incz = 0,1,...,15: give every lane its own block counter.
2845 vpaddd zmm12,zmm12,ZMMWORD[$L$incz]
2846 vmovdqa64 zmm28,zmm12
2847 vmovdqa64 zmm29,zmm13
2848 vmovdqa64 zmm30,zmm14
2849 vmovdqa64 zmm31,zmm15
2850
; 10 iterations of the double round.
2851 mov eax,10
2852 jmp NEAR $L$oop16x
2853
2854ALIGN 32
; Start of every 1024-byte batch after the first: rebuild the working
; state zmm0-zmm15 for the next 16 blocks.
2855$L$oop_outer16x:
; Constant words are re-broadcast from memory (r10 = $L$sigma) rather than
; copied from zmm16-zmm19, because the output shuffle in $L$oop16x
; overwrites zmm16-zmm19.
2856 vpbroadcastd zmm0,DWORD[r10]
2857 vpbroadcastd zmm1,DWORD[4+r10]
2858 vpbroadcastd zmm2,DWORD[8+r10]
2859 vpbroadcastd zmm3,DWORD[12+r10]
; Advance all 16 per-lane block counters by 16 ($L$sixteen).
2860 vpaddd zmm28,zmm28,ZMMWORD[$L$sixteen]
; Reload state words 4-15 from their saved copies.
2861 vmovdqa64 zmm4,zmm20
2862 vmovdqa64 zmm5,zmm21
2863 vmovdqa64 zmm6,zmm22
2864 vmovdqa64 zmm7,zmm23
2865 vmovdqa64 zmm8,zmm24
2866 vmovdqa64 zmm9,zmm25
2867 vmovdqa64 zmm10,zmm26
2868 vmovdqa64 zmm11,zmm27
2869 vmovdqa64 zmm12,zmm28
2870 vmovdqa64 zmm13,zmm29
2871 vmovdqa64 zmm14,zmm30
2872 vmovdqa64 zmm15,zmm31
2873
; Re-establish the saved copy of the constants for the feed-forward add.
2874 vmovdqa64 zmm16,zmm0
2875 vmovdqa64 zmm17,zmm1
2876 vmovdqa64 zmm18,zmm2
2877 vmovdqa64 zmm19,zmm3
2878
2879 mov eax,10
2880 jmp NEAR $L$oop16x
2881
2882ALIGN 32
; Round loop for the 16-way path, followed by feed-forward, data
; re-arrangement and the full-batch (1024-byte) XOR/store.
; zmm0-3 = a0..a3, zmm4-7 = b0..b3, zmm8-11 = c0..c3, zmm12-15 = d0..d3,
; each register holding one state word for 16 independent blocks.
; eax counts down from 10; each iteration is one column pass plus one
; diagonal pass with rotations 16/12/8/7.
2883$L$oop16x:
; --- column pass: (a_i, b_i, c_i, d_i) ---
2884 vpaddd zmm0,zmm0,zmm4
2885 vpaddd zmm1,zmm1,zmm5
2886 vpaddd zmm2,zmm2,zmm6
2887 vpaddd zmm3,zmm3,zmm7
2888 vpxord zmm12,zmm12,zmm0
2889 vpxord zmm13,zmm13,zmm1
2890 vpxord zmm14,zmm14,zmm2
2891 vpxord zmm15,zmm15,zmm3
2892 vprold zmm12,zmm12,16
2893 vprold zmm13,zmm13,16
2894 vprold zmm14,zmm14,16
2895 vprold zmm15,zmm15,16
2896 vpaddd zmm8,zmm8,zmm12
2897 vpaddd zmm9,zmm9,zmm13
2898 vpaddd zmm10,zmm10,zmm14
2899 vpaddd zmm11,zmm11,zmm15
2900 vpxord zmm4,zmm4,zmm8
2901 vpxord zmm5,zmm5,zmm9
2902 vpxord zmm6,zmm6,zmm10
2903 vpxord zmm7,zmm7,zmm11
2904 vprold zmm4,zmm4,12
2905 vprold zmm5,zmm5,12
2906 vprold zmm6,zmm6,12
2907 vprold zmm7,zmm7,12
2908 vpaddd zmm0,zmm0,zmm4
2909 vpaddd zmm1,zmm1,zmm5
2910 vpaddd zmm2,zmm2,zmm6
2911 vpaddd zmm3,zmm3,zmm7
2912 vpxord zmm12,zmm12,zmm0
2913 vpxord zmm13,zmm13,zmm1
2914 vpxord zmm14,zmm14,zmm2
2915 vpxord zmm15,zmm15,zmm3
2916 vprold zmm12,zmm12,8
2917 vprold zmm13,zmm13,8
2918 vprold zmm14,zmm14,8
2919 vprold zmm15,zmm15,8
2920 vpaddd zmm8,zmm8,zmm12
2921 vpaddd zmm9,zmm9,zmm13
2922 vpaddd zmm10,zmm10,zmm14
2923 vpaddd zmm11,zmm11,zmm15
2924 vpxord zmm4,zmm4,zmm8
2925 vpxord zmm5,zmm5,zmm9
2926 vpxord zmm6,zmm6,zmm10
2927 vpxord zmm7,zmm7,zmm11
2928 vprold zmm4,zmm4,7
2929 vprold zmm5,zmm5,7
2930 vprold zmm6,zmm6,7
2931 vprold zmm7,zmm7,7
; --- diagonal pass: (a0,b1,c2,d3) (a1,b2,c3,d0) (a2,b3,c0,d1) (a3,b0,c1,d2);
; done by pairing different registers, no data movement needed ---
2932 vpaddd zmm0,zmm0,zmm5
2933 vpaddd zmm1,zmm1,zmm6
2934 vpaddd zmm2,zmm2,zmm7
2935 vpaddd zmm3,zmm3,zmm4
2936 vpxord zmm15,zmm15,zmm0
2937 vpxord zmm12,zmm12,zmm1
2938 vpxord zmm13,zmm13,zmm2
2939 vpxord zmm14,zmm14,zmm3
2940 vprold zmm15,zmm15,16
2941 vprold zmm12,zmm12,16
2942 vprold zmm13,zmm13,16
2943 vprold zmm14,zmm14,16
2944 vpaddd zmm10,zmm10,zmm15
2945 vpaddd zmm11,zmm11,zmm12
2946 vpaddd zmm8,zmm8,zmm13
2947 vpaddd zmm9,zmm9,zmm14
2948 vpxord zmm5,zmm5,zmm10
2949 vpxord zmm6,zmm6,zmm11
2950 vpxord zmm7,zmm7,zmm8
2951 vpxord zmm4,zmm4,zmm9
2952 vprold zmm5,zmm5,12
2953 vprold zmm6,zmm6,12
2954 vprold zmm7,zmm7,12
2955 vprold zmm4,zmm4,12
2956 vpaddd zmm0,zmm0,zmm5
2957 vpaddd zmm1,zmm1,zmm6
2958 vpaddd zmm2,zmm2,zmm7
2959 vpaddd zmm3,zmm3,zmm4
2960 vpxord zmm15,zmm15,zmm0
2961 vpxord zmm12,zmm12,zmm1
2962 vpxord zmm13,zmm13,zmm2
2963 vpxord zmm14,zmm14,zmm3
2964 vprold zmm15,zmm15,8
2965 vprold zmm12,zmm12,8
2966 vprold zmm13,zmm13,8
2967 vprold zmm14,zmm14,8
2968 vpaddd zmm10,zmm10,zmm15
2969 vpaddd zmm11,zmm11,zmm12
2970 vpaddd zmm8,zmm8,zmm13
2971 vpaddd zmm9,zmm9,zmm14
2972 vpxord zmm5,zmm5,zmm10
2973 vpxord zmm6,zmm6,zmm11
2974 vpxord zmm7,zmm7,zmm8
2975 vpxord zmm4,zmm4,zmm9
2976 vprold zmm5,zmm5,7
2977 vprold zmm6,zmm6,7
2978 vprold zmm7,zmm7,7
2979 vprold zmm4,zmm4,7
2980 dec eax
2981 jnz NEAR $L$oop16x
2982
; Feed-forward for words 0-3 (add the saved input state).
2983 vpaddd zmm0,zmm0,zmm16
2984 vpaddd zmm1,zmm1,zmm17
2985 vpaddd zmm2,zmm2,zmm18
2986 vpaddd zmm3,zmm3,zmm19
2987
; De-interlace: 4x4 dword interleave inside each 128-bit lane, using
; zmm18/zmm19 as temporaries (their saved constants are no longer needed
; in this batch).
2988 vpunpckldq zmm18,zmm0,zmm1
2989 vpunpckldq zmm19,zmm2,zmm3
2990 vpunpckhdq zmm0,zmm0,zmm1
2991 vpunpckhdq zmm2,zmm2,zmm3
2992 vpunpcklqdq zmm1,zmm18,zmm19
2993 vpunpckhqdq zmm18,zmm18,zmm19
2994 vpunpcklqdq zmm3,zmm0,zmm2
2995 vpunpckhqdq zmm0,zmm0,zmm2
; Feed-forward for words 4-7, then the same interleave.
2996 vpaddd zmm4,zmm4,zmm20
2997 vpaddd zmm5,zmm5,zmm21
2998 vpaddd zmm6,zmm6,zmm22
2999 vpaddd zmm7,zmm7,zmm23
3000
3001 vpunpckldq zmm2,zmm4,zmm5
3002 vpunpckldq zmm19,zmm6,zmm7
3003 vpunpckhdq zmm4,zmm4,zmm5
3004 vpunpckhdq zmm6,zmm6,zmm7
3005 vpunpcklqdq zmm5,zmm2,zmm19
3006 vpunpckhqdq zmm2,zmm2,zmm19
3007 vpunpcklqdq zmm7,zmm4,zmm6
3008 vpunpckhqdq zmm4,zmm4,zmm6
; Combine 128-bit lanes of the word 0-3 and word 4-7 groups.
3009 vshufi32x4 zmm19,zmm1,zmm5,0x44
3010 vshufi32x4 zmm5,zmm1,zmm5,0xee
3011 vshufi32x4 zmm1,zmm18,zmm2,0x44
3012 vshufi32x4 zmm2,zmm18,zmm2,0xee
3013 vshufi32x4 zmm18,zmm3,zmm7,0x44
3014 vshufi32x4 zmm7,zmm3,zmm7,0xee
3015 vshufi32x4 zmm3,zmm0,zmm4,0x44
3016 vshufi32x4 zmm4,zmm0,zmm4,0xee
; Feed-forward for words 8-11, then interleave.
3017 vpaddd zmm8,zmm8,zmm24
3018 vpaddd zmm9,zmm9,zmm25
3019 vpaddd zmm10,zmm10,zmm26
3020 vpaddd zmm11,zmm11,zmm27
3021
3022 vpunpckldq zmm6,zmm8,zmm9
3023 vpunpckldq zmm0,zmm10,zmm11
3024 vpunpckhdq zmm8,zmm8,zmm9
3025 vpunpckhdq zmm10,zmm10,zmm11
3026 vpunpcklqdq zmm9,zmm6,zmm0
3027 vpunpckhqdq zmm6,zmm6,zmm0
3028 vpunpcklqdq zmm11,zmm8,zmm10
3029 vpunpckhqdq zmm8,zmm8,zmm10
; Feed-forward for words 12-15 (zmm28 carries the per-lane counters),
; then interleave.
3030 vpaddd zmm12,zmm12,zmm28
3031 vpaddd zmm13,zmm13,zmm29
3032 vpaddd zmm14,zmm14,zmm30
3033 vpaddd zmm15,zmm15,zmm31
3034
3035 vpunpckldq zmm10,zmm12,zmm13
3036 vpunpckldq zmm0,zmm14,zmm15
3037 vpunpckhdq zmm12,zmm12,zmm13
3038 vpunpckhdq zmm14,zmm14,zmm15
3039 vpunpcklqdq zmm13,zmm10,zmm0
3040 vpunpckhqdq zmm10,zmm10,zmm0
3041 vpunpcklqdq zmm15,zmm12,zmm14
3042 vpunpckhqdq zmm12,zmm12,zmm14
; Combine lanes of the word 8-11 and word 12-15 groups.
3043 vshufi32x4 zmm0,zmm9,zmm13,0x44
3044 vshufi32x4 zmm13,zmm9,zmm13,0xee
3045 vshufi32x4 zmm9,zmm6,zmm10,0x44
3046 vshufi32x4 zmm10,zmm6,zmm10,0xee
3047 vshufi32x4 zmm6,zmm11,zmm15,0x44
3048 vshufi32x4 zmm15,zmm11,zmm15,0xee
3049 vshufi32x4 zmm11,zmm8,zmm12,0x44
3050 vshufi32x4 zmm12,zmm8,zmm12,0xee
; Final lane shuffle merging both halves. Afterwards the registers are
; consumed as consecutive 64-byte keystream chunks in the order
; zmm16,17,14,8, 19,1,18,3, 0,9,6,11, 13,10,15,12.
3051 vshufi32x4 zmm16,zmm19,zmm0,0x88
3052 vshufi32x4 zmm19,zmm19,zmm0,0xdd
3053 vshufi32x4 zmm0,zmm5,zmm13,0x88
3054 vshufi32x4 zmm13,zmm5,zmm13,0xdd
3055 vshufi32x4 zmm17,zmm1,zmm9,0x88
3056 vshufi32x4 zmm1,zmm1,zmm9,0xdd
3057 vshufi32x4 zmm9,zmm2,zmm10,0x88
3058 vshufi32x4 zmm10,zmm2,zmm10,0xdd
3059 vshufi32x4 zmm14,zmm18,zmm6,0x88
3060 vshufi32x4 zmm18,zmm18,zmm6,0xdd
3061 vshufi32x4 zmm6,zmm7,zmm15,0x88
3062 vshufi32x4 zmm15,zmm7,zmm15,0xdd
3063 vshufi32x4 zmm8,zmm3,zmm11,0x88
3064 vshufi32x4 zmm3,zmm3,zmm11,0xdd
3065 vshufi32x4 zmm11,zmm4,zmm12,0x88
3066 vshufi32x4 zmm12,zmm4,zmm12,0xdd
; Fewer than 1024 bytes left: handle chunk by chunk in the tail code.
3067 cmp rdx,64*16
3068 jb NEAR $L$tail16x
3069
; Full batch: XOR 16 x 64 bytes of input (rsi) with keystream, store to rdi.
3070 vpxord zmm16,zmm16,ZMMWORD[rsi]
3071 vpxord zmm17,zmm17,ZMMWORD[64+rsi]
3072 vpxord zmm14,zmm14,ZMMWORD[128+rsi]
3073 vpxord zmm8,zmm8,ZMMWORD[192+rsi]
3074 vmovdqu32 ZMMWORD[rdi],zmm16
3075 vmovdqu32 ZMMWORD[64+rdi],zmm17
3076 vmovdqu32 ZMMWORD[128+rdi],zmm14
3077 vmovdqu32 ZMMWORD[192+rdi],zmm8
3078
3079 vpxord zmm19,zmm19,ZMMWORD[256+rsi]
3080 vpxord zmm1,zmm1,ZMMWORD[320+rsi]
3081 vpxord zmm18,zmm18,ZMMWORD[384+rsi]
3082 vpxord zmm3,zmm3,ZMMWORD[448+rsi]
3083 vmovdqu32 ZMMWORD[256+rdi],zmm19
3084 vmovdqu32 ZMMWORD[320+rdi],zmm1
3085 vmovdqu32 ZMMWORD[384+rdi],zmm18
3086 vmovdqu32 ZMMWORD[448+rdi],zmm3
3087
3088 vpxord zmm0,zmm0,ZMMWORD[512+rsi]
3089 vpxord zmm9,zmm9,ZMMWORD[576+rsi]
3090 vpxord zmm6,zmm6,ZMMWORD[640+rsi]
3091 vpxord zmm11,zmm11,ZMMWORD[704+rsi]
3092 vmovdqu32 ZMMWORD[512+rdi],zmm0
3093 vmovdqu32 ZMMWORD[576+rdi],zmm9
3094 vmovdqu32 ZMMWORD[640+rdi],zmm6
3095 vmovdqu32 ZMMWORD[704+rdi],zmm11
3096
3097 vpxord zmm13,zmm13,ZMMWORD[768+rsi]
3098 vpxord zmm10,zmm10,ZMMWORD[832+rsi]
3099 vpxord zmm15,zmm15,ZMMWORD[896+rsi]
3100 vpxord zmm12,zmm12,ZMMWORD[960+rsi]
3101 lea rsi,[1024+rsi]
3102 vmovdqu32 ZMMWORD[768+rdi],zmm13
3103 vmovdqu32 ZMMWORD[832+rdi],zmm10
3104 vmovdqu32 ZMMWORD[896+rdi],zmm15
3105 vmovdqu32 ZMMWORD[960+rdi],zmm12
3106 lea rdi,[1024+rdi]
3107
; More data: next batch of 16 blocks; otherwise finished.
3108 sub rdx,64*16
3109 jnz NEAR $L$oop_outer16x
3110
3111 jmp NEAR $L$done16x
3112
3113ALIGN 32
; Tail of the 16-way path: fewer than 1024 bytes remain (rdx) and the 16
; keystream chunks are in zmm16,17,14,8,19,1,18,3,0,9,6,11,13,10,15,12.
; rdi is turned into the offset (out - in), so [rsi*1+rdi] addresses the
; output while only rsi is advanced. r10 is zeroed for use as the byte
; index of the final partial chunk.
; Each step k compares rdx with 64*k: below -> a partial chunk remains
; (its keystream is in zmm16); otherwise XOR/store one full chunk, stop if
; that was exactly the end (je uses the flags of the cmp; the vector
; instructions in between do not modify flags), else stage the next
; keystream chunk in zmm16 and advance rsi.
3114$L$tail16x:
3115 xor r10,r10
3116 sub rdi,rsi
3117 cmp rdx,64*1
3118 jb NEAR $L$ess_than_64_16x
3119 vpxord zmm16,zmm16,ZMMWORD[rsi]
3120 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm16
3121 je NEAR $L$done16x
3122 vmovdqa32 zmm16,zmm17
3123 lea rsi,[64+rsi]
3124
3125 cmp rdx,64*2
3126 jb NEAR $L$ess_than_64_16x
3127 vpxord zmm17,zmm17,ZMMWORD[rsi]
3128 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm17
3129 je NEAR $L$done16x
3130 vmovdqa32 zmm16,zmm14
3131 lea rsi,[64+rsi]
3132
3133 cmp rdx,64*3
3134 jb NEAR $L$ess_than_64_16x
3135 vpxord zmm14,zmm14,ZMMWORD[rsi]
3136 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm14
3137 je NEAR $L$done16x
3138 vmovdqa32 zmm16,zmm8
3139 lea rsi,[64+rsi]
3140
3141 cmp rdx,64*4
3142 jb NEAR $L$ess_than_64_16x
3143 vpxord zmm8,zmm8,ZMMWORD[rsi]
3144 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm8
3145 je NEAR $L$done16x
3146 vmovdqa32 zmm16,zmm19
3147 lea rsi,[64+rsi]
3148
3149 cmp rdx,64*5
3150 jb NEAR $L$ess_than_64_16x
3151 vpxord zmm19,zmm19,ZMMWORD[rsi]
3152 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm19
3153 je NEAR $L$done16x
3154 vmovdqa32 zmm16,zmm1
3155 lea rsi,[64+rsi]
3156
3157 cmp rdx,64*6
3158 jb NEAR $L$ess_than_64_16x
3159 vpxord zmm1,zmm1,ZMMWORD[rsi]
3160 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm1
3161 je NEAR $L$done16x
3162 vmovdqa32 zmm16,zmm18
3163 lea rsi,[64+rsi]
3164
3165 cmp rdx,64*7
3166 jb NEAR $L$ess_than_64_16x
3167 vpxord zmm18,zmm18,ZMMWORD[rsi]
3168 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm18
3169 je NEAR $L$done16x
3170 vmovdqa32 zmm16,zmm3
3171 lea rsi,[64+rsi]
3172
3173 cmp rdx,64*8
3174 jb NEAR $L$ess_than_64_16x
3175 vpxord zmm3,zmm3,ZMMWORD[rsi]
3176 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm3
3177 je NEAR $L$done16x
3178 vmovdqa32 zmm16,zmm0
3179 lea rsi,[64+rsi]
3180
3181 cmp rdx,64*9
3182 jb NEAR $L$ess_than_64_16x
3183 vpxord zmm0,zmm0,ZMMWORD[rsi]
3184 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm0
3185 je NEAR $L$done16x
3186 vmovdqa32 zmm16,zmm9
3187 lea rsi,[64+rsi]
3188
3189 cmp rdx,64*10
3190 jb NEAR $L$ess_than_64_16x
3191 vpxord zmm9,zmm9,ZMMWORD[rsi]
3192 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm9
3193 je NEAR $L$done16x
3194 vmovdqa32 zmm16,zmm6
3195 lea rsi,[64+rsi]
3196
3197 cmp rdx,64*11
3198 jb NEAR $L$ess_than_64_16x
3199 vpxord zmm6,zmm6,ZMMWORD[rsi]
3200 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm6
3201 je NEAR $L$done16x
3202 vmovdqa32 zmm16,zmm11
3203 lea rsi,[64+rsi]
3204
3205 cmp rdx,64*12
3206 jb NEAR $L$ess_than_64_16x
3207 vpxord zmm11,zmm11,ZMMWORD[rsi]
3208 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm11
3209 je NEAR $L$done16x
3210 vmovdqa32 zmm16,zmm13
3211 lea rsi,[64+rsi]
3212
3213 cmp rdx,64*13
3214 jb NEAR $L$ess_than_64_16x
3215 vpxord zmm13,zmm13,ZMMWORD[rsi]
3216 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm13
3217 je NEAR $L$done16x
3218 vmovdqa32 zmm16,zmm10
3219 lea rsi,[64+rsi]
3220
3221 cmp rdx,64*14
3222 jb NEAR $L$ess_than_64_16x
3223 vpxord zmm10,zmm10,ZMMWORD[rsi]
3224 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm10
3225 je NEAR $L$done16x
3226 vmovdqa32 zmm16,zmm15
3227 lea rsi,[64+rsi]
3228
3229 cmp rdx,64*15
3230 jb NEAR $L$ess_than_64_16x
3231 vpxord zmm15,zmm15,ZMMWORD[rsi]
3232 vmovdqu32 ZMMWORD[rsi*1+rdi],zmm15
3233 je NEAR $L$done16x
3234 vmovdqa32 zmm16,zmm12
3235 lea rsi,[64+rsi]
3236
; Partial chunk: spill its keystream (zmm16) to the 64-byte-aligned stack
; scratch, turn rdi back into an absolute output pointer, and reduce rdx
; to the number of bytes in the partial chunk.
3237$L$ess_than_64_16x:
3238 vmovdqa32 ZMMWORD[rsp],zmm16
3239 lea rdi,[rsi*1+rdi]
3240 and rdx,63
3241
; Byte-wise XOR of the last rdx bytes; r10 is the running index.
3242$L$oop_tail16x:
3243 movzx eax,BYTE[r10*1+rsi]
3244 movzx ecx,BYTE[r10*1+rsp]
3245 lea r10,[1+r10]
3246 xor eax,ecx
3247 mov BYTE[((-1))+r10*1+rdi],al
3248 dec rdx
3249 jnz NEAR $L$oop_tail16x
3250
; Zero the stack copy of the keystream.
3251 vpxord zmm16,zmm16,zmm16
3252 vmovdqa32 ZMMWORD[rsp],zmm16
3253
3254$L$done16x:
; Clear the vector registers, restore the Win64 callee-saved xmm6-xmm15
; from the save area below r9, and restore rsp from r9.
3255 vzeroall
3256 movaps xmm6,XMMWORD[((-168))+r9]
3257 movaps xmm7,XMMWORD[((-152))+r9]
3258 movaps xmm8,XMMWORD[((-136))+r9]
3259 movaps xmm9,XMMWORD[((-120))+r9]
3260 movaps xmm10,XMMWORD[((-104))+r9]
3261 movaps xmm11,XMMWORD[((-88))+r9]
3262 movaps xmm12,XMMWORD[((-72))+r9]
3263 movaps xmm13,XMMWORD[((-56))+r9]
3264 movaps xmm14,XMMWORD[((-40))+r9]
3265 movaps xmm15,XMMWORD[((-24))+r9]
3266 lea rsp,[r9]
3267
3268$L$16x_epilogue:
3269 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
3270 mov rsi,QWORD[16+rsp]
3271 DB 0F3h,0C3h ;repret
3272
3273$L$SEH_end_ChaCha20_16x:
3274
3275ALIGN 32
;-----------------------------------------------------------------------
; ChaCha20_8xvl -- 256-bit (ymm, AVX512VL encodings) ChaCha20 path
; computing 8 blocks in parallel; this part is the entry sequence and
; state setup.
;
; ChaCha20_8xvl is the Win64 entry (saves rdi/rsi in the home slots and
; remaps rcx,rdx,r8,r9,[40+rsp] to rdi,rsi,rdx,rcx,r8);
; $L$ChaCha20_8xvl is the internal entry with registers already remapped.
;
; Layout built here: each of ymm0-ymm15 holds ONE 32-bit state word
; replicated across all 8 dword lanes.
;   ymm0-3   <- words of $L$sigma       ymm4-7   <- words of [rcx]
;   ymm8-11  <- words of [16+rcx]       ymm12-15 <- words of [r8]
; ymm16-ymm31 keep a copy of that initial state for the feed-forward add.
; r9 = entry rsp, r10 = address of $L$sigma, eax = round-loop counter.
;-----------------------------------------------------------------------
3276ChaCha20_8xvl:
3277 mov QWORD[8+rsp],rdi ;WIN64 prologue
3278 mov QWORD[16+rsp],rsi
3279 mov rax,rsp
3280$L$SEH_begin_ChaCha20_8xvl:
3281 mov rdi,rcx
3282 mov rsi,rdx
3283 mov rdx,r8
3284 mov rcx,r9
3285 mov r8,QWORD[40+rsp]
3286
3287
3288
3289$L$ChaCha20_8xvl:
3290 mov r9,rsp
3291
; 168 bytes below r9 for xmm6-xmm15 plus a 64-byte scratch block at [rsp];
; rsp is rounded down to a 64-byte boundary (the tail code uses aligned
; vmovdqa stores to [rsp] and [32+rsp]).
3292 sub rsp,64+168
3293 and rsp,-64
3294 movaps XMMWORD[(-168)+r9],xmm6
3295 movaps XMMWORD[(-152)+r9],xmm7
3296 movaps XMMWORD[(-136)+r9],xmm8
3297 movaps XMMWORD[(-120)+r9],xmm9
3298 movaps XMMWORD[(-104)+r9],xmm10
3299 movaps XMMWORD[(-88)+r9],xmm11
3300 movaps XMMWORD[(-72)+r9],xmm12
3301 movaps XMMWORD[(-56)+r9],xmm13
3302 movaps XMMWORD[(-40)+r9],xmm14
3303 movaps XMMWORD[(-24)+r9],xmm15
3304$L$8xvl_body:
3305 vzeroupper
3306
; Replicate each 16-byte state row into both 128-bit lanes.
3307 lea r10,[$L$sigma]
3308 vbroadcasti128 ymm3,XMMWORD[r10]
3309 vbroadcasti128 ymm7,XMMWORD[rcx]
3310 vbroadcasti128 ymm11,XMMWORD[16+rcx]
3311 vbroadcasti128 ymm15,XMMWORD[r8]
3312
; vpshufd 0x00/0x55/0xaa/0xff splats dword 0/1/2/3 of every lane:
; one register per state word.
3313 vpshufd ymm0,ymm3,0x00
3314 vpshufd ymm1,ymm3,0x55
3315 vpshufd ymm2,ymm3,0xaa
3316 vpshufd ymm3,ymm3,0xff
3317 vmovdqa64 ymm16,ymm0
3318 vmovdqa64 ymm17,ymm1
3319 vmovdqa64 ymm18,ymm2
3320 vmovdqa64 ymm19,ymm3
3321
3322 vpshufd ymm4,ymm7,0x00
3323 vpshufd ymm5,ymm7,0x55
3324 vpshufd ymm6,ymm7,0xaa
3325 vpshufd ymm7,ymm7,0xff
3326 vmovdqa64 ymm20,ymm4
3327 vmovdqa64 ymm21,ymm5
3328 vmovdqa64 ymm22,ymm6
3329 vmovdqa64 ymm23,ymm7
3330
3331 vpshufd ymm8,ymm11,0x00
3332 vpshufd ymm9,ymm11,0x55
3333 vpshufd ymm10,ymm11,0xaa
3334 vpshufd ymm11,ymm11,0xff
3335 vmovdqa64 ymm24,ymm8
3336 vmovdqa64 ymm25,ymm9
3337 vmovdqa64 ymm26,ymm10
3338 vmovdqa64 ymm27,ymm11
3339
3340 vpshufd ymm12,ymm15,0x00
3341 vpshufd ymm13,ymm15,0x55
3342 vpshufd ymm14,ymm15,0xaa
3343 vpshufd ymm15,ymm15,0xff
; $L$incy = 0,2,4,6,1,3,5,7: per-lane block-counter offsets, in the lane
; order that the later de-interlace step expects.
3344 vpaddd ymm12,ymm12,YMMWORD[$L$incy]
3345 vmovdqa64 ymm28,ymm12
3346 vmovdqa64 ymm29,ymm13
3347 vmovdqa64 ymm30,ymm14
3348 vmovdqa64 ymm31,ymm15
3349
; 10 iterations of the double round.
3350 mov eax,10
3351 jmp NEAR $L$oop8xvl
3352
3353ALIGN 32
; Start of every 512-byte batch after the first: rebuild the working
; state ymm0-ymm15 for the next 8 blocks.
; Only constant words 2 and 3 are broadcast here; ymm0/ymm1 (words 0 and
; 1) are already re-broadcast just before the jnz that branches to this
; label. r10 = address of $L$sigma.
3354$L$oop_outer8xvl:
3355
3356
3357 vpbroadcastd ymm2,DWORD[8+r10]
3358 vpbroadcastd ymm3,DWORD[12+r10]
; Advance all 8 per-lane block counters by 8 ($L$eight).
3359 vpaddd ymm28,ymm28,YMMWORD[$L$eight]
; Reload state words 4-15 from their saved copies.
3360 vmovdqa64 ymm4,ymm20
3361 vmovdqa64 ymm5,ymm21
3362 vmovdqa64 ymm6,ymm22
3363 vmovdqa64 ymm7,ymm23
3364 vmovdqa64 ymm8,ymm24
3365 vmovdqa64 ymm9,ymm25
3366 vmovdqa64 ymm10,ymm26
3367 vmovdqa64 ymm11,ymm27
3368 vmovdqa64 ymm12,ymm28
3369 vmovdqa64 ymm13,ymm29
3370 vmovdqa64 ymm14,ymm30
3371 vmovdqa64 ymm15,ymm31
3372
; Re-establish the saved copy of the constants (ymm16-ymm19): ymm18/ymm19
; are used as temporaries by the output shuffle.
3373 vmovdqa64 ymm16,ymm0
3374 vmovdqa64 ymm17,ymm1
3375 vmovdqa64 ymm18,ymm2
3376 vmovdqa64 ymm19,ymm3
3377
3378 mov eax,10
3379 jmp NEAR $L$oop8xvl
3380
3381ALIGN 32
; Round loop for the 8-way path, followed by feed-forward, data
; re-arrangement and the full-batch (512-byte) XOR/store.
; ymm0-3 = a0..a3, ymm4-7 = b0..b3, ymm8-11 = c0..c3, ymm12-15 = d0..d3,
; each register holding one state word for 8 independent blocks.
; eax counts down from 10; each iteration is one column pass plus one
; diagonal pass with rotations 16/12/8/7.
3382$L$oop8xvl:
; --- column pass: (a_i, b_i, c_i, d_i) ---
3383 vpaddd ymm0,ymm0,ymm4
3384 vpaddd ymm1,ymm1,ymm5
3385 vpaddd ymm2,ymm2,ymm6
3386 vpaddd ymm3,ymm3,ymm7
3387 vpxor ymm12,ymm12,ymm0
3388 vpxor ymm13,ymm13,ymm1
3389 vpxor ymm14,ymm14,ymm2
3390 vpxor ymm15,ymm15,ymm3
3391 vprold ymm12,ymm12,16
3392 vprold ymm13,ymm13,16
3393 vprold ymm14,ymm14,16
3394 vprold ymm15,ymm15,16
3395 vpaddd ymm8,ymm8,ymm12
3396 vpaddd ymm9,ymm9,ymm13
3397 vpaddd ymm10,ymm10,ymm14
3398 vpaddd ymm11,ymm11,ymm15
3399 vpxor ymm4,ymm4,ymm8
3400 vpxor ymm5,ymm5,ymm9
3401 vpxor ymm6,ymm6,ymm10
3402 vpxor ymm7,ymm7,ymm11
3403 vprold ymm4,ymm4,12
3404 vprold ymm5,ymm5,12
3405 vprold ymm6,ymm6,12
3406 vprold ymm7,ymm7,12
3407 vpaddd ymm0,ymm0,ymm4
3408 vpaddd ymm1,ymm1,ymm5
3409 vpaddd ymm2,ymm2,ymm6
3410 vpaddd ymm3,ymm3,ymm7
3411 vpxor ymm12,ymm12,ymm0
3412 vpxor ymm13,ymm13,ymm1
3413 vpxor ymm14,ymm14,ymm2
3414 vpxor ymm15,ymm15,ymm3
3415 vprold ymm12,ymm12,8
3416 vprold ymm13,ymm13,8
3417 vprold ymm14,ymm14,8
3418 vprold ymm15,ymm15,8
3419 vpaddd ymm8,ymm8,ymm12
3420 vpaddd ymm9,ymm9,ymm13
3421 vpaddd ymm10,ymm10,ymm14
3422 vpaddd ymm11,ymm11,ymm15
3423 vpxor ymm4,ymm4,ymm8
3424 vpxor ymm5,ymm5,ymm9
3425 vpxor ymm6,ymm6,ymm10
3426 vpxor ymm7,ymm7,ymm11
3427 vprold ymm4,ymm4,7
3428 vprold ymm5,ymm5,7
3429 vprold ymm6,ymm6,7
3430 vprold ymm7,ymm7,7
; --- diagonal pass: (a0,b1,c2,d3) (a1,b2,c3,d0) (a2,b3,c0,d1) (a3,b0,c1,d2);
; done by pairing different registers, no data movement needed ---
3431 vpaddd ymm0,ymm0,ymm5
3432 vpaddd ymm1,ymm1,ymm6
3433 vpaddd ymm2,ymm2,ymm7
3434 vpaddd ymm3,ymm3,ymm4
3435 vpxor ymm15,ymm15,ymm0
3436 vpxor ymm12,ymm12,ymm1
3437 vpxor ymm13,ymm13,ymm2
3438 vpxor ymm14,ymm14,ymm3
3439 vprold ymm15,ymm15,16
3440 vprold ymm12,ymm12,16
3441 vprold ymm13,ymm13,16
3442 vprold ymm14,ymm14,16
3443 vpaddd ymm10,ymm10,ymm15
3444 vpaddd ymm11,ymm11,ymm12
3445 vpaddd ymm8,ymm8,ymm13
3446 vpaddd ymm9,ymm9,ymm14
3447 vpxor ymm5,ymm5,ymm10
3448 vpxor ymm6,ymm6,ymm11
3449 vpxor ymm7,ymm7,ymm8
3450 vpxor ymm4,ymm4,ymm9
3451 vprold ymm5,ymm5,12
3452 vprold ymm6,ymm6,12
3453 vprold ymm7,ymm7,12
3454 vprold ymm4,ymm4,12
3455 vpaddd ymm0,ymm0,ymm5
3456 vpaddd ymm1,ymm1,ymm6
3457 vpaddd ymm2,ymm2,ymm7
3458 vpaddd ymm3,ymm3,ymm4
3459 vpxor ymm15,ymm15,ymm0
3460 vpxor ymm12,ymm12,ymm1
3461 vpxor ymm13,ymm13,ymm2
3462 vpxor ymm14,ymm14,ymm3
3463 vprold ymm15,ymm15,8
3464 vprold ymm12,ymm12,8
3465 vprold ymm13,ymm13,8
3466 vprold ymm14,ymm14,8
3467 vpaddd ymm10,ymm10,ymm15
3468 vpaddd ymm11,ymm11,ymm12
3469 vpaddd ymm8,ymm8,ymm13
3470 vpaddd ymm9,ymm9,ymm14
3471 vpxor ymm5,ymm5,ymm10
3472 vpxor ymm6,ymm6,ymm11
3473 vpxor ymm7,ymm7,ymm8
3474 vpxor ymm4,ymm4,ymm9
3475 vprold ymm5,ymm5,7
3476 vprold ymm6,ymm6,7
3477 vprold ymm7,ymm7,7
3478 vprold ymm4,ymm4,7
3479 dec eax
3480 jnz NEAR $L$oop8xvl
3481
; Feed-forward for words 0-3 (add the saved input state).
3482 vpaddd ymm0,ymm0,ymm16
3483 vpaddd ymm1,ymm1,ymm17
3484 vpaddd ymm2,ymm2,ymm18
3485 vpaddd ymm3,ymm3,ymm19
3486
; De-interlace: dword/qword interleave inside each 128-bit lane, using
; ymm18/ymm19 as temporaries.
3487 vpunpckldq ymm18,ymm0,ymm1
3488 vpunpckldq ymm19,ymm2,ymm3
3489 vpunpckhdq ymm0,ymm0,ymm1
3490 vpunpckhdq ymm2,ymm2,ymm3
3491 vpunpcklqdq ymm1,ymm18,ymm19
3492 vpunpckhqdq ymm18,ymm18,ymm19
3493 vpunpcklqdq ymm3,ymm0,ymm2
3494 vpunpckhqdq ymm0,ymm0,ymm2
; Feed-forward for words 4-7, then the same interleave.
3495 vpaddd ymm4,ymm4,ymm20
3496 vpaddd ymm5,ymm5,ymm21
3497 vpaddd ymm6,ymm6,ymm22
3498 vpaddd ymm7,ymm7,ymm23
3499
3500 vpunpckldq ymm2,ymm4,ymm5
3501 vpunpckldq ymm19,ymm6,ymm7
3502 vpunpckhdq ymm4,ymm4,ymm5
3503 vpunpckhdq ymm6,ymm6,ymm7
3504 vpunpcklqdq ymm5,ymm2,ymm19
3505 vpunpckhqdq ymm2,ymm2,ymm19
3506 vpunpcklqdq ymm7,ymm4,ymm6
3507 vpunpckhqdq ymm4,ymm4,ymm6
; Pair up 128-bit lanes of the word 0-3 and word 4-7 groups
; (imm 0 selects the low lanes of both sources, imm 3 the high lanes).
3508 vshufi32x4 ymm19,ymm1,ymm5,0
3509 vshufi32x4 ymm5,ymm1,ymm5,3
3510 vshufi32x4 ymm1,ymm18,ymm2,0
3511 vshufi32x4 ymm2,ymm18,ymm2,3
3512 vshufi32x4 ymm18,ymm3,ymm7,0
3513 vshufi32x4 ymm7,ymm3,ymm7,3
3514 vshufi32x4 ymm3,ymm0,ymm4,0
3515 vshufi32x4 ymm4,ymm0,ymm4,3
; Feed-forward for words 8-11, then interleave.
3516 vpaddd ymm8,ymm8,ymm24
3517 vpaddd ymm9,ymm9,ymm25
3518 vpaddd ymm10,ymm10,ymm26
3519 vpaddd ymm11,ymm11,ymm27
3520
3521 vpunpckldq ymm6,ymm8,ymm9
3522 vpunpckldq ymm0,ymm10,ymm11
3523 vpunpckhdq ymm8,ymm8,ymm9
3524 vpunpckhdq ymm10,ymm10,ymm11
3525 vpunpcklqdq ymm9,ymm6,ymm0
3526 vpunpckhqdq ymm6,ymm6,ymm0
3527 vpunpcklqdq ymm11,ymm8,ymm10
3528 vpunpckhqdq ymm8,ymm8,ymm10
; Feed-forward for words 12-15 (ymm28 carries the per-lane counters),
; then interleave.
3529 vpaddd ymm12,ymm12,ymm28
3530 vpaddd ymm13,ymm13,ymm29
3531 vpaddd ymm14,ymm14,ymm30
3532 vpaddd ymm15,ymm15,ymm31
3533
3534 vpunpckldq ymm10,ymm12,ymm13
3535 vpunpckldq ymm0,ymm14,ymm15
3536 vpunpckhdq ymm12,ymm12,ymm13
3537 vpunpckhdq ymm14,ymm14,ymm15
3538 vpunpcklqdq ymm13,ymm10,ymm0
3539 vpunpckhqdq ymm10,ymm10,ymm0
3540 vpunpcklqdq ymm15,ymm12,ymm14
3541 vpunpckhqdq ymm12,ymm12,ymm14
; Pair up lanes of the word 8-11 and word 12-15 groups
; (0x20 = low lanes of both sources, 0x31 = high lanes).
3542 vperm2i128 ymm0,ymm9,ymm13,0x20
3543 vperm2i128 ymm13,ymm9,ymm13,0x31
3544 vperm2i128 ymm9,ymm6,ymm10,0x20
3545 vperm2i128 ymm10,ymm6,ymm10,0x31
3546 vperm2i128 ymm6,ymm11,ymm15,0x20
3547 vperm2i128 ymm15,ymm11,ymm15,0x31
3548 vperm2i128 ymm11,ymm8,ymm12,0x20
3549 vperm2i128 ymm12,ymm8,ymm12,0x31
; Keystream is now consumed as consecutive 32-byte pieces in the order
; ymm19,0, 5,13, 1,9, 2,10, 18,6, 7,15, 3,11, 4,12.
; Fewer than 512 bytes left: handle chunk by chunk in the tail code.
3550 cmp rdx,64*8
3551 jb NEAR $L$tail8xvl
3552
; Full batch: four groups of 128 bytes; eax = 0x80 is the pointer step
; (rax is zero-extended by the 32-bit mov).
3553 mov eax,0x80
3554 vpxord ymm19,ymm19,YMMWORD[rsi]
3555 vpxor ymm0,ymm0,YMMWORD[32+rsi]
3556 vpxor ymm5,ymm5,YMMWORD[64+rsi]
3557 vpxor ymm13,ymm13,YMMWORD[96+rsi]
3558 lea rsi,[rax*1+rsi]
3559 vmovdqu32 YMMWORD[rdi],ymm19
3560 vmovdqu YMMWORD[32+rdi],ymm0
3561 vmovdqu YMMWORD[64+rdi],ymm5
3562 vmovdqu YMMWORD[96+rdi],ymm13
3563 lea rdi,[rax*1+rdi]
3564
3565 vpxor ymm1,ymm1,YMMWORD[rsi]
3566 vpxor ymm9,ymm9,YMMWORD[32+rsi]
3567 vpxor ymm2,ymm2,YMMWORD[64+rsi]
3568 vpxor ymm10,ymm10,YMMWORD[96+rsi]
3569 lea rsi,[rax*1+rsi]
3570 vmovdqu YMMWORD[rdi],ymm1
3571 vmovdqu YMMWORD[32+rdi],ymm9
3572 vmovdqu YMMWORD[64+rdi],ymm2
3573 vmovdqu YMMWORD[96+rdi],ymm10
3574 lea rdi,[rax*1+rdi]
3575
3576 vpxord ymm18,ymm18,YMMWORD[rsi]
3577 vpxor ymm6,ymm6,YMMWORD[32+rsi]
3578 vpxor ymm7,ymm7,YMMWORD[64+rsi]
3579 vpxor ymm15,ymm15,YMMWORD[96+rsi]
3580 lea rsi,[rax*1+rsi]
3581 vmovdqu32 YMMWORD[rdi],ymm18
3582 vmovdqu YMMWORD[32+rdi],ymm6
3583 vmovdqu YMMWORD[64+rdi],ymm7
3584 vmovdqu YMMWORD[96+rdi],ymm15
3585 lea rdi,[rax*1+rdi]
3586
3587 vpxor ymm3,ymm3,YMMWORD[rsi]
3588 vpxor ymm11,ymm11,YMMWORD[32+rsi]
3589 vpxor ymm4,ymm4,YMMWORD[64+rsi]
3590 vpxor ymm12,ymm12,YMMWORD[96+rsi]
3591 lea rsi,[rax*1+rsi]
3592 vmovdqu YMMWORD[rdi],ymm3
3593 vmovdqu YMMWORD[32+rdi],ymm11
3594 vmovdqu YMMWORD[64+rdi],ymm4
3595 vmovdqu YMMWORD[96+rdi],ymm12
3596 lea rdi,[rax*1+rdi]
3597
; Pre-load constant words 0 and 1 for the next batch; the outer-loop
; header only reloads words 2 and 3.
3598 vpbroadcastd ymm0,DWORD[r10]
3599 vpbroadcastd ymm1,DWORD[4+r10]
3600
; More data: next batch of 8 blocks; otherwise finished.
3601 sub rdx,64*8
3602 jnz NEAR $L$oop_outer8xvl
3603
3604 jmp NEAR $L$done8xvl
3605
3606ALIGN 32
; Tail of the 8-way path: fewer than 512 bytes remain (rdx). The eight
; 64-byte keystream chunks are held as register pairs
; (ymm19,ymm0) (ymm5,ymm13) (ymm1,ymm9) (ymm2,ymm10)
; (ymm18,ymm6) (ymm7,ymm15) (ymm3,ymm11) (ymm4,ymm12).
; ymm8/ymm0 always hold the pair for the chunk that may turn out to be
; partial. rdi is turned into the offset (out - in), so [rsi*1+rdi]
; addresses the output while only rsi is advanced; r10 is zeroed for use
; as the byte index of the final partial chunk.
; Each step k compares rdx with 64*k: below -> partial chunk; otherwise
; XOR/store one full chunk, stop if that was exactly the end (je uses the
; flags of the cmp; the vector instructions in between do not modify
; flags), else stage the next pair in ymm8/ymm0 and advance rsi.
3607$L$tail8xvl:
3608 vmovdqa64 ymm8,ymm19
3609 xor r10,r10
3610 sub rdi,rsi
3611 cmp rdx,64*1
3612 jb NEAR $L$ess_than_64_8xvl
3613 vpxor ymm8,ymm8,YMMWORD[rsi]
3614 vpxor ymm0,ymm0,YMMWORD[32+rsi]
3615 vmovdqu YMMWORD[rsi*1+rdi],ymm8
3616 vmovdqu YMMWORD[32+rsi*1+rdi],ymm0
3617 je NEAR $L$done8xvl
3618 vmovdqa ymm8,ymm5
3619 vmovdqa ymm0,ymm13
3620 lea rsi,[64+rsi]
3621
3622 cmp rdx,64*2
3623 jb NEAR $L$ess_than_64_8xvl
3624 vpxor ymm5,ymm5,YMMWORD[rsi]
3625 vpxor ymm13,ymm13,YMMWORD[32+rsi]
3626 vmovdqu YMMWORD[rsi*1+rdi],ymm5
3627 vmovdqu YMMWORD[32+rsi*1+rdi],ymm13
3628 je NEAR $L$done8xvl
3629 vmovdqa ymm8,ymm1
3630 vmovdqa ymm0,ymm9
3631 lea rsi,[64+rsi]
3632
3633 cmp rdx,64*3
3634 jb NEAR $L$ess_than_64_8xvl
3635 vpxor ymm1,ymm1,YMMWORD[rsi]
3636 vpxor ymm9,ymm9,YMMWORD[32+rsi]
3637 vmovdqu YMMWORD[rsi*1+rdi],ymm1
3638 vmovdqu YMMWORD[32+rsi*1+rdi],ymm9
3639 je NEAR $L$done8xvl
3640 vmovdqa ymm8,ymm2
3641 vmovdqa ymm0,ymm10
3642 lea rsi,[64+rsi]
3643
3644 cmp rdx,64*4
3645 jb NEAR $L$ess_than_64_8xvl
3646 vpxor ymm2,ymm2,YMMWORD[rsi]
3647 vpxor ymm10,ymm10,YMMWORD[32+rsi]
3648 vmovdqu YMMWORD[rsi*1+rdi],ymm2
3649 vmovdqu YMMWORD[32+rsi*1+rdi],ymm10
3650 je NEAR $L$done8xvl
3651 vmovdqa32 ymm8,ymm18
3652 vmovdqa ymm0,ymm6
3653 lea rsi,[64+rsi]
3654
3655 cmp rdx,64*5
3656 jb NEAR $L$ess_than_64_8xvl
3657 vpxord ymm18,ymm18,YMMWORD[rsi]
3658 vpxor ymm6,ymm6,YMMWORD[32+rsi]
3659 vmovdqu32 YMMWORD[rsi*1+rdi],ymm18
3660 vmovdqu YMMWORD[32+rsi*1+rdi],ymm6
3661 je NEAR $L$done8xvl
3662 vmovdqa ymm8,ymm7
3663 vmovdqa ymm0,ymm15
3664 lea rsi,[64+rsi]
3665
3666 cmp rdx,64*6
3667 jb NEAR $L$ess_than_64_8xvl
3668 vpxor ymm7,ymm7,YMMWORD[rsi]
3669 vpxor ymm15,ymm15,YMMWORD[32+rsi]
3670 vmovdqu YMMWORD[rsi*1+rdi],ymm7
3671 vmovdqu YMMWORD[32+rsi*1+rdi],ymm15
3672 je NEAR $L$done8xvl
3673 vmovdqa ymm8,ymm3
3674 vmovdqa ymm0,ymm11
3675 lea rsi,[64+rsi]
3676
3677 cmp rdx,64*7
3678 jb NEAR $L$ess_than_64_8xvl
3679 vpxor ymm3,ymm3,YMMWORD[rsi]
3680 vpxor ymm11,ymm11,YMMWORD[32+rsi]
3681 vmovdqu YMMWORD[rsi*1+rdi],ymm3
3682 vmovdqu YMMWORD[32+rsi*1+rdi],ymm11
3683 je NEAR $L$done8xvl
3684 vmovdqa ymm8,ymm4
3685 vmovdqa ymm0,ymm12
3686 lea rsi,[64+rsi]
3687
; Partial chunk: spill its keystream pair to the aligned stack scratch,
; turn rdi back into an absolute output pointer, and reduce rdx to the
; number of bytes in the partial chunk.
3688$L$ess_than_64_8xvl:
3689 vmovdqa YMMWORD[rsp],ymm8
3690 vmovdqa YMMWORD[32+rsp],ymm0
3691 lea rdi,[rsi*1+rdi]
3692 and rdx,63
3693
; Byte-wise XOR of the last rdx bytes; r10 is the running index.
3694$L$oop_tail8xvl:
3695 movzx eax,BYTE[r10*1+rsi]
3696 movzx ecx,BYTE[r10*1+rsp]
3697 lea r10,[1+r10]
3698 xor eax,ecx
3699 mov BYTE[((-1))+r10*1+rdi],al
3700 dec rdx
3701 jnz NEAR $L$oop_tail8xvl
3702
; Zero the 64-byte stack copy of the keystream.
3703 vpxor ymm8,ymm8,ymm8
3704 vmovdqa YMMWORD[rsp],ymm8
3705 vmovdqa YMMWORD[32+rsp],ymm8
3706
3707$L$done8xvl:
; Clear the vector registers, restore the Win64 callee-saved xmm6-xmm15
; from the save area below r9, and restore rsp from r9.
3708 vzeroall
3709 movaps xmm6,XMMWORD[((-168))+r9]
3710 movaps xmm7,XMMWORD[((-152))+r9]
3711 movaps xmm8,XMMWORD[((-136))+r9]
3712 movaps xmm9,XMMWORD[((-120))+r9]
3713 movaps xmm10,XMMWORD[((-104))+r9]
3714 movaps xmm11,XMMWORD[((-88))+r9]
3715 movaps xmm12,XMMWORD[((-72))+r9]
3716 movaps xmm13,XMMWORD[((-56))+r9]
3717 movaps xmm14,XMMWORD[((-40))+r9]
3718 movaps xmm15,XMMWORD[((-24))+r9]
3719 lea rsp,[r9]
3720
3721$L$8xvl_epilogue:
3722 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
3723 mov rsi,QWORD[16+rsp]
3724 DB 0F3h,0C3h ;repret
3725
3726$L$SEH_end_ChaCha20_8xvl:
3727EXTERN __imp_RtlVirtualUnwind
3728
3729ALIGN 16
3730se_handler:
; se_handler -- exception handler referenced by the unwind info of
; ChaCha20_ctr32 ($L$SEH_info_ChaCha20_ctr32).
;
; Purpose: rebuild the non-volatile integer registers and the stack pointer
; that ChaCha20_ctr32 had on entry, write them into the context record, then
; hand over to RtlVirtualUnwind and return 1 in eax.
;
; In:   r8 = context record, r9 = dispatcher record (3rd/4th Microsoft x64
;       arguments). NOTE(review): the field names used in the comments below
;       (context->Rax, disp->ImageBase, ...) follow the Windows x64 CONTEXT
;       and DISPATCHER_CONTEXT layouts from winnt.h -- confirm the offsets
;       against the SDK headers before relying on them.
; Out:  eax = 1 (presumably ExceptionContinueSearch -- confirm).
; Saves/restores: rsi, rdi, rbx, rbp, r12-r15 and rflags.
; Stack: 9 qword pushes + 64 bytes of locals; the 64 bytes hold the 32-byte
;       shadow area plus stack arguments 5..8 of the RtlVirtualUnwind call.
;
; The label $L$common_seh_tail inside this routine is also the jump target
; of simd_handler, which uses the same prologue.
3731 push rsi
3732 push rdi
3733 push rbx
3734 push rbp
3735 push r12
3736 push r13
3737 push r14
3738 push r15
3739 pushfq
3740 sub rsp,64
3741
3742 mov rax,QWORD[120+r8] ; context->Rax (ChaCha20_ctr32 does "mov rax,rsp" in its prologue)
3743 mov rbx,QWORD[248+r8] ; context->Rip
3744
3745 mov rsi,QWORD[8+r9] ; disp->ImageBase (loaded but not used on this path)
3746 mov r11,QWORD[56+r9] ; disp->HandlerData (loaded but not used on this path)
3747
3748 lea r10,[$L$ctr32_body]
3749 cmp rbx,r10 ; faulting rip before $L$ctr32_body?
3750 jb NEAR $L$common_seh_tail ; yes: frame not fully built, use rax as the frame base
3751
3752 mov rax,QWORD[152+r8] ; context->Rsp
3753
3754 lea r10,[$L$no_data]
3755 cmp rbx,r10 ; faulting rip at or after $L$no_data?
3756 jae NEAR $L$common_seh_tail ; yes: presumably the frame is already torn down there -- rsp is the frame base
3757
; Inside the body: undo "sub rsp,64+24" and the six pushes (48 bytes) done
; by the ChaCha20_ctr32 prologue so that rax points at the entry-time rsp.
3758 lea rax,[((64+24+48))+rax]
3759
; Reload the six pushed registers (push order: rbx, rbp, r12, r13, r14, r15).
3760 mov rbx,QWORD[((-8))+rax]
3761 mov rbp,QWORD[((-16))+rax]
3762 mov r12,QWORD[((-24))+rax]
3763 mov r13,QWORD[((-32))+rax]
3764 mov r14,QWORD[((-40))+rax]
3765 mov r15,QWORD[((-48))+rax]
3766 mov QWORD[144+r8],rbx ; context->Rbx
3767 mov QWORD[160+r8],rbp ; context->Rbp
3768 mov QWORD[216+r8],r12 ; context->R12
3769 mov QWORD[224+r8],r13 ; context->R13
3770 mov QWORD[232+r8],r14 ; context->R14
3771 mov QWORD[240+r8],r15 ; context->R15
3772
; Shared tail (also entered from simd_handler). Expects rax = entry-time rsp
; of the interrupted function, r8 = context record, r9 = dispatcher record.
3773$L$common_seh_tail:
3774 mov rdi,QWORD[8+rax] ; rdi saved by the ";WIN64 prologue" store to [8+rsp]
3775 mov rsi,QWORD[16+rax] ; rsi saved by the ";WIN64 prologue" store to [16+rsp]
3776 mov QWORD[152+r8],rax ; context->Rsp
3777 mov QWORD[168+r8],rsi ; context->Rsi
3778 mov QWORD[176+r8],rdi ; context->Rdi
3779
; Copy the patched context (154 qwords = 1232 bytes) from r8 to the record
; pointed to by disp->ContextRecord.
3780 mov rdi,QWORD[40+r9] ; destination: disp->ContextRecord
3781 mov rsi,r8 ; source: context record passed to the handler
3782 mov ecx,154 ; qword count
3783 DD 0xa548f3fc ; bytes FC F3 48 A5 = cld; rep movsq
3784
; Set up the eight arguments of RtlVirtualUnwind: four in registers, four on
; the stack above the 32-byte shadow area.
3785 mov rsi,r9
3786 xor rcx,rcx ; arg1 = 0
3787 mov rdx,QWORD[8+rsi] ; arg2 = disp->ImageBase
3788 mov r8,QWORD[rsi] ; arg3 = disp->ControlPc
3789 mov r9,QWORD[16+rsi] ; arg4 = disp->FunctionEntry
3790 mov r10,QWORD[40+rsi] ; disp->ContextRecord
3791 lea r11,[56+rsi] ; &disp->HandlerData
3792 lea r12,[24+rsi] ; &disp->EstablisherFrame
3793 mov QWORD[32+rsp],r10 ; arg5
3794 mov QWORD[40+rsp],r11 ; arg6
3795 mov QWORD[48+rsp],r12 ; arg7
3796 mov QWORD[56+rsp],rcx ; arg8 = 0 (NULL)
3797 call QWORD[__imp_RtlVirtualUnwind]
3798
3799 mov eax,1 ; return value (see "Out" above)
3800 add rsp,64
3801 popfq
3802 pop r15
3803 pop r14
3804 pop r13
3805 pop r12
3806 pop rbp
3807 pop rbx
3808 pop rdi
3809 pop rsi
3810 DB 0F3h,0C3h ;repret
3811
3812
3813
3814ALIGN 16
3815simd_handler:
; simd_handler -- exception handler referenced by the unwind info of the
; SIMD code paths (ssse3, 128, 4x, 4xop, 8x, avx512, avx512vl, 16x, 8xvl).
;
; Purpose: when the faulting rip lies between a routine's body label and its
; epilogue label, copy that routine's saved XMM registers from its stack
; frame into the context record; then continue in $L$common_seh_tail, which
; lives inside se_handler and performs the common restore / unwind / return.
;
; In:   r8 = context record, r9 = dispatcher record. NOTE(review): field
;       names in the comments below follow the Windows x64 CONTEXT and
;       DISPATCHER_CONTEXT layouts from winnt.h -- confirm the offsets.
; Handler data (three DWORDs read via [56+r9]):
;       [0] image-relative address of the body label
;       [4] image-relative address of the epilogue label
;       [8] size in bytes of the saved XMM area
; The prologue below is identical to se_handler's, which is what allows the
; final jump to share se_handler's epilogue.
3816 push rsi
3817 push rdi
3818 push rbx
3819 push rbp
3820 push r12
3821 push r13
3822 push r14
3823 push r15
3824 pushfq
3825 sub rsp,64
3826
3827 mov rax,QWORD[120+r8] ; context->Rax
3828 mov rbx,QWORD[248+r8] ; context->Rip
3829
3830 mov rsi,QWORD[8+r9] ; disp->ImageBase
3831 mov r11,QWORD[56+r9] ; disp->HandlerData
3832
3833 mov r10d,DWORD[r11] ; HandlerData[0]; 32-bit load zero-extends into r10
3834 lea r10,[r10*1+rsi] ; absolute address of the body label
3835 cmp rbx,r10 ; faulting rip before the body label?
3836 jb NEAR $L$common_seh_tail ; yes: nothing to restore, rax = context->Rax is the frame base
3837
3838 mov rax,QWORD[192+r8] ; context->R9 (the 8xvl epilogue restores xmm regs and rsp relative to r9)
3839
3840 mov r10d,DWORD[4+r11] ; HandlerData[1]
3841 mov ecx,DWORD[8+r11] ; HandlerData[2] = XMM save-area size in bytes
3842 lea r10,[r10*1+rsi] ; absolute address of the epilogue label
3843 cmp rbx,r10 ; faulting rip at or after the epilogue label?
3844 jae NEAR $L$common_seh_tail ; yes: skip the XMM copy, rax = context->R9 is the frame base
3845
; Copy the saved XMM registers: source = rax - size - 8, destination =
; context + 512 (presumably &context->Xmm6 -- confirm), length = size / 8
; qwords.
3846 neg rcx ; rcx = -size
3847 lea rsi,[((-8))+rcx*1+rax] ; rsi = rax - size - 8
3848 lea rdi,[512+r8]
3849 neg ecx ; back to +size; the 32-bit write also clears the upper half of rcx
3850 shr ecx,3 ; bytes -> qwords
3851 DD 0xa548f3fc ; bytes FC F3 48 A5 = cld; rep movsq
3852
3853 jmp NEAR $L$common_seh_tail ; shared tail located in se_handler
3854
3855
3856section .pdata rdata align=4
; Function table. Every group of three DDs describes one routine:
;   1. image-relative address of the routine's first instruction,
;   2. image-relative address just past the routine's last instruction,
;   3. image-relative address of the routine's unwind-info record, which is
;      defined in the .xdata section of this file.
; "wrt ..imagebase" makes NASM emit the address relative to the image base.
; NOTE(review): this three-DWORD shape matches the Win64 RUNTIME_FUNCTION
; record -- confirm against the Microsoft x64 exception-handling docs.
3857ALIGN 4
; ChaCha20_ctr32
3858 DD $L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase
3859 DD $L$SEH_end_ChaCha20_ctr32 wrt ..imagebase
3860 DD $L$SEH_info_ChaCha20_ctr32 wrt ..imagebase
3861
; ChaCha20_ssse3
3862 DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
3863 DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
3864 DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase
3865
; ChaCha20_128
3866 DD $L$SEH_begin_ChaCha20_128 wrt ..imagebase
3867 DD $L$SEH_end_ChaCha20_128 wrt ..imagebase
3868 DD $L$SEH_info_ChaCha20_128 wrt ..imagebase
3869
; ChaCha20_4x
3870 DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase
3871 DD $L$SEH_end_ChaCha20_4x wrt ..imagebase
3872 DD $L$SEH_info_ChaCha20_4x wrt ..imagebase
; ChaCha20_4xop
3873 DD $L$SEH_begin_ChaCha20_4xop wrt ..imagebase
3874 DD $L$SEH_end_ChaCha20_4xop wrt ..imagebase
3875 DD $L$SEH_info_ChaCha20_4xop wrt ..imagebase
; ChaCha20_8x
3876 DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase
3877 DD $L$SEH_end_ChaCha20_8x wrt ..imagebase
3878 DD $L$SEH_info_ChaCha20_8x wrt ..imagebase
; ChaCha20_avx512
3879 DD $L$SEH_begin_ChaCha20_avx512 wrt ..imagebase
3880 DD $L$SEH_end_ChaCha20_avx512 wrt ..imagebase
3881 DD $L$SEH_info_ChaCha20_avx512 wrt ..imagebase
3882
; ChaCha20_avx512vl
3883 DD $L$SEH_begin_ChaCha20_avx512vl wrt ..imagebase
3884 DD $L$SEH_end_ChaCha20_avx512vl wrt ..imagebase
3885 DD $L$SEH_info_ChaCha20_avx512vl wrt ..imagebase
3886
; ChaCha20_16x
3887 DD $L$SEH_begin_ChaCha20_16x wrt ..imagebase
3888 DD $L$SEH_end_ChaCha20_16x wrt ..imagebase
3889 DD $L$SEH_info_ChaCha20_16x wrt ..imagebase
3890
; ChaCha20_8xvl
3891 DD $L$SEH_begin_ChaCha20_8xvl wrt ..imagebase
3892 DD $L$SEH_end_ChaCha20_8xvl wrt ..imagebase
3893 DD $L$SEH_info_ChaCha20_8xvl wrt ..imagebase
3894section .xdata rdata align=8
; Unwind-info records referenced from the .pdata function table.
;
; Every record starts with the four bytes 9,0,0,0 followed by the
; image-relative address of its handler routine.
; NOTE(review): under the Win64 UNWIND_INFO layout the first byte 9 decodes
; as version 1 with the "has exception handler" flag set, and the three zero
; bytes as prologue size 0, zero unwind codes and no frame register --
; confirm against the Microsoft x64 exception-handling docs.
;
; Records that name simd_handler carry four more DWORDs of handler data.
; simd_handler reads the first three of them:
;   +0  image-relative address of the routine's body label
;   +4  image-relative address of the routine's epilogue label
;   +8  size in bytes of the routine's saved XMM area
; The fourth DWORD is 0 and is not read by simd_handler.
3895ALIGN 8
3896$L$SEH_info_ChaCha20_ctr32:
3897DB 9,0,0,0
3898 DD se_handler wrt ..imagebase ; no handler data: se_handler hard-codes its labels
3899
3900$L$SEH_info_ChaCha20_ssse3:
3901DB 9,0,0,0
3902 DD simd_handler wrt ..imagebase
3903 DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
3904 DD 0x20,0 ; 0x20 = 32 bytes of saved XMM state
3905
3906$L$SEH_info_ChaCha20_128:
3907DB 9,0,0,0
3908 DD simd_handler wrt ..imagebase
3909 DD $L$128_body wrt ..imagebase,$L$128_epilogue wrt ..imagebase
3910 DD 0x60,0 ; 0x60 = 96 bytes of saved XMM state
3911
3912$L$SEH_info_ChaCha20_4x:
3913DB 9,0,0,0
3914 DD simd_handler wrt ..imagebase
3915 DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
3916 DD 0xa0,0 ; 0xa0 = 160 bytes of saved XMM state
3917$L$SEH_info_ChaCha20_4xop:
3918DB 9,0,0,0
3919 DD simd_handler wrt ..imagebase
3920 DD $L$4xop_body wrt ..imagebase,$L$4xop_epilogue wrt ..imagebase
3921 DD 0xa0,0 ; 0xa0 = 160 bytes of saved XMM state
3922$L$SEH_info_ChaCha20_8x:
3923DB 9,0,0,0
3924 DD simd_handler wrt ..imagebase
3925 DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
3926 DD 0xa0,0 ; 0xa0 = 160 bytes of saved XMM state
3927$L$SEH_info_ChaCha20_avx512:
3928DB 9,0,0,0
3929 DD simd_handler wrt ..imagebase
3930 DD $L$avx512_body wrt ..imagebase,$L$avx512_epilogue wrt ..imagebase
3931 DD 0x20,0 ; 0x20 = 32 bytes of saved XMM state
3932
3933$L$SEH_info_ChaCha20_avx512vl:
3934DB 9,0,0,0
3935 DD simd_handler wrt ..imagebase
3936 DD $L$avx512vl_body wrt ..imagebase,$L$avx512vl_epilogue wrt ..imagebase
3937 DD 0x20,0 ; 0x20 = 32 bytes of saved XMM state
3938
3939$L$SEH_info_ChaCha20_16x:
3940DB 9,0,0,0
3941 DD simd_handler wrt ..imagebase
3942 DD $L$16x_body wrt ..imagebase,$L$16x_epilogue wrt ..imagebase
3943 DD 0xa0,0 ; 0xa0 = 160 bytes of saved XMM state
3944
3945$L$SEH_info_ChaCha20_8xvl:
3946DB 9,0,0,0
3947 DD simd_handler wrt ..imagebase
3948 DD $L$8xvl_body wrt ..imagebase,$L$8xvl_epilogue wrt ..imagebase
3949 DD 0xa0,0 ; 0xa0 = 160 bytes: the ten movaps restores (xmm6..xmm15) before $L$8xvl_epilogue
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette