VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1i/crypto/genasm-elf/chacha-x86_64.S@ 87083

Last change on this file since 87083 was 83531, checked in by vboxsync, 5 years ago

setting svn:sync-process=export for openssl-1.1.1f, all files except tests

File size: 21.7 KB
Line 
/*
 * Constant pool shared by the ChaCha20 routines in this file.
 * Syntax: GNU as, AT&T operand order, x86-64 ELF.
 * The tables live in .text and are addressed RIP-relatively.  Every table
 * that is loaded with movdqa sits at a multiple of 16 bytes from one of the
 * two 64-byte alignment points below.
 */
.text



.align	64
/* Four zero dwords.  Not referenced by the routines in this listing. */
.Lzero:
.long	0,0,0,0
/* Added (paddd) to a counter block to step dword 0, the block counter, by 1. */
.Lone:
.long	1,0,0,0
/* Per-lane offsets 0..3 added to the broadcast block counter in ChaCha20_4x. */
.Linc:
.long	0,1,2,3
/* Counter stride for every further 256-byte pass of ChaCha20_4x. */
.Lfour:
.long	4,4,4,4
/* Eight-dword tables.  Not referenced by the routines in this listing. */
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
/*
 * pshufb control masks.  Destination byte i takes source byte mask[i], so
 * .Lrot16 rotates every dword by 16 bits and .Lrot24 rotates every dword
 * right by 24 bits, i.e. left by 8.
 */
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
/* Not referenced by the routines in this listing. */
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
/* The next four tables are not referenced by the routines in this listing. */
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
/*
 * ASCII "expand 32-byte k" followed by a NUL.  The first 16 bytes are loaded
 * as state words 0..3 by the SIMD paths; the scalar path uses the same four
 * dwords as immediates.
 */
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
/* ASCII "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated. */
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
/*
 * ChaCha20_ctr32 -- public entry point and scalar (integer-register) path.
 *
 * ABI: System V AMD64 (ELF).  Register arguments as used by the code:
 *   rdi = output buffer            (written, len bytes)
 *   rsi = input buffer             (read, len bytes)
 *   rdx = len in bytes             (0 returns immediately)
 *   rcx = pointer to 32 key bytes  (state words 4..11)
 *   r8  = pointer to 16 bytes      (state words 12..15; dword 0 is the
 *                                   32-bit block counter)
 * NOTE(review): this corresponds to the OpenSSL C prototype
 *   void ChaCha20_ctr32(out, inp, len, key[8], counter[4]);
 * which is declared outside this file -- confirm against the header.
 *
 * Output: out[i] = inp[i] ^ keystream[i].  The caller's counter block at
 * (r8) is never written; the running counter is kept in xmm3 and on the stack.
 *
 * Dispatch: if bit 9 (0x200) of the dword at OPENSSL_ia32cap_P+4 is set,
 * control jumps to .LChaCha20_ssse3 with r10 still holding the 64 bits
 * loaded from OPENSSL_ia32cap_P+4 (ChaCha20_4x reads it again).
 *
 * Stack frame after the prologue (rsp is 16-byte aligned given the ABI
 * entry alignment: 6 pushes + 88 bytes):
 *    0..15   scratch; with 16..63 it receives the keystream block in .Ltail
 *   16..31   key bytes 0..15   (state words 4..7)
 *   32..47   key bytes 16..31  (state words 8..11); doubles as the spill
 *            area for the two of words 8..11 that are not in esi/edi
 *   48..63   counter block     (state words 12..15)
 *   64..87   saved len / inp / out while rbp, rsi, rdi are used as state
 *
 * Register roles inside .Loop:
 *   eax ebx ecx edx      state words 0..3
 *   r8d r9d r10d r11d    state words 4..7
 *   esi edi              two of state words 8..11 (see spill area)
 *   r12d r13d r14d r15d  state words 12..15
 *   ebp                  double-round counter (10 iterations = 20 rounds)
 */
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	.LChaCha20_ssse3

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:


	/* xmm1/xmm2 = key halves, xmm3 = counter block, xmm4 = {1,0,0,0}. */
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp			/* rbp = bytes remaining */
	jmp	.Loop_outer

.align	32
.Loop_outer:
	/* Build the working state for one 64-byte block. */
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d			/* current block counter */
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	/* Park len/inp/out so rbp, rsi and rdi can carry state. */
	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214			/* movq %xmm2,%rsi : words 8,9 */
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi			/* esi = word 8, edi = word 9 */
	jmp	.Loop

.align	32
.Loop:
	/* Column quarter-rounds on (0,4,8,12) and (1,5,9,13). */
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	/* Swap words 8,9 out and words 10,11 in. */
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	/* Column quarter-rounds on (2,6,10,14) and (3,7,11,15). */
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	/* Diagonal quarter-rounds on (0,5,10,15) and (1,6,11,12). */
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	/* Swap words 10,11 out and words 8,9 back in. */
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	/* Diagonal quarter-rounds on (2,7,8,13) and (3,4,9,14). */
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	/*
	 * Flush words 8,9 so 32..47(%rsp) holds working words 8..11, restore
	 * len/inp/out, and advance the counter kept in xmm3.  The copy at
	 * 48(%rsp) still holds the counter of the block just computed.
	 */
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	/* Feed-forward: add the input state to the permuted state. */
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1			/* xmm1 = keystream words 8..11 */

	cmpq	$64,%rbp
	jb	.Ltail

	/* Full block: XOR 64 input bytes with the keystream. */
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	/* Re-arm the frame: key words 8..11 and the incremented counter. */
	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	/*
	 * Fewer than 64 bytes remain: lay the keystream block out at
	 * 0..63(%rsp) and XOR it into the data one byte at a time.
	 */
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx			/* rbx = byte index */
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx			/* lea keeps flags untouched */
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	/* rsi = rsp on entry; reload callee-saved registers from below it. */
	leaq	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
/*
 * ChaCha20_ssse3 -- one 64-byte block per pass, whole state in xmm0..xmm3.
 *
 * Not exported; reached by a jump from ChaCha20_ctr32 (.LChaCha20_ssse3) and
 * from ChaCha20_4x (.Ldo_sse3_after_all), so it sees the same arguments:
 *   rdi = out, rsi = inp, rdx = len (non-zero), rcx = 32 key bytes,
 *   r8 = 16-byte counter block.
 * r10 is left untouched here because ChaCha20_4x reads it after the
 * length dispatch below.
 *
 * Dispatch: len == 128 goes to ChaCha20_128, len > 128 to ChaCha20_4x,
 * anything shorter is handled here.
 *
 * Registers:
 *   xmm0..xmm3  state rows: constants, key low, key high, counter block
 *   xmm4, xmm5  temporaries
 *   xmm6, xmm7  pshufb masks .Lrot16 / .Lrot24
 *   r8          double-round counter (10)
 *   r9          rsp on entry, restored before returning
 * Stack: 72 bytes; 0..63(%rsp) holds the input state (counter at 48) and is
 * reused for the keystream block in the tail path.  rsp is 16-byte aligned
 * after the subtraction given the ABI entry alignment.
 */
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	cmpq	$128,%rdx
	je	.LChaCha20_128
	ja	.LChaCha20_4x

.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	/* Next block: reload the input state and bump the stored counter. */
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
	/* Column round: four quarter-rounds, one per dword lane. */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 : rotate 16 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1			/* rotate left 12 */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 : rotate left 8 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1			/* rotate left 7 */
	/* Rotate the lanes of rows 1..3 so diagonals line up as columns. */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	/* Diagonal round. */
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	/* Undo the lane rotation. */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	/* Feed-forward: add the input state. */
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	/* Full block: XOR 64 input bytes with the keystream. */
	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	/* Partial block: spill the keystream and XOR byte by byte. */
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8				/* r8 = byte index */

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
/*
 * ChaCha20_128 -- exactly 128 bytes: two blocks computed in an interleaved
 * fashion, then XORed with the input in one shot.
 *
 * Not exported; reached only through .LChaCha20_128 from ChaCha20_ssse3 when
 * len == 128.  Arguments as received there:
 *   rdi = out, rsi = inp, rcx = 32 key bytes, r8 = 16-byte counter block.
 * rdx is not consulted; 128 bytes are always read and written.
 *
 * Registers:
 *   block 0 rows: xmm8, xmm9,  xmm2, xmm3
 *   block 1 rows: xmm10, xmm11, xmm0, xmm1   (counter = block 0 counter + 1)
 *   xmm4, xmm5    temporaries
 *   xmm6, xmm7    pshufb masks .Lrot16 / .Lrot24
 *   r8            double-round counter (10)
 *   r9            rsp on entry, restored before returning
 * Stack: 72 bytes; 0..63(%rsp) holds the input state of block 0.
 */
.type	ChaCha20_128,@function
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm8
	movdqu	(%rcx),%xmm9
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm1
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	/* Duplicate the state for block 1 and save block 0 input state. */
	movdqa	%xmm8,%xmm10
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,%xmm11
	movdqa	%xmm9,16(%rsp)
	movdqa	%xmm2,%xmm0
	movdqa	%xmm2,32(%rsp)
	paddd	%xmm3,%xmm1			/* block 1 counter row */
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_128

.align	32
.Loop_128:
	/* Column round on both blocks. */
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	/* Rotate lanes so diagonals line up as columns. */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm9,%xmm9
	pshufd	$147,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$57,%xmm11,%xmm11
	pshufd	$147,%xmm1,%xmm1
	/* Diagonal round on both blocks. */
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	/* Undo the lane rotation. */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm9,%xmm9
	pshufd	$57,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$147,%xmm11,%xmm11
	pshufd	$57,%xmm1,%xmm1
	decq	%r8
	jnz	.Loop_128
	/*
	 * Feed-forward.  Block 1 gets .Lone added on top of the saved
	 * counter row because only the block 0 input state is on the stack.
	 */
	paddd	0(%rsp),%xmm8
	paddd	16(%rsp),%xmm9
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	paddd	.Lone(%rip),%xmm1
	paddd	0(%rsp),%xmm10
	paddd	16(%rsp),%xmm11
	paddd	32(%rsp),%xmm0
	paddd	48(%rsp),%xmm1

	/* XOR 128 input bytes with the two keystream blocks. */
	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm8
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm9
	movdqu	48(%rsi),%xmm5
	pxor	%xmm4,%xmm2
	movdqu	64(%rsi),%xmm4
	pxor	%xmm5,%xmm3
	movdqu	80(%rsi),%xmm5
	pxor	%xmm4,%xmm10
	movdqu	96(%rsi),%xmm4
	pxor	%xmm5,%xmm11
	movdqu	112(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm1

	movdqu	%xmm8,0(%rdi)
	movdqu	%xmm9,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	movdqu	%xmm10,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm0,96(%rdi)
	movdqu	%xmm1,112(%rdi)
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
/*
 * ChaCha20_4x -- four blocks (256 bytes) per pass.  Each xmm register holds
 * one state word for four consecutive blocks, one block per dword lane.
 *
 * Not exported; reached only through .LChaCha20_4x from ChaCha20_ssse3 when
 * len > 128.  Arguments as received there:
 *   rdi = out, rsi = inp, rdx = len, rcx = 32 key bytes,
 *   r8 = 16-byte counter block,
 *   r10 = the 64 bits loaded from OPENSSL_ia32cap_P+4 by ChaCha20_ctr32.
 *
 * Dispatch: for len <= 192, if bit 22 is set and bit 26 is clear in r10
 * (mask 0x04400000 compared with 0x00400000) control goes back to the
 * one-block path at .Ldo_sse3_after_all.
 * NOTE(review): the meaning of those two capability bits is defined outside
 * this file -- confirm against the OPENSSL_ia32cap documentation.
 *
 * Stack frame (0x148 bytes, rsp 16-byte aligned given the ABI entry
 * alignment); rcx is repointed to rsp+256 so offsets are written N-256(%rcx):
 *     0..63    scratch: spill slots for state words 8..11 inside the loop,
 *              then row 0 of the four transposed output blocks
 *    64..127   constant words 0..3, each broadcast to four lanes
 *   128..255   key words (state 4..11), each broadcast
 *   256..319   state words 12..15, each broadcast; word 12 carries the
 *              per-lane block counters
 *
 * Registers inside .Loop4x:
 *   xmm8..xmm11   state words 0..3
 *   xmm12..xmm15  state words 4..7
 *   xmm4, xmm5    two of state words 8..11 (the others are spilled)
 *   xmm0..xmm3    state words 12..15
 *   xmm6, xmm7    temporaries, reloaded with the pshufb masks as needed
 *   r10, r11      addresses of .Lrot16 and .Lrot24
 *   eax           double-round counter (10)
 *   r9            rsp on entry, restored before returning
 */
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	movq	%r10,%r11
	cmpq	$192,%rdx
	ja	.Lproceed4x

	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	.Ldo_sse3_after_all

.Lproceed4x:
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	/* Broadcast constant words 0..3. */
	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	/* Broadcast key words 0..3 (state words 4..7). */
	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	/* Broadcast key words 4..7 (state words 8..11). */
	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	/*
	 * Broadcast state words 12..15; lanes of word 12 become
	 * counter+0..3.  Word 12 is stored at .Loop_enter4x.
	 */
	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	/* Reload the input state and advance all four counters by 4. */
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	/* Spill words 10,11; xmm7 = .Lrot16 mask; save the counters. */
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

.align	32
.Loop4x:
	/* Column quarter-rounds on (0,4,8,12) and (1,5,9,13). */
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199				/* pshufb %xmm7,%xmm0 */
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 */
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6			/* xmm6 = .Lrot24 mask */
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198				/* pshufb %xmm6,%xmm0 */
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 */
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7			/* xmm7 = .Lrot16 mask */
	por	%xmm6,%xmm13
	/* Swap words 8,9 out and words 10,11 in. */
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	/* Column quarter-rounds on (2,6,10,14) and (3,7,11,15). */
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215				/* pshufb %xmm7,%xmm2 */
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214				/* pshufb %xmm6,%xmm2 */
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	/* Diagonal quarter-rounds on (0,5,10,15) and (1,6,11,12). */
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
.byte	102,15,56,0,199				/* pshufb %xmm7,%xmm0 */
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
.byte	102,15,56,0,198				/* pshufb %xmm6,%xmm0 */
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	/* Swap words 10,11 out and words 8,9 back in. */
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	/* Diagonal quarter-rounds on (2,7,8,13) and (3,4,9,14). */
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 */
.byte	102,15,56,0,215				/* pshufb %xmm7,%xmm2 */
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 */
.byte	102,15,56,0,214				/* pshufb %xmm6,%xmm2 */
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	/*
	 * Feed-forward and 4x4 dword transpose, one group of four state
	 * words at a time.  After each transpose a register holds 16
	 * contiguous keystream bytes of a single block.
	 */
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	/* Words 0..3 -> xmm8, xmm9, xmm6, xmm11 for blocks 0..3. */
	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	/* Park row 0 of blocks 0,1; fetch spilled words 10,11. */
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	/* Words 4..7 -> xmm12, xmm13, xmm10, xmm15 for blocks 0..3. */
	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	/* Park row 0 of blocks 2,3. */
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	/* Words 8..11 -> xmm4, xmm5, xmm14, xmm9 for blocks 0..3. */
	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	/* Words 12..15 -> xmm0, xmm1, xmm8, xmm3 for blocks 0..3. */
	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	/* Full pass: XOR 256 input bytes.  Block 0. */
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	/* Block 1 (loads interleaved with the stores of block 0). */
	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	/* Block 2. */
	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	/* Block 3. */
	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	/*
	 * Fewer than 256 bytes remain.  Emit as many whole 64-byte blocks
	 * as fit, then place the next keystream block at 0..63(%rsp) and
	 * finish byte by byte in .Loop_tail4x.
	 */
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10			/* r10 = byte index */

	/* 0(%rsp) already holds row 0 of block 0. */
	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	/* Flags are still those of cmpq $64 above; SSE moves and pxor leave them alone. */
	je	.Ldone4x

	/* Stage block 1 as the partial block. */
	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	/* Flags are still those of cmpq $128 above. */
	je	.Ldone4x

	/* Stage block 2 as the partial block. */
	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	/* Flags are still those of cmpq $192 above (lea does not touch them). */
	je	.Ldone4x

	/* Stage block 3 as the partial block and fall into the byte loop. */
	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette