VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.0/crypto/genasm-nasm/rsaz-2k-avx512.S@ 100939

Last change on this file since 100939 was 100939, checked in by vboxsync, 18 months ago

openssl: adding missed files bugref:10418

File size: 18.7 KB
Line 
1
# ossl_rsaz_avx512ifma_eligible
# Takes no arguments. Loads the 32-bit word at OPENSSL_ia32cap_P+8 and
# returns in %eax the mask 2149777408 (0x80230000: bits 16, 17, 21, 31)
# when every one of those bits is set in that word, otherwise 0.
# NOTE(review): these are presumably the CPUID leaf 7 EBX flags
# AVX512F / AVX512DQ / AVX512IFMA / AVX512VL - confirm against the
# OPENSSL_ia32cap documentation.
# Clobbers %eax, %ecx and the flags.
2.globl ossl_rsaz_avx512ifma_eligible
3.type ossl_rsaz_avx512ifma_eligible,@function
4.align 32
5ossl_rsaz_avx512ifma_eligible:
6 movl OPENSSL_ia32cap_P+8(%rip),%ecx
# Default return value is 0.
7 xorl %eax,%eax
# Keep only the required bits, then check that all of them survived.
8 andl $2149777408,%ecx
9 cmpl $2149777408,%ecx
10 cmovel %ecx,%eax
# The bytes 0xf3,0xc3 are the two-byte encoding of rep ret.
11 .byte 0xf3,0xc3
12.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
13.text
14
# ossl_rsaz_amm52x20_x1_ifma256
# Register inputs, as used by the code below:
#   %rdi  destination; 160 bytes (five 32-byte vectors) are stored
#   %rsi  first operand; 20 quadwords, read as five 32-byte vectors
#   %rdx  second operand; 20 quadwords, read one per step (copied to %r11)
#   %rcx  third operand; 20 quadwords, read as five 32-byte vectors
#   %r8   64-bit scalar multiplied with the running low word in every step
# NOTE(review): going by the name this is presumably an almost Montgomery
# multiplication on 20 digits of 52 bits, with %rsi = a, %rdx = b,
# %rcx = modulus and %r8 = k0 - confirm against the C prototype.
# The accumulator is %ymm3,%ymm16..%ymm19 (20 lanes) plus %r9, which
# carries the scalar copy of lane 0 between steps.
# %rbx, %rbp, %r12..%r15 are pushed on entry and reloaded before return
# (%rbp is saved but not otherwise used here).
15.globl ossl_rsaz_amm52x20_x1_ifma256
16.type ossl_rsaz_amm52x20_x1_ifma256,@function
17.align 32
18ossl_rsaz_amm52x20_x1_ifma256:
19.cfi_startproc
# The bytes 243,15,30,250 (f3 0f 1e fa) encode endbr64.
20.byte 243,15,30,250
21 pushq %rbx
22.cfi_adjust_cfa_offset 8
23.cfi_offset %rbx,-16
24 pushq %rbp
25.cfi_adjust_cfa_offset 8
26.cfi_offset %rbp,-24
27 pushq %r12
28.cfi_adjust_cfa_offset 8
29.cfi_offset %r12,-32
30 pushq %r13
31.cfi_adjust_cfa_offset 8
32.cfi_offset %r13,-40
33 pushq %r14
34.cfi_adjust_cfa_offset 8
35.cfi_offset %r14,-48
36 pushq %r15
37.cfi_adjust_cfa_offset 8
38.cfi_offset %r15,-56
39.Lossl_rsaz_amm52x20_x1_ifma256_body:
40
41
# Zero the 20-lane accumulator; %ymm0 stays zero for the whole loop and
# is the fill value for the top lane of the per-step shift.
42 vpxord %ymm0,%ymm0,%ymm0
43 vmovdqa64 %ymm0,%ymm3
44 vmovdqa64 %ymm0,%ymm16
45 vmovdqa64 %ymm0,%ymm17
46 vmovdqa64 %ymm0,%ymm18
47 vmovdqa64 %ymm0,%ymm19
48
# Scalar accumulator for lane 0.
49 xorl %r9d,%r9d
50
# %rdx is needed as the implicit mulx source, so the second operand
# pointer moves to %r11. %rax holds the 52-bit mask 2^52-1.
51 movq %rdx,%r11
52 movq $0xfffffffffffff,%rax
53
54
# Five passes of four unrolled steps = 20 quadwords of the second operand.
55 movl $5,%ebx
56
57.align 32
58.Lloop5:
# ---- step for the quadword at 0(%r11) ----
59 movq 0(%r11),%r13
60
61 vpbroadcastq %r13,%ymm1
62 movq 0(%rsi),%rdx
# mulx: %rdx * %r13, low half to %r13, high half to %r12.
63 mulxq %r13,%r13,%r12
64 addq %r13,%r9
65 movq %r12,%r10
66 adcq $0,%r10
67
# %r13 = (%r8 * %r9) reduced to its low 52 bits.
68 movq %r8,%r13
69 imulq %r9,%r13
70 andq %rax,%r13
71
# Add the full product of that factor with 0(%rcx) into %r10:%r9.
72 vpbroadcastq %r13,%ymm2
73 movq 0(%rcx),%rdx
74 mulxq %r13,%r13,%r12
75 addq %r13,%r9
76 adcq %r12,%r10
77
# %r9 = low 64 bits of (%r10:%r9) >> 52.
78 shrq $52,%r9
79 salq $12,%r10
80 orq %r10,%r9
81
# Accumulate the low 52-bit halves of all 20 lane products, first for
# the broadcast quadword in %ymm1, then for the factor in %ymm2.
82 vpmadd52luq 0(%rsi),%ymm1,%ymm3
83 vpmadd52luq 32(%rsi),%ymm1,%ymm16
84 vpmadd52luq 64(%rsi),%ymm1,%ymm17
85 vpmadd52luq 96(%rsi),%ymm1,%ymm18
86 vpmadd52luq 128(%rsi),%ymm1,%ymm19
87
88 vpmadd52luq 0(%rcx),%ymm2,%ymm3
89 vpmadd52luq 32(%rcx),%ymm2,%ymm16
90 vpmadd52luq 64(%rcx),%ymm2,%ymm17
91 vpmadd52luq 96(%rcx),%ymm2,%ymm18
92 vpmadd52luq 128(%rcx),%ymm2,%ymm19
93
94
# Shift the whole 20-lane accumulator down by one quadword; the zero
# register %ymm0 fills the top lane.
95 valignq $1,%ymm3,%ymm16,%ymm3
96 valignq $1,%ymm16,%ymm17,%ymm16
97 valignq $1,%ymm17,%ymm18,%ymm17
98 valignq $1,%ymm18,%ymm19,%ymm18
99 valignq $1,%ymm19,%ymm0,%ymm19
100
# Fold the new lane 0 into the scalar accumulator.
101 vmovq %xmm3,%r13
102 addq %r13,%r9
103
# High 52-bit halves are added after the shift, so each lands one lane
# above the low half of the same product.
104 vpmadd52huq 0(%rsi),%ymm1,%ymm3
105 vpmadd52huq 32(%rsi),%ymm1,%ymm16
106 vpmadd52huq 64(%rsi),%ymm1,%ymm17
107 vpmadd52huq 96(%rsi),%ymm1,%ymm18
108 vpmadd52huq 128(%rsi),%ymm1,%ymm19
109
110 vpmadd52huq 0(%rcx),%ymm2,%ymm3
111 vpmadd52huq 32(%rcx),%ymm2,%ymm16
112 vpmadd52huq 64(%rcx),%ymm2,%ymm17
113 vpmadd52huq 96(%rcx),%ymm2,%ymm18
114 vpmadd52huq 128(%rcx),%ymm2,%ymm19
# ---- same step for the quadword at 8(%r11) ----
115 movq 8(%r11),%r13
116
117 vpbroadcastq %r13,%ymm1
118 movq 0(%rsi),%rdx
119 mulxq %r13,%r13,%r12
120 addq %r13,%r9
121 movq %r12,%r10
122 adcq $0,%r10
123
124 movq %r8,%r13
125 imulq %r9,%r13
126 andq %rax,%r13
127
128 vpbroadcastq %r13,%ymm2
129 movq 0(%rcx),%rdx
130 mulxq %r13,%r13,%r12
131 addq %r13,%r9
132 adcq %r12,%r10
133
134 shrq $52,%r9
135 salq $12,%r10
136 orq %r10,%r9
137
138 vpmadd52luq 0(%rsi),%ymm1,%ymm3
139 vpmadd52luq 32(%rsi),%ymm1,%ymm16
140 vpmadd52luq 64(%rsi),%ymm1,%ymm17
141 vpmadd52luq 96(%rsi),%ymm1,%ymm18
142 vpmadd52luq 128(%rsi),%ymm1,%ymm19
143
144 vpmadd52luq 0(%rcx),%ymm2,%ymm3
145 vpmadd52luq 32(%rcx),%ymm2,%ymm16
146 vpmadd52luq 64(%rcx),%ymm2,%ymm17
147 vpmadd52luq 96(%rcx),%ymm2,%ymm18
148 vpmadd52luq 128(%rcx),%ymm2,%ymm19
149
150
151 valignq $1,%ymm3,%ymm16,%ymm3
152 valignq $1,%ymm16,%ymm17,%ymm16
153 valignq $1,%ymm17,%ymm18,%ymm17
154 valignq $1,%ymm18,%ymm19,%ymm18
155 valignq $1,%ymm19,%ymm0,%ymm19
156
157 vmovq %xmm3,%r13
158 addq %r13,%r9
159
160 vpmadd52huq 0(%rsi),%ymm1,%ymm3
161 vpmadd52huq 32(%rsi),%ymm1,%ymm16
162 vpmadd52huq 64(%rsi),%ymm1,%ymm17
163 vpmadd52huq 96(%rsi),%ymm1,%ymm18
164 vpmadd52huq 128(%rsi),%ymm1,%ymm19
165
166 vpmadd52huq 0(%rcx),%ymm2,%ymm3
167 vpmadd52huq 32(%rcx),%ymm2,%ymm16
168 vpmadd52huq 64(%rcx),%ymm2,%ymm17
169 vpmadd52huq 96(%rcx),%ymm2,%ymm18
170 vpmadd52huq 128(%rcx),%ymm2,%ymm19
# ---- same step for the quadword at 16(%r11) ----
171 movq 16(%r11),%r13
172
173 vpbroadcastq %r13,%ymm1
174 movq 0(%rsi),%rdx
175 mulxq %r13,%r13,%r12
176 addq %r13,%r9
177 movq %r12,%r10
178 adcq $0,%r10
179
180 movq %r8,%r13
181 imulq %r9,%r13
182 andq %rax,%r13
183
184 vpbroadcastq %r13,%ymm2
185 movq 0(%rcx),%rdx
186 mulxq %r13,%r13,%r12
187 addq %r13,%r9
188 adcq %r12,%r10
189
190 shrq $52,%r9
191 salq $12,%r10
192 orq %r10,%r9
193
194 vpmadd52luq 0(%rsi),%ymm1,%ymm3
195 vpmadd52luq 32(%rsi),%ymm1,%ymm16
196 vpmadd52luq 64(%rsi),%ymm1,%ymm17
197 vpmadd52luq 96(%rsi),%ymm1,%ymm18
198 vpmadd52luq 128(%rsi),%ymm1,%ymm19
199
200 vpmadd52luq 0(%rcx),%ymm2,%ymm3
201 vpmadd52luq 32(%rcx),%ymm2,%ymm16
202 vpmadd52luq 64(%rcx),%ymm2,%ymm17
203 vpmadd52luq 96(%rcx),%ymm2,%ymm18
204 vpmadd52luq 128(%rcx),%ymm2,%ymm19
205
206
207 valignq $1,%ymm3,%ymm16,%ymm3
208 valignq $1,%ymm16,%ymm17,%ymm16
209 valignq $1,%ymm17,%ymm18,%ymm17
210 valignq $1,%ymm18,%ymm19,%ymm18
211 valignq $1,%ymm19,%ymm0,%ymm19
212
213 vmovq %xmm3,%r13
214 addq %r13,%r9
215
216 vpmadd52huq 0(%rsi),%ymm1,%ymm3
217 vpmadd52huq 32(%rsi),%ymm1,%ymm16
218 vpmadd52huq 64(%rsi),%ymm1,%ymm17
219 vpmadd52huq 96(%rsi),%ymm1,%ymm18
220 vpmadd52huq 128(%rsi),%ymm1,%ymm19
221
222 vpmadd52huq 0(%rcx),%ymm2,%ymm3
223 vpmadd52huq 32(%rcx),%ymm2,%ymm16
224 vpmadd52huq 64(%rcx),%ymm2,%ymm17
225 vpmadd52huq 96(%rcx),%ymm2,%ymm18
226 vpmadd52huq 128(%rcx),%ymm2,%ymm19
# ---- same step for the quadword at 24(%r11) ----
227 movq 24(%r11),%r13
228
229 vpbroadcastq %r13,%ymm1
230 movq 0(%rsi),%rdx
231 mulxq %r13,%r13,%r12
232 addq %r13,%r9
233 movq %r12,%r10
234 adcq $0,%r10
235
236 movq %r8,%r13
237 imulq %r9,%r13
238 andq %rax,%r13
239
240 vpbroadcastq %r13,%ymm2
241 movq 0(%rcx),%rdx
242 mulxq %r13,%r13,%r12
243 addq %r13,%r9
244 adcq %r12,%r10
245
246 shrq $52,%r9
247 salq $12,%r10
248 orq %r10,%r9
249
250 vpmadd52luq 0(%rsi),%ymm1,%ymm3
251 vpmadd52luq 32(%rsi),%ymm1,%ymm16
252 vpmadd52luq 64(%rsi),%ymm1,%ymm17
253 vpmadd52luq 96(%rsi),%ymm1,%ymm18
254 vpmadd52luq 128(%rsi),%ymm1,%ymm19
255
256 vpmadd52luq 0(%rcx),%ymm2,%ymm3
257 vpmadd52luq 32(%rcx),%ymm2,%ymm16
258 vpmadd52luq 64(%rcx),%ymm2,%ymm17
259 vpmadd52luq 96(%rcx),%ymm2,%ymm18
260 vpmadd52luq 128(%rcx),%ymm2,%ymm19
261
262
263 valignq $1,%ymm3,%ymm16,%ymm3
264 valignq $1,%ymm16,%ymm17,%ymm16
265 valignq $1,%ymm17,%ymm18,%ymm17
266 valignq $1,%ymm18,%ymm19,%ymm18
267 valignq $1,%ymm19,%ymm0,%ymm19
268
269 vmovq %xmm3,%r13
270 addq %r13,%r9
271
272 vpmadd52huq 0(%rsi),%ymm1,%ymm3
273 vpmadd52huq 32(%rsi),%ymm1,%ymm16
274 vpmadd52huq 64(%rsi),%ymm1,%ymm17
275 vpmadd52huq 96(%rsi),%ymm1,%ymm18
276 vpmadd52huq 128(%rsi),%ymm1,%ymm19
277
278 vpmadd52huq 0(%rcx),%ymm2,%ymm3
279 vpmadd52huq 32(%rcx),%ymm2,%ymm16
280 vpmadd52huq 64(%rcx),%ymm2,%ymm17
281 vpmadd52huq 96(%rcx),%ymm2,%ymm18
282 vpmadd52huq 128(%rcx),%ymm2,%ymm19
# Advance by the four quadwords just consumed.
283 leaq 32(%r11),%r11
284 decl %ebx
285 jne .Lloop5
286
# Put the scalar accumulator back into lane 0: vpblendd $3 takes dwords
# 0 and 1 (the low quadword) from %ymm0 and the rest from %ymm3.
287 vpbroadcastq %r9,%ymm0
288 vpblendd $3,%ymm0,%ymm3,%ymm3
289
290
291
# Carry normalisation. First extract what each lane holds above bit 51.
292 vpsrlq $52,%ymm3,%ymm0
293 vpsrlq $52,%ymm16,%ymm1
294 vpsrlq $52,%ymm17,%ymm2
295 vpsrlq $52,%ymm18,%ymm25
296 vpsrlq $52,%ymm19,%ymm26
297
298
# Move every carry one lane up across the five registers; .Lzeros
# supplies the value entering lane 0.
299 valignq $3,%ymm25,%ymm26,%ymm26
300 valignq $3,%ymm2,%ymm25,%ymm25
301 valignq $3,%ymm1,%ymm2,%ymm2
302 valignq $3,%ymm0,%ymm1,%ymm1
303 valignq $3,.Lzeros(%rip),%ymm0,%ymm0
304
305
# Keep the low 52 bits of each lane ...
306 vpandq .Lmask52x4(%rip),%ymm3,%ymm3
307 vpandq .Lmask52x4(%rip),%ymm16,%ymm16
308 vpandq .Lmask52x4(%rip),%ymm17,%ymm17
309 vpandq .Lmask52x4(%rip),%ymm18,%ymm18
310 vpandq .Lmask52x4(%rip),%ymm19,%ymm19
311
312
# ... and add the carry arriving from the lane below.
313 vpaddq %ymm0,%ymm3,%ymm3
314 vpaddq %ymm1,%ymm16,%ymm16
315 vpaddq %ymm2,%ymm17,%ymm17
316 vpaddq %ymm25,%ymm18,%ymm18
317 vpaddq %ymm26,%ymm19,%ymm19
318
319
320
# Predicate 6 (unsigned greater-than): lanes now above 2^52-1, which
# produce a further carry. Masks go to %r14,%r13,%r12,%r11,%r10.
321 vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
322 vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2
323 vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k3
324 vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k4
325 vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k5
326 kmovb %k1,%r14d
327 kmovb %k2,%r13d
328 kmovb %k3,%r12d
329 kmovb %k4,%r11d
330 kmovb %k5,%r10d
331
332
# Predicate 0 (equal): lanes exactly 2^52-1, which pass an incoming
# carry on. Masks go to %r9,%r8,%rbx,%rcx,%rdx.
333 vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
334 vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2
335 vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k3
336 vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k4
337 vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k5
338 kmovb %k1,%r9d
339 kmovb %k2,%r8d
340 kmovb %k3,%ebx
341 kmovb %k4,%ecx
342 kmovb %k5,%edx
343
344
345
# Pack the 4-bit greater-than masks into bytes: %r14b = lanes 0-7,
# %r12b = lanes 8-15, %r10b = lanes 16-19.
346 shlb $4,%r13b
347 orb %r13b,%r14b
348 shlb $4,%r11b
349 orb %r11b,%r12b
350
# Shift the 20-bit greater-than mask left by one lane (add to itself,
# with the carry flag linking the three bytes).
351 addb %r14b,%r14b
352 adcb %r12b,%r12b
353 adcb %r10b,%r10b
354
# Pack the equal masks the same way: %r9b, %bl, %dl.
355 shlb $4,%r8b
356 orb %r8b,%r9b
357 shlb $4,%cl
358 orb %cl,%bl
359
# Adding the equal mask lets a carry ripple through runs of lanes equal
# to 2^52-1; the xor then leaves bits set on the lanes that receive a carry.
360 addb %r9b,%r14b
361 adcb %bl,%r12b
362 adcb %dl,%r10b
363
364 xorb %r9b,%r14b
365 xorb %bl,%r12b
366 xorb %dl,%r10b
367
# Split the bytes back into one 4-lane mask per vector register.
368 kmovb %r14d,%k1
369 shrb $4,%r14b
370 kmovb %r14d,%k2
371 kmovb %r12d,%k3
372 shrb $4,%r12b
373 kmovb %r12d,%k4
374 kmovb %r10d,%k5
375
376
# On the selected lanes subtract 2^52-1; together with the 52-bit
# masking that follows this equals adding 1 modulo 2^52.
377 vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
378 vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k2}
379 vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k3}
380 vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k4}
381 vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k5}
382
383 vpandq .Lmask52x4(%rip),%ymm3,%ymm3
384 vpandq .Lmask52x4(%rip),%ymm16,%ymm16
385 vpandq .Lmask52x4(%rip),%ymm17,%ymm17
386 vpandq .Lmask52x4(%rip),%ymm18,%ymm18
387 vpandq .Lmask52x4(%rip),%ymm19,%ymm19
388
# Store the 20 result quadwords (unaligned stores).
389 vmovdqu64 %ymm3,0(%rdi)
390 vmovdqu64 %ymm16,32(%rdi)
391 vmovdqu64 %ymm17,64(%rdi)
392 vmovdqu64 %ymm18,96(%rdi)
393 vmovdqu64 %ymm19,128(%rdi)
394
# Epilogue: reload the six saved registers from the stack slots written
# by the pushes on entry, then release the 48 bytes.
395 vzeroupper
396 movq 0(%rsp),%r15
397.cfi_restore %r15
398 movq 8(%rsp),%r14
399.cfi_restore %r14
400 movq 16(%rsp),%r13
401.cfi_restore %r13
402 movq 24(%rsp),%r12
403.cfi_restore %r12
404 movq 32(%rsp),%rbp
405.cfi_restore %rbp
406 movq 40(%rsp),%rbx
407.cfi_restore %rbx
408 leaq 48(%rsp),%rsp
409.cfi_adjust_cfa_offset -48
410.Lossl_rsaz_amm52x20_x1_ifma256_epilogue:
# The bytes 0xf3,0xc3 are the two-byte encoding of rep ret.
411 .byte 0xf3,0xc3
412.cfi_endproc
413.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
# .Lmask52x4: four quadwords of 2^52-1 (0xfffffffffffff), 32-byte aligned.
# The routines in this file use it as the per-lane 52-bit mask and as the
# comparison value in the carry normalisation.
414.data
415.align 32
416.Lmask52x4:
417.quad 0xfffffffffffff
418.quad 0xfffffffffffff
419.quad 0xfffffffffffff
420.quad 0xfffffffffffff
# Switch back to the code section for the next routine.
421.text
422
# ossl_rsaz_amm52x20_x2_ifma256
# Runs two independent copies of the 20-lane multiply-accumulate, one on
# the data at offset 0 and one on the data at offset 160 of each buffer.
# Register inputs, as used by the code below:
#   %rdi  destination; 320 bytes are stored (two results of 160 bytes)
#   %rsi  first operand pair; set 0 at 0..159, set 1 at 160..319
#   %rdx  second operand pair; read one quadword per step at 0(%r11)
#         and 160(%r11) after being copied to %r11
#   %rcx  third operand pair; set 0 at 0..159, set 1 at 160..319
#   %r8   pointer to two 64-bit scalars: (%r8) for set 0, 8(%r8) for set 1
# NOTE(review): going by the name this is presumably a dual almost
# Montgomery multiplication with %rsi = a, %rdx = b, %rcx = moduli and
# %r8 = k0 pair - confirm against the C prototype.
# Accumulators: set 0 in %ymm3,%ymm16..%ymm19 with scalar lane 0 in %r9;
# set 1 in %ymm4,%ymm20..%ymm23 with scalar lane 0 in %r15.
# %rbx, %rbp, %r12..%r15 are pushed on entry and reloaded before return.
423.globl ossl_rsaz_amm52x20_x2_ifma256
424.type ossl_rsaz_amm52x20_x2_ifma256,@function
425.align 32
426ossl_rsaz_amm52x20_x2_ifma256:
427.cfi_startproc
# The bytes 243,15,30,250 (f3 0f 1e fa) encode endbr64.
428.byte 243,15,30,250
429 pushq %rbx
430.cfi_adjust_cfa_offset 8
431.cfi_offset %rbx,-16
432 pushq %rbp
433.cfi_adjust_cfa_offset 8
434.cfi_offset %rbp,-24
435 pushq %r12
436.cfi_adjust_cfa_offset 8
437.cfi_offset %r12,-32
438 pushq %r13
439.cfi_adjust_cfa_offset 8
440.cfi_offset %r13,-40
441 pushq %r14
442.cfi_adjust_cfa_offset 8
443.cfi_offset %r14,-48
444 pushq %r15
445.cfi_adjust_cfa_offset 8
446.cfi_offset %r15,-56
447.Lossl_rsaz_amm52x20_x2_ifma256_body:
448
449
# Zero both 20-lane accumulators; %ymm0 stays zero during the loop and
# fills the top lane of each per-step shift.
450 vpxord %ymm0,%ymm0,%ymm0
451 vmovdqa64 %ymm0,%ymm3
452 vmovdqa64 %ymm0,%ymm16
453 vmovdqa64 %ymm0,%ymm17
454 vmovdqa64 %ymm0,%ymm18
455 vmovdqa64 %ymm0,%ymm19
456 vmovdqa64 %ymm0,%ymm4
457 vmovdqa64 %ymm0,%ymm20
458 vmovdqa64 %ymm0,%ymm21
459 vmovdqa64 %ymm0,%ymm22
460 vmovdqa64 %ymm0,%ymm23
461
# Scalar lane-0 accumulators for set 0 and set 1.
462 xorl %r9d,%r9d
463 xorl %r15d,%r15d
464
# %rdx is the implicit mulx source, so the second operand pointer moves
# to %r11. %rax holds the 52-bit mask 2^52-1.
465 movq %rdx,%r11
466 movq $0xfffffffffffff,%rax
467
# One step per set per pass; 20 passes.
468 movl $20,%ebx
469
470.align 32
471.Lloop20:
# ---- set 0: step for the quadword at 0(%r11) ----
472 movq 0(%r11),%r13
473
474 vpbroadcastq %r13,%ymm1
475 movq 0(%rsi),%rdx
# mulx: %rdx * %r13, low half to %r13, high half to %r12.
476 mulxq %r13,%r13,%r12
477 addq %r13,%r9
478 movq %r12,%r10
479 adcq $0,%r10
480
# %r13 = ((%r8) * %r9) reduced to its low 52 bits.
481 movq (%r8),%r13
482 imulq %r9,%r13
483 andq %rax,%r13
484
# Add the full product of that factor with 0(%rcx) into %r10:%r9.
485 vpbroadcastq %r13,%ymm2
486 movq 0(%rcx),%rdx
487 mulxq %r13,%r13,%r12
488 addq %r13,%r9
489 adcq %r12,%r10
490
# %r9 = low 64 bits of (%r10:%r9) >> 52.
491 shrq $52,%r9
492 salq $12,%r10
493 orq %r10,%r9
494
# Accumulate the low 52-bit halves of the lane products.
495 vpmadd52luq 0(%rsi),%ymm1,%ymm3
496 vpmadd52luq 32(%rsi),%ymm1,%ymm16
497 vpmadd52luq 64(%rsi),%ymm1,%ymm17
498 vpmadd52luq 96(%rsi),%ymm1,%ymm18
499 vpmadd52luq 128(%rsi),%ymm1,%ymm19
500
501 vpmadd52luq 0(%rcx),%ymm2,%ymm3
502 vpmadd52luq 32(%rcx),%ymm2,%ymm16
503 vpmadd52luq 64(%rcx),%ymm2,%ymm17
504 vpmadd52luq 96(%rcx),%ymm2,%ymm18
505 vpmadd52luq 128(%rcx),%ymm2,%ymm19
506
507
# Shift the set 0 accumulator down by one quadword; zero enters on top.
508 valignq $1,%ymm3,%ymm16,%ymm3
509 valignq $1,%ymm16,%ymm17,%ymm16
510 valignq $1,%ymm17,%ymm18,%ymm17
511 valignq $1,%ymm18,%ymm19,%ymm18
512 valignq $1,%ymm19,%ymm0,%ymm19
513
# Fold the new lane 0 into the scalar accumulator.
514 vmovq %xmm3,%r13
515 addq %r13,%r9
516
# High 52-bit halves are added after the shift, so each lands one lane
# above the low half of the same product.
517 vpmadd52huq 0(%rsi),%ymm1,%ymm3
518 vpmadd52huq 32(%rsi),%ymm1,%ymm16
519 vpmadd52huq 64(%rsi),%ymm1,%ymm17
520 vpmadd52huq 96(%rsi),%ymm1,%ymm18
521 vpmadd52huq 128(%rsi),%ymm1,%ymm19
522
523 vpmadd52huq 0(%rcx),%ymm2,%ymm3
524 vpmadd52huq 32(%rcx),%ymm2,%ymm16
525 vpmadd52huq 64(%rcx),%ymm2,%ymm17
526 vpmadd52huq 96(%rcx),%ymm2,%ymm18
527 vpmadd52huq 128(%rcx),%ymm2,%ymm19
# ---- set 1: same step on the data at offset 160, using %r15 as the
# scalar accumulator and 8(%r8) as the scalar factor ----
528 movq 160(%r11),%r13
529
530 vpbroadcastq %r13,%ymm1
531 movq 160(%rsi),%rdx
532 mulxq %r13,%r13,%r12
533 addq %r13,%r15
534 movq %r12,%r10
535 adcq $0,%r10
536
537 movq 8(%r8),%r13
538 imulq %r15,%r13
539 andq %rax,%r13
540
541 vpbroadcastq %r13,%ymm2
542 movq 160(%rcx),%rdx
543 mulxq %r13,%r13,%r12
544 addq %r13,%r15
545 adcq %r12,%r10
546
547 shrq $52,%r15
548 salq $12,%r10
549 orq %r10,%r15
550
551 vpmadd52luq 160(%rsi),%ymm1,%ymm4
552 vpmadd52luq 192(%rsi),%ymm1,%ymm20
553 vpmadd52luq 224(%rsi),%ymm1,%ymm21
554 vpmadd52luq 256(%rsi),%ymm1,%ymm22
555 vpmadd52luq 288(%rsi),%ymm1,%ymm23
556
557 vpmadd52luq 160(%rcx),%ymm2,%ymm4
558 vpmadd52luq 192(%rcx),%ymm2,%ymm20
559 vpmadd52luq 224(%rcx),%ymm2,%ymm21
560 vpmadd52luq 256(%rcx),%ymm2,%ymm22
561 vpmadd52luq 288(%rcx),%ymm2,%ymm23
562
563
564 valignq $1,%ymm4,%ymm20,%ymm4
565 valignq $1,%ymm20,%ymm21,%ymm20
566 valignq $1,%ymm21,%ymm22,%ymm21
567 valignq $1,%ymm22,%ymm23,%ymm22
568 valignq $1,%ymm23,%ymm0,%ymm23
569
570 vmovq %xmm4,%r13
571 addq %r13,%r15
572
573 vpmadd52huq 160(%rsi),%ymm1,%ymm4
574 vpmadd52huq 192(%rsi),%ymm1,%ymm20
575 vpmadd52huq 224(%rsi),%ymm1,%ymm21
576 vpmadd52huq 256(%rsi),%ymm1,%ymm22
577 vpmadd52huq 288(%rsi),%ymm1,%ymm23
578
579 vpmadd52huq 160(%rcx),%ymm2,%ymm4
580 vpmadd52huq 192(%rcx),%ymm2,%ymm20
581 vpmadd52huq 224(%rcx),%ymm2,%ymm21
582 vpmadd52huq 256(%rcx),%ymm2,%ymm22
583 vpmadd52huq 288(%rcx),%ymm2,%ymm23
# Advance by the one quadword consumed per set.
584 leaq 8(%r11),%r11
585 decl %ebx
586 jne .Lloop20
587
# ==== Carry normalisation of set 0 ====
# Put %r9 back into lane 0: vpblendd $3 takes dwords 0 and 1 (the low
# quadword) from %ymm0 and the rest from %ymm3.
588 vpbroadcastq %r9,%ymm0
589 vpblendd $3,%ymm0,%ymm3,%ymm3
590
591
592
# Extract what each lane holds above bit 51.
593 vpsrlq $52,%ymm3,%ymm0
594 vpsrlq $52,%ymm16,%ymm1
595 vpsrlq $52,%ymm17,%ymm2
596 vpsrlq $52,%ymm18,%ymm25
597 vpsrlq $52,%ymm19,%ymm26
598
599
# Move every carry one lane up; .Lzeros supplies the value for lane 0.
600 valignq $3,%ymm25,%ymm26,%ymm26
601 valignq $3,%ymm2,%ymm25,%ymm25
602 valignq $3,%ymm1,%ymm2,%ymm2
603 valignq $3,%ymm0,%ymm1,%ymm1
604 valignq $3,.Lzeros(%rip),%ymm0,%ymm0
605
606
# Keep the low 52 bits of each lane and add the incoming carries.
607 vpandq .Lmask52x4(%rip),%ymm3,%ymm3
608 vpandq .Lmask52x4(%rip),%ymm16,%ymm16
609 vpandq .Lmask52x4(%rip),%ymm17,%ymm17
610 vpandq .Lmask52x4(%rip),%ymm18,%ymm18
611 vpandq .Lmask52x4(%rip),%ymm19,%ymm19
612
613
614 vpaddq %ymm0,%ymm3,%ymm3
615 vpaddq %ymm1,%ymm16,%ymm16
616 vpaddq %ymm2,%ymm17,%ymm17
617 vpaddq %ymm25,%ymm18,%ymm18
618 vpaddq %ymm26,%ymm19,%ymm19
619
620
621
# Predicate 6 (unsigned greater-than): lanes above 2^52-1, which
# produce a further carry.
622 vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
623 vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2
624 vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k3
625 vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k4
626 vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k5
627 kmovb %k1,%r14d
628 kmovb %k2,%r13d
629 kmovb %k3,%r12d
630 kmovb %k4,%r11d
631 kmovb %k5,%r10d
632
633
# Predicate 0 (equal): lanes exactly 2^52-1, which pass a carry on.
# From here on %r9, %r8, %rbx, %rcx and %rdx are reused as scratch.
634 vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
635 vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2
636 vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k3
637 vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k4
638 vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k5
639 kmovb %k1,%r9d
640 kmovb %k2,%r8d
641 kmovb %k3,%ebx
642 kmovb %k4,%ecx
643 kmovb %k5,%edx
644
645
646
# Pack the greater-than masks into bytes: %r14b = lanes 0-7,
# %r12b = lanes 8-15, %r10b = lanes 16-19.
647 shlb $4,%r13b
648 orb %r13b,%r14b
649 shlb $4,%r11b
650 orb %r11b,%r12b
651
# Shift that 20-bit mask left by one lane (carry flag links the bytes).
652 addb %r14b,%r14b
653 adcb %r12b,%r12b
654 adcb %r10b,%r10b
655
# Pack the equal masks the same way: %r9b, %bl, %dl.
656 shlb $4,%r8b
657 orb %r8b,%r9b
658 shlb $4,%cl
659 orb %cl,%bl
660
# Adding the equal mask ripples a carry through runs of lanes equal to
# 2^52-1; the xor leaves bits set on the lanes that receive a carry.
661 addb %r9b,%r14b
662 adcb %bl,%r12b
663 adcb %dl,%r10b
664
665 xorb %r9b,%r14b
666 xorb %bl,%r12b
667 xorb %dl,%r10b
668
# Split the bytes back into one 4-lane mask per vector register.
669 kmovb %r14d,%k1
670 shrb $4,%r14b
671 kmovb %r14d,%k2
672 kmovb %r12d,%k3
673 shrb $4,%r12b
674 kmovb %r12d,%k4
675 kmovb %r10d,%k5
676
677
# On the selected lanes subtract 2^52-1; with the masking that follows
# this equals adding 1 modulo 2^52.
678 vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
679 vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k2}
680 vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k3}
681 vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k4}
682 vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k5}
683
684 vpandq .Lmask52x4(%rip),%ymm3,%ymm3
685 vpandq .Lmask52x4(%rip),%ymm16,%ymm16
686 vpandq .Lmask52x4(%rip),%ymm17,%ymm17
687 vpandq .Lmask52x4(%rip),%ymm18,%ymm18
688 vpandq .Lmask52x4(%rip),%ymm19,%ymm19
689
# ==== Carry normalisation of set 1 (same sequence, on %r15 and
# %ymm4,%ymm20..%ymm23) ====
690 vpbroadcastq %r15,%ymm0
691 vpblendd $3,%ymm0,%ymm4,%ymm4
692
693
694
695 vpsrlq $52,%ymm4,%ymm0
696 vpsrlq $52,%ymm20,%ymm1
697 vpsrlq $52,%ymm21,%ymm2
698 vpsrlq $52,%ymm22,%ymm25
699 vpsrlq $52,%ymm23,%ymm26
700
701
702 valignq $3,%ymm25,%ymm26,%ymm26
703 valignq $3,%ymm2,%ymm25,%ymm25
704 valignq $3,%ymm1,%ymm2,%ymm2
705 valignq $3,%ymm0,%ymm1,%ymm1
706 valignq $3,.Lzeros(%rip),%ymm0,%ymm0
707
708
709 vpandq .Lmask52x4(%rip),%ymm4,%ymm4
710 vpandq .Lmask52x4(%rip),%ymm20,%ymm20
711 vpandq .Lmask52x4(%rip),%ymm21,%ymm21
712 vpandq .Lmask52x4(%rip),%ymm22,%ymm22
713 vpandq .Lmask52x4(%rip),%ymm23,%ymm23
714
715
716 vpaddq %ymm0,%ymm4,%ymm4
717 vpaddq %ymm1,%ymm20,%ymm20
718 vpaddq %ymm2,%ymm21,%ymm21
719 vpaddq %ymm25,%ymm22,%ymm22
720 vpaddq %ymm26,%ymm23,%ymm23
721
722
723
724 vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k1
725 vpcmpuq $6,.Lmask52x4(%rip),%ymm20,%k2
726 vpcmpuq $6,.Lmask52x4(%rip),%ymm21,%k3
727 vpcmpuq $6,.Lmask52x4(%rip),%ymm22,%k4
728 vpcmpuq $6,.Lmask52x4(%rip),%ymm23,%k5
729 kmovb %k1,%r14d
730 kmovb %k2,%r13d
731 kmovb %k3,%r12d
732 kmovb %k4,%r11d
733 kmovb %k5,%r10d
734
735
736 vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k1
737 vpcmpuq $0,.Lmask52x4(%rip),%ymm20,%k2
738 vpcmpuq $0,.Lmask52x4(%rip),%ymm21,%k3
739 vpcmpuq $0,.Lmask52x4(%rip),%ymm22,%k4
740 vpcmpuq $0,.Lmask52x4(%rip),%ymm23,%k5
741 kmovb %k1,%r9d
742 kmovb %k2,%r8d
743 kmovb %k3,%ebx
744 kmovb %k4,%ecx
745 kmovb %k5,%edx
746
747
748
749 shlb $4,%r13b
750 orb %r13b,%r14b
751 shlb $4,%r11b
752 orb %r11b,%r12b
753
754 addb %r14b,%r14b
755 adcb %r12b,%r12b
756 adcb %r10b,%r10b
757
758 shlb $4,%r8b
759 orb %r8b,%r9b
760 shlb $4,%cl
761 orb %cl,%bl
762
763 addb %r9b,%r14b
764 adcb %bl,%r12b
765 adcb %dl,%r10b
766
767 xorb %r9b,%r14b
768 xorb %bl,%r12b
769 xorb %dl,%r10b
770
771 kmovb %r14d,%k1
772 shrb $4,%r14b
773 kmovb %r14d,%k2
774 kmovb %r12d,%k3
775 shrb $4,%r12b
776 kmovb %r12d,%k4
777 kmovb %r10d,%k5
778
779
780 vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k1}
781 vpsubq .Lmask52x4(%rip),%ymm20,%ymm20{%k2}
782 vpsubq .Lmask52x4(%rip),%ymm21,%ymm21{%k3}
783 vpsubq .Lmask52x4(%rip),%ymm22,%ymm22{%k4}
784 vpsubq .Lmask52x4(%rip),%ymm23,%ymm23{%k5}
785
786 vpandq .Lmask52x4(%rip),%ymm4,%ymm4
787 vpandq .Lmask52x4(%rip),%ymm20,%ymm20
788 vpandq .Lmask52x4(%rip),%ymm21,%ymm21
789 vpandq .Lmask52x4(%rip),%ymm22,%ymm22
790 vpandq .Lmask52x4(%rip),%ymm23,%ymm23
791
# Store set 0 at 0..159 and set 1 at 160..319 (unaligned stores).
792 vmovdqu64 %ymm3,0(%rdi)
793 vmovdqu64 %ymm16,32(%rdi)
794 vmovdqu64 %ymm17,64(%rdi)
795 vmovdqu64 %ymm18,96(%rdi)
796 vmovdqu64 %ymm19,128(%rdi)
797
798 vmovdqu64 %ymm4,160(%rdi)
799 vmovdqu64 %ymm20,192(%rdi)
800 vmovdqu64 %ymm21,224(%rdi)
801 vmovdqu64 %ymm22,256(%rdi)
802 vmovdqu64 %ymm23,288(%rdi)
803
# Epilogue: reload the six saved registers from the stack slots written
# by the pushes on entry, then release the 48 bytes.
804 vzeroupper
805 movq 0(%rsp),%r15
806.cfi_restore %r15
807 movq 8(%rsp),%r14
808.cfi_restore %r14
809 movq 16(%rsp),%r13
810.cfi_restore %r13
811 movq 24(%rsp),%r12
812.cfi_restore %r12
813 movq 32(%rsp),%rbp
814.cfi_restore %rbp
815 movq 40(%rsp),%rbx
816.cfi_restore %rbx
817 leaq 48(%rsp),%rsp
818.cfi_adjust_cfa_offset -48
819.Lossl_rsaz_amm52x20_x2_ifma256_epilogue:
# The bytes 0xf3,0xc3 are the two-byte encoding of rep ret.
820 .byte 0xf3,0xc3
821.cfi_endproc
822.size ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256
823.text
824
# ossl_extract_multiplier_2x20_win5
# Register inputs, as used by the code below:
#   %rdi  destination; 320 bytes are stored
#   %rsi  table base; 32 entries of 320 bytes (10240 bytes) are read
#   %rdx  index selecting the entry whose bytes 0..159 are returned
#   %rcx  index selecting the entry whose bytes 160..319 are returned
# Every table entry is loaded on every call. The wanted halves are kept
# with mask blends against a running entry counter, not by indexed
# addressing or branching on the indices.
# A half of the output stays zero when its index matches none of the
# counter values 0..31.
# No general-purpose callee-saved register is touched; %rax, %rsi and
# the vector/mask registers used below are clobbered.
# NOTE(review): unlike the two multiply routines in this file, this one
# returns without vzeroupper - confirm that is intended.
825.align 32
826.globl ossl_extract_multiplier_2x20_win5
827.type ossl_extract_multiplier_2x20_win5,@function
828ossl_extract_multiplier_2x20_win5:
829.cfi_startproc
# The bytes 243,15,30,250 (f3 0f 1e fa) encode endbr64.
830.byte 243,15,30,250
# %ymm24 = per-lane increment of 1; %ymm22 / %ymm23 = the two indices
# broadcast to all lanes; %rax = end-of-table address.
831 vmovdqa64 .Lones(%rip),%ymm24
832 vpbroadcastq %rdx,%ymm22
833 vpbroadcastq %rcx,%ymm23
834 leaq 10240(%rsi),%rax
835
836
# Zero the entry counter %ymm21 and the ten result registers.
837 vpxor %xmm0,%xmm0,%xmm0
838 vmovdqa64 %ymm0,%ymm21
839 vmovdqa64 %ymm0,%ymm1
840 vmovdqa64 %ymm0,%ymm2
841 vmovdqa64 %ymm0,%ymm3
842 vmovdqa64 %ymm0,%ymm4
843 vmovdqa64 %ymm0,%ymm5
844 vmovdqa64 %ymm0,%ymm16
845 vmovdqa64 %ymm0,%ymm17
846 vmovdqa64 %ymm0,%ymm18
847 vmovdqa64 %ymm0,%ymm19
848
849.align 32
850.Lloop:
# Predicate 0 is equality: %k1 is all-set when the counter equals the
# first index, %k2 when it equals the second.
851 vpcmpq $0,%ymm21,%ymm22,%k1
852 vpcmpq $0,%ymm21,%ymm23,%k2
# For each 32-byte piece: load it, then keep it where the mask is set
# and keep the previous register contents elsewhere.
853 vmovdqu64 0(%rsi),%ymm20
854 vpblendmq %ymm20,%ymm0,%ymm0{%k1}
855 vmovdqu64 32(%rsi),%ymm20
856 vpblendmq %ymm20,%ymm1,%ymm1{%k1}
857 vmovdqu64 64(%rsi),%ymm20
858 vpblendmq %ymm20,%ymm2,%ymm2{%k1}
859 vmovdqu64 96(%rsi),%ymm20
860 vpblendmq %ymm20,%ymm3,%ymm3{%k1}
861 vmovdqu64 128(%rsi),%ymm20
862 vpblendmq %ymm20,%ymm4,%ymm4{%k1}
863 vmovdqu64 160(%rsi),%ymm20
864 vpblendmq %ymm20,%ymm5,%ymm5{%k2}
865 vmovdqu64 192(%rsi),%ymm20
866 vpblendmq %ymm20,%ymm16,%ymm16{%k2}
867 vmovdqu64 224(%rsi),%ymm20
868 vpblendmq %ymm20,%ymm17,%ymm17{%k2}
869 vmovdqu64 256(%rsi),%ymm20
870 vpblendmq %ymm20,%ymm18,%ymm18{%k2}
871 vmovdqu64 288(%rsi),%ymm20
872 vpblendmq %ymm20,%ymm19,%ymm19{%k2}
# Next entry: bump the counter and advance by one 320-byte entry.
873 vpaddq %ymm24,%ymm21,%ymm21
874 addq $320,%rsi
875 cmpq %rsi,%rax
876 jne .Lloop
# Store the two selected halves back to back (unaligned stores).
877 vmovdqu64 %ymm0,0(%rdi)
878 vmovdqu64 %ymm1,32(%rdi)
879 vmovdqu64 %ymm2,64(%rdi)
880 vmovdqu64 %ymm3,96(%rdi)
881 vmovdqu64 %ymm4,128(%rdi)
882 vmovdqu64 %ymm5,160(%rdi)
883 vmovdqu64 %ymm16,192(%rdi)
884 vmovdqu64 %ymm17,224(%rdi)
885 vmovdqu64 %ymm18,256(%rdi)
886 vmovdqu64 %ymm19,288(%rdi)
# The bytes 0xf3,0xc3 are the two-byte encoding of rep ret.
887 .byte 0xf3,0xc3
888.cfi_endproc
889.size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
# Vector constants, 32-byte aligned:
#   .Lones   four quadwords of 1 (entry-counter increment in the table scan)
#   .Lzeros  four quadwords of 0 (value shifted into lane 0 by the carry
#            normalisation in the multiply routines)
890.data
891.align 32
892.Lones:
893.quad 1,1,1,1
894.Lzeros:
895.quad 0,0,0,0
# ELF note in .note.gnu.property. Layout: name size (1f - 0f), descriptor
# size (4f - 1f), note type 5, the name bytes, then one property made of
# type 0xc0000002, data size (3f - 2f) and the 32-bit value 3.
# NOTE(review): per the Linux x86-64 ABI extensions, note type 5 should be
# NT_GNU_PROPERTY_TYPE_0, 0xc0000002 GNU_PROPERTY_X86_FEATURE_1_AND and
# value 3 the IBT and SHSTK bits - confirm against the ABI document.
896 .section ".note.gnu.property", "a"
897 .p2align 3
898 .long 1f - 0f
899 .long 4f - 1f
900 .long 5
9010:
902 # "GNU" encoded with .byte, since .asciz isn't supported
903 # on Solaris.
904 .byte 0x47
905 .byte 0x4e
906 .byte 0x55
907 .byte 0
9081:
909 .p2align 3
910 .long 0xc0000002
911 .long 3f - 2f
9122:
913 .long 3
9143:
915 .p2align 3
9164:
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette