; VirtualBox
;
; Source: vbox/trunk/src/libs/openssl-3.0.3/crypto/genasm-nasm/rsaz-avx512.S @ r95219
;
; Last change on this file: r95219, checked in by vboxsync, 3 years ago.
;
; libs/openssl: Switched to v3.0.3, bugref:10128
;
;   * Property svn:eol-style set to native
;   * Property svn:keywords set to Author Date Id Revision
;
; File size: 18.8 KB
; File-wide assembler setup (NASM syntax, Win64 calling convention).

default rel                             ; symbol references are RIP-relative

; Operand-size markers used throughout this file.  They are defined empty,
; so e.g. YMMWORD[rsi] assembles as plain [rsi]; the operand size is taken
; from the register operand of each instruction.
%define XMMWORD
%define YMMWORD
%define ZMMWORD

EXTERN OPENSSL_ia32cap_P                ; CPU capability vector, defined elsewhere
;-----------------------------------------------------------------------
; ossl_rsaz_avx512ifma_eligible
;
; Reports whether every CPU feature bit in RSAZ_AVX512IFMA_CAPS is set in
; the third dword (byte offset 8) of OPENSSL_ia32cap_P.
;
; In:    nothing
; Out:   eax = RSAZ_AVX512IFMA_CAPS (non-zero) when all bits are set,
;              0 otherwise
; Clobb: ecx, flags
;
; NOTE(review): per OpenSSL's OPENSSL_ia32cap documentation this dword
; mirrors CPUID.(EAX=7,ECX=0):EBX, which makes the mask bits
; AVX512F (16), AVX512DQ (17), AVX512IFMA (21) and AVX512VL (31) --
; confirm against the capability-vector layout in use.
;-----------------------------------------------------------------------
%define RSAZ_AVX512IFMA_CAPS 0x80230000 ; bits 31, 21, 17 and 16

global ossl_rsaz_avx512ifma_eligible

ALIGN 32
ossl_rsaz_avx512ifma_eligible:
        mov ecx,DWORD[((OPENSSL_ia32cap_P+8))]
        xor eax,eax                     ; default answer: not eligible
        and ecx,RSAZ_AVX512IFMA_CAPS    ; keep only the required bits
        cmp ecx,RSAZ_AVX512IFMA_CAPS    ; are all of them present?
        cmove eax,ecx                   ; yes -> return the (non-zero) mask
        DB 0F3h,0C3h                    ;repret
16
section .text code align=64

;-----------------------------------------------------------------------
; RSAZ_AMM52X20_X1_STEP off
;
; One multiply-and-reduce step of ossl_rsaz_amm52x20_x1_256, consuming the
; qword at [r11 + off].  The original unrolled loop repeated this exact
; instruction sequence four times; only the displacement differed.
;
; Register roles (set up by the function before the loop):
;   rsi = a, rcx = m, r11 = cursor into b, r8 = k0, rax = 2^52 - 1
;   r9  = scalar accumulator for the lowest digit
;   ymm1, ymm16..ymm19 = 20-lane vector accumulator (4 qword lanes each)
;   ymm0 = all zeroes (shifted in at the top of the accumulator)
; Scratch: rdx, r10, r12, r13, ymm3, ymm4, flags
;-----------------------------------------------------------------------
%macro RSAZ_AMM52X20_X1_STEP 1
        mov r13,QWORD[%1+r11]           ; r13 = current qword of b

        vpbroadcastq ymm3,r13           ; ymm3 = that value in every lane
        mov rdx,QWORD[rsi]
        mulx r12,r13,r13                ; r12:r13 = a[0] * r13
        add r9,r13
        mov r10,r12
        adc r10,0                       ; r10:r9 = old r9 + a[0]*b[i]

        mov r13,r8
        imul r13,r9
        and r13,rax                     ; r13 = (k0 * r9) mod 2^52

        vpbroadcastq ymm4,r13           ; ymm4 = reduction multiplier, all lanes
        mov rdx,QWORD[rcx]
        mulx r12,r13,r13                ; r12:r13 = m[0] * r13
        add r9,r13
        adc r10,r12                     ; r10:r9 += m[0] * multiplier

        shr r9,52
        sal r10,12
        or r9,r10                       ; r9 = low 64 bits of (r10:r9) >> 52

        ; Accumulate the low 52-bit halves of a[]*b[i] ...
        vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
        vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]

        ; ... and of m[]*multiplier.
        vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
        vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]

        ; Shift the whole accumulator down by one qword lane; the lowest
        ; lane is dropped and a zero lane enters at the top.
        valignq ymm1,ymm16,ymm1,1
        valignq ymm16,ymm17,ymm16,1
        valignq ymm17,ymm18,ymm17,1
        valignq ymm18,ymm19,ymm18,1
        valignq ymm19,ymm0,ymm19,1

        vmovq r13,xmm1
        add r9,r13                      ; fold the new lowest lane into r9

        ; Accumulate the high 52-bit halves at the shifted position.
        vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
        vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
        vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
        vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
        vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]

        vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
        vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
        vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
        vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
        vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
%endmacro

;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x1_256
;
; Multiply-and-reduce over 20 qword digits (5 ymm registers x 4 lanes),
; each digit holding 52 significant bits, using AVX-512 IFMA instructions
; on 256-bit registers.  The result is carry-normalized back to 52-bit
; digits and stored as 160 bytes.
;
; ABI:   Win64.  Arguments arrive in rcx, rdx, r8, r9 and [rsp+40] and are
;        moved to rdi, rsi, rdx, rcx, r8 for the body:
;          rdi = result (20 qwords written)
;          rsi = a      (20 qwords read)
;          rdx = b      (20 qwords read, one per step)
;          rcx = m      (20 qwords read)
;          r8  = k0     (used by value)
;        NOTE(review): upstream OpenSSL declares this as
;          void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res, const BN_ULONG *a,
;              const BN_ULONG *b, const BN_ULONG *m, BN_ULONG k0);
;        confirm against the C caller.
; Saves: rbx, rbp, r12-r15 (pushed), rdi/rsi (caller's home slots)
; Clobb: rax, rcx, rdx, r8-r11, ymm0, ymm1, ymm3, ymm4, ymm16-ymm19,
;        ymm24-ymm28, k1-k5, flags
;-----------------------------------------------------------------------
global ossl_rsaz_amm52x20_x1_256

ALIGN 32
ossl_rsaz_amm52x20_x1_256:
        mov QWORD[8+rsp],rdi            ;WIN64 prologue
        mov QWORD[16+rsp],rsi
        mov rax,rsp                     ; entry rsp; rsaz_def_handler reads it from
                                        ; the saved Rax while Rip is in the prologue
$L$SEH_begin_ossl_rsaz_amm52x20_x1_256:
        mov rdi,rcx
        mov rsi,rdx
        mov rdx,r8
        mov rcx,r9
        mov r8,QWORD[40+rsp]            ; 5th argument lives on the stack



DB 243,15,30,250                        ; endbr64
        push rbx

        push rbp

        push r12

        push r13

        push r14

        push r15

$L$rsaz_amm52x20_x1_256_body:


        ; Clear the vector accumulator; ymm0 stays zero for the whole function.
        vpxord ymm0,ymm0,ymm0
        vmovdqa64 ymm1,ymm0
        vmovdqa64 ymm16,ymm0
        vmovdqa64 ymm17,ymm0
        vmovdqa64 ymm18,ymm0
        vmovdqa64 ymm19,ymm0

        xor r9d,r9d                     ; scalar accumulator = 0

        mov r11,rdx                     ; r11 = cursor into b (rdx is needed by mulx)
        mov rax,0xfffffffffffff         ; 52-bit digit mask


        mov ebx,5                       ; 5 iterations x 4 steps = 20 digits

ALIGN 32
$L$loop5:
        RSAZ_AMM52X20_X1_STEP 0
        RSAZ_AMM52X20_X1_STEP 8
        RSAZ_AMM52X20_X1_STEP 16
        RSAZ_AMM52X20_X1_STEP 24
        lea r11,[32+r11]                ; advance past the four digits just used
        dec ebx
        jne NEAR $L$loop5

        vmovdqa64 ymm4,YMMWORD[$L$mask52x4]     ; ymm4 = 2^52 - 1 in every lane

        ; Put the scalar accumulator back into the lowest lane of ymm1.
        vpbroadcastq ymm3,r9
        vpblendd ymm1,ymm1,ymm3,3



        ; ---- Normalize the accumulator to 52-bit digits ----
        ; Carries = bits above bit 51 of every lane.
        vpsrlq ymm24,ymm1,52
        vpsrlq ymm25,ymm16,52
        vpsrlq ymm26,ymm17,52
        vpsrlq ymm27,ymm18,52
        vpsrlq ymm28,ymm19,52


        ; Move every carry up by one lane (zero enters at lane 0).
        valignq ymm28,ymm28,ymm27,3
        valignq ymm27,ymm27,ymm26,3
        valignq ymm26,ymm26,ymm25,3
        valignq ymm25,ymm25,ymm24,3
        valignq ymm24,ymm24,ymm0,3


        ; Keep the low 52 bits of each lane ...
        vpandq ymm1,ymm1,ymm4
        vpandq ymm16,ymm16,ymm4
        vpandq ymm17,ymm17,ymm4
        vpandq ymm18,ymm18,ymm4
        vpandq ymm19,ymm19,ymm4


        ; ... and add the carry from the lane below.
        vpaddq ymm1,ymm1,ymm24
        vpaddq ymm16,ymm16,ymm25
        vpaddq ymm17,ymm17,ymm26
        vpaddq ymm18,ymm18,ymm27
        vpaddq ymm19,ymm19,ymm28



        ; Lanes that now exceed the mask produce a carry (predicate 1:
        ; mask < lane, unsigned).
        vpcmpuq k1,ymm4,ymm1,1
        vpcmpuq k2,ymm4,ymm16,1
        vpcmpuq k3,ymm4,ymm17,1
        vpcmpuq k4,ymm4,ymm18,1
        vpcmpuq k5,ymm4,ymm19,1
        kmovb r14d,k1
        kmovb r13d,k2
        kmovb r12d,k3
        kmovb r11d,k4
        kmovb r10d,k5


        ; Lanes equal to the mask pass an incoming carry on (predicate 0:
        ; mask == lane).
        vpcmpuq k1,ymm4,ymm1,0
        vpcmpuq k2,ymm4,ymm16,0
        vpcmpuq k3,ymm4,ymm17,0
        vpcmpuq k4,ymm4,ymm18,0
        vpcmpuq k5,ymm4,ymm19,0
        kmovb r9d,k1
        kmovb r8d,k2
        kmovb ebx,k3
        kmovb ecx,k4
        kmovb edx,k5



        ; Pack the five 4-bit "carry out" masks into r14b:r12b:r10b ...
        shl r13b,4
        or r14b,r13b
        shl r11b,4
        or r12b,r11b

        ; ... and shift that bit string left by one lane.
        add r14b,r14b
        adc r12b,r12b
        adc r10b,r10b

        ; Pack the "equal to mask" bits the same way into r9b:bl:dl.
        shl r8b,4
        or r9b,r8b
        shl cl,4
        or bl,cl

        ; Ripple the carries through the "equal" lanes with a scalar add,
        ; then XOR to leave one bit per lane that must be incremented.
        add r14b,r9b
        adc r12b,bl
        adc r10b,dl

        xor r14b,r9b
        xor r12b,bl
        xor r10b,dl

        ; Unpack the result into one 4-bit write mask per register.
        kmovb k1,r14d
        shr r14b,4
        kmovb k2,r14d
        kmovb k3,r12d
        shr r12b,4
        kmovb k4,r12d
        kmovb k5,r10d


        ; Subtracting 2^52 - 1 and masking afterwards is +1 modulo 2^52.
        vpsubq ymm1{k1},ymm1,ymm4
        vpsubq ymm16{k2},ymm16,ymm4
        vpsubq ymm17{k3},ymm17,ymm4
        vpsubq ymm18{k4},ymm18,ymm4
        vpsubq ymm19{k5},ymm19,ymm4

        vpandq ymm1,ymm1,ymm4
        vpandq ymm16,ymm16,ymm4
        vpandq ymm17,ymm17,ymm4
        vpandq ymm18,ymm18,ymm4
        vpandq ymm19,ymm19,ymm4

        ; Store the 20 result digits.
        vmovdqu64 YMMWORD[rdi],ymm1
        vmovdqu64 YMMWORD[32+rdi],ymm16
        vmovdqu64 YMMWORD[64+rdi],ymm17
        vmovdqu64 YMMWORD[96+rdi],ymm18
        vmovdqu64 YMMWORD[128+rdi],ymm19

        vzeroupper
        ; Reload the six pushed registers and release their 48 bytes.
        mov r15,QWORD[rsp]

        mov r14,QWORD[8+rsp]

        mov r13,QWORD[16+rsp]

        mov r12,QWORD[24+rsp]

        mov rbp,QWORD[32+rsp]

        mov rbx,QWORD[40+rsp]

        lea rsp,[48+rsp]

$L$rsaz_amm52x20_x1_256_epilogue:
        mov rdi,QWORD[8+rsp]            ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h                    ;repret

$L$SEH_end_ossl_rsaz_amm52x20_x1_256:
; 52-bit digit mask (2^52 - 1) replicated across four qword lanes.
; It is only ever read, and it is loaded with the aligned form vmovdqa64,
; so it lives in read-only data and the section itself is declared with
; the 32-byte alignment the load requires.
section .rdata rdata align=32

ALIGN 32
$L$mask52x4:
        DQ 0xfffffffffffff
        DQ 0xfffffffffffff
        DQ 0xfffffffffffff
        DQ 0xfffffffffffff
section .text code align=64

;-----------------------------------------------------------------------
; RSAZ_AMM52X20_X2_STEP off, acc, k0off, xacc0, y0, y1, y2, y3, y4
;
; One multiply-and-reduce step for one of the two independent operand
; sets handled by ossl_rsaz_amm52x20_x2_256.
;   %1 = byte offset of the operand set inside a (rsi), m (rcx) and the
;        b cursor (r11): 0 for the first set, 160 for the second
;   %2 = 64-bit scalar accumulator for this set (r9 or r15)
;   %3 = byte offset of this set's k0 inside the array at r8
;   %4 = xmm view of the lowest accumulator register
;   %5..%9 = the five ymm accumulator registers, lowest first
; Expects rax = 2^52 - 1 and ymm0 = 0.
; Scratch: rdx, r10, r12, r13, ymm3, ymm4, flags
;-----------------------------------------------------------------------
%macro RSAZ_AMM52X20_X2_STEP 9
        mov r13,QWORD[%1+r11]           ; r13 = current qword of b

        vpbroadcastq ymm3,r13           ; ymm3 = that value in every lane
        mov rdx,QWORD[%1+rsi]
        mulx r12,r13,r13                ; r12:r13 = a[0] * r13
        add %2,r13
        mov r10,r12
        adc r10,0                       ; r10:acc = old acc + a[0]*b[i]

        mov r13,QWORD[%3+r8]
        imul r13,%2
        and r13,rax                     ; r13 = (k0 * acc) mod 2^52

        vpbroadcastq ymm4,r13           ; ymm4 = reduction multiplier, all lanes
        mov rdx,QWORD[%1+rcx]
        mulx r12,r13,r13                ; r12:r13 = m[0] * r13
        add %2,r13
        adc r10,r12                     ; r10:acc += m[0] * multiplier

        shr %2,52
        sal r10,12
        or %2,r10                       ; acc = low 64 bits of (r10:acc) >> 52

        ; Accumulate the low 52-bit halves of a[]*b[i] ...
        vpmadd52luq %5,ymm3,YMMWORD[%1+rsi]
        vpmadd52luq %6,ymm3,YMMWORD[%1+32+rsi]
        vpmadd52luq %7,ymm3,YMMWORD[%1+64+rsi]
        vpmadd52luq %8,ymm3,YMMWORD[%1+96+rsi]
        vpmadd52luq %9,ymm3,YMMWORD[%1+128+rsi]

        ; ... and of m[]*multiplier.
        vpmadd52luq %5,ymm4,YMMWORD[%1+rcx]
        vpmadd52luq %6,ymm4,YMMWORD[%1+32+rcx]
        vpmadd52luq %7,ymm4,YMMWORD[%1+64+rcx]
        vpmadd52luq %8,ymm4,YMMWORD[%1+96+rcx]
        vpmadd52luq %9,ymm4,YMMWORD[%1+128+rcx]

        ; Shift the accumulator down by one qword lane (zero enters on top).
        valignq %5,%6,%5,1
        valignq %6,%7,%6,1
        valignq %7,%8,%7,1
        valignq %8,%9,%8,1
        valignq %9,ymm0,%9,1

        vmovq r13,%4
        add %2,r13                      ; fold the new lowest lane into acc

        ; Accumulate the high 52-bit halves at the shifted position.
        vpmadd52huq %5,ymm3,YMMWORD[%1+rsi]
        vpmadd52huq %6,ymm3,YMMWORD[%1+32+rsi]
        vpmadd52huq %7,ymm3,YMMWORD[%1+64+rsi]
        vpmadd52huq %8,ymm3,YMMWORD[%1+96+rsi]
        vpmadd52huq %9,ymm3,YMMWORD[%1+128+rsi]

        vpmadd52huq %5,ymm4,YMMWORD[%1+rcx]
        vpmadd52huq %6,ymm4,YMMWORD[%1+32+rcx]
        vpmadd52huq %7,ymm4,YMMWORD[%1+64+rcx]
        vpmadd52huq %8,ymm4,YMMWORD[%1+96+rcx]
        vpmadd52huq %9,ymm4,YMMWORD[%1+128+rcx]
%endmacro

;-----------------------------------------------------------------------
; RSAZ_AMM52X20_X2_NORMALIZE acc, y0, y1, y2, y3, y4
;
; Merges the scalar accumulator %1 into the lowest lane of %2 and then
; normalizes the 20-lane accumulator %2..%6 to 52-bit digits.
; Expects ymm4 = 2^52 - 1 in every lane and ymm0 = 0.
; Scratch: ymm3, ymm24-ymm28, k1-k5, r8-r14, rbx, rcx, rdx, flags
; (r15 and rdi are not touched.)
;-----------------------------------------------------------------------
%macro RSAZ_AMM52X20_X2_NORMALIZE 6
        vpbroadcastq ymm3,%1
        vpblendd %2,%2,ymm3,3           ; lowest lane = scalar accumulator



        ; Carries = bits above bit 51 of every lane.
        vpsrlq ymm24,%2,52
        vpsrlq ymm25,%3,52
        vpsrlq ymm26,%4,52
        vpsrlq ymm27,%5,52
        vpsrlq ymm28,%6,52


        ; Move every carry up by one lane (zero enters at lane 0).
        valignq ymm28,ymm28,ymm27,3
        valignq ymm27,ymm27,ymm26,3
        valignq ymm26,ymm26,ymm25,3
        valignq ymm25,ymm25,ymm24,3
        valignq ymm24,ymm24,ymm0,3


        ; Keep the low 52 bits of each lane ...
        vpandq %2,%2,ymm4
        vpandq %3,%3,ymm4
        vpandq %4,%4,ymm4
        vpandq %5,%5,ymm4
        vpandq %6,%6,ymm4


        ; ... and add the carry from the lane below.
        vpaddq %2,%2,ymm24
        vpaddq %3,%3,ymm25
        vpaddq %4,%4,ymm26
        vpaddq %5,%5,ymm27
        vpaddq %6,%6,ymm28



        ; Lanes that now exceed the mask produce a carry (mask < lane).
        vpcmpuq k1,ymm4,%2,1
        vpcmpuq k2,ymm4,%3,1
        vpcmpuq k3,ymm4,%4,1
        vpcmpuq k4,ymm4,%5,1
        vpcmpuq k5,ymm4,%6,1
        kmovb r14d,k1
        kmovb r13d,k2
        kmovb r12d,k3
        kmovb r11d,k4
        kmovb r10d,k5


        ; Lanes equal to the mask pass an incoming carry on (mask == lane).
        vpcmpuq k1,ymm4,%2,0
        vpcmpuq k2,ymm4,%3,0
        vpcmpuq k3,ymm4,%4,0
        vpcmpuq k4,ymm4,%5,0
        vpcmpuq k5,ymm4,%6,0
        kmovb r9d,k1
        kmovb r8d,k2
        kmovb ebx,k3
        kmovb ecx,k4
        kmovb edx,k5



        ; Pack the "carry out" masks into r14b:r12b:r10b ...
        shl r13b,4
        or r14b,r13b
        shl r11b,4
        or r12b,r11b

        ; ... and shift that bit string left by one lane.
        add r14b,r14b
        adc r12b,r12b
        adc r10b,r10b

        ; Pack the "equal to mask" bits into r9b:bl:dl.
        shl r8b,4
        or r9b,r8b
        shl cl,4
        or bl,cl

        ; Ripple carries through the "equal" lanes, then XOR to leave one
        ; bit per lane that must be incremented.
        add r14b,r9b
        adc r12b,bl
        adc r10b,dl

        xor r14b,r9b
        xor r12b,bl
        xor r10b,dl

        ; Unpack into one 4-bit write mask per register.
        kmovb k1,r14d
        shr r14b,4
        kmovb k2,r14d
        kmovb k3,r12d
        shr r12b,4
        kmovb k4,r12d
        kmovb k5,r10d


        ; Subtracting 2^52 - 1 and masking afterwards is +1 modulo 2^52.
        vpsubq %2{k1},%2,ymm4
        vpsubq %3{k2},%3,ymm4
        vpsubq %4{k3},%4,ymm4
        vpsubq %5{k4},%5,ymm4
        vpsubq %6{k5},%6,ymm4

        vpandq %2,%2,ymm4
        vpandq %3,%3,ymm4
        vpandq %4,%4,ymm4
        vpandq %5,%5,ymm4
        vpandq %6,%6,ymm4
%endmacro

;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x2_256
;
; Two independent 20-digit multiply-and-reduce operations, interleaved
; step by step.  The second operand set starts 160 bytes after the first
; in a, b, m and the result; k0 is read from memory, one qword per set.
;
; ABI:   Win64.  Arguments arrive in rcx, rdx, r8, r9 and [rsp+40] and are
;        moved to rdi, rsi, rdx, rcx, r8 for the body:
;          rdi = result (2 x 20 qwords written)
;          rsi = a      (2 x 20 qwords read)
;          rdx = b      (2 x 20 qwords read)
;          rcx = m      (2 x 20 qwords read)
;          r8  = pointer to two qwords: k0 for set 0, k0 for set 1
;        NOTE(review): upstream OpenSSL declares this as
;          void ossl_rsaz_amm52x20_x2_256(BN_ULONG *out, const BN_ULONG *a,
;              const BN_ULONG *b, const BN_ULONG *m, const BN_ULONG k0[2]);
;        confirm against the C caller.
; Saves: rbx, rbp, r12-r15 (pushed), rdi/rsi (caller's home slots)
; Clobb: rax, rcx, rdx, r8-r11, ymm0-ymm4, ymm16-ymm28, k1-k5, flags
;-----------------------------------------------------------------------
global ossl_rsaz_amm52x20_x2_256

ALIGN 32
ossl_rsaz_amm52x20_x2_256:
        mov QWORD[8+rsp],rdi            ;WIN64 prologue
        mov QWORD[16+rsp],rsi
        mov rax,rsp                     ; entry rsp; rsaz_def_handler reads it from
                                        ; the saved Rax while Rip is in the prologue
$L$SEH_begin_ossl_rsaz_amm52x20_x2_256:
        mov rdi,rcx
        mov rsi,rdx
        mov rdx,r8
        mov rcx,r9
        mov r8,QWORD[40+rsp]            ; 5th argument lives on the stack



DB 243,15,30,250                        ; endbr64
        push rbx

        push rbp

        push r12

        push r13

        push r14

        push r15

$L$rsaz_amm52x20_x2_256_body:


        ; Clear both vector accumulators; ymm0 stays zero throughout.
        vpxord ymm0,ymm0,ymm0
        vmovdqa64 ymm1,ymm0
        vmovdqa64 ymm16,ymm0
        vmovdqa64 ymm17,ymm0
        vmovdqa64 ymm18,ymm0
        vmovdqa64 ymm19,ymm0
        vmovdqa64 ymm2,ymm0
        vmovdqa64 ymm20,ymm0
        vmovdqa64 ymm21,ymm0
        vmovdqa64 ymm22,ymm0
        vmovdqa64 ymm23,ymm0

        xor r9d,r9d                     ; scalar accumulator, set 0
        xor r15d,r15d                   ; scalar accumulator, set 1

        mov r11,rdx                     ; r11 = cursor into b (rdx is needed by mulx)
        mov rax,0xfffffffffffff         ; 52-bit digit mask

        mov ebx,20                      ; one iteration per digit

ALIGN 32
$L$loop20:
        RSAZ_AMM52X20_X2_STEP 0,r9,0,xmm1,ymm1,ymm16,ymm17,ymm18,ymm19
        RSAZ_AMM52X20_X2_STEP 160,r15,8,xmm2,ymm2,ymm20,ymm21,ymm22,ymm23
        lea r11,[8+r11]                 ; next digit of b (both sets)
        dec ebx
        jne NEAR $L$loop20

        vmovdqa64 ymm4,YMMWORD[$L$mask52x4]     ; ymm4 = 2^52 - 1 in every lane

        ; Set 0 first: its normalization overwrites r9 but leaves r15 alone,
        ; so the scalar accumulator of set 1 is still valid afterwards.
        RSAZ_AMM52X20_X2_NORMALIZE r9,ymm1,ymm16,ymm17,ymm18,ymm19

        RSAZ_AMM52X20_X2_NORMALIZE r15,ymm2,ymm20,ymm21,ymm22,ymm23

        ; Store set 0 ...
        vmovdqu64 YMMWORD[rdi],ymm1
        vmovdqu64 YMMWORD[32+rdi],ymm16
        vmovdqu64 YMMWORD[64+rdi],ymm17
        vmovdqu64 YMMWORD[96+rdi],ymm18
        vmovdqu64 YMMWORD[128+rdi],ymm19

        ; ... and set 1 right behind it.
        vmovdqu64 YMMWORD[160+rdi],ymm2
        vmovdqu64 YMMWORD[192+rdi],ymm20
        vmovdqu64 YMMWORD[224+rdi],ymm21
        vmovdqu64 YMMWORD[256+rdi],ymm22
        vmovdqu64 YMMWORD[288+rdi],ymm23

        vzeroupper
        ; Reload the six pushed registers and release their 48 bytes.
        mov r15,QWORD[rsp]

        mov r14,QWORD[8+rsp]

        mov r13,QWORD[16+rsp]

        mov r12,QWORD[24+rsp]

        mov rbp,QWORD[32+rsp]

        mov rbx,QWORD[40+rsp]

        lea rsp,[48+rsp]

$L$rsaz_amm52x20_x2_256_epilogue:
        mov rdi,QWORD[8+rsp]            ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h                    ;repret

$L$SEH_end_ossl_rsaz_amm52x20_x2_256:
section .text code align=64

;-----------------------------------------------------------------------
; ossl_extract_multiplier_2x20_win5
;
; Copies one 160-byte element (20 qwords) out of a table of 32 entries
; spaced 320 bytes apart.  Every entry is loaded on every call; the wanted
; one is kept by a masked blend against a running counter, so the sequence
; of memory accesses does not depend on the selected row.
;
; ABI:   Win64.  Arguments arrive in rcx, rdx, r8, r9 and are moved to
;        rdi, rsi, rdx, rcx for the body:
;          rdi = destination (160 bytes written)
;          rsi = table base
;          rdx = row to select; compared with the counter 0..31
;          rcx = element within a row; scaled by 160 bytes
;        NOTE(review): rdx and rcx are used as full 64-bit values.  If the
;        C prototype passes them as 32-bit ints, the upper register halves
;        are not guaranteed to be zero -- confirm against the caller.
; Out:   nothing; if rdx matches no counter value the destination is
;        filled with zeroes
; Saves: rdi/rsi (caller's home slots)
; Clobb: rax, rcx, rdx, ymm0-ymm4, ymm16-ymm23, k1, flags
;-----------------------------------------------------------------------
ALIGN 32
global ossl_extract_multiplier_2x20_win5

ossl_extract_multiplier_2x20_win5:
        mov QWORD[8+rsp],rdi            ;WIN64 prologue
        mov QWORD[16+rsp],rsi
        mov rax,rsp
$L$SEH_begin_ossl_extract_multiplier_2x20_win5:
        mov rdi,rcx
        mov rsi,rdx
        mov rdx,r8
        mov rcx,r9



DB 243,15,30,250                        ; endbr64
        lea rax,[rcx*4+rcx]
        sal rax,5                       ; rax = rcx * 160
        add rsi,rax                     ; rsi = first candidate element

        vmovdqa64 ymm23,YMMWORD[$L$ones]        ; counter increment (1 per lane)
        vpbroadcastq ymm22,rdx          ; wanted row in every lane
        lea rax,[10240+rsi]             ; end of scan = 32 entries * 320 bytes

        ; Result registers and the row counter all start at zero.
        vpxor xmm4,xmm4,xmm4
        vmovdqa64 ymm3,ymm4
        vmovdqa64 ymm2,ymm4
        vmovdqa64 ymm1,ymm4
        vmovdqa64 ymm0,ymm4
        vmovdqa64 ymm21,ymm4

ALIGN 32
$L$loop:
        vpcmpq k1,ymm22,ymm21,0         ; k1 = all ones when counter == wanted row
        add rsi,320
        vpaddq ymm21,ymm21,ymm23        ; counter++
        vmovdqu64 ymm16,YMMWORD[((-320))+rsi]
        vmovdqu64 ymm17,YMMWORD[((-288))+rsi]
        vmovdqu64 ymm18,YMMWORD[((-256))+rsi]
        vmovdqu64 ymm19,YMMWORD[((-224))+rsi]
        vmovdqu64 ymm20,YMMWORD[((-192))+rsi]
        ; Take the freshly loaded element where k1 is set, otherwise keep
        ; what has been collected so far.
        vpblendmq ymm0{k1},ymm0,ymm16
        vpblendmq ymm1{k1},ymm1,ymm17
        vpblendmq ymm2{k1},ymm2,ymm18
        vpblendmq ymm3{k1},ymm3,ymm19
        vpblendmq ymm4{k1},ymm4,ymm20
        cmp rax,rsi
        jne NEAR $L$loop

        vmovdqu64 YMMWORD[rdi],ymm0
        vmovdqu64 YMMWORD[32+rdi],ymm1
        vmovdqu64 YMMWORD[64+rdi],ymm2
        vmovdqu64 YMMWORD[96+rdi],ymm3
        vmovdqu64 YMMWORD[128+rdi],ymm4

        vzeroupper                      ; leave no dirty upper ymm state behind,
                                        ; as the other routines in this file do

        mov rdi,QWORD[8+rsp]            ;WIN64 epilogue
        mov rsi,QWORD[16+rsp]
        DB 0F3h,0C3h                    ;repret

$L$SEH_end_ossl_extract_multiplier_2x20_win5:
; The value 1 in each of four qword lanes; used as the per-iteration
; increment of the row counter in ossl_extract_multiplier_2x20_win5.
; It is only ever read, and it is loaded with the aligned form vmovdqa64,
; so it lives in read-only data and the section itself is declared with
; the 32-byte alignment the load requires.
section .rdata rdata align=32

ALIGN 32
$L$ones:
        DQ 1,1,1,1
EXTERN __imp_RtlVirtualUnwind

;-----------------------------------------------------------------------
; rsaz_def_handler
;
; Win64 language-specific exception handler shared by the routines in this
; file (referenced from their .xdata entries).  It repairs the interrupted
; frame's Rsp, Rsi, Rdi and -- when the fault lies between the "body" and
; "epilogue" labels -- the six pushed non-volatile registers in the
; CONTEXT record, then lets RtlVirtualUnwind continue the unwind.
;
; In:    rcx = exception record (unused), rdx = establisher frame (unused)
;        r8  = CONTEXT *, r9 = DISPATCHER_CONTEXT *
;        Handler data (at [r9+56]): two image-relative dwords, the body
;        label and the epilogue label of the faulting routine.
; Out:   eax = 1 (ExceptionContinueSearch)
;
; CONTEXT offsets used:  120 Rax, 144 Rbx, 152 Rsp, 160 Rbp, 168 Rsi,
;                        176 Rdi, 216..240 R12..R15, 248 Rip
; DISPATCHER_CONTEXT:    0 ControlPc, 8 ImageBase, 16 FunctionEntry,
;                        24 EstablisherFrame, 40 ContextRecord,
;                        56 HandlerData
; (Offsets per the Windows x64 headers -- see the Microsoft x64 exception
; handling documentation.)
;-----------------------------------------------------------------------
ALIGN 16
rsaz_def_handler:
        push rsi
        push rdi
        push rbx
        push rbp
        push r12
        push r13
        push r14
        push r15
        pushfq
        sub rsp,64                      ; outgoing-argument area for the call below

        mov rax,QWORD[120+r8]           ; context->Rax (= entry rsp during prologue)
        mov rbx,QWORD[248+r8]           ; context->Rip

        mov rsi,QWORD[8+r9]             ; disp->ImageBase
        mov r11,QWORD[56+r9]            ; disp->HandlerData

        mov r10d,DWORD[r11]             ; HandlerData[0]: body label (RVA)
        lea r10,[r10*1+rsi]
        cmp rbx,r10                     ; Rip still in the prologue?
        jb NEAR $L$common_seh_tail      ; yes: rax already holds the entry rsp

        mov rax,QWORD[152+r8]           ; context->Rsp

        mov r10d,DWORD[4+r11]           ; HandlerData[1]: epilogue label (RVA)
        lea r10,[r10*1+rsi]
        cmp rbx,r10                     ; Rip at or past the epilogue label?
        jae NEAR $L$common_seh_tail     ; yes: pushes already undone, Rsp is right

        lea rax,[48+rax]                ; step over the six pushed registers

        mov rbx,QWORD[((-8))+rax]
        mov rbp,QWORD[((-16))+rax]
        mov r12,QWORD[((-24))+rax]
        mov r13,QWORD[((-32))+rax]
        mov r14,QWORD[((-40))+rax]
        mov r15,QWORD[((-48))+rax]
        mov QWORD[144+r8],rbx           ; restore the saved values in the context
        mov QWORD[160+r8],rbp
        mov QWORD[216+r8],r12
        mov QWORD[224+r8],r13
        mov QWORD[232+r8],r14
        mov QWORD[240+r8],r15

$L$common_seh_tail:
        ; rax = rsp as it was on entry to the faulting routine; rdi and rsi
        ; were parked in the caller's home slots by the WIN64 prologue.
        mov rdi,QWORD[8+rax]
        mov rsi,QWORD[16+rax]
        mov QWORD[152+r8],rax           ; context->Rsp
        mov QWORD[168+r8],rsi           ; context->Rsi
        mov QWORD[176+r8],rdi           ; context->Rdi

        ; Copy the fixed-up CONTEXT (154 qwords) into disp->ContextRecord.
        mov rdi,QWORD[40+r9]
        mov rsi,r8
        mov ecx,154
        cld
        rep movsq                       ; same bytes as the former DD 0xa548f3fc

        ; RtlVirtualUnwind(0, ImageBase, ControlPc, FunctionEntry,
        ;                  ContextRecord, &HandlerData, &EstablisherFrame, NULL)
        mov rsi,r9
        xor rcx,rcx                     ; HandlerType = 0
        mov rdx,QWORD[8+rsi]            ; ImageBase
        mov r8,QWORD[rsi]               ; ControlPc
        mov r9,QWORD[16+rsi]            ; FunctionEntry
        mov r10,QWORD[40+rsi]           ; ContextRecord
        lea r11,[56+rsi]                ; &HandlerData
        lea r12,[24+rsi]                ; &EstablisherFrame
        mov QWORD[32+rsp],r10
        mov QWORD[40+rsp],r11
        mov QWORD[48+rsp],r12
        mov QWORD[56+rsp],rcx           ; ContextPointers = NULL
        call QWORD[__imp_RtlVirtualUnwind]

        mov eax,1                       ; ExceptionContinueSearch
        add rsp,64
        popfq
        pop r15
        pop r14
        pop r13
        pop r12
        pop rbp
        pop rbx
        pop rdi
        pop rsi
        DB 0F3h,0C3h                    ;repret
1002
1003
; Win64 function table.  One entry per routine, each made of three
; image-relative dwords: start address, end address, unwind info.
section .pdata rdata align=4
ALIGN 4
        DD $L$SEH_begin_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
        DD $L$SEH_end_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
        DD $L$SEH_info_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase

        DD $L$SEH_begin_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
        DD $L$SEH_end_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
        DD $L$SEH_info_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase

        DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
        DD $L$SEH_end_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
        DD $L$SEH_info_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
1017
; Win64 unwind information.  Each record is:
;   DB 9,0,0,0   header: version 1 with the exception-handler flag set
;                (1 | 1<<3 = 9), zero-length prologue, no unwind codes,
;                no frame register
;   DD handler   image-relative address of rsaz_def_handler
;   DD a, b      handler data: the two image-relative labels that
;                rsaz_def_handler compares the faulting Rip against
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_ossl_rsaz_amm52x20_x1_256:
DB 9,0,0,0
        DD rsaz_def_handler wrt ..imagebase
        DD $L$rsaz_amm52x20_x1_256_body wrt ..imagebase,$L$rsaz_amm52x20_x1_256_epilogue wrt ..imagebase
$L$SEH_info_ossl_rsaz_amm52x20_x2_256:
DB 9,0,0,0
        DD rsaz_def_handler wrt ..imagebase
        DD $L$rsaz_amm52x20_x2_256_body wrt ..imagebase,$L$rsaz_amm52x20_x2_256_epilogue wrt ..imagebase
; This routine pushes nothing, so both labels are its start address: the
; handler then only restores Rsp, Rsi and Rdi.
$L$SEH_info_ossl_extract_multiplier_2x20_win5:
DB 9,0,0,0
        DD rsaz_def_handler wrt ..imagebase
        DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase,$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
; Note: See TracBrowser for help on using the repository browser.
;
; (c) 2024 Oracle. Support | Privacy / Do Not Sell My Info | Terms of Use | Trademark Policy | Automated Access Etiquette