Changeset 100971 in vbox for trunk/src/libs/openssl-3.1.0/crypto/genasm-nasm/rsaz-2k-avx512.S
- Timestamp:
- Aug 25, 2023 1:16:51 PM (18 months ago)
- svn:sync-xref-src-repo-rev:
- 158912
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/libs/openssl-3.1.0/crypto/genasm-nasm/rsaz-2k-avx512.S
r100939 r100971 1 2 .globl ossl_rsaz_avx512ifma_eligible 3 .type ossl_rsaz_avx512ifma_eligible,@function 4 .align 32 5 ossl_rsaz_avx512ifma_eligible: 6 movl OPENSSL_ia32cap_P+8(%rip),%ecx 7 xorl %eax,%eax 8 andl $2149777408,%ecx 9 cmpl $2149777408,%ecx 10 cmovel %ecx,%eax 11 .byte 0xf3,0xc3 12 .size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible 13 .text 14 15 .globl ossl_rsaz_amm52x20_x1_ifma256 16 .type ossl_rsaz_amm52x20_x1_ifma256,@function 17 .align 32 18 ossl_rsaz_amm52x20_x1_ifma256: 19 .cfi_startproc 20 .byte 243,15,30,250 21 pushq %rbx 22 .cfi_adjust_cfa_offset 8 23 .cfi_offset %rbx,-16 24 pushq %rbp 25 .cfi_adjust_cfa_offset 8 26 .cfi_offset %rbp,-24 27 pushq %r12 28 .cfi_adjust_cfa_offset 8 29 .cfi_offset %r12,-32 30 pushq %r13 31 .cfi_adjust_cfa_offset 8 32 .cfi_offset %r13,-40 33 pushq %r14 34 .cfi_adjust_cfa_offset 8 35 .cfi_offset %r14,-48 36 pushq %r15 37 .cfi_adjust_cfa_offset 8 38 .cfi_offset %r15,-56 39 .Lossl_rsaz_amm52x20_x1_ifma256_body: 1 default rel 2 %define XMMWORD 3 %define YMMWORD 4 %define ZMMWORD 5 section .text code align=64 40 6 41 7 42 vpxord %ymm0,%ymm0,%ymm0 43 vmovdqa64 %ymm0,%ymm3 44 vmovdqa64 %ymm0,%ymm16 45 vmovdqa64 %ymm0,%ymm17 46 vmovdqa64 %ymm0,%ymm18 47 vmovdqa64 %ymm0,%ymm19 8 global ossl_rsaz_avx512ifma_eligible 48 9 49 xorl %r9d,%r9d 50 51 movq %rdx,%r11 52 movq $0xfffffffffffff,%rax 10 ossl_rsaz_avx512ifma_eligible: 11 xor eax,eax 12 DB 0F3h,0C3h ;repret 53 13 54 14 55 movl $5,%ebx 15 global ossl_rsaz_amm52x20_x1_ifma256 16 global ossl_rsaz_amm52x20_x2_ifma256 17 global ossl_extract_multiplier_2x20_win5 56 18 57 .align 32 58 .Lloop5: 59 movq 0(%r11),%r13 19 ossl_rsaz_amm52x20_x1_ifma256: 20 ossl_rsaz_amm52x20_x2_ifma256: 21 ossl_extract_multiplier_2x20_win5: 22 DB 0x0f,0x0b 23 DB 0F3h,0C3h ;repret 60 24 61 vpbroadcastq %r13,%ymm162 movq 0(%rsi),%rdx63 mulxq %r13,%r13,%r1264 addq %r13,%r965 movq %r12,%r1066 adcq $0,%r1067 68 movq %r8,%r1369 imulq %r9,%r1370 andq %rax,%r1371 72 vpbroadcastq %r13,%ymm273 movq 0(%rcx),%rdx74 mulxq %r13,%r13,%r1275 addq %r13,%r976 adcq %r12,%r1077 78 shrq $52,%r979 salq $12,%r1080 orq %r10,%r981 82 vpmadd52luq 0(%rsi),%ymm1,%ymm383 vpmadd52luq 32(%rsi),%ymm1,%ymm1684 vpmadd52luq 64(%rsi),%ymm1,%ymm1785 vpmadd52luq 96(%rsi),%ymm1,%ymm1886 vpmadd52luq 128(%rsi),%ymm1,%ymm1987 88 vpmadd52luq 0(%rcx),%ymm2,%ymm389 vpmadd52luq 32(%rcx),%ymm2,%ymm1690 vpmadd52luq 64(%rcx),%ymm2,%ymm1791 vpmadd52luq 96(%rcx),%ymm2,%ymm1892 vpmadd52luq 128(%rcx),%ymm2,%ymm1993 94 95 valignq $1,%ymm3,%ymm16,%ymm396 valignq $1,%ymm16,%ymm17,%ymm1697 valignq $1,%ymm17,%ymm18,%ymm1798 valignq $1,%ymm18,%ymm19,%ymm1899 valignq $1,%ymm19,%ymm0,%ymm19100 101 vmovq %xmm3,%r13102 addq %r13,%r9103 104 vpmadd52huq 0(%rsi),%ymm1,%ymm3105 vpmadd52huq 32(%rsi),%ymm1,%ymm16106 vpmadd52huq 64(%rsi),%ymm1,%ymm17107 vpmadd52huq 96(%rsi),%ymm1,%ymm18108 vpmadd52huq 128(%rsi),%ymm1,%ymm19109 110 vpmadd52huq 0(%rcx),%ymm2,%ymm3111 vpmadd52huq 32(%rcx),%ymm2,%ymm16112 vpmadd52huq 64(%rcx),%ymm2,%ymm17113 vpmadd52huq 96(%rcx),%ymm2,%ymm18114 vpmadd52huq 128(%rcx),%ymm2,%ymm19115 movq 8(%r11),%r13116 117 vpbroadcastq %r13,%ymm1118 movq 0(%rsi),%rdx119 mulxq %r13,%r13,%r12120 addq %r13,%r9121 movq %r12,%r10122 adcq $0,%r10123 124 movq %r8,%r13125 imulq %r9,%r13126 andq %rax,%r13127 128 vpbroadcastq %r13,%ymm2129 movq 0(%rcx),%rdx130 mulxq %r13,%r13,%r12131 addq %r13,%r9132 adcq %r12,%r10133 134 shrq $52,%r9135 salq $12,%r10136 orq %r10,%r9137 138 vpmadd52luq 0(%rsi),%ymm1,%ymm3139 vpmadd52luq 32(%rsi),%ymm1,%ymm16140 vpmadd52luq 64(%rsi),%ymm1,%ymm17141 vpmadd52luq 96(%rsi),%ymm1,%ymm18142 vpmadd52luq 128(%rsi),%ymm1,%ymm19143 144 vpmadd52luq 0(%rcx),%ymm2,%ymm3145 vpmadd52luq 32(%rcx),%ymm2,%ymm16146 vpmadd52luq 64(%rcx),%ymm2,%ymm17147 vpmadd52luq 96(%rcx),%ymm2,%ymm18148 vpmadd52luq 128(%rcx),%ymm2,%ymm19149 150 151 valignq $1,%ymm3,%ymm16,%ymm3152 valignq $1,%ymm16,%ymm17,%ymm16153 valignq $1,%ymm17,%ymm18,%ymm17154 valignq $1,%ymm18,%ymm19,%ymm18155 valignq $1,%ymm19,%ymm0,%ymm19156 157 vmovq %xmm3,%r13158 addq %r13,%r9159 160 vpmadd52huq 0(%rsi),%ymm1,%ymm3161 vpmadd52huq 32(%rsi),%ymm1,%ymm16162 vpmadd52huq 64(%rsi),%ymm1,%ymm17163 vpmadd52huq 96(%rsi),%ymm1,%ymm18164 vpmadd52huq 128(%rsi),%ymm1,%ymm19165 166 vpmadd52huq 0(%rcx),%ymm2,%ymm3167 vpmadd52huq 32(%rcx),%ymm2,%ymm16168 vpmadd52huq 64(%rcx),%ymm2,%ymm17169 vpmadd52huq 96(%rcx),%ymm2,%ymm18170 vpmadd52huq 128(%rcx),%ymm2,%ymm19171 movq 16(%r11),%r13172 173 vpbroadcastq %r13,%ymm1174 movq 0(%rsi),%rdx175 mulxq %r13,%r13,%r12176 addq %r13,%r9177 movq %r12,%r10178 adcq $0,%r10179 180 movq %r8,%r13181 imulq %r9,%r13182 andq %rax,%r13183 184 vpbroadcastq %r13,%ymm2185 movq 0(%rcx),%rdx186 mulxq %r13,%r13,%r12187 addq %r13,%r9188 adcq %r12,%r10189 190 shrq $52,%r9191 salq $12,%r10192 orq %r10,%r9193 194 vpmadd52luq 0(%rsi),%ymm1,%ymm3195 vpmadd52luq 32(%rsi),%ymm1,%ymm16196 vpmadd52luq 64(%rsi),%ymm1,%ymm17197 vpmadd52luq 96(%rsi),%ymm1,%ymm18198 vpmadd52luq 128(%rsi),%ymm1,%ymm19199 200 vpmadd52luq 0(%rcx),%ymm2,%ymm3201 vpmadd52luq 32(%rcx),%ymm2,%ymm16202 vpmadd52luq 64(%rcx),%ymm2,%ymm17203 vpmadd52luq 96(%rcx),%ymm2,%ymm18204 vpmadd52luq 128(%rcx),%ymm2,%ymm19205 206 207 valignq $1,%ymm3,%ymm16,%ymm3208 valignq $1,%ymm16,%ymm17,%ymm16209 valignq $1,%ymm17,%ymm18,%ymm17210 valignq $1,%ymm18,%ymm19,%ymm18211 valignq $1,%ymm19,%ymm0,%ymm19212 213 vmovq %xmm3,%r13214 addq %r13,%r9215 216 vpmadd52huq 0(%rsi),%ymm1,%ymm3217 vpmadd52huq 32(%rsi),%ymm1,%ymm16218 vpmadd52huq 64(%rsi),%ymm1,%ymm17219 vpmadd52huq 96(%rsi),%ymm1,%ymm18220 vpmadd52huq 128(%rsi),%ymm1,%ymm19221 222 vpmadd52huq 0(%rcx),%ymm2,%ymm3223 vpmadd52huq 32(%rcx),%ymm2,%ymm16224 vpmadd52huq 64(%rcx),%ymm2,%ymm17225 vpmadd52huq 96(%rcx),%ymm2,%ymm18226 vpmadd52huq 128(%rcx),%ymm2,%ymm19227 movq 24(%r11),%r13228 229 vpbroadcastq %r13,%ymm1230 movq 0(%rsi),%rdx231 mulxq %r13,%r13,%r12232 addq %r13,%r9233 movq %r12,%r10234 adcq $0,%r10235 236 movq %r8,%r13237 imulq %r9,%r13238 andq %rax,%r13239 240 vpbroadcastq %r13,%ymm2241 movq 0(%rcx),%rdx242 mulxq %r13,%r13,%r12243 addq %r13,%r9244 adcq %r12,%r10245 246 shrq $52,%r9247 salq $12,%r10248 orq %r10,%r9249 250 vpmadd52luq 0(%rsi),%ymm1,%ymm3251 vpmadd52luq 32(%rsi),%ymm1,%ymm16252 vpmadd52luq 64(%rsi),%ymm1,%ymm17253 vpmadd52luq 96(%rsi),%ymm1,%ymm18254 vpmadd52luq 128(%rsi),%ymm1,%ymm19255 256 vpmadd52luq 0(%rcx),%ymm2,%ymm3257 vpmadd52luq 32(%rcx),%ymm2,%ymm16258 vpmadd52luq 64(%rcx),%ymm2,%ymm17259 vpmadd52luq 96(%rcx),%ymm2,%ymm18260 vpmadd52luq 128(%rcx),%ymm2,%ymm19261 262 263 valignq $1,%ymm3,%ymm16,%ymm3264 valignq $1,%ymm16,%ymm17,%ymm16265 valignq $1,%ymm17,%ymm18,%ymm17266 valignq $1,%ymm18,%ymm19,%ymm18267 valignq $1,%ymm19,%ymm0,%ymm19268 269 vmovq %xmm3,%r13270 addq %r13,%r9271 272 vpmadd52huq 0(%rsi),%ymm1,%ymm3273 vpmadd52huq 32(%rsi),%ymm1,%ymm16274 vpmadd52huq 64(%rsi),%ymm1,%ymm17275 vpmadd52huq 96(%rsi),%ymm1,%ymm18276 vpmadd52huq 128(%rsi),%ymm1,%ymm19277 278 vpmadd52huq 0(%rcx),%ymm2,%ymm3279 vpmadd52huq 32(%rcx),%ymm2,%ymm16280 vpmadd52huq 64(%rcx),%ymm2,%ymm17281 vpmadd52huq 96(%rcx),%ymm2,%ymm18282 vpmadd52huq 128(%rcx),%ymm2,%ymm19283 leaq 32(%r11),%r11284 decl %ebx285 jne .Lloop5286 287 vpbroadcastq %r9,%ymm0288 vpblendd $3,%ymm0,%ymm3,%ymm3289 290 291 292 vpsrlq $52,%ymm3,%ymm0293 vpsrlq $52,%ymm16,%ymm1294 vpsrlq $52,%ymm17,%ymm2295 vpsrlq $52,%ymm18,%ymm25296 vpsrlq $52,%ymm19,%ymm26297 298 299 valignq $3,%ymm25,%ymm26,%ymm26300 valignq $3,%ymm2,%ymm25,%ymm25301 valignq $3,%ymm1,%ymm2,%ymm2302 valignq $3,%ymm0,%ymm1,%ymm1303 valignq $3,.Lzeros(%rip),%ymm0,%ymm0304 305 306 vpandq .Lmask52x4(%rip),%ymm3,%ymm3307 vpandq .Lmask52x4(%rip),%ymm16,%ymm16308 vpandq .Lmask52x4(%rip),%ymm17,%ymm17309 vpandq .Lmask52x4(%rip),%ymm18,%ymm18310 vpandq .Lmask52x4(%rip),%ymm19,%ymm19311 312 313 vpaddq %ymm0,%ymm3,%ymm3314 vpaddq %ymm1,%ymm16,%ymm16315 vpaddq %ymm2,%ymm17,%ymm17316 vpaddq %ymm25,%ymm18,%ymm18317 vpaddq %ymm26,%ymm19,%ymm19318 319 320 321 vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1322 vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2323 vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k3324 vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k4325 vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k5326 kmovb %k1,%r14d327 kmovb %k2,%r13d328 kmovb %k3,%r12d329 kmovb %k4,%r11d330 kmovb %k5,%r10d331 332 333 vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1334 vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2335 vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k3336 vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k4337 vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k5338 kmovb %k1,%r9d339 kmovb %k2,%r8d340 kmovb %k3,%ebx341 kmovb %k4,%ecx342 kmovb %k5,%edx343 344 345 346 shlb $4,%r13b347 orb %r13b,%r14b348 shlb $4,%r11b349 orb %r11b,%r12b350 351 addb %r14b,%r14b352 adcb %r12b,%r12b353 adcb %r10b,%r10b354 355 shlb $4,%r8b356 orb %r8b,%r9b357 shlb $4,%cl358 orb %cl,%bl359 360 addb %r9b,%r14b361 adcb %bl,%r12b362 adcb %dl,%r10b363 364 xorb %r9b,%r14b365 xorb %bl,%r12b366 xorb %dl,%r10b367 368 kmovb %r14d,%k1369 shrb $4,%r14b370 kmovb %r14d,%k2371 kmovb %r12d,%k3372 shrb $4,%r12b373 kmovb %r12d,%k4374 kmovb %r10d,%k5375 376 377 vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}378 vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k2}379 vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k3}380 vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k4}381 vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k5}382 383 vpandq .Lmask52x4(%rip),%ymm3,%ymm3384 vpandq .Lmask52x4(%rip),%ymm16,%ymm16385 vpandq .Lmask52x4(%rip),%ymm17,%ymm17386 vpandq .Lmask52x4(%rip),%ymm18,%ymm18387 vpandq .Lmask52x4(%rip),%ymm19,%ymm19388 389 vmovdqu64 %ymm3,0(%rdi)390 vmovdqu64 %ymm16,32(%rdi)391 vmovdqu64 %ymm17,64(%rdi)392 vmovdqu64 %ymm18,96(%rdi)393 vmovdqu64 %ymm19,128(%rdi)394 395 vzeroupper396 movq 0(%rsp),%r15397 .cfi_restore %r15398 movq 8(%rsp),%r14399 .cfi_restore %r14400 movq 16(%rsp),%r13401 .cfi_restore %r13402 movq 24(%rsp),%r12403 .cfi_restore %r12404 movq 32(%rsp),%rbp405 .cfi_restore %rbp406 movq 40(%rsp),%rbx407 .cfi_restore %rbx408 leaq 48(%rsp),%rsp409 .cfi_adjust_cfa_offset -48410 .Lossl_rsaz_amm52x20_x1_ifma256_epilogue:411 .byte 0xf3,0xc3412 .cfi_endproc413 .size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256414 .data415 .align 32416 .Lmask52x4:417 .quad 0xfffffffffffff418 .quad 0xfffffffffffff419 .quad 0xfffffffffffff420 .quad 0xfffffffffffff421 .text422 423 .globl ossl_rsaz_amm52x20_x2_ifma256424 .type ossl_rsaz_amm52x20_x2_ifma256,@function425 .align 32426 ossl_rsaz_amm52x20_x2_ifma256:427 .cfi_startproc428 .byte 243,15,30,250429 pushq %rbx430 .cfi_adjust_cfa_offset 8431 .cfi_offset %rbx,-16432 pushq %rbp433 .cfi_adjust_cfa_offset 8434 .cfi_offset %rbp,-24435 pushq %r12436 .cfi_adjust_cfa_offset 8437 .cfi_offset %r12,-32438 pushq %r13439 .cfi_adjust_cfa_offset 8440 .cfi_offset %r13,-40441 pushq %r14442 .cfi_adjust_cfa_offset 8443 .cfi_offset %r14,-48444 pushq %r15445 .cfi_adjust_cfa_offset 8446 .cfi_offset %r15,-56447 .Lossl_rsaz_amm52x20_x2_ifma256_body:448 449 450 vpxord %ymm0,%ymm0,%ymm0451 vmovdqa64 %ymm0,%ymm3452 vmovdqa64 %ymm0,%ymm16453 vmovdqa64 %ymm0,%ymm17454 vmovdqa64 %ymm0,%ymm18455 vmovdqa64 %ymm0,%ymm19456 vmovdqa64 %ymm0,%ymm4457 vmovdqa64 %ymm0,%ymm20458 vmovdqa64 %ymm0,%ymm21459 vmovdqa64 %ymm0,%ymm22460 vmovdqa64 %ymm0,%ymm23461 462 xorl %r9d,%r9d463 xorl %r15d,%r15d464 465 movq %rdx,%r11466 movq $0xfffffffffffff,%rax467 468 movl $20,%ebx469 470 .align 32471 .Lloop20:472 movq 0(%r11),%r13473 474 vpbroadcastq %r13,%ymm1475 movq 0(%rsi),%rdx476 mulxq %r13,%r13,%r12477 addq %r13,%r9478 movq %r12,%r10479 adcq $0,%r10480 481 movq (%r8),%r13482 imulq %r9,%r13483 andq %rax,%r13484 485 vpbroadcastq %r13,%ymm2486 movq 0(%rcx),%rdx487 mulxq %r13,%r13,%r12488 addq %r13,%r9489 adcq %r12,%r10490 491 shrq $52,%r9492 salq $12,%r10493 orq %r10,%r9494 495 vpmadd52luq 0(%rsi),%ymm1,%ymm3496 vpmadd52luq 32(%rsi),%ymm1,%ymm16497 vpmadd52luq 64(%rsi),%ymm1,%ymm17498 vpmadd52luq 96(%rsi),%ymm1,%ymm18499 vpmadd52luq 128(%rsi),%ymm1,%ymm19500 501 vpmadd52luq 0(%rcx),%ymm2,%ymm3502 vpmadd52luq 32(%rcx),%ymm2,%ymm16503 vpmadd52luq 64(%rcx),%ymm2,%ymm17504 vpmadd52luq 96(%rcx),%ymm2,%ymm18505 vpmadd52luq 128(%rcx),%ymm2,%ymm19506 507 508 valignq $1,%ymm3,%ymm16,%ymm3509 valignq $1,%ymm16,%ymm17,%ymm16510 valignq $1,%ymm17,%ymm18,%ymm17511 valignq $1,%ymm18,%ymm19,%ymm18512 valignq $1,%ymm19,%ymm0,%ymm19513 514 vmovq %xmm3,%r13515 addq %r13,%r9516 517 vpmadd52huq 0(%rsi),%ymm1,%ymm3518 vpmadd52huq 32(%rsi),%ymm1,%ymm16519 vpmadd52huq 64(%rsi),%ymm1,%ymm17520 vpmadd52huq 96(%rsi),%ymm1,%ymm18521 vpmadd52huq 128(%rsi),%ymm1,%ymm19522 523 vpmadd52huq 0(%rcx),%ymm2,%ymm3524 vpmadd52huq 32(%rcx),%ymm2,%ymm16525 vpmadd52huq 64(%rcx),%ymm2,%ymm17526 vpmadd52huq 96(%rcx),%ymm2,%ymm18527 vpmadd52huq 128(%rcx),%ymm2,%ymm19528 movq 160(%r11),%r13529 530 vpbroadcastq %r13,%ymm1531 movq 160(%rsi),%rdx532 mulxq %r13,%r13,%r12533 addq %r13,%r15534 movq %r12,%r10535 adcq $0,%r10536 537 movq 8(%r8),%r13538 imulq %r15,%r13539 andq %rax,%r13540 541 vpbroadcastq %r13,%ymm2542 movq 160(%rcx),%rdx543 mulxq %r13,%r13,%r12544 addq %r13,%r15545 adcq %r12,%r10546 547 shrq $52,%r15548 salq $12,%r10549 orq %r10,%r15550 551 vpmadd52luq 160(%rsi),%ymm1,%ymm4552 vpmadd52luq 192(%rsi),%ymm1,%ymm20553 vpmadd52luq 224(%rsi),%ymm1,%ymm21554 vpmadd52luq 256(%rsi),%ymm1,%ymm22555 vpmadd52luq 288(%rsi),%ymm1,%ymm23556 557 vpmadd52luq 160(%rcx),%ymm2,%ymm4558 vpmadd52luq 192(%rcx),%ymm2,%ymm20559 vpmadd52luq 224(%rcx),%ymm2,%ymm21560 vpmadd52luq 256(%rcx),%ymm2,%ymm22561 vpmadd52luq 288(%rcx),%ymm2,%ymm23562 563 564 valignq $1,%ymm4,%ymm20,%ymm4565 valignq $1,%ymm20,%ymm21,%ymm20566 valignq $1,%ymm21,%ymm22,%ymm21567 valignq $1,%ymm22,%ymm23,%ymm22568 valignq $1,%ymm23,%ymm0,%ymm23569 570 vmovq %xmm4,%r13571 addq %r13,%r15572 573 vpmadd52huq 160(%rsi),%ymm1,%ymm4574 vpmadd52huq 192(%rsi),%ymm1,%ymm20575 vpmadd52huq 224(%rsi),%ymm1,%ymm21576 vpmadd52huq 256(%rsi),%ymm1,%ymm22577 vpmadd52huq 288(%rsi),%ymm1,%ymm23578 579 vpmadd52huq 160(%rcx),%ymm2,%ymm4580 vpmadd52huq 192(%rcx),%ymm2,%ymm20581 vpmadd52huq 224(%rcx),%ymm2,%ymm21582 vpmadd52huq 256(%rcx),%ymm2,%ymm22583 vpmadd52huq 288(%rcx),%ymm2,%ymm23584 leaq 8(%r11),%r11585 decl %ebx586 jne .Lloop20587 588 vpbroadcastq %r9,%ymm0589 vpblendd $3,%ymm0,%ymm3,%ymm3590 591 592 593 vpsrlq $52,%ymm3,%ymm0594 vpsrlq $52,%ymm16,%ymm1595 vpsrlq $52,%ymm17,%ymm2596 vpsrlq $52,%ymm18,%ymm25597 vpsrlq $52,%ymm19,%ymm26598 599 600 valignq $3,%ymm25,%ymm26,%ymm26601 valignq $3,%ymm2,%ymm25,%ymm25602 valignq $3,%ymm1,%ymm2,%ymm2603 valignq $3,%ymm0,%ymm1,%ymm1604 valignq $3,.Lzeros(%rip),%ymm0,%ymm0605 606 607 vpandq .Lmask52x4(%rip),%ymm3,%ymm3608 vpandq .Lmask52x4(%rip),%ymm16,%ymm16609 vpandq .Lmask52x4(%rip),%ymm17,%ymm17610 vpandq .Lmask52x4(%rip),%ymm18,%ymm18611 vpandq .Lmask52x4(%rip),%ymm19,%ymm19612 613 614 vpaddq %ymm0,%ymm3,%ymm3615 vpaddq %ymm1,%ymm16,%ymm16616 vpaddq %ymm2,%ymm17,%ymm17617 vpaddq %ymm25,%ymm18,%ymm18618 vpaddq %ymm26,%ymm19,%ymm19619 620 621 622 vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1623 vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2624 vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k3625 vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k4626 vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k5627 kmovb %k1,%r14d628 kmovb %k2,%r13d629 kmovb %k3,%r12d630 kmovb %k4,%r11d631 kmovb %k5,%r10d632 633 634 vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1635 vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2636 vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k3637 vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k4638 vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k5639 kmovb %k1,%r9d640 kmovb %k2,%r8d641 kmovb %k3,%ebx642 kmovb %k4,%ecx643 kmovb %k5,%edx644 645 646 647 shlb $4,%r13b648 orb %r13b,%r14b649 shlb $4,%r11b650 orb %r11b,%r12b651 652 addb %r14b,%r14b653 adcb %r12b,%r12b654 adcb %r10b,%r10b655 656 shlb $4,%r8b657 orb %r8b,%r9b658 shlb $4,%cl659 orb %cl,%bl660 661 addb %r9b,%r14b662 adcb %bl,%r12b663 adcb %dl,%r10b664 665 xorb %r9b,%r14b666 xorb %bl,%r12b667 xorb %dl,%r10b668 669 kmovb %r14d,%k1670 shrb $4,%r14b671 kmovb %r14d,%k2672 kmovb %r12d,%k3673 shrb $4,%r12b674 kmovb %r12d,%k4675 kmovb %r10d,%k5676 677 678 vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}679 vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k2}680 vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k3}681 vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k4}682 vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k5}683 684 vpandq .Lmask52x4(%rip),%ymm3,%ymm3685 vpandq .Lmask52x4(%rip),%ymm16,%ymm16686 vpandq .Lmask52x4(%rip),%ymm17,%ymm17687 vpandq .Lmask52x4(%rip),%ymm18,%ymm18688 vpandq .Lmask52x4(%rip),%ymm19,%ymm19689 690 vpbroadcastq %r15,%ymm0691 vpblendd $3,%ymm0,%ymm4,%ymm4692 693 694 695 vpsrlq $52,%ymm4,%ymm0696 vpsrlq $52,%ymm20,%ymm1697 vpsrlq $52,%ymm21,%ymm2698 vpsrlq $52,%ymm22,%ymm25699 vpsrlq $52,%ymm23,%ymm26700 701 702 valignq $3,%ymm25,%ymm26,%ymm26703 valignq $3,%ymm2,%ymm25,%ymm25704 valignq $3,%ymm1,%ymm2,%ymm2705 valignq $3,%ymm0,%ymm1,%ymm1706 valignq $3,.Lzeros(%rip),%ymm0,%ymm0707 708 709 vpandq .Lmask52x4(%rip),%ymm4,%ymm4710 vpandq .Lmask52x4(%rip),%ymm20,%ymm20711 vpandq .Lmask52x4(%rip),%ymm21,%ymm21712 vpandq .Lmask52x4(%rip),%ymm22,%ymm22713 vpandq .Lmask52x4(%rip),%ymm23,%ymm23714 715 716 vpaddq %ymm0,%ymm4,%ymm4717 vpaddq %ymm1,%ymm20,%ymm20718 vpaddq %ymm2,%ymm21,%ymm21719 vpaddq %ymm25,%ymm22,%ymm22720 vpaddq %ymm26,%ymm23,%ymm23721 722 723 724 vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k1725 vpcmpuq $6,.Lmask52x4(%rip),%ymm20,%k2726 vpcmpuq $6,.Lmask52x4(%rip),%ymm21,%k3727 vpcmpuq $6,.Lmask52x4(%rip),%ymm22,%k4728 vpcmpuq $6,.Lmask52x4(%rip),%ymm23,%k5729 kmovb %k1,%r14d730 kmovb %k2,%r13d731 kmovb %k3,%r12d732 kmovb %k4,%r11d733 kmovb %k5,%r10d734 735 736 vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k1737 vpcmpuq $0,.Lmask52x4(%rip),%ymm20,%k2738 vpcmpuq $0,.Lmask52x4(%rip),%ymm21,%k3739 vpcmpuq $0,.Lmask52x4(%rip),%ymm22,%k4740 vpcmpuq $0,.Lmask52x4(%rip),%ymm23,%k5741 kmovb %k1,%r9d742 kmovb %k2,%r8d743 kmovb %k3,%ebx744 kmovb %k4,%ecx745 kmovb %k5,%edx746 747 748 749 shlb $4,%r13b750 orb %r13b,%r14b751 shlb $4,%r11b752 orb %r11b,%r12b753 754 addb %r14b,%r14b755 adcb %r12b,%r12b756 adcb %r10b,%r10b757 758 shlb $4,%r8b759 orb %r8b,%r9b760 shlb $4,%cl761 orb %cl,%bl762 763 addb %r9b,%r14b764 adcb %bl,%r12b765 adcb %dl,%r10b766 767 xorb %r9b,%r14b768 xorb %bl,%r12b769 xorb %dl,%r10b770 771 kmovb %r14d,%k1772 shrb $4,%r14b773 kmovb %r14d,%k2774 kmovb %r12d,%k3775 shrb $4,%r12b776 kmovb %r12d,%k4777 kmovb %r10d,%k5778 779 780 vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k1}781 vpsubq .Lmask52x4(%rip),%ymm20,%ymm20{%k2}782 vpsubq .Lmask52x4(%rip),%ymm21,%ymm21{%k3}783 vpsubq .Lmask52x4(%rip),%ymm22,%ymm22{%k4}784 vpsubq .Lmask52x4(%rip),%ymm23,%ymm23{%k5}785 786 vpandq .Lmask52x4(%rip),%ymm4,%ymm4787 vpandq .Lmask52x4(%rip),%ymm20,%ymm20788 vpandq .Lmask52x4(%rip),%ymm21,%ymm21789 vpandq .Lmask52x4(%rip),%ymm22,%ymm22790 vpandq .Lmask52x4(%rip),%ymm23,%ymm23791 792 vmovdqu64 %ymm3,0(%rdi)793 vmovdqu64 %ymm16,32(%rdi)794 vmovdqu64 %ymm17,64(%rdi)795 vmovdqu64 %ymm18,96(%rdi)796 vmovdqu64 %ymm19,128(%rdi)797 798 vmovdqu64 %ymm4,160(%rdi)799 vmovdqu64 %ymm20,192(%rdi)800 vmovdqu64 %ymm21,224(%rdi)801 vmovdqu64 %ymm22,256(%rdi)802 vmovdqu64 %ymm23,288(%rdi)803 804 vzeroupper805 movq 0(%rsp),%r15806 .cfi_restore %r15807 movq 8(%rsp),%r14808 .cfi_restore %r14809 movq 16(%rsp),%r13810 .cfi_restore %r13811 movq 24(%rsp),%r12812 .cfi_restore %r12813 movq 32(%rsp),%rbp814 .cfi_restore %rbp815 movq 40(%rsp),%rbx816 .cfi_restore %rbx817 leaq 48(%rsp),%rsp818 .cfi_adjust_cfa_offset -48819 .Lossl_rsaz_amm52x20_x2_ifma256_epilogue:820 .byte 0xf3,0xc3821 .cfi_endproc822 .size ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256823 .text824 825 .align 32826 .globl ossl_extract_multiplier_2x20_win5827 .type ossl_extract_multiplier_2x20_win5,@function828 ossl_extract_multiplier_2x20_win5:829 .cfi_startproc830 .byte 243,15,30,250831 vmovdqa64 .Lones(%rip),%ymm24832 vpbroadcastq %rdx,%ymm22833 vpbroadcastq %rcx,%ymm23834 leaq 10240(%rsi),%rax835 836 837 vpxor %xmm0,%xmm0,%xmm0838 vmovdqa64 %ymm0,%ymm21839 vmovdqa64 %ymm0,%ymm1840 vmovdqa64 %ymm0,%ymm2841 vmovdqa64 %ymm0,%ymm3842 vmovdqa64 %ymm0,%ymm4843 vmovdqa64 %ymm0,%ymm5844 vmovdqa64 %ymm0,%ymm16845 vmovdqa64 %ymm0,%ymm17846 vmovdqa64 %ymm0,%ymm18847 vmovdqa64 %ymm0,%ymm19848 849 .align 32850 .Lloop:851 vpcmpq $0,%ymm21,%ymm22,%k1852 vpcmpq $0,%ymm21,%ymm23,%k2853 vmovdqu64 0(%rsi),%ymm20854 vpblendmq %ymm20,%ymm0,%ymm0{%k1}855 vmovdqu64 32(%rsi),%ymm20856 vpblendmq %ymm20,%ymm1,%ymm1{%k1}857 vmovdqu64 64(%rsi),%ymm20858 vpblendmq %ymm20,%ymm2,%ymm2{%k1}859 vmovdqu64 96(%rsi),%ymm20860 vpblendmq %ymm20,%ymm3,%ymm3{%k1}861 vmovdqu64 128(%rsi),%ymm20862 vpblendmq %ymm20,%ymm4,%ymm4{%k1}863 vmovdqu64 160(%rsi),%ymm20864 vpblendmq %ymm20,%ymm5,%ymm5{%k2}865 vmovdqu64 192(%rsi),%ymm20866 vpblendmq %ymm20,%ymm16,%ymm16{%k2}867 vmovdqu64 224(%rsi),%ymm20868 vpblendmq %ymm20,%ymm17,%ymm17{%k2}869 vmovdqu64 256(%rsi),%ymm20870 vpblendmq %ymm20,%ymm18,%ymm18{%k2}871 vmovdqu64 288(%rsi),%ymm20872 vpblendmq %ymm20,%ymm19,%ymm19{%k2}873 vpaddq %ymm24,%ymm21,%ymm21874 addq $320,%rsi875 cmpq %rsi,%rax876 jne .Lloop877 vmovdqu64 %ymm0,0(%rdi)878 vmovdqu64 %ymm1,32(%rdi)879 vmovdqu64 %ymm2,64(%rdi)880 vmovdqu64 %ymm3,96(%rdi)881 vmovdqu64 %ymm4,128(%rdi)882 vmovdqu64 %ymm5,160(%rdi)883 vmovdqu64 %ymm16,192(%rdi)884 vmovdqu64 %ymm17,224(%rdi)885 vmovdqu64 %ymm18,256(%rdi)886 vmovdqu64 %ymm19,288(%rdi)887 .byte 0xf3,0xc3888 .cfi_endproc889 .size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5890 .data891 .align 32892 .Lones:893 .quad 1,1,1,1894 .Lzeros:895 .quad 0,0,0,0896 .section ".note.gnu.property", "a"897 .p2align 3898 .long 1f - 0f899 .long 4f - 1f900 .long 5901 0:902 # "GNU" encoded with .byte, since .asciz isn't supported903 # on Solaris.904 .byte 0x47905 .byte 0x4e906 .byte 0x55907 .byte 0908 1:909 .p2align 3910 .long 0xc0000002911 .long 3f - 2f912 2:913 .long 3914 3:915 .p2align 3916 4:
Note:
See TracChangeset
for help on using the changeset viewer.