1#! /usr/bin/env perl
2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
4#
5# Licensed under the Apache License 2.0 (the "License"). You may not use
6# this file except in compliance with the License. You can obtain a copy
7# in the file LICENSE in the source distribution or at
8# https://www.openssl.org/source/license.html
9#
10# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11# (1) Intel Corporation, Israel Development Center, Haifa, Israel
12# (2) University of Haifa, Israel
13#
14# References:
15# [1] S. Gueron, "Efficient Software Implementations of Modular
16# Exponentiation", http://eprint.iacr.org/2011/239
17# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
18# IEEE Proceedings of 9th International Conference on Information
19# Technology: New Generations (ITNG 2012), 821-823 (2012).
20# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation
21# Journal of Cryptographic Engineering 2:31-43 (2012).
22# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
23# resistant 512-bit and 1024-bit modular exponentiation for optimizing
24# RSA1024 and RSA2048 on x86_64 platforms",
25# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
26#
27# While the original submission covers both 512- and 1024-bit exponentiation,
28# this module is limited to the 512-bit version only (and as such
29# accelerates RSA1024 signing). This is because the improvement for longer
30# keys is not high enough to justify the effort; the highest measured gain
31# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, which was still
32# upcoming at the time of writing.] Nor does this module implement a
33# "monolithic" complete-exponentiation jumbo subroutine; it adheres to a
34# more modular mixture of C and assembly. It is also optimized for
35# processors other than the Intel Core family (see the table below for
36# improvement coefficients).
37# <appro@openssl.org>
38#
39# RSA1024 sign/sec    this/original   |this/rsax(*)   this/fips(*)
40# --------------------+---------------+---------------+-----------
41# Opteron             +13%            |+5%            +20%
42# Bulldozer           -0%             |-1%            +10%
43# P4                  +11%            |+7%            +8%
44# Westmere            +5%             |+14%           +17%
45# Sandy Bridge        +2%             |+12%           +29%
46# Ivy Bridge          +1%             |+11%           +35%
47# Haswell(**)         -0%             |+12%           +39%
48# Atom                +13%            |+11%           +4%
49# VIA Nano            +70%            |+9%            +25%
50#
51# (*) rsax engine and fips numbers are presented for reference
52# purposes;
53# (**) MULX was attempted, but found to give only marginal improvement;
54
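# ------------------------------------------------------------------------
# Editorial sketch (hedged): a minimal Math::BigInt model of the
# fixed-window modular exponentiation these primitives are building blocks
# for.  The real driver is C code (rsaz_exp.c); the name "model_mod_exp"
# is hypothetical and the sub is never called by this generator.  The
# 4-bit window matches the 16-entry power table served by
# rsaz_512_scatter4/rsaz_512_gather4 below.
use Math::BigInt;
sub model_mod_exp {
    my ($base, $exp, $mod) = @_;        # Math::BigInt args; $mod odd, 512-bit
    my @pwr = (Math::BigInt->new(1));   # pwr[$i] == $base**$i % $mod, $i < 16
    push @pwr, $pwr[-1] * $base % $mod for 1 .. 15;
    my $res = Math::BigInt->new(1);
    for (my $i = 508; $i >= 0; $i -= 4) {       # walk the exponent, MSB first
        $res = $res * $res % $mod for 1 .. 4;   # four squarings per window
        my $w = ($exp >> $i) % 16;              # 4-bit window value
        $res = $res * $pwr[$w] % $mod;          # the real code selects this
    }                                           # entry in constant time
    return $res;
}
# ------------------------------------------------------------------------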
55# $output is the last argument if it looks like a file (it has an extension)
56# $flavour is the first argument if it doesn't look like a file
57$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
58$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
59
60$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
61
62$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
63( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
64( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
65die "can't locate x86_64-xlate.pl";
66
67open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
68 or die "can't call $xlate: $!";
69*STDOUT=*OUT;
70
71if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
72 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
73 $addx = ($1>=2.23);
74}
75
76if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
77 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
78 $addx = ($1>=2.10);
79}
80
81if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
82 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
83 $addx = ($1>=12);
84}
85
86if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
87 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
88 $addx = ($ver>=3.03);
89}
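# Editorial note: $addx only records whether the *assembler* is new enough
# to encode MULX/ADCX/ADOX (gas 2.23+, nasm 2.10+, ml64 12+, clang/LLVM
# 3.3+); the probes above never ask about the CPU.  Whether the processor
# actually implements those instructions is decided at run time: the
# 0x80100 tests emitted below check bit 8 (BMI2, i.e. MULX) and bit 19
# (ADX, i.e. ADCX/ADOX) of CPUID.(EAX=7,ECX=0):EBX, which OpenSSL caches
# in the third dword of OPENSSL_ia32cap_P.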
90
91($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
92{
93my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
94
95$code.=<<___;
96.text
97
98.extern OPENSSL_ia32cap_P
99
100.globl rsaz_512_sqr
101.type rsaz_512_sqr,\@function,5
102.align 32
103rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
104.cfi_startproc
105 push %rbx
106.cfi_push %rbx
107 push %rbp
108.cfi_push %rbp
109 push %r12
110.cfi_push %r12
111 push %r13
112.cfi_push %r13
113 push %r14
114.cfi_push %r14
115 push %r15
116.cfi_push %r15
117
118 subq \$128+24, %rsp
119.cfi_adjust_cfa_offset 128+24
120.Lsqr_body:
121 movq $mod, %xmm1 # common off-load
122 movq ($inp), %rdx
123 movq 8($inp), %rax
124 movq $n0, 128(%rsp)
125___
126$code.=<<___ if ($addx);
127 movl \$0x80100,%r11d
128 andl OPENSSL_ia32cap_P+8(%rip),%r11d
129 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
130 je .Loop_sqrx
131___
132$code.=<<___;
133 jmp .Loop_sqr
134
135.align 32
136.Loop_sqr:
137 movl $times,128+8(%rsp)
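	# Squaring needs roughly half the multiplications of a general
	# multiply.  Sketch of the idea: for a = sum a_i*2^(64*i),
	#   a^2 = sum a_i^2 * 2^(128*i) + 2 * sum_{i<j} a_i*a_j * 2^(64*(i+j))
	# Each iteration below forms one row of cross products a_i*a_j (j>i),
	# doubles the completed columns with add/adc, folds in the diagonal
	# square a_i^2, and retires two 64-bit result words to the stack.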
138#first iteration
139 movq %rdx, %rbx # 0($inp)
140 mov %rax, %rbp # 8($inp)
141 mulq %rdx
142 movq %rax, %r8
143 movq 16($inp), %rax
144 movq %rdx, %r9
145
146 mulq %rbx
147 addq %rax, %r9
148 movq 24($inp), %rax
149 movq %rdx, %r10
150 adcq \$0, %r10
151
152 mulq %rbx
153 addq %rax, %r10
154 movq 32($inp), %rax
155 movq %rdx, %r11
156 adcq \$0, %r11
157
158 mulq %rbx
159 addq %rax, %r11
160 movq 40($inp), %rax
161 movq %rdx, %r12
162 adcq \$0, %r12
163
164 mulq %rbx
165 addq %rax, %r12
166 movq 48($inp), %rax
167 movq %rdx, %r13
168 adcq \$0, %r13
169
170 mulq %rbx
171 addq %rax, %r13
172 movq 56($inp), %rax
173 movq %rdx, %r14
174 adcq \$0, %r14
175
176 mulq %rbx
177 addq %rax, %r14
178 movq %rbx, %rax
179 adcq \$0, %rdx
180
181 xorq %rcx,%rcx # rcx:r8 = r8 << 1
182 addq %r8, %r8
183 movq %rdx, %r15
184 adcq \$0, %rcx
185
186 mulq %rax
187 addq %r8, %rdx
188 adcq \$0, %rcx
189
190 movq %rax, (%rsp)
191 movq %rdx, 8(%rsp)
192
193#second iteration
194 movq 16($inp), %rax
195 mulq %rbp
196 addq %rax, %r10
197 movq 24($inp), %rax
198 movq %rdx, %rbx
199 adcq \$0, %rbx
200
201 mulq %rbp
202 addq %rax, %r11
203 movq 32($inp), %rax
204 adcq \$0, %rdx
205 addq %rbx, %r11
206 movq %rdx, %rbx
207 adcq \$0, %rbx
208
209 mulq %rbp
210 addq %rax, %r12
211 movq 40($inp), %rax
212 adcq \$0, %rdx
213 addq %rbx, %r12
214 movq %rdx, %rbx
215 adcq \$0, %rbx
216
217 mulq %rbp
218 addq %rax, %r13
219 movq 48($inp), %rax
220 adcq \$0, %rdx
221 addq %rbx, %r13
222 movq %rdx, %rbx
223 adcq \$0, %rbx
224
225 mulq %rbp
226 addq %rax, %r14
227 movq 56($inp), %rax
228 adcq \$0, %rdx
229 addq %rbx, %r14
230 movq %rdx, %rbx
231 adcq \$0, %rbx
232
233 mulq %rbp
234 addq %rax, %r15
235 movq %rbp, %rax
236 adcq \$0, %rdx
237 addq %rbx, %r15
238 adcq \$0, %rdx
239
240 xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
241 addq %r9, %r9
242 movq %rdx, %r8
243 adcq %r10, %r10
244 adcq \$0, %rbx
245
246 mulq %rax
247 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
248 addq %rcx, %rax
249 movq 16($inp), %rbp
250 addq %rax, %r9
251 movq 24($inp), %rax
252 adcq %rdx, %r10
253 adcq \$0, %rbx
254
255 movq %r9, 16(%rsp)
256 movq %r10, 24(%rsp)
257
258#third iteration
259 mulq %rbp
260 addq %rax, %r12
261 movq 32($inp), %rax
262 movq %rdx, %rcx
263 adcq \$0, %rcx
264
265 mulq %rbp
266 addq %rax, %r13
267 movq 40($inp), %rax
268 adcq \$0, %rdx
269 addq %rcx, %r13
270 movq %rdx, %rcx
271 adcq \$0, %rcx
272
273 mulq %rbp
274 addq %rax, %r14
275 movq 48($inp), %rax
276 adcq \$0, %rdx
277 addq %rcx, %r14
278 movq %rdx, %rcx
279 adcq \$0, %rcx
280
281 mulq %rbp
282 addq %rax, %r15
283 movq 56($inp), %rax
284 adcq \$0, %rdx
285 addq %rcx, %r15
286 movq %rdx, %rcx
287 adcq \$0, %rcx
288
289 mulq %rbp
290 addq %rax, %r8
291 movq %rbp, %rax
292 adcq \$0, %rdx
293 addq %rcx, %r8
294 adcq \$0, %rdx
295
296 xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
297 addq %r11, %r11
298 movq %rdx, %r9
299 adcq %r12, %r12
300 adcq \$0, %rcx
301
302 mulq %rax
303 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
304 addq %rbx, %rax
305 movq 24($inp), %r10
306 addq %rax, %r11
307 movq 32($inp), %rax
308 adcq %rdx, %r12
309 adcq \$0, %rcx
310
311 movq %r11, 32(%rsp)
312 movq %r12, 40(%rsp)
313
314#fourth iteration
315 mov %rax, %r11 # 32($inp)
316 mulq %r10
317 addq %rax, %r14
318 movq 40($inp), %rax
319 movq %rdx, %rbx
320 adcq \$0, %rbx
321
322 mov %rax, %r12 # 40($inp)
323 mulq %r10
324 addq %rax, %r15
325 movq 48($inp), %rax
326 adcq \$0, %rdx
327 addq %rbx, %r15
328 movq %rdx, %rbx
329 adcq \$0, %rbx
330
331 mov %rax, %rbp # 48($inp)
332 mulq %r10
333 addq %rax, %r8
334 movq 56($inp), %rax
335 adcq \$0, %rdx
336 addq %rbx, %r8
337 movq %rdx, %rbx
338 adcq \$0, %rbx
339
340 mulq %r10
341 addq %rax, %r9
342 movq %r10, %rax
343 adcq \$0, %rdx
344 addq %rbx, %r9
345 adcq \$0, %rdx
346
347 xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
348 addq %r13, %r13
349 movq %rdx, %r10
350 adcq %r14, %r14
351 adcq \$0, %rbx
352
353 mulq %rax
354 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
355 addq %rcx, %rax
356 addq %rax, %r13
357 movq %r12, %rax # 40($inp)
358 adcq %rdx, %r14
359 adcq \$0, %rbx
360
361 movq %r13, 48(%rsp)
362 movq %r14, 56(%rsp)
363
364#fifth iteration
365 mulq %r11
366 addq %rax, %r8
367 movq %rbp, %rax # 48($inp)
368 movq %rdx, %rcx
369 adcq \$0, %rcx
370
371 mulq %r11
372 addq %rax, %r9
373 movq 56($inp), %rax
374 adcq \$0, %rdx
375 addq %rcx, %r9
376 movq %rdx, %rcx
377 adcq \$0, %rcx
378
379 mov %rax, %r14 # 56($inp)
380 mulq %r11
381 addq %rax, %r10
382 movq %r11, %rax
383 adcq \$0, %rdx
384 addq %rcx, %r10
385 adcq \$0, %rdx
386
387 xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
388 addq %r15, %r15
389 movq %rdx, %r11
390 adcq %r8, %r8
391 adcq \$0, %rcx
392
393 mulq %rax
394 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
395 addq %rbx, %rax
396 addq %rax, %r15
397 movq %rbp, %rax # 48($inp)
398 adcq %rdx, %r8
399 adcq \$0, %rcx
400
401 movq %r15, 64(%rsp)
402 movq %r8, 72(%rsp)
403
404#sixth iteration
405 mulq %r12
406 addq %rax, %r10
407 movq %r14, %rax # 56($inp)
408 movq %rdx, %rbx
409 adcq \$0, %rbx
410
411 mulq %r12
412 addq %rax, %r11
413 movq %r12, %rax
414 adcq \$0, %rdx
415 addq %rbx, %r11
416 adcq \$0, %rdx
417
418 xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
419 addq %r9, %r9
420 movq %rdx, %r12
421 adcq %r10, %r10
422 adcq \$0, %rbx
423
424 mulq %rax
425 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
426 addq %rcx, %rax
427 addq %rax, %r9
428 movq %r14, %rax # 56($inp)
429 adcq %rdx, %r10
430 adcq \$0, %rbx
431
432 movq %r9, 80(%rsp)
433 movq %r10, 88(%rsp)
434
435#seventh iteration
436 mulq %rbp
437 addq %rax, %r12
438 movq %rbp, %rax
439 adcq \$0, %rdx
440
441 xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
442 addq %r11, %r11
443 movq %rdx, %r13
444 adcq %r12, %r12
445 adcq \$0, %rcx
446
447 mulq %rax
448 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
449 addq %rbx, %rax
450 addq %rax, %r11
451 movq %r14, %rax # 56($inp)
452 adcq %rdx, %r12
453 adcq \$0, %rcx
454
455 movq %r11, 96(%rsp)
456 movq %r12, 104(%rsp)
457
458#eighth iteration
459 xorq %rbx, %rbx # rbx:r13 = r13 << 1
460 addq %r13, %r13
461 adcq \$0, %rbx
462
463 mulq %rax
464 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
465 addq %rcx, %rax
466 addq %r13, %rax
467 adcq %rbx, %rdx
468
469 movq (%rsp), %r8
470 movq 8(%rsp), %r9
471 movq 16(%rsp), %r10
472 movq 24(%rsp), %r11
473 movq 32(%rsp), %r12
474 movq 40(%rsp), %r13
475 movq 48(%rsp), %r14
476 movq 56(%rsp), %r15
477 movq %xmm1, %rbp
478
479 movq %rax, 112(%rsp)
480 movq %rdx, 120(%rsp)
481
482 call __rsaz_512_reduce
483
484 addq 64(%rsp), %r8
485 adcq 72(%rsp), %r9
486 adcq 80(%rsp), %r10
487 adcq 88(%rsp), %r11
488 adcq 96(%rsp), %r12
489 adcq 104(%rsp), %r13
490 adcq 112(%rsp), %r14
491 adcq 120(%rsp), %r15
492 sbbq %rcx, %rcx
493
494 call __rsaz_512_subtract
495
496 movq %r8, %rdx
497 movq %r9, %rax
498 movl 128+8(%rsp), $times
499 movq $out, $inp
500
501 decl $times
502 jnz .Loop_sqr
503___
504if ($addx) {
505$code.=<<___;
506 jmp .Lsqr_tail
507
508.align 32
509.Loop_sqrx:
510 movl $times,128+8(%rsp)
511 movq $out, %xmm0 # off-load
512#first iteration
513 mulx %rax, %r8, %r9
514 mov %rax, %rbx
515
516 mulx 16($inp), %rcx, %r10
517 xor %rbp, %rbp # cf=0, of=0
518
519 mulx 24($inp), %rax, %r11
520 adcx %rcx, %r9
521
522 .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
523 adcx %rax, %r10
524
525 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
526 adcx %rcx, %r11
527
528 mulx 48($inp), %rcx, %r14
529 adcx %rax, %r12
530 adcx %rcx, %r13
531
532 mulx 56($inp), %rax, %r15
533 adcx %rax, %r14
534 adcx %rbp, %r15 # %rbp is 0
535
536 mulx %rdx, %rax, $out
537 mov %rbx, %rdx # 8($inp)
538 xor %rcx, %rcx
539 adox %r8, %r8
540 adcx $out, %r8
541 adox %rbp, %rcx
542 adcx %rbp, %rcx
543
544 mov %rax, (%rsp)
545 mov %r8, 8(%rsp)
546
547#second iteration
548 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
549 adox %rax, %r10
550 adcx %rbx, %r11
551
552 mulx 24($inp), $out, %r8
553 adox $out, %r11
554 .byte 0x66
555 adcx %r8, %r12
556
557 mulx 32($inp), %rax, %rbx
558 adox %rax, %r12
559 adcx %rbx, %r13
560
561 mulx 40($inp), $out, %r8
562 adox $out, %r13
563 adcx %r8, %r14
564
565 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
566 adox %rax, %r14
567 adcx %rbx, %r15
568
569 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
570 adox $out, %r15
571 adcx %rbp, %r8
572 mulx %rdx, %rax, $out
573 adox %rbp, %r8
574 .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
575
576 xor %rbx, %rbx
577 adox %r9, %r9
578 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
579 adcx %rcx, %rax
580 adox %r10, %r10
581 adcx %rax, %r9
582 adox %rbp, %rbx
583 adcx $out, %r10
584 adcx %rbp, %rbx
585
586 mov %r9, 16(%rsp)
587 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
588
589#third iteration
590 mulx 24($inp), $out, %r9
591 adox $out, %r12
592 adcx %r9, %r13
593
594 mulx 32($inp), %rax, %rcx
595 adox %rax, %r13
596 adcx %rcx, %r14
597
598 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
599 adox $out, %r14
600 adcx %r9, %r15
601
602 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
603 adox %rax, %r15
604 adcx %rcx, %r8
605
606 mulx 56($inp), $out, %r9
607 adox $out, %r8
608 adcx %rbp, %r9
609 mulx %rdx, %rax, $out
610 adox %rbp, %r9
611 mov 24($inp), %rdx
612
613 xor %rcx, %rcx
614 adox %r11, %r11
615 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
616 adcx %rbx, %rax
617 adox %r12, %r12
618 adcx %rax, %r11
619 adox %rbp, %rcx
620 adcx $out, %r12
621 adcx %rbp, %rcx
622
623 mov %r11, 32(%rsp)
624 mov %r12, 40(%rsp)
625
626#fourth iteration
627 mulx 32($inp), %rax, %rbx
628 adox %rax, %r14
629 adcx %rbx, %r15
630
631 mulx 40($inp), $out, %r10
632 adox $out, %r15
633 adcx %r10, %r8
634
635 mulx 48($inp), %rax, %rbx
636 adox %rax, %r8
637 adcx %rbx, %r9
638
639 mulx 56($inp), $out, %r10
640 adox $out, %r9
641 adcx %rbp, %r10
642 mulx %rdx, %rax, $out
643 adox %rbp, %r10
644 mov 32($inp), %rdx
645
646 xor %rbx, %rbx
647 adox %r13, %r13
648 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
649 adcx %rcx, %rax
650 adox %r14, %r14
651 adcx %rax, %r13
652 adox %rbp, %rbx
653 adcx $out, %r14
654 adcx %rbp, %rbx
655
656 mov %r13, 48(%rsp)
657 mov %r14, 56(%rsp)
658
659#fifth iteration
660 mulx 40($inp), $out, %r11
661 adox $out, %r8
662 adcx %r11, %r9
663
664 mulx 48($inp), %rax, %rcx
665 adox %rax, %r9
666 adcx %rcx, %r10
667
668 mulx 56($inp), $out, %r11
669 adox $out, %r10
670 adcx %rbp, %r11
671 mulx %rdx, %rax, $out
672 mov 40($inp), %rdx
673 adox %rbp, %r11
674
675 xor %rcx, %rcx
676 adox %r15, %r15
677 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
678 adcx %rbx, %rax
679 adox %r8, %r8
680 adcx %rax, %r15
681 adox %rbp, %rcx
682 adcx $out, %r8
683 adcx %rbp, %rcx
684
685 mov %r15, 64(%rsp)
686 mov %r8, 72(%rsp)
687
688#sixth iteration
689 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
690 adox %rax, %r10
691 adcx %rbx, %r11
692
693 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
694 adox $out, %r11
695 adcx %rbp, %r12
696 mulx %rdx, %rax, $out
697 adox %rbp, %r12
698 mov 48($inp), %rdx
699
700 xor %rbx, %rbx
701 adox %r9, %r9
702 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
703 adcx %rcx, %rax
704 adox %r10, %r10
705 adcx %rax, %r9
706 adcx $out, %r10
707 adox %rbp, %rbx
708 adcx %rbp, %rbx
709
710 mov %r9, 80(%rsp)
711 mov %r10, 88(%rsp)
712
713#seventh iteration
714 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
715 adox %rax, %r12
716 adox %rbp, %r13
717
718 mulx %rdx, %rax, $out
719 xor %rcx, %rcx
720 mov 56($inp), %rdx
721 adox %r11, %r11
722 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
723 adcx %rbx, %rax
724 adox %r12, %r12
725 adcx %rax, %r11
726 adox %rbp, %rcx
727 adcx $out, %r12
728 adcx %rbp, %rcx
729
730 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
731 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
732
733#eighth iteration
734 mulx %rdx, %rax, %rdx
735 xor %rbx, %rbx
736 adox %r13, %r13
737 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
738 adcx %rcx, %rax
739 adox %rbp, %rbx
740 adcx %r13, %rax
741 adcx %rdx, %rbx
742
743 movq %xmm0, $out
744 movq %xmm1, %rbp
745
746 movq 128(%rsp), %rdx # pull $n0
747 movq (%rsp), %r8
748 movq 8(%rsp), %r9
749 movq 16(%rsp), %r10
750 movq 24(%rsp), %r11
751 movq 32(%rsp), %r12
752 movq 40(%rsp), %r13
753 movq 48(%rsp), %r14
754 movq 56(%rsp), %r15
755
756 movq %rax, 112(%rsp)
757 movq %rbx, 120(%rsp)
758
759 call __rsaz_512_reducex
760
761 addq 64(%rsp), %r8
762 adcq 72(%rsp), %r9
763 adcq 80(%rsp), %r10
764 adcq 88(%rsp), %r11
765 adcq 96(%rsp), %r12
766 adcq 104(%rsp), %r13
767 adcq 112(%rsp), %r14
768 adcq 120(%rsp), %r15
769 sbbq %rcx, %rcx
770
771 call __rsaz_512_subtract
772
773 movq %r8, %rdx
774 movq %r9, %rax
775 movl 128+8(%rsp), $times
776 movq $out, $inp
777
778 decl $times
779 jnz .Loop_sqrx
780
781.Lsqr_tail:
782___
783}
784$code.=<<___;
785
786 leaq 128+24+48(%rsp), %rax
787.cfi_def_cfa %rax,8
788 movq -48(%rax), %r15
789.cfi_restore %r15
790 movq -40(%rax), %r14
791.cfi_restore %r14
792 movq -32(%rax), %r13
793.cfi_restore %r13
794 movq -24(%rax), %r12
795.cfi_restore %r12
796 movq -16(%rax), %rbp
797.cfi_restore %rbp
798 movq -8(%rax), %rbx
799.cfi_restore %rbx
800 leaq (%rax), %rsp
801.cfi_def_cfa_register %rsp
802.Lsqr_epilogue:
803 ret
804.cfi_endproc
805.size rsaz_512_sqr,.-rsaz_512_sqr
806___
807}
808{
809my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
810$code.=<<___;
811.globl rsaz_512_mul
812.type rsaz_512_mul,\@function,5
813.align 32
814rsaz_512_mul:
815.cfi_startproc
816 push %rbx
817.cfi_push %rbx
818 push %rbp
819.cfi_push %rbp
820 push %r12
821.cfi_push %r12
822 push %r13
823.cfi_push %r13
824 push %r14
825.cfi_push %r14
826 push %r15
827.cfi_push %r15
828
829 subq \$128+24, %rsp
830.cfi_adjust_cfa_offset 128+24
831.Lmul_body:
832 movq $out, %xmm0 # off-load arguments
833 movq $mod, %xmm1
834 movq $n0, 128(%rsp)
835___
836$code.=<<___ if ($addx);
837 movl \$0x80100,%r11d
838 andl OPENSSL_ia32cap_P+8(%rip),%r11d
839 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
840 je .Lmulx
841___
842$code.=<<___;
843 movq ($bp), %rbx # pass b[0]
844 movq $bp, %rbp # pass argument
845 call __rsaz_512_mul
846
847 movq %xmm0, $out
848 movq %xmm1, %rbp
849
850 movq (%rsp), %r8
851 movq 8(%rsp), %r9
852 movq 16(%rsp), %r10
853 movq 24(%rsp), %r11
854 movq 32(%rsp), %r12
855 movq 40(%rsp), %r13
856 movq 48(%rsp), %r14
857 movq 56(%rsp), %r15
858
859 call __rsaz_512_reduce
860___
861$code.=<<___ if ($addx);
862 jmp .Lmul_tail
863
864.align 32
865.Lmulx:
866 movq $bp, %rbp # pass argument
867 movq ($bp), %rdx # pass b[0]
868 call __rsaz_512_mulx
869
870 movq %xmm0, $out
871 movq %xmm1, %rbp
872
873 movq 128(%rsp), %rdx # pull $n0
874 movq (%rsp), %r8
875 movq 8(%rsp), %r9
876 movq 16(%rsp), %r10
877 movq 24(%rsp), %r11
878 movq 32(%rsp), %r12
879 movq 40(%rsp), %r13
880 movq 48(%rsp), %r14
881 movq 56(%rsp), %r15
882
883 call __rsaz_512_reducex
884.Lmul_tail:
885___
886$code.=<<___;
887 addq 64(%rsp), %r8
888 adcq 72(%rsp), %r9
889 adcq 80(%rsp), %r10
890 adcq 88(%rsp), %r11
891 adcq 96(%rsp), %r12
892 adcq 104(%rsp), %r13
893 adcq 112(%rsp), %r14
894 adcq 120(%rsp), %r15
895 sbbq %rcx, %rcx
896
897 call __rsaz_512_subtract
898
899 leaq 128+24+48(%rsp), %rax
900.cfi_def_cfa %rax,8
901 movq -48(%rax), %r15
902.cfi_restore %r15
903 movq -40(%rax), %r14
904.cfi_restore %r14
905 movq -32(%rax), %r13
906.cfi_restore %r13
907 movq -24(%rax), %r12
908.cfi_restore %r12
909 movq -16(%rax), %rbp
910.cfi_restore %rbp
911 movq -8(%rax), %rbx
912.cfi_restore %rbx
913 leaq (%rax), %rsp
914.cfi_def_cfa_register %rsp
915.Lmul_epilogue:
916 ret
917.cfi_endproc
918.size rsaz_512_mul,.-rsaz_512_mul
919___
920}
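# ------------------------------------------------------------------------
# Editorial sketch (hedged): what rsaz_512_mul computes, as Math::BigInt
# arithmetic.  __rsaz_512_mul forms the 16-word product, __rsaz_512_reduce
# (or __rsaz_512_reducex) performs eight word-sized Montgomery reduction
# rounds, and __rsaz_512_subtract applies the masked final subtraction.
# "model_mont_mul" is a hypothetical name, never called by this generator.
# One deliberate difference: the assembly gates its subtraction on the
# carry out of the final 512-bit add, so its result need not be fully
# reduced below mod (it is still a valid input for the next operation);
# the model below reduces fully for clarity.
use Math::BigInt;
sub model_mont_mul {
    my ($a, $b, $mod, $n0) = @_;        # $n0 == -$mod**-1 mod 2**64
    my $w = Math::BigInt->new(2)->bpow(64);
    my $t = $a * $b;                    # __rsaz_512_mul: 1024-bit product
    for (1 .. 8) {                      # __rsaz_512_reduce: 8 word rounds
        my $m = $t % $w * $n0 % $w;     # multiplier that zeroes t's low word
        $t = ($t + $m * $mod) / $w;     # exact division: low 64 bits cancel
    }
    $t -= $mod if $t >= $mod;           # __rsaz_512_subtract, mask-driven
    return $t;                          # == $a*$b * 2**-512 mod $mod
}
# ------------------------------------------------------------------------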
921{
922my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
923$code.=<<___;
924.globl rsaz_512_mul_gather4
925.type rsaz_512_mul_gather4,\@function,6
926.align 32
927rsaz_512_mul_gather4:
928.cfi_startproc
929 push %rbx
930.cfi_push %rbx
931 push %rbp
932.cfi_push %rbp
933 push %r12
934.cfi_push %r12
935 push %r13
936.cfi_push %r13
937 push %r14
938.cfi_push %r14
939 push %r15
940.cfi_push %r15
941
942 subq \$`128+24+($win64?0xb0:0)`, %rsp
943.cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)`
944___
945$code.=<<___ if ($win64);
946 movaps %xmm6,0xa0(%rsp)
947 movaps %xmm7,0xb0(%rsp)
948 movaps %xmm8,0xc0(%rsp)
949 movaps %xmm9,0xd0(%rsp)
950 movaps %xmm10,0xe0(%rsp)
951 movaps %xmm11,0xf0(%rsp)
952 movaps %xmm12,0x100(%rsp)
953 movaps %xmm13,0x110(%rsp)
954 movaps %xmm14,0x120(%rsp)
955 movaps %xmm15,0x130(%rsp)
956___
957$code.=<<___;
958.Lmul_gather4_body:
959 movd $pwr,%xmm8
960 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
961 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
962
963 pshufd \$0,%xmm8,%xmm8 # broadcast $power
964 movdqa %xmm1,%xmm7
965 movdqa %xmm1,%xmm2
966___
967########################################################################
968# calculate mask by comparing 0..15 to $power
969#
970for($i=0;$i<4;$i++) {
971$code.=<<___;
972 paddd %xmm`$i`,%xmm`$i+1`
973 pcmpeqd %xmm8,%xmm`$i`
974 movdqa %xmm7,%xmm`$i+3`
975___
976}
977for(;$i<7;$i++) {
978$code.=<<___;
979 paddd %xmm`$i`,%xmm`$i+1`
980 pcmpeqd %xmm8,%xmm`$i`
981___
982}
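# Editorial note: the paddd/pcmpeqd sequence just emitted builds eight
# 128-bit masks whose 32-bit lanes one-hot encode $power: across
# %xmm0..%xmm7 exactly the lane pair matching $power is all-ones.  The
# gather below therefore touches all 16 table entries on every pass and
# ANDs each against its mask, so the secret index never reaches the
# address bus (a cache-timing defence).  Hedged scalar model of the same
# selection ($tab/$pwr are illustrative names):
#   my $v = 0;  $v |= $tab[$_] & ($_ == $pwr ? ~0 : 0) for 0 .. 15;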
983$code.=<<___;
984 pcmpeqd %xmm8,%xmm7
985
986 movdqa 16*0($bp),%xmm8
987 movdqa 16*1($bp),%xmm9
988 movdqa 16*2($bp),%xmm10
989 movdqa 16*3($bp),%xmm11
990 pand %xmm0,%xmm8
991 movdqa 16*4($bp),%xmm12
992 pand %xmm1,%xmm9
993 movdqa 16*5($bp),%xmm13
994 pand %xmm2,%xmm10
995 movdqa 16*6($bp),%xmm14
996 pand %xmm3,%xmm11
997 movdqa 16*7($bp),%xmm15
998 leaq 128($bp), %rbp
999 pand %xmm4,%xmm12
1000 pand %xmm5,%xmm13
1001 pand %xmm6,%xmm14
1002 pand %xmm7,%xmm15
1003 por %xmm10,%xmm8
1004 por %xmm11,%xmm9
1005 por %xmm12,%xmm8
1006 por %xmm13,%xmm9
1007 por %xmm14,%xmm8
1008 por %xmm15,%xmm9
1009
1010 por %xmm9,%xmm8
1011 pshufd \$0x4e,%xmm8,%xmm9
1012 por %xmm9,%xmm8
1013___
1014$code.=<<___ if ($addx);
1015 movl \$0x80100,%r11d
1016 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1017 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
1018 je .Lmulx_gather
1019___
1020$code.=<<___;
1021 movq %xmm8,%rbx
1022
1023 movq $n0, 128(%rsp) # off-load arguments
1024 movq $out, 128+8(%rsp)
1025 movq $mod, 128+16(%rsp)
1026
1027 movq ($ap), %rax
1028 movq 8($ap), %rcx
1029 mulq %rbx # 0 iteration
1030 movq %rax, (%rsp)
1031 movq %rcx, %rax
1032 movq %rdx, %r8
1033
1034 mulq %rbx
1035 addq %rax, %r8
1036 movq 16($ap), %rax
1037 movq %rdx, %r9
1038 adcq \$0, %r9
1039
1040 mulq %rbx
1041 addq %rax, %r9
1042 movq 24($ap), %rax
1043 movq %rdx, %r10
1044 adcq \$0, %r10
1045
1046 mulq %rbx
1047 addq %rax, %r10
1048 movq 32($ap), %rax
1049 movq %rdx, %r11
1050 adcq \$0, %r11
1051
1052 mulq %rbx
1053 addq %rax, %r11
1054 movq 40($ap), %rax
1055 movq %rdx, %r12
1056 adcq \$0, %r12
1057
1058 mulq %rbx
1059 addq %rax, %r12
1060 movq 48($ap), %rax
1061 movq %rdx, %r13
1062 adcq \$0, %r13
1063
1064 mulq %rbx
1065 addq %rax, %r13
1066 movq 56($ap), %rax
1067 movq %rdx, %r14
1068 adcq \$0, %r14
1069
1070 mulq %rbx
1071 addq %rax, %r14
1072 movq ($ap), %rax
1073 movq %rdx, %r15
1074 adcq \$0, %r15
1075
1076 leaq 8(%rsp), %rdi
1077 movl \$7, %ecx
1078 jmp .Loop_mul_gather
1079
1080.align 32
1081.Loop_mul_gather:
1082 movdqa 16*0(%rbp),%xmm8
1083 movdqa 16*1(%rbp),%xmm9
1084 movdqa 16*2(%rbp),%xmm10
1085 movdqa 16*3(%rbp),%xmm11
1086 pand %xmm0,%xmm8
1087 movdqa 16*4(%rbp),%xmm12
1088 pand %xmm1,%xmm9
1089 movdqa 16*5(%rbp),%xmm13
1090 pand %xmm2,%xmm10
1091 movdqa 16*6(%rbp),%xmm14
1092 pand %xmm3,%xmm11
1093 movdqa 16*7(%rbp),%xmm15
1094 leaq 128(%rbp), %rbp
1095 pand %xmm4,%xmm12
1096 pand %xmm5,%xmm13
1097 pand %xmm6,%xmm14
1098 pand %xmm7,%xmm15
1099 por %xmm10,%xmm8
1100 por %xmm11,%xmm9
1101 por %xmm12,%xmm8
1102 por %xmm13,%xmm9
1103 por %xmm14,%xmm8
1104 por %xmm15,%xmm9
1105
1106 por %xmm9,%xmm8
1107 pshufd \$0x4e,%xmm8,%xmm9
1108 por %xmm9,%xmm8
1109 movq %xmm8,%rbx
1110
1111 mulq %rbx
1112 addq %rax, %r8
1113 movq 8($ap), %rax
1114 movq %r8, (%rdi)
1115 movq %rdx, %r8
1116 adcq \$0, %r8
1117
1118 mulq %rbx
1119 addq %rax, %r9
1120 movq 16($ap), %rax
1121 adcq \$0, %rdx
1122 addq %r9, %r8
1123 movq %rdx, %r9
1124 adcq \$0, %r9
1125
1126 mulq %rbx
1127 addq %rax, %r10
1128 movq 24($ap), %rax
1129 adcq \$0, %rdx
1130 addq %r10, %r9
1131 movq %rdx, %r10
1132 adcq \$0, %r10
1133
1134 mulq %rbx
1135 addq %rax, %r11
1136 movq 32($ap), %rax
1137 adcq \$0, %rdx
1138 addq %r11, %r10
1139 movq %rdx, %r11
1140 adcq \$0, %r11
1141
1142 mulq %rbx
1143 addq %rax, %r12
1144 movq 40($ap), %rax
1145 adcq \$0, %rdx
1146 addq %r12, %r11
1147 movq %rdx, %r12
1148 adcq \$0, %r12
1149
1150 mulq %rbx
1151 addq %rax, %r13
1152 movq 48($ap), %rax
1153 adcq \$0, %rdx
1154 addq %r13, %r12
1155 movq %rdx, %r13
1156 adcq \$0, %r13
1157
1158 mulq %rbx
1159 addq %rax, %r14
1160 movq 56($ap), %rax
1161 adcq \$0, %rdx
1162 addq %r14, %r13
1163 movq %rdx, %r14
1164 adcq \$0, %r14
1165
1166 mulq %rbx
1167 addq %rax, %r15
1168 movq ($ap), %rax
1169 adcq \$0, %rdx
1170 addq %r15, %r14
1171 movq %rdx, %r15
1172 adcq \$0, %r15
1173
1174 leaq 8(%rdi), %rdi
1175
1176 decl %ecx
1177 jnz .Loop_mul_gather
1178
1179 movq %r8, (%rdi)
1180 movq %r9, 8(%rdi)
1181 movq %r10, 16(%rdi)
1182 movq %r11, 24(%rdi)
1183 movq %r12, 32(%rdi)
1184 movq %r13, 40(%rdi)
1185 movq %r14, 48(%rdi)
1186 movq %r15, 56(%rdi)
1187
1188 movq 128+8(%rsp), $out
1189 movq 128+16(%rsp), %rbp
1190
1191 movq (%rsp), %r8
1192 movq 8(%rsp), %r9
1193 movq 16(%rsp), %r10
1194 movq 24(%rsp), %r11
1195 movq 32(%rsp), %r12
1196 movq 40(%rsp), %r13
1197 movq 48(%rsp), %r14
1198 movq 56(%rsp), %r15
1199
1200 call __rsaz_512_reduce
1201___
1202$code.=<<___ if ($addx);
1203 jmp .Lmul_gather_tail
1204
1205.align 32
1206.Lmulx_gather:
1207 movq %xmm8,%rdx
1208
1209 mov $n0, 128(%rsp) # off-load arguments
1210 mov $out, 128+8(%rsp)
1211 mov $mod, 128+16(%rsp)
1212
1213 mulx ($ap), %rbx, %r8 # 0 iteration
1214 mov %rbx, (%rsp)
1215 xor %edi, %edi # cf=0, of=0
1216
1217 mulx 8($ap), %rax, %r9
1218
1219 mulx 16($ap), %rbx, %r10
1220 adcx %rax, %r8
1221
1222 mulx 24($ap), %rax, %r11
1223 adcx %rbx, %r9
1224
1225 mulx 32($ap), %rbx, %r12
1226 adcx %rax, %r10
1227
1228 mulx 40($ap), %rax, %r13
1229 adcx %rbx, %r11
1230
1231 mulx 48($ap), %rbx, %r14
1232 adcx %rax, %r12
1233
1234 mulx 56($ap), %rax, %r15
1235 adcx %rbx, %r13
1236 adcx %rax, %r14
1237 .byte 0x67
1238 mov %r8, %rbx
1239 adcx %rdi, %r15 # %rdi is 0
1240
1241 mov \$-7, %rcx
1242 jmp .Loop_mulx_gather
1243
1244.align 32
1245.Loop_mulx_gather:
1246 movdqa 16*0(%rbp),%xmm8
1247 movdqa 16*1(%rbp),%xmm9
1248 movdqa 16*2(%rbp),%xmm10
1249 movdqa 16*3(%rbp),%xmm11
1250 pand %xmm0,%xmm8
1251 movdqa 16*4(%rbp),%xmm12
1252 pand %xmm1,%xmm9
1253 movdqa 16*5(%rbp),%xmm13
1254 pand %xmm2,%xmm10
1255 movdqa 16*6(%rbp),%xmm14
1256 pand %xmm3,%xmm11
1257 movdqa 16*7(%rbp),%xmm15
1258 leaq 128(%rbp), %rbp
1259 pand %xmm4,%xmm12
1260 pand %xmm5,%xmm13
1261 pand %xmm6,%xmm14
1262 pand %xmm7,%xmm15
1263 por %xmm10,%xmm8
1264 por %xmm11,%xmm9
1265 por %xmm12,%xmm8
1266 por %xmm13,%xmm9
1267 por %xmm14,%xmm8
1268 por %xmm15,%xmm9
1269
1270 por %xmm9,%xmm8
1271 pshufd \$0x4e,%xmm8,%xmm9
1272 por %xmm9,%xmm8
1273 movq %xmm8,%rdx
1274
1275 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
1276 adcx %rax, %rbx
1277 adox %r9, %r8
1278
1279 mulx 8($ap), %rax, %r9
1280 adcx %rax, %r8
1281 adox %r10, %r9
1282
1283 mulx 16($ap), %rax, %r10
1284 adcx %rax, %r9
1285 adox %r11, %r10
1286
1287 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1288 adcx %rax, %r10
1289 adox %r12, %r11
1290
1291 mulx 32($ap), %rax, %r12
1292 adcx %rax, %r11
1293 adox %r13, %r12
1294
1295 mulx 40($ap), %rax, %r13
1296 adcx %rax, %r12
1297 adox %r14, %r13
1298
1299 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1300 adcx %rax, %r13
1301 .byte 0x67
1302 adox %r15, %r14
1303
1304 mulx 56($ap), %rax, %r15
1305 mov %rbx, 64(%rsp,%rcx,8)
1306 adcx %rax, %r14
1307 adox %rdi, %r15
1308 mov %r8, %rbx
1309 adcx %rdi, %r15 # cf=0
1310
1311 inc %rcx # of=0
1312 jnz .Loop_mulx_gather
1313
1314 mov %r8, 64(%rsp)
1315 mov %r9, 64+8(%rsp)
1316 mov %r10, 64+16(%rsp)
1317 mov %r11, 64+24(%rsp)
1318 mov %r12, 64+32(%rsp)
1319 mov %r13, 64+40(%rsp)
1320 mov %r14, 64+48(%rsp)
1321 mov %r15, 64+56(%rsp)
1322
1323 mov 128(%rsp), %rdx # pull arguments
1324 mov 128+8(%rsp), $out
1325 mov 128+16(%rsp), %rbp
1326
1327 mov (%rsp), %r8
1328 mov 8(%rsp), %r9
1329 mov 16(%rsp), %r10
1330 mov 24(%rsp), %r11
1331 mov 32(%rsp), %r12
1332 mov 40(%rsp), %r13
1333 mov 48(%rsp), %r14
1334 mov 56(%rsp), %r15
1335
1336 call __rsaz_512_reducex
1337
1338.Lmul_gather_tail:
1339___
1340$code.=<<___;
1341 addq 64(%rsp), %r8
1342 adcq 72(%rsp), %r9
1343 adcq 80(%rsp), %r10
1344 adcq 88(%rsp), %r11
1345 adcq 96(%rsp), %r12
1346 adcq 104(%rsp), %r13
1347 adcq 112(%rsp), %r14
1348 adcq 120(%rsp), %r15
1349 sbbq %rcx, %rcx
1350
1351 call __rsaz_512_subtract
1352
1353 leaq 128+24+48(%rsp), %rax
1354___
1355$code.=<<___ if ($win64);
1356 movaps 0xa0-0xc8(%rax),%xmm6
1357 movaps 0xb0-0xc8(%rax),%xmm7
1358 movaps 0xc0-0xc8(%rax),%xmm8
1359 movaps 0xd0-0xc8(%rax),%xmm9
1360 movaps 0xe0-0xc8(%rax),%xmm10
1361 movaps 0xf0-0xc8(%rax),%xmm11
1362 movaps 0x100-0xc8(%rax),%xmm12
1363 movaps 0x110-0xc8(%rax),%xmm13
1364 movaps 0x120-0xc8(%rax),%xmm14
1365 movaps 0x130-0xc8(%rax),%xmm15
1366 lea 0xb0(%rax),%rax
1367___
1368$code.=<<___;
1369.cfi_def_cfa %rax,8
1370 movq -48(%rax), %r15
1371.cfi_restore %r15
1372 movq -40(%rax), %r14
1373.cfi_restore %r14
1374 movq -32(%rax), %r13
1375.cfi_restore %r13
1376 movq -24(%rax), %r12
1377.cfi_restore %r12
1378 movq -16(%rax), %rbp
1379.cfi_restore %rbp
1380 movq -8(%rax), %rbx
1381.cfi_restore %rbx
1382 leaq (%rax), %rsp
1383.cfi_def_cfa_register %rsp
1384.Lmul_gather4_epilogue:
1385 ret
1386.cfi_endproc
1387.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1388___
1389}
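# Editorial note: rsaz_512_mul_gather4 fuses the constant-time gather with
# the multiplication; every one of the eight passes re-runs the masked
# 16-way selection to fetch the next 64-bit word of the gathered operand,
# so the exponent-dependent table index stays out of the address stream
# for the entire multiply.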
1390{
1391my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1392$code.=<<___;
1393.globl rsaz_512_mul_scatter4
1394.type rsaz_512_mul_scatter4,\@function,6
1395.align 32
1396rsaz_512_mul_scatter4:
1397.cfi_startproc
1398 push %rbx
1399.cfi_push %rbx
1400 push %rbp
1401.cfi_push %rbp
1402 push %r12
1403.cfi_push %r12
1404 push %r13
1405.cfi_push %r13
1406 push %r14
1407.cfi_push %r14
1408 push %r15
1409.cfi_push %r15
1410
1411 mov $pwr, $pwr # zero-extend $pwr: writing the 32-bit register clears the upper half of %r9
1412 subq \$128+24, %rsp
1413.cfi_adjust_cfa_offset 128+24
1414.Lmul_scatter4_body:
1415 leaq ($tbl,$pwr,8), $tbl
1416 movq $out, %xmm0 # off-load arguments
1417 movq $mod, %xmm1
1418 movq $tbl, %xmm2
1419 movq $n0, 128(%rsp)
1420
1421 movq $out, %rbp
1422___
1423$code.=<<___ if ($addx);
1424 movl \$0x80100,%r11d
1425 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1426 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
1427 je .Lmulx_scatter
1428___
1429$code.=<<___;
1430 movq ($out),%rbx # pass b[0]
1431 call __rsaz_512_mul
1432
1433 movq %xmm0, $out
1434 movq %xmm1, %rbp
1435
1436 movq (%rsp), %r8
1437 movq 8(%rsp), %r9
1438 movq 16(%rsp), %r10
1439 movq 24(%rsp), %r11
1440 movq 32(%rsp), %r12
1441 movq 40(%rsp), %r13
1442 movq 48(%rsp), %r14
1443 movq 56(%rsp), %r15
1444
1445 call __rsaz_512_reduce
1446___
1447$code.=<<___ if ($addx);
1448 jmp .Lmul_scatter_tail
1449
1450.align 32
1451.Lmulx_scatter:
1452 movq ($out), %rdx # pass b[0]
1453 call __rsaz_512_mulx
1454
1455 movq %xmm0, $out
1456 movq %xmm1, %rbp
1457
1458 movq 128(%rsp), %rdx # pull $n0
1459 movq (%rsp), %r8
1460 movq 8(%rsp), %r9
1461 movq 16(%rsp), %r10
1462 movq 24(%rsp), %r11
1463 movq 32(%rsp), %r12
1464 movq 40(%rsp), %r13
1465 movq 48(%rsp), %r14
1466 movq 56(%rsp), %r15
1467
1468 call __rsaz_512_reducex
1469
1470.Lmul_scatter_tail:
1471___
1472$code.=<<___;
1473 addq 64(%rsp), %r8
1474 adcq 72(%rsp), %r9
1475 adcq 80(%rsp), %r10
1476 adcq 88(%rsp), %r11
1477 adcq 96(%rsp), %r12
1478 adcq 104(%rsp), %r13
1479 adcq 112(%rsp), %r14
1480 adcq 120(%rsp), %r15
1481 movq %xmm2, $inp
1482 sbbq %rcx, %rcx
1483
1484 call __rsaz_512_subtract
1485
1486 movq %r8, 128*0($inp) # scatter
1487 movq %r9, 128*1($inp)
1488 movq %r10, 128*2($inp)
1489 movq %r11, 128*3($inp)
1490 movq %r12, 128*4($inp)
1491 movq %r13, 128*5($inp)
1492 movq %r14, 128*6($inp)
1493 movq %r15, 128*7($inp)
1494
1495 leaq 128+24+48(%rsp), %rax
1496.cfi_def_cfa %rax,8
1497 movq -48(%rax), %r15
1498.cfi_restore %r15
1499 movq -40(%rax), %r14
1500.cfi_restore %r14
1501 movq -32(%rax), %r13
1502.cfi_restore %r13
1503 movq -24(%rax), %r12
1504.cfi_restore %r12
1505 movq -16(%rax), %rbp
1506.cfi_restore %rbp
1507 movq -8(%rax), %rbx
1508.cfi_restore %rbx
1509 leaq (%rax), %rsp
1510.cfi_def_cfa_register %rsp
1511.Lmul_scatter4_epilogue:
1512 ret
1513.cfi_endproc
1514.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1515___
1516}
1517{
1518my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1519$code.=<<___;
1520.globl rsaz_512_mul_by_one
1521.type rsaz_512_mul_by_one,\@function,4
1522.align 32
1523rsaz_512_mul_by_one:
1524.cfi_startproc
1525 push %rbx
1526.cfi_push %rbx
1527 push %rbp
1528.cfi_push %rbp
1529 push %r12
1530.cfi_push %r12
1531 push %r13
1532.cfi_push %r13
1533 push %r14
1534.cfi_push %r14
1535 push %r15
1536.cfi_push %r15
1537
1538 subq \$128+24, %rsp
1539.cfi_adjust_cfa_offset 128+24
1540.Lmul_by_one_body:
1541___
1542$code.=<<___ if ($addx);
1543 movl OPENSSL_ia32cap_P+8(%rip),%eax
1544___
1545$code.=<<___;
1546 movq $mod, %rbp # reassign argument
1547 movq $n0, 128(%rsp)
1548
1549 movq ($inp), %r8
1550 pxor %xmm0, %xmm0
1551 movq 8($inp), %r9
1552 movq 16($inp), %r10
1553 movq 24($inp), %r11
1554 movq 32($inp), %r12
1555 movq 40($inp), %r13
1556 movq 48($inp), %r14
1557 movq 56($inp), %r15
1558
1559 movdqa %xmm0, (%rsp)
1560 movdqa %xmm0, 16(%rsp)
1561 movdqa %xmm0, 32(%rsp)
1562 movdqa %xmm0, 48(%rsp)
1563 movdqa %xmm0, 64(%rsp)
1564 movdqa %xmm0, 80(%rsp)
1565 movdqa %xmm0, 96(%rsp)
1566___
1567$code.=<<___ if ($addx);
1568 andl \$0x80100,%eax
1569 cmpl \$0x80100,%eax # check for MULX and ADCX/ADOX
1570 je .Lby_one_callx
1571___
1572$code.=<<___;
1573 call __rsaz_512_reduce
1574___
1575$code.=<<___ if ($addx);
1576 jmp .Lby_one_tail
1577.align 32
1578.Lby_one_callx:
1579 movq 128(%rsp), %rdx # pull $n0
1580 call __rsaz_512_reducex
1581.Lby_one_tail:
1582___
1583$code.=<<___;
1584 movq %r8, ($out)
1585 movq %r9, 8($out)
1586 movq %r10, 16($out)
1587 movq %r11, 24($out)
1588 movq %r12, 32($out)
1589 movq %r13, 40($out)
1590 movq %r14, 48($out)
1591 movq %r15, 56($out)
1592
1593 leaq 128+24+48(%rsp), %rax
1594.cfi_def_cfa %rax,8
1595 movq -48(%rax), %r15
1596.cfi_restore %r15
1597 movq -40(%rax), %r14
1598.cfi_restore %r14
1599 movq -32(%rax), %r13
1600.cfi_restore %r13
1601 movq -24(%rax), %r12
1602.cfi_restore %r12
1603 movq -16(%rax), %rbp
1604.cfi_restore %rbp
1605 movq -8(%rax), %rbx
1606.cfi_restore %rbx
1607 leaq (%rax), %rsp
1608.cfi_def_cfa_register %rsp
1609.Lmul_by_one_epilogue:
1610 ret
1611.cfi_endproc
1612.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1613___
1614}
1615{ # __rsaz_512_reduce
1616 #
1617 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1618 # output: %r8-%r15
1619 # clobbers: everything except %rbp and %rdi
1620$code.=<<___;
1621.type __rsaz_512_reduce,\@abi-omnipotent
1622.align 32
1623__rsaz_512_reduce:
1624.cfi_startproc
1625 movq %r8, %rbx
1626 imulq 128+8(%rsp), %rbx # m = t[0]*n0; n0 is at the caller's 128(%rsp), +8 for our return address
1627 movq 0(%rbp), %rax
1628 movl \$8, %ecx
1629 jmp .Lreduction_loop
1630
1631.align 32
1632.Lreduction_loop:
1633 mulq %rbx
1634 movq 8(%rbp), %rax
1635 negq %r8
1636 movq %rdx, %r8
1637 adcq \$0, %r8
1638
1639 mulq %rbx
1640 addq %rax, %r9
1641 movq 16(%rbp), %rax
1642 adcq \$0, %rdx
1643 addq %r9, %r8
1644 movq %rdx, %r9
1645 adcq \$0, %r9
1646
1647 mulq %rbx
1648 addq %rax, %r10
1649 movq 24(%rbp), %rax
1650 adcq \$0, %rdx
1651 addq %r10, %r9
1652 movq %rdx, %r10
1653 adcq \$0, %r10
1654
1655 mulq %rbx
1656 addq %rax, %r11
1657 movq 32(%rbp), %rax
1658 adcq \$0, %rdx
1659 addq %r11, %r10
1660 movq 128+8(%rsp), %rsi
1661 #movq %rdx, %r11
1662 #adcq \$0, %r11
1663 adcq \$0, %rdx
1664 movq %rdx, %r11
1665
1666 mulq %rbx
1667 addq %rax, %r12
1668 movq 40(%rbp), %rax
1669 adcq \$0, %rdx
1670 imulq %r8, %rsi
1671 addq %r12, %r11
1672 movq %rdx, %r12
1673 adcq \$0, %r12
1674
1675 mulq %rbx
1676 addq %rax, %r13
1677 movq 48(%rbp), %rax
1678 adcq \$0, %rdx
1679 addq %r13, %r12
1680 movq %rdx, %r13
1681 adcq \$0, %r13
1682
1683 mulq %rbx
1684 addq %rax, %r14
1685 movq 56(%rbp), %rax
1686 adcq \$0, %rdx
1687 addq %r14, %r13
1688 movq %rdx, %r14
1689 adcq \$0, %r14
1690
1691 mulq %rbx
1692 movq %rsi, %rbx
1693 addq %rax, %r15
1694 movq 0(%rbp), %rax
1695 adcq \$0, %rdx
1696 addq %r15, %r14
1697 movq %rdx, %r15
1698 adcq \$0, %r15
1699
1700 decl %ecx
1701 jne .Lreduction_loop
1702
1703 ret
1704.cfi_endproc
1705.size __rsaz_512_reduce,.-__rsaz_512_reduce
1706___
1707}
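# Editorial note: each .Lreduction_loop round above is one step of the
# word-wise Montgomery reduction modelled in model_mont_mul(): %rbx holds
# m = t[0]*n0 mod 2^64, the round adds m*mod so the lowest accumulator
# word cancels, and the register window %r8-%r15 shifts down one word.
# The next round's m (%rsi) is computed early with imulq, as soon as the
# new low word settles, hiding its latency under the mulq chain.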
1708if ($addx) {
1709 # __rsaz_512_reducex
1710 #
1711 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1712 # output: %r8-%r15
1713 # clobbers: everything except %rbp and %rdi
1714$code.=<<___;
1715.type __rsaz_512_reducex,\@abi-omnipotent
1716.align 32
1717__rsaz_512_reducex:
1718.cfi_startproc
1719 #movq 128+8(%rsp), %rdx # pull $n0
1720 imulq %r8, %rdx
1721 xorq %rsi, %rsi # cf=0,of=0
1722 movl \$8, %ecx
1723 jmp .Lreduction_loopx
1724
1725.align 32
1726.Lreduction_loopx:
1727 mov %r8, %rbx
1728 mulx 0(%rbp), %rax, %r8
1729 adcx %rbx, %rax
1730 adox %r9, %r8
1731
1732 mulx 8(%rbp), %rax, %r9
1733 adcx %rax, %r8
1734 adox %r10, %r9
1735
1736 mulx 16(%rbp), %rbx, %r10
1737 adcx %rbx, %r9
1738 adox %r11, %r10
1739
1740 mulx 24(%rbp), %rbx, %r11
1741 adcx %rbx, %r10
1742 adox %r12, %r11
1743
1744 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1745 mov %rdx, %rax
1746 mov %r8, %rdx
1747 adcx %rbx, %r11
1748 adox %r13, %r12
1749
1750 mulx 128+8(%rsp), %rbx, %rdx
1751 mov %rax, %rdx
1752
1753 mulx 40(%rbp), %rax, %r13
1754 adcx %rax, %r12
1755 adox %r14, %r13
1756
1757 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1758 adcx %rax, %r13
1759 adox %r15, %r14
1760
1761 mulx 56(%rbp), %rax, %r15
1762 mov %rbx, %rdx
1763 adcx %rax, %r14
1764 adox %rsi, %r15 # %rsi is 0
1765 adcx %rsi, %r15 # cf=0
1766
1767 decl %ecx # of=0
1768 jne .Lreduction_loopx
1769
1770 ret
1771.cfi_endproc
1772.size __rsaz_512_reducex,.-__rsaz_512_reducex
1773___
1774}
1775{ # __rsaz_512_subtract
1776 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1777 # output:
1778 # clobbers: everything but %rdi, %rsi and %rbp
1779$code.=<<___;
1780.type __rsaz_512_subtract,\@abi-omnipotent
1781.align 32
1782__rsaz_512_subtract:
1783.cfi_startproc
1784 movq %r8, ($out)
1785 movq %r9, 8($out)
1786 movq %r10, 16($out)
1787 movq %r11, 24($out)
1788 movq %r12, 32($out)
1789 movq %r13, 40($out)
1790 movq %r14, 48($out)
1791 movq %r15, 56($out)
1792
1793 movq 0($mod), %r8
1794 movq 8($mod), %r9
1795 negq %r8
1796 notq %r9
1797 andq %rcx, %r8
1798 movq 16($mod), %r10
1799 andq %rcx, %r9
1800 notq %r10
1801 movq 24($mod), %r11
1802 andq %rcx, %r10
1803 notq %r11
1804 movq 32($mod), %r12
1805 andq %rcx, %r11
1806 notq %r12
1807 movq 40($mod), %r13
1808 andq %rcx, %r12
1809 notq %r13
1810 movq 48($mod), %r14
1811 andq %rcx, %r13
1812 notq %r14
1813 movq 56($mod), %r15
1814 andq %rcx, %r14
1815 notq %r15
1816 andq %rcx, %r15
1817
1818 addq ($out), %r8
1819 adcq 8($out), %r9
1820 adcq 16($out), %r10
1821 adcq 24($out), %r11
1822 adcq 32($out), %r12
1823 adcq 40($out), %r13
1824 adcq 48($out), %r14
1825 adcq 56($out), %r15
1826
1827 movq %r8, ($out)
1828 movq %r9, 8($out)
1829 movq %r10, 16($out)
1830 movq %r11, 24($out)
1831 movq %r12, 32($out)
1832 movq %r13, 40($out)
1833 movq %r14, 48($out)
1834 movq %r15, 56($out)
1835
1836 ret
1837.cfi_endproc
1838.size __rsaz_512_subtract,.-__rsaz_512_subtract
1839___
1840}
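# Editorial note: %rcx arrives as 0 or all-ones (sbbq %rcx,%rcx in the
# callers), so the neg/not/and chain above materialises either 0 or the
# two's-complement -mod limb by limb (-x == ~x + 1, and the +1 stays in
# the lowest limb because an RSA modulus is odd).  Adding that value back
# subtracts mod exactly when the caller's 512-bit add carried out, with
# no secret-dependent branch.  Hedged scalar model:
#   $r = ($r + ($mask ? $two512 - $mod : 0)) % $two512;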
1841{ # __rsaz_512_mul
1842 #
1843 # input: %rsi - ap, %rbp - bp
1844 # output:
1845 # clobbers: everything
1846my ($ap,$bp) = ("%rsi","%rbp");
1847$code.=<<___;
1848.type __rsaz_512_mul,\@abi-omnipotent
1849.align 32
1850__rsaz_512_mul:
1851.cfi_startproc
1852 leaq 8(%rsp), %rdi # product goes to the caller's (%rsp); our (%rsp) holds the return address
1853
1854 movq ($ap), %rax
1855 mulq %rbx
1856 movq %rax, (%rdi)
1857 movq 8($ap), %rax
1858 movq %rdx, %r8
1859
1860 mulq %rbx
1861 addq %rax, %r8
1862 movq 16($ap), %rax
1863 movq %rdx, %r9
1864 adcq \$0, %r9
1865
1866 mulq %rbx
1867 addq %rax, %r9
1868 movq 24($ap), %rax
1869 movq %rdx, %r10
1870 adcq \$0, %r10
1871
1872 mulq %rbx
1873 addq %rax, %r10
1874 movq 32($ap), %rax
1875 movq %rdx, %r11
1876 adcq \$0, %r11
1877
1878 mulq %rbx
1879 addq %rax, %r11
1880 movq 40($ap), %rax
1881 movq %rdx, %r12
1882 adcq \$0, %r12
1883
1884 mulq %rbx
1885 addq %rax, %r12
1886 movq 48($ap), %rax
1887 movq %rdx, %r13
1888 adcq \$0, %r13
1889
1890 mulq %rbx
1891 addq %rax, %r13
1892 movq 56($ap), %rax
1893 movq %rdx, %r14
1894 adcq \$0, %r14
1895
1896 mulq %rbx
1897 addq %rax, %r14
1898 movq ($ap), %rax
1899 movq %rdx, %r15
1900 adcq \$0, %r15
1901
1902 leaq 8($bp), $bp
1903 leaq 8(%rdi), %rdi
1904
1905 movl \$7, %ecx
1906 jmp .Loop_mul
1907
1908.align 32
1909.Loop_mul:
1910 movq ($bp), %rbx
1911 mulq %rbx
1912 addq %rax, %r8
1913 movq 8($ap), %rax
1914 movq %r8, (%rdi)
1915 movq %rdx, %r8
1916 adcq \$0, %r8
1917
1918 mulq %rbx
1919 addq %rax, %r9
1920 movq 16($ap), %rax
1921 adcq \$0, %rdx
1922 addq %r9, %r8
1923 movq %rdx, %r9
1924 adcq \$0, %r9
1925
1926 mulq %rbx
1927 addq %rax, %r10
1928 movq 24($ap), %rax
1929 adcq \$0, %rdx
1930 addq %r10, %r9
1931 movq %rdx, %r10
1932 adcq \$0, %r10
1933
1934 mulq %rbx
1935 addq %rax, %r11
1936 movq 32($ap), %rax
1937 adcq \$0, %rdx
1938 addq %r11, %r10
1939 movq %rdx, %r11
1940 adcq \$0, %r11
1941
1942 mulq %rbx
1943 addq %rax, %r12
1944 movq 40($ap), %rax
1945 adcq \$0, %rdx
1946 addq %r12, %r11
1947 movq %rdx, %r12
1948 adcq \$0, %r12
1949
1950 mulq %rbx
1951 addq %rax, %r13
1952 movq 48($ap), %rax
1953 adcq \$0, %rdx
1954 addq %r13, %r12
1955 movq %rdx, %r13
1956 adcq \$0, %r13
1957
1958 mulq %rbx
1959 addq %rax, %r14
1960 movq 56($ap), %rax
1961 adcq \$0, %rdx
1962 addq %r14, %r13
1963 movq %rdx, %r14
1964 leaq 8($bp), $bp
1965 adcq \$0, %r14
1966
1967 mulq %rbx
1968 addq %rax, %r15
1969 movq ($ap), %rax
1970 adcq \$0, %rdx
1971 addq %r15, %r14
1972 movq %rdx, %r15
1973 adcq \$0, %r15
1974
1975 leaq 8(%rdi), %rdi
1976
1977 decl %ecx
1978 jnz .Loop_mul
1979
1980 movq %r8, (%rdi)
1981 movq %r9, 8(%rdi)
1982 movq %r10, 16(%rdi)
1983 movq %r11, 24(%rdi)
1984 movq %r12, 32(%rdi)
1985 movq %r13, 40(%rdi)
1986 movq %r14, 48(%rdi)
1987 movq %r15, 56(%rdi)
1988
1989 ret
1990.cfi_endproc
1991.size __rsaz_512_mul,.-__rsaz_512_mul
1992___
1993}
1994if ($addx) {
1995 # __rsaz_512_mulx
1996 #
1997 # input: %rsi - ap, %rbp - bp
1998 # output:
1999 # clobbers: everything
2000my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
2001$code.=<<___;
2002.type __rsaz_512_mulx,\@abi-omnipotent
2003.align 32
2004__rsaz_512_mulx:
2005.cfi_startproc
2006 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
2007 mov \$-6, %rcx
2008
2009 mulx 8($ap), %rax, %r9
2010 movq %rbx, 8(%rsp)
2011
2012 mulx 16($ap), %rbx, %r10
2013 adc %rax, %r8
2014
2015 mulx 24($ap), %rax, %r11
2016 adc %rbx, %r9
2017
2018 mulx 32($ap), %rbx, %r12
2019 adc %rax, %r10
2020
2021 mulx 40($ap), %rax, %r13
2022 adc %rbx, %r11
2023
2024 mulx 48($ap), %rbx, %r14
2025 adc %rax, %r12
2026
2027 mulx 56($ap), %rax, %r15
2028 mov 8($bp), %rdx
2029 adc %rbx, %r13
2030 adc %rax, %r14
2031 adc \$0, %r15
2032
2033 xor $zero, $zero # cf=0,of=0
2034 jmp .Loop_mulx
2035
2036.align 32
2037.Loop_mulx:
2038 movq %r8, %rbx
2039 mulx ($ap), %rax, %r8
2040 adcx %rax, %rbx
2041 adox %r9, %r8
2042
2043 mulx 8($ap), %rax, %r9
2044 adcx %rax, %r8
2045 adox %r10, %r9
2046
2047 mulx 16($ap), %rax, %r10
2048 adcx %rax, %r9
2049 adox %r11, %r10
2050
2051 mulx 24($ap), %rax, %r11
2052 adcx %rax, %r10
2053 adox %r12, %r11
2054
2055 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
2056 adcx %rax, %r11
2057 adox %r13, %r12
2058
2059 mulx 40($ap), %rax, %r13
2060 adcx %rax, %r12
2061 adox %r14, %r13
2062
2063 mulx 48($ap), %rax, %r14
2064 adcx %rax, %r13
2065 adox %r15, %r14
2066
2067 mulx 56($ap), %rax, %r15
2068 movq 64($bp,%rcx,8), %rdx
2069 movq %rbx, 8+64-8(%rsp,%rcx,8)
2070 adcx %rax, %r14
2071 adox $zero, %r15
2072 adcx $zero, %r15 # cf=0
2073
2074 inc %rcx # of=0
2075 jnz .Loop_mulx
2076
2077 movq %r8, %rbx
2078 mulx ($ap), %rax, %r8
2079 adcx %rax, %rbx
2080 adox %r9, %r8
2081
2082 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
2083 adcx %rax, %r8
2084 adox %r10, %r9
2085
2086 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
2087 adcx %rax, %r9
2088 adox %r11, %r10
2089
2090 mulx 24($ap), %rax, %r11
2091 adcx %rax, %r10
2092 adox %r12, %r11
2093
2094 mulx 32($ap), %rax, %r12
2095 adcx %rax, %r11
2096 adox %r13, %r12
2097
2098 mulx 40($ap), %rax, %r13
2099 adcx %rax, %r12
2100 adox %r14, %r13
2101
2102 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
2103 adcx %rax, %r13
2104 adox %r15, %r14
2105
2106 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
2107 adcx %rax, %r14
2108 adox $zero, %r15
2109 adcx $zero, %r15
2110
2111 mov %rbx, 8+64-8(%rsp)
2112 mov %r8, 8+64(%rsp)
2113 mov %r9, 8+64+8(%rsp)
2114 mov %r10, 8+64+16(%rsp)
2115 mov %r11, 8+64+24(%rsp)
2116 mov %r12, 8+64+32(%rsp)
2117 mov %r13, 8+64+40(%rsp)
2118 mov %r14, 8+64+48(%rsp)
2119 mov %r15, 8+64+56(%rsp)
2120
2121 ret
2122.cfi_endproc
2123.size __rsaz_512_mulx,.-__rsaz_512_mulx
2124___
2125}
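# Editorial note: the mulx path above runs two architecturally independent
# carry chains -- MULX writes its product without touching flags, ADCX adds
# through CF only and ADOX through OF only -- so the products and the two
# running-sum additions interleave without serialising on a single carry
# flag.  That is the source of the speedup over the mul/adc path on
# ADX-capable cores.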
2126{
2127my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2128$code.=<<___;
2129.globl rsaz_512_scatter4
2130.type rsaz_512_scatter4,\@abi-omnipotent
2131.align 16
2132rsaz_512_scatter4:
2133.cfi_startproc
2134 leaq ($out,$power,8), $out
2135 movl \$8, %r9d
2136 jmp .Loop_scatter
2137.align 16
2138.Loop_scatter:
2139 movq ($inp), %rax
2140 leaq 8($inp), $inp
2141 movq %rax, ($out)
2142 leaq 128($out), $out
2143 decl %r9d
2144 jnz .Loop_scatter
2145 ret
2146.cfi_endproc
2147.size rsaz_512_scatter4,.-rsaz_512_scatter4
2148
2149.globl rsaz_512_gather4
2150.type rsaz_512_gather4,\@abi-omnipotent
2151.align 16
2152rsaz_512_gather4:
2153.cfi_startproc
2154___
2155$code.=<<___ if ($win64);
2156.LSEH_begin_rsaz_512_gather4:
2157 .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
2158 .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
2159 .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
2160 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
2161 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
2162 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
2163 .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
2164 .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
2165 .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
2166 .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
2167 .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
2168___
2169$code.=<<___;
2170 movd $power,%xmm8
2171 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
2172 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
2173
2174 pshufd \$0,%xmm8,%xmm8 # broadcast $power
2175 movdqa %xmm1,%xmm7
2176 movdqa %xmm1,%xmm2
2177___
2178########################################################################
2179# calculate mask by comparing 0..15 to $power
2180#
2181for($i=0;$i<4;$i++) {
2182$code.=<<___;
2183 paddd %xmm`$i`,%xmm`$i+1`
2184 pcmpeqd %xmm8,%xmm`$i`
2185 movdqa %xmm7,%xmm`$i+3`
2186___
2187}
2188for(;$i<7;$i++) {
2189$code.=<<___;
2190 paddd %xmm`$i`,%xmm`$i+1`
2191 pcmpeqd %xmm8,%xmm`$i`
2192___
2193}
2194$code.=<<___;
2195 pcmpeqd %xmm8,%xmm7
2196 movl \$8, %r9d
2197 jmp .Loop_gather
2198.align 16
2199.Loop_gather:
2200 movdqa 16*0($inp),%xmm8
2201 movdqa 16*1($inp),%xmm9
2202 movdqa 16*2($inp),%xmm10
2203 movdqa 16*3($inp),%xmm11
2204 pand %xmm0,%xmm8
2205 movdqa 16*4($inp),%xmm12
2206 pand %xmm1,%xmm9
2207 movdqa 16*5($inp),%xmm13
2208 pand %xmm2,%xmm10
2209 movdqa 16*6($inp),%xmm14
2210 pand %xmm3,%xmm11
2211 movdqa 16*7($inp),%xmm15
2212 leaq 128($inp), $inp
2213 pand %xmm4,%xmm12
2214 pand %xmm5,%xmm13
2215 pand %xmm6,%xmm14
2216 pand %xmm7,%xmm15
2217 por %xmm10,%xmm8
2218 por %xmm11,%xmm9
2219 por %xmm12,%xmm8
2220 por %xmm13,%xmm9
2221 por %xmm14,%xmm8
2222 por %xmm15,%xmm9
2223
2224 por %xmm9,%xmm8
2225 pshufd \$0x4e,%xmm8,%xmm9
2226 por %xmm9,%xmm8
2227 movq %xmm8,($out)
2228 leaq 8($out), $out
2229 decl %r9d
2230 jnz .Loop_gather
2231___
2232$code.=<<___ if ($win64);
2233 movaps 0x00(%rsp),%xmm6
2234 movaps 0x10(%rsp),%xmm7
2235 movaps 0x20(%rsp),%xmm8
2236 movaps 0x30(%rsp),%xmm9
2237 movaps 0x40(%rsp),%xmm10
2238 movaps 0x50(%rsp),%xmm11
2239 movaps 0x60(%rsp),%xmm12
2240 movaps 0x70(%rsp),%xmm13
2241 movaps 0x80(%rsp),%xmm14
2242 movaps 0x90(%rsp),%xmm15
2243 add \$0xa8,%rsp
2244___
2245$code.=<<___;
2246 ret
2247.LSEH_end_rsaz_512_gather4:
2248.cfi_endproc
2249.size rsaz_512_gather4,.-rsaz_512_gather4
2250
2251.align 64
2252.Linc:
2253 .long 0,0, 1,1
2254 .long 2,2, 2,2
2255___
2256}
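# Editorial note: rsaz_512_scatter4 stores limb i of table entry p at byte
# offset 8*p + 128*i, i.e. the 16 entries are interleaved limb by limb.  A
# 64-byte cache line thus holds the same limb of 8 different entries, and
# rsaz_512_gather4 sweeps the whole 1KB table on every use, so the selected
# entry is not observable at cache-line granularity.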
2257
2258# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2259# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2260if ($win64) {
2261$rec="%rcx";
2262$frame="%rdx";
2263$context="%r8";
2264$disp="%r9";
2265
2266$code.=<<___;
2267.extern __imp_RtlVirtualUnwind
2268.type se_handler,\@abi-omnipotent
2269.align 16
2270se_handler:
2271 push %rsi
2272 push %rdi
2273 push %rbx
2274 push %rbp
2275 push %r12
2276 push %r13
2277 push %r14
2278 push %r15
2279 pushfq
2280 sub \$64,%rsp
2281
2282 mov 120($context),%rax # pull context->Rax
2283 mov 248($context),%rbx # pull context->Rip
2284
2285 mov 8($disp),%rsi # disp->ImageBase
2286 mov 56($disp),%r11 # disp->HandlerData
2287
2288 mov 0(%r11),%r10d # HandlerData[0]
2289 lea (%rsi,%r10),%r10 # end of prologue label
2290 cmp %r10,%rbx # context->Rip<end of prologue label
2291 jb .Lcommon_seh_tail
2292
2293 mov 152($context),%rax # pull context->Rsp
2294
2295 mov 4(%r11),%r10d # HandlerData[1]
2296 lea (%rsi,%r10),%r10 # epilogue label
2297 cmp %r10,%rbx # context->Rip>=epilogue label
2298 jae .Lcommon_seh_tail
2299
2300 lea 128+24+48(%rax),%rax
2301
2302 lea .Lmul_gather4_epilogue(%rip),%rbx
2303 cmp %r10,%rbx
2304 jne .Lse_not_in_mul_gather4
2305
2306 lea 0xb0(%rax),%rax
2307
2308 lea -48-0xa8(%rax),%rsi
2309 lea 512($context),%rdi
2310 mov \$20,%ecx
2311 .long 0xa548f3fc # cld; rep movsq
2312
2313.Lse_not_in_mul_gather4:
2314 mov -8(%rax),%rbx
2315 mov -16(%rax),%rbp
2316 mov -24(%rax),%r12
2317 mov -32(%rax),%r13
2318 mov -40(%rax),%r14
2319 mov -48(%rax),%r15
2320 mov %rbx,144($context) # restore context->Rbx
2321 mov %rbp,160($context) # restore context->Rbp
2322 mov %r12,216($context) # restore context->R12
2323 mov %r13,224($context) # restore context->R13
2324 mov %r14,232($context) # restore context->R14
2325 mov %r15,240($context) # restore context->R15
2326
2327.Lcommon_seh_tail:
2328 mov 8(%rax),%rdi
2329 mov 16(%rax),%rsi
2330 mov %rax,152($context) # restore context->Rsp
2331 mov %rsi,168($context) # restore context->Rsi
2332 mov %rdi,176($context) # restore context->Rdi
2333
2334 mov 40($disp),%rdi # disp->ContextRecord
2335 mov $context,%rsi # context
2336 mov \$154,%ecx # sizeof(CONTEXT)
2337 .long 0xa548f3fc # cld; rep movsq
2338
2339 mov $disp,%rsi
2340 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2341 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2342 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2343 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2344 mov 40(%rsi),%r10 # disp->ContextRecord
2345 lea 56(%rsi),%r11 # &disp->HandlerData
2346 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2347 mov %r10,32(%rsp) # arg5
2348 mov %r11,40(%rsp) # arg6
2349 mov %r12,48(%rsp) # arg7
2350 mov %rcx,56(%rsp) # arg8, (NULL)
2351 call *__imp_RtlVirtualUnwind(%rip)
2352
2353 mov \$1,%eax # ExceptionContinueSearch
2354 add \$64,%rsp
2355 popfq
2356 pop %r15
2357 pop %r14
2358 pop %r13
2359 pop %r12
2360 pop %rbp
2361 pop %rbx
2362 pop %rdi
2363 pop %rsi
2364 ret
2365.size se_handler,.-se_handler
2366
2367.section .pdata
2368.align 4
2369 .rva .LSEH_begin_rsaz_512_sqr
2370 .rva .LSEH_end_rsaz_512_sqr
2371 .rva .LSEH_info_rsaz_512_sqr
2372
2373 .rva .LSEH_begin_rsaz_512_mul
2374 .rva .LSEH_end_rsaz_512_mul
2375 .rva .LSEH_info_rsaz_512_mul
2376
2377 .rva .LSEH_begin_rsaz_512_mul_gather4
2378 .rva .LSEH_end_rsaz_512_mul_gather4
2379 .rva .LSEH_info_rsaz_512_mul_gather4
2380
2381 .rva .LSEH_begin_rsaz_512_mul_scatter4
2382 .rva .LSEH_end_rsaz_512_mul_scatter4
2383 .rva .LSEH_info_rsaz_512_mul_scatter4
2384
2385 .rva .LSEH_begin_rsaz_512_mul_by_one
2386 .rva .LSEH_end_rsaz_512_mul_by_one
2387 .rva .LSEH_info_rsaz_512_mul_by_one
2388
2389 .rva .LSEH_begin_rsaz_512_gather4
2390 .rva .LSEH_end_rsaz_512_gather4
2391 .rva .LSEH_info_rsaz_512_gather4
2392
2393.section .xdata
2394.align 8
2395.LSEH_info_rsaz_512_sqr:
2396 .byte 9,0,0,0
2397 .rva se_handler
2398 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2399.LSEH_info_rsaz_512_mul:
2400 .byte 9,0,0,0
2401 .rva se_handler
2402 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2403.LSEH_info_rsaz_512_mul_gather4:
2404 .byte 9,0,0,0
2405 .rva se_handler
2406 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2407.LSEH_info_rsaz_512_mul_scatter4:
2408 .byte 9,0,0,0
2409 .rva se_handler
2410 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2411.LSEH_info_rsaz_512_mul_by_one:
2412 .byte 9,0,0,0
2413 .rva se_handler
2414 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2415.LSEH_info_rsaz_512_gather4:
2416 .byte 0x01,0x46,0x16,0x00
2417 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
2418 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
2419 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
2420 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
2421 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
2422 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
2423 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
2424 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
2425 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
2426 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
2427 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
2428___
2429}
2430
2431$code =~ s/\`([^\`]*)\`/eval $1/gem;
2432print $code;
2433close STDOUT or die "error closing STDOUT: $!";