VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1k/crypto/bn/asm/x86_64-mont.pl@90293

Last change on this file since 90293 was 90293, checked in by vboxsync, 4 years ago

openssl-1.1.1k: Applied and adjusted our OpenSSL changes to 1.1.1k. bugref:10072

  • Property svn:executable set to *
File size: 32.4 KB
1#! /usr/bin/env perl
2# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# October 2005.
18#
19# Montgomery multiplication routine for x86_64. While it gives a modest
20# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
21# than twice (>2x) as fast. The most common rsa1024 sign is improved by a
22# respectable 50%. It remains to be seen whether loop unrolling and a
23# dedicated squaring routine can provide further improvement...
24
25# July 2011.
26#
27# Add a dedicated squaring procedure. The performance improvement varies
28# from platform to platform, but on average it's ~5%/15%/25%/33%
29# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
30
31# August 2011.
32#
33# Unroll and modulo-schedule the inner loops in such a manner that they
34# are "fallen through" for input lengths of 8, which is critical for
35# 1024-bit RSA *sign*. The average performance improvement in comparison
36# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
37# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
38
39# June 2013.
40#
41# Optimize the reduction in the squaring procedure and improve 1024+-bit
42# RSA sign performance by 10-16% on Intel Sandy Bridge and later
43# (virtually the same on non-Intel processors).
44
45# August 2013.
46#
47# Add MULX/ADOX/ADCX code path.
48
49$flavour = shift;
50$output = shift;
51if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
52
53$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
54
55$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
56( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
57( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
58die "can't locate x86_64-xlate.pl";
59
60open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
61*STDOUT=*OUT;
62
63if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
64 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
65 $addx = ($1>=2.23);
66}
67
68if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
69 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
70 $addx = ($1>=2.10);
71}
72
73if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
74 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
75 $addx = ($1>=12);
76}
77
78if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
79 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
80 $addx = ($ver>=3.03);
81}
82
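# $addx, when set, selects the MULX/ADCX/ADOX code path (bn_mulx4x_mont
# below).  It needs both an assembler recent enough to encode those
# instructions, probed above, and a CPU that advertises them at run time:
# the generated code tests OPENSSL_ia32cap_P+8 against the 0x80100 mask,
# i.e. BMI2 (MULX) plus ADX (ADCX/ADOX), before branching to the mulx path.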
83# int bn_mul_mont(
84$rp="%rdi"; # BN_ULONG *rp,
85$ap="%rsi"; # const BN_ULONG *ap,
86$bp="%rdx"; # const BN_ULONG *bp,
87$np="%rcx"; # const BN_ULONG *np,
88$n0="%r8"; # const BN_ULONG *n0,
89$num="%r9"; # int num);
90$lo0="%r10";
91$hi0="%r11";
92$hi1="%r13";
93$i="%r14";
94$j="%r15";
95$m0="%rbx";
96$m1="%rbp";
97
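# Reference sketch: bn_mul_mont() computes rp[] = ap[]*bp[]*R^-1 mod np[],
# where R = 2^(64*num) and n0[0] holds the precomputed -np[0]^-1 mod 2^64.
# A minimal big-number model of the result, using Math::BigInt and a
# hypothetical helper name (illustrative only, not used by this module):
#
#   use Math::BigInt;
#   sub mont_mul_ref {
#       my ($a, $b, $n, $num) = @_;          # Math::BigInt values, $num in words
#       my $R    = Math::BigInt->bone->blsft(64 * $num);
#       my $Rinv = $R->copy->bmodinv($n);    # R^-1 mod n
#       return ($a * $b * $Rinv)->bmod($n);
#   }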
98$code=<<___;
99.text
100
101.extern OPENSSL_ia32cap_P
102
103.globl bn_mul_mont
104.type bn_mul_mont,\@function,6
105.align 16
106bn_mul_mont:
107.cfi_startproc
108 mov ${num}d,${num}d
109 mov %rsp,%rax
110.cfi_def_cfa_register %rax
111 test \$3,${num}d
112 jnz .Lmul_enter
113 cmp \$8,${num}d
114 jb .Lmul_enter
115___
116$code.=<<___ if ($addx);
117 mov OPENSSL_ia32cap_P+8(%rip),%r11d
118___
119$code.=<<___;
120 cmp $ap,$bp
121 jne .Lmul4x_enter
122 test \$7,${num}d
123 jz .Lsqr8x_enter
124 jmp .Lmul4x_enter
125
126.align 16
127.Lmul_enter:
128 push %rbx
129.cfi_push %rbx
130 push %rbp
131.cfi_push %rbp
132 push %r12
133.cfi_push %r12
134 push %r13
135.cfi_push %r13
136 push %r14
137.cfi_push %r14
138 push %r15
139.cfi_push %r15
140
141 neg $num
142 mov %rsp,%r11
143 lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))
144 neg $num # restore $num
145 and \$-1024,%r10 # minimize TLB usage
146
147 # An OS-agnostic version of __chkstk.
148 #
149 # Some OSes (Windows) insist on the stack being "wired" to
150 # physical memory in a strictly sequential manner, i.e. if a stack
151 # allocation spans two pages, then a reference to the farther one can
152 # be punished with a SEGV. But page walking can do good even on
153 # other OSes, because it guarantees that a villain thread hits
154 # the guard page before it can do damage to an innocent one...
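	# Roughly, in C-like pseudocode (old_sp/new_sp are just names for the
	# stack pointer before and after the allocation), the walk below does:
	#
	#   for (p = new_sp + ((old_sp - new_sp) & ~4095); p >= new_sp; p -= 4096)
	#       (void)*(volatile long *)p;      /* touch one word per 4K page */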
155 sub %r10,%r11
156 and \$-4096,%r11
157 lea (%r10,%r11),%rsp
158 mov (%rsp),%r11
159 cmp %r10,%rsp
160 ja .Lmul_page_walk
161 jmp .Lmul_page_walk_done
162
163.align 16
164.Lmul_page_walk:
165 lea -4096(%rsp),%rsp
166 mov (%rsp),%r11
167 cmp %r10,%rsp
168 ja .Lmul_page_walk
169.Lmul_page_walk_done:
170
171 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
172.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
173.Lmul_body:
174 mov $bp,%r12 # reassign $bp
175___
176 $bp="%r12";
177$code.=<<___;
178 mov ($n0),$n0 # pull n0[0] value
179 mov ($bp),$m0 # m0=bp[0]
180 mov ($ap),%rax
181
182 xor $i,$i # i=0
183 xor $j,$j # j=0
184
185 mov $n0,$m1
186 mulq $m0 # ap[0]*bp[0]
187 mov %rax,$lo0
188 mov ($np),%rax
189
190 imulq $lo0,$m1 # "tp[0]"*n0
191 mov %rdx,$hi0
192
193 mulq $m1 # np[0]*m1
194 add %rax,$lo0 # discarded
195 mov 8($ap),%rax
196 adc \$0,%rdx
197 mov %rdx,$hi1
198
199 lea 1($j),$j # j++
200 jmp .L1st_enter
201
202.align 16
203.L1st:
204 add %rax,$hi1
205 mov ($ap,$j,8),%rax
206 adc \$0,%rdx
207 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
208 mov $lo0,$hi0
209 adc \$0,%rdx
210 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
211 mov %rdx,$hi1
212
213.L1st_enter:
214 mulq $m0 # ap[j]*bp[0]
215 add %rax,$hi0
216 mov ($np,$j,8),%rax
217 adc \$0,%rdx
218 lea 1($j),$j # j++
219 mov %rdx,$lo0
220
221 mulq $m1 # np[j]*m1
222 cmp $num,$j
223 jne .L1st
224
225 add %rax,$hi1
226 mov ($ap),%rax # ap[0]
227 adc \$0,%rdx
228 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
229 adc \$0,%rdx
230 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
231 mov %rdx,$hi1
232 mov $lo0,$hi0
233
234 xor %rdx,%rdx
235 add $hi0,$hi1
236 adc \$0,%rdx
237 mov $hi1,-8(%rsp,$num,8)
238 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
239
240 lea 1($i),$i # i++
241 jmp .Louter
242.align 16
243.Louter:
244 mov ($bp,$i,8),$m0 # m0=bp[i]
245 xor $j,$j # j=0
246 mov $n0,$m1
247 mov (%rsp),$lo0
248 mulq $m0 # ap[0]*bp[i]
249 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
250 mov ($np),%rax
251 adc \$0,%rdx
252
253 imulq $lo0,$m1 # tp[0]*n0
254 mov %rdx,$hi0
255
256 mulq $m1 # np[0]*m1
257 add %rax,$lo0 # discarded
258 mov 8($ap),%rax
259 adc \$0,%rdx
260 mov 8(%rsp),$lo0 # tp[1]
261 mov %rdx,$hi1
262
263 lea 1($j),$j # j++
264 jmp .Linner_enter
265
266.align 16
267.Linner:
268 add %rax,$hi1
269 mov ($ap,$j,8),%rax
270 adc \$0,%rdx
271 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
272 mov (%rsp,$j,8),$lo0
273 adc \$0,%rdx
274 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
275 mov %rdx,$hi1
276
277.Linner_enter:
278 mulq $m0 # ap[j]*bp[i]
279 add %rax,$hi0
280 mov ($np,$j,8),%rax
281 adc \$0,%rdx
282 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
283 mov %rdx,$hi0
284 adc \$0,$hi0
285 lea 1($j),$j # j++
286
287 mulq $m1 # np[j]*m1
288 cmp $num,$j
289 jne .Linner
290
291 add %rax,$hi1
292 mov ($ap),%rax # ap[0]
293 adc \$0,%rdx
294 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
295 mov (%rsp,$j,8),$lo0
296 adc \$0,%rdx
297 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
298 mov %rdx,$hi1
299
300 xor %rdx,%rdx
301 add $hi0,$hi1
302 adc \$0,%rdx
303 add $lo0,$hi1 # pull upmost overflow bit
304 adc \$0,%rdx
305 mov $hi1,-8(%rsp,$num,8)
306 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
307
308 lea 1($i),$i # i++
309 cmp $num,$i
310 jb .Louter
311
312 xor $i,$i # i=0 and clear CF!
313 mov (%rsp),%rax # tp[0]
314 mov $num,$j # j=num
315
316.align 16
317.Lsub: sbb ($np,$i,8),%rax
318 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
319 mov 8(%rsp,$i,8),%rax # tp[i+1]
320 lea 1($i),$i # i++
321 dec $j # doesn't affect CF!
322 jnz .Lsub
323
324 sbb \$0,%rax # handle upmost overflow bit
325 mov \$-1,%rbx
326 xor %rax,%rbx # not %rax
327 xor $i,$i
328 mov $num,$j # j=num
329
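	# At this point %rax is an all-ones mask if the subtraction above
	# borrowed (tp < np, keep the original tp[]) and zero otherwise (keep
	# the difference already written to rp[]); %rbx holds its complement.
	# The loop below selects rp[i] = (rp[i] & %rbx) | (tp[i] & %rax) word
	# by word, without a data-dependent branch, and zaps tp[] as it goes.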
330.Lcopy: # conditional copy
331 mov ($rp,$i,8),%rcx
332 mov (%rsp,$i,8),%rdx
333 and %rbx,%rcx
334 and %rax,%rdx
335 mov $num,(%rsp,$i,8) # zap temporary vector
336 or %rcx,%rdx
337 mov %rdx,($rp,$i,8) # rp[i]=tp[i]
338 lea 1($i),$i
339 sub \$1,$j
340 jnz .Lcopy
341
342 mov 8(%rsp,$num,8),%rsi # restore %rsp
343.cfi_def_cfa %rsi,8
344 mov \$1,%rax
345 mov -48(%rsi),%r15
346.cfi_restore %r15
347 mov -40(%rsi),%r14
348.cfi_restore %r14
349 mov -32(%rsi),%r13
350.cfi_restore %r13
351 mov -24(%rsi),%r12
352.cfi_restore %r12
353 mov -16(%rsi),%rbp
354.cfi_restore %rbp
355 mov -8(%rsi),%rbx
356.cfi_restore %rbx
357 lea (%rsi),%rsp
358.cfi_def_cfa_register %rsp
359.Lmul_epilogue:
360 ret
361.cfi_endproc
362.size bn_mul_mont,.-bn_mul_mont
363___
364{{{
365my @A=("%r10","%r11");
366my @N=("%r13","%rdi");
367$code.=<<___;
368.type bn_mul4x_mont,\@function,6
369.align 16
370bn_mul4x_mont:
371.cfi_startproc
372 mov ${num}d,${num}d
373 mov %rsp,%rax
374.cfi_def_cfa_register %rax
375.Lmul4x_enter:
376___
377$code.=<<___ if ($addx);
378 and \$0x80100,%r11d
379 cmp \$0x80100,%r11d
380 je .Lmulx4x_enter
381___
382$code.=<<___;
383 push %rbx
384.cfi_push %rbx
385 push %rbp
386.cfi_push %rbp
387 push %r12
388.cfi_push %r12
389 push %r13
390.cfi_push %r13
391 push %r14
392.cfi_push %r14
393 push %r15
394.cfi_push %r15
395
396 neg $num
397 mov %rsp,%r11
398 lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
399 neg $num # restore
400 and \$-1024,%r10 # minimize TLB usage
401
402 sub %r10,%r11
403 and \$-4096,%r11
404 lea (%r10,%r11),%rsp
405 mov (%rsp),%r11
406 cmp %r10,%rsp
407 ja .Lmul4x_page_walk
408 jmp .Lmul4x_page_walk_done
409
410.Lmul4x_page_walk:
411 lea -4096(%rsp),%rsp
412 mov (%rsp),%r11
413 cmp %r10,%rsp
414 ja .Lmul4x_page_walk
415.Lmul4x_page_walk_done:
416
417 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
418.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
419.Lmul4x_body:
420 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
421 mov %rdx,%r12 # reassign $bp
422___
423 $bp="%r12";
424$code.=<<___;
425 mov ($n0),$n0 # pull n0[0] value
426 mov ($bp),$m0 # m0=bp[0]
427 mov ($ap),%rax
428
429 xor $i,$i # i=0
430 xor $j,$j # j=0
431
432 mov $n0,$m1
433 mulq $m0 # ap[0]*bp[0]
434 mov %rax,$A[0]
435 mov ($np),%rax
436
437 imulq $A[0],$m1 # "tp[0]"*n0
438 mov %rdx,$A[1]
439
440 mulq $m1 # np[0]*m1
441 add %rax,$A[0] # discarded
442 mov 8($ap),%rax
443 adc \$0,%rdx
444 mov %rdx,$N[1]
445
446 mulq $m0
447 add %rax,$A[1]
448 mov 8($np),%rax
449 adc \$0,%rdx
450 mov %rdx,$A[0]
451
452 mulq $m1
453 add %rax,$N[1]
454 mov 16($ap),%rax
455 adc \$0,%rdx
456 add $A[1],$N[1]
457 lea 4($j),$j # j++
458 adc \$0,%rdx
459 mov $N[1],(%rsp)
460 mov %rdx,$N[0]
461 jmp .L1st4x
462.align 16
463.L1st4x:
464 mulq $m0 # ap[j]*bp[0]
465 add %rax,$A[0]
466 mov -16($np,$j,8),%rax
467 adc \$0,%rdx
468 mov %rdx,$A[1]
469
470 mulq $m1 # np[j]*m1
471 add %rax,$N[0]
472 mov -8($ap,$j,8),%rax
473 adc \$0,%rdx
474 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
475 adc \$0,%rdx
476 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
477 mov %rdx,$N[1]
478
479 mulq $m0 # ap[j]*bp[0]
480 add %rax,$A[1]
481 mov -8($np,$j,8),%rax
482 adc \$0,%rdx
483 mov %rdx,$A[0]
484
485 mulq $m1 # np[j]*m1
486 add %rax,$N[1]
487 mov ($ap,$j,8),%rax
488 adc \$0,%rdx
489 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
490 adc \$0,%rdx
491 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
492 mov %rdx,$N[0]
493
494 mulq $m0 # ap[j]*bp[0]
495 add %rax,$A[0]
496 mov ($np,$j,8),%rax
497 adc \$0,%rdx
498 mov %rdx,$A[1]
499
500 mulq $m1 # np[j]*m1
501 add %rax,$N[0]
502 mov 8($ap,$j,8),%rax
503 adc \$0,%rdx
504 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
505 adc \$0,%rdx
506 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
507 mov %rdx,$N[1]
508
509 mulq $m0 # ap[j]*bp[0]
510 add %rax,$A[1]
511 mov 8($np,$j,8),%rax
512 adc \$0,%rdx
513 lea 4($j),$j # j++
514 mov %rdx,$A[0]
515
516 mulq $m1 # np[j]*m1
517 add %rax,$N[1]
518 mov -16($ap,$j,8),%rax
519 adc \$0,%rdx
520 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
521 adc \$0,%rdx
522 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
523 mov %rdx,$N[0]
524 cmp $num,$j
525 jb .L1st4x
526
527 mulq $m0 # ap[j]*bp[0]
528 add %rax,$A[0]
529 mov -16($np,$j,8),%rax
530 adc \$0,%rdx
531 mov %rdx,$A[1]
532
533 mulq $m1 # np[j]*m1
534 add %rax,$N[0]
535 mov -8($ap,$j,8),%rax
536 adc \$0,%rdx
537 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
538 adc \$0,%rdx
539 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
540 mov %rdx,$N[1]
541
542 mulq $m0 # ap[j]*bp[0]
543 add %rax,$A[1]
544 mov -8($np,$j,8),%rax
545 adc \$0,%rdx
546 mov %rdx,$A[0]
547
548 mulq $m1 # np[j]*m1
549 add %rax,$N[1]
550 mov ($ap),%rax # ap[0]
551 adc \$0,%rdx
552 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
553 adc \$0,%rdx
554 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
555 mov %rdx,$N[0]
556
557 xor $N[1],$N[1]
558 add $A[0],$N[0]
559 adc \$0,$N[1]
560 mov $N[0],-8(%rsp,$j,8)
561 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
562
563 lea 1($i),$i # i++
564.align 4
565.Louter4x:
566 mov ($bp,$i,8),$m0 # m0=bp[i]
567 xor $j,$j # j=0
568 mov (%rsp),$A[0]
569 mov $n0,$m1
570 mulq $m0 # ap[0]*bp[i]
571 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
572 mov ($np),%rax
573 adc \$0,%rdx
574
575 imulq $A[0],$m1 # tp[0]*n0
576 mov %rdx,$A[1]
577
578 mulq $m1 # np[0]*m1
579 add %rax,$A[0] # "$N[0]", discarded
580 mov 8($ap),%rax
581 adc \$0,%rdx
582 mov %rdx,$N[1]
583
584 mulq $m0 # ap[j]*bp[i]
585 add %rax,$A[1]
586 mov 8($np),%rax
587 adc \$0,%rdx
588 add 8(%rsp),$A[1] # +tp[1]
589 adc \$0,%rdx
590 mov %rdx,$A[0]
591
592 mulq $m1 # np[j]*m1
593 add %rax,$N[1]
594 mov 16($ap),%rax
595 adc \$0,%rdx
596 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
597 lea 4($j),$j # j+=2
598 adc \$0,%rdx
599 mov $N[1],(%rsp) # tp[j-1]
600 mov %rdx,$N[0]
601 jmp .Linner4x
602.align 16
603.Linner4x:
604 mulq $m0 # ap[j]*bp[i]
605 add %rax,$A[0]
606 mov -16($np,$j,8),%rax
607 adc \$0,%rdx
608 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
609 adc \$0,%rdx
610 mov %rdx,$A[1]
611
612 mulq $m1 # np[j]*m1
613 add %rax,$N[0]
614 mov -8($ap,$j,8),%rax
615 adc \$0,%rdx
616 add $A[0],$N[0]
617 adc \$0,%rdx
618 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
619 mov %rdx,$N[1]
620
621 mulq $m0 # ap[j]*bp[i]
622 add %rax,$A[1]
623 mov -8($np,$j,8),%rax
624 adc \$0,%rdx
625 add -8(%rsp,$j,8),$A[1]
626 adc \$0,%rdx
627 mov %rdx,$A[0]
628
629 mulq $m1 # np[j]*m1
630 add %rax,$N[1]
631 mov ($ap,$j,8),%rax
632 adc \$0,%rdx
633 add $A[1],$N[1]
634 adc \$0,%rdx
635 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
636 mov %rdx,$N[0]
637
638 mulq $m0 # ap[j]*bp[i]
639 add %rax,$A[0]
640 mov ($np,$j,8),%rax
641 adc \$0,%rdx
642 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
643 adc \$0,%rdx
644 mov %rdx,$A[1]
645
646 mulq $m1 # np[j]*m1
647 add %rax,$N[0]
648 mov 8($ap,$j,8),%rax
649 adc \$0,%rdx
650 add $A[0],$N[0]
651 adc \$0,%rdx
652 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
653 mov %rdx,$N[1]
654
655 mulq $m0 # ap[j]*bp[i]
656 add %rax,$A[1]
657 mov 8($np,$j,8),%rax
658 adc \$0,%rdx
659 add 8(%rsp,$j,8),$A[1]
660 adc \$0,%rdx
661 lea 4($j),$j # j++
662 mov %rdx,$A[0]
663
664 mulq $m1 # np[j]*m1
665 add %rax,$N[1]
666 mov -16($ap,$j,8),%rax
667 adc \$0,%rdx
668 add $A[1],$N[1]
669 adc \$0,%rdx
670 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
671 mov %rdx,$N[0]
672 cmp $num,$j
673 jb .Linner4x
674
675 mulq $m0 # ap[j]*bp[i]
676 add %rax,$A[0]
677 mov -16($np,$j,8),%rax
678 adc \$0,%rdx
679 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
680 adc \$0,%rdx
681 mov %rdx,$A[1]
682
683 mulq $m1 # np[j]*m1
684 add %rax,$N[0]
685 mov -8($ap,$j,8),%rax
686 adc \$0,%rdx
687 add $A[0],$N[0]
688 adc \$0,%rdx
689 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
690 mov %rdx,$N[1]
691
692 mulq $m0 # ap[j]*bp[i]
693 add %rax,$A[1]
694 mov -8($np,$j,8),%rax
695 adc \$0,%rdx
696 add -8(%rsp,$j,8),$A[1]
697 adc \$0,%rdx
698 lea 1($i),$i # i++
699 mov %rdx,$A[0]
700
701 mulq $m1 # np[j]*m1
702 add %rax,$N[1]
703 mov ($ap),%rax # ap[0]
704 adc \$0,%rdx
705 add $A[1],$N[1]
706 adc \$0,%rdx
707 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
708 mov %rdx,$N[0]
709
710 xor $N[1],$N[1]
711 add $A[0],$N[0]
712 adc \$0,$N[1]
713 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
714 adc \$0,$N[1]
715 mov $N[0],-8(%rsp,$j,8)
716 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
717
718 cmp $num,$i
719 jb .Louter4x
720___
721{
722my @ri=("%rax","%rdx",$m0,$m1);
723$code.=<<___;
724 mov 16(%rsp,$num,8),$rp # restore $rp
725 lea -4($num),$j
726 mov 0(%rsp),@ri[0] # tp[0]
727 mov 8(%rsp),@ri[1] # tp[1]
728 shr \$2,$j # j=num/4-1
729 lea (%rsp),$ap # borrow ap for tp
730 xor $i,$i # i=0 and clear CF!
731
732 sub 0($np),@ri[0]
733 mov 16($ap),@ri[2] # tp[2]
734 mov 24($ap),@ri[3] # tp[3]
735 sbb 8($np),@ri[1]
736
737.Lsub4x:
738 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
739 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
740 sbb 16($np,$i,8),@ri[2]
741 mov 32($ap,$i,8),@ri[0] # tp[i+1]
742 mov 40($ap,$i,8),@ri[1]
743 sbb 24($np,$i,8),@ri[3]
744 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
745 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
746 sbb 32($np,$i,8),@ri[0]
747 mov 48($ap,$i,8),@ri[2]
748 mov 56($ap,$i,8),@ri[3]
749 sbb 40($np,$i,8),@ri[1]
750 lea 4($i),$i # i++
751 dec $j # doesn't affect CF!
752 jnz .Lsub4x
753
754 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
755 mov 32($ap,$i,8),@ri[0] # load overflow bit
756 sbb 16($np,$i,8),@ri[2]
757 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
758 sbb 24($np,$i,8),@ri[3]
759 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
760
761 sbb \$0,@ri[0] # handle upmost overflow bit
762 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
763 pxor %xmm0,%xmm0
764 movq @ri[0],%xmm4
765 pcmpeqd %xmm5,%xmm5
766 pshufd \$0,%xmm4,%xmm4
767 mov $num,$j
768 pxor %xmm4,%xmm5
769 shr \$2,$j # j=num/4
770 xor %eax,%eax # i=0
771
772 jmp .Lcopy4x
773.align 16
774.Lcopy4x: # conditional copy
775 movdqa (%rsp,%rax),%xmm1
776 movdqu ($rp,%rax),%xmm2
777 pand %xmm4,%xmm1
778 pand %xmm5,%xmm2
779 movdqa 16(%rsp,%rax),%xmm3
780 movdqa %xmm0,(%rsp,%rax)
781 por %xmm2,%xmm1
782 movdqu 16($rp,%rax),%xmm2
783 movdqu %xmm1,($rp,%rax)
784 pand %xmm4,%xmm3
785 pand %xmm5,%xmm2
786 movdqa %xmm0,16(%rsp,%rax)
787 por %xmm2,%xmm3
788 movdqu %xmm3,16($rp,%rax)
789 lea 32(%rax),%rax
790 dec $j
791 jnz .Lcopy4x
792___
793}
794$code.=<<___;
795 mov 8(%rsp,$num,8),%rsi # restore %rsp
796.cfi_def_cfa %rsi, 8
797 mov \$1,%rax
798 mov -48(%rsi),%r15
799.cfi_restore %r15
800 mov -40(%rsi),%r14
801.cfi_restore %r14
802 mov -32(%rsi),%r13
803.cfi_restore %r13
804 mov -24(%rsi),%r12
805.cfi_restore %r12
806 mov -16(%rsi),%rbp
807.cfi_restore %rbp
808 mov -8(%rsi),%rbx
809.cfi_restore %rbx
810 lea (%rsi),%rsp
811.cfi_def_cfa_register %rsp
812.Lmul4x_epilogue:
813 ret
814.cfi_endproc
815.size bn_mul4x_mont,.-bn_mul4x_mont
816___
817}}}
818
819{{{
820######################################################################
821# void bn_sqr8x_mont(
822my $rptr="%rdi"; # const BN_ULONG *rptr,
823my $aptr="%rsi"; # const BN_ULONG *aptr,
824my $bptr="%rdx"; # not used
825my $nptr="%rcx"; # const BN_ULONG *nptr,
826my $n0 ="%r8"; # const BN_ULONG *n0);
827my $num ="%r9"; # int num, has to be divisible by 8
828
829my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
830my @A0=("%r10","%r11");
831my @A1=("%r12","%r13");
832my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
833
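# bn_sqr8x_mont() computes rp[] = ap[]^2 * R^-1 mod np[], with the same
# conventions as bn_mul_mont above.  A dedicated squaring routine pays off
# because every cross product a_i*a_j with i != j occurs twice in the square,
# so roughly half of the word multiplications can be computed once and
# doubled, e.g. for two words:
#
#   (a1*2^64 + a0)^2 = a1^2*2^128 + 2*a1*a0*2^64 + a0^2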
834$code.=<<___ if ($addx);
835.extern bn_sqrx8x_internal # see x86_64-mont5 module
836___
837$code.=<<___;
838.extern bn_sqr8x_internal # see x86_64-mont5 module
839
840.type bn_sqr8x_mont,\@function,6
841.align 32
842bn_sqr8x_mont:
843.cfi_startproc
844 mov %rsp,%rax
845.cfi_def_cfa_register %rax
846.Lsqr8x_enter:
847 push %rbx
848.cfi_push %rbx
849 push %rbp
850.cfi_push %rbp
851 push %r12
852.cfi_push %r12
853 push %r13
854.cfi_push %r13
855 push %r14
856.cfi_push %r14
857 push %r15
858.cfi_push %r15
859.Lsqr8x_prologue:
860
861 mov ${num}d,%r10d
862 shl \$3,${num}d # convert $num to bytes
863 shl \$3+2,%r10 # 4*$num
864 neg $num
865
866 ##############################################################
867 # ensure that the stack frame doesn't alias with $aptr modulo
868 # 4096. This is done to allow the memory disambiguation logic
869 # to do its job.
870 #
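	# (Illustration: if the temporary frame and $aptr were an exact
	# multiple of 4096 bytes apart, stores to the frame would share their
	# low 12 address bits with the loads from $aptr that the squaring code
	# issues ahead of time, and the core may falsely assume they overlap,
	# stalling those loads behind the stores.)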
871 lea -64(%rsp,$num,2),%r11
872 mov %rsp,%rbp
873 mov ($n0),$n0 # *n0
874 sub $aptr,%r11
875 and \$4095,%r11
876 cmp %r11,%r10
877 jb .Lsqr8x_sp_alt
878 sub %r11,%rbp # align with $aptr
879 lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
880 jmp .Lsqr8x_sp_done
881
882.align 32
883.Lsqr8x_sp_alt:
884 lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
885 lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
886 sub %r10,%r11
887 mov \$0,%r10
888 cmovc %r10,%r11
889 sub %r11,%rbp
890.Lsqr8x_sp_done:
891 and \$-64,%rbp
892 mov %rsp,%r11
893 sub %rbp,%r11
894 and \$-4096,%r11
895 lea (%rbp,%r11),%rsp
896 mov (%rsp),%r10
897 cmp %rbp,%rsp
898 ja .Lsqr8x_page_walk
899 jmp .Lsqr8x_page_walk_done
900
901.align 16
902.Lsqr8x_page_walk:
903 lea -4096(%rsp),%rsp
904 mov (%rsp),%r10
905 cmp %rbp,%rsp
906 ja .Lsqr8x_page_walk
907.Lsqr8x_page_walk_done:
908
909 mov $num,%r10
910 neg $num
911
912 mov $n0, 32(%rsp)
913 mov %rax, 40(%rsp) # save original %rsp
914.cfi_cfa_expression %rsp+40,deref,+8
915.Lsqr8x_body:
916
917 movq $nptr, %xmm2 # save pointer to modulus
918 pxor %xmm0,%xmm0
919 movq $rptr,%xmm1 # save $rptr
920 movq %r10, %xmm3 # -$num
921___
922$code.=<<___ if ($addx);
923 mov OPENSSL_ia32cap_P+8(%rip),%eax
924 and \$0x80100,%eax
925 cmp \$0x80100,%eax
926 jne .Lsqr8x_nox
927
928 call bn_sqrx8x_internal # see x86_64-mont5 module
929 # %rax top-most carry
930 # %rbp nptr
931 # %rcx -8*num
932 # %r8 end of tp[2*num]
933 lea (%r8,%rcx),%rbx
934 mov %rcx,$num
935 mov %rcx,%rdx
936 movq %xmm1,$rptr
937 sar \$3+2,%rcx # %cf=0
938 jmp .Lsqr8x_sub
939
940.align 32
941.Lsqr8x_nox:
942___
943$code.=<<___;
944 call bn_sqr8x_internal # see x86_64-mont5 module
945 # %rax top-most carry
946 # %rbp nptr
947 # %r8 -8*num
948 # %rdi end of tp[2*num]
949 lea (%rdi,$num),%rbx
950 mov $num,%rcx
951 mov $num,%rdx
952 movq %xmm1,$rptr
953 sar \$3+2,%rcx # %cf=0
954 jmp .Lsqr8x_sub
955
956.align 32
957.Lsqr8x_sub:
958 mov 8*0(%rbx),%r12
959 mov 8*1(%rbx),%r13
960 mov 8*2(%rbx),%r14
961 mov 8*3(%rbx),%r15
962 lea 8*4(%rbx),%rbx
963 sbb 8*0(%rbp),%r12
964 sbb 8*1(%rbp),%r13
965 sbb 8*2(%rbp),%r14
966 sbb 8*3(%rbp),%r15
967 lea 8*4(%rbp),%rbp
968 mov %r12,8*0($rptr)
969 mov %r13,8*1($rptr)
970 mov %r14,8*2($rptr)
971 mov %r15,8*3($rptr)
972 lea 8*4($rptr),$rptr
973 inc %rcx # preserves %cf
974 jnz .Lsqr8x_sub
975
976 sbb \$0,%rax # top-most carry
977 lea (%rbx,$num),%rbx # rewind
978 lea ($rptr,$num),$rptr # rewind
979
980 movq %rax,%xmm1
981 pxor %xmm0,%xmm0
982 pshufd \$0,%xmm1,%xmm1
983 mov 40(%rsp),%rsi # restore %rsp
984.cfi_def_cfa %rsi,8
985 jmp .Lsqr8x_cond_copy
986
987.align 32
988.Lsqr8x_cond_copy:
989 movdqa 16*0(%rbx),%xmm2
990 movdqa 16*1(%rbx),%xmm3
991 lea 16*2(%rbx),%rbx
992 movdqu 16*0($rptr),%xmm4
993 movdqu 16*1($rptr),%xmm5
994 lea 16*2($rptr),$rptr
995 movdqa %xmm0,-16*2(%rbx) # zero tp
996 movdqa %xmm0,-16*1(%rbx)
997 movdqa %xmm0,-16*2(%rbx,%rdx)
998 movdqa %xmm0,-16*1(%rbx,%rdx)
999 pcmpeqd %xmm1,%xmm0
1000 pand %xmm1,%xmm2
1001 pand %xmm1,%xmm3
1002 pand %xmm0,%xmm4
1003 pand %xmm0,%xmm5
1004 pxor %xmm0,%xmm0
1005 por %xmm2,%xmm4
1006 por %xmm3,%xmm5
1007 movdqu %xmm4,-16*2($rptr)
1008 movdqu %xmm5,-16*1($rptr)
1009 add \$32,$num
1010 jnz .Lsqr8x_cond_copy
1011
1012 mov \$1,%rax
1013 mov -48(%rsi),%r15
1014.cfi_restore %r15
1015 mov -40(%rsi),%r14
1016.cfi_restore %r14
1017 mov -32(%rsi),%r13
1018.cfi_restore %r13
1019 mov -24(%rsi),%r12
1020.cfi_restore %r12
1021 mov -16(%rsi),%rbp
1022.cfi_restore %rbp
1023 mov -8(%rsi),%rbx
1024.cfi_restore %rbx
1025 lea (%rsi),%rsp
1026.cfi_def_cfa_register %rsp
1027.Lsqr8x_epilogue:
1028 ret
1029.cfi_endproc
1030.size bn_sqr8x_mont,.-bn_sqr8x_mont
1031___
1032}}}
1033
1034
1035if ($addx) {{{
1036my $bp="%rdx"; # original value
1037
1038$code.=<<___;
1039.type bn_mulx4x_mont,\@function,6
1040.align 32
1041bn_mulx4x_mont:
1042.cfi_startproc
1043 mov %rsp,%rax
1044.cfi_def_cfa_register %rax
1045.Lmulx4x_enter:
1046 push %rbx
1047.cfi_push %rbx
1048 push %rbp
1049.cfi_push %rbp
1050 push %r12
1051.cfi_push %r12
1052 push %r13
1053.cfi_push %r13
1054 push %r14
1055.cfi_push %r14
1056 push %r15
1057.cfi_push %r15
1058.Lmulx4x_prologue:
1059
1060 shl \$3,${num}d # convert $num to bytes
1061 xor %r10,%r10
1062 sub $num,%r10 # -$num
1063 mov ($n0),$n0 # *n0
1064 lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)
1065 and \$-128,%rbp
1066 mov %rsp,%r11
1067 sub %rbp,%r11
1068 and \$-4096,%r11
1069 lea (%rbp,%r11),%rsp
1070 mov (%rsp),%r10
1071 cmp %rbp,%rsp
1072 ja .Lmulx4x_page_walk
1073 jmp .Lmulx4x_page_walk_done
1074
1075.align 16
1076.Lmulx4x_page_walk:
1077 lea -4096(%rsp),%rsp
1078 mov (%rsp),%r10
1079 cmp %rbp,%rsp
1080 ja .Lmulx4x_page_walk
1081.Lmulx4x_page_walk_done:
1082
1083 lea ($bp,$num),%r10
1084 ##############################################################
1085 # Stack layout
1086 # +0 num
1087 # +8 off-loaded &b[i]
1088 # +16 end of b[num]
1089 # +24 saved n0
1090 # +32 saved rp
1091 # +40 saved %rsp
1092 # +48 inner counter
1093 # +56
1094 # +64 tmp[num+1]
1095 #
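	# (For example, the reduction steps below reload n0 via "imulq 24(%rsp)"
	#  and the inner-loop counter via "mov 48(%rsp)", per the layout above.)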
1096 mov $num,0(%rsp) # save $num
1097 shr \$5,$num
1098 mov %r10,16(%rsp) # end of b[num]
1099 sub \$1,$num
1100 mov $n0, 24(%rsp) # save *n0
1101 mov $rp, 32(%rsp) # save $rp
1102 mov %rax,40(%rsp) # save original %rsp
1103.cfi_cfa_expression %rsp+40,deref,+8
1104 mov $num,48(%rsp) # inner counter
1105 jmp .Lmulx4x_body
1106
1107.align 32
1108.Lmulx4x_body:
1109___
1110my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
1111 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
1112my $rptr=$bptr;
1113$code.=<<___;
1114 lea 8($bp),$bptr
1115 mov ($bp),%rdx # b[0], $bp==%rdx actually
1116 lea 64+32(%rsp),$tptr
1117 mov %rdx,$bi
1118
1119 mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
1120 mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
1121 add %rax,%r11
1122 mov $bptr,8(%rsp) # off-load &b[i]
1123 mulx 2*8($aptr),%r12,%r13 # ...
1124 adc %r14,%r12
1125 adc \$0,%r13
1126
1127 mov $mi,$bptr # borrow $bptr
1128 imulq 24(%rsp),$mi # "t[0]"*n0
1129 xor $zero,$zero # cf=0, of=0
1130
1131 mulx 3*8($aptr),%rax,%r14
1132 mov $mi,%rdx
1133 lea 4*8($aptr),$aptr
1134 adcx %rax,%r13
1135 adcx $zero,%r14 # cf=0
1136
1137 mulx 0*8($nptr),%rax,%r10
1138 adcx %rax,$bptr # discarded
1139 adox %r11,%r10
1140 mulx 1*8($nptr),%rax,%r11
1141 adcx %rax,%r10
1142 adox %r12,%r11
1143 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
1144 mov 48(%rsp),$bptr # counter value
1145 mov %r10,-4*8($tptr)
1146 adcx %rax,%r11
1147 adox %r13,%r12
1148 mulx 3*8($nptr),%rax,%r15
1149 mov $bi,%rdx
1150 mov %r11,-3*8($tptr)
1151 adcx %rax,%r12
1152 adox $zero,%r15 # of=0
1153 lea 4*8($nptr),$nptr
1154 mov %r12,-2*8($tptr)
1155
1156 jmp .Lmulx4x_1st
1157
1158.align 32
1159.Lmulx4x_1st:
1160 adcx $zero,%r15 # cf=0, modulo-scheduled
1161 mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
1162 adcx %r14,%r10
1163 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
1164 adcx %rax,%r11
1165 mulx 2*8($aptr),%r12,%rax # ...
1166 adcx %r14,%r12
1167 mulx 3*8($aptr),%r13,%r14
1168 .byte 0x67,0x67
1169 mov $mi,%rdx
1170 adcx %rax,%r13
1171 adcx $zero,%r14 # cf=0
1172 lea 4*8($aptr),$aptr
1173 lea 4*8($tptr),$tptr
1174
1175 adox %r15,%r10
1176 mulx 0*8($nptr),%rax,%r15
1177 adcx %rax,%r10
1178 adox %r15,%r11
1179 mulx 1*8($nptr),%rax,%r15
1180 adcx %rax,%r11
1181 adox %r15,%r12
1182 mulx 2*8($nptr),%rax,%r15
1183 mov %r10,-5*8($tptr)
1184 adcx %rax,%r12
1185 mov %r11,-4*8($tptr)
1186 adox %r15,%r13
1187 mulx 3*8($nptr),%rax,%r15
1188 mov $bi,%rdx
1189 mov %r12,-3*8($tptr)
1190 adcx %rax,%r13
1191 adox $zero,%r15
1192 lea 4*8($nptr),$nptr
1193 mov %r13,-2*8($tptr)
1194
1195 dec $bptr # of=0, pass cf
1196 jnz .Lmulx4x_1st
1197
1198 mov 0(%rsp),$num # load num
1199 mov 8(%rsp),$bptr # re-load &b[i]
1200 adc $zero,%r15 # modulo-scheduled
1201 add %r15,%r14
1202 sbb %r15,%r15 # top-most carry
1203 mov %r14,-1*8($tptr)
1204 jmp .Lmulx4x_outer
1205
1206.align 32
1207.Lmulx4x_outer:
1208 mov ($bptr),%rdx # b[i]
1209 lea 8($bptr),$bptr # b++
1210 sub $num,$aptr # rewind $aptr
1211 mov %r15,($tptr) # save top-most carry
1212 lea 64+4*8(%rsp),$tptr
1213 sub $num,$nptr # rewind $nptr
1214
1215 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
1216 xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
1217 mov %rdx,$bi
1218 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
1219 adox -4*8($tptr),$mi
1220 adcx %r14,%r11
1221 mulx 2*8($aptr),%r15,%r13 # ...
1222 adox -3*8($tptr),%r11
1223 adcx %r15,%r12
1224 adox -2*8($tptr),%r12
1225 adcx $zero,%r13
1226 adox $zero,%r13
1227
1228 mov $bptr,8(%rsp) # off-load &b[i]
1229 mov $mi,%r15
1230 imulq 24(%rsp),$mi # "t[0]"*n0
1231 xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
1232
1233 mulx 3*8($aptr),%rax,%r14
1234 mov $mi,%rdx
1235 adcx %rax,%r13
1236 adox -1*8($tptr),%r13
1237 adcx $zero,%r14
1238 lea 4*8($aptr),$aptr
1239 adox $zero,%r14
1240
1241 mulx 0*8($nptr),%rax,%r10
1242 adcx %rax,%r15 # discarded
1243 adox %r11,%r10
1244 mulx 1*8($nptr),%rax,%r11
1245 adcx %rax,%r10
1246 adox %r12,%r11
1247 mulx 2*8($nptr),%rax,%r12
1248 mov %r10,-4*8($tptr)
1249 adcx %rax,%r11
1250 adox %r13,%r12
1251 mulx 3*8($nptr),%rax,%r15
1252 mov $bi,%rdx
1253 mov %r11,-3*8($tptr)
1254 lea 4*8($nptr),$nptr
1255 adcx %rax,%r12
1256 adox $zero,%r15 # of=0
1257 mov 48(%rsp),$bptr # counter value
1258 mov %r12,-2*8($tptr)
1259
1260 jmp .Lmulx4x_inner
1261
1262.align 32
1263.Lmulx4x_inner:
1264 mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
1265 adcx $zero,%r15 # cf=0, modulo-scheduled
1266 adox %r14,%r10
1267 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
1268 adcx 0*8($tptr),%r10
1269 adox %rax,%r11
1270 mulx 2*8($aptr),%r12,%rax # ...
1271 adcx 1*8($tptr),%r11
1272 adox %r14,%r12
1273 mulx 3*8($aptr),%r13,%r14
1274 mov $mi,%rdx
1275 adcx 2*8($tptr),%r12
1276 adox %rax,%r13
1277 adcx 3*8($tptr),%r13
1278 adox $zero,%r14 # of=0
1279 lea 4*8($aptr),$aptr
1280 lea 4*8($tptr),$tptr
1281 adcx $zero,%r14 # cf=0
1282
1283 adox %r15,%r10
1284 mulx 0*8($nptr),%rax,%r15
1285 adcx %rax,%r10
1286 adox %r15,%r11
1287 mulx 1*8($nptr),%rax,%r15
1288 adcx %rax,%r11
1289 adox %r15,%r12
1290 mulx 2*8($nptr),%rax,%r15
1291 mov %r10,-5*8($tptr)
1292 adcx %rax,%r12
1293 adox %r15,%r13
1294 mulx 3*8($nptr),%rax,%r15
1295 mov $bi,%rdx
1296 mov %r11,-4*8($tptr)
1297 mov %r12,-3*8($tptr)
1298 adcx %rax,%r13
1299 adox $zero,%r15
1300 lea 4*8($nptr),$nptr
1301 mov %r13,-2*8($tptr)
1302
1303 dec $bptr # of=0, pass cf
1304 jnz .Lmulx4x_inner
1305
1306 mov 0(%rsp),$num # load num
1307 mov 8(%rsp),$bptr # re-load &b[i]
1308 adc $zero,%r15 # modulo-scheduled
1309 sub 0*8($tptr),$zero # pull top-most carry
1310 adc %r15,%r14
1311 sbb %r15,%r15 # top-most carry
1312 mov %r14,-1*8($tptr)
1313
1314 cmp 16(%rsp),$bptr
1315 jne .Lmulx4x_outer
1316
1317 lea 64(%rsp),$tptr
1318 sub $num,$nptr # rewind $nptr
1319 neg %r15
1320 mov $num,%rdx
1321 shr \$3+2,$num # %cf=0
1322 mov 32(%rsp),$rptr # restore rp
1323 jmp .Lmulx4x_sub
1324
1325.align 32
1326.Lmulx4x_sub:
1327 mov 8*0($tptr),%r11
1328 mov 8*1($tptr),%r12
1329 mov 8*2($tptr),%r13
1330 mov 8*3($tptr),%r14
1331 lea 8*4($tptr),$tptr
1332 sbb 8*0($nptr),%r11
1333 sbb 8*1($nptr),%r12
1334 sbb 8*2($nptr),%r13
1335 sbb 8*3($nptr),%r14
1336 lea 8*4($nptr),$nptr
1337 mov %r11,8*0($rptr)
1338 mov %r12,8*1($rptr)
1339 mov %r13,8*2($rptr)
1340 mov %r14,8*3($rptr)
1341 lea 8*4($rptr),$rptr
1342 dec $num # preserves %cf
1343 jnz .Lmulx4x_sub
1344
1345 sbb \$0,%r15 # top-most carry
1346 lea 64(%rsp),$tptr
1347 sub %rdx,$rptr # rewind
1348
1349 movq %r15,%xmm1
1350 pxor %xmm0,%xmm0
1351 pshufd \$0,%xmm1,%xmm1
1352 mov 40(%rsp),%rsi # restore %rsp
1353.cfi_def_cfa %rsi,8
1354 jmp .Lmulx4x_cond_copy
1355
1356.align 32
1357.Lmulx4x_cond_copy:
1358 movdqa 16*0($tptr),%xmm2
1359 movdqa 16*1($tptr),%xmm3
1360 lea 16*2($tptr),$tptr
1361 movdqu 16*0($rptr),%xmm4
1362 movdqu 16*1($rptr),%xmm5
1363 lea 16*2($rptr),$rptr
1364 movdqa %xmm0,-16*2($tptr) # zero tp
1365 movdqa %xmm0,-16*1($tptr)
1366 pcmpeqd %xmm1,%xmm0
1367 pand %xmm1,%xmm2
1368 pand %xmm1,%xmm3
1369 pand %xmm0,%xmm4
1370 pand %xmm0,%xmm5
1371 pxor %xmm0,%xmm0
1372 por %xmm2,%xmm4
1373 por %xmm3,%xmm5
1374 movdqu %xmm4,-16*2($rptr)
1375 movdqu %xmm5,-16*1($rptr)
1376 sub \$32,%rdx
1377 jnz .Lmulx4x_cond_copy
1378
1379 mov %rdx,($tptr)
1380
1381 mov \$1,%rax
1382 mov -48(%rsi),%r15
1383.cfi_restore %r15
1384 mov -40(%rsi),%r14
1385.cfi_restore %r14
1386 mov -32(%rsi),%r13
1387.cfi_restore %r13
1388 mov -24(%rsi),%r12
1389.cfi_restore %r12
1390 mov -16(%rsi),%rbp
1391.cfi_restore %rbp
1392 mov -8(%rsi),%rbx
1393.cfi_restore %rbx
1394 lea (%rsi),%rsp
1395.cfi_def_cfa_register %rsp
1396.Lmulx4x_epilogue:
1397 ret
1398.cfi_endproc
1399.size bn_mulx4x_mont,.-bn_mulx4x_mont
1400___
1401}}}
1402$code.=<<___;
1403.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1404.align 16
1405___
1406
1407# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1408# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1409if ($win64) {
1410$rec="%rcx";
1411$frame="%rdx";
1412$context="%r8";
1413$disp="%r9";
1414
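# Rough model of the two handlers below: when the faulting RIP lies inside a
# function body, the handler recovers the stack pointer saved by that
# function's prologue (tp[num+1] for the mul variants, 40(%rsp) for the
# sqr/mulx variants), restores the pushed rbx/rbp/r12-r15 from just below it,
# writes Rsp/Rsi/Rdi back into *context, and finally calls RtlVirtualUnwind()
# before returning ExceptionContinueSearch.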
1415$code.=<<___;
1416.extern __imp_RtlVirtualUnwind
1417.type mul_handler,\@abi-omnipotent
1418.align 16
1419mul_handler:
1420 push %rsi
1421 push %rdi
1422 push %rbx
1423 push %rbp
1424 push %r12
1425 push %r13
1426 push %r14
1427 push %r15
1428 pushfq
1429 sub \$64,%rsp
1430
1431 mov 120($context),%rax # pull context->Rax
1432 mov 248($context),%rbx # pull context->Rip
1433
1434 mov 8($disp),%rsi # disp->ImageBase
1435 mov 56($disp),%r11 # disp->HandlerData
1436
1437 mov 0(%r11),%r10d # HandlerData[0]
1438 lea (%rsi,%r10),%r10 # end of prologue label
1439 cmp %r10,%rbx # context->Rip<end of prologue label
1440 jb .Lcommon_seh_tail
1441
1442 mov 152($context),%rax # pull context->Rsp
1443
1444 mov 4(%r11),%r10d # HandlerData[1]
1445 lea (%rsi,%r10),%r10 # epilogue label
1446 cmp %r10,%rbx # context->Rip>=epilogue label
1447 jae .Lcommon_seh_tail
1448
1449 mov 192($context),%r10 # pull $num
1450 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
1451
1452 jmp .Lcommon_pop_regs
1453.size mul_handler,.-mul_handler
1454
1455.type sqr_handler,\@abi-omnipotent
1456.align 16
1457sqr_handler:
1458 push %rsi
1459 push %rdi
1460 push %rbx
1461 push %rbp
1462 push %r12
1463 push %r13
1464 push %r14
1465 push %r15
1466 pushfq
1467 sub \$64,%rsp
1468
1469 mov 120($context),%rax # pull context->Rax
1470 mov 248($context),%rbx # pull context->Rip
1471
1472 mov 8($disp),%rsi # disp->ImageBase
1473 mov 56($disp),%r11 # disp->HandlerData
1474
1475 mov 0(%r11),%r10d # HandlerData[0]
1476 lea (%rsi,%r10),%r10 # end of prologue label
1477 cmp %r10,%rbx # context->Rip<.Lsqr_prologue
1478 jb .Lcommon_seh_tail
1479
1480 mov 4(%r11),%r10d # HandlerData[1]
1481 lea (%rsi,%r10),%r10 # body label
1482 cmp %r10,%rbx # context->Rip<.Lsqr_body
1483 jb .Lcommon_pop_regs
1484
1485 mov 152($context),%rax # pull context->Rsp
1486
1487 mov 8(%r11),%r10d # HandlerData[2]
1488 lea (%rsi,%r10),%r10 # epilogue label
1489 cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
1490 jae .Lcommon_seh_tail
1491
1492 mov 40(%rax),%rax # pull saved stack pointer
1493
1494.Lcommon_pop_regs:
1495 mov -8(%rax),%rbx
1496 mov -16(%rax),%rbp
1497 mov -24(%rax),%r12
1498 mov -32(%rax),%r13
1499 mov -40(%rax),%r14
1500 mov -48(%rax),%r15
1501 mov %rbx,144($context) # restore context->Rbx
1502 mov %rbp,160($context) # restore context->Rbp
1503 mov %r12,216($context) # restore context->R12
1504 mov %r13,224($context) # restore context->R13
1505 mov %r14,232($context) # restore context->R14
1506 mov %r15,240($context) # restore context->R15
1507
1508.Lcommon_seh_tail:
1509 mov 8(%rax),%rdi
1510 mov 16(%rax),%rsi
1511 mov %rax,152($context) # restore context->Rsp
1512 mov %rsi,168($context) # restore context->Rsi
1513 mov %rdi,176($context) # restore context->Rdi
1514
1515 mov 40($disp),%rdi # disp->ContextRecord
1516 mov $context,%rsi # context
1517 mov \$154,%ecx # sizeof(CONTEXT)
1518 .long 0xa548f3fc # cld; rep movsq
1519
1520 mov $disp,%rsi
1521 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1522 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1523 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1524 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1525 mov 40(%rsi),%r10 # disp->ContextRecord
1526 lea 56(%rsi),%r11 # &disp->HandlerData
1527 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1528 mov %r10,32(%rsp) # arg5
1529 mov %r11,40(%rsp) # arg6
1530 mov %r12,48(%rsp) # arg7
1531 mov %rcx,56(%rsp) # arg8, (NULL)
1532 call *__imp_RtlVirtualUnwind(%rip)
1533
1534 mov \$1,%eax # ExceptionContinueSearch
1535 add \$64,%rsp
1536 popfq
1537 pop %r15
1538 pop %r14
1539 pop %r13
1540 pop %r12
1541 pop %rbp
1542 pop %rbx
1543 pop %rdi
1544 pop %rsi
1545 ret
1546.size sqr_handler,.-sqr_handler
1547
1548.section .pdata
1549.align 4
1550 .rva .LSEH_begin_bn_mul_mont
1551 .rva .LSEH_end_bn_mul_mont
1552 .rva .LSEH_info_bn_mul_mont
1553
1554 .rva .LSEH_begin_bn_mul4x_mont
1555 .rva .LSEH_end_bn_mul4x_mont
1556 .rva .LSEH_info_bn_mul4x_mont
1557
1558 .rva .LSEH_begin_bn_sqr8x_mont
1559 .rva .LSEH_end_bn_sqr8x_mont
1560 .rva .LSEH_info_bn_sqr8x_mont
1561___
1562$code.=<<___ if ($addx);
1563 .rva .LSEH_begin_bn_mulx4x_mont
1564 .rva .LSEH_end_bn_mulx4x_mont
1565 .rva .LSEH_info_bn_mulx4x_mont
1566___
1567$code.=<<___;
1568.section .xdata
1569.align 8
1570.LSEH_info_bn_mul_mont:
1571 .byte 9,0,0,0
1572 .rva mul_handler
1573 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
1574.LSEH_info_bn_mul4x_mont:
1575 .byte 9,0,0,0
1576 .rva mul_handler
1577 .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
1578.LSEH_info_bn_sqr8x_mont:
1579 .byte 9,0,0,0
1580 .rva sqr_handler
1581 .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
1582.align 8
1583___
1584$code.=<<___ if ($addx);
1585.LSEH_info_bn_mulx4x_mont:
1586 .byte 9,0,0,0
1587 .rva sqr_handler
1588 .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
1589.align 8
1590___
1591}
1592
1593print $code;
1594close STDOUT or die "error closing STDOUT: $!";