VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/bn/asm/x86_64-mont5.pl@ 69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago

Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified.
bugref:8070: src/libs maintenance

1#! /usr/bin/env perl
2# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# August 2011.
18#
19# Companion to x86_64-mont.pl that optimizes cache-timing attack
20# countermeasures. The subroutines are produced by replacing bp[i]
21# references in their x86_64-mont.pl counterparts with cache-neutral
22# references to powers table computed in BN_mod_exp_mont_consttime.
23# In addition subroutine that scatters elements of the powers table
24# is implemented, so that scatter-/gathering can be tuned without
25# bn_exp.c modifications.
26
27# August 2013.
28#
29# Add MULX/AD*X code paths and additional interfaces to optimize for
30# the branch prediction unit. For input lengths that are multiples of 8
31# the np argument is not just the modulus value, but one interleaved
32# with 0. This is to optimize post-condition...
33
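# ----------------------------------------------------------------------
# Editorial reference sketch (not part of the original OpenSSL file): the
# interleaved layout of the powers table that the comment above refers to.
# With a 5-bit window there are 2^5 = 32 pre-computed powers, and limb $j
# of power $p is stored at qword index 32*$j + $p, so a gather reads one
# full 256-byte row per limb no matter which power is wanted.  The helper
# names and the flat-array representation are illustrative only.
sub power_table_offset {
    my ($p, $j) = @_;               # power index 0..31, limb index 0..num-1
    return (32*$j + $p) * 8;        # byte offset within the table
}
sub scatter_power {                 # the scatter step done before exponentiation
    my ($table, $limbs, $p) = @_;   # $table: flat array of qwords
    $table->[32*$_ + $p] = $limbs->[$_] for 0 .. $#{$limbs};
}
# ----------------------------------------------------------------------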
34$flavour = shift;
35$output = shift;
36if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
37
38$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
39
40$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
41( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
42( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
43die "can't locate x86_64-xlate.pl";
44
45open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
46*STDOUT=*OUT;
47
48if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
49 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
50 $addx = ($1>=2.23);
51}
52
53if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
54 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
55 $addx = ($1>=2.10);
56}
57
58if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
59 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
60 $addx = ($1>=12);
61}
62
63if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
64 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
65 $addx = ($ver>=3.03);
66}
67
68# int bn_mul_mont_gather5(
69$rp="%rdi"; # BN_ULONG *rp,
70$ap="%rsi"; # const BN_ULONG *ap,
71$bp="%rdx"; # const BN_ULONG *bp,
72$np="%rcx"; # const BN_ULONG *np,
73$n0="%r8"; # const BN_ULONG *n0,
74$num="%r9"; # int num,
75 # int idx); # 0 to 2^5-1, "index" in $bp holding
76 # pre-computed powers of a', interlaced
77 # in such a manner that b[0] is $bp[idx],
78 # b[1] is $bp[2^5+idx], etc.
79$lo0="%r10";
80$hi0="%r11";
81$hi1="%r13";
82$i="%r14";
83$j="%r15";
84$m0="%rbx";
85$m1="%rbp";
86
87$code=<<___;
88.text
89
90.extern OPENSSL_ia32cap_P
91
92.globl bn_mul_mont_gather5
93.type bn_mul_mont_gather5,\@function,6
94.align 64
95bn_mul_mont_gather5:
96 mov ${num}d,${num}d
97 mov %rsp,%rax
98 test \$7,${num}d
99 jnz .Lmul_enter
100___
101$code.=<<___ if ($addx);
102 mov OPENSSL_ia32cap_P+8(%rip),%r11d
103___
104$code.=<<___;
105 jmp .Lmul4x_enter
106
107.align 16
108.Lmul_enter:
109 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
110 push %rbx
111 push %rbp
112 push %r12
113 push %r13
114 push %r14
115 push %r15
116
117 neg $num
118 mov %rsp,%r11
119 lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8)
120 neg $num # restore $num
121 and \$-1024,%r10 # minimize TLB usage
122
123 # An OS-agnostic version of __chkstk. (A scalar sketch of the
124 # page walk follows this generated block.)
125 # Some OSes (Windows) insist on the stack being "wired" to
126 # physical memory in a strictly sequential manner, i.e. if a stack
127 # allocation spans two pages, a reference to the farther one can
128 # be punished with a SEGV. But page walking does good even on
129 # other OSes, because it guarantees that a rogue thread hits
130 # the guard page before it can damage an innocent one...
131 sub %r10,%r11
132 and \$-4096,%r11
133 lea (%r10,%r11),%rsp
134 mov (%rsp),%r11
135 cmp %r10,%rsp
136 ja .Lmul_page_walk
137 jmp .Lmul_page_walk_done
138
139.Lmul_page_walk:
140 lea -4096(%rsp),%rsp
141 mov (%rsp),%r11
142 cmp %r10,%rsp
143 ja .Lmul_page_walk
144.Lmul_page_walk_done:
145
146 lea .Linc(%rip),%r10
147 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
148.Lmul_body:
149
150 lea 128($bp),%r12 # reassign $bp (+size optimization)
151___
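# ----------------------------------------------------------------------
# Editorial reference sketch (not part of the original OpenSSL file): the
# scalar logic of the .Lmul_page_walk loop generated above -- touch one
# word in every 4096-byte page between the old and the new stack pointer
# so that a guard page is always hit in order.  Illustrative only.
sub page_walk_probes {
    my ($old_sp, $new_sp) = @_;             # $new_sp < $old_sp
    my $gap = ($old_sp - $new_sp) & ~4095;  # "sub %r10,%r11; and \$-4096,%r11"
    my @probes;
    for (my $p = $new_sp + $gap; ; $p -= 4096) {
        push @probes, $p;                   # "mov (%rsp),%r11" reads this page
        last if $p <= $new_sp;              # "cmp %r10,%rsp; ja .Lmul_page_walk"
    }
    return @probes;
}
# ----------------------------------------------------------------------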
152 $bp="%r12";
153 $STRIDE=2**5*8; # 5 is "window size"
154 $N=$STRIDE/4; # should match cache line size
155$code.=<<___;
156 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
157 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
158 lea 24-112(%rsp,$num,8),%r10 # place the mask after tp[num+3] (+ICache optimization)
159 and \$-16,%r10
160
161 pshufd \$0,%xmm5,%xmm5 # broadcast index
162 movdqa %xmm1,%xmm4
163 movdqa %xmm1,%xmm2
164___
165########################################################################
166# calculate mask by comparing 0..31 to index and save result to stack
167#
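# Editorial reference sketch (not part of the original OpenSSL file): a
# scalar model of the pcmpeqd/pand/por sequence generated below.  Every
# one of the 32 slots in a table row is loaded regardless of the wanted
# index; the masks merely pick the survivor, which is what makes the
# access pattern cache-neutral.  The helper name is illustrative only.
sub gather_row {
    my ($row, $idx) = @_;                            # $row: 32 qwords, $idx: 0..31
    my @mask = map { $_ == $idx ? ~0 : 0 } 0 .. 31;  # pcmpeqd against broadcast idx
    my $limb = 0;
    $limb |= $row->[$_] & $mask[$_] for 0 .. 31;     # pand + por accumulation
    return $limb;
}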
168$code.=<<___;
169 paddd %xmm0,%xmm1
170 pcmpeqd %xmm5,%xmm0 # compare to 1,0
171 .byte 0x67
172 movdqa %xmm4,%xmm3
173___
174for($k=0;$k<$STRIDE/16-4;$k+=4) {
175$code.=<<___;
176 paddd %xmm1,%xmm2
177 pcmpeqd %xmm5,%xmm1 # compare to 3,2
178 movdqa %xmm0,`16*($k+0)+112`(%r10)
179 movdqa %xmm4,%xmm0
180
181 paddd %xmm2,%xmm3
182 pcmpeqd %xmm5,%xmm2 # compare to 5,4
183 movdqa %xmm1,`16*($k+1)+112`(%r10)
184 movdqa %xmm4,%xmm1
185
186 paddd %xmm3,%xmm0
187 pcmpeqd %xmm5,%xmm3 # compare to 7,6
188 movdqa %xmm2,`16*($k+2)+112`(%r10)
189 movdqa %xmm4,%xmm2
190
191 paddd %xmm0,%xmm1
192 pcmpeqd %xmm5,%xmm0
193 movdqa %xmm3,`16*($k+3)+112`(%r10)
194 movdqa %xmm4,%xmm3
195___
196}
197$code.=<<___; # last iteration can be optimized
198 paddd %xmm1,%xmm2
199 pcmpeqd %xmm5,%xmm1
200 movdqa %xmm0,`16*($k+0)+112`(%r10)
201
202 paddd %xmm2,%xmm3
203 .byte 0x67
204 pcmpeqd %xmm5,%xmm2
205 movdqa %xmm1,`16*($k+1)+112`(%r10)
206
207 pcmpeqd %xmm5,%xmm3
208 movdqa %xmm2,`16*($k+2)+112`(%r10)
209 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
210
211 pand `16*($k+1)-128`($bp),%xmm1
212 pand `16*($k+2)-128`($bp),%xmm2
213 movdqa %xmm3,`16*($k+3)+112`(%r10)
214 pand `16*($k+3)-128`($bp),%xmm3
215 por %xmm2,%xmm0
216 por %xmm3,%xmm1
217___
218for($k=0;$k<$STRIDE/16-4;$k+=4) {
219$code.=<<___;
220 movdqa `16*($k+0)-128`($bp),%xmm4
221 movdqa `16*($k+1)-128`($bp),%xmm5
222 movdqa `16*($k+2)-128`($bp),%xmm2
223 pand `16*($k+0)+112`(%r10),%xmm4
224 movdqa `16*($k+3)-128`($bp),%xmm3
225 pand `16*($k+1)+112`(%r10),%xmm5
226 por %xmm4,%xmm0
227 pand `16*($k+2)+112`(%r10),%xmm2
228 por %xmm5,%xmm1
229 pand `16*($k+3)+112`(%r10),%xmm3
230 por %xmm2,%xmm0
231 por %xmm3,%xmm1
232___
233}
234$code.=<<___;
235 por %xmm1,%xmm0
236 pshufd \$0x4e,%xmm0,%xmm1
237 por %xmm1,%xmm0
238 lea $STRIDE($bp),$bp
239 movq %xmm0,$m0 # m0=bp[0]
240
241 mov ($n0),$n0 # pull n0[0] value
242 mov ($ap),%rax
243
244 xor $i,$i # i=0
245 xor $j,$j # j=0
246
247 mov $n0,$m1
248 mulq $m0 # ap[0]*bp[0]
249 mov %rax,$lo0
250 mov ($np),%rax
251
252 imulq $lo0,$m1 # "tp[0]"*n0
253 mov %rdx,$hi0
254
255 mulq $m1 # np[0]*m1
256 add %rax,$lo0 # discarded
257 mov 8($ap),%rax
258 adc \$0,%rdx
259 mov %rdx,$hi1
260
261 lea 1($j),$j # j++
262 jmp .L1st_enter
263
264.align 16
265.L1st:
266 add %rax,$hi1
267 mov ($ap,$j,8),%rax
268 adc \$0,%rdx
269 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
270 mov $lo0,$hi0
271 adc \$0,%rdx
272 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
273 mov %rdx,$hi1
274
275.L1st_enter:
276 mulq $m0 # ap[j]*bp[0]
277 add %rax,$hi0
278 mov ($np,$j,8),%rax
279 adc \$0,%rdx
280 lea 1($j),$j # j++
281 mov %rdx,$lo0
282
283 mulq $m1 # np[j]*m1
284 cmp $num,$j
285 jne .L1st # note that upon exit $j==$num, so
286 # they can be used interchangeably
287
288 add %rax,$hi1
289 adc \$0,%rdx
290 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
291 adc \$0,%rdx
292 mov $hi1,-16(%rsp,$num,8) # tp[num-1]
293 mov %rdx,$hi1
294 mov $lo0,$hi0
295
296 xor %rdx,%rdx
297 add $hi0,$hi1
298 adc \$0,%rdx
299 mov $hi1,-8(%rsp,$num,8)
300 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
301
302 lea 1($i),$i # i++
303 jmp .Louter
304.align 16
305.Louter:
306 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
307 and \$-16,%rdx
308 pxor %xmm4,%xmm4
309 pxor %xmm5,%xmm5
310___
311for($k=0;$k<$STRIDE/16;$k+=4) {
312$code.=<<___;
313 movdqa `16*($k+0)-128`($bp),%xmm0
314 movdqa `16*($k+1)-128`($bp),%xmm1
315 movdqa `16*($k+2)-128`($bp),%xmm2
316 movdqa `16*($k+3)-128`($bp),%xmm3
317 pand `16*($k+0)-128`(%rdx),%xmm0
318 pand `16*($k+1)-128`(%rdx),%xmm1
319 por %xmm0,%xmm4
320 pand `16*($k+2)-128`(%rdx),%xmm2
321 por %xmm1,%xmm5
322 pand `16*($k+3)-128`(%rdx),%xmm3
323 por %xmm2,%xmm4
324 por %xmm3,%xmm5
325___
326}
327$code.=<<___;
328 por %xmm5,%xmm4
329 pshufd \$0x4e,%xmm4,%xmm0
330 por %xmm4,%xmm0
331 lea $STRIDE($bp),$bp
332
333 mov ($ap),%rax # ap[0]
334 movq %xmm0,$m0 # m0=bp[i]
335
336 xor $j,$j # j=0
337 mov $n0,$m1
338 mov (%rsp),$lo0
339
340 mulq $m0 # ap[0]*bp[i]
341 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
342 mov ($np),%rax
343 adc \$0,%rdx
344
345 imulq $lo0,$m1 # tp[0]*n0
346 mov %rdx,$hi0
347
348 mulq $m1 # np[0]*m1
349 add %rax,$lo0 # discarded
350 mov 8($ap),%rax
351 adc \$0,%rdx
352 mov 8(%rsp),$lo0 # tp[1]
353 mov %rdx,$hi1
354
355 lea 1($j),$j # j++
356 jmp .Linner_enter
357
358.align 16
359.Linner:
360 add %rax,$hi1
361 mov ($ap,$j,8),%rax
362 adc \$0,%rdx
363 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
364 mov (%rsp,$j,8),$lo0
365 adc \$0,%rdx
366 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
367 mov %rdx,$hi1
368
369.Linner_enter:
370 mulq $m0 # ap[j]*bp[i]
371 add %rax,$hi0
372 mov ($np,$j,8),%rax
373 adc \$0,%rdx
374 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
375 mov %rdx,$hi0
376 adc \$0,$hi0
377 lea 1($j),$j # j++
378
379 mulq $m1 # np[j]*m1
380 cmp $num,$j
381 jne .Linner # note that upon exit $j==$num, so
382 # they can be used interchangeably
383 add %rax,$hi1
384 adc \$0,%rdx
385 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
386 mov (%rsp,$num,8),$lo0
387 adc \$0,%rdx
388 mov $hi1,-16(%rsp,$num,8) # tp[num-1]
389 mov %rdx,$hi1
390
391 xor %rdx,%rdx
392 add $hi0,$hi1
393 adc \$0,%rdx
394 add $lo0,$hi1 # pull upmost overflow bit
395 adc \$0,%rdx
396 mov $hi1,-8(%rsp,$num,8)
397 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
398
399 lea 1($i),$i # i++
400 cmp $num,$i
401 jb .Louter
402
403 xor $i,$i # i=0 and clear CF!
404 mov (%rsp),%rax # tp[0]
405 lea (%rsp),$ap # borrow ap for tp
406 mov $num,$j # j=num
407 jmp .Lsub
408.align 16
409.Lsub: sbb ($np,$i,8),%rax
410 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
411 mov 8($ap,$i,8),%rax # tp[i+1]
412 lea 1($i),$i # i++
413 dec $j # doesn't affect CF!
414 jnz .Lsub
415
416 sbb \$0,%rax # handle upmost overflow bit
417 xor $i,$i
418 and %rax,$ap
419 not %rax
420 mov $rp,$np
421 and %rax,$np
422 mov $num,$j # j=num
423 or $np,$ap # ap=borrow?tp:rp
424.align 16
425.Lcopy: # copy or in-place refresh
426 mov ($ap,$i,8),%rax
427 mov $i,(%rsp,$i,8) # zap temporary vector
428 mov %rax,($rp,$i,8) # rp[i]=tp[i]
429 lea 1($i),$i
430 sub \$1,$j
431 jnz .Lcopy
432
433 mov 8(%rsp,$num,8),%rsi # restore %rsp
434 mov \$1,%rax
435
436 mov -48(%rsi),%r15
437 mov -40(%rsi),%r14
438 mov -32(%rsi),%r13
439 mov -24(%rsi),%r12
440 mov -16(%rsi),%rbp
441 mov -8(%rsi),%rbx
442 lea (%rsi),%rsp
443.Lmul_epilogue:
444 ret
445.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
446___
447{{{
448my @A=("%r10","%r11");
449my @N=("%r13","%rdi");
450$code.=<<___;
451.type bn_mul4x_mont_gather5,\@function,6
452.align 32
453bn_mul4x_mont_gather5:
454 .byte 0x67
455 mov %rsp,%rax
456.Lmul4x_enter:
457___
458$code.=<<___ if ($addx);
459 and \$0x80108,%r11d
460 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
461 je .Lmulx4x_enter
462___
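# Editorial note (not part of the original OpenSSL file): 0x80108 masks the
# CPUID leaf-7 EBX feature bits cached at OPENSSL_ia32cap_P+8 -- BMI1
# (bit 3), BMI2 (bit 8) and ADX (bit 19); the MULX/ADCX/ADOX path is taken
# only when all three are present.  Illustrative helper:
sub adx_path_ok {
    my ($ia32cap_word2) = @_;                  # third 32-bit word of OPENSSL_ia32cap_P
    my $mask = (1<<3) | (1<<8) | (1<<19);      # == 0x80108
    return ($ia32cap_word2 & $mask) == $mask;  # "and/cmp \$0x80108,%r11d" above
}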
463$code.=<<___;
464 push %rbx
465 push %rbp
466 push %r12
467 push %r13
468 push %r14
469 push %r15
470.Lmul4x_prologue:
471
472 .byte 0x67
473 shl \$3,${num}d # convert $num to bytes
474 lea ($num,$num,2),%r10 # 3*$num in bytes
475 neg $num # -$num
476
477 ##############################################################
478 # Ensure that the stack frame doesn't alias with $rptr+3*$num
479 # modulo 4096, which covers ret[num], am[num] and n[num]
480 # (see bn_exp.c). This is done to let the memory-disambiguation
481 # logic do its magic; an aliasing note follows this generated
482 # block. [An extra [num] is allocated to align with bn_power5's
483 # frame, which is cleansed after completing exponentiation. An
484 # extra 256 bytes is for the power mask from the 7th argument.]
485 #
486 lea -320(%rsp,$num,2),%r11
487 mov %rsp,%rbp
488 sub $rp,%r11
489 and \$4095,%r11
490 cmp %r11,%r10
491 jb .Lmul4xsp_alt
492 sub %r11,%rbp # align with $rp
493 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
494 jmp .Lmul4xsp_done
495
496.align 32
497.Lmul4xsp_alt:
498 lea 4096-320(,$num,2),%r10
499 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
500 sub %r10,%r11
501 mov \$0,%r10
502 cmovc %r10,%r11
503 sub %r11,%rbp
504.Lmul4xsp_done:
505 and \$-64,%rbp
506 mov %rsp,%r11
507 sub %rbp,%r11
508 and \$-4096,%r11
509 lea (%rbp,%r11),%rsp
510 mov (%rsp),%r10
511 cmp %rbp,%rsp
512 ja .Lmul4x_page_walk
513 jmp .Lmul4x_page_walk_done
514
515.Lmul4x_page_walk:
516 lea -4096(%rsp),%rsp
517 mov (%rsp),%r10
518 cmp %rbp,%rsp
519 ja .Lmul4x_page_walk
520.Lmul4x_page_walk_done:
521
522 neg $num
523
524 mov %rax,40(%rsp)
525.Lmul4x_body:
526
527 call mul4x_internal
528
529 mov 40(%rsp),%rsi # restore %rsp
530 mov \$1,%rax
531
532 mov -48(%rsi),%r15
533 mov -40(%rsi),%r14
534 mov -32(%rsi),%r13
535 mov -24(%rsi),%r12
536 mov -16(%rsi),%rbp
537 mov -8(%rsi),%rbx
538 lea (%rsi),%rsp
539.Lmul4x_epilogue:
540 ret
541.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
542
543.type mul4x_internal,\@abi-omnipotent
544.align 32
545mul4x_internal:
546 shl \$5,$num # $num was in bytes
547 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
548 lea .Linc(%rip),%rax
549 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization)
550 shr \$5,$num # restore $num
551___
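# Editorial reference sketch (not part of the original OpenSSL file): the
# frame adjustment above keeps the temporary area off the 4096-byte phase
# of ret[num]/am[num]/n[num], because addresses that agree in their low 12
# bits can be mistaken for aliases by the memory-disambiguation
# (store-to-load forwarding) logic.  Illustrative helper:
sub may_false_alias {
    my ($addr_a, $addr_b) = @_;
    return (($addr_a ^ $addr_b) & 4095) == 0;   # equal modulo 4096
}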
552 $bp="%r12";
553 $STRIDE=2**5*8; # 5 is "window size"
554 $N=$STRIDE/4; # should match cache line size
555 $tp=$i;
556$code.=<<___;
557 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
558 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
559 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization)
560 lea 128(%rdx),$bp # size optimization
561
562 pshufd \$0,%xmm5,%xmm5 # broadcast index
563 movdqa %xmm1,%xmm4
564 .byte 0x67,0x67
565 movdqa %xmm1,%xmm2
566___
567########################################################################
568# calculate mask by comparing 0..31 to index and save result to stack
569#
570$code.=<<___;
571 paddd %xmm0,%xmm1
572 pcmpeqd %xmm5,%xmm0 # compare to 1,0
573 .byte 0x67
574 movdqa %xmm4,%xmm3
575___
576for($i=0;$i<$STRIDE/16-4;$i+=4) {
577$code.=<<___;
578 paddd %xmm1,%xmm2
579 pcmpeqd %xmm5,%xmm1 # compare to 3,2
580 movdqa %xmm0,`16*($i+0)+112`(%r10)
581 movdqa %xmm4,%xmm0
582
583 paddd %xmm2,%xmm3
584 pcmpeqd %xmm5,%xmm2 # compare to 5,4
585 movdqa %xmm1,`16*($i+1)+112`(%r10)
586 movdqa %xmm4,%xmm1
587
588 paddd %xmm3,%xmm0
589 pcmpeqd %xmm5,%xmm3 # compare to 7,6
590 movdqa %xmm2,`16*($i+2)+112`(%r10)
591 movdqa %xmm4,%xmm2
592
593 paddd %xmm0,%xmm1
594 pcmpeqd %xmm5,%xmm0
595 movdqa %xmm3,`16*($i+3)+112`(%r10)
596 movdqa %xmm4,%xmm3
597___
598}
599$code.=<<___; # last iteration can be optimized
600 paddd %xmm1,%xmm2
601 pcmpeqd %xmm5,%xmm1
602 movdqa %xmm0,`16*($i+0)+112`(%r10)
603
604 paddd %xmm2,%xmm3
605 .byte 0x67
606 pcmpeqd %xmm5,%xmm2
607 movdqa %xmm1,`16*($i+1)+112`(%r10)
608
609 pcmpeqd %xmm5,%xmm3
610 movdqa %xmm2,`16*($i+2)+112`(%r10)
611 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register
612
613 pand `16*($i+1)-128`($bp),%xmm1
614 pand `16*($i+2)-128`($bp),%xmm2
615 movdqa %xmm3,`16*($i+3)+112`(%r10)
616 pand `16*($i+3)-128`($bp),%xmm3
617 por %xmm2,%xmm0
618 por %xmm3,%xmm1
619___
620for($i=0;$i<$STRIDE/16-4;$i+=4) {
621$code.=<<___;
622 movdqa `16*($i+0)-128`($bp),%xmm4
623 movdqa `16*($i+1)-128`($bp),%xmm5
624 movdqa `16*($i+2)-128`($bp),%xmm2
625 pand `16*($i+0)+112`(%r10),%xmm4
626 movdqa `16*($i+3)-128`($bp),%xmm3
627 pand `16*($i+1)+112`(%r10),%xmm5
628 por %xmm4,%xmm0
629 pand `16*($i+2)+112`(%r10),%xmm2
630 por %xmm5,%xmm1
631 pand `16*($i+3)+112`(%r10),%xmm3
632 por %xmm2,%xmm0
633 por %xmm3,%xmm1
634___
635}
636$code.=<<___;
637 por %xmm1,%xmm0
638 pshufd \$0x4e,%xmm0,%xmm1
639 por %xmm1,%xmm0
640 lea $STRIDE($bp),$bp
641 movq %xmm0,$m0 # m0=bp[0]
642
643 mov %r13,16+8(%rsp) # save end of b[num]
644 mov $rp, 56+8(%rsp) # save $rp
645
646 mov ($n0),$n0 # pull n0[0] value
647 mov ($ap),%rax
648 lea ($ap,$num),$ap # end of a[num]
649 neg $num
650
651 mov $n0,$m1
652 mulq $m0 # ap[0]*bp[0]
653 mov %rax,$A[0]
654 mov ($np),%rax
655
656 imulq $A[0],$m1 # "tp[0]"*n0
657 lea 64+8(%rsp),$tp
658 mov %rdx,$A[1]
659
660 mulq $m1 # np[0]*m1
661 add %rax,$A[0] # discarded
662 mov 8($ap,$num),%rax
663 adc \$0,%rdx
664 mov %rdx,$N[1]
665
666 mulq $m0
667 add %rax,$A[1]
668 mov 8*1($np),%rax
669 adc \$0,%rdx
670 mov %rdx,$A[0]
671
672 mulq $m1
673 add %rax,$N[1]
674 mov 16($ap,$num),%rax
675 adc \$0,%rdx
676 add $A[1],$N[1]
677 lea 4*8($num),$j # j=4
678 lea 8*4($np),$np
679 adc \$0,%rdx
680 mov $N[1],($tp)
681 mov %rdx,$N[0]
682 jmp .L1st4x
683
684.align 32
685.L1st4x:
686 mulq $m0 # ap[j]*bp[0]
687 add %rax,$A[0]
688 mov -8*2($np),%rax
689 lea 32($tp),$tp
690 adc \$0,%rdx
691 mov %rdx,$A[1]
692
693 mulq $m1 # np[j]*m1
694 add %rax,$N[0]
695 mov -8($ap,$j),%rax
696 adc \$0,%rdx
697 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
698 adc \$0,%rdx
699 mov $N[0],-24($tp) # tp[j-1]
700 mov %rdx,$N[1]
701
702 mulq $m0 # ap[j]*bp[0]
703 add %rax,$A[1]
704 mov -8*1($np),%rax
705 adc \$0,%rdx
706 mov %rdx,$A[0]
707
708 mulq $m1 # np[j]*m1
709 add %rax,$N[1]
710 mov ($ap,$j),%rax
711 adc \$0,%rdx
712 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
713 adc \$0,%rdx
714 mov $N[1],-16($tp) # tp[j-1]
715 mov %rdx,$N[0]
716
717 mulq $m0 # ap[j]*bp[0]
718 add %rax,$A[0]
719 mov 8*0($np),%rax
720 adc \$0,%rdx
721 mov %rdx,$A[1]
722
723 mulq $m1 # np[j]*m1
724 add %rax,$N[0]
725 mov 8($ap,$j),%rax
726 adc \$0,%rdx
727 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
728 adc \$0,%rdx
729 mov $N[0],-8($tp) # tp[j-1]
730 mov %rdx,$N[1]
731
732 mulq $m0 # ap[j]*bp[0]
733 add %rax,$A[1]
734 mov 8*1($np),%rax
735 adc \$0,%rdx
736 mov %rdx,$A[0]
737
738 mulq $m1 # np[j]*m1
739 add %rax,$N[1]
740 mov 16($ap,$j),%rax
741 adc \$0,%rdx
742 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
743 lea 8*4($np),$np
744 adc \$0,%rdx
745 mov $N[1],($tp) # tp[j-1]
746 mov %rdx,$N[0]
747
748 add \$32,$j # j+=4
749 jnz .L1st4x
750
751 mulq $m0 # ap[j]*bp[0]
752 add %rax,$A[0]
753 mov -8*2($np),%rax
754 lea 32($tp),$tp
755 adc \$0,%rdx
756 mov %rdx,$A[1]
757
758 mulq $m1 # np[j]*m1
759 add %rax,$N[0]
760 mov -8($ap),%rax
761 adc \$0,%rdx
762 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
763 adc \$0,%rdx
764 mov $N[0],-24($tp) # tp[j-1]
765 mov %rdx,$N[1]
766
767 mulq $m0 # ap[j]*bp[0]
768 add %rax,$A[1]
769 mov -8*1($np),%rax
770 adc \$0,%rdx
771 mov %rdx,$A[0]
772
773 mulq $m1 # np[j]*m1
774 add %rax,$N[1]
775 mov ($ap,$num),%rax # ap[0]
776 adc \$0,%rdx
777 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
778 adc \$0,%rdx
779 mov $N[1],-16($tp) # tp[j-1]
780 mov %rdx,$N[0]
781
782 lea ($np,$num),$np # rewind $np
783
784 xor $N[1],$N[1]
785 add $A[0],$N[0]
786 adc \$0,$N[1]
787 mov $N[0],-8($tp)
788
789 jmp .Louter4x
790
791.align 32
792.Louter4x:
793 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization)
794 pxor %xmm4,%xmm4
795 pxor %xmm5,%xmm5
796___
797for($i=0;$i<$STRIDE/16;$i+=4) {
798$code.=<<___;
799 movdqa `16*($i+0)-128`($bp),%xmm0
800 movdqa `16*($i+1)-128`($bp),%xmm1
801 movdqa `16*($i+2)-128`($bp),%xmm2
802 movdqa `16*($i+3)-128`($bp),%xmm3
803 pand `16*($i+0)-128`(%rdx),%xmm0
804 pand `16*($i+1)-128`(%rdx),%xmm1
805 por %xmm0,%xmm4
806 pand `16*($i+2)-128`(%rdx),%xmm2
807 por %xmm1,%xmm5
808 pand `16*($i+3)-128`(%rdx),%xmm3
809 por %xmm2,%xmm4
810 por %xmm3,%xmm5
811___
812}
813$code.=<<___;
814 por %xmm5,%xmm4
815 pshufd \$0x4e,%xmm4,%xmm0
816 por %xmm4,%xmm0
817 lea $STRIDE($bp),$bp
818 movq %xmm0,$m0 # m0=bp[i]
819
820 mov ($tp,$num),$A[0]
821 mov $n0,$m1
822 mulq $m0 # ap[0]*bp[i]
823 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
824 mov ($np),%rax
825 adc \$0,%rdx
826
827 imulq $A[0],$m1 # tp[0]*n0
828 mov %rdx,$A[1]
829 mov $N[1],($tp) # store upmost overflow bit
830
831 lea ($tp,$num),$tp # rewind $tp
832
833 mulq $m1 # np[0]*m1
834 add %rax,$A[0] # "$N[0]", discarded
835 mov 8($ap,$num),%rax
836 adc \$0,%rdx
837 mov %rdx,$N[1]
838
839 mulq $m0 # ap[j]*bp[i]
840 add %rax,$A[1]
841 mov 8*1($np),%rax
842 adc \$0,%rdx
843 add 8($tp),$A[1] # +tp[1]
844 adc \$0,%rdx
845 mov %rdx,$A[0]
846
847 mulq $m1 # np[j]*m1
848 add %rax,$N[1]
849 mov 16($ap,$num),%rax
850 adc \$0,%rdx
851 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
852 lea 4*8($num),$j # j=4
853 lea 8*4($np),$np
854 adc \$0,%rdx
855 mov %rdx,$N[0]
856 jmp .Linner4x
857
858.align 32
859.Linner4x:
860 mulq $m0 # ap[j]*bp[i]
861 add %rax,$A[0]
862 mov -8*2($np),%rax
863 adc \$0,%rdx
864 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
865 lea 32($tp),$tp
866 adc \$0,%rdx
867 mov %rdx,$A[1]
868
869 mulq $m1 # np[j]*m1
870 add %rax,$N[0]
871 mov -8($ap,$j),%rax
872 adc \$0,%rdx
873 add $A[0],$N[0]
874 adc \$0,%rdx
875 mov $N[1],-32($tp) # tp[j-1]
876 mov %rdx,$N[1]
877
878 mulq $m0 # ap[j]*bp[i]
879 add %rax,$A[1]
880 mov -8*1($np),%rax
881 adc \$0,%rdx
882 add -8($tp),$A[1]
883 adc \$0,%rdx
884 mov %rdx,$A[0]
885
886 mulq $m1 # np[j]*m1
887 add %rax,$N[1]
888 mov ($ap,$j),%rax
889 adc \$0,%rdx
890 add $A[1],$N[1]
891 adc \$0,%rdx
892 mov $N[0],-24($tp) # tp[j-1]
893 mov %rdx,$N[0]
894
895 mulq $m0 # ap[j]*bp[i]
896 add %rax,$A[0]
897 mov 8*0($np),%rax
898 adc \$0,%rdx
899 add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
900 adc \$0,%rdx
901 mov %rdx,$A[1]
902
903 mulq $m1 # np[j]*m1
904 add %rax,$N[0]
905 mov 8($ap,$j),%rax
906 adc \$0,%rdx
907 add $A[0],$N[0]
908 adc \$0,%rdx
909 mov $N[1],-16($tp) # tp[j-1]
910 mov %rdx,$N[1]
911
912 mulq $m0 # ap[j]*bp[i]
913 add %rax,$A[1]
914 mov 8*1($np),%rax
915 adc \$0,%rdx
916 add 8($tp),$A[1]
917 adc \$0,%rdx
918 mov %rdx,$A[0]
919
920 mulq $m1 # np[j]*m1
921 add %rax,$N[1]
922 mov 16($ap,$j),%rax
923 adc \$0,%rdx
924 add $A[1],$N[1]
925 lea 8*4($np),$np
926 adc \$0,%rdx
927 mov $N[0],-8($tp) # tp[j-1]
928 mov %rdx,$N[0]
929
930 add \$32,$j # j+=4
931 jnz .Linner4x
932
933 mulq $m0 # ap[j]*bp[i]
934 add %rax,$A[0]
935 mov -8*2($np),%rax
936 adc \$0,%rdx
937 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
938 lea 32($tp),$tp
939 adc \$0,%rdx
940 mov %rdx,$A[1]
941
942 mulq $m1 # np[j]*m1
943 add %rax,$N[0]
944 mov -8($ap),%rax
945 adc \$0,%rdx
946 add $A[0],$N[0]
947 adc \$0,%rdx
948 mov $N[1],-32($tp) # tp[j-1]
949 mov %rdx,$N[1]
950
951 mulq $m0 # ap[j]*bp[i]
952 add %rax,$A[1]
953 mov $m1,%rax
954 mov -8*1($np),$m1
955 adc \$0,%rdx
956 add -8($tp),$A[1]
957 adc \$0,%rdx
958 mov %rdx,$A[0]
959
960 mulq $m1 # np[j]*m1
961 add %rax,$N[1]
962 mov ($ap,$num),%rax # ap[0]
963 adc \$0,%rdx
964 add $A[1],$N[1]
965 adc \$0,%rdx
966 mov $N[0],-24($tp) # tp[j-1]
967 mov %rdx,$N[0]
968
969 mov $N[1],-16($tp) # tp[j-1]
970 lea ($np,$num),$np # rewind $np
971
972 xor $N[1],$N[1]
973 add $A[0],$N[0]
974 adc \$0,$N[1]
975 add ($tp),$N[0] # pull upmost overflow bit
976 adc \$0,$N[1] # upmost overflow bit
977 mov $N[0],-8($tp)
978
979 cmp 16+8(%rsp),$bp
980 jb .Louter4x
981___
982if (1) {
983$code.=<<___;
984 xor %rax,%rax
985 sub $N[0],$m1 # compare top-most words
986 adc $j,$j # $j is zero
987 or $j,$N[1]
988 sub $N[1],%rax # %rax=-$N[1]
989 lea ($tp,$num),%rbx # tptr in .sqr4x_sub
990 mov ($np),%r12
991 lea ($np),%rbp # nptr in .sqr4x_sub
992 mov %r9,%rcx
993 sar \$3+2,%rcx
994 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
995 dec %r12 # so that after 'not' we get -n[0]
996 xor %r10,%r10
997 mov 8*1(%rbp),%r13
998 mov 8*2(%rbp),%r14
999 mov 8*3(%rbp),%r15
1000 jmp .Lsqr4x_sub_entry
1001___
1002} else {
1003my @ri=("%rax",$bp,$m0,$m1);
1004my $rp="%rdx";
1005$code.=<<___
1006 xor \$1,$N[1]
1007 lea ($tp,$num),$tp # rewind $tp
1008 sar \$5,$num # cf=0
1009 lea ($np,$N[1],8),$np
1010 mov 56+8(%rsp),$rp # restore $rp
1011 jmp .Lsub4x
1012
1013.align 32
1014.Lsub4x:
1015 .byte 0x66
1016 mov 8*0($tp),@ri[0]
1017 mov 8*1($tp),@ri[1]
1018 .byte 0x66
1019 sbb 16*0($np),@ri[0]
1020 mov 8*2($tp),@ri[2]
1021 sbb 16*1($np),@ri[1]
1022 mov 3*8($tp),@ri[3]
1023 lea 4*8($tp),$tp
1024 sbb 16*2($np),@ri[2]
1025 mov @ri[0],8*0($rp)
1026 sbb 16*3($np),@ri[3]
1027 lea 16*4($np),$np
1028 mov @ri[1],8*1($rp)
1029 mov @ri[2],8*2($rp)
1030 mov @ri[3],8*3($rp)
1031 lea 8*4($rp),$rp
1032
1033 inc $num
1034 jnz .Lsub4x
1035
1036 ret
1037___
1038}
1039$code.=<<___;
1040.size mul4x_internal,.-mul4x_internal
1041___
1042}}}
1043
1044{{{
1045######################################################################
1046# void bn_power5(
1047my $rptr="%rdi"; # BN_ULONG *rptr,
1048my $aptr="%rsi"; # const BN_ULONG *aptr,
1049my $bptr="%rdx"; # const void *table,
1050my $nptr="%rcx"; # const BN_ULONG *nptr,
1051my $n0 ="%r8"; # const BN_ULONG *n0);
1052my $num ="%r9"; # int num, has to be divisible by 8
1053 # int pwr
1054
1055my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
1056my @A0=("%r10","%r11");
1057my @A1=("%r12","%r13");
1058my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
1059
1060$code.=<<___;
1061.globl bn_power5
1062.type bn_power5,\@function,6
1063.align 32
1064bn_power5:
1065 mov %rsp,%rax
1066___
1067$code.=<<___ if ($addx);
1068 mov OPENSSL_ia32cap_P+8(%rip),%r11d
1069 and \$0x80108,%r11d
1070 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
1071 je .Lpowerx5_enter
1072___
1073$code.=<<___;
1074 push %rbx
1075 push %rbp
1076 push %r12
1077 push %r13
1078 push %r14
1079 push %r15
1080.Lpower5_prologue:
1081
1082 shl \$3,${num}d # convert $num to bytes
1083 lea ($num,$num,2),%r10d # 3*$num
1084 neg $num
1085 mov ($n0),$n0 # *n0
1086
1087 ##############################################################
1088 # Ensure that the stack frame doesn't alias with $rptr+3*$num
1089 # modulo 4096, which covers ret[num], am[num] and n[num]
1090 # (see bn_exp.c). This is done to let the memory-disambiguation
1091 # logic do its magic. [An extra 256 bytes is for the power mask
1092 # calculated from the 7th argument, the index.]
1093 #
1094 lea -320(%rsp,$num,2),%r11
1095 mov %rsp,%rbp
1096 sub $rptr,%r11
1097 and \$4095,%r11
1098 cmp %r11,%r10
1099 jb .Lpwr_sp_alt
1100 sub %r11,%rbp # align with $aptr
1101 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
1102 jmp .Lpwr_sp_done
1103
1104.align 32
1105.Lpwr_sp_alt:
1106 lea 4096-320(,$num,2),%r10
1107 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
1108 sub %r10,%r11
1109 mov \$0,%r10
1110 cmovc %r10,%r11
1111 sub %r11,%rbp
1112.Lpwr_sp_done:
1113 and \$-64,%rbp
1114 mov %rsp,%r11
1115 sub %rbp,%r11
1116 and \$-4096,%r11
1117 lea (%rbp,%r11),%rsp
1118 mov (%rsp),%r10
1119 cmp %rbp,%rsp
1120 ja .Lpwr_page_walk
1121 jmp .Lpwr_page_walk_done
1122
1123.Lpwr_page_walk:
1124 lea -4096(%rsp),%rsp
1125 mov (%rsp),%r10
1126 cmp %rbp,%rsp
1127 ja .Lpwr_page_walk
1128.Lpwr_page_walk_done:
1129
1130 mov $num,%r10
1131 neg $num
1132
1133 ##############################################################
1134 # Stack layout
1135 #
1136 # +0 saved $num, used in reduction section
1137 # +8 &t[2*$num], used in reduction section
1138 # +32 saved *n0
1139 # +40 saved %rsp
1140 # +48 t[2*$num]
1141 #
1142 mov $n0, 32(%rsp)
1143 mov %rax, 40(%rsp) # save original %rsp
1144.Lpower5_body:
1145 movq $rptr,%xmm1 # save $rptr, used in sqr8x
1146 movq $nptr,%xmm2 # save $nptr
1147 movq %r10, %xmm3 # -$num, used in sqr8x
1148 movq $bptr,%xmm4
1149
1150 call __bn_sqr8x_internal
1151 call __bn_post4x_internal
1152 call __bn_sqr8x_internal
1153 call __bn_post4x_internal
1154 call __bn_sqr8x_internal
1155 call __bn_post4x_internal
1156 call __bn_sqr8x_internal
1157 call __bn_post4x_internal
1158 call __bn_sqr8x_internal
1159 call __bn_post4x_internal
1160
1161 movq %xmm2,$nptr
1162 movq %xmm4,$bptr
1163 mov $aptr,$rptr
1164 mov 40(%rsp),%rax
1165 lea 32(%rsp),$n0
1166
1167 call mul4x_internal
1168
1169 mov 40(%rsp),%rsi # restore %rsp
1170 mov \$1,%rax
1171 mov -48(%rsi),%r15
1172 mov -40(%rsi),%r14
1173 mov -32(%rsi),%r13
1174 mov -24(%rsi),%r12
1175 mov -16(%rsi),%rbp
1176 mov -8(%rsi),%rbx
1177 lea (%rsi),%rsp
1178.Lpower5_epilogue:
1179 ret
1180.size bn_power5,.-bn_power5
1181
1182.globl bn_sqr8x_internal
1183.hidden bn_sqr8x_internal
1184.type bn_sqr8x_internal,\@abi-omnipotent
1185.align 32
1186bn_sqr8x_internal:
1187__bn_sqr8x_internal:
1188 ##############################################################
1189 # Squaring part:
1190 #
1191 # a) multiply-and-add everything but the a[i]*a[i] terms;
1192 # b) shift the result of a) left by 1 and accumulate the
1193 # a[i]*a[i] products. (A scalar reference sketch follows
1194 # this generated block.)
1195 ##############################################################
1196 # a[1]a[0]
1197 # a[2]a[0]
1198 # a[3]a[0]
1199 # a[2]a[1]
1200 # a[4]a[0]
1201 # a[3]a[1]
1202 # a[5]a[0]
1203 # a[4]a[1]
1204 # a[3]a[2]
1205 # a[6]a[0]
1206 # a[5]a[1]
1207 # a[4]a[2]
1208 # a[7]a[0]
1209 # a[6]a[1]
1210 # a[5]a[2]
1211 # a[4]a[3]
1212 # a[7]a[1]
1213 # a[6]a[2]
1214 # a[5]a[3]
1215 # a[7]a[2]
1216 # a[6]a[3]
1217 # a[5]a[4]
1218 # a[7]a[3]
1219 # a[6]a[4]
1220 # a[7]a[4]
1221 # a[6]a[5]
1222 # a[7]a[5]
1223 # a[7]a[6]
1224 # a[1]a[0]
1225 # a[2]a[0]
1226 # a[3]a[0]
1227 # a[4]a[0]
1228 # a[5]a[0]
1229 # a[6]a[0]
1230 # a[7]a[0]
1231 # a[2]a[1]
1232 # a[3]a[1]
1233 # a[4]a[1]
1234 # a[5]a[1]
1235 # a[6]a[1]
1236 # a[7]a[1]
1237 # a[3]a[2]
1238 # a[4]a[2]
1239 # a[5]a[2]
1240 # a[6]a[2]
1241 # a[7]a[2]
1242 # a[4]a[3]
1243 # a[5]a[3]
1244 # a[6]a[3]
1245 # a[7]a[3]
1246 # a[5]a[4]
1247 # a[6]a[4]
1248 # a[7]a[4]
1249 # a[6]a[5]
1250 # a[7]a[5]
1251 # a[7]a[6]
1252 # a[0]a[0]
1253 # a[1]a[1]
1254 # a[2]a[2]
1255 # a[3]a[3]
1256 # a[4]a[4]
1257 # a[5]a[5]
1258 # a[6]a[6]
1259 # a[7]a[7]
1260
1261 lea 32(%r10),$i # $i=-($num-32)
1262 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
1263
1264 mov $num,$j # $j=$num
1265
1266 # comments apply to $num==8 case
1267 mov -32($aptr,$i),$a0 # a[0]
1268 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
1269 mov -24($aptr,$i),%rax # a[1]
1270 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
1271 mov -16($aptr,$i),$ai # a[2]
1272 mov %rax,$a1
1273
1274 mul $a0 # a[1]*a[0]
1275 mov %rax,$A0[0] # a[1]*a[0]
1276 mov $ai,%rax # a[2]
1277 mov %rdx,$A0[1]
1278 mov $A0[0],-24($tptr,$i) # t[1]
1279
1280 mul $a0 # a[2]*a[0]
1281 add %rax,$A0[1]
1282 mov $ai,%rax
1283 adc \$0,%rdx
1284 mov $A0[1],-16($tptr,$i) # t[2]
1285 mov %rdx,$A0[0]
1286
1287
1288 mov -8($aptr,$i),$ai # a[3]
1289 mul $a1 # a[2]*a[1]
1290 mov %rax,$A1[0] # a[2]*a[1]+t[3]
1291 mov $ai,%rax
1292 mov %rdx,$A1[1]
1293
1294 lea ($i),$j
1295 mul $a0 # a[3]*a[0]
1296 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
1297 mov $ai,%rax
1298 mov %rdx,$A0[1]
1299 adc \$0,$A0[1]
1300 add $A1[0],$A0[0]
1301 adc \$0,$A0[1]
1302 mov $A0[0],-8($tptr,$j) # t[3]
1303 jmp .Lsqr4x_1st
1304
1305.align 32
1306.Lsqr4x_1st:
1307 mov ($aptr,$j),$ai # a[4]
1308 mul $a1 # a[3]*a[1]
1309 add %rax,$A1[1] # a[3]*a[1]+t[4]
1310 mov $ai,%rax
1311 mov %rdx,$A1[0]
1312 adc \$0,$A1[0]
1313
1314 mul $a0 # a[4]*a[0]
1315 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
1316 mov $ai,%rax # a[3]
1317 mov 8($aptr,$j),$ai # a[5]
1318 mov %rdx,$A0[0]
1319 adc \$0,$A0[0]
1320 add $A1[1],$A0[1]
1321 adc \$0,$A0[0]
1322
1323
1324 mul $a1 # a[4]*a[3]
1325 add %rax,$A1[0] # a[4]*a[3]+t[5]
1326 mov $ai,%rax
1327 mov $A0[1],($tptr,$j) # t[4]
1328 mov %rdx,$A1[1]
1329 adc \$0,$A1[1]
1330
1331 mul $a0 # a[5]*a[2]
1332 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
1333 mov $ai,%rax
1334 mov 16($aptr,$j),$ai # a[6]
1335 mov %rdx,$A0[1]
1336 adc \$0,$A0[1]
1337 add $A1[0],$A0[0]
1338 adc \$0,$A0[1]
1339
1340 mul $a1 # a[5]*a[3]
1341 add %rax,$A1[1] # a[5]*a[3]+t[6]
1342 mov $ai,%rax
1343 mov $A0[0],8($tptr,$j) # t[5]
1344 mov %rdx,$A1[0]
1345 adc \$0,$A1[0]
1346
1347 mul $a0 # a[6]*a[2]
1348 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
1349 mov $ai,%rax # a[3]
1350 mov 24($aptr,$j),$ai # a[7]
1351 mov %rdx,$A0[0]
1352 adc \$0,$A0[0]
1353 add $A1[1],$A0[1]
1354 adc \$0,$A0[0]
1355
1356
1357 mul $a1 # a[6]*a[5]
1358 add %rax,$A1[0] # a[6]*a[5]+t[7]
1359 mov $ai,%rax
1360 mov $A0[1],16($tptr,$j) # t[6]
1361 mov %rdx,$A1[1]
1362 adc \$0,$A1[1]
1363 lea 32($j),$j
1364
1365 mul $a0 # a[7]*a[4]
1366 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
1367 mov $ai,%rax
1368 mov %rdx,$A0[1]
1369 adc \$0,$A0[1]
1370 add $A1[0],$A0[0]
1371 adc \$0,$A0[1]
1372 mov $A0[0],-8($tptr,$j) # t[7]
1373
1374 cmp \$0,$j
1375 jne .Lsqr4x_1st
1376
1377 mul $a1 # a[7]*a[5]
1378 add %rax,$A1[1]
1379 lea 16($i),$i
1380 adc \$0,%rdx
1381 add $A0[1],$A1[1]
1382 adc \$0,%rdx
1383
1384 mov $A1[1],($tptr) # t[8]
1385 mov %rdx,$A1[0]
1386 mov %rdx,8($tptr) # t[9]
1387 jmp .Lsqr4x_outer
1388
1389.align 32
1390.Lsqr4x_outer: # comments apply to $num==6 case
1391 mov -32($aptr,$i),$a0 # a[0]
1392 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
1393 mov -24($aptr,$i),%rax # a[1]
1394 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
1395 mov -16($aptr,$i),$ai # a[2]
1396 mov %rax,$a1
1397
1398 mul $a0 # a[1]*a[0]
1399 mov -24($tptr,$i),$A0[0] # t[1]
1400 add %rax,$A0[0] # a[1]*a[0]+t[1]
1401 mov $ai,%rax # a[2]
1402 adc \$0,%rdx
1403 mov $A0[0],-24($tptr,$i) # t[1]
1404 mov %rdx,$A0[1]
1405
1406 mul $a0 # a[2]*a[0]
1407 add %rax,$A0[1]
1408 mov $ai,%rax
1409 adc \$0,%rdx
1410 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
1411 mov %rdx,$A0[0]
1412 adc \$0,$A0[0]
1413 mov $A0[1],-16($tptr,$i) # t[2]
1414
1415 xor $A1[0],$A1[0]
1416
1417 mov -8($aptr,$i),$ai # a[3]
1418 mul $a1 # a[2]*a[1]
1419 add %rax,$A1[0] # a[2]*a[1]+t[3]
1420 mov $ai,%rax
1421 adc \$0,%rdx
1422 add -8($tptr,$i),$A1[0]
1423 mov %rdx,$A1[1]
1424 adc \$0,$A1[1]
1425
1426 mul $a0 # a[3]*a[0]
1427 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
1428 mov $ai,%rax
1429 adc \$0,%rdx
1430 add $A1[0],$A0[0]
1431 mov %rdx,$A0[1]
1432 adc \$0,$A0[1]
1433 mov $A0[0],-8($tptr,$i) # t[3]
1434
1435 lea ($i),$j
1436 jmp .Lsqr4x_inner
1437
1438.align 32
1439.Lsqr4x_inner:
1440 mov ($aptr,$j),$ai # a[4]
1441 mul $a1 # a[3]*a[1]
1442 add %rax,$A1[1] # a[3]*a[1]+t[4]
1443 mov $ai,%rax
1444 mov %rdx,$A1[0]
1445 adc \$0,$A1[0]
1446 add ($tptr,$j),$A1[1]
1447 adc \$0,$A1[0]
1448
1449 .byte 0x67
1450 mul $a0 # a[4]*a[0]
1451 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
1452 mov $ai,%rax # a[3]
1453 mov 8($aptr,$j),$ai # a[5]
1454 mov %rdx,$A0[0]
1455 adc \$0,$A0[0]
1456 add $A1[1],$A0[1]
1457 adc \$0,$A0[0]
1458
1459 mul $a1 # a[4]*a[3]
1460 add %rax,$A1[0] # a[4]*a[3]+t[5]
1461 mov $A0[1],($tptr,$j) # t[4]
1462 mov $ai,%rax
1463 mov %rdx,$A1[1]
1464 adc \$0,$A1[1]
1465 add 8($tptr,$j),$A1[0]
1466 lea 16($j),$j # j++
1467 adc \$0,$A1[1]
1468
1469 mul $a0 # a[5]*a[2]
1470 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
1471 mov $ai,%rax
1472 adc \$0,%rdx
1473 add $A1[0],$A0[0]
1474 mov %rdx,$A0[1]
1475 adc \$0,$A0[1]
1476 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
1477
1478 cmp \$0,$j
1479 jne .Lsqr4x_inner
1480
1481 .byte 0x67
1482 mul $a1 # a[5]*a[3]
1483 add %rax,$A1[1]
1484 adc \$0,%rdx
1485 add $A0[1],$A1[1]
1486 adc \$0,%rdx
1487
1488 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
1489 mov %rdx,$A1[0]
1490 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below
1491
1492 add \$16,$i
1493 jnz .Lsqr4x_outer
1494
1495 # comments apply to $num==4 case
1496 mov -32($aptr),$a0 # a[0]
1497 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
1498 mov -24($aptr),%rax # a[1]
1499 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
1500 mov -16($aptr),$ai # a[2]
1501 mov %rax,$a1
1502
1503 mul $a0 # a[1]*a[0]
1504 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
1505 mov $ai,%rax # a[2]
1506 mov %rdx,$A0[1]
1507 adc \$0,$A0[1]
1508
1509 mul $a0 # a[2]*a[0]
1510 add %rax,$A0[1]
1511 mov $ai,%rax
1512 mov $A0[0],-24($tptr) # t[1]
1513 mov %rdx,$A0[0]
1514 adc \$0,$A0[0]
1515 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
1516 mov -8($aptr),$ai # a[3]
1517 adc \$0,$A0[0]
1518
1519 mul $a1 # a[2]*a[1]
1520 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
1521 mov $ai,%rax
1522 mov $A0[1],-16($tptr) # t[2]
1523 mov %rdx,$A1[1]
1524 adc \$0,$A1[1]
1525
1526 mul $a0 # a[3]*a[0]
1527 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
1528 mov $ai,%rax
1529 mov %rdx,$A0[1]
1530 adc \$0,$A0[1]
1531 add $A1[0],$A0[0]
1532 adc \$0,$A0[1]
1533 mov $A0[0],-8($tptr) # t[3]
1534
1535 mul $a1 # a[3]*a[1]
1536 add %rax,$A1[1]
1537 mov -16($aptr),%rax # a[2]
1538 adc \$0,%rdx
1539 add $A0[1],$A1[1]
1540 adc \$0,%rdx
1541
1542 mov $A1[1],($tptr) # t[4]
1543 mov %rdx,$A1[0]
1544 mov %rdx,8($tptr) # t[5]
1545
1546 mul $ai # a[2]*a[3]
1547___
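# Editorial reference sketch (not part of the original OpenSSL file): the
# squaring strategy described at the top of bn_sqr8x_internal, in exact
# arithmetic via Math::BigInt -- accumulate every cross product a[i]*a[j]
# (i < j) once (the code generated above), double it and add the a[i]*a[i]
# diagonal (the .Lsqr4x_shift_n_add pass generated just below).  Limbs are
# 64-bit, least significant first; the assembly differs in limb scheduling
# and carry handling.
use Math::BigInt;
sub _ref_square {
    my @a = @_;                                  # limbs as nonnegative integers
    my $cross = Math::BigInt->bzero();
    for my $i (0 .. $#a) {
        for my $j ($i + 1 .. $#a) {              # each a[i]*a[j], i < j, exactly once
            $cross->badd(Math::BigInt->new($a[$i])->bmul($a[$j])->blsft(64*($i+$j)));
        }
    }
    my $r = $cross->blsft(1);                    # "shift the result left by 1"
    for my $i (0 .. $#a) {                       # accumulate the squares
        $r->badd(Math::BigInt->new($a[$i])->bmul($a[$i])->blsft(128*$i));
    }
    return $r;                                   # == (sum a[i]*2^(64*i))^2
}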
1548{
1549my ($shift,$carry)=($a0,$a1);
1550my @S=(@A1,$ai,$n0);
1551$code.=<<___;
1552 add \$16,$i
1553 xor $shift,$shift
1554 sub $num,$i # $i=16-$num
1555 xor $carry,$carry
1556
1557 add $A1[0],%rax # t[5]
1558 adc \$0,%rdx
1559 mov %rax,8($tptr) # t[5]
1560 mov %rdx,16($tptr) # t[6]
1561 mov $carry,24($tptr) # t[7]
1562
1563 mov -16($aptr,$i),%rax # a[0]
1564 lea 48+8(%rsp),$tptr
1565 xor $A0[0],$A0[0] # t[0]
1566 mov 8($tptr),$A0[1] # t[1]
1567
1568 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1569 shr \$63,$A0[0]
1570 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1571 shr \$63,$A0[1]
1572 or $A0[0],$S[1] # | t[2*i]>>63
1573 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
1574 mov $A0[1],$shift # shift=t[2*i+1]>>63
1575 mul %rax # a[i]*a[i]
1576 neg $carry # mov $carry,cf
1577 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
1578 adc %rax,$S[0]
1579 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1580 mov $S[0],($tptr)
1581 adc %rdx,$S[1]
1582
1583 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1584 mov $S[1],8($tptr)
1585 sbb $carry,$carry # mov cf,$carry
1586 shr \$63,$A0[0]
1587 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1588 shr \$63,$A0[1]
1589 or $A0[0],$S[3] # | t[2*i]>>63
1590 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
1591 mov $A0[1],$shift # shift=t[2*i+1]>>63
1592 mul %rax # a[i]*a[i]
1593 neg $carry # mov $carry,cf
1594 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
1595 adc %rax,$S[2]
1596 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1597 mov $S[2],16($tptr)
1598 adc %rdx,$S[3]
1599 lea 16($i),$i
1600 mov $S[3],24($tptr)
1601 sbb $carry,$carry # mov cf,$carry
1602 lea 64($tptr),$tptr
1603 jmp .Lsqr4x_shift_n_add
1604
1605.align 32
1606.Lsqr4x_shift_n_add:
1607 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1608 shr \$63,$A0[0]
1609 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1610 shr \$63,$A0[1]
1611 or $A0[0],$S[1] # | t[2*i]>>63
1612 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
1613 mov $A0[1],$shift # shift=t[2*i+1]>>63
1614 mul %rax # a[i]*a[i]
1615 neg $carry # mov $carry,cf
1616 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1617 adc %rax,$S[0]
1618 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1619 mov $S[0],-32($tptr)
1620 adc %rdx,$S[1]
1621
1622 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1623 mov $S[1],-24($tptr)
1624 sbb $carry,$carry # mov cf,$carry
1625 shr \$63,$A0[0]
1626 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1627 shr \$63,$A0[1]
1628 or $A0[0],$S[3] # | t[2*i]>>63
1629 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch
1630 mov $A0[1],$shift # shift=t[2*i+1]>>63
1631 mul %rax # a[i]*a[i]
1632 neg $carry # mov $carry,cf
1633 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1634 adc %rax,$S[2]
1635 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1636 mov $S[2],-16($tptr)
1637 adc %rdx,$S[3]
1638
1639 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1640 mov $S[3],-8($tptr)
1641 sbb $carry,$carry # mov cf,$carry
1642 shr \$63,$A0[0]
1643 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1644 shr \$63,$A0[1]
1645 or $A0[0],$S[1] # | t[2*i]>>63
1646 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
1647 mov $A0[1],$shift # shift=t[2*i+1]>>63
1648 mul %rax # a[i]*a[i]
1649 neg $carry # mov $carry,cf
1650 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
1651 adc %rax,$S[0]
1652 mov 8($aptr,$i),%rax # a[i+1] # prefetch
1653 mov $S[0],0($tptr)
1654 adc %rdx,$S[1]
1655
1656 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1657 mov $S[1],8($tptr)
1658 sbb $carry,$carry # mov cf,$carry
1659 shr \$63,$A0[0]
1660 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1661 shr \$63,$A0[1]
1662 or $A0[0],$S[3] # | t[2*i]>>63
1663 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
1664 mov $A0[1],$shift # shift=t[2*i+1]>>63
1665 mul %rax # a[i]*a[i]
1666 neg $carry # mov $carry,cf
1667 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
1668 adc %rax,$S[2]
1669 mov 16($aptr,$i),%rax # a[i+1] # prefetch
1670 mov $S[2],16($tptr)
1671 adc %rdx,$S[3]
1672 mov $S[3],24($tptr)
1673 sbb $carry,$carry # mov cf,$carry
1674 lea 64($tptr),$tptr
1675 add \$32,$i
1676 jnz .Lsqr4x_shift_n_add
1677
1678 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1679 .byte 0x67
1680 shr \$63,$A0[0]
1681 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1682 shr \$63,$A0[1]
1683 or $A0[0],$S[1] # | t[2*i]>>63
1684 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
1685 mov $A0[1],$shift # shift=t[2*i+1]>>63
1686 mul %rax # a[i]*a[i]
1687 neg $carry # mov $carry,cf
1688 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1689 adc %rax,$S[0]
1690 mov -8($aptr),%rax # a[i+1] # prefetch
1691 mov $S[0],-32($tptr)
1692 adc %rdx,$S[1]
1693
1694 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1695 mov $S[1],-24($tptr)
1696 sbb $carry,$carry # mov cf,$carry
1697 shr \$63,$A0[0]
1698 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1699 shr \$63,$A0[1]
1700 or $A0[0],$S[3] # | t[2*i]>>63
1701 mul %rax # a[i]*a[i]
1702 neg $carry # mov $carry,cf
1703 adc %rax,$S[2]
1704 adc %rdx,$S[3]
1705 mov $S[2],-16($tptr)
1706 mov $S[3],-8($tptr)
1707___
1708}
1709
1710######################################################################
1711# Montgomery reduction part, "word-by-word" algorithm.
1712#
1713# This new path is inspired by multiple submissions from Intel, by
1714# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1715# Vinodh Gopal...
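# Editorial reference sketch (not part of the original OpenSSL file): the
# word-by-word Montgomery reduction that the code below implements, in
# exact arithmetic via Math::BigInt.  $n0 is -n^{-1} mod 2^64 as set up by
# the BN library; the assembly differs in limb scheduling, carry handling
# and the tp[] layout.  The helper mutates and returns its first argument.
use Math::BigInt;
sub _ref_mont_reduce {
    my ($t, $n, $n0, $num) = @_;       # $t, $n: Math::BigInt, $t < $n * 2^(64*$num)
    my $w = Math::BigInt->new(2)->bpow(64);
    for my $i (0 .. $num - 1) {
        my $ti = $t->copy->brsft(64*$i)->bmod($w);   # lowest not-yet-cleared word
        my $m  = $ti->bmul($n0)->bmod($w);           # m = t[i]*n0 mod 2^64
        $t->badd($n->copy->bmul($m)->blsft(64*$i));  # t += m*n*2^(64*i), clears word i
    }
    $t->brsft(64*$num);                              # t /= R, with R = 2^(64*$num)
    $t->bsub($n) if $t->bcmp($n) >= 0;               # single conditional subtraction
    return $t;                                       # == T * R^{-1} mod n
}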
1716{
1717my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1718
1719$code.=<<___;
1720 movq %xmm2,$nptr
1721__bn_sqr8x_reduction:
1722 xor %rax,%rax
1723 lea ($nptr,$num),%rcx # end of n[]
1724 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
1725 mov %rcx,0+8(%rsp)
1726 lea 48+8(%rsp,$num),$tptr # end of initial t[] window
1727 mov %rdx,8+8(%rsp)
1728 neg $num
1729 jmp .L8x_reduction_loop
1730
1731.align 32
1732.L8x_reduction_loop:
1733 lea ($tptr,$num),$tptr # start of current t[] window
1734 .byte 0x66
1735 mov 8*0($tptr),$m0
1736 mov 8*1($tptr),%r9
1737 mov 8*2($tptr),%r10
1738 mov 8*3($tptr),%r11
1739 mov 8*4($tptr),%r12
1740 mov 8*5($tptr),%r13
1741 mov 8*6($tptr),%r14
1742 mov 8*7($tptr),%r15
1743 mov %rax,(%rdx) # store top-most carry bit
1744 lea 8*8($tptr),$tptr
1745
1746 .byte 0x67
1747 mov $m0,%r8
1748 imulq 32+8(%rsp),$m0 # n0*a[0]
1749 mov 8*0($nptr),%rax # n[0]
1750 mov \$8,%ecx
1751 jmp .L8x_reduce
1752
1753.align 32
1754.L8x_reduce:
1755 mulq $m0
1756 mov 8*1($nptr),%rax # n[1]
1757 neg %r8
1758 mov %rdx,%r8
1759 adc \$0,%r8
1760
1761 mulq $m0
1762 add %rax,%r9
1763 mov 8*2($nptr),%rax
1764 adc \$0,%rdx
1765 add %r9,%r8
1766 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
1767 mov %rdx,%r9
1768 adc \$0,%r9
1769
1770 mulq $m0
1771 add %rax,%r10
1772 mov 8*3($nptr),%rax
1773 adc \$0,%rdx
1774 add %r10,%r9
1775 mov 32+8(%rsp),$carry # pull n0, borrow $carry
1776 mov %rdx,%r10
1777 adc \$0,%r10
1778
1779 mulq $m0
1780 add %rax,%r11
1781 mov 8*4($nptr),%rax
1782 adc \$0,%rdx
1783 imulq %r8,$carry # modulo-scheduled
1784 add %r11,%r10
1785 mov %rdx,%r11
1786 adc \$0,%r11
1787
1788 mulq $m0
1789 add %rax,%r12
1790 mov 8*5($nptr),%rax
1791 adc \$0,%rdx
1792 add %r12,%r11
1793 mov %rdx,%r12
1794 adc \$0,%r12
1795
1796 mulq $m0
1797 add %rax,%r13
1798 mov 8*6($nptr),%rax
1799 adc \$0,%rdx
1800 add %r13,%r12
1801 mov %rdx,%r13
1802 adc \$0,%r13
1803
1804 mulq $m0
1805 add %rax,%r14
1806 mov 8*7($nptr),%rax
1807 adc \$0,%rdx
1808 add %r14,%r13
1809 mov %rdx,%r14
1810 adc \$0,%r14
1811
1812 mulq $m0
1813 mov $carry,$m0 # n0*a[i]
1814 add %rax,%r15
1815 mov 8*0($nptr),%rax # n[0]
1816 adc \$0,%rdx
1817 add %r15,%r14
1818 mov %rdx,%r15
1819 adc \$0,%r15
1820
1821 dec %ecx
1822 jnz .L8x_reduce
1823
1824 lea 8*8($nptr),$nptr
1825 xor %rax,%rax
1826 mov 8+8(%rsp),%rdx # pull end of t[]
1827 cmp 0+8(%rsp),$nptr # end of n[]?
1828 jae .L8x_no_tail
1829
1830 .byte 0x66
1831 add 8*0($tptr),%r8
1832 adc 8*1($tptr),%r9
1833 adc 8*2($tptr),%r10
1834 adc 8*3($tptr),%r11
1835 adc 8*4($tptr),%r12
1836 adc 8*5($tptr),%r13
1837 adc 8*6($tptr),%r14
1838 adc 8*7($tptr),%r15
1839 sbb $carry,$carry # top carry
1840
1841 mov 48+56+8(%rsp),$m0 # pull n0*a[0]
1842 mov \$8,%ecx
1843 mov 8*0($nptr),%rax
1844 jmp .L8x_tail
1845
1846.align 32
1847.L8x_tail:
1848 mulq $m0
1849 add %rax,%r8
1850 mov 8*1($nptr),%rax
1851 mov %r8,($tptr) # save result
1852 mov %rdx,%r8
1853 adc \$0,%r8
1854
1855 mulq $m0
1856 add %rax,%r9
1857 mov 8*2($nptr),%rax
1858 adc \$0,%rdx
1859 add %r9,%r8
1860 lea 8($tptr),$tptr # $tptr++
1861 mov %rdx,%r9
1862 adc \$0,%r9
1863
1864 mulq $m0
1865 add %rax,%r10
1866 mov 8*3($nptr),%rax
1867 adc \$0,%rdx
1868 add %r10,%r9
1869 mov %rdx,%r10
1870 adc \$0,%r10
1871
1872 mulq $m0
1873 add %rax,%r11
1874 mov 8*4($nptr),%rax
1875 adc \$0,%rdx
1876 add %r11,%r10
1877 mov %rdx,%r11
1878 adc \$0,%r11
1879
1880 mulq $m0
1881 add %rax,%r12
1882 mov 8*5($nptr),%rax
1883 adc \$0,%rdx
1884 add %r12,%r11
1885 mov %rdx,%r12
1886 adc \$0,%r12
1887
1888 mulq $m0
1889 add %rax,%r13
1890 mov 8*6($nptr),%rax
1891 adc \$0,%rdx
1892 add %r13,%r12
1893 mov %rdx,%r13
1894 adc \$0,%r13
1895
1896 mulq $m0
1897 add %rax,%r14
1898 mov 8*7($nptr),%rax
1899 adc \$0,%rdx
1900 add %r14,%r13
1901 mov %rdx,%r14
1902 adc \$0,%r14
1903
1904 mulq $m0
1905 mov 48-16+8(%rsp,%rcx,8),$m0 # pull n0*a[i]
1906 add %rax,%r15
1907 adc \$0,%rdx
1908 add %r15,%r14
1909 mov 8*0($nptr),%rax # pull n[0]
1910 mov %rdx,%r15
1911 adc \$0,%r15
1912
1913 dec %ecx
1914 jnz .L8x_tail
1915
1916 lea 8*8($nptr),$nptr
1917 mov 8+8(%rsp),%rdx # pull end of t[]
1918 cmp 0+8(%rsp),$nptr # end of n[]?
1919 jae .L8x_tail_done # break out of loop
1920
1921 mov 48+56+8(%rsp),$m0 # pull n0*a[0]
1922 neg $carry
1923 mov 8*0($nptr),%rax # pull n[0]
1924 adc 8*0($tptr),%r8
1925 adc 8*1($tptr),%r9
1926 adc 8*2($tptr),%r10
1927 adc 8*3($tptr),%r11
1928 adc 8*4($tptr),%r12
1929 adc 8*5($tptr),%r13
1930 adc 8*6($tptr),%r14
1931 adc 8*7($tptr),%r15
1932 sbb $carry,$carry # top carry
1933
1934 mov \$8,%ecx
1935 jmp .L8x_tail
1936
1937.align 32
1938.L8x_tail_done:
1939 xor %rax,%rax
1940 add (%rdx),%r8 # can this overflow?
1941 adc \$0,%r9
1942 adc \$0,%r10
1943 adc \$0,%r11
1944 adc \$0,%r12
1945 adc \$0,%r13
1946 adc \$0,%r14
1947 adc \$0,%r15
1948 adc \$0,%rax
1949
1950 neg $carry
1951.L8x_no_tail:
1952 adc 8*0($tptr),%r8
1953 adc 8*1($tptr),%r9
1954 adc 8*2($tptr),%r10
1955 adc 8*3($tptr),%r11
1956 adc 8*4($tptr),%r12
1957 adc 8*5($tptr),%r13
1958 adc 8*6($tptr),%r14
1959 adc 8*7($tptr),%r15
1960 adc \$0,%rax # top-most carry
1961 mov -8($nptr),%rcx # np[num-1]
1962 xor $carry,$carry
1963
1964 movq %xmm2,$nptr # restore $nptr
1965
1966 mov %r8,8*0($tptr) # store top 512 bits
1967 mov %r9,8*1($tptr)
1968 movq %xmm3,$num # $num is %r9, can't be moved upwards
1969 mov %r10,8*2($tptr)
1970 mov %r11,8*3($tptr)
1971 mov %r12,8*4($tptr)
1972 mov %r13,8*5($tptr)
1973 mov %r14,8*6($tptr)
1974 mov %r15,8*7($tptr)
1975 lea 8*8($tptr),$tptr
1976
1977 cmp %rdx,$tptr # end of t[]?
1978 jb .L8x_reduction_loop
1979 ret
1980.size bn_sqr8x_internal,.-bn_sqr8x_internal
1981___
1982}
1983
1984##############################################################
1985# Post-condition, 4x unrolled
1986#
1987{
1988my ($tptr,$nptr)=("%rbx","%rbp");
1989$code.=<<___;
1990.type __bn_post4x_internal,\@abi-omnipotent
1991.align 32
1992__bn_post4x_internal:
1993 mov 8*0($nptr),%r12
1994 lea (%rdi,$num),$tptr # %rdi was $tptr above
1995 mov $num,%rcx
1996 movq %xmm1,$rptr # restore $rptr
1997 neg %rax
1998 movq %xmm1,$aptr # prepare for back-to-back call
1999 sar \$3+2,%rcx
2000 dec %r12 # so that after 'not' we get -n[0]
2001 xor %r10,%r10
2002 mov 8*1($nptr),%r13
2003 mov 8*2($nptr),%r14
2004 mov 8*3($nptr),%r15
2005 jmp .Lsqr4x_sub_entry
2006
2007.align 16
2008.Lsqr4x_sub:
2009 mov 8*0($nptr),%r12
2010 mov 8*1($nptr),%r13
2011 mov 8*2($nptr),%r14
2012 mov 8*3($nptr),%r15
2013.Lsqr4x_sub_entry:
2014 lea 8*4($nptr),$nptr
2015 not %r12
2016 not %r13
2017 not %r14
2018 not %r15
2019 and %rax,%r12
2020 and %rax,%r13
2021 and %rax,%r14
2022 and %rax,%r15
2023
2024 neg %r10 # mov %r10,%cf
2025 adc 8*0($tptr),%r12
2026 adc 8*1($tptr),%r13
2027 adc 8*2($tptr),%r14
2028 adc 8*3($tptr),%r15
2029 mov %r12,8*0($rptr)
2030 lea 8*4($tptr),$tptr
2031 mov %r13,8*1($rptr)
2032 sbb %r10,%r10 # mov %cf,%r10
2033 mov %r14,8*2($rptr)
2034 mov %r15,8*3($rptr)
2035 lea 8*4($rptr),$rptr
2036
2037 inc %rcx # pass %cf
2038 jnz .Lsqr4x_sub
2039
2040 mov $num,%r10 # prepare for back-to-back call
2041 neg $num # restore $num
2042 ret
2043.size __bn_post4x_internal,.-__bn_post4x_internal
2044___
2045}
2046{
2047$code.=<<___;
2048.globl bn_from_montgomery
2049.type bn_from_montgomery,\@abi-omnipotent
2050.align 32
2051bn_from_montgomery:
2052 testl \$7,`($win64?"48(%rsp)":"%r9d")`
2053 jz bn_from_mont8x
2054 xor %eax,%eax
2055 ret
2056.size bn_from_montgomery,.-bn_from_montgomery
2057
2058.type bn_from_mont8x,\@function,6
2059.align 32
2060bn_from_mont8x:
2061 .byte 0x67
2062 mov %rsp,%rax
2063 push %rbx
2064 push %rbp
2065 push %r12
2066 push %r13
2067 push %r14
2068 push %r15
2069.Lfrom_prologue:
2070
2071 shl \$3,${num}d # convert $num to bytes
2072 lea ($num,$num,2),%r10 # 3*$num in bytes
2073 neg $num
2074 mov ($n0),$n0 # *n0
2075
2076 ##############################################################
2077 # Ensure that the stack frame doesn't alias with $rptr+3*$num
2078 # modulo 4096, which covers ret[num], am[num] and n[num]
2079 # (see bn_exp.c). The stack is allocated so as to align with
2080 # bn_power5's frame, and as bn_from_montgomery happens to be the
2081 # last operation, we use the opportunity to cleanse it.
2082 #
2083 lea -320(%rsp,$num,2),%r11
2084 mov %rsp,%rbp
2085 sub $rptr,%r11
2086 and \$4095,%r11
2087 cmp %r11,%r10
2088 jb .Lfrom_sp_alt
2089 sub %r11,%rbp # align with $aptr
2090 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2091 jmp .Lfrom_sp_done
2092
2093.align 32
2094.Lfrom_sp_alt:
2095 lea 4096-320(,$num,2),%r10
2096 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2097 sub %r10,%r11
2098 mov \$0,%r10
2099 cmovc %r10,%r11
2100 sub %r11,%rbp
2101.Lfrom_sp_done:
2102 and \$-64,%rbp
2103 mov %rsp,%r11
2104 sub %rbp,%r11
2105 and \$-4096,%r11
2106 lea (%rbp,%r11),%rsp
2107 mov (%rsp),%r10
2108 cmp %rbp,%rsp
2109 ja .Lfrom_page_walk
2110 jmp .Lfrom_page_walk_done
2111
2112.Lfrom_page_walk:
2113 lea -4096(%rsp),%rsp
2114 mov (%rsp),%r10
2115 cmp %rbp,%rsp
2116 ja .Lfrom_page_walk
2117.Lfrom_page_walk_done:
2118
2119 mov $num,%r10
2120 neg $num
2121
2122 ##############################################################
2123 # Stack layout
2124 #
2125 # +0 saved $num, used in reduction section
2126 # +8 &t[2*$num], used in reduction section
2127 # +32 saved *n0
2128 # +40 saved %rsp
2129 # +48 t[2*$num]
2130 #
2131 mov $n0, 32(%rsp)
2132 mov %rax, 40(%rsp) # save original %rsp
2133.Lfrom_body:
2134 mov $num,%r11
2135 lea 48(%rsp),%rax
2136 pxor %xmm0,%xmm0
2137 jmp .Lmul_by_1
2138
2139.align 32
2140.Lmul_by_1:
2141 movdqu ($aptr),%xmm1
2142 movdqu 16($aptr),%xmm2
2143 movdqu 32($aptr),%xmm3
2144 movdqa %xmm0,(%rax,$num)
2145 movdqu 48($aptr),%xmm4
2146 movdqa %xmm0,16(%rax,$num)
2147 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
2148 movdqa %xmm1,(%rax)
2149 movdqa %xmm0,32(%rax,$num)
2150 movdqa %xmm2,16(%rax)
2151 movdqa %xmm0,48(%rax,$num)
2152 movdqa %xmm3,32(%rax)
2153 movdqa %xmm4,48(%rax)
2154 lea 64(%rax),%rax
2155 sub \$64,%r11
2156 jnz .Lmul_by_1
2157
2158 movq $rptr,%xmm1
2159 movq $nptr,%xmm2
2160 .byte 0x67
2161 mov $nptr,%rbp
2162 movq %r10, %xmm3 # -num
2163___
2164$code.=<<___ if ($addx);
2165 mov OPENSSL_ia32cap_P+8(%rip),%r11d
2166 and \$0x80108,%r11d
2167 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
2168 jne .Lfrom_mont_nox
2169
2170 lea (%rax,$num),$rptr
2171 call __bn_sqrx8x_reduction
2172 call __bn_postx4x_internal
2173
2174 pxor %xmm0,%xmm0
2175 lea 48(%rsp),%rax
2176 mov 40(%rsp),%rsi # restore %rsp
2177 jmp .Lfrom_mont_zero
2178
2179.align 32
2180.Lfrom_mont_nox:
2181___
2182$code.=<<___;
2183 call __bn_sqr8x_reduction
2184 call __bn_post4x_internal
2185
2186 pxor %xmm0,%xmm0
2187 lea 48(%rsp),%rax
2188 mov 40(%rsp),%rsi # restore %rsp
2189 jmp .Lfrom_mont_zero
2190
2191.align 32
2192.Lfrom_mont_zero:
2193 movdqa %xmm0,16*0(%rax)
2194 movdqa %xmm0,16*1(%rax)
2195 movdqa %xmm0,16*2(%rax)
2196 movdqa %xmm0,16*3(%rax)
2197 lea 16*4(%rax),%rax
2198 sub \$32,$num
2199 jnz .Lfrom_mont_zero
2200
2201 mov \$1,%rax
2202 mov -48(%rsi),%r15
2203 mov -40(%rsi),%r14
2204 mov -32(%rsi),%r13
2205 mov -24(%rsi),%r12
2206 mov -16(%rsi),%rbp
2207 mov -8(%rsi),%rbx
2208 lea (%rsi),%rsp
2209.Lfrom_epilogue:
2210 ret
2211.size bn_from_mont8x,.-bn_from_mont8x
2212___
2213}
2214}}}
2215
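# Editorial reference sketch (not part of the original OpenSSL file):
# bn_from_mont8x converts out of Montgomery form, which amounts to a
# Montgomery reduction of the value itself (a Montgomery multiplication
# by 1); .Lmul_by_1 above just copies a[] into t[] and zeroes the upper
# half before the reduction runs.  Expressed with the reference reducer
# sketched before the reduction code above (illustrative only):
sub _ref_from_mont {
    my ($x, $n, $n0, $num) = @_;                       # $x, $n: Math::BigInt, $x < $n
    return _ref_mont_reduce($x->copy, $n, $n0, $num);  # == x * R^{-1} mod n
}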
2216
2217if ($addx) {{{
2218my $bp="%rdx"; # restore original value
2219
2220$code.=<<___;
2221.type bn_mulx4x_mont_gather5,\@function,6
2222.align 32
2223bn_mulx4x_mont_gather5:
2224 mov %rsp,%rax
2225.Lmulx4x_enter:
2226 push %rbx
2227 push %rbp
2228 push %r12
2229 push %r13
2230 push %r14
2231 push %r15
2232.Lmulx4x_prologue:
2233
2234 shl \$3,${num}d # convert $num to bytes
2235 lea ($num,$num,2),%r10 # 3*$num in bytes
2236 neg $num # -$num
2237 mov ($n0),$n0 # *n0
2238
2239 ##############################################################
2240 # Ensure that the stack frame doesn't alias with $rptr+3*$num
2241 # modulo 4096, which covers ret[num], am[num] and n[num]
2242 # (see bn_exp.c). This is done to let the memory-disambiguation
2243 # logic do its magic. [An extra [num] is allocated in order
2244 # to align with bn_power5's frame, which is cleansed after
2245 # completing exponentiation. An extra 256 bytes is for the power
2246 # mask calculated from the 7th argument, the index.]
2247 #
2248 lea -320(%rsp,$num,2),%r11
2249 mov %rsp,%rbp
2250 sub $rp,%r11
2251 and \$4095,%r11
2252 cmp %r11,%r10
2253 jb .Lmulx4xsp_alt
2254 sub %r11,%rbp # align with $aptr
2255 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2256 jmp .Lmulx4xsp_done
2257
2258.Lmulx4xsp_alt:
2259 lea 4096-320(,$num,2),%r10
2260 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2261 sub %r10,%r11
2262 mov \$0,%r10
2263 cmovc %r10,%r11
2264 sub %r11,%rbp
2265.Lmulx4xsp_done:
2266 and \$-64,%rbp # ensure alignment
2267 mov %rsp,%r11
2268 sub %rbp,%r11
2269 and \$-4096,%r11
2270 lea (%rbp,%r11),%rsp
2271 mov (%rsp),%r10
2272 cmp %rbp,%rsp
2273 ja .Lmulx4x_page_walk
2274 jmp .Lmulx4x_page_walk_done
2275
2276.Lmulx4x_page_walk:
2277 lea -4096(%rsp),%rsp
2278 mov (%rsp),%r10
2279 cmp %rbp,%rsp
2280 ja .Lmulx4x_page_walk
2281.Lmulx4x_page_walk_done:
2282
2283 ##############################################################
2284 # Stack layout
2285 # +0 -num
2286 # +8 off-loaded &b[i]
2287 # +16 end of b[num]
2288 # +24 inner counter
2289 # +32 saved n0
2290 # +40 saved %rsp
2291 # +48
2292 # +56 saved rp
2293 # +64 tmp[num+1]
2294 #
2295 mov $n0, 32(%rsp) # save *n0
2296 mov %rax,40(%rsp) # save original %rsp
2297.Lmulx4x_body:
2298 call mulx4x_internal
2299
2300 mov 40(%rsp),%rsi # restore %rsp
2301 mov \$1,%rax
2302
2303 mov -48(%rsi),%r15
2304 mov -40(%rsi),%r14
2305 mov -32(%rsi),%r13
2306 mov -24(%rsi),%r12
2307 mov -16(%rsi),%rbp
2308 mov -8(%rsi),%rbx
2309 lea (%rsi),%rsp
2310.Lmulx4x_epilogue:
2311 ret
2312.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2313
2314.type mulx4x_internal,\@abi-omnipotent
2315.align 32
2316mulx4x_internal:
2317 mov $num,8(%rsp) # save -$num (it was in bytes)
2318 mov $num,%r10
2319 neg $num # restore $num
2320 shl \$5,$num
2321 neg %r10 # restore $num
2322 lea 128($bp,$num),%r13 # end of powers table (+size optimization)
2323 shr \$5+5,$num
2324 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument
2325 sub \$1,$num
2326 lea .Linc(%rip),%rax
2327 mov %r13,16+8(%rsp) # end of b[num]
2328 mov $num,24+8(%rsp) # inner counter
2329 mov $rp, 56+8(%rsp) # save $rp
2330___
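# A minimal Perl model of the .Lmulx4xsp_alt/.Lmulx4xsp_done placement test above
# (the helper name below is ours, for illustration only).  The "and \$4095 /
# cmp %r11,%r10" pair checks whether the candidate frame bottom lands within
# 3*num bytes of $rp modulo one 4KiB page -- the aliasing condition the memory
# disambiguation hardware dislikes -- and the code picks between the two frame
# placements accordingly.
sub frames_alias_4k {
    my ($frame, $rp, $len) = @_;        # $len = 3*num bytes: ret[num], am[num], n[num]
    my $dist = ($frame - $rp) & 4095;   # distance modulo the page size
    return $dist < $len;                # true when the two windows collide mod 4096
}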
2331my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
2332 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2333my $rptr=$bptr;
2334my $STRIDE=2**5*8; # 5 is "window size"
2335my $N=$STRIDE/4; # should match cache line size
2336$code.=<<___;
2337 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
2338 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
2339	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
2340 lea 128($bp),$bptr # size optimization
2341
2342 pshufd \$0,%xmm5,%xmm5 # broadcast index
2343 movdqa %xmm1,%xmm4
2344 .byte 0x67
2345 movdqa %xmm1,%xmm2
2346___
2347########################################################################
2348# calculate mask by comparing 0..31 to index and save result to stack
2349#
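# A plain-Perl model of the same idea (the sub name is ours, illustration only):
# all 32 interleaved entries of a limb row are read and the wanted one is selected
# with an all-ones/all-zeros mask, so the memory access pattern is independent of
# the secret index.  The SSE2 code below builds those masks with pcmpeqd and never
# takes a data-dependent branch; this sketch only models the access pattern.
sub gather5_ref {
    my ($table, $idx) = @_;                 # $table: ref to the 32 slots of one limb row
    my $acc = 0;
    for my $j (0 .. 31) {
        my $mask = ($j == $idx) ? ~0 : 0;   # all-ones only for the wanted slot
        $acc |= $table->[$j] & $mask;       # every slot is touched regardless of $idx
    }
    return $acc;
}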
2350$code.=<<___;
2351 .byte 0x67
2352 paddd %xmm0,%xmm1
2353 pcmpeqd %xmm5,%xmm0 # compare to 1,0
2354 movdqa %xmm4,%xmm3
2355___
2356for($i=0;$i<$STRIDE/16-4;$i+=4) {
2357$code.=<<___;
2358 paddd %xmm1,%xmm2
2359 pcmpeqd %xmm5,%xmm1 # compare to 3,2
2360 movdqa %xmm0,`16*($i+0)+112`(%r10)
2361 movdqa %xmm4,%xmm0
2362
2363 paddd %xmm2,%xmm3
2364 pcmpeqd %xmm5,%xmm2 # compare to 5,4
2365 movdqa %xmm1,`16*($i+1)+112`(%r10)
2366 movdqa %xmm4,%xmm1
2367
2368 paddd %xmm3,%xmm0
2369 pcmpeqd %xmm5,%xmm3 # compare to 7,6
2370 movdqa %xmm2,`16*($i+2)+112`(%r10)
2371 movdqa %xmm4,%xmm2
2372
2373 paddd %xmm0,%xmm1
2374 pcmpeqd %xmm5,%xmm0
2375 movdqa %xmm3,`16*($i+3)+112`(%r10)
2376 movdqa %xmm4,%xmm3
2377___
2378}
2379$code.=<<___; # last iteration can be optimized
2380 .byte 0x67
2381 paddd %xmm1,%xmm2
2382 pcmpeqd %xmm5,%xmm1
2383 movdqa %xmm0,`16*($i+0)+112`(%r10)
2384
2385 paddd %xmm2,%xmm3
2386 pcmpeqd %xmm5,%xmm2
2387 movdqa %xmm1,`16*($i+1)+112`(%r10)
2388
2389 pcmpeqd %xmm5,%xmm3
2390 movdqa %xmm2,`16*($i+2)+112`(%r10)
2391
2392 pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register
2393 pand `16*($i+1)-128`($bptr),%xmm1
2394 pand `16*($i+2)-128`($bptr),%xmm2
2395 movdqa %xmm3,`16*($i+3)+112`(%r10)
2396 pand `16*($i+3)-128`($bptr),%xmm3
2397 por %xmm2,%xmm0
2398 por %xmm3,%xmm1
2399___
2400for($i=0;$i<$STRIDE/16-4;$i+=4) {
2401$code.=<<___;
2402 movdqa `16*($i+0)-128`($bptr),%xmm4
2403 movdqa `16*($i+1)-128`($bptr),%xmm5
2404 movdqa `16*($i+2)-128`($bptr),%xmm2
2405 pand `16*($i+0)+112`(%r10),%xmm4
2406 movdqa `16*($i+3)-128`($bptr),%xmm3
2407 pand `16*($i+1)+112`(%r10),%xmm5
2408 por %xmm4,%xmm0
2409 pand `16*($i+2)+112`(%r10),%xmm2
2410 por %xmm5,%xmm1
2411 pand `16*($i+3)+112`(%r10),%xmm3
2412 por %xmm2,%xmm0
2413 por %xmm3,%xmm1
2414___
2415}
2416$code.=<<___;
2417 pxor %xmm1,%xmm0
2418 pshufd \$0x4e,%xmm0,%xmm1
2419 por %xmm1,%xmm0
2420 lea $STRIDE($bptr),$bptr
2421 movq %xmm0,%rdx # bp[0]
2422 lea 64+8*4+8(%rsp),$tptr
2423
2424 mov %rdx,$bi
2425 mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
2426 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0]
2427 add %rax,%r11
2428 mulx 2*8($aptr),%rax,%r13 # ...
2429 adc %rax,%r12
2430 adc \$0,%r13
2431 mulx 3*8($aptr),%rax,%r14
2432
2433 mov $mi,%r15
2434 imulq 32+8(%rsp),$mi # "t[0]"*n0
2435 xor $zero,$zero # cf=0, of=0
2436 mov $mi,%rdx
2437
2438 mov $bptr,8+8(%rsp) # off-load &b[i]
2439
2440 lea 4*8($aptr),$aptr
2441 adcx %rax,%r13
2442 adcx $zero,%r14 # cf=0
2443
2444 mulx 0*8($nptr),%rax,%r10
2445 adcx %rax,%r15 # discarded
2446 adox %r11,%r10
2447 mulx 1*8($nptr),%rax,%r11
2448 adcx %rax,%r10
2449 adox %r12,%r11
2450 mulx 2*8($nptr),%rax,%r12
2451 mov 24+8(%rsp),$bptr # counter value
2452 mov %r10,-8*4($tptr)
2453 adcx %rax,%r11
2454 adox %r13,%r12
2455 mulx 3*8($nptr),%rax,%r15
2456 mov $bi,%rdx
2457 mov %r11,-8*3($tptr)
2458 adcx %rax,%r12
2459 adox $zero,%r15 # of=0
2460 lea 4*8($nptr),$nptr
2461 mov %r12,-8*2($tptr)
2462 jmp .Lmulx4x_1st
2463
2464.align 32
2465.Lmulx4x_1st:
2466 adcx $zero,%r15 # cf=0, modulo-scheduled
2467 mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
2468 adcx %r14,%r10
2469 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
2470 adcx %rax,%r11
2471 mulx 2*8($aptr),%r12,%rax # ...
2472 adcx %r14,%r12
2473 mulx 3*8($aptr),%r13,%r14
2474 .byte 0x67,0x67
2475 mov $mi,%rdx
2476 adcx %rax,%r13
2477 adcx $zero,%r14 # cf=0
2478 lea 4*8($aptr),$aptr
2479 lea 4*8($tptr),$tptr
2480
2481 adox %r15,%r10
2482 mulx 0*8($nptr),%rax,%r15
2483 adcx %rax,%r10
2484 adox %r15,%r11
2485 mulx 1*8($nptr),%rax,%r15
2486 adcx %rax,%r11
2487 adox %r15,%r12
2488 mulx 2*8($nptr),%rax,%r15
2489 mov %r10,-5*8($tptr)
2490 adcx %rax,%r12
2491 mov %r11,-4*8($tptr)
2492 adox %r15,%r13
2493 mulx 3*8($nptr),%rax,%r15
2494 mov $bi,%rdx
2495 mov %r12,-3*8($tptr)
2496 adcx %rax,%r13
2497 adox $zero,%r15
2498 lea 4*8($nptr),$nptr
2499 mov %r13,-2*8($tptr)
2500
2501 dec $bptr # of=0, pass cf
2502 jnz .Lmulx4x_1st
2503
2504 mov 8(%rsp),$num # load -num
2505 adc $zero,%r15 # modulo-scheduled
2506 lea ($aptr,$num),$aptr # rewind $aptr
2507 add %r15,%r14
2508 mov 8+8(%rsp),$bptr # re-load &b[i]
2509 adc $zero,$zero # top-most carry
2510 mov %r14,-1*8($tptr)
2511 jmp .Lmulx4x_outer
2512
2513.align 32
2514.Lmulx4x_outer:
2515 lea 16-256($tptr),%r10 # where 256-byte mask is (+density control)
2516 pxor %xmm4,%xmm4
2517 .byte 0x67,0x67
2518 pxor %xmm5,%xmm5
2519___
2520for($i=0;$i<$STRIDE/16;$i+=4) {
2521$code.=<<___;
2522 movdqa `16*($i+0)-128`($bptr),%xmm0
2523 movdqa `16*($i+1)-128`($bptr),%xmm1
2524 movdqa `16*($i+2)-128`($bptr),%xmm2
2525 pand `16*($i+0)+256`(%r10),%xmm0
2526 movdqa `16*($i+3)-128`($bptr),%xmm3
2527 pand `16*($i+1)+256`(%r10),%xmm1
2528 por %xmm0,%xmm4
2529 pand `16*($i+2)+256`(%r10),%xmm2
2530 por %xmm1,%xmm5
2531 pand `16*($i+3)+256`(%r10),%xmm3
2532 por %xmm2,%xmm4
2533 por %xmm3,%xmm5
2534___
2535}
2536$code.=<<___;
2537 por %xmm5,%xmm4
2538 pshufd \$0x4e,%xmm4,%xmm0
2539 por %xmm4,%xmm0
2540 lea $STRIDE($bptr),$bptr
2541 movq %xmm0,%rdx # m0=bp[i]
2542
2543 mov $zero,($tptr) # save top-most carry
2544 lea 4*8($tptr,$num),$tptr # rewind $tptr
2545 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
2546 xor $zero,$zero # cf=0, of=0
2547 mov %rdx,$bi
2548 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
2549 adox -4*8($tptr),$mi # +t[0]
2550 adcx %r14,%r11
2551 mulx 2*8($aptr),%r15,%r13 # ...
2552 adox -3*8($tptr),%r11
2553 adcx %r15,%r12
2554 mulx 3*8($aptr),%rdx,%r14
2555 adox -2*8($tptr),%r12
2556 adcx %rdx,%r13
2557 lea ($nptr,$num),$nptr # rewind $nptr
2558 lea 4*8($aptr),$aptr
2559 adox -1*8($tptr),%r13
2560 adcx $zero,%r14
2561 adox $zero,%r14
2562
2563 mov $mi,%r15
2564 imulq 32+8(%rsp),$mi # "t[0]"*n0
2565
2566 mov $mi,%rdx
2567 xor $zero,$zero # cf=0, of=0
2568 mov $bptr,8+8(%rsp) # off-load &b[i]
2569
2570 mulx 0*8($nptr),%rax,%r10
2571 adcx %rax,%r15 # discarded
2572 adox %r11,%r10
2573 mulx 1*8($nptr),%rax,%r11
2574 adcx %rax,%r10
2575 adox %r12,%r11
2576 mulx 2*8($nptr),%rax,%r12
2577 adcx %rax,%r11
2578 adox %r13,%r12
2579 mulx 3*8($nptr),%rax,%r15
2580 mov $bi,%rdx
2581 mov 24+8(%rsp),$bptr # counter value
2582 mov %r10,-8*4($tptr)
2583 adcx %rax,%r12
2584 mov %r11,-8*3($tptr)
2585 adox $zero,%r15 # of=0
2586 mov %r12,-8*2($tptr)
2587 lea 4*8($nptr),$nptr
2588 jmp .Lmulx4x_inner
2589
2590.align 32
2591.Lmulx4x_inner:
2592 mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
2593 adcx $zero,%r15 # cf=0, modulo-scheduled
2594 adox %r14,%r10
2595 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
2596 adcx 0*8($tptr),%r10
2597 adox %rax,%r11
2598 mulx 2*8($aptr),%r12,%rax # ...
2599 adcx 1*8($tptr),%r11
2600 adox %r14,%r12
2601 mulx 3*8($aptr),%r13,%r14
2602 mov $mi,%rdx
2603 adcx 2*8($tptr),%r12
2604 adox %rax,%r13
2605 adcx 3*8($tptr),%r13
2606 adox $zero,%r14 # of=0
2607 lea 4*8($aptr),$aptr
2608 lea 4*8($tptr),$tptr
2609 adcx $zero,%r14 # cf=0
2610
2611 adox %r15,%r10
2612 mulx 0*8($nptr),%rax,%r15
2613 adcx %rax,%r10
2614 adox %r15,%r11
2615 mulx 1*8($nptr),%rax,%r15
2616 adcx %rax,%r11
2617 adox %r15,%r12
2618 mulx 2*8($nptr),%rax,%r15
2619 mov %r10,-5*8($tptr)
2620 adcx %rax,%r12
2621 adox %r15,%r13
2622 mov %r11,-4*8($tptr)
2623 mulx 3*8($nptr),%rax,%r15
2624 mov $bi,%rdx
2625 lea 4*8($nptr),$nptr
2626 mov %r12,-3*8($tptr)
2627 adcx %rax,%r13
2628 adox $zero,%r15
2629 mov %r13,-2*8($tptr)
2630
2631 dec $bptr # of=0, pass cf
2632 jnz .Lmulx4x_inner
2633
2634 mov 0+8(%rsp),$num # load -num
2635 adc $zero,%r15 # modulo-scheduled
2636 sub 0*8($tptr),$bptr # pull top-most carry to %cf
2637 mov 8+8(%rsp),$bptr # re-load &b[i]
2638 mov 16+8(%rsp),%r10
2639 adc %r15,%r14
2640 lea ($aptr,$num),$aptr # rewind $aptr
2641 adc $zero,$zero # top-most carry
2642 mov %r14,-1*8($tptr)
2643
2644 cmp %r10,$bptr
2645 jb .Lmulx4x_outer
2646
2647 mov -8($nptr),%r10
2648 mov $zero,%r8
2649 mov ($nptr,$num),%r12
2650 lea ($nptr,$num),%rbp # rewind $nptr
2651 mov $num,%rcx
2652 lea ($tptr,$num),%rdi # rewind $tptr
2653 xor %eax,%eax
2654 xor %r15,%r15
2655 sub %r14,%r10 # compare top-most words
2656 adc %r15,%r15
2657 or %r15,%r8
2658 sar \$3+2,%rcx
2659 sub %r8,%rax # %rax=-%r8
2660 mov 56+8(%rsp),%rdx # restore rp
2661 dec %r12 # so that after 'not' we get -n[0]
2662 mov 8*1(%rbp),%r13
2663 xor %r8,%r8
2664 mov 8*2(%rbp),%r14
2665 mov 8*3(%rbp),%r15
2666 jmp .Lsqrx4x_sub_entry # common post-condition
2667.size mulx4x_internal,.-mulx4x_internal
2668___
2669}
2670{
2671######################################################################
2672# void bn_power5(
2673my $rptr="%rdi"; # BN_ULONG *rptr,
2674my $aptr="%rsi"; # const BN_ULONG *aptr,
2675my $bptr="%rdx"; # const void *table,
2676my $nptr="%rcx"; # const BN_ULONG *nptr,
2677my $n0 ="%r8"; # const BN_ULONG *n0);
2678my $num ="%r9"; # int num, has to be divisible by 8
2679 # int pwr);
2680
2681my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2682my @A0=("%r10","%r11");
2683my @A1=("%r12","%r13");
2684my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2685
2686$code.=<<___;
2687.type bn_powerx5,\@function,6
2688.align 32
2689bn_powerx5:
2690 mov %rsp,%rax
2691.Lpowerx5_enter:
2692 push %rbx
2693 push %rbp
2694 push %r12
2695 push %r13
2696 push %r14
2697 push %r15
2698.Lpowerx5_prologue:
2699
2700 shl \$3,${num}d # convert $num to bytes
2701 lea ($num,$num,2),%r10 # 3*$num in bytes
2702 neg $num
2703 mov ($n0),$n0 # *n0
2704
2705 ##############################################################
2706 # Ensure that stack frame doesn't alias with $rptr+3*$num
2707 # modulo 4096, which covers ret[num], am[num] and n[num]
2708 # (see bn_exp.c). This is done to allow memory disambiguation
2709	# logic to do its magic. [Extra 256 bytes are for the power mask
2710 # calculated from 7th argument, the index.]
2711 #
2712 lea -320(%rsp,$num,2),%r11
2713 mov %rsp,%rbp
2714 sub $rptr,%r11
2715 and \$4095,%r11
2716 cmp %r11,%r10
2717 jb .Lpwrx_sp_alt
2718 sub %r11,%rbp # align with $aptr
2719 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2720 jmp .Lpwrx_sp_done
2721
2722.align 32
2723.Lpwrx_sp_alt:
2724 lea 4096-320(,$num,2),%r10
2725 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256)
2726 sub %r10,%r11
2727 mov \$0,%r10
2728 cmovc %r10,%r11
2729 sub %r11,%rbp
2730.Lpwrx_sp_done:
2731 and \$-64,%rbp
2732 mov %rsp,%r11
2733 sub %rbp,%r11
2734 and \$-4096,%r11
2735 lea (%rbp,%r11),%rsp
2736 mov (%rsp),%r10
2737 cmp %rbp,%rsp
2738 ja .Lpwrx_page_walk
2739 jmp .Lpwrx_page_walk_done
2740
2741.Lpwrx_page_walk:
2742 lea -4096(%rsp),%rsp
2743 mov (%rsp),%r10
2744 cmp %rbp,%rsp
2745 ja .Lpwrx_page_walk
2746.Lpwrx_page_walk_done:
2747
2748 mov $num,%r10
2749 neg $num
2750
2751 ##############################################################
2752 # Stack layout
2753 #
2754 # +0 saved $num, used in reduction section
2755 # +8 &t[2*$num], used in reduction section
2756 # +16 intermediate carry bit
2757 # +24 top-most carry bit, used in reduction section
2758 # +32 saved *n0
2759 # +40 saved %rsp
2760 # +48 t[2*$num]
2761 #
2762 pxor %xmm0,%xmm0
2763 movq $rptr,%xmm1 # save $rptr
2764 movq $nptr,%xmm2 # save $nptr
2765 movq %r10, %xmm3 # -$num
2766 movq $bptr,%xmm4
2767 mov $n0, 32(%rsp)
2768 mov %rax, 40(%rsp) # save original %rsp
2769.Lpowerx5_body:
2770
2771 call __bn_sqrx8x_internal
2772 call __bn_postx4x_internal
2773 call __bn_sqrx8x_internal
2774 call __bn_postx4x_internal
2775 call __bn_sqrx8x_internal
2776 call __bn_postx4x_internal
2777 call __bn_sqrx8x_internal
2778 call __bn_postx4x_internal
2779 call __bn_sqrx8x_internal
2780 call __bn_postx4x_internal
2781
2782 mov %r10,$num # -num
2783 mov $aptr,$rptr
2784 movq %xmm2,$nptr
2785 movq %xmm4,$bptr
2786 mov 40(%rsp),%rax
2787
2788 call mulx4x_internal
2789
2790 mov 40(%rsp),%rsi # restore %rsp
2791 mov \$1,%rax
2792
2793 mov -48(%rsi),%r15
2794 mov -40(%rsi),%r14
2795 mov -32(%rsi),%r13
2796 mov -24(%rsi),%r12
2797 mov -16(%rsi),%rbp
2798 mov -8(%rsi),%rbx
2799 lea (%rsi),%rsp
2800.Lpowerx5_epilogue:
2801 ret
2802.size bn_powerx5,.-bn_powerx5
2803
2804.globl bn_sqrx8x_internal
2805.hidden bn_sqrx8x_internal
2806.type bn_sqrx8x_internal,\@abi-omnipotent
2807.align 32
2808bn_sqrx8x_internal:
2809__bn_sqrx8x_internal:
2810 ##################################################################
2811 # Squaring part:
2812 #
2813 # a) multiply-n-add everything but a[i]*a[i];
2814 # b) shift result of a) by 1 to the left and accumulate
2815 # a[i]*a[i] products;
2816 #
2817 ##################################################################
2818 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2819 # a[1]a[0]
2820 # a[2]a[0]
2821 # a[3]a[0]
2822 # a[2]a[1]
2823 # a[3]a[1]
2824 # a[3]a[2]
2825 #
2826 # a[4]a[0]
2827 # a[5]a[0]
2828 # a[6]a[0]
2829 # a[7]a[0]
2830 # a[4]a[1]
2831 # a[5]a[1]
2832 # a[6]a[1]
2833 # a[7]a[1]
2834 # a[4]a[2]
2835 # a[5]a[2]
2836 # a[6]a[2]
2837 # a[7]a[2]
2838 # a[4]a[3]
2839 # a[5]a[3]
2840 # a[6]a[3]
2841 # a[7]a[3]
2842 #
2843 # a[5]a[4]
2844 # a[6]a[4]
2845 # a[7]a[4]
2846 # a[6]a[5]
2847 # a[7]a[5]
2848 # a[7]a[6]
2849 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2850___
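# Reference model of the squaring schedule sketched in the comment above
# (plain Perl, our naming; assumes limbs small enough that native integers do
# not overflow):  a) accumulate every cross product a[i]*a[j] with i<j, then
# b) double the accumulator (the "shift left by 1") and fold in the a[i]*a[i]
# diagonal.  The MULX/ADCX/ADOX code below produces the same column sums with
# carries propagated on the fly.
sub sqr_ref {
    my @a = @_;                                       # limbs, least significant first
    my $n = scalar @a;
    my @t = (0) x (2 * $n);
    for my $i (0 .. $n - 2) {                         # a) everything but a[i]*a[i]
        for my $j ($i + 1 .. $n - 1) {
            $t[$i + $j] += $a[$i] * $a[$j];
        }
    }
    $_ *= 2 for @t;                                   # b) double the cross products...
    $t[2 * $_] += $a[$_] * $a[$_] for 0 .. $n - 1;    #    ...and add the diagonal terms
    return @t;                                        # column sums, carries not yet propagated
}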
2851{
2852my ($zero,$carry)=("%rbp","%rcx");
2853my $aaptr=$zero;
2854$code.=<<___;
2855 lea 48+8(%rsp),$tptr
2856 lea ($aptr,$num),$aaptr
2857 mov $num,0+8(%rsp) # save $num
2858 mov $aaptr,8+8(%rsp) # save end of $aptr
2859 jmp .Lsqr8x_zero_start
2860
2861.align 32
2862.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2863.Lsqrx8x_zero:
2864 .byte 0x3e
2865 movdqa %xmm0,0*8($tptr)
2866 movdqa %xmm0,2*8($tptr)
2867 movdqa %xmm0,4*8($tptr)
2868 movdqa %xmm0,6*8($tptr)
2869.Lsqr8x_zero_start: # aligned at 32
2870 movdqa %xmm0,8*8($tptr)
2871 movdqa %xmm0,10*8($tptr)
2872 movdqa %xmm0,12*8($tptr)
2873 movdqa %xmm0,14*8($tptr)
2874 lea 16*8($tptr),$tptr
2875 sub \$64,$num
2876 jnz .Lsqrx8x_zero
2877
2878 mov 0*8($aptr),%rdx # a[0], modulo-scheduled
2879 #xor %r9,%r9 # t[1], ex-$num, zero already
2880 xor %r10,%r10
2881 xor %r11,%r11
2882 xor %r12,%r12
2883 xor %r13,%r13
2884 xor %r14,%r14
2885 xor %r15,%r15
2886 lea 48+8(%rsp),$tptr
2887	xor	$zero,$zero		# cf=0, of=0
2888 jmp .Lsqrx8x_outer_loop
2889
2890.align 32
2891.Lsqrx8x_outer_loop:
2892 mulx 1*8($aptr),%r8,%rax # a[1]*a[0]
2893 adcx %r9,%r8 # a[1]*a[0]+=t[1]
2894 adox %rax,%r10
2895 mulx 2*8($aptr),%r9,%rax # a[2]*a[0]
2896 adcx %r10,%r9
2897 adox %rax,%r11
2898 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
2899 adcx %r11,%r10
2900 adox %rax,%r12
2901 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax
2902 adcx %r12,%r11
2903 adox %rax,%r13
2904 mulx 5*8($aptr),%r12,%rax
2905 adcx %r13,%r12
2906 adox %rax,%r14
2907 mulx 6*8($aptr),%r13,%rax
2908 adcx %r14,%r13
2909 adox %r15,%rax
2910 mulx 7*8($aptr),%r14,%r15
2911 mov 1*8($aptr),%rdx # a[1]
2912 adcx %rax,%r14
2913 adox $zero,%r15
2914 adc 8*8($tptr),%r15
2915 mov %r8,1*8($tptr) # t[1]
2916 mov %r9,2*8($tptr) # t[2]
2917 sbb $carry,$carry # mov %cf,$carry
2918 xor $zero,$zero # cf=0, of=0
2919
2920
2921 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1]
2922 mulx 3*8($aptr),%r9,%rax # a[3]*a[1]
2923 adcx %r10,%r8
2924 adox %rbx,%r9
2925 mulx 4*8($aptr),%r10,%rbx # ...
2926 adcx %r11,%r9
2927 adox %rax,%r10
2928 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax
2929 adcx %r12,%r10
2930 adox %rbx,%r11
2931 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx
2932 adcx %r13,%r11
2933 adox %r14,%r12
2934 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14
2935 mov 2*8($aptr),%rdx # a[2]
2936 adcx %rax,%r12
2937 adox %rbx,%r13
2938 adcx %r15,%r13
2939 adox $zero,%r14 # of=0
2940 adcx $zero,%r14 # cf=0
2941
2942 mov %r8,3*8($tptr) # t[3]
2943 mov %r9,4*8($tptr) # t[4]
2944
2945 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2]
2946 mulx 4*8($aptr),%r9,%rax # a[4]*a[2]
2947 adcx %r10,%r8
2948 adox %rbx,%r9
2949 mulx 5*8($aptr),%r10,%rbx # ...
2950 adcx %r11,%r9
2951 adox %rax,%r10
2952 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax
2953 adcx %r12,%r10
2954 adox %r13,%r11
2955 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13
2956 .byte 0x3e
2957 mov 3*8($aptr),%rdx # a[3]
2958 adcx %rbx,%r11
2959 adox %rax,%r12
2960 adcx %r14,%r12
2961 mov %r8,5*8($tptr) # t[5]
2962 mov %r9,6*8($tptr) # t[6]
2963 mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
2964 adox $zero,%r13 # of=0
2965 adcx $zero,%r13 # cf=0
2966
2967 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3]
2968 adcx %r10,%r8
2969 adox %rax,%r9
2970 mulx 6*8($aptr),%r10,%rax # ...
2971 adcx %r11,%r9
2972 adox %r12,%r10
2973 mulx 7*8($aptr),%r11,%r12
2974 mov 4*8($aptr),%rdx # a[4]
2975 mov 5*8($aptr),%r14 # a[5]
2976 adcx %rbx,%r10
2977 adox %rax,%r11
2978 mov 6*8($aptr),%r15 # a[6]
2979 adcx %r13,%r11
2980 adox $zero,%r12 # of=0
2981 adcx $zero,%r12 # cf=0
2982
2983 mov %r8,7*8($tptr) # t[7]
2984 mov %r9,8*8($tptr) # t[8]
2985
2986 mulx %r14,%r9,%rax # a[5]*a[4]
2987 mov 7*8($aptr),%r8 # a[7]
2988 adcx %r10,%r9
2989 mulx %r15,%r10,%rbx # a[6]*a[4]
2990 adox %rax,%r10
2991 adcx %r11,%r10
2992 mulx %r8,%r11,%rax # a[7]*a[4]
2993 mov %r14,%rdx # a[5]
2994 adox %rbx,%r11
2995 adcx %r12,%r11
2996 #adox $zero,%rax # of=0
2997 adcx $zero,%rax # cf=0
2998
2999 mulx %r15,%r14,%rbx # a[6]*a[5]
3000 mulx %r8,%r12,%r13 # a[7]*a[5]
3001 mov %r15,%rdx # a[6]
3002 lea 8*8($aptr),$aptr
3003 adcx %r14,%r11
3004 adox %rbx,%r12
3005 adcx %rax,%r12
3006 adox $zero,%r13
3007
3008 .byte 0x67,0x67
3009 mulx %r8,%r8,%r14 # a[7]*a[6]
3010 adcx %r8,%r13
3011 adcx $zero,%r14
3012
3013 cmp 8+8(%rsp),$aptr
3014 je .Lsqrx8x_outer_break
3015
3016 neg $carry # mov $carry,%cf
3017 mov \$-8,%rcx
3018 mov $zero,%r15
3019 mov 8*8($tptr),%r8
3020 adcx 9*8($tptr),%r9 # +=t[9]
3021 adcx 10*8($tptr),%r10 # ...
3022 adcx 11*8($tptr),%r11
3023 adc 12*8($tptr),%r12
3024 adc 13*8($tptr),%r13
3025 adc 14*8($tptr),%r14
3026 adc 15*8($tptr),%r15
3027 lea ($aptr),$aaptr
3028 lea 2*64($tptr),$tptr
3029 sbb %rax,%rax # mov %cf,$carry
3030
3031 mov -64($aptr),%rdx # a[0]
3032 mov %rax,16+8(%rsp) # offload $carry
3033 mov $tptr,24+8(%rsp)
3034
3035 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above
3036 xor %eax,%eax # cf=0, of=0
3037 jmp .Lsqrx8x_loop
3038
3039.align 32
3040.Lsqrx8x_loop:
3041 mov %r8,%rbx
3042 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i]
3043 adcx %rax,%rbx # +=t[8]
3044 adox %r9,%r8
3045
3046 mulx 1*8($aaptr),%rax,%r9 # ...
3047 adcx %rax,%r8
3048 adox %r10,%r9
3049
3050 mulx 2*8($aaptr),%rax,%r10
3051 adcx %rax,%r9
3052 adox %r11,%r10
3053
3054 mulx 3*8($aaptr),%rax,%r11
3055 adcx %rax,%r10
3056 adox %r12,%r11
3057
3058 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12
3059 adcx %rax,%r11
3060 adox %r13,%r12
3061
3062 mulx 5*8($aaptr),%rax,%r13
3063 adcx %rax,%r12
3064 adox %r14,%r13
3065
3066 mulx 6*8($aaptr),%rax,%r14
3067 mov %rbx,($tptr,%rcx,8) # store t[8+i]
3068 mov \$0,%ebx
3069 adcx %rax,%r13
3070 adox %r15,%r14
3071
3072 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15
3073 mov 8($aptr,%rcx,8),%rdx # a[i]
3074 adcx %rax,%r14
3075 adox %rbx,%r15 # %rbx is 0, of=0
3076 adcx %rbx,%r15 # cf=0
3077
3078 .byte 0x67
3079 inc %rcx # of=0
3080 jnz .Lsqrx8x_loop
3081
3082 lea 8*8($aaptr),$aaptr
3083 mov \$-8,%rcx
3084 cmp 8+8(%rsp),$aaptr # done?
3085 je .Lsqrx8x_break
3086
3087 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
3088 .byte 0x66
3089 mov -64($aptr),%rdx
3090 adcx 0*8($tptr),%r8
3091 adcx 1*8($tptr),%r9
3092 adc 2*8($tptr),%r10
3093 adc 3*8($tptr),%r11
3094 adc 4*8($tptr),%r12
3095 adc 5*8($tptr),%r13
3096 adc 6*8($tptr),%r14
3097 adc 7*8($tptr),%r15
3098 lea 8*8($tptr),$tptr
3099 .byte 0x67
3100 sbb %rax,%rax # mov %cf,%rax
3101 xor %ebx,%ebx # cf=0, of=0
3102 mov %rax,16+8(%rsp) # offload carry
3103 jmp .Lsqrx8x_loop
3104
3105.align 32
3106.Lsqrx8x_break:
3107 xor $zero,$zero
3108 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
3109 adcx $zero,%r8
3110 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
3111 adcx $zero,%r9
3112 mov 0*8($aptr),%rdx # a[8], modulo-scheduled
3113 adc \$0,%r10
3114 mov %r8,0*8($tptr)
3115 adc \$0,%r11
3116 adc \$0,%r12
3117 adc \$0,%r13
3118 adc \$0,%r14
3119 adc \$0,%r15
3120 cmp $carry,$tptr # cf=0, of=0
3121 je .Lsqrx8x_outer_loop
3122
3123 mov %r9,1*8($tptr)
3124 mov 1*8($carry),%r9
3125 mov %r10,2*8($tptr)
3126 mov 2*8($carry),%r10
3127 mov %r11,3*8($tptr)
3128 mov 3*8($carry),%r11
3129 mov %r12,4*8($tptr)
3130 mov 4*8($carry),%r12
3131 mov %r13,5*8($tptr)
3132 mov 5*8($carry),%r13
3133 mov %r14,6*8($tptr)
3134 mov 6*8($carry),%r14
3135 mov %r15,7*8($tptr)
3136 mov 7*8($carry),%r15
3137 mov $carry,$tptr
3138 jmp .Lsqrx8x_outer_loop
3139
3140.align 32
3141.Lsqrx8x_outer_break:
3142 mov %r9,9*8($tptr) # t[9]
3143 movq %xmm3,%rcx # -$num
3144 mov %r10,10*8($tptr) # ...
3145 mov %r11,11*8($tptr)
3146 mov %r12,12*8($tptr)
3147 mov %r13,13*8($tptr)
3148 mov %r14,14*8($tptr)
3149___
3150}
3151{
3152my $i="%rcx";
3153$code.=<<___;
3154 lea 48+8(%rsp),$tptr
3155 mov ($aptr,$i),%rdx # a[0]
3156
3157 mov 8($tptr),$A0[1] # t[1]
3158 xor $A0[0],$A0[0] # t[0], of=0, cf=0
3159 mov 0+8(%rsp),$num # restore $num
3160 adox $A0[1],$A0[1]
3161 mov 16($tptr),$A1[0] # t[2] # prefetch
3162 mov 24($tptr),$A1[1] # t[3] # prefetch
3163 #jmp .Lsqrx4x_shift_n_add # happens to be aligned
3164
3165.align 32
3166.Lsqrx4x_shift_n_add:
3167 mulx %rdx,%rax,%rbx
3168 adox $A1[0],$A1[0]
3169 adcx $A0[0],%rax
3170 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch
3171 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch
3172 adox $A1[1],$A1[1]
3173 adcx $A0[1],%rbx
3174 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch
3175 mov %rax,0($tptr)
3176 mov %rbx,8($tptr)
3177
3178 mulx %rdx,%rax,%rbx
3179 adox $A0[0],$A0[0]
3180 adcx $A1[0],%rax
3181 mov 16($aptr,$i),%rdx # a[i+2] # prefetch
3182 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch
3183 adox $A0[1],$A0[1]
3184 adcx $A1[1],%rbx
3185 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch
3186 mov %rax,16($tptr)
3187 mov %rbx,24($tptr)
3188
3189 mulx %rdx,%rax,%rbx
3190 adox $A1[0],$A1[0]
3191 adcx $A0[0],%rax
3192 mov 24($aptr,$i),%rdx # a[i+3] # prefetch
3193 lea 32($i),$i
3194 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch
3195 adox $A1[1],$A1[1]
3196 adcx $A0[1],%rbx
3197 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch
3198 mov %rax,32($tptr)
3199 mov %rbx,40($tptr)
3200
3201 mulx %rdx,%rax,%rbx
3202 adox $A0[0],$A0[0]
3203 adcx $A1[0],%rax
3204 jrcxz .Lsqrx4x_shift_n_add_break
3205 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch
3206 adox $A0[1],$A0[1]
3207 adcx $A1[1],%rbx
3208 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch
3209 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch
3210 mov %rax,48($tptr)
3211 mov %rbx,56($tptr)
3212 lea 64($tptr),$tptr
3213 nop
3214 jmp .Lsqrx4x_shift_n_add
3215
3216.align 32
3217.Lsqrx4x_shift_n_add_break:
3218 adcx $A1[1],%rbx
3219 mov %rax,48($tptr)
3220 mov %rbx,56($tptr)
3221 lea 64($tptr),$tptr # end of t[] buffer
3222___
3223}
3224
3225######################################################################
3226# Montgomery reduction part, "word-by-word" algorithm.
3227#
3228# This new path is inspired by multiple submissions from Intel, by
3229# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
3230# Vinodh Gopal...
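# Word-by-word Montgomery reduction, stated as a Math::BigInt reference
# (sub name and shape are ours, for illustration; $n0 is -N^{-1} mod 2^64,
# the same per-word factor the code pulls from 32+8(%rsp)):
use Math::BigInt;
sub mont_reduce_ref {
    my ($t, $N, $n0, $nwords) = @_;              # Math::BigInt inputs, assumes $t < N*R
    my $b = Math::BigInt->new(1)->blsft(64);     # word base 2^64
    my $r = $t->copy;
    for my $i (0 .. $nwords - 1) {
        my $ri = $r->copy->brsft(64 * $i)->bmod($b);  # current word r[i]
        my $m  = ($ri * $n0)->bmod($b);               # m = r[i]*n0 mod 2^64 (the imulq step)
        $r += ($m * $N)->blsft(64 * $i);              # adding m*N*2^(64*i) zeroes word i
    }
    $r->brsft(64 * $nwords);                     # divide by R = 2^(64*nwords)
    $r -= $N if $r >= $N;                        # the assembly does this branch-free below
    return $r;                                   # == $t * R^{-1} mod N
}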
3231{
3232my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
3233
3234$code.=<<___;
3235 movq %xmm2,$nptr
3236__bn_sqrx8x_reduction:
3237 xor %eax,%eax # initial top-most carry bit
3238 mov 32+8(%rsp),%rbx # n0
3239 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
3240 lea -8*8($nptr,$num),%rcx # end of n[]
3241 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
3242 mov %rcx, 0+8(%rsp) # save end of n[]
3243 mov $tptr,8+8(%rsp) # save end of t[]
3244
3245 lea 48+8(%rsp),$tptr # initial t[] window
3246 jmp .Lsqrx8x_reduction_loop
3247
3248.align 32
3249.Lsqrx8x_reduction_loop:
3250 mov 8*1($tptr),%r9
3251 mov 8*2($tptr),%r10
3252 mov 8*3($tptr),%r11
3253 mov 8*4($tptr),%r12
3254 mov %rdx,%r8
3255 imulq %rbx,%rdx # n0*a[i]
3256 mov 8*5($tptr),%r13
3257 mov 8*6($tptr),%r14
3258 mov 8*7($tptr),%r15
3259 mov %rax,24+8(%rsp) # store top-most carry bit
3260
3261 lea 8*8($tptr),$tptr
3262 xor $carry,$carry # cf=0,of=0
3263 mov \$-8,%rcx
3264 jmp .Lsqrx8x_reduce
3265
3266.align 32
3267.Lsqrx8x_reduce:
3268 mov %r8, %rbx
3269 mulx 8*0($nptr),%rax,%r8 # n[0]
3270 adcx %rbx,%rax # discarded
3271 adox %r9,%r8
3272
3273 mulx 8*1($nptr),%rbx,%r9 # n[1]
3274 adcx %rbx,%r8
3275 adox %r10,%r9
3276
3277 mulx 8*2($nptr),%rbx,%r10
3278 adcx %rbx,%r9
3279 adox %r11,%r10
3280
3281 mulx 8*3($nptr),%rbx,%r11
3282 adcx %rbx,%r10
3283 adox %r12,%r11
3284
3285 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12
3286 mov %rdx,%rax
3287 mov %r8,%rdx
3288 adcx %rbx,%r11
3289 adox %r13,%r12
3290
3291 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded
3292 mov %rax,%rdx
3293 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
3294
3295 mulx 8*5($nptr),%rax,%r13
3296 adcx %rax,%r12
3297 adox %r14,%r13
3298
3299 mulx 8*6($nptr),%rax,%r14
3300 adcx %rax,%r13
3301 adox %r15,%r14
3302
3303 mulx 8*7($nptr),%rax,%r15
3304 mov %rbx,%rdx
3305 adcx %rax,%r14
3306 adox $carry,%r15 # $carry is 0
3307 adcx $carry,%r15 # cf=0
3308
3309 .byte 0x67,0x67,0x67
3310 inc %rcx # of=0
3311 jnz .Lsqrx8x_reduce
3312
3313 mov $carry,%rax # xor %rax,%rax
3314 cmp 0+8(%rsp),$nptr # end of n[]?
3315 jae .Lsqrx8x_no_tail
3316
3317 mov 48+8(%rsp),%rdx # pull n0*a[0]
3318 add 8*0($tptr),%r8
3319 lea 8*8($nptr),$nptr
3320 mov \$-8,%rcx
3321 adcx 8*1($tptr),%r9
3322 adcx 8*2($tptr),%r10
3323 adc 8*3($tptr),%r11
3324 adc 8*4($tptr),%r12
3325 adc 8*5($tptr),%r13
3326 adc 8*6($tptr),%r14
3327 adc 8*7($tptr),%r15
3328 lea 8*8($tptr),$tptr
3329 sbb %rax,%rax # top carry
3330
3331 xor $carry,$carry # of=0, cf=0
3332 mov %rax,16+8(%rsp)
3333 jmp .Lsqrx8x_tail
3334
3335.align 32
3336.Lsqrx8x_tail:
3337 mov %r8,%rbx
3338 mulx 8*0($nptr),%rax,%r8
3339 adcx %rax,%rbx
3340 adox %r9,%r8
3341
3342 mulx 8*1($nptr),%rax,%r9
3343 adcx %rax,%r8
3344 adox %r10,%r9
3345
3346 mulx 8*2($nptr),%rax,%r10
3347 adcx %rax,%r9
3348 adox %r11,%r10
3349
3350 mulx 8*3($nptr),%rax,%r11
3351 adcx %rax,%r10
3352 adox %r12,%r11
3353
3354 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12
3355 adcx %rax,%r11
3356 adox %r13,%r12
3357
3358 mulx 8*5($nptr),%rax,%r13
3359 adcx %rax,%r12
3360 adox %r14,%r13
3361
3362 mulx 8*6($nptr),%rax,%r14
3363 adcx %rax,%r13
3364 adox %r15,%r14
3365
3366 mulx 8*7($nptr),%rax,%r15
3367 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
3368 adcx %rax,%r14
3369 adox $carry,%r15
3370 mov %rbx,($tptr,%rcx,8) # save result
3371 mov %r8,%rbx
3372 adcx $carry,%r15 # cf=0
3373
3374 inc %rcx # of=0
3375 jnz .Lsqrx8x_tail
3376
3377 cmp 0+8(%rsp),$nptr # end of n[]?
3378 jae .Lsqrx8x_tail_done # break out of loop
3379
3380 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
3381 mov 48+8(%rsp),%rdx # pull n0*a[0]
3382 lea 8*8($nptr),$nptr
3383 adc 8*0($tptr),%r8
3384 adc 8*1($tptr),%r9
3385 adc 8*2($tptr),%r10
3386 adc 8*3($tptr),%r11
3387 adc 8*4($tptr),%r12
3388 adc 8*5($tptr),%r13
3389 adc 8*6($tptr),%r14
3390 adc 8*7($tptr),%r15
3391 lea 8*8($tptr),$tptr
3392 sbb %rax,%rax
3393 sub \$8,%rcx # mov \$-8,%rcx
3394
3395 xor $carry,$carry # of=0, cf=0
3396 mov %rax,16+8(%rsp)
3397 jmp .Lsqrx8x_tail
3398
3399.align 32
3400.Lsqrx8x_tail_done:
3401 xor %rax,%rax
3402 add 24+8(%rsp),%r8 # can this overflow?
3403 adc \$0,%r9
3404 adc \$0,%r10
3405 adc \$0,%r11
3406 adc \$0,%r12
3407 adc \$0,%r13
3408 adc \$0,%r14
3409 adc \$0,%r15
3410 adc \$0,%rax
3411
3412 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
3413.Lsqrx8x_no_tail: # %cf is 0 if jumped here
3414 adc 8*0($tptr),%r8
3415 movq %xmm3,%rcx
3416 adc 8*1($tptr),%r9
3417 mov 8*7($nptr),$carry
3418 movq %xmm2,$nptr # restore $nptr
3419 adc 8*2($tptr),%r10
3420 adc 8*3($tptr),%r11
3421 adc 8*4($tptr),%r12
3422 adc 8*5($tptr),%r13
3423 adc 8*6($tptr),%r14
3424 adc 8*7($tptr),%r15
3425 adc \$0,%rax # top-most carry
3426
3427 mov 32+8(%rsp),%rbx # n0
3428 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
3429
3430 mov %r8,8*0($tptr) # store top 512 bits
3431 lea 8*8($tptr),%r8 # borrow %r8
3432 mov %r9,8*1($tptr)
3433 mov %r10,8*2($tptr)
3434 mov %r11,8*3($tptr)
3435 mov %r12,8*4($tptr)
3436 mov %r13,8*5($tptr)
3437 mov %r14,8*6($tptr)
3438 mov %r15,8*7($tptr)
3439
3440 lea 8*8($tptr,%rcx),$tptr # start of current t[] window
3441 cmp 8+8(%rsp),%r8 # end of t[]?
3442 jb .Lsqrx8x_reduction_loop
3443 ret
3444.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
3445___
3446}
3447
3448##############################################################
3449# Post-condition, 4x unrolled
3450#
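# The net effect of the post-condition is r = t - n when the tentative result
# still needs one subtraction of the modulus, and r = t otherwise, with no
# data-dependent branch.  A simplified limb-level model using a select mask
# (our naming, 32-bit limbs purely for illustration; the real code achieves
# the same with ANDN and borrow propagation on 64-bit limbs):
sub cond_sub_ref {
    my ($t, $n, $need_sub) = @_;                 # array refs, least significant limb first
    my $mask   = $need_sub ? 0xffffffff : 0;     # all-ones selects n[], zero selects 0
    my $borrow = 0;
    my @r;
    for my $i (0 .. $#$t) {
        my $d = $t->[$i] - ($n->[$i] & $mask) - $borrow;
        $borrow = $d < 0 ? 1 : 0;
        push @r, $d + ($borrow ? 1 << 32 : 0);   # renormalize into the 32-bit limb range
    }
    return @r;                                   # t - n when $need_sub, t otherwise
}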
3451{
3452my ($rptr,$nptr)=("%rdx","%rbp");
3453$code.=<<___;
3454.align 32
3455__bn_postx4x_internal:
3456 mov 8*0($nptr),%r12
3457 mov %rcx,%r10 # -$num
3458 mov %rcx,%r9 # -$num
3459 neg %rax
3460 sar \$3+2,%rcx
3461 #lea 48+8(%rsp,%r9),$tptr
3462 movq %xmm1,$rptr # restore $rptr
3463 movq %xmm1,$aptr # prepare for back-to-back call
3464 dec %r12 # so that after 'not' we get -n[0]
3465 mov 8*1($nptr),%r13
3466 xor %r8,%r8
3467 mov 8*2($nptr),%r14
3468 mov 8*3($nptr),%r15
3469 jmp .Lsqrx4x_sub_entry
3470
3471.align 16
3472.Lsqrx4x_sub:
3473 mov 8*0($nptr),%r12
3474 mov 8*1($nptr),%r13
3475 mov 8*2($nptr),%r14
3476 mov 8*3($nptr),%r15
3477.Lsqrx4x_sub_entry:
3478 andn %rax,%r12,%r12
3479 lea 8*4($nptr),$nptr
3480 andn %rax,%r13,%r13
3481 andn %rax,%r14,%r14
3482 andn %rax,%r15,%r15
3483
3484 neg %r8 # mov %r8,%cf
3485 adc 8*0($tptr),%r12
3486 adc 8*1($tptr),%r13
3487 adc 8*2($tptr),%r14
3488 adc 8*3($tptr),%r15
3489 mov %r12,8*0($rptr)
3490 lea 8*4($tptr),$tptr
3491 mov %r13,8*1($rptr)
3492 sbb %r8,%r8 # mov %cf,%r8
3493 mov %r14,8*2($rptr)
3494 mov %r15,8*3($rptr)
3495 lea 8*4($rptr),$rptr
3496
3497 inc %rcx
3498 jnz .Lsqrx4x_sub
3499
3500 neg %r9 # restore $num
3501
3502 ret
3503.size __bn_postx4x_internal,.-__bn_postx4x_internal
3504___
3505}
3506}}}
3507{
3508my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3509 ("%rdi","%esi","%rdx","%ecx"); # Unix order
3510my $out=$inp;
3511my $STRIDE=2**5*8;
3512my $N=$STRIDE/4;
3513
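# Plain-Perl statements of what bn_get_bits5 and bn_scatter5 below compute
# (sub names are ours; this documents the intended semantics, not the
# register-level tricks): bn_get_bits5 extracts a 5-bit window at an arbitrary
# bit offset of the exponent, and bn_scatter5 stores power number $idx into
# every 32nd table slot so that bn_gather5 can later scan all 32 candidates.
sub get_bits5_ref {
    my ($bytes, $bitpos) = @_;                   # exponent as a little-endian byte string
    my $word = unpack("v", substr($bytes, $bitpos >> 3, 2) . "\0");  # 16 bits covering the window
    return ($word >> ($bitpos & 7)) & 31;        # the 5-bit window value, 0..31
}
sub scatter5_ref {
    my ($limbs, $table, $idx) = @_;              # $table: ref to an array of 32*num slots
    $table->[32 * $_ + $idx] = $limbs->[$_] for 0 .. $#$limbs;   # stride of 32 slots per limb
}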
3514$code.=<<___;
3515.globl bn_get_bits5
3516.type bn_get_bits5,\@abi-omnipotent
3517.align 16
3518bn_get_bits5:
3519 lea 0($inp),%r10
3520 lea 1($inp),%r11
3521 mov $num,%ecx
3522 shr \$4,$num
3523 and \$15,%ecx
3524 lea -8(%ecx),%eax
3525 cmp \$11,%ecx
3526 cmova %r11,%r10
3527 cmova %eax,%ecx
3528 movzw (%r10,$num,2),%eax
3529 shrl %cl,%eax
3530 and \$31,%eax
3531 ret
3532.size bn_get_bits5,.-bn_get_bits5
3533
3534.globl bn_scatter5
3535.type bn_scatter5,\@abi-omnipotent
3536.align 16
3537bn_scatter5:
3538 cmp \$0, $num
3539 jz .Lscatter_epilogue
3540 lea ($tbl,$idx,8),$tbl
3541.Lscatter:
3542 mov ($inp),%rax
3543 lea 8($inp),$inp
3544 mov %rax,($tbl)
3545 lea 32*8($tbl),$tbl
3546 sub \$1,$num
3547 jnz .Lscatter
3548.Lscatter_epilogue:
3549 ret
3550.size bn_scatter5,.-bn_scatter5
3551
3552.globl bn_gather5
3553.type bn_gather5,\@abi-omnipotent
3554.align 32
3555bn_gather5:
3556.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
3557 # I can't trust assembler to use specific encoding:-(
3558 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
3559 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
3560 lea .Linc(%rip),%rax
3561 and \$-16,%rsp # shouldn't be formally required
3562
3563 movd $idx,%xmm5
3564 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
3565 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
3566 lea 128($tbl),%r11 # size optimization
3567 lea 128(%rsp),%rax # size optimization
3568
3569 pshufd \$0,%xmm5,%xmm5 # broadcast $idx
3570 movdqa %xmm1,%xmm4
3571 movdqa %xmm1,%xmm2
3572___
3573########################################################################
3574# calculate mask by comparing 0..31 to $idx and save result to stack
3575#
3576for($i=0;$i<$STRIDE/16;$i+=4) {
3577$code.=<<___;
3578 paddd %xmm0,%xmm1
3579 pcmpeqd %xmm5,%xmm0 # compare to 1,0
3580___
3581$code.=<<___ if ($i);
3582 movdqa %xmm3,`16*($i-1)-128`(%rax)
3583___
3584$code.=<<___;
3585 movdqa %xmm4,%xmm3
3586
3587 paddd %xmm1,%xmm2
3588 pcmpeqd %xmm5,%xmm1 # compare to 3,2
3589 movdqa %xmm0,`16*($i+0)-128`(%rax)
3590 movdqa %xmm4,%xmm0
3591
3592 paddd %xmm2,%xmm3
3593 pcmpeqd %xmm5,%xmm2 # compare to 5,4
3594 movdqa %xmm1,`16*($i+1)-128`(%rax)
3595 movdqa %xmm4,%xmm1
3596
3597 paddd %xmm3,%xmm0
3598 pcmpeqd %xmm5,%xmm3 # compare to 7,6
3599 movdqa %xmm2,`16*($i+2)-128`(%rax)
3600 movdqa %xmm4,%xmm2
3601___
3602}
3603$code.=<<___;
3604 movdqa %xmm3,`16*($i-1)-128`(%rax)
3605 jmp .Lgather
3606
3607.align 32
3608.Lgather:
3609 pxor %xmm4,%xmm4
3610 pxor %xmm5,%xmm5
3611___
3612for($i=0;$i<$STRIDE/16;$i+=4) {
3613$code.=<<___;
3614 movdqa `16*($i+0)-128`(%r11),%xmm0
3615 movdqa `16*($i+1)-128`(%r11),%xmm1
3616 movdqa `16*($i+2)-128`(%r11),%xmm2
3617 pand `16*($i+0)-128`(%rax),%xmm0
3618 movdqa `16*($i+3)-128`(%r11),%xmm3
3619 pand `16*($i+1)-128`(%rax),%xmm1
3620 por %xmm0,%xmm4
3621 pand `16*($i+2)-128`(%rax),%xmm2
3622 por %xmm1,%xmm5
3623 pand `16*($i+3)-128`(%rax),%xmm3
3624 por %xmm2,%xmm4
3625 por %xmm3,%xmm5
3626___
3627}
3628$code.=<<___;
3629 por %xmm5,%xmm4
3630 lea $STRIDE(%r11),%r11
3631 pshufd \$0x4e,%xmm4,%xmm0
3632 por %xmm4,%xmm0
3633 movq %xmm0,($out) # m0=bp[0]
3634 lea 8($out),$out
3635 sub \$1,$num
3636 jnz .Lgather
3637
3638 lea (%r10),%rsp
3639 ret
3640.LSEH_end_bn_gather5:
3641.size bn_gather5,.-bn_gather5
3642___
3643}
3644$code.=<<___;
3645.align 64
3646.Linc:
3647 .long 0,0, 1,1
3648 .long 2,2, 2,2
3649.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3650___
3651
3652# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3653# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3654if ($win64) {
3655$rec="%rcx";
3656$frame="%rdx";
3657$context="%r8";
3658$disp="%r9";
3659
3660$code.=<<___;
3661.extern __imp_RtlVirtualUnwind
3662.type mul_handler,\@abi-omnipotent
3663.align 16
3664mul_handler:
3665 push %rsi
3666 push %rdi
3667 push %rbx
3668 push %rbp
3669 push %r12
3670 push %r13
3671 push %r14
3672 push %r15
3673 pushfq
3674 sub \$64,%rsp
3675
3676 mov 120($context),%rax # pull context->Rax
3677 mov 248($context),%rbx # pull context->Rip
3678
3679 mov 8($disp),%rsi # disp->ImageBase
3680 mov 56($disp),%r11 # disp->HandlerData
3681
3682 mov 0(%r11),%r10d # HandlerData[0]
3683 lea (%rsi,%r10),%r10 # end of prologue label
3684 cmp %r10,%rbx # context->Rip<end of prologue label
3685 jb .Lcommon_seh_tail
3686
3687 mov 4(%r11),%r10d # HandlerData[1]
3688	lea	(%rsi,%r10),%r10	# body label
3689	cmp	%r10,%rbx		# context->Rip<body label
3690 jb .Lcommon_pop_regs
3691
3692 mov 152($context),%rax # pull context->Rsp
3693
3694 mov 8(%r11),%r10d # HandlerData[2]
3695 lea (%rsi,%r10),%r10 # epilogue label
3696 cmp %r10,%rbx # context->Rip>=epilogue label
3697 jae .Lcommon_seh_tail
3698
3699 lea .Lmul_epilogue(%rip),%r10
3700 cmp %r10,%rbx
3701 ja .Lbody_40
3702
3703 mov 192($context),%r10 # pull $num
3704 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
3705
3706 jmp .Lcommon_pop_regs
3707
3708.Lbody_40:
3709 mov 40(%rax),%rax # pull saved stack pointer
3710.Lcommon_pop_regs:
3711 mov -8(%rax),%rbx
3712 mov -16(%rax),%rbp
3713 mov -24(%rax),%r12
3714 mov -32(%rax),%r13
3715 mov -40(%rax),%r14
3716 mov -48(%rax),%r15
3717 mov %rbx,144($context) # restore context->Rbx
3718 mov %rbp,160($context) # restore context->Rbp
3719 mov %r12,216($context) # restore context->R12
3720 mov %r13,224($context) # restore context->R13
3721 mov %r14,232($context) # restore context->R14
3722 mov %r15,240($context) # restore context->R15
3723
3724.Lcommon_seh_tail:
3725 mov 8(%rax),%rdi
3726 mov 16(%rax),%rsi
3727 mov %rax,152($context) # restore context->Rsp
3728 mov %rsi,168($context) # restore context->Rsi
3729 mov %rdi,176($context) # restore context->Rdi
3730
3731 mov 40($disp),%rdi # disp->ContextRecord
3732 mov $context,%rsi # context
3733 mov \$154,%ecx # sizeof(CONTEXT)
3734 .long 0xa548f3fc # cld; rep movsq
3735
3736 mov $disp,%rsi
3737 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3738 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3739 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3740 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3741 mov 40(%rsi),%r10 # disp->ContextRecord
3742 lea 56(%rsi),%r11 # &disp->HandlerData
3743 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3744 mov %r10,32(%rsp) # arg5
3745 mov %r11,40(%rsp) # arg6
3746 mov %r12,48(%rsp) # arg7
3747 mov %rcx,56(%rsp) # arg8, (NULL)
3748 call *__imp_RtlVirtualUnwind(%rip)
3749
3750 mov \$1,%eax # ExceptionContinueSearch
3751 add \$64,%rsp
3752 popfq
3753 pop %r15
3754 pop %r14
3755 pop %r13
3756 pop %r12
3757 pop %rbp
3758 pop %rbx
3759 pop %rdi
3760 pop %rsi
3761 ret
3762.size mul_handler,.-mul_handler
3763
3764.section .pdata
3765.align 4
3766 .rva .LSEH_begin_bn_mul_mont_gather5
3767 .rva .LSEH_end_bn_mul_mont_gather5
3768 .rva .LSEH_info_bn_mul_mont_gather5
3769
3770 .rva .LSEH_begin_bn_mul4x_mont_gather5
3771 .rva .LSEH_end_bn_mul4x_mont_gather5
3772 .rva .LSEH_info_bn_mul4x_mont_gather5
3773
3774 .rva .LSEH_begin_bn_power5
3775 .rva .LSEH_end_bn_power5
3776 .rva .LSEH_info_bn_power5
3777
3778 .rva .LSEH_begin_bn_from_mont8x
3779 .rva .LSEH_end_bn_from_mont8x
3780 .rva .LSEH_info_bn_from_mont8x
3781___
3782$code.=<<___ if ($addx);
3783 .rva .LSEH_begin_bn_mulx4x_mont_gather5
3784 .rva .LSEH_end_bn_mulx4x_mont_gather5
3785 .rva .LSEH_info_bn_mulx4x_mont_gather5
3786
3787 .rva .LSEH_begin_bn_powerx5
3788 .rva .LSEH_end_bn_powerx5
3789 .rva .LSEH_info_bn_powerx5
3790___
3791$code.=<<___;
3792 .rva .LSEH_begin_bn_gather5
3793 .rva .LSEH_end_bn_gather5
3794 .rva .LSEH_info_bn_gather5
3795
3796.section .xdata
3797.align 8
3798.LSEH_info_bn_mul_mont_gather5:
3799 .byte 9,0,0,0
3800 .rva mul_handler
3801 .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]
3802.align 8
3803.LSEH_info_bn_mul4x_mont_gather5:
3804 .byte 9,0,0,0
3805 .rva mul_handler
3806 .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
3807.align 8
3808.LSEH_info_bn_power5:
3809 .byte 9,0,0,0
3810 .rva mul_handler
3811 .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
3812.align 8
3813.LSEH_info_bn_from_mont8x:
3814 .byte 9,0,0,0
3815 .rva mul_handler
3816 .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
3817___
3818$code.=<<___ if ($addx);
3819.align 8
3820.LSEH_info_bn_mulx4x_mont_gather5:
3821 .byte 9,0,0,0
3822 .rva mul_handler
3823 .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
3824.align 8
3825.LSEH_info_bn_powerx5:
3826 .byte 9,0,0,0
3827 .rva mul_handler
3828 .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
3829___
3830$code.=<<___;
3831.align 8
3832.LSEH_info_bn_gather5:
3833 .byte 0x01,0x0b,0x03,0x0a
3834 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
3835 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
3836.align 8
3837___
3838}
3839
3840$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3841
3842print $code;
3843close STDOUT;