VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/bn/asm/x86_64-mont5.pl@91772

Last change on this file since 91772 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

1#! /usr/bin/env perl
2# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# August 2011.
18#
19# Companion to x86_64-mont.pl that optimizes cache-timing attack
20# countermeasures. The subroutines are produced by replacing bp[i]
21# references in their x86_64-mont.pl counterparts with cache-neutral
22# references to the powers table computed in BN_mod_exp_mont_consttime.
23# In addition, a subroutine that scatters elements of the powers table
24# is implemented, so that scattering/gathering can be tuned without
25# modifying bn_exp.c.
26
27# August 2013.
28#
29# Add MULX/AD*X code paths and additional interfaces to optimize for
30# branch prediction unit. For input lengths that are multiples of 8,
31# the np argument is not just the modulus value, but one interleaved
32# with 0. This is to optimize the post-condition...
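#
# [Illustrative sketch, not part of the upstream code and never invoked by
# this generator: the powers table mentioned above is limb-interleaved, i.e.
# limb $j of power $idx occupies word $idx + 32*$j, so all 32 candidate words
# for a given limb sit next to each other and can be scanned in full by the
# gather code.  The helper name below is made up for illustration.]
#
sub _scatter5_sketch {
	my ($limbs, $idx, $table) = @_;	# $limbs: array ref of words, $idx: 0..31
	$table->[$idx + 32*$_] = $limbs->[$_] for 0 .. $#{$limbs};
	return $table;			# all 32 candidates for limb $j now share one region
}
#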
33
34$flavour = shift;
35$output = shift;
36if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
37
38$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
39
40$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
41( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
42( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
43die "can't locate x86_64-xlate.pl";
44
45open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
46*STDOUT=*OUT;
47
48if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
49 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
50 $addx = ($1>=2.23);
51}
52
53if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
54 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
55 $addx = ($1>=2.10);
56}
57
58if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
59 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
60 $addx = ($1>=12);
61}
62
63if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
64 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
65 $addx = ($ver>=3.03);
66}
67
68# int bn_mul_mont_gather5(
69$rp="%rdi"; # BN_ULONG *rp,
70$ap="%rsi"; # const BN_ULONG *ap,
71$bp="%rdx"; # const BN_ULONG *bp,
72$np="%rcx"; # const BN_ULONG *np,
73$n0="%r8"; # const BN_ULONG *n0,
74$num="%r9"; # int num,
75 # int idx); # 0 to 2^5-1, "index" in $bp holding
76 # pre-computed powers of a', interlaced
77 # in such manner that b[0] is $bp[idx],
78 # b[1] is [2^5+idx], etc.
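#
# [Reference sketch of the arithmetic this routine effectively performs,
# assuming the usual Montgomery-multiplication definition: rp = ap*b*R^{-1}
# mod np, where b is the power selected from the table by idx and
# R = 2^(64*num).  It is not used by this generator; Math::BigInt and the
# helper name are pulled in only for the illustration.]
#
use Math::BigInt;
sub _mont_mul_reference {
	my ($a, $b, $n, $num) = @_;				# Math::BigInt values, $num limbs
	my $R     = Math::BigInt->new(1)->blsft(64*$num);
	my $nprim = $n->copy->bmodinv($R)->bneg->bmod($R);	# -n^{-1} mod R, needs n odd
	my $t = $a->copy->bmul($b);
	my $m = $t->copy->bmul($nprim)->bmod($R);
	my $r = $t->badd($m->bmul($n))->brsft(64*$num);		# (t + m*n)/R, exact
	$r->bsub($n) if $r->bcmp($n) >= 0;			# final conditional subtraction
	return $r;						# == a*b*R^{-1} mod n
}
#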
79$lo0="%r10";
80$hi0="%r11";
81$hi1="%r13";
82$i="%r14";
83$j="%r15";
84$m0="%rbx";
85$m1="%rbp";
86
87$code=<<___;
88.text
89
90.extern OPENSSL_ia32cap_P
91
92.globl bn_mul_mont_gather5
93.type bn_mul_mont_gather5,\@function,6
94.align 64
95bn_mul_mont_gather5:
96.cfi_startproc
97 mov ${num}d,${num}d
98 mov %rsp,%rax
99.cfi_def_cfa_register %rax
100 test \$7,${num}d
101 jnz .Lmul_enter
102___
103$code.=<<___ if ($addx);
104 mov OPENSSL_ia32cap_P+8(%rip),%r11d
105___
106$code.=<<___;
107 jmp .Lmul4x_enter
108
109.align 16
110.Lmul_enter:
111 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
112 push %rbx
113.cfi_push %rbx
114 push %rbp
115.cfi_push %rbp
116 push %r12
117.cfi_push %r12
118 push %r13
119.cfi_push %r13
120 push %r14
121.cfi_push %r14
122 push %r15
123.cfi_push %r15
124
125 neg $num
126 mov %rsp,%r11
127 lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8)
128 neg $num # restore $num
129 and \$-1024,%r10 # minimize TLB usage
130
131 # An OS-agnostic version of __chkstk.
132 #
133	# Some OSes (Windows) insist on the stack being "wired" to
134	# physical memory in a strictly sequential manner, i.e. if a stack
135	# allocation spans two pages, then a reference to the farthest one
136	# can be punished with SEGV. But page walking can do good even on
137	# other OSes, because it guarantees that a villain thread hits
138	# the guard page before it can do damage to an innocent one...
139 sub %r10,%r11
140 and \$-4096,%r11
141 lea (%r10,%r11),%rsp
142 mov (%rsp),%r11
143 cmp %r10,%rsp
144 ja .Lmul_page_walk
145 jmp .Lmul_page_walk_done
146
147.Lmul_page_walk:
148 lea -4096(%rsp),%rsp
149 mov (%rsp),%r11
150 cmp %r10,%rsp
151 ja .Lmul_page_walk
152.Lmul_page_walk_done:
153
154 lea .Linc(%rip),%r10
155 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
156.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
157.Lmul_body:
158
159 lea 128($bp),%r12 # reassign $bp (+size optimization)
160___
161 $bp="%r12";
162 $STRIDE=2**5*8; # 5 is "window size"
163 $N=$STRIDE/4; # should match cache line size
164$code.=<<___;
165 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
166 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
167 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
168 and \$-16,%r10
169
170 pshufd \$0,%xmm5,%xmm5 # broadcast index
171 movdqa %xmm1,%xmm4
172 movdqa %xmm1,%xmm2
173___
174########################################################################
175# calculate mask by comparing 0..31 to index and save result to stack
176#
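#
# [Scalar illustration of what the pcmpeqd loop below leaves on the stack;
# it is not used by the generator.  Each of the 32 slots is an all-ones or
# all-zero lane, and only slot $idx is all-ones.  The ternary here is for
# clarity only: the real code derives the mask with SIMD compares, without
# branching on the secret index.]
#
sub _gather_mask_sketch {
	my ($idx) = @_;					# 0 .. 31
	return [ map { $_ == $idx ? ~0 : 0 } 0 .. 31 ];
}
# A limb is then gathered as the OR over (table word AND mask word), so every
# candidate word is touched regardless of $idx.
#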
177$code.=<<___;
178 paddd %xmm0,%xmm1
179 pcmpeqd %xmm5,%xmm0 # compare to 1,0
180 .byte 0x67
181 movdqa %xmm4,%xmm3
182___
183for($k=0;$k<$STRIDE/16-4;$k+=4) {
184$code.=<<___;
185 paddd %xmm1,%xmm2
186 pcmpeqd %xmm5,%xmm1 # compare to 3,2
187 movdqa %xmm0,`16*($k+0)+112`(%r10)
188 movdqa %xmm4,%xmm0
189
190 paddd %xmm2,%xmm3
191 pcmpeqd %xmm5,%xmm2 # compare to 5,4
192 movdqa %xmm1,`16*($k+1)+112`(%r10)
193 movdqa %xmm4,%xmm1
194
195 paddd %xmm3,%xmm0
196 pcmpeqd %xmm5,%xmm3 # compare to 7,6
197 movdqa %xmm2,`16*($k+2)+112`(%r10)
198 movdqa %xmm4,%xmm2
199
200 paddd %xmm0,%xmm1
201 pcmpeqd %xmm5,%xmm0
202 movdqa %xmm3,`16*($k+3)+112`(%r10)
203 movdqa %xmm4,%xmm3
204___
205}
206$code.=<<___; # last iteration can be optimized
207 paddd %xmm1,%xmm2
208 pcmpeqd %xmm5,%xmm1
209 movdqa %xmm0,`16*($k+0)+112`(%r10)
210
211 paddd %xmm2,%xmm3
212 .byte 0x67
213 pcmpeqd %xmm5,%xmm2
214 movdqa %xmm1,`16*($k+1)+112`(%r10)
215
216 pcmpeqd %xmm5,%xmm3
217 movdqa %xmm2,`16*($k+2)+112`(%r10)
218 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
219
220 pand `16*($k+1)-128`($bp),%xmm1
221 pand `16*($k+2)-128`($bp),%xmm2
222 movdqa %xmm3,`16*($k+3)+112`(%r10)
223 pand `16*($k+3)-128`($bp),%xmm3
224 por %xmm2,%xmm0
225 por %xmm3,%xmm1
226___
227for($k=0;$k<$STRIDE/16-4;$k+=4) {
228$code.=<<___;
229 movdqa `16*($k+0)-128`($bp),%xmm4
230 movdqa `16*($k+1)-128`($bp),%xmm5
231 movdqa `16*($k+2)-128`($bp),%xmm2
232 pand `16*($k+0)+112`(%r10),%xmm4
233 movdqa `16*($k+3)-128`($bp),%xmm3
234 pand `16*($k+1)+112`(%r10),%xmm5
235 por %xmm4,%xmm0
236 pand `16*($k+2)+112`(%r10),%xmm2
237 por %xmm5,%xmm1
238 pand `16*($k+3)+112`(%r10),%xmm3
239 por %xmm2,%xmm0
240 por %xmm3,%xmm1
241___
242}
243$code.=<<___;
244 por %xmm1,%xmm0
245 pshufd \$0x4e,%xmm0,%xmm1
246 por %xmm1,%xmm0
247 lea $STRIDE($bp),$bp
248 movq %xmm0,$m0 # m0=bp[0]
249
250 mov ($n0),$n0 # pull n0[0] value
251 mov ($ap),%rax
252
253 xor $i,$i # i=0
254 xor $j,$j # j=0
255
256 mov $n0,$m1
257 mulq $m0 # ap[0]*bp[0]
258 mov %rax,$lo0
259 mov ($np),%rax
260
261 imulq $lo0,$m1 # "tp[0]"*n0
262 mov %rdx,$hi0
263
264 mulq $m1 # np[0]*m1
265 add %rax,$lo0 # discarded
266 mov 8($ap),%rax
267 adc \$0,%rdx
268 mov %rdx,$hi1
269
270 lea 1($j),$j # j++
271 jmp .L1st_enter
272
273.align 16
274.L1st:
275 add %rax,$hi1
276 mov ($ap,$j,8),%rax
277 adc \$0,%rdx
278 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
279 mov $lo0,$hi0
280 adc \$0,%rdx
281 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
282 mov %rdx,$hi1
283
284.L1st_enter:
285 mulq $m0 # ap[j]*bp[0]
286 add %rax,$hi0
287 mov ($np,$j,8),%rax
288 adc \$0,%rdx
289 lea 1($j),$j # j++
290 mov %rdx,$lo0
291
292 mulq $m1 # np[j]*m1
293 cmp $num,$j
294 jne .L1st # note that upon exit $j==$num, so
295 # they can be used interchangeably
296
297 add %rax,$hi1
298 adc \$0,%rdx
299 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
300 adc \$0,%rdx
301 mov $hi1,-16(%rsp,$num,8) # tp[num-1]
302 mov %rdx,$hi1
303 mov $lo0,$hi0
304
305 xor %rdx,%rdx
306 add $hi0,$hi1
307 adc \$0,%rdx
308 mov $hi1,-8(%rsp,$num,8)
309 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
310
311 lea 1($i),$i # i++
312 jmp .Louter
313.align 16
314.Louter:
315 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
316 and \$-16,%rdx
317 pxor %xmm4,%xmm4
318 pxor %xmm5,%xmm5
319___
320for($k=0;$k<$STRIDE/16;$k+=4) {
321$code.=<<___;
322 movdqa `16*($k+0)-128`($bp),%xmm0
323 movdqa `16*($k+1)-128`($bp),%xmm1
324 movdqa `16*($k+2)-128`($bp),%xmm2
325 movdqa `16*($k+3)-128`($bp),%xmm3
326 pand `16*($k+0)-128`(%rdx),%xmm0
327 pand `16*($k+1)-128`(%rdx),%xmm1
328 por %xmm0,%xmm4
329 pand `16*($k+2)-128`(%rdx),%xmm2
330 por %xmm1,%xmm5
331 pand `16*($k+3)-128`(%rdx),%xmm3
332 por %xmm2,%xmm4
333 por %xmm3,%xmm5
334___
335}
336$code.=<<___;
337 por %xmm5,%xmm4
338 pshufd \$0x4e,%xmm4,%xmm0
339 por %xmm4,%xmm0
340 lea $STRIDE($bp),$bp
341
342 mov ($ap),%rax # ap[0]
343 movq %xmm0,$m0 # m0=bp[i]
344
345 xor $j,$j # j=0
346 mov $n0,$m1
347 mov (%rsp),$lo0
348
349 mulq $m0 # ap[0]*bp[i]
350 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
351 mov ($np),%rax
352 adc \$0,%rdx
353
354 imulq $lo0,$m1 # tp[0]*n0
355 mov %rdx,$hi0
356
357 mulq $m1 # np[0]*m1
358 add %rax,$lo0 # discarded
359 mov 8($ap),%rax
360 adc \$0,%rdx
361 mov 8(%rsp),$lo0 # tp[1]
362 mov %rdx,$hi1
363
364 lea 1($j),$j # j++
365 jmp .Linner_enter
366
367.align 16
368.Linner:
369 add %rax,$hi1
370 mov ($ap,$j,8),%rax
371 adc \$0,%rdx
372 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
373 mov (%rsp,$j,8),$lo0
374 adc \$0,%rdx
375 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
376 mov %rdx,$hi1
377
378.Linner_enter:
379 mulq $m0 # ap[j]*bp[i]
380 add %rax,$hi0
381 mov ($np,$j,8),%rax
382 adc \$0,%rdx
383 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
384 mov %rdx,$hi0
385 adc \$0,$hi0
386 lea 1($j),$j # j++
387
388 mulq $m1 # np[j]*m1
389 cmp $num,$j
390 jne .Linner # note that upon exit $j==$num, so
391 # they can be used interchangeably
392 add %rax,$hi1
393 adc \$0,%rdx
394 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
395 mov (%rsp,$num,8),$lo0
396 adc \$0,%rdx
397 mov $hi1,-16(%rsp,$num,8) # tp[num-1]
398 mov %rdx,$hi1
399
400 xor %rdx,%rdx
401 add $hi0,$hi1
402 adc \$0,%rdx
403 add $lo0,$hi1 # pull upmost overflow bit
404 adc \$0,%rdx
405 mov $hi1,-8(%rsp,$num,8)
406 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
407
408 lea 1($i),$i # i++
409 cmp $num,$i
410 jb .Louter
411
412 xor $i,$i # i=0 and clear CF!
413 mov (%rsp),%rax # tp[0]
414 lea (%rsp),$ap # borrow ap for tp
415 mov $num,$j # j=num
416 jmp .Lsub
417.align 16
418.Lsub: sbb ($np,$i,8),%rax
419 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
420 mov 8($ap,$i,8),%rax # tp[i+1]
421 lea 1($i),$i # i++
422 dec $j # doesn't affect CF!
423 jnz .Lsub
424
425 sbb \$0,%rax # handle upmost overflow bit
426 mov \$-1,%rbx
427 xor %rax,%rbx
428 xor $i,$i
429 mov $num,$j # j=num
430
431.Lcopy: # conditional copy
432 mov ($rp,$i,8),%rcx
433 mov (%rsp,$i,8),%rdx
434 and %rbx,%rcx
435 and %rax,%rdx
436 mov $i,(%rsp,$i,8) # zap temporary vector
437 or %rcx,%rdx
438 mov %rdx,($rp,$i,8) # rp[i]=tp[i]
439 lea 1($i),$i
440 sub \$1,$j
441 jnz .Lcopy
442
443 mov 8(%rsp,$num,8),%rsi # restore %rsp
444.cfi_def_cfa %rsi,8
445 mov \$1,%rax
446
447 mov -48(%rsi),%r15
448.cfi_restore %r15
449 mov -40(%rsi),%r14
450.cfi_restore %r14
451 mov -32(%rsi),%r13
452.cfi_restore %r13
453 mov -24(%rsi),%r12
454.cfi_restore %r12
455 mov -16(%rsi),%rbp
456.cfi_restore %rbp
457 mov -8(%rsi),%rbx
458.cfi_restore %rbx
459 lea (%rsi),%rsp
460.cfi_def_cfa_register %rsp
461.Lmul_epilogue:
462 ret
463.cfi_endproc
464.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
465___
466{{{
467my @A=("%r10","%r11");
468my @N=("%r13","%rdi");
469$code.=<<___;
470.type bn_mul4x_mont_gather5,\@function,6
471.align 32
472bn_mul4x_mont_gather5:
473.cfi_startproc
474 .byte 0x67
475 mov %rsp,%rax
476.cfi_def_cfa_register %rax
477.Lmul4x_enter:
478___
479$code.=<<___ if ($addx);
480 and \$0x80108,%r11d
481 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
482 je .Lmulx4x_enter
483___
484$code.=<<___;
485 push %rbx
486.cfi_push %rbx
487 push %rbp
488.cfi_push %rbp
489 push %r12
490.cfi_push %r12
491 push %r13
492.cfi_push %r13
493 push %r14
494.cfi_push %r14
495 push %r15
496.cfi_push %r15
497.Lmul4x_prologue:
498
499 .byte 0x67
500 shl \$3,${num}d # convert $num to bytes
501 lea ($num,$num,2),%r10 # 3*$num in bytes
502 neg $num # -$num
503
504 ##############################################################
505 # Ensure that stack frame doesn't alias with $rptr+3*$num
506 # modulo 4096, which covers ret[num], am[num] and n[num]
507	# (see bn_exp.c). This is done to allow the memory disambiguation
508	# logic to do its magic. [Extra [num] is allocated in order
509 # to align with bn_power5's frame, which is cleansed after
510 # completing exponentiation. Extra 256 bytes is for power mask
511 # calculated from 7th argument, the index.]
512 #
513 lea -320(%rsp,$num,2),%r11
514 mov %rsp,%rbp
515 sub $rp,%r11
516 and \$4095,%r11
517 cmp %r11,%r10
518 jb .Lmul4xsp_alt
519 sub %r11,%rbp # align with $rp
520 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
521 jmp .Lmul4xsp_done
522
523.align 32
524.Lmul4xsp_alt:
525 lea 4096-320(,$num,2),%r10
526 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
527 sub %r10,%r11
528 mov \$0,%r10
529 cmovc %r10,%r11
530 sub %r11,%rbp
531.Lmul4xsp_done:
532 and \$-64,%rbp
533 mov %rsp,%r11
534 sub %rbp,%r11
535 and \$-4096,%r11
536 lea (%rbp,%r11),%rsp
537 mov (%rsp),%r10
538 cmp %rbp,%rsp
539 ja .Lmul4x_page_walk
540 jmp .Lmul4x_page_walk_done
541
542.Lmul4x_page_walk:
543 lea -4096(%rsp),%rsp
544 mov (%rsp),%r10
545 cmp %rbp,%rsp
546 ja .Lmul4x_page_walk
547.Lmul4x_page_walk_done:
548
549 neg $num
550
551 mov %rax,40(%rsp)
552.cfi_cfa_expression %rsp+40,deref,+8
553.Lmul4x_body:
554
555 call mul4x_internal
556
557 mov 40(%rsp),%rsi # restore %rsp
558.cfi_def_cfa %rsi,8
559 mov \$1,%rax
560
561 mov -48(%rsi),%r15
562.cfi_restore %r15
563 mov -40(%rsi),%r14
564.cfi_restore %r14
565 mov -32(%rsi),%r13
566.cfi_restore %r13
567 mov -24(%rsi),%r12
568.cfi_restore %r12
569 mov -16(%rsi),%rbp
570.cfi_restore %rbp
571 mov -8(%rsi),%rbx
572.cfi_restore %rbx
573 lea (%rsi),%rsp
574.cfi_def_cfa_register %rsp
575.Lmul4x_epilogue:
576 ret
577.cfi_endproc
578.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
579
580.type mul4x_internal,\@abi-omnipotent
581.align 32
582mul4x_internal:
583.cfi_startproc
584 shl \$5,$num # $num was in bytes
585 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
586 lea .Linc(%rip),%rax
587 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization)
588 shr \$5,$num # restore $num
589___
590 $bp="%r12";
591 $STRIDE=2**5*8; # 5 is "window size"
592 $N=$STRIDE/4; # should match cache line size
593 $tp=$i;
594$code.=<<___;
595 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
596 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
597 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization)
598 lea 128(%rdx),$bp # size optimization
599
600 pshufd \$0,%xmm5,%xmm5 # broadcast index
601 movdqa %xmm1,%xmm4
602 .byte 0x67,0x67
603 movdqa %xmm1,%xmm2
604___
605########################################################################
606# calculate mask by comparing 0..31 to index and save result to stack
607#
608$code.=<<___;
609 paddd %xmm0,%xmm1
610 pcmpeqd %xmm5,%xmm0 # compare to 1,0
611 .byte 0x67
612 movdqa %xmm4,%xmm3
613___
614for($i=0;$i<$STRIDE/16-4;$i+=4) {
615$code.=<<___;
616 paddd %xmm1,%xmm2
617 pcmpeqd %xmm5,%xmm1 # compare to 3,2
618 movdqa %xmm0,`16*($i+0)+112`(%r10)
619 movdqa %xmm4,%xmm0
620
621 paddd %xmm2,%xmm3
622 pcmpeqd %xmm5,%xmm2 # compare to 5,4
623 movdqa %xmm1,`16*($i+1)+112`(%r10)
624 movdqa %xmm4,%xmm1
625
626 paddd %xmm3,%xmm0
627 pcmpeqd %xmm5,%xmm3 # compare to 7,6
628 movdqa %xmm2,`16*($i+2)+112`(%r10)
629 movdqa %xmm4,%xmm2
630
631 paddd %xmm0,%xmm1
632 pcmpeqd %xmm5,%xmm0
633 movdqa %xmm3,`16*($i+3)+112`(%r10)
634 movdqa %xmm4,%xmm3
635___
636}
637$code.=<<___; # last iteration can be optimized
638 paddd %xmm1,%xmm2
639 pcmpeqd %xmm5,%xmm1
640 movdqa %xmm0,`16*($i+0)+112`(%r10)
641
642 paddd %xmm2,%xmm3
643 .byte 0x67
644 pcmpeqd %xmm5,%xmm2
645 movdqa %xmm1,`16*($i+1)+112`(%r10)
646
647 pcmpeqd %xmm5,%xmm3
648 movdqa %xmm2,`16*($i+2)+112`(%r10)
649 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register
650
651 pand `16*($i+1)-128`($bp),%xmm1
652 pand `16*($i+2)-128`($bp),%xmm2
653 movdqa %xmm3,`16*($i+3)+112`(%r10)
654 pand `16*($i+3)-128`($bp),%xmm3
655 por %xmm2,%xmm0
656 por %xmm3,%xmm1
657___
658for($i=0;$i<$STRIDE/16-4;$i+=4) {
659$code.=<<___;
660 movdqa `16*($i+0)-128`($bp),%xmm4
661 movdqa `16*($i+1)-128`($bp),%xmm5
662 movdqa `16*($i+2)-128`($bp),%xmm2
663 pand `16*($i+0)+112`(%r10),%xmm4
664 movdqa `16*($i+3)-128`($bp),%xmm3
665 pand `16*($i+1)+112`(%r10),%xmm5
666 por %xmm4,%xmm0
667 pand `16*($i+2)+112`(%r10),%xmm2
668 por %xmm5,%xmm1
669 pand `16*($i+3)+112`(%r10),%xmm3
670 por %xmm2,%xmm0
671 por %xmm3,%xmm1
672___
673}
674$code.=<<___;
675 por %xmm1,%xmm0
676 pshufd \$0x4e,%xmm0,%xmm1
677 por %xmm1,%xmm0
678 lea $STRIDE($bp),$bp
679 movq %xmm0,$m0 # m0=bp[0]
680
681 mov %r13,16+8(%rsp) # save end of b[num]
682 mov $rp, 56+8(%rsp) # save $rp
683
684 mov ($n0),$n0 # pull n0[0] value
685 mov ($ap),%rax
686 lea ($ap,$num),$ap # end of a[num]
687 neg $num
688
689 mov $n0,$m1
690 mulq $m0 # ap[0]*bp[0]
691 mov %rax,$A[0]
692 mov ($np),%rax
693
694 imulq $A[0],$m1 # "tp[0]"*n0
695 lea 64+8(%rsp),$tp
696 mov %rdx,$A[1]
697
698 mulq $m1 # np[0]*m1
699 add %rax,$A[0] # discarded
700 mov 8($ap,$num),%rax
701 adc \$0,%rdx
702 mov %rdx,$N[1]
703
704 mulq $m0
705 add %rax,$A[1]
706 mov 8*1($np),%rax
707 adc \$0,%rdx
708 mov %rdx,$A[0]
709
710 mulq $m1
711 add %rax,$N[1]
712 mov 16($ap,$num),%rax
713 adc \$0,%rdx
714 add $A[1],$N[1]
715 lea 4*8($num),$j # j=4
716 lea 8*4($np),$np
717 adc \$0,%rdx
718 mov $N[1],($tp)
719 mov %rdx,$N[0]
720 jmp .L1st4x
721
722.align 32
723.L1st4x:
724 mulq $m0 # ap[j]*bp[0]
725 add %rax,$A[0]
726 mov -8*2($np),%rax
727 lea 32($tp),$tp
728 adc \$0,%rdx
729 mov %rdx,$A[1]
730
731 mulq $m1 # np[j]*m1
732 add %rax,$N[0]
733 mov -8($ap,$j),%rax
734 adc \$0,%rdx
735 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
736 adc \$0,%rdx
737 mov $N[0],-24($tp) # tp[j-1]
738 mov %rdx,$N[1]
739
740 mulq $m0 # ap[j]*bp[0]
741 add %rax,$A[1]
742 mov -8*1($np),%rax
743 adc \$0,%rdx
744 mov %rdx,$A[0]
745
746 mulq $m1 # np[j]*m1
747 add %rax,$N[1]
748 mov ($ap,$j),%rax
749 adc \$0,%rdx
750 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
751 adc \$0,%rdx
752 mov $N[1],-16($tp) # tp[j-1]
753 mov %rdx,$N[0]
754
755 mulq $m0 # ap[j]*bp[0]
756 add %rax,$A[0]
757 mov 8*0($np),%rax
758 adc \$0,%rdx
759 mov %rdx,$A[1]
760
761 mulq $m1 # np[j]*m1
762 add %rax,$N[0]
763 mov 8($ap,$j),%rax
764 adc \$0,%rdx
765 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
766 adc \$0,%rdx
767 mov $N[0],-8($tp) # tp[j-1]
768 mov %rdx,$N[1]
769
770 mulq $m0 # ap[j]*bp[0]
771 add %rax,$A[1]
772 mov 8*1($np),%rax
773 adc \$0,%rdx
774 mov %rdx,$A[0]
775
776 mulq $m1 # np[j]*m1
777 add %rax,$N[1]
778 mov 16($ap,$j),%rax
779 adc \$0,%rdx
780 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
781 lea 8*4($np),$np
782 adc \$0,%rdx
783 mov $N[1],($tp) # tp[j-1]
784 mov %rdx,$N[0]
785
786 add \$32,$j # j+=4
787 jnz .L1st4x
788
789 mulq $m0 # ap[j]*bp[0]
790 add %rax,$A[0]
791 mov -8*2($np),%rax
792 lea 32($tp),$tp
793 adc \$0,%rdx
794 mov %rdx,$A[1]
795
796 mulq $m1 # np[j]*m1
797 add %rax,$N[0]
798 mov -8($ap),%rax
799 adc \$0,%rdx
800 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
801 adc \$0,%rdx
802 mov $N[0],-24($tp) # tp[j-1]
803 mov %rdx,$N[1]
804
805 mulq $m0 # ap[j]*bp[0]
806 add %rax,$A[1]
807 mov -8*1($np),%rax
808 adc \$0,%rdx
809 mov %rdx,$A[0]
810
811 mulq $m1 # np[j]*m1
812 add %rax,$N[1]
813 mov ($ap,$num),%rax # ap[0]
814 adc \$0,%rdx
815 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
816 adc \$0,%rdx
817 mov $N[1],-16($tp) # tp[j-1]
818 mov %rdx,$N[0]
819
820 lea ($np,$num),$np # rewind $np
821
822 xor $N[1],$N[1]
823 add $A[0],$N[0]
824 adc \$0,$N[1]
825 mov $N[0],-8($tp)
826
827 jmp .Louter4x
828
829.align 32
830.Louter4x:
831 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization)
832 pxor %xmm4,%xmm4
833 pxor %xmm5,%xmm5
834___
835for($i=0;$i<$STRIDE/16;$i+=4) {
836$code.=<<___;
837 movdqa `16*($i+0)-128`($bp),%xmm0
838 movdqa `16*($i+1)-128`($bp),%xmm1
839 movdqa `16*($i+2)-128`($bp),%xmm2
840 movdqa `16*($i+3)-128`($bp),%xmm3
841 pand `16*($i+0)-128`(%rdx),%xmm0
842 pand `16*($i+1)-128`(%rdx),%xmm1
843 por %xmm0,%xmm4
844 pand `16*($i+2)-128`(%rdx),%xmm2
845 por %xmm1,%xmm5
846 pand `16*($i+3)-128`(%rdx),%xmm3
847 por %xmm2,%xmm4
848 por %xmm3,%xmm5
849___
850}
851$code.=<<___;
852 por %xmm5,%xmm4
853 pshufd \$0x4e,%xmm4,%xmm0
854 por %xmm4,%xmm0
855 lea $STRIDE($bp),$bp
856 movq %xmm0,$m0 # m0=bp[i]
857
858 mov ($tp,$num),$A[0]
859 mov $n0,$m1
860 mulq $m0 # ap[0]*bp[i]
861 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
862 mov ($np),%rax
863 adc \$0,%rdx
864
865 imulq $A[0],$m1 # tp[0]*n0
866 mov %rdx,$A[1]
867 mov $N[1],($tp) # store upmost overflow bit
868
869 lea ($tp,$num),$tp # rewind $tp
870
871 mulq $m1 # np[0]*m1
872 add %rax,$A[0] # "$N[0]", discarded
873 mov 8($ap,$num),%rax
874 adc \$0,%rdx
875 mov %rdx,$N[1]
876
877 mulq $m0 # ap[j]*bp[i]
878 add %rax,$A[1]
879 mov 8*1($np),%rax
880 adc \$0,%rdx
881 add 8($tp),$A[1] # +tp[1]
882 adc \$0,%rdx
883 mov %rdx,$A[0]
884
885 mulq $m1 # np[j]*m1
886 add %rax,$N[1]
887 mov 16($ap,$num),%rax
888 adc \$0,%rdx
889 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
890 lea 4*8($num),$j # j=4
891 lea 8*4($np),$np
892 adc \$0,%rdx
893 mov %rdx,$N[0]
894 jmp .Linner4x
895
896.align 32
897.Linner4x:
898 mulq $m0 # ap[j]*bp[i]
899 add %rax,$A[0]
900 mov -8*2($np),%rax
901 adc \$0,%rdx
902 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
903 lea 32($tp),$tp
904 adc \$0,%rdx
905 mov %rdx,$A[1]
906
907 mulq $m1 # np[j]*m1
908 add %rax,$N[0]
909 mov -8($ap,$j),%rax
910 adc \$0,%rdx
911 add $A[0],$N[0]
912 adc \$0,%rdx
913 mov $N[1],-32($tp) # tp[j-1]
914 mov %rdx,$N[1]
915
916 mulq $m0 # ap[j]*bp[i]
917 add %rax,$A[1]
918 mov -8*1($np),%rax
919 adc \$0,%rdx
920 add -8($tp),$A[1]
921 adc \$0,%rdx
922 mov %rdx,$A[0]
923
924 mulq $m1 # np[j]*m1
925 add %rax,$N[1]
926 mov ($ap,$j),%rax
927 adc \$0,%rdx
928 add $A[1],$N[1]
929 adc \$0,%rdx
930 mov $N[0],-24($tp) # tp[j-1]
931 mov %rdx,$N[0]
932
933 mulq $m0 # ap[j]*bp[i]
934 add %rax,$A[0]
935 mov 8*0($np),%rax
936 adc \$0,%rdx
937 add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
938 adc \$0,%rdx
939 mov %rdx,$A[1]
940
941 mulq $m1 # np[j]*m1
942 add %rax,$N[0]
943 mov 8($ap,$j),%rax
944 adc \$0,%rdx
945 add $A[0],$N[0]
946 adc \$0,%rdx
947 mov $N[1],-16($tp) # tp[j-1]
948 mov %rdx,$N[1]
949
950 mulq $m0 # ap[j]*bp[i]
951 add %rax,$A[1]
952 mov 8*1($np),%rax
953 adc \$0,%rdx
954 add 8($tp),$A[1]
955 adc \$0,%rdx
956 mov %rdx,$A[0]
957
958 mulq $m1 # np[j]*m1
959 add %rax,$N[1]
960 mov 16($ap,$j),%rax
961 adc \$0,%rdx
962 add $A[1],$N[1]
963 lea 8*4($np),$np
964 adc \$0,%rdx
965 mov $N[0],-8($tp) # tp[j-1]
966 mov %rdx,$N[0]
967
968 add \$32,$j # j+=4
969 jnz .Linner4x
970
971 mulq $m0 # ap[j]*bp[i]
972 add %rax,$A[0]
973 mov -8*2($np),%rax
974 adc \$0,%rdx
975 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
976 lea 32($tp),$tp
977 adc \$0,%rdx
978 mov %rdx,$A[1]
979
980 mulq $m1 # np[j]*m1
981 add %rax,$N[0]
982 mov -8($ap),%rax
983 adc \$0,%rdx
984 add $A[0],$N[0]
985 adc \$0,%rdx
986 mov $N[1],-32($tp) # tp[j-1]
987 mov %rdx,$N[1]
988
989 mulq $m0 # ap[j]*bp[i]
990 add %rax,$A[1]
991 mov $m1,%rax
992 mov -8*1($np),$m1
993 adc \$0,%rdx
994 add -8($tp),$A[1]
995 adc \$0,%rdx
996 mov %rdx,$A[0]
997
998 mulq $m1 # np[j]*m1
999 add %rax,$N[1]
1000 mov ($ap,$num),%rax # ap[0]
1001 adc \$0,%rdx
1002 add $A[1],$N[1]
1003 adc \$0,%rdx
1004 mov $N[0],-24($tp) # tp[j-1]
1005 mov %rdx,$N[0]
1006
1007 mov $N[1],-16($tp) # tp[j-1]
1008 lea ($np,$num),$np # rewind $np
1009
1010 xor $N[1],$N[1]
1011 add $A[0],$N[0]
1012 adc \$0,$N[1]
1013 add ($tp),$N[0] # pull upmost overflow bit
1014 adc \$0,$N[1] # upmost overflow bit
1015 mov $N[0],-8($tp)
1016
1017 cmp 16+8(%rsp),$bp
1018 jb .Louter4x
1019___
1020if (1) {
1021$code.=<<___;
1022 xor %rax,%rax
1023 sub $N[0],$m1 # compare top-most words
1024 adc $j,$j # $j is zero
1025 or $j,$N[1]
1026 sub $N[1],%rax # %rax=-$N[1]
1027 lea ($tp,$num),%rbx # tptr in .sqr4x_sub
1028 mov ($np),%r12
1029 lea ($np),%rbp # nptr in .sqr4x_sub
1030 mov %r9,%rcx
1031 sar \$3+2,%rcx
1032 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
1033 dec %r12 # so that after 'not' we get -n[0]
1034 xor %r10,%r10
1035 mov 8*1(%rbp),%r13
1036 mov 8*2(%rbp),%r14
1037 mov 8*3(%rbp),%r15
1038 jmp .Lsqr4x_sub_entry
1039___
1040} else {
1041my @ri=("%rax",$bp,$m0,$m1);
1042my $rp="%rdx";
1043$code.=<<___
1044 xor \$1,$N[1]
1045 lea ($tp,$num),$tp # rewind $tp
1046 sar \$5,$num # cf=0
1047 lea ($np,$N[1],8),$np
1048 mov 56+8(%rsp),$rp # restore $rp
1049 jmp .Lsub4x
1050
1051.align 32
1052.Lsub4x:
1053 .byte 0x66
1054 mov 8*0($tp),@ri[0]
1055 mov 8*1($tp),@ri[1]
1056 .byte 0x66
1057 sbb 16*0($np),@ri[0]
1058 mov 8*2($tp),@ri[2]
1059 sbb 16*1($np),@ri[1]
1060 mov 3*8($tp),@ri[3]
1061 lea 4*8($tp),$tp
1062 sbb 16*2($np),@ri[2]
1063 mov @ri[0],8*0($rp)
1064 sbb 16*3($np),@ri[3]
1065 lea 16*4($np),$np
1066 mov @ri[1],8*1($rp)
1067 mov @ri[2],8*2($rp)
1068 mov @ri[3],8*3($rp)
1069 lea 8*4($rp),$rp
1070
1071 inc $num
1072 jnz .Lsub4x
1073
1074 ret
1075___
1076}
1077$code.=<<___;
1078.cfi_endproc
1079.size mul4x_internal,.-mul4x_internal
1080___
1081}}}
1082
1083{{{
1084######################################################################
1085# void bn_power5(
1086my $rptr="%rdi"; # BN_ULONG *rptr,
1087my $aptr="%rsi"; # const BN_ULONG *aptr,
1088my $bptr="%rdx"; # const void *table,
1089my $nptr="%rcx"; # const BN_ULONG *nptr,
1090my $n0 ="%r8"; # const BN_ULONG *n0);
1091my $num ="%r9"; # int num, has to be divisible by 8
1092 # int pwr
1093
1094my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
1095my @A0=("%r10","%r11");
1096my @A1=("%r12","%r13");
1097my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
1098
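#
# [Rough sketch of how one bn_power5 call is used for a fixed 5-bit window of
# the exponent, as suggested by the five __bn_sqr8x_internal calls followed by
# one mul4x_internal call below: five squarings, then one multiplication by a
# power gathered from the table.  Illustration only; Montgomery conversions
# are omitted, the arguments are assumed to be Math::BigInt values, and the
# helper name is made up.]
#
sub _power5_sketch {
	my ($acc, $table, $window, $n) = @_;	# $window = next 5 exponent bits
	$acc = ($acc * $acc) % $n for 1 .. 5;	# five squarings ...
	return ($acc * $table->[$window]) % $n;	# ... then one gathered multiply
}
#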
1099$code.=<<___;
1100.globl bn_power5
1101.type bn_power5,\@function,6
1102.align 32
1103bn_power5:
1104.cfi_startproc
1105 mov %rsp,%rax
1106.cfi_def_cfa_register %rax
1107___
1108$code.=<<___ if ($addx);
1109 mov OPENSSL_ia32cap_P+8(%rip),%r11d
1110 and \$0x80108,%r11d
1111 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
1112 je .Lpowerx5_enter
1113___
1114$code.=<<___;
1115 push %rbx
1116.cfi_push %rbx
1117 push %rbp
1118.cfi_push %rbp
1119 push %r12
1120.cfi_push %r12
1121 push %r13
1122.cfi_push %r13
1123 push %r14
1124.cfi_push %r14
1125 push %r15
1126.cfi_push %r15
1127.Lpower5_prologue:
1128
1129 shl \$3,${num}d # convert $num to bytes
1130 lea ($num,$num,2),%r10d # 3*$num
1131 neg $num
1132 mov ($n0),$n0 # *n0
1133
1134 ##############################################################
1135 # Ensure that stack frame doesn't alias with $rptr+3*$num
1136 # modulo 4096, which covers ret[num], am[num] and n[num]
1137	# (see bn_exp.c). This is done to allow the memory disambiguation
1138	# logic to do its magic. [Extra 256 bytes is for power mask
1139 # calculated from 7th argument, the index.]
1140 #
1141 lea -320(%rsp,$num,2),%r11
1142 mov %rsp,%rbp
1143 sub $rptr,%r11
1144 and \$4095,%r11
1145 cmp %r11,%r10
1146 jb .Lpwr_sp_alt
1147 sub %r11,%rbp # align with $aptr
1148 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
1149 jmp .Lpwr_sp_done
1150
1151.align 32
1152.Lpwr_sp_alt:
1153 lea 4096-320(,$num,2),%r10
1154 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
1155 sub %r10,%r11
1156 mov \$0,%r10
1157 cmovc %r10,%r11
1158 sub %r11,%rbp
1159.Lpwr_sp_done:
1160 and \$-64,%rbp
1161 mov %rsp,%r11
1162 sub %rbp,%r11
1163 and \$-4096,%r11
1164 lea (%rbp,%r11),%rsp
1165 mov (%rsp),%r10
1166 cmp %rbp,%rsp
1167 ja .Lpwr_page_walk
1168 jmp .Lpwr_page_walk_done
1169
1170.Lpwr_page_walk:
1171 lea -4096(%rsp),%rsp
1172 mov (%rsp),%r10
1173 cmp %rbp,%rsp
1174 ja .Lpwr_page_walk
1175.Lpwr_page_walk_done:
1176
1177 mov $num,%r10
1178 neg $num
1179
1180 ##############################################################
1181 # Stack layout
1182 #
1183 # +0 saved $num, used in reduction section
1184 # +8 &t[2*$num], used in reduction section
1185 # +32 saved *n0
1186 # +40 saved %rsp
1187 # +48 t[2*$num]
1188 #
1189 mov $n0, 32(%rsp)
1190 mov %rax, 40(%rsp) # save original %rsp
1191.cfi_cfa_expression %rsp+40,deref,+8
1192.Lpower5_body:
1193 movq $rptr,%xmm1 # save $rptr, used in sqr8x
1194 movq $nptr,%xmm2 # save $nptr
1195 movq %r10, %xmm3 # -$num, used in sqr8x
1196 movq $bptr,%xmm4
1197
1198 call __bn_sqr8x_internal
1199 call __bn_post4x_internal
1200 call __bn_sqr8x_internal
1201 call __bn_post4x_internal
1202 call __bn_sqr8x_internal
1203 call __bn_post4x_internal
1204 call __bn_sqr8x_internal
1205 call __bn_post4x_internal
1206 call __bn_sqr8x_internal
1207 call __bn_post4x_internal
1208
1209 movq %xmm2,$nptr
1210 movq %xmm4,$bptr
1211 mov $aptr,$rptr
1212 mov 40(%rsp),%rax
1213 lea 32(%rsp),$n0
1214
1215 call mul4x_internal
1216
1217 mov 40(%rsp),%rsi # restore %rsp
1218.cfi_def_cfa %rsi,8
1219 mov \$1,%rax
1220 mov -48(%rsi),%r15
1221.cfi_restore %r15
1222 mov -40(%rsi),%r14
1223.cfi_restore %r14
1224 mov -32(%rsi),%r13
1225.cfi_restore %r13
1226 mov -24(%rsi),%r12
1227.cfi_restore %r12
1228 mov -16(%rsi),%rbp
1229.cfi_restore %rbp
1230 mov -8(%rsi),%rbx
1231.cfi_restore %rbx
1232 lea (%rsi),%rsp
1233.cfi_def_cfa_register %rsp
1234.Lpower5_epilogue:
1235 ret
1236.cfi_endproc
1237.size bn_power5,.-bn_power5
1238
1239.globl bn_sqr8x_internal
1240.hidden bn_sqr8x_internal
1241.type bn_sqr8x_internal,\@abi-omnipotent
1242.align 32
1243bn_sqr8x_internal:
1244__bn_sqr8x_internal:
1245.cfi_startproc
1246 ##############################################################
1247 # Squaring part:
1248 #
1249 # a) multiply-n-add everything but a[i]*a[i];
1250 # b) shift result of a) by 1 to the left and accumulate
1251 # a[i]*a[i] products;
1252 #
1253 ##############################################################
1254 # a[1]a[0]
1255 # a[2]a[0]
1256 # a[3]a[0]
1257 # a[2]a[1]
1258 # a[4]a[0]
1259 # a[3]a[1]
1260 # a[5]a[0]
1261 # a[4]a[1]
1262 # a[3]a[2]
1263 # a[6]a[0]
1264 # a[5]a[1]
1265 # a[4]a[2]
1266 # a[7]a[0]
1267 # a[6]a[1]
1268 # a[5]a[2]
1269 # a[4]a[3]
1270 # a[7]a[1]
1271 # a[6]a[2]
1272 # a[5]a[3]
1273 # a[7]a[2]
1274 # a[6]a[3]
1275 # a[5]a[4]
1276 # a[7]a[3]
1277 # a[6]a[4]
1278 # a[7]a[4]
1279 # a[6]a[5]
1280 # a[7]a[5]
1281 # a[7]a[6]
1282 # a[1]a[0]
1283 # a[2]a[0]
1284 # a[3]a[0]
1285 # a[4]a[0]
1286 # a[5]a[0]
1287 # a[6]a[0]
1288 # a[7]a[0]
1289 # a[2]a[1]
1290 # a[3]a[1]
1291 # a[4]a[1]
1292 # a[5]a[1]
1293 # a[6]a[1]
1294 # a[7]a[1]
1295 # a[3]a[2]
1296 # a[4]a[2]
1297 # a[5]a[2]
1298 # a[6]a[2]
1299 # a[7]a[2]
1300 # a[4]a[3]
1301 # a[5]a[3]
1302 # a[6]a[3]
1303 # a[7]a[3]
1304 # a[5]a[4]
1305 # a[6]a[4]
1306 # a[7]a[4]
1307 # a[6]a[5]
1308 # a[7]a[5]
1309 # a[7]a[6]
1310 # a[0]a[0]
1311 # a[1]a[1]
1312 # a[2]a[2]
1313 # a[3]a[3]
1314 # a[4]a[4]
1315 # a[5]a[5]
1316 # a[6]a[6]
1317 # a[7]a[7]
1318
1319 lea 32(%r10),$i # $i=-($num-32)
1320 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
1321
1322 mov $num,$j # $j=$num
1323
1324 # comments apply to $num==8 case
1325 mov -32($aptr,$i),$a0 # a[0]
1326 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
1327 mov -24($aptr,$i),%rax # a[1]
1328 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
1329 mov -16($aptr,$i),$ai # a[2]
1330 mov %rax,$a1
1331
1332 mul $a0 # a[1]*a[0]
1333 mov %rax,$A0[0] # a[1]*a[0]
1334 mov $ai,%rax # a[2]
1335 mov %rdx,$A0[1]
1336 mov $A0[0],-24($tptr,$i) # t[1]
1337
1338 mul $a0 # a[2]*a[0]
1339 add %rax,$A0[1]
1340 mov $ai,%rax
1341 adc \$0,%rdx
1342 mov $A0[1],-16($tptr,$i) # t[2]
1343 mov %rdx,$A0[0]
1344
1345
1346 mov -8($aptr,$i),$ai # a[3]
1347 mul $a1 # a[2]*a[1]
1348 mov %rax,$A1[0] # a[2]*a[1]+t[3]
1349 mov $ai,%rax
1350 mov %rdx,$A1[1]
1351
1352 lea ($i),$j
1353 mul $a0 # a[3]*a[0]
1354 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
1355 mov $ai,%rax
1356 mov %rdx,$A0[1]
1357 adc \$0,$A0[1]
1358 add $A1[0],$A0[0]
1359 adc \$0,$A0[1]
1360 mov $A0[0],-8($tptr,$j) # t[3]
1361 jmp .Lsqr4x_1st
1362
1363.align 32
1364.Lsqr4x_1st:
1365 mov ($aptr,$j),$ai # a[4]
1366 mul $a1 # a[3]*a[1]
1367 add %rax,$A1[1] # a[3]*a[1]+t[4]
1368 mov $ai,%rax
1369 mov %rdx,$A1[0]
1370 adc \$0,$A1[0]
1371
1372 mul $a0 # a[4]*a[0]
1373 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
1374 mov $ai,%rax # a[3]
1375 mov 8($aptr,$j),$ai # a[5]
1376 mov %rdx,$A0[0]
1377 adc \$0,$A0[0]
1378 add $A1[1],$A0[1]
1379 adc \$0,$A0[0]
1380
1381
1382 mul $a1 # a[4]*a[3]
1383 add %rax,$A1[0] # a[4]*a[3]+t[5]
1384 mov $ai,%rax
1385 mov $A0[1],($tptr,$j) # t[4]
1386 mov %rdx,$A1[1]
1387 adc \$0,$A1[1]
1388
1389 mul $a0 # a[5]*a[2]
1390 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
1391 mov $ai,%rax
1392 mov 16($aptr,$j),$ai # a[6]
1393 mov %rdx,$A0[1]
1394 adc \$0,$A0[1]
1395 add $A1[0],$A0[0]
1396 adc \$0,$A0[1]
1397
1398 mul $a1 # a[5]*a[3]
1399 add %rax,$A1[1] # a[5]*a[3]+t[6]
1400 mov $ai,%rax
1401 mov $A0[0],8($tptr,$j) # t[5]
1402 mov %rdx,$A1[0]
1403 adc \$0,$A1[0]
1404
1405 mul $a0 # a[6]*a[2]
1406 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
1407 mov $ai,%rax # a[3]
1408 mov 24($aptr,$j),$ai # a[7]
1409 mov %rdx,$A0[0]
1410 adc \$0,$A0[0]
1411 add $A1[1],$A0[1]
1412 adc \$0,$A0[0]
1413
1414
1415 mul $a1 # a[6]*a[5]
1416 add %rax,$A1[0] # a[6]*a[5]+t[7]
1417 mov $ai,%rax
1418 mov $A0[1],16($tptr,$j) # t[6]
1419 mov %rdx,$A1[1]
1420 adc \$0,$A1[1]
1421 lea 32($j),$j
1422
1423 mul $a0 # a[7]*a[4]
1424 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
1425 mov $ai,%rax
1426 mov %rdx,$A0[1]
1427 adc \$0,$A0[1]
1428 add $A1[0],$A0[0]
1429 adc \$0,$A0[1]
1430 mov $A0[0],-8($tptr,$j) # t[7]
1431
1432 cmp \$0,$j
1433 jne .Lsqr4x_1st
1434
1435 mul $a1 # a[7]*a[5]
1436 add %rax,$A1[1]
1437 lea 16($i),$i
1438 adc \$0,%rdx
1439 add $A0[1],$A1[1]
1440 adc \$0,%rdx
1441
1442 mov $A1[1],($tptr) # t[8]
1443 mov %rdx,$A1[0]
1444 mov %rdx,8($tptr) # t[9]
1445 jmp .Lsqr4x_outer
1446
1447.align 32
1448.Lsqr4x_outer: # comments apply to $num==6 case
1449 mov -32($aptr,$i),$a0 # a[0]
1450 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
1451 mov -24($aptr,$i),%rax # a[1]
1452 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
1453 mov -16($aptr,$i),$ai # a[2]
1454 mov %rax,$a1
1455
1456 mul $a0 # a[1]*a[0]
1457 mov -24($tptr,$i),$A0[0] # t[1]
1458 add %rax,$A0[0] # a[1]*a[0]+t[1]
1459 mov $ai,%rax # a[2]
1460 adc \$0,%rdx
1461 mov $A0[0],-24($tptr,$i) # t[1]
1462 mov %rdx,$A0[1]
1463
1464 mul $a0 # a[2]*a[0]
1465 add %rax,$A0[1]
1466 mov $ai,%rax
1467 adc \$0,%rdx
1468 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
1469 mov %rdx,$A0[0]
1470 adc \$0,$A0[0]
1471 mov $A0[1],-16($tptr,$i) # t[2]
1472
1473 xor $A1[0],$A1[0]
1474
1475 mov -8($aptr,$i),$ai # a[3]
1476 mul $a1 # a[2]*a[1]
1477 add %rax,$A1[0] # a[2]*a[1]+t[3]
1478 mov $ai,%rax
1479 adc \$0,%rdx
1480 add -8($tptr,$i),$A1[0]
1481 mov %rdx,$A1[1]
1482 adc \$0,$A1[1]
1483
1484 mul $a0 # a[3]*a[0]
1485 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
1486 mov $ai,%rax
1487 adc \$0,%rdx
1488 add $A1[0],$A0[0]
1489 mov %rdx,$A0[1]
1490 adc \$0,$A0[1]
1491 mov $A0[0],-8($tptr,$i) # t[3]
1492
1493 lea ($i),$j
1494 jmp .Lsqr4x_inner
1495
1496.align 32
1497.Lsqr4x_inner:
1498 mov ($aptr,$j),$ai # a[4]
1499 mul $a1 # a[3]*a[1]
1500 add %rax,$A1[1] # a[3]*a[1]+t[4]
1501 mov $ai,%rax
1502 mov %rdx,$A1[0]
1503 adc \$0,$A1[0]
1504 add ($tptr,$j),$A1[1]
1505 adc \$0,$A1[0]
1506
1507 .byte 0x67
1508 mul $a0 # a[4]*a[0]
1509 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
1510 mov $ai,%rax # a[3]
1511 mov 8($aptr,$j),$ai # a[5]
1512 mov %rdx,$A0[0]
1513 adc \$0,$A0[0]
1514 add $A1[1],$A0[1]
1515 adc \$0,$A0[0]
1516
1517 mul $a1 # a[4]*a[3]
1518 add %rax,$A1[0] # a[4]*a[3]+t[5]
1519 mov $A0[1],($tptr,$j) # t[4]
1520 mov $ai,%rax
1521 mov %rdx,$A1[1]
1522 adc \$0,$A1[1]
1523 add 8($tptr,$j),$A1[0]
1524 lea 16($j),$j # j++
1525 adc \$0,$A1[1]
1526
1527 mul $a0 # a[5]*a[2]
1528 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
1529 mov $ai,%rax
1530 adc \$0,%rdx
1531 add $A1[0],$A0[0]
1532 mov %rdx,$A0[1]
1533 adc \$0,$A0[1]
1534 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
1535
1536 cmp \$0,$j
1537 jne .Lsqr4x_inner
1538
1539 .byte 0x67
1540 mul $a1 # a[5]*a[3]
1541 add %rax,$A1[1]
1542 adc \$0,%rdx
1543 add $A0[1],$A1[1]
1544 adc \$0,%rdx
1545
1546 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
1547 mov %rdx,$A1[0]
1548 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below
1549
1550 add \$16,$i
1551 jnz .Lsqr4x_outer
1552
1553 # comments apply to $num==4 case
1554 mov -32($aptr),$a0 # a[0]
1555 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
1556 mov -24($aptr),%rax # a[1]
1557 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
1558 mov -16($aptr),$ai # a[2]
1559 mov %rax,$a1
1560
1561 mul $a0 # a[1]*a[0]
1562 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
1563 mov $ai,%rax # a[2]
1564 mov %rdx,$A0[1]
1565 adc \$0,$A0[1]
1566
1567 mul $a0 # a[2]*a[0]
1568 add %rax,$A0[1]
1569 mov $ai,%rax
1570 mov $A0[0],-24($tptr) # t[1]
1571 mov %rdx,$A0[0]
1572 adc \$0,$A0[0]
1573 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
1574 mov -8($aptr),$ai # a[3]
1575 adc \$0,$A0[0]
1576
1577 mul $a1 # a[2]*a[1]
1578 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
1579 mov $ai,%rax
1580 mov $A0[1],-16($tptr) # t[2]
1581 mov %rdx,$A1[1]
1582 adc \$0,$A1[1]
1583
1584 mul $a0 # a[3]*a[0]
1585 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
1586 mov $ai,%rax
1587 mov %rdx,$A0[1]
1588 adc \$0,$A0[1]
1589 add $A1[0],$A0[0]
1590 adc \$0,$A0[1]
1591 mov $A0[0],-8($tptr) # t[3]
1592
1593 mul $a1 # a[3]*a[1]
1594 add %rax,$A1[1]
1595 mov -16($aptr),%rax # a[2]
1596 adc \$0,%rdx
1597 add $A0[1],$A1[1]
1598 adc \$0,%rdx
1599
1600 mov $A1[1],($tptr) # t[4]
1601 mov %rdx,$A1[0]
1602 mov %rdx,8($tptr) # t[5]
1603
1604 mul $ai # a[2]*a[3]
1605___
1606{
1607my ($shift,$carry)=($a0,$a1);
1608my @S=(@A1,$ai,$n0);
1609$code.=<<___;
1610 add \$16,$i
1611 xor $shift,$shift
1612 sub $num,$i # $i=16-$num
1613 xor $carry,$carry
1614
1615 add $A1[0],%rax # t[5]
1616 adc \$0,%rdx
1617 mov %rax,8($tptr) # t[5]
1618 mov %rdx,16($tptr) # t[6]
1619 mov $carry,24($tptr) # t[7]
1620
1621 mov -16($aptr,$i),%rax # a[0]
1622 lea 48+8(%rsp),$tptr
1623 xor $A0[0],$A0[0] # t[0]
1624 mov 8($tptr),$A0[1] # t[1]
1625
1626 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1627 shr \$63,$A0[0]
1628 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1629 shr \$63,$A0[1]
1630 or $A0[0],$S[1] # | t[2*i]>>63
1631 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
1632 mov $A0[1],$shift # shift=t[2*i+1]>>63
1633 mul %rax # a[i]*a[i]
1634 neg $carry # mov $carry,cf
1635 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
1636 adc %rax,$S[0]
1637 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1638 mov $S[0],($tptr)
1639 adc %rdx,$S[1]
1640
1641 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1642 mov $S[1],8($tptr)
1643 sbb $carry,$carry # mov cf,$carry
1644 shr \$63,$A0[0]
1645 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1646 shr \$63,$A0[1]
1647 or $A0[0],$S[3] # | t[2*i]>>63
1648 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
1649 mov $A0[1],$shift # shift=t[2*i+1]>>63
1650 mul %rax # a[i]*a[i]
1651 neg $carry # mov $carry,cf
1652 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
1653 adc %rax,$S[2]
1654 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1655 mov $S[2],16($tptr)
1656 adc %rdx,$S[3]
1657 lea 16($i),$i
1658 mov $S[3],24($tptr)
1659 sbb $carry,$carry # mov cf,$carry
1660 lea 64($tptr),$tptr
1661 jmp .Lsqr4x_shift_n_add
1662
1663.align 32
1664.Lsqr4x_shift_n_add:
1665 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1666 shr \$63,$A0[0]
1667 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1668 shr \$63,$A0[1]
1669 or $A0[0],$S[1] # | t[2*i]>>63
1670 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
1671 mov $A0[1],$shift # shift=t[2*i+1]>>63
1672 mul %rax # a[i]*a[i]
1673 neg $carry # mov $carry,cf
1674 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1675 adc %rax,$S[0]
1676 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1677 mov $S[0],-32($tptr)
1678 adc %rdx,$S[1]
1679
1680 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1681 mov $S[1],-24($tptr)
1682 sbb $carry,$carry # mov cf,$carry
1683 shr \$63,$A0[0]
1684 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1685 shr \$63,$A0[1]
1686 or $A0[0],$S[3] # | t[2*i]>>63
1687 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch
1688 mov $A0[1],$shift # shift=t[2*i+1]>>63
1689 mul %rax # a[i]*a[i]
1690 neg $carry # mov $carry,cf
1691 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1692 adc %rax,$S[2]
1693 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1694 mov $S[2],-16($tptr)
1695 adc %rdx,$S[3]
1696
1697 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1698 mov $S[3],-8($tptr)
1699 sbb $carry,$carry # mov cf,$carry
1700 shr \$63,$A0[0]
1701 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1702 shr \$63,$A0[1]
1703 or $A0[0],$S[1] # | t[2*i]>>63
1704 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
1705 mov $A0[1],$shift # shift=t[2*i+1]>>63
1706 mul %rax # a[i]*a[i]
1707 neg $carry # mov $carry,cf
1708 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
1709 adc %rax,$S[0]
1710 mov 8($aptr,$i),%rax # a[i+1] # prefetch
1711 mov $S[0],0($tptr)
1712 adc %rdx,$S[1]
1713
1714 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1715 mov $S[1],8($tptr)
1716 sbb $carry,$carry # mov cf,$carry
1717 shr \$63,$A0[0]
1718 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1719 shr \$63,$A0[1]
1720 or $A0[0],$S[3] # | t[2*i]>>63
1721 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
1722 mov $A0[1],$shift # shift=t[2*i+1]>>63
1723 mul %rax # a[i]*a[i]
1724 neg $carry # mov $carry,cf
1725 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
1726 adc %rax,$S[2]
1727 mov 16($aptr,$i),%rax # a[i+1] # prefetch
1728 mov $S[2],16($tptr)
1729 adc %rdx,$S[3]
1730 mov $S[3],24($tptr)
1731 sbb $carry,$carry # mov cf,$carry
1732 lea 64($tptr),$tptr
1733 add \$32,$i
1734 jnz .Lsqr4x_shift_n_add
1735
1736 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1737 .byte 0x67
1738 shr \$63,$A0[0]
1739 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1740 shr \$63,$A0[1]
1741 or $A0[0],$S[1] # | t[2*i]>>63
1742 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
1743 mov $A0[1],$shift # shift=t[2*i+1]>>63
1744 mul %rax # a[i]*a[i]
1745 neg $carry # mov $carry,cf
1746 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1747 adc %rax,$S[0]
1748 mov -8($aptr),%rax # a[i+1] # prefetch
1749 mov $S[0],-32($tptr)
1750 adc %rdx,$S[1]
1751
1752 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1753 mov $S[1],-24($tptr)
1754 sbb $carry,$carry # mov cf,$carry
1755 shr \$63,$A0[0]
1756 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1757 shr \$63,$A0[1]
1758 or $A0[0],$S[3] # | t[2*i]>>63
1759 mul %rax # a[i]*a[i]
1760 neg $carry # mov $carry,cf
1761 adc %rax,$S[2]
1762 adc %rdx,$S[3]
1763 mov $S[2],-16($tptr)
1764 mov $S[3],-8($tptr)
1765___
1766}
1767
1768######################################################################
1769# Montgomery reduction part, "word-by-word" algorithm.
1770#
1771# This new path is inspired by multiple submissions from Intel, by
1772# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1773# Vinodh Gopal...
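#
# [Scalar sketch of a word-by-word Montgomery reduction of this kind; it is
# not used by the generator and is only meant to show the shape of the loop
# below.  t[] holds the 2*num-word squaring result, n[] the modulus, and
# $n0 = -n[0]^{-1} mod 2^64.  Words are assumed to be Math::BigInt values so
# the carries stay exact; the helper name is made up.]
#
use Math::BigInt;
sub _word_by_word_reduction_sketch {
	my ($t, $n, $n0, $num) = @_;		# array refs of words, plus $n0
	my $w = Math::BigInt->new(2)->bpow(64);
	for my $i (0 .. $num-1) {
		my $m = ($t->[$i] * $n0) % $w;	# multiplier that clears t[i]
		my $carry = 0;
		for my $j (0 .. $num-1) {
			my $s = $t->[$i+$j] + $m * $n->[$j] + $carry;
			$t->[$i+$j] = $s % $w;
			$carry      = $s / $w;	# exact, operands are BigInts
		}
		for (my $j = $i+$num; $carry != 0; $j++) {	# ripple carry upwards
			my $s = ($t->[$j] // 0) + $carry;
			$t->[$j] = $s % $w;
			$carry   = $s / $w;
		}
	}
	# the upper half of t[] (plus a possible top word) now holds t/R; the
	# post-condition still has to subtract n once if the result is >= n
	return [ @{$t}[$num .. $#{$t}] ];
}
#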
1774{
1775my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1776
1777$code.=<<___;
1778 movq %xmm2,$nptr
1779__bn_sqr8x_reduction:
1780 xor %rax,%rax
1781 lea ($nptr,$num),%rcx # end of n[]
1782 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
1783 mov %rcx,0+8(%rsp)
1784 lea 48+8(%rsp,$num),$tptr # end of initial t[] window
1785 mov %rdx,8+8(%rsp)
1786 neg $num
1787 jmp .L8x_reduction_loop
1788
1789.align 32
1790.L8x_reduction_loop:
1791 lea ($tptr,$num),$tptr # start of current t[] window
1792 .byte 0x66
1793 mov 8*0($tptr),$m0
1794 mov 8*1($tptr),%r9
1795 mov 8*2($tptr),%r10
1796 mov 8*3($tptr),%r11
1797 mov 8*4($tptr),%r12
1798 mov 8*5($tptr),%r13
1799 mov 8*6($tptr),%r14
1800 mov 8*7($tptr),%r15
1801 mov %rax,(%rdx) # store top-most carry bit
1802 lea 8*8($tptr),$tptr
1803
1804 .byte 0x67
1805 mov $m0,%r8
1806 imulq 32+8(%rsp),$m0 # n0*a[0]
1807 mov 8*0($nptr),%rax # n[0]
1808 mov \$8,%ecx
1809 jmp .L8x_reduce
1810
1811.align 32
1812.L8x_reduce:
1813 mulq $m0
1814 mov 8*1($nptr),%rax # n[1]
1815 neg %r8
1816 mov %rdx,%r8
1817 adc \$0,%r8
1818
1819 mulq $m0
1820 add %rax,%r9
1821 mov 8*2($nptr),%rax
1822 adc \$0,%rdx
1823 add %r9,%r8
1824 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
1825 mov %rdx,%r9
1826 adc \$0,%r9
1827
1828 mulq $m0
1829 add %rax,%r10
1830 mov 8*3($nptr),%rax
1831 adc \$0,%rdx
1832 add %r10,%r9
1833 mov 32+8(%rsp),$carry # pull n0, borrow $carry
1834 mov %rdx,%r10
1835 adc \$0,%r10
1836
1837 mulq $m0
1838 add %rax,%r11
1839 mov 8*4($nptr),%rax
1840 adc \$0,%rdx
1841 imulq %r8,$carry # modulo-scheduled
1842 add %r11,%r10
1843 mov %rdx,%r11
1844 adc \$0,%r11
1845
1846 mulq $m0
1847 add %rax,%r12
1848 mov 8*5($nptr),%rax
1849 adc \$0,%rdx
1850 add %r12,%r11
1851 mov %rdx,%r12
1852 adc \$0,%r12
1853
1854 mulq $m0
1855 add %rax,%r13
1856 mov 8*6($nptr),%rax
1857 adc \$0,%rdx
1858 add %r13,%r12
1859 mov %rdx,%r13
1860 adc \$0,%r13
1861
1862 mulq $m0
1863 add %rax,%r14
1864 mov 8*7($nptr),%rax
1865 adc \$0,%rdx
1866 add %r14,%r13
1867 mov %rdx,%r14
1868 adc \$0,%r14
1869
1870 mulq $m0
1871 mov $carry,$m0 # n0*a[i]
1872 add %rax,%r15
1873 mov 8*0($nptr),%rax # n[0]
1874 adc \$0,%rdx
1875 add %r15,%r14
1876 mov %rdx,%r15
1877 adc \$0,%r15
1878
1879 dec %ecx
1880 jnz .L8x_reduce
1881
1882 lea 8*8($nptr),$nptr
1883 xor %rax,%rax
1884 mov 8+8(%rsp),%rdx # pull end of t[]
1885 cmp 0+8(%rsp),$nptr # end of n[]?
1886 jae .L8x_no_tail
1887
1888 .byte 0x66
1889 add 8*0($tptr),%r8
1890 adc 8*1($tptr),%r9
1891 adc 8*2($tptr),%r10
1892 adc 8*3($tptr),%r11
1893 adc 8*4($tptr),%r12
1894 adc 8*5($tptr),%r13
1895 adc 8*6($tptr),%r14
1896 adc 8*7($tptr),%r15
1897 sbb $carry,$carry # top carry
1898
1899 mov 48+56+8(%rsp),$m0 # pull n0*a[0]
1900 mov \$8,%ecx
1901 mov 8*0($nptr),%rax
1902 jmp .L8x_tail
1903
1904.align 32
1905.L8x_tail:
1906 mulq $m0
1907 add %rax,%r8
1908 mov 8*1($nptr),%rax
1909 mov %r8,($tptr) # save result
1910 mov %rdx,%r8
1911 adc \$0,%r8
1912
1913 mulq $m0
1914 add %rax,%r9
1915 mov 8*2($nptr),%rax
1916 adc \$0,%rdx
1917 add %r9,%r8
1918 lea 8($tptr),$tptr # $tptr++
1919 mov %rdx,%r9
1920 adc \$0,%r9
1921
1922 mulq $m0
1923 add %rax,%r10
1924 mov 8*3($nptr),%rax
1925 adc \$0,%rdx
1926 add %r10,%r9
1927 mov %rdx,%r10
1928 adc \$0,%r10
1929
1930 mulq $m0
1931 add %rax,%r11
1932 mov 8*4($nptr),%rax
1933 adc \$0,%rdx
1934 add %r11,%r10
1935 mov %rdx,%r11
1936 adc \$0,%r11
1937
1938 mulq $m0
1939 add %rax,%r12
1940 mov 8*5($nptr),%rax
1941 adc \$0,%rdx
1942 add %r12,%r11
1943 mov %rdx,%r12
1944 adc \$0,%r12
1945
1946 mulq $m0
1947 add %rax,%r13
1948 mov 8*6($nptr),%rax
1949 adc \$0,%rdx
1950 add %r13,%r12
1951 mov %rdx,%r13
1952 adc \$0,%r13
1953
1954 mulq $m0
1955 add %rax,%r14
1956 mov 8*7($nptr),%rax
1957 adc \$0,%rdx
1958 add %r14,%r13
1959 mov %rdx,%r14
1960 adc \$0,%r14
1961
1962 mulq $m0
1963 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
1964 add %rax,%r15
1965 adc \$0,%rdx
1966 add %r15,%r14
1967 mov 8*0($nptr),%rax # pull n[0]
1968 mov %rdx,%r15
1969 adc \$0,%r15
1970
1971 dec %ecx
1972 jnz .L8x_tail
1973
1974 lea 8*8($nptr),$nptr
1975 mov 8+8(%rsp),%rdx # pull end of t[]
1976 cmp 0+8(%rsp),$nptr # end of n[]?
1977 jae .L8x_tail_done # break out of loop
1978
1979 mov 48+56+8(%rsp),$m0 # pull n0*a[0]
1980 neg $carry
1981 mov 8*0($nptr),%rax # pull n[0]
1982 adc 8*0($tptr),%r8
1983 adc 8*1($tptr),%r9
1984 adc 8*2($tptr),%r10
1985 adc 8*3($tptr),%r11
1986 adc 8*4($tptr),%r12
1987 adc 8*5($tptr),%r13
1988 adc 8*6($tptr),%r14
1989 adc 8*7($tptr),%r15
1990 sbb $carry,$carry # top carry
1991
1992 mov \$8,%ecx
1993 jmp .L8x_tail
1994
1995.align 32
1996.L8x_tail_done:
1997 xor %rax,%rax
1998 add (%rdx),%r8 # can this overflow?
1999 adc \$0,%r9
2000 adc \$0,%r10
2001 adc \$0,%r11
2002 adc \$0,%r12
2003 adc \$0,%r13
2004 adc \$0,%r14
2005 adc \$0,%r15
2006 adc \$0,%rax
2007
2008 neg $carry
2009.L8x_no_tail:
2010 adc 8*0($tptr),%r8
2011 adc 8*1($tptr),%r9
2012 adc 8*2($tptr),%r10
2013 adc 8*3($tptr),%r11
2014 adc 8*4($tptr),%r12
2015 adc 8*5($tptr),%r13
2016 adc 8*6($tptr),%r14
2017 adc 8*7($tptr),%r15
2018 adc \$0,%rax # top-most carry
2019 mov -8($nptr),%rcx # np[num-1]
2020 xor $carry,$carry
2021
2022 movq %xmm2,$nptr # restore $nptr
2023
2024 mov %r8,8*0($tptr) # store top 512 bits
2025 mov %r9,8*1($tptr)
2026 movq %xmm3,$num # $num is %r9, can't be moved upwards
2027 mov %r10,8*2($tptr)
2028 mov %r11,8*3($tptr)
2029 mov %r12,8*4($tptr)
2030 mov %r13,8*5($tptr)
2031 mov %r14,8*6($tptr)
2032 mov %r15,8*7($tptr)
2033 lea 8*8($tptr),$tptr
2034
2035 cmp %rdx,$tptr # end of t[]?
2036 jb .L8x_reduction_loop
2037 ret
2038.cfi_endproc
2039.size bn_sqr8x_internal,.-bn_sqr8x_internal
2040___
2041}
2042
2043##############################################################
2044# Post-condition, 4x unrolled
2045#
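#
# [Whole-number sketch of the post-condition, i.e. the final reduction from
# t < 2*n down to t mod n; not used by the generator.  The ternary select is
# for clarity only: the code below does the same thing word by word, four
# words per iteration, using AND masks derived from the borrow instead of a
# data-dependent branch.]
#
use Math::BigInt;
sub _post_condition_sketch {
	my ($t, $n) = @_;			# Math::BigInt, with 0 <= t < 2*n
	my $d = $t->copy->bsub($n);		# t - n, may be negative
	return $d->is_neg ? $t : $d;		# select t or t-n based on the borrow
}
#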
2046{
2047my ($tptr,$nptr)=("%rbx","%rbp");
2048$code.=<<___;
2049.type __bn_post4x_internal,\@abi-omnipotent
2050.align 32
2051__bn_post4x_internal:
2052.cfi_startproc
2053 mov 8*0($nptr),%r12
2054 lea (%rdi,$num),$tptr # %rdi was $tptr above
2055 mov $num,%rcx
2056 movq %xmm1,$rptr # restore $rptr
2057 neg %rax
2058 movq %xmm1,$aptr # prepare for back-to-back call
2059 sar \$3+2,%rcx
2060 dec %r12 # so that after 'not' we get -n[0]
2061 xor %r10,%r10
2062 mov 8*1($nptr),%r13
2063 mov 8*2($nptr),%r14
2064 mov 8*3($nptr),%r15
2065 jmp .Lsqr4x_sub_entry
2066
2067.align 16
2068.Lsqr4x_sub:
2069 mov 8*0($nptr),%r12
2070 mov 8*1($nptr),%r13
2071 mov 8*2($nptr),%r14
2072 mov 8*3($nptr),%r15
2073.Lsqr4x_sub_entry:
2074 lea 8*4($nptr),$nptr
2075 not %r12
2076 not %r13
2077 not %r14
2078 not %r15
2079 and %rax,%r12
2080 and %rax,%r13
2081 and %rax,%r14
2082 and %rax,%r15
2083
2084 neg %r10 # mov %r10,%cf
2085 adc 8*0($tptr),%r12
2086 adc 8*1($tptr),%r13
2087 adc 8*2($tptr),%r14
2088 adc 8*3($tptr),%r15
2089 mov %r12,8*0($rptr)
2090 lea 8*4($tptr),$tptr
2091 mov %r13,8*1($rptr)
2092 sbb %r10,%r10 # mov %cf,%r10
2093 mov %r14,8*2($rptr)
2094 mov %r15,8*3($rptr)
2095 lea 8*4($rptr),$rptr
2096
2097 inc %rcx # pass %cf
2098 jnz .Lsqr4x_sub
2099
2100 mov $num,%r10 # prepare for back-to-back call
2101 neg $num # restore $num
2102 ret
2103.cfi_endproc
2104.size __bn_post4x_internal,.-__bn_post4x_internal
2105___
2106}
2107{
2108$code.=<<___;
2109.globl bn_from_montgomery
2110.type bn_from_montgomery,\@abi-omnipotent
2111.align 32
2112bn_from_montgomery:
2113.cfi_startproc
2114 testl \$7,`($win64?"48(%rsp)":"%r9d")`
2115 jz bn_from_mont8x
2116 xor %eax,%eax
2117 ret
2118.cfi_endproc
2119.size bn_from_montgomery,.-bn_from_montgomery
2120
2121.type bn_from_mont8x,\@function,6
2122.align 32
2123bn_from_mont8x:
2124.cfi_startproc
2125 .byte 0x67
2126 mov %rsp,%rax
2127.cfi_def_cfa_register %rax
2128 push %rbx
2129.cfi_push %rbx
2130 push %rbp
2131.cfi_push %rbp
2132 push %r12
2133.cfi_push %r12
2134 push %r13
2135.cfi_push %r13
2136 push %r14
2137.cfi_push %r14
2138 push %r15
2139.cfi_push %r15
2140.Lfrom_prologue:
2141
2142 shl \$3,${num}d # convert $num to bytes
2143 lea ($num,$num,2),%r10 # 3*$num in bytes
2144 neg $num
2145 mov ($n0),$n0 # *n0
2146
2147 ##############################################################
2148 # Ensure that stack frame doesn't alias with $rptr+3*$num
2149 # modulo 4096, which covers ret[num], am[num] and n[num]
2150	# (see bn_exp.c). The stack is allocated so as to align with
2151	# bn_power5's frame, and as bn_from_montgomery happens to be the
2152	# last operation, we use the opportunity to cleanse it.
2153 #
2154 lea -320(%rsp,$num,2),%r11
2155 mov %rsp,%rbp
2156 sub $rptr,%r11
2157 and \$4095,%r11
2158 cmp %r11,%r10
2159 jb .Lfrom_sp_alt
2160 sub %r11,%rbp # align with $aptr
2161 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2162 jmp .Lfrom_sp_done
2163
2164.align 32
2165.Lfrom_sp_alt:
2166 lea 4096-320(,$num,2),%r10
2167 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2168 sub %r10,%r11
2169 mov \$0,%r10
2170 cmovc %r10,%r11
2171 sub %r11,%rbp
2172.Lfrom_sp_done:
2173 and \$-64,%rbp
2174 mov %rsp,%r11
2175 sub %rbp,%r11
2176 and \$-4096,%r11
2177 lea (%rbp,%r11),%rsp
2178 mov (%rsp),%r10
2179 cmp %rbp,%rsp
2180 ja .Lfrom_page_walk
2181 jmp .Lfrom_page_walk_done
2182
2183.Lfrom_page_walk:
2184 lea -4096(%rsp),%rsp
2185 mov (%rsp),%r10
2186 cmp %rbp,%rsp
2187 ja .Lfrom_page_walk
2188.Lfrom_page_walk_done:
2189
2190 mov $num,%r10
2191 neg $num
2192
2193 ##############################################################
2194 # Stack layout
2195 #
2196 # +0 saved $num, used in reduction section
2197 # +8 &t[2*$num], used in reduction section
2198 # +32 saved *n0
2199 # +40 saved %rsp
2200 # +48 t[2*$num]
2201 #
2202 mov $n0, 32(%rsp)
2203 mov %rax, 40(%rsp) # save original %rsp
2204.cfi_cfa_expression %rsp+40,deref,+8
2205.Lfrom_body:
2206 mov $num,%r11
2207 lea 48(%rsp),%rax
2208 pxor %xmm0,%xmm0
2209 jmp .Lmul_by_1
2210
2211.align 32
2212.Lmul_by_1:
2213 movdqu ($aptr),%xmm1
2214 movdqu 16($aptr),%xmm2
2215 movdqu 32($aptr),%xmm3
2216 movdqa %xmm0,(%rax,$num)
2217 movdqu 48($aptr),%xmm4
2218 movdqa %xmm0,16(%rax,$num)
2219 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
2220 movdqa %xmm1,(%rax)
2221 movdqa %xmm0,32(%rax,$num)
2222 movdqa %xmm2,16(%rax)
2223 movdqa %xmm0,48(%rax,$num)
2224 movdqa %xmm3,32(%rax)
2225 movdqa %xmm4,48(%rax)
2226 lea 64(%rax),%rax
2227 sub \$64,%r11
2228 jnz .Lmul_by_1
2229
2230 movq $rptr,%xmm1
2231 movq $nptr,%xmm2
2232 .byte 0x67
2233 mov $nptr,%rbp
2234 movq %r10, %xmm3 # -num
2235___
2236$code.=<<___ if ($addx);
2237 mov OPENSSL_ia32cap_P+8(%rip),%r11d
2238 and \$0x80108,%r11d
2239 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
2240 jne .Lfrom_mont_nox
2241
2242 lea (%rax,$num),$rptr
2243 call __bn_sqrx8x_reduction
2244 call __bn_postx4x_internal
2245
2246 pxor %xmm0,%xmm0
2247 lea 48(%rsp),%rax
2248 jmp .Lfrom_mont_zero
2249
2250.align 32
2251.Lfrom_mont_nox:
2252___
2253$code.=<<___;
2254 call __bn_sqr8x_reduction
2255 call __bn_post4x_internal
2256
2257 pxor %xmm0,%xmm0
2258 lea 48(%rsp),%rax
2259 jmp .Lfrom_mont_zero
2260
2261.align 32
2262.Lfrom_mont_zero:
2263 mov 40(%rsp),%rsi # restore %rsp
2264.cfi_def_cfa %rsi,8
2265 movdqa %xmm0,16*0(%rax)
2266 movdqa %xmm0,16*1(%rax)
2267 movdqa %xmm0,16*2(%rax)
2268 movdqa %xmm0,16*3(%rax)
2269 lea 16*4(%rax),%rax
2270 sub \$32,$num
2271 jnz .Lfrom_mont_zero
2272
2273 mov \$1,%rax
2274 mov -48(%rsi),%r15
2275.cfi_restore %r15
2276 mov -40(%rsi),%r14
2277.cfi_restore %r14
2278 mov -32(%rsi),%r13
2279.cfi_restore %r13
2280 mov -24(%rsi),%r12
2281.cfi_restore %r12
2282 mov -16(%rsi),%rbp
2283.cfi_restore %rbp
2284 mov -8(%rsi),%rbx
2285.cfi_restore %rbx
2286 lea (%rsi),%rsp
2287.cfi_def_cfa_register %rsp
2288.Lfrom_epilogue:
2289 ret
2290.cfi_endproc
2291.size bn_from_mont8x,.-bn_from_mont8x
2292___
2293}
2294}}}
2295
2296
2297if ($addx) {{{
2298my $bp="%rdx"; # restore original value
2299
2300$code.=<<___;
2301.type bn_mulx4x_mont_gather5,\@function,6
2302.align 32
2303bn_mulx4x_mont_gather5:
2304.cfi_startproc
2305 mov %rsp,%rax
2306.cfi_def_cfa_register %rax
2307.Lmulx4x_enter:
2308 push %rbx
2309.cfi_push %rbx
2310 push %rbp
2311.cfi_push %rbp
2312 push %r12
2313.cfi_push %r12
2314 push %r13
2315.cfi_push %r13
2316 push %r14
2317.cfi_push %r14
2318 push %r15
2319.cfi_push %r15
2320.Lmulx4x_prologue:
2321
2322 shl \$3,${num}d # convert $num to bytes
2323 lea ($num,$num,2),%r10 # 3*$num in bytes
2324 neg $num # -$num
2325 mov ($n0),$n0 # *n0
2326
2327 ##############################################################
2328 # Ensure that stack frame doesn't alias with $rptr+3*$num
2329 # modulo 4096, which covers ret[num], am[num] and n[num]
2330	# (see bn_exp.c). This is done to allow the memory disambiguation
2331	# logic to do its magic. [Extra [num] is allocated in order
2332 # to align with bn_power5's frame, which is cleansed after
2333 # completing exponentiation. Extra 256 bytes is for power mask
2334 # calculated from 7th argument, the index.]
2335 #
2336 lea -320(%rsp,$num,2),%r11
2337 mov %rsp,%rbp
2338 sub $rp,%r11
2339 and \$4095,%r11
2340 cmp %r11,%r10
2341 jb .Lmulx4xsp_alt
2342 sub %r11,%rbp # align with $aptr
2343 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2344 jmp .Lmulx4xsp_done
2345
2346.Lmulx4xsp_alt:
2347 lea 4096-320(,$num,2),%r10
2348 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2349 sub %r10,%r11
2350 mov \$0,%r10
2351 cmovc %r10,%r11
2352 sub %r11,%rbp
2353.Lmulx4xsp_done:
2354 and \$-64,%rbp # ensure alignment
2355 mov %rsp,%r11
2356 sub %rbp,%r11
2357 and \$-4096,%r11
2358 lea (%rbp,%r11),%rsp
2359 mov (%rsp),%r10
2360 cmp %rbp,%rsp
2361 ja .Lmulx4x_page_walk
2362 jmp .Lmulx4x_page_walk_done
2363
2364.Lmulx4x_page_walk:
2365 lea -4096(%rsp),%rsp
2366 mov (%rsp),%r10
2367 cmp %rbp,%rsp
2368 ja .Lmulx4x_page_walk
2369.Lmulx4x_page_walk_done:
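#
# The walk above drops %rsp one 4096-byte page at a time and loads a word
# at every step, so the guard page below the old stack pointer is touched
# in order rather than skipped over.  A minimal C sketch of the idea
# (probe_stack and its arguments are illustrative names, not part of this
# file):
#
#	/* cur_sp: current stack pointer, new_sp: new, lower frame base */
#	static void probe_stack(char *cur_sp, char *new_sp)
#	{
#	    while (cur_sp > new_sp) {
#	        cur_sp -= 4096;
#	        if (cur_sp < new_sp)
#	            cur_sp = new_sp;
#	        (void)*(volatile char *)cur_sp;	/* the per-page probe load */
#	    }
#	}
#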
2370
2371 ##############################################################
2372 # Stack layout
2373 # +0 -num
2374 # +8 off-loaded &b[i]
2375 # +16 end of b[num]
2376 # +24 inner counter
2377 # +32 saved n0
2378 # +40 saved %rsp
2379 # +48
2380 # +56 saved rp
2381 # +64 tmp[num+1]
2382 #
2383 mov $n0, 32(%rsp) # save *n0
2384 mov %rax,40(%rsp) # save original %rsp
2385.cfi_cfa_expression %rsp+40,deref,+8
2386.Lmulx4x_body:
2387 call mulx4x_internal
2388
2389 mov 40(%rsp),%rsi # restore %rsp
2390.cfi_def_cfa %rsi,8
2391 mov \$1,%rax
2392
2393 mov -48(%rsi),%r15
2394.cfi_restore %r15
2395 mov -40(%rsi),%r14
2396.cfi_restore %r14
2397 mov -32(%rsi),%r13
2398.cfi_restore %r13
2399 mov -24(%rsi),%r12
2400.cfi_restore %r12
2401 mov -16(%rsi),%rbp
2402.cfi_restore %rbp
2403 mov -8(%rsi),%rbx
2404.cfi_restore %rbx
2405 lea (%rsi),%rsp
2406.cfi_def_cfa_register %rsp
2407.Lmulx4x_epilogue:
2408 ret
2409.cfi_endproc
2410.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2411
2412.type mulx4x_internal,\@abi-omnipotent
2413.align 32
2414mulx4x_internal:
2415.cfi_startproc
2416 mov $num,8(%rsp) # save -$num (it was in bytes)
2417 mov $num,%r10
2418 neg $num # restore $num
2419 shl \$5,$num
2420 neg %r10 # restore $num
2421 lea 128($bp,$num),%r13 # end of powers table (+size optimization)
2422 shr \$5+5,$num
2423 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument
2424 sub \$1,$num
2425 lea .Linc(%rip),%rax
2426 mov %r13,16+8(%rsp) # end of b[num]
2427 mov $num,24+8(%rsp) # inner counter
2428 mov $rp, 56+8(%rsp) # save $rp
2429___
2430my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
2431 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2432my $rptr=$bptr;
2433my $STRIDE=2**5*8; # 5 is "window size"
2434my $N=$STRIDE/4; # should match cache line size
2435$code.=<<___;
2436 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
2437 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
2438 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization)
2439 lea 128($bp),$bptr # size optimization
2440
2441 pshufd \$0,%xmm5,%xmm5 # broadcast index
2442 movdqa %xmm1,%xmm4
2443 .byte 0x67
2444 movdqa %xmm1,%xmm2
2445___
2446########################################################################
2447# calculate mask by comparing 0..31 to index and save result to stack
2448#
2449$code.=<<___;
2450 .byte 0x67
2451 paddd %xmm0,%xmm1
2452 pcmpeqd %xmm5,%xmm0 # compare to 1,0
2453 movdqa %xmm4,%xmm3
2454___
2455for($i=0;$i<$STRIDE/16-4;$i+=4) {
2456$code.=<<___;
2457 paddd %xmm1,%xmm2
2458 pcmpeqd %xmm5,%xmm1 # compare to 3,2
2459 movdqa %xmm0,`16*($i+0)+112`(%r10)
2460 movdqa %xmm4,%xmm0
2461
2462 paddd %xmm2,%xmm3
2463 pcmpeqd %xmm5,%xmm2 # compare to 5,4
2464 movdqa %xmm1,`16*($i+1)+112`(%r10)
2465 movdqa %xmm4,%xmm1
2466
2467 paddd %xmm3,%xmm0
2468 pcmpeqd %xmm5,%xmm3 # compare to 7,6
2469 movdqa %xmm2,`16*($i+2)+112`(%r10)
2470 movdqa %xmm4,%xmm2
2471
2472 paddd %xmm0,%xmm1
2473 pcmpeqd %xmm5,%xmm0
2474 movdqa %xmm3,`16*($i+3)+112`(%r10)
2475 movdqa %xmm4,%xmm3
2476___
2477}
2478$code.=<<___; # last iteration can be optimized
2479 .byte 0x67
2480 paddd %xmm1,%xmm2
2481 pcmpeqd %xmm5,%xmm1
2482 movdqa %xmm0,`16*($i+0)+112`(%r10)
2483
2484 paddd %xmm2,%xmm3
2485 pcmpeqd %xmm5,%xmm2
2486 movdqa %xmm1,`16*($i+1)+112`(%r10)
2487
2488 pcmpeqd %xmm5,%xmm3
2489 movdqa %xmm2,`16*($i+2)+112`(%r10)
2490
2491 pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register
2492 pand `16*($i+1)-128`($bptr),%xmm1
2493 pand `16*($i+2)-128`($bptr),%xmm2
2494 movdqa %xmm3,`16*($i+3)+112`(%r10)
2495 pand `16*($i+3)-128`($bptr),%xmm3
2496 por %xmm2,%xmm0
2497 por %xmm3,%xmm1
2498___
2499for($i=0;$i<$STRIDE/16-4;$i+=4) {
2500$code.=<<___;
2501 movdqa `16*($i+0)-128`($bptr),%xmm4
2502 movdqa `16*($i+1)-128`($bptr),%xmm5
2503 movdqa `16*($i+2)-128`($bptr),%xmm2
2504 pand `16*($i+0)+112`(%r10),%xmm4
2505 movdqa `16*($i+3)-128`($bptr),%xmm3
2506 pand `16*($i+1)+112`(%r10),%xmm5
2507 por %xmm4,%xmm0
2508 pand `16*($i+2)+112`(%r10),%xmm2
2509 por %xmm5,%xmm1
2510 pand `16*($i+3)+112`(%r10),%xmm3
2511 por %xmm2,%xmm0
2512 por %xmm3,%xmm1
2513___
2514}
2515$code.=<<___;
2516 pxor %xmm1,%xmm0
2517 pshufd \$0x4e,%xmm0,%xmm1
2518 por %xmm1,%xmm0
2519 lea $STRIDE($bptr),$bptr
2520 movq %xmm0,%rdx # bp[0]
2521 lea 64+8*4+8(%rsp),$tptr
2522
2523 mov %rdx,$bi
2524 mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
2525 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0]
2526 add %rax,%r11
2527 mulx 2*8($aptr),%rax,%r13 # ...
2528 adc %rax,%r12
2529 adc \$0,%r13
2530 mulx 3*8($aptr),%rax,%r14
2531
2532 mov $mi,%r15
2533 imulq 32+8(%rsp),$mi # "t[0]"*n0
2534 xor $zero,$zero # cf=0, of=0
2535 mov $mi,%rdx
2536
2537 mov $bptr,8+8(%rsp) # off-load &b[i]
2538
2539 lea 4*8($aptr),$aptr
2540 adcx %rax,%r13
2541 adcx $zero,%r14 # cf=0
2542
2543 mulx 0*8($nptr),%rax,%r10
2544 adcx %rax,%r15 # discarded
2545 adox %r11,%r10
2546 mulx 1*8($nptr),%rax,%r11
2547 adcx %rax,%r10
2548 adox %r12,%r11
2549 mulx 2*8($nptr),%rax,%r12
2550 mov 24+8(%rsp),$bptr # counter value
2551 mov %r10,-8*4($tptr)
2552 adcx %rax,%r11
2553 adox %r13,%r12
2554 mulx 3*8($nptr),%rax,%r15
2555 mov $bi,%rdx
2556 mov %r11,-8*3($tptr)
2557 adcx %rax,%r12
2558 adox $zero,%r15 # of=0
2559 lea 4*8($nptr),$nptr
2560 mov %r12,-8*2($tptr)
2561 jmp .Lmulx4x_1st
2562
2563.align 32
2564.Lmulx4x_1st:
2565 adcx $zero,%r15 # cf=0, modulo-scheduled
2566 mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
2567 adcx %r14,%r10
2568 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
2569 adcx %rax,%r11
2570 mulx 2*8($aptr),%r12,%rax # ...
2571 adcx %r14,%r12
2572 mulx 3*8($aptr),%r13,%r14
2573 .byte 0x67,0x67
2574 mov $mi,%rdx
2575 adcx %rax,%r13
2576 adcx $zero,%r14 # cf=0
2577 lea 4*8($aptr),$aptr
2578 lea 4*8($tptr),$tptr
2579
2580 adox %r15,%r10
2581 mulx 0*8($nptr),%rax,%r15
2582 adcx %rax,%r10
2583 adox %r15,%r11
2584 mulx 1*8($nptr),%rax,%r15
2585 adcx %rax,%r11
2586 adox %r15,%r12
2587 mulx 2*8($nptr),%rax,%r15
2588 mov %r10,-5*8($tptr)
2589 adcx %rax,%r12
2590 mov %r11,-4*8($tptr)
2591 adox %r15,%r13
2592 mulx 3*8($nptr),%rax,%r15
2593 mov $bi,%rdx
2594 mov %r12,-3*8($tptr)
2595 adcx %rax,%r13
2596 adox $zero,%r15
2597 lea 4*8($nptr),$nptr
2598 mov %r13,-2*8($tptr)
2599
2600 dec $bptr # of=0, pass cf
2601 jnz .Lmulx4x_1st
2602
2603 mov 8(%rsp),$num # load -num
2604 adc $zero,%r15 # modulo-scheduled
2605 lea ($aptr,$num),$aptr # rewind $aptr
2606 add %r15,%r14
2607 mov 8+8(%rsp),$bptr # re-load &b[i]
2608 adc $zero,$zero # top-most carry
2609 mov %r14,-1*8($tptr)
2610 jmp .Lmulx4x_outer
2611
2612.align 32
2613.Lmulx4x_outer:
2614 lea 16-256($tptr),%r10 # where 256-byte mask is (+density control)
2615 pxor %xmm4,%xmm4
2616 .byte 0x67,0x67
2617 pxor %xmm5,%xmm5
2618___
2619for($i=0;$i<$STRIDE/16;$i+=4) {
2620$code.=<<___;
2621 movdqa `16*($i+0)-128`($bptr),%xmm0
2622 movdqa `16*($i+1)-128`($bptr),%xmm1
2623 movdqa `16*($i+2)-128`($bptr),%xmm2
2624 pand `16*($i+0)+256`(%r10),%xmm0
2625 movdqa `16*($i+3)-128`($bptr),%xmm3
2626 pand `16*($i+1)+256`(%r10),%xmm1
2627 por %xmm0,%xmm4
2628 pand `16*($i+2)+256`(%r10),%xmm2
2629 por %xmm1,%xmm5
2630 pand `16*($i+3)+256`(%r10),%xmm3
2631 por %xmm2,%xmm4
2632 por %xmm3,%xmm5
2633___
2634}
2635$code.=<<___;
2636 por %xmm5,%xmm4
2637 pshufd \$0x4e,%xmm4,%xmm0
2638 por %xmm4,%xmm0
2639 lea $STRIDE($bptr),$bptr
2640 movq %xmm0,%rdx # m0=bp[i]
2641
2642 mov $zero,($tptr) # save top-most carry
2643 lea 4*8($tptr,$num),$tptr # rewind $tptr
2644 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
2645 xor $zero,$zero # cf=0, of=0
2646 mov %rdx,$bi
2647 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
2648 adox -4*8($tptr),$mi # +t[0]
2649 adcx %r14,%r11
2650 mulx 2*8($aptr),%r15,%r13 # ...
2651 adox -3*8($tptr),%r11
2652 adcx %r15,%r12
2653 mulx 3*8($aptr),%rdx,%r14
2654 adox -2*8($tptr),%r12
2655 adcx %rdx,%r13
2656 lea ($nptr,$num),$nptr # rewind $nptr
2657 lea 4*8($aptr),$aptr
2658 adox -1*8($tptr),%r13
2659 adcx $zero,%r14
2660 adox $zero,%r14
2661
2662 mov $mi,%r15
2663 imulq 32+8(%rsp),$mi # "t[0]"*n0
2664
2665 mov $mi,%rdx
2666 xor $zero,$zero # cf=0, of=0
2667 mov $bptr,8+8(%rsp) # off-load &b[i]
2668
2669 mulx 0*8($nptr),%rax,%r10
2670 adcx %rax,%r15 # discarded
2671 adox %r11,%r10
2672 mulx 1*8($nptr),%rax,%r11
2673 adcx %rax,%r10
2674 adox %r12,%r11
2675 mulx 2*8($nptr),%rax,%r12
2676 adcx %rax,%r11
2677 adox %r13,%r12
2678 mulx 3*8($nptr),%rax,%r15
2679 mov $bi,%rdx
2680 mov 24+8(%rsp),$bptr # counter value
2681 mov %r10,-8*4($tptr)
2682 adcx %rax,%r12
2683 mov %r11,-8*3($tptr)
2684 adox $zero,%r15 # of=0
2685 mov %r12,-8*2($tptr)
2686 lea 4*8($nptr),$nptr
2687 jmp .Lmulx4x_inner
2688
2689.align 32
2690.Lmulx4x_inner:
2691 mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
2692 adcx $zero,%r15 # cf=0, modulo-scheduled
2693 adox %r14,%r10
2694 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
2695 adcx 0*8($tptr),%r10
2696 adox %rax,%r11
2697 mulx 2*8($aptr),%r12,%rax # ...
2698 adcx 1*8($tptr),%r11
2699 adox %r14,%r12
2700 mulx 3*8($aptr),%r13,%r14
2701 mov $mi,%rdx
2702 adcx 2*8($tptr),%r12
2703 adox %rax,%r13
2704 adcx 3*8($tptr),%r13
2705 adox $zero,%r14 # of=0
2706 lea 4*8($aptr),$aptr
2707 lea 4*8($tptr),$tptr
2708 adcx $zero,%r14 # cf=0
2709
2710 adox %r15,%r10
2711 mulx 0*8($nptr),%rax,%r15
2712 adcx %rax,%r10
2713 adox %r15,%r11
2714 mulx 1*8($nptr),%rax,%r15
2715 adcx %rax,%r11
2716 adox %r15,%r12
2717 mulx 2*8($nptr),%rax,%r15
2718 mov %r10,-5*8($tptr)
2719 adcx %rax,%r12
2720 adox %r15,%r13
2721 mov %r11,-4*8($tptr)
2722 mulx 3*8($nptr),%rax,%r15
2723 mov $bi,%rdx
2724 lea 4*8($nptr),$nptr
2725 mov %r12,-3*8($tptr)
2726 adcx %rax,%r13
2727 adox $zero,%r15
2728 mov %r13,-2*8($tptr)
2729
2730 dec $bptr # of=0, pass cf
2731 jnz .Lmulx4x_inner
2732
2733 mov 0+8(%rsp),$num # load -num
2734 adc $zero,%r15 # modulo-scheduled
2735 sub 0*8($tptr),$bptr # pull top-most carry to %cf
2736 mov 8+8(%rsp),$bptr # re-load &b[i]
2737 mov 16+8(%rsp),%r10
2738 adc %r15,%r14
2739 lea ($aptr,$num),$aptr # rewind $aptr
2740 adc $zero,$zero # top-most carry
2741 mov %r14,-1*8($tptr)
2742
2743 cmp %r10,$bptr
2744 jb .Lmulx4x_outer
2745
2746 mov -8($nptr),%r10
2747 mov $zero,%r8
2748 mov ($nptr,$num),%r12
2749 lea ($nptr,$num),%rbp # rewind $nptr
2750 mov $num,%rcx
2751 lea ($tptr,$num),%rdi # rewind $tptr
2752 xor %eax,%eax
2753 xor %r15,%r15
2754 sub %r14,%r10 # compare top-most words
2755 adc %r15,%r15
2756 or %r15,%r8
2757 sar \$3+2,%rcx
2758 sub %r8,%rax # %rax=-%r8
2759 mov 56+8(%rsp),%rdx # restore rp
2760 dec %r12 # so that after 'not' we get -n[0]
2761 mov 8*1(%rbp),%r13
2762 xor %r8,%r8
2763 mov 8*2(%rbp),%r14
2764 mov 8*3(%rbp),%r15
2765 jmp .Lsqrx4x_sub_entry # common post-condition
2766.cfi_endproc
2767.size mulx4x_internal,.-mulx4x_internal
2768___
2769}
2770{
2771######################################################################
2772# void bn_power5(
2773my $rptr="%rdi"; # BN_ULONG *rptr,
2774my $aptr="%rsi"; # const BN_ULONG *aptr,
2775my $bptr="%rdx"; # const void *table,
2776my $nptr="%rcx"; # const BN_ULONG *nptr,
2777my $n0 ="%r8"; # const BN_ULONG *n0);
2778my $num ="%r9"; # int num, has to be divisible by 8
2779 # int pwr);
2780
2781my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2782my @A0=("%r10","%r11");
2783my @A1=("%r12","%r13");
2784my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2785
2786$code.=<<___;
2787.type bn_powerx5,\@function,6
2788.align 32
2789bn_powerx5:
2790.cfi_startproc
2791 mov %rsp,%rax
2792.cfi_def_cfa_register %rax
2793.Lpowerx5_enter:
2794 push %rbx
2795.cfi_push %rbx
2796 push %rbp
2797.cfi_push %rbp
2798 push %r12
2799.cfi_push %r12
2800 push %r13
2801.cfi_push %r13
2802 push %r14
2803.cfi_push %r14
2804 push %r15
2805.cfi_push %r15
2806.Lpowerx5_prologue:
2807
2808 shl \$3,${num}d # convert $num to bytes
2809 lea ($num,$num,2),%r10 # 3*$num in bytes
2810 neg $num
2811 mov ($n0),$n0 # *n0
2812
2813 ##############################################################
2814 # Ensure that stack frame doesn't alias with $rptr+3*$num
2815 # modulo 4096, which covers ret[num], am[num] and n[num]
2816	# (see bn_exp.c). This is done to allow the memory disambiguation
2817	# logic to do its magic. [Extra 256 bytes is for the power mask
2818 # calculated from 7th argument, the index.]
2819 #
2820 lea -320(%rsp,$num,2),%r11
2821 mov %rsp,%rbp
2822 sub $rptr,%r11
2823 and \$4095,%r11
2824 cmp %r11,%r10
2825 jb .Lpwrx_sp_alt
2826 sub %r11,%rbp # align with $aptr
2827 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
2828 jmp .Lpwrx_sp_done
2829
2830.align 32
2831.Lpwrx_sp_alt:
2832 lea 4096-320(,$num,2),%r10
2833 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256)
2834 sub %r10,%r11
2835 mov \$0,%r10
2836 cmovc %r10,%r11
2837 sub %r11,%rbp
2838.Lpwrx_sp_done:
2839 and \$-64,%rbp
2840 mov %rsp,%r11
2841 sub %rbp,%r11
2842 and \$-4096,%r11
2843 lea (%rbp,%r11),%rsp
2844 mov (%rsp),%r10
2845 cmp %rbp,%rsp
2846 ja .Lpwrx_page_walk
2847 jmp .Lpwrx_page_walk_done
2848
2849.Lpwrx_page_walk:
2850 lea -4096(%rsp),%rsp
2851 mov (%rsp),%r10
2852 cmp %rbp,%rsp
2853 ja .Lpwrx_page_walk
2854.Lpwrx_page_walk_done:
2855
2856 mov $num,%r10
2857 neg $num
2858
2859 ##############################################################
2860 # Stack layout
2861 #
2862 # +0 saved $num, used in reduction section
2863 # +8 &t[2*$num], used in reduction section
2864 # +16 intermediate carry bit
2865 # +24 top-most carry bit, used in reduction section
2866 # +32 saved *n0
2867 # +40 saved %rsp
2868 # +48 t[2*$num]
2869 #
2870 pxor %xmm0,%xmm0
2871 movq $rptr,%xmm1 # save $rptr
2872 movq $nptr,%xmm2 # save $nptr
2873 movq %r10, %xmm3 # -$num
2874 movq $bptr,%xmm4
2875 mov $n0, 32(%rsp)
2876 mov %rax, 40(%rsp) # save original %rsp
2877.cfi_cfa_expression %rsp+40,deref,+8
2878.Lpowerx5_body:
2879
2880 call __bn_sqrx8x_internal
2881 call __bn_postx4x_internal
2882 call __bn_sqrx8x_internal
2883 call __bn_postx4x_internal
2884 call __bn_sqrx8x_internal
2885 call __bn_postx4x_internal
2886 call __bn_sqrx8x_internal
2887 call __bn_postx4x_internal
2888 call __bn_sqrx8x_internal
2889 call __bn_postx4x_internal
2890
2891 mov %r10,$num # -num
2892 mov $aptr,$rptr
2893 movq %xmm2,$nptr
2894 movq %xmm4,$bptr
2895 mov 40(%rsp),%rax
2896
2897 call mulx4x_internal
2898
2899 mov 40(%rsp),%rsi # restore %rsp
2900.cfi_def_cfa %rsi,8
2901 mov \$1,%rax
2902
2903 mov -48(%rsi),%r15
2904.cfi_restore %r15
2905 mov -40(%rsi),%r14
2906.cfi_restore %r14
2907 mov -32(%rsi),%r13
2908.cfi_restore %r13
2909 mov -24(%rsi),%r12
2910.cfi_restore %r12
2911 mov -16(%rsi),%rbp
2912.cfi_restore %rbp
2913 mov -8(%rsi),%rbx
2914.cfi_restore %rbx
2915 lea (%rsi),%rsp
2916.cfi_def_cfa_register %rsp
2917.Lpowerx5_epilogue:
2918 ret
2919.cfi_endproc
2920.size bn_powerx5,.-bn_powerx5
2921
2922.globl bn_sqrx8x_internal
2923.hidden bn_sqrx8x_internal
2924.type bn_sqrx8x_internal,\@abi-omnipotent
2925.align 32
2926bn_sqrx8x_internal:
2927__bn_sqrx8x_internal:
2928.cfi_startproc
2929 ##################################################################
2930 # Squaring part:
2931 #
2932 # a) multiply-n-add everything but a[i]*a[i];
2933 # b) shift result of a) by 1 to the left and accumulate
2934 # a[i]*a[i] products;
2935 #
2936 ##################################################################
2937 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2938 # a[1]a[0]
2939 # a[2]a[0]
2940 # a[3]a[0]
2941 # a[2]a[1]
2942 # a[3]a[1]
2943 # a[3]a[2]
2944 #
2945 # a[4]a[0]
2946 # a[5]a[0]
2947 # a[6]a[0]
2948 # a[7]a[0]
2949 # a[4]a[1]
2950 # a[5]a[1]
2951 # a[6]a[1]
2952 # a[7]a[1]
2953 # a[4]a[2]
2954 # a[5]a[2]
2955 # a[6]a[2]
2956 # a[7]a[2]
2957 # a[4]a[3]
2958 # a[5]a[3]
2959 # a[6]a[3]
2960 # a[7]a[3]
2961 #
2962 # a[5]a[4]
2963 # a[6]a[4]
2964 # a[7]a[4]
2965 # a[6]a[5]
2966 # a[7]a[5]
2967 # a[7]a[6]
2968 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2969___
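#
# A toy C model of the two-step strategy above, using 32-bit limbs so the
# carries fit in plain uint64_t arithmetic (sqr_n is an illustrative name;
# the perlasm does the same with 64-bit limbs, MULX and ADCX/ADOX chains):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	/* r[0..2n-1] = a[0..n-1]^2 */
#	static void sqr_n(uint32_t r[], const uint32_t a[], int n)
#	{
#	    memset(r, 0, 2 * (size_t)n * sizeof(uint32_t));
#	    for (int i = 0; i < n; i++) {            /* a) a[i]*a[j], i < j */
#	        uint64_t c = 0;
#	        for (int j = i + 1; j < n; j++) {
#	            uint64_t t = (uint64_t)a[i] * a[j] + r[i + j] + c;
#	            r[i + j] = (uint32_t)t;
#	            c = t >> 32;
#	        }
#	        r[i + n] = (uint32_t)c;
#	    }
#	    for (int k = 2 * n - 1; k > 0; k--)      /* b) shift left by one bit */
#	        r[k] = (r[k] << 1) | (r[k - 1] >> 31);
#	    r[0] <<= 1;
#	    uint64_t c = 0;
#	    for (int i = 0; i < n; i++) {            /*    ... and add a[i]*a[i] */
#	        uint64_t t = (uint64_t)a[i] * a[i] + r[2 * i] + c;
#	        r[2 * i] = (uint32_t)t;
#	        t = (t >> 32) + r[2 * i + 1];
#	        r[2 * i + 1] = (uint32_t)t;
#	        c = t >> 32;
#	    }
#	}
#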
2970{
2971my ($zero,$carry)=("%rbp","%rcx");
2972my $aaptr=$zero;
2973$code.=<<___;
2974 lea 48+8(%rsp),$tptr
2975 lea ($aptr,$num),$aaptr
2976 mov $num,0+8(%rsp) # save $num
2977 mov $aaptr,8+8(%rsp) # save end of $aptr
2978 jmp .Lsqr8x_zero_start
2979
2980.align 32
2981.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2982.Lsqrx8x_zero:
2983 .byte 0x3e
2984 movdqa %xmm0,0*8($tptr)
2985 movdqa %xmm0,2*8($tptr)
2986 movdqa %xmm0,4*8($tptr)
2987 movdqa %xmm0,6*8($tptr)
2988.Lsqr8x_zero_start: # aligned at 32
2989 movdqa %xmm0,8*8($tptr)
2990 movdqa %xmm0,10*8($tptr)
2991 movdqa %xmm0,12*8($tptr)
2992 movdqa %xmm0,14*8($tptr)
2993 lea 16*8($tptr),$tptr
2994 sub \$64,$num
2995 jnz .Lsqrx8x_zero
2996
2997 mov 0*8($aptr),%rdx # a[0], modulo-scheduled
2998 #xor %r9,%r9 # t[1], ex-$num, zero already
2999 xor %r10,%r10
3000 xor %r11,%r11
3001 xor %r12,%r12
3002 xor %r13,%r13
3003 xor %r14,%r14
3004 xor %r15,%r15
3005 lea 48+8(%rsp),$tptr
3006	xor	$zero,$zero		# cf=0, of=0
3007 jmp .Lsqrx8x_outer_loop
3008
3009.align 32
3010.Lsqrx8x_outer_loop:
3011 mulx 1*8($aptr),%r8,%rax # a[1]*a[0]
3012 adcx %r9,%r8 # a[1]*a[0]+=t[1]
3013 adox %rax,%r10
3014 mulx 2*8($aptr),%r9,%rax # a[2]*a[0]
3015 adcx %r10,%r9
3016 adox %rax,%r11
3017 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
3018 adcx %r11,%r10
3019 adox %rax,%r12
3020 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax
3021 adcx %r12,%r11
3022 adox %rax,%r13
3023 mulx 5*8($aptr),%r12,%rax
3024 adcx %r13,%r12
3025 adox %rax,%r14
3026 mulx 6*8($aptr),%r13,%rax
3027 adcx %r14,%r13
3028 adox %r15,%rax
3029 mulx 7*8($aptr),%r14,%r15
3030 mov 1*8($aptr),%rdx # a[1]
3031 adcx %rax,%r14
3032 adox $zero,%r15
3033 adc 8*8($tptr),%r15
3034 mov %r8,1*8($tptr) # t[1]
3035 mov %r9,2*8($tptr) # t[2]
3036 sbb $carry,$carry # mov %cf,$carry
3037 xor $zero,$zero # cf=0, of=0
3038
3039
3040 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1]
3041 mulx 3*8($aptr),%r9,%rax # a[3]*a[1]
3042 adcx %r10,%r8
3043 adox %rbx,%r9
3044 mulx 4*8($aptr),%r10,%rbx # ...
3045 adcx %r11,%r9
3046 adox %rax,%r10
3047 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax
3048 adcx %r12,%r10
3049 adox %rbx,%r11
3050 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx
3051 adcx %r13,%r11
3052 adox %r14,%r12
3053 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14
3054 mov 2*8($aptr),%rdx # a[2]
3055 adcx %rax,%r12
3056 adox %rbx,%r13
3057 adcx %r15,%r13
3058 adox $zero,%r14 # of=0
3059 adcx $zero,%r14 # cf=0
3060
3061 mov %r8,3*8($tptr) # t[3]
3062 mov %r9,4*8($tptr) # t[4]
3063
3064 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2]
3065 mulx 4*8($aptr),%r9,%rax # a[4]*a[2]
3066 adcx %r10,%r8
3067 adox %rbx,%r9
3068 mulx 5*8($aptr),%r10,%rbx # ...
3069 adcx %r11,%r9
3070 adox %rax,%r10
3071 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax
3072 adcx %r12,%r10
3073 adox %r13,%r11
3074 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13
3075 .byte 0x3e
3076 mov 3*8($aptr),%rdx # a[3]
3077 adcx %rbx,%r11
3078 adox %rax,%r12
3079 adcx %r14,%r12
3080 mov %r8,5*8($tptr) # t[5]
3081 mov %r9,6*8($tptr) # t[6]
3082 mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
3083 adox $zero,%r13 # of=0
3084 adcx $zero,%r13 # cf=0
3085
3086 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3]
3087 adcx %r10,%r8
3088 adox %rax,%r9
3089 mulx 6*8($aptr),%r10,%rax # ...
3090 adcx %r11,%r9
3091 adox %r12,%r10
3092 mulx 7*8($aptr),%r11,%r12
3093 mov 4*8($aptr),%rdx # a[4]
3094 mov 5*8($aptr),%r14 # a[5]
3095 adcx %rbx,%r10
3096 adox %rax,%r11
3097 mov 6*8($aptr),%r15 # a[6]
3098 adcx %r13,%r11
3099 adox $zero,%r12 # of=0
3100 adcx $zero,%r12 # cf=0
3101
3102 mov %r8,7*8($tptr) # t[7]
3103 mov %r9,8*8($tptr) # t[8]
3104
3105 mulx %r14,%r9,%rax # a[5]*a[4]
3106 mov 7*8($aptr),%r8 # a[7]
3107 adcx %r10,%r9
3108 mulx %r15,%r10,%rbx # a[6]*a[4]
3109 adox %rax,%r10
3110 adcx %r11,%r10
3111 mulx %r8,%r11,%rax # a[7]*a[4]
3112 mov %r14,%rdx # a[5]
3113 adox %rbx,%r11
3114 adcx %r12,%r11
3115 #adox $zero,%rax # of=0
3116 adcx $zero,%rax # cf=0
3117
3118 mulx %r15,%r14,%rbx # a[6]*a[5]
3119 mulx %r8,%r12,%r13 # a[7]*a[5]
3120 mov %r15,%rdx # a[6]
3121 lea 8*8($aptr),$aptr
3122 adcx %r14,%r11
3123 adox %rbx,%r12
3124 adcx %rax,%r12
3125 adox $zero,%r13
3126
3127 .byte 0x67,0x67
3128 mulx %r8,%r8,%r14 # a[7]*a[6]
3129 adcx %r8,%r13
3130 adcx $zero,%r14
3131
3132 cmp 8+8(%rsp),$aptr
3133 je .Lsqrx8x_outer_break
3134
3135 neg $carry # mov $carry,%cf
3136 mov \$-8,%rcx
3137 mov $zero,%r15
3138 mov 8*8($tptr),%r8
3139 adcx 9*8($tptr),%r9 # +=t[9]
3140 adcx 10*8($tptr),%r10 # ...
3141 adcx 11*8($tptr),%r11
3142 adc 12*8($tptr),%r12
3143 adc 13*8($tptr),%r13
3144 adc 14*8($tptr),%r14
3145 adc 15*8($tptr),%r15
3146 lea ($aptr),$aaptr
3147 lea 2*64($tptr),$tptr
3148 sbb %rax,%rax # mov %cf,$carry
3149
3150 mov -64($aptr),%rdx # a[0]
3151 mov %rax,16+8(%rsp) # offload $carry
3152 mov $tptr,24+8(%rsp)
3153
3154 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above
3155 xor %eax,%eax # cf=0, of=0
3156 jmp .Lsqrx8x_loop
3157
3158.align 32
3159.Lsqrx8x_loop:
3160 mov %r8,%rbx
3161 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i]
3162 adcx %rax,%rbx # +=t[8]
3163 adox %r9,%r8
3164
3165 mulx 1*8($aaptr),%rax,%r9 # ...
3166 adcx %rax,%r8
3167 adox %r10,%r9
3168
3169 mulx 2*8($aaptr),%rax,%r10
3170 adcx %rax,%r9
3171 adox %r11,%r10
3172
3173 mulx 3*8($aaptr),%rax,%r11
3174 adcx %rax,%r10
3175 adox %r12,%r11
3176
3177 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12
3178 adcx %rax,%r11
3179 adox %r13,%r12
3180
3181 mulx 5*8($aaptr),%rax,%r13
3182 adcx %rax,%r12
3183 adox %r14,%r13
3184
3185 mulx 6*8($aaptr),%rax,%r14
3186 mov %rbx,($tptr,%rcx,8) # store t[8+i]
3187 mov \$0,%ebx
3188 adcx %rax,%r13
3189 adox %r15,%r14
3190
3191 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15
3192 mov 8($aptr,%rcx,8),%rdx # a[i]
3193 adcx %rax,%r14
3194 adox %rbx,%r15 # %rbx is 0, of=0
3195 adcx %rbx,%r15 # cf=0
3196
3197 .byte 0x67
3198 inc %rcx # of=0
3199 jnz .Lsqrx8x_loop
3200
3201 lea 8*8($aaptr),$aaptr
3202 mov \$-8,%rcx
3203 cmp 8+8(%rsp),$aaptr # done?
3204 je .Lsqrx8x_break
3205
3206 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
3207 .byte 0x66
3208 mov -64($aptr),%rdx
3209 adcx 0*8($tptr),%r8
3210 adcx 1*8($tptr),%r9
3211 adc 2*8($tptr),%r10
3212 adc 3*8($tptr),%r11
3213 adc 4*8($tptr),%r12
3214 adc 5*8($tptr),%r13
3215 adc 6*8($tptr),%r14
3216 adc 7*8($tptr),%r15
3217 lea 8*8($tptr),$tptr
3218 .byte 0x67
3219 sbb %rax,%rax # mov %cf,%rax
3220 xor %ebx,%ebx # cf=0, of=0
3221 mov %rax,16+8(%rsp) # offload carry
3222 jmp .Lsqrx8x_loop
3223
3224.align 32
3225.Lsqrx8x_break:
3226 xor $zero,$zero
3227 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
3228 adcx $zero,%r8
3229 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
3230 adcx $zero,%r9
3231 mov 0*8($aptr),%rdx # a[8], modulo-scheduled
3232 adc \$0,%r10
3233 mov %r8,0*8($tptr)
3234 adc \$0,%r11
3235 adc \$0,%r12
3236 adc \$0,%r13
3237 adc \$0,%r14
3238 adc \$0,%r15
3239 cmp $carry,$tptr # cf=0, of=0
3240 je .Lsqrx8x_outer_loop
3241
3242 mov %r9,1*8($tptr)
3243 mov 1*8($carry),%r9
3244 mov %r10,2*8($tptr)
3245 mov 2*8($carry),%r10
3246 mov %r11,3*8($tptr)
3247 mov 3*8($carry),%r11
3248 mov %r12,4*8($tptr)
3249 mov 4*8($carry),%r12
3250 mov %r13,5*8($tptr)
3251 mov 5*8($carry),%r13
3252 mov %r14,6*8($tptr)
3253 mov 6*8($carry),%r14
3254 mov %r15,7*8($tptr)
3255 mov 7*8($carry),%r15
3256 mov $carry,$tptr
3257 jmp .Lsqrx8x_outer_loop
3258
3259.align 32
3260.Lsqrx8x_outer_break:
3261 mov %r9,9*8($tptr) # t[9]
3262 movq %xmm3,%rcx # -$num
3263 mov %r10,10*8($tptr) # ...
3264 mov %r11,11*8($tptr)
3265 mov %r12,12*8($tptr)
3266 mov %r13,13*8($tptr)
3267 mov %r14,14*8($tptr)
3268___
3269}
3270{
3271my $i="%rcx";
3272$code.=<<___;
3273 lea 48+8(%rsp),$tptr
3274 mov ($aptr,$i),%rdx # a[0]
3275
3276 mov 8($tptr),$A0[1] # t[1]
3277 xor $A0[0],$A0[0] # t[0], of=0, cf=0
3278 mov 0+8(%rsp),$num # restore $num
3279 adox $A0[1],$A0[1]
3280 mov 16($tptr),$A1[0] # t[2] # prefetch
3281 mov 24($tptr),$A1[1] # t[3] # prefetch
3282 #jmp .Lsqrx4x_shift_n_add # happens to be aligned
3283
3284.align 32
3285.Lsqrx4x_shift_n_add:
3286 mulx %rdx,%rax,%rbx
3287 adox $A1[0],$A1[0]
3288 adcx $A0[0],%rax
3289 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch
3290 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch
3291 adox $A1[1],$A1[1]
3292 adcx $A0[1],%rbx
3293 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch
3294 mov %rax,0($tptr)
3295 mov %rbx,8($tptr)
3296
3297 mulx %rdx,%rax,%rbx
3298 adox $A0[0],$A0[0]
3299 adcx $A1[0],%rax
3300 mov 16($aptr,$i),%rdx # a[i+2] # prefetch
3301 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch
3302 adox $A0[1],$A0[1]
3303 adcx $A1[1],%rbx
3304 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch
3305 mov %rax,16($tptr)
3306 mov %rbx,24($tptr)
3307
3308 mulx %rdx,%rax,%rbx
3309 adox $A1[0],$A1[0]
3310 adcx $A0[0],%rax
3311 mov 24($aptr,$i),%rdx # a[i+3] # prefetch
3312 lea 32($i),$i
3313 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch
3314 adox $A1[1],$A1[1]
3315 adcx $A0[1],%rbx
3316 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch
3317 mov %rax,32($tptr)
3318 mov %rbx,40($tptr)
3319
3320 mulx %rdx,%rax,%rbx
3321 adox $A0[0],$A0[0]
3322 adcx $A1[0],%rax
3323 jrcxz .Lsqrx4x_shift_n_add_break
3324 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch
3325 adox $A0[1],$A0[1]
3326 adcx $A1[1],%rbx
3327 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch
3328 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch
3329 mov %rax,48($tptr)
3330 mov %rbx,56($tptr)
3331 lea 64($tptr),$tptr
3332 nop
3333 jmp .Lsqrx4x_shift_n_add
3334
3335.align 32
3336.Lsqrx4x_shift_n_add_break:
3337 adcx $A1[1],%rbx
3338 mov %rax,48($tptr)
3339 mov %rbx,56($tptr)
3340 lea 64($tptr),$tptr # end of t[] buffer
3341___
3342}
3343
3344######################################################################
3345# Montgomery reduction part, "word-by-word" algorithm.
3346#
3347# This new path is inspired by multiple submissions from Intel, by
3348# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
3349# Vinodh Gopal...
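#
# For reference, the word-by-word reduction itself, stripped of all
# scheduling, looks like the C sketch below (mont_reduce is an illustrative
# name; unsigned __int128 is assumed as a gcc/clang extension).  Each of
# the n low words of t[] is cancelled by adding an appropriate multiple of
# N, and the surviving top half plus the returned carry is the reduced
# value, still possibly one subtraction of N too large:
#
#	#include <stdint.h>
#
#	static uint64_t mont_reduce(uint64_t t[], const uint64_t N[],
#	                            int n, uint64_t n0)
#	{
#	    uint64_t carry = 0;                     /* top-most carry bit */
#	    for (int i = 0; i < n; i++) {
#	        uint64_t m = t[i] * n0;             /* n0 = -N[0]^-1 mod 2^64 */
#	        unsigned __int128 c = 0, v;
#	        for (int j = 0; j < n; j++) {       /* t += m*N << (64*i) */
#	            v = (unsigned __int128)m * N[j] + t[i + j] + (uint64_t)c;
#	            t[i + j] = (uint64_t)v;         /* t[i] becomes zero */
#	            c = v >> 64;
#	        }
#	        v = (unsigned __int128)t[i + n] + (uint64_t)c + carry;
#	        t[i + n] = (uint64_t)v;
#	        carry = (uint64_t)(v >> 64);
#	    }
#	    return carry;  /* result is t[n..2n-1]; subtract N once if needed */
#	}
#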
3350{
3351my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
3352
3353$code.=<<___;
3354 movq %xmm2,$nptr
3355__bn_sqrx8x_reduction:
3356 xor %eax,%eax # initial top-most carry bit
3357 mov 32+8(%rsp),%rbx # n0
3358 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
3359 lea -8*8($nptr,$num),%rcx # end of n[]
3360 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
3361 mov %rcx, 0+8(%rsp) # save end of n[]
3362 mov $tptr,8+8(%rsp) # save end of t[]
3363
3364 lea 48+8(%rsp),$tptr # initial t[] window
3365 jmp .Lsqrx8x_reduction_loop
3366
3367.align 32
3368.Lsqrx8x_reduction_loop:
3369 mov 8*1($tptr),%r9
3370 mov 8*2($tptr),%r10
3371 mov 8*3($tptr),%r11
3372 mov 8*4($tptr),%r12
3373 mov %rdx,%r8
3374 imulq %rbx,%rdx # n0*a[i]
3375 mov 8*5($tptr),%r13
3376 mov 8*6($tptr),%r14
3377 mov 8*7($tptr),%r15
3378 mov %rax,24+8(%rsp) # store top-most carry bit
3379
3380 lea 8*8($tptr),$tptr
3381 xor $carry,$carry # cf=0,of=0
3382 mov \$-8,%rcx
3383 jmp .Lsqrx8x_reduce
3384
3385.align 32
3386.Lsqrx8x_reduce:
3387 mov %r8, %rbx
3388 mulx 8*0($nptr),%rax,%r8 # n[0]
3389 adcx %rbx,%rax # discarded
3390 adox %r9,%r8
3391
3392 mulx 8*1($nptr),%rbx,%r9 # n[1]
3393 adcx %rbx,%r8
3394 adox %r10,%r9
3395
3396 mulx 8*2($nptr),%rbx,%r10
3397 adcx %rbx,%r9
3398 adox %r11,%r10
3399
3400 mulx 8*3($nptr),%rbx,%r11
3401 adcx %rbx,%r10
3402 adox %r12,%r11
3403
3404 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12
3405 mov %rdx,%rax
3406 mov %r8,%rdx
3407 adcx %rbx,%r11
3408 adox %r13,%r12
3409
3410 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded
3411 mov %rax,%rdx
3412 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
3413
3414 mulx 8*5($nptr),%rax,%r13
3415 adcx %rax,%r12
3416 adox %r14,%r13
3417
3418 mulx 8*6($nptr),%rax,%r14
3419 adcx %rax,%r13
3420 adox %r15,%r14
3421
3422 mulx 8*7($nptr),%rax,%r15
3423 mov %rbx,%rdx
3424 adcx %rax,%r14
3425 adox $carry,%r15 # $carry is 0
3426 adcx $carry,%r15 # cf=0
3427
3428 .byte 0x67,0x67,0x67
3429 inc %rcx # of=0
3430 jnz .Lsqrx8x_reduce
3431
3432 mov $carry,%rax # xor %rax,%rax
3433 cmp 0+8(%rsp),$nptr # end of n[]?
3434 jae .Lsqrx8x_no_tail
3435
3436 mov 48+8(%rsp),%rdx # pull n0*a[0]
3437 add 8*0($tptr),%r8
3438 lea 8*8($nptr),$nptr
3439 mov \$-8,%rcx
3440 adcx 8*1($tptr),%r9
3441 adcx 8*2($tptr),%r10
3442 adc 8*3($tptr),%r11
3443 adc 8*4($tptr),%r12
3444 adc 8*5($tptr),%r13
3445 adc 8*6($tptr),%r14
3446 adc 8*7($tptr),%r15
3447 lea 8*8($tptr),$tptr
3448 sbb %rax,%rax # top carry
3449
3450 xor $carry,$carry # of=0, cf=0
3451 mov %rax,16+8(%rsp)
3452 jmp .Lsqrx8x_tail
3453
3454.align 32
3455.Lsqrx8x_tail:
3456 mov %r8,%rbx
3457 mulx 8*0($nptr),%rax,%r8
3458 adcx %rax,%rbx
3459 adox %r9,%r8
3460
3461 mulx 8*1($nptr),%rax,%r9
3462 adcx %rax,%r8
3463 adox %r10,%r9
3464
3465 mulx 8*2($nptr),%rax,%r10
3466 adcx %rax,%r9
3467 adox %r11,%r10
3468
3469 mulx 8*3($nptr),%rax,%r11
3470 adcx %rax,%r10
3471 adox %r12,%r11
3472
3473 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12
3474 adcx %rax,%r11
3475 adox %r13,%r12
3476
3477 mulx 8*5($nptr),%rax,%r13
3478 adcx %rax,%r12
3479 adox %r14,%r13
3480
3481 mulx 8*6($nptr),%rax,%r14
3482 adcx %rax,%r13
3483 adox %r15,%r14
3484
3485 mulx 8*7($nptr),%rax,%r15
3486 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
3487 adcx %rax,%r14
3488 adox $carry,%r15
3489 mov %rbx,($tptr,%rcx,8) # save result
3490 mov %r8,%rbx
3491 adcx $carry,%r15 # cf=0
3492
3493 inc %rcx # of=0
3494 jnz .Lsqrx8x_tail
3495
3496 cmp 0+8(%rsp),$nptr # end of n[]?
3497 jae .Lsqrx8x_tail_done # break out of loop
3498
3499 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
3500 mov 48+8(%rsp),%rdx # pull n0*a[0]
3501 lea 8*8($nptr),$nptr
3502 adc 8*0($tptr),%r8
3503 adc 8*1($tptr),%r9
3504 adc 8*2($tptr),%r10
3505 adc 8*3($tptr),%r11
3506 adc 8*4($tptr),%r12
3507 adc 8*5($tptr),%r13
3508 adc 8*6($tptr),%r14
3509 adc 8*7($tptr),%r15
3510 lea 8*8($tptr),$tptr
3511 sbb %rax,%rax
3512 sub \$8,%rcx # mov \$-8,%rcx
3513
3514 xor $carry,$carry # of=0, cf=0
3515 mov %rax,16+8(%rsp)
3516 jmp .Lsqrx8x_tail
3517
3518.align 32
3519.Lsqrx8x_tail_done:
3520 xor %rax,%rax
3521 add 24+8(%rsp),%r8 # can this overflow?
3522 adc \$0,%r9
3523 adc \$0,%r10
3524 adc \$0,%r11
3525 adc \$0,%r12
3526 adc \$0,%r13
3527 adc \$0,%r14
3528 adc \$0,%r15
3529 adc \$0,%rax
3530
3531 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
3532.Lsqrx8x_no_tail: # %cf is 0 if jumped here
3533 adc 8*0($tptr),%r8
3534 movq %xmm3,%rcx
3535 adc 8*1($tptr),%r9
3536 mov 8*7($nptr),$carry
3537 movq %xmm2,$nptr # restore $nptr
3538 adc 8*2($tptr),%r10
3539 adc 8*3($tptr),%r11
3540 adc 8*4($tptr),%r12
3541 adc 8*5($tptr),%r13
3542 adc 8*6($tptr),%r14
3543 adc 8*7($tptr),%r15
3544 adc \$0,%rax # top-most carry
3545
3546 mov 32+8(%rsp),%rbx # n0
3547 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
3548
3549 mov %r8,8*0($tptr) # store top 512 bits
3550 lea 8*8($tptr),%r8 # borrow %r8
3551 mov %r9,8*1($tptr)
3552 mov %r10,8*2($tptr)
3553 mov %r11,8*3($tptr)
3554 mov %r12,8*4($tptr)
3555 mov %r13,8*5($tptr)
3556 mov %r14,8*6($tptr)
3557 mov %r15,8*7($tptr)
3558
3559 lea 8*8($tptr,%rcx),$tptr # start of current t[] window
3560 cmp 8+8(%rsp),%r8 # end of t[]?
3561 jb .Lsqrx8x_reduction_loop
3562 ret
3563.cfi_endproc
3564.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
3565___
3566}
3567
3568##############################################################
3569# Post-condition, 4x unrolled
3570#
3571{
3572my ($rptr,$nptr)=("%rdx","%rbp");
3573$code.=<<___;
3574.align 32
3575__bn_postx4x_internal:
3576.cfi_startproc
3577 mov 8*0($nptr),%r12
3578 mov %rcx,%r10 # -$num
3579 mov %rcx,%r9 # -$num
3580 neg %rax
3581 sar \$3+2,%rcx
3582 #lea 48+8(%rsp,%r9),$tptr
3583 movq %xmm1,$rptr # restore $rptr
3584 movq %xmm1,$aptr # prepare for back-to-back call
3585 dec %r12 # so that after 'not' we get -n[0]
3586 mov 8*1($nptr),%r13
3587 xor %r8,%r8
3588 mov 8*2($nptr),%r14
3589 mov 8*3($nptr),%r15
3590 jmp .Lsqrx4x_sub_entry
3591
3592.align 16
3593.Lsqrx4x_sub:
3594 mov 8*0($nptr),%r12
3595 mov 8*1($nptr),%r13
3596 mov 8*2($nptr),%r14
3597 mov 8*3($nptr),%r15
3598.Lsqrx4x_sub_entry:
3599 andn %rax,%r12,%r12
3600 lea 8*4($nptr),$nptr
3601 andn %rax,%r13,%r13
3602 andn %rax,%r14,%r14
3603 andn %rax,%r15,%r15
3604
3605 neg %r8 # mov %r8,%cf
3606 adc 8*0($tptr),%r12
3607 adc 8*1($tptr),%r13
3608 adc 8*2($tptr),%r14
3609 adc 8*3($tptr),%r15
3610 mov %r12,8*0($rptr)
3611 lea 8*4($tptr),$tptr
3612 mov %r13,8*1($rptr)
3613 sbb %r8,%r8 # mov %cf,%r8
3614 mov %r14,8*2($rptr)
3615 mov %r15,8*3($rptr)
3616 lea 8*4($rptr),$rptr
3617
3618 inc %rcx
3619 jnz .Lsqrx4x_sub
3620
3621 neg %r9 # restore $num
3622
3623 ret
3624.cfi_endproc
3625.size __bn_postx4x_internal,.-__bn_postx4x_internal
3626___
3627}
3628}}}
3629{
3630my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3631 ("%rdi","%esi","%rdx","%ecx"); # Unix order
3632my $out=$inp;
3633my $STRIDE=2**5*8;
3634my $N=$STRIDE/4;
3635
3636$code.=<<___;
3637.globl bn_get_bits5
3638.type bn_get_bits5,\@abi-omnipotent
3639.align 16
3640bn_get_bits5:
3641.cfi_startproc
3642 lea 0($inp),%r10
3643 lea 1($inp),%r11
3644 mov $num,%ecx
3645 shr \$4,$num
3646 and \$15,%ecx
3647 lea -8(%ecx),%eax
3648 cmp \$11,%ecx
3649 cmova %r11,%r10
3650 cmova %eax,%ecx
3651 movzw (%r10,$num,2),%eax
3652 shrl %cl,%eax
3653 and \$31,%eax
3654 ret
3655.cfi_endproc
3656.size bn_get_bits5,.-bn_get_bits5
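#
# Roughly what bn_get_bits5 computes, as C (get_bits5 is an illustrative
# name; the asm uses cmova instead of the branch, so the adjustment is
# branch-free): return the 5 exponent bits starting at bit offset bitpos,
# sliding the 16-bit window forward one byte when those bits would
# straddle it.
#
#	static unsigned get_bits5(const unsigned char *inp, unsigned bitpos)
#	{
#	    unsigned idx = bitpos >> 4, off = bitpos & 15;
#	    const unsigned char *p = inp;
#	    if (off > 11) {             /* 5 bits would cross the word */
#	        p = inp + 1;
#	        off -= 8;
#	    }
#	    unsigned w = p[2 * idx] | ((unsigned)p[2 * idx + 1] << 8);
#	    return (w >> off) & 31;
#	}
#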
3657
3658.globl bn_scatter5
3659.type bn_scatter5,\@abi-omnipotent
3660.align 16
3661bn_scatter5:
3662.cfi_startproc
3663 cmp \$0, $num
3664 jz .Lscatter_epilogue
3665 lea ($tbl,$idx,8),$tbl
3666.Lscatter:
3667 mov ($inp),%rax
3668 lea 8($inp),$inp
3669 mov %rax,($tbl)
3670 lea 32*8($tbl),$tbl
3671 sub \$1,$num
3672 jnz .Lscatter
3673.Lscatter_epilogue:
3674 ret
3675.cfi_endproc
3676.size bn_scatter5,.-bn_scatter5
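#
# bn_scatter5 and bn_gather5 agree on an interleaved table layout: word j
# of the power selected by idx lives at tbl[j*32 + idx], so a gather reads
# one word from every group of 32 and the cache lines touched do not depend
# on idx.  In C terms (scatter5 is an illustrative name):
#
#	#include <stdint.h>
#	#include <stddef.h>
#
#	static void scatter5(uint64_t *tbl, const uint64_t *inp,
#	                     size_t num, size_t idx)
#	{
#	    for (size_t j = 0; j < num; j++)
#	        tbl[j * 32 + idx] = inp[j];         /* stride of 32*8 bytes */
#	}
#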
3677
3678.globl bn_gather5
3679.type bn_gather5,\@abi-omnipotent
3680.align 32
3681bn_gather5:
3682.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
3683.cfi_startproc
3684 # I can't trust assembler to use specific encoding:-(
3685 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
3686 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
3687 lea .Linc(%rip),%rax
3688 and \$-16,%rsp # shouldn't be formally required
3689
3690 movd $idx,%xmm5
3691 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
3692 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
3693 lea 128($tbl),%r11 # size optimization
3694 lea 128(%rsp),%rax # size optimization
3695
3696 pshufd \$0,%xmm5,%xmm5 # broadcast $idx
3697 movdqa %xmm1,%xmm4
3698 movdqa %xmm1,%xmm2
3699___
3700########################################################################
3701# calculate mask by comparing 0..31 to $idx and save result to stack
3702#
3703for($i=0;$i<$STRIDE/16;$i+=4) {
3704$code.=<<___;
3705 paddd %xmm0,%xmm1
3706 pcmpeqd %xmm5,%xmm0 # compare to 1,0
3707___
3708$code.=<<___ if ($i);
3709 movdqa %xmm3,`16*($i-1)-128`(%rax)
3710___
3711$code.=<<___;
3712 movdqa %xmm4,%xmm3
3713
3714 paddd %xmm1,%xmm2
3715 pcmpeqd %xmm5,%xmm1 # compare to 3,2
3716 movdqa %xmm0,`16*($i+0)-128`(%rax)
3717 movdqa %xmm4,%xmm0
3718
3719 paddd %xmm2,%xmm3
3720 pcmpeqd %xmm5,%xmm2 # compare to 5,4
3721 movdqa %xmm1,`16*($i+1)-128`(%rax)
3722 movdqa %xmm4,%xmm1
3723
3724 paddd %xmm3,%xmm0
3725 pcmpeqd %xmm5,%xmm3 # compare to 7,6
3726 movdqa %xmm2,`16*($i+2)-128`(%rax)
3727 movdqa %xmm4,%xmm2
3728___
3729}
3730$code.=<<___;
3731 movdqa %xmm3,`16*($i-1)-128`(%rax)
3732 jmp .Lgather
3733
3734.align 32
3735.Lgather:
3736 pxor %xmm4,%xmm4
3737 pxor %xmm5,%xmm5
3738___
3739for($i=0;$i<$STRIDE/16;$i+=4) {
3740$code.=<<___;
3741 movdqa `16*($i+0)-128`(%r11),%xmm0
3742 movdqa `16*($i+1)-128`(%r11),%xmm1
3743 movdqa `16*($i+2)-128`(%r11),%xmm2
3744 pand `16*($i+0)-128`(%rax),%xmm0
3745 movdqa `16*($i+3)-128`(%r11),%xmm3
3746 pand `16*($i+1)-128`(%rax),%xmm1
3747 por %xmm0,%xmm4
3748 pand `16*($i+2)-128`(%rax),%xmm2
3749 por %xmm1,%xmm5
3750 pand `16*($i+3)-128`(%rax),%xmm3
3751 por %xmm2,%xmm4
3752 por %xmm3,%xmm5
3753___
3754}
3755$code.=<<___;
3756 por %xmm5,%xmm4
3757 lea $STRIDE(%r11),%r11
3758 pshufd \$0x4e,%xmm4,%xmm0
3759 por %xmm4,%xmm0
3760 movq %xmm0,($out) # m0=bp[0]
3761 lea 8($out),$out
3762 sub \$1,$num
3763 jnz .Lgather
3764
3765 lea (%r10),%rsp
3766 ret
3767.LSEH_end_bn_gather5:
3768.cfi_endproc
3769.size bn_gather5,.-bn_gather5
3770___
3771}
3772$code.=<<___;
3773.align 64
3774.Linc:
3775 .long 0,0, 1,1
3776 .long 2,2, 2,2
3777.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3778___
3779
3780# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3781# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3782if ($win64) {
3783$rec="%rcx";
3784$frame="%rdx";
3785$context="%r8";
3786$disp="%r9";
3787
3788$code.=<<___;
3789.extern __imp_RtlVirtualUnwind
3790.type mul_handler,\@abi-omnipotent
3791.align 16
3792mul_handler:
3793 push %rsi
3794 push %rdi
3795 push %rbx
3796 push %rbp
3797 push %r12
3798 push %r13
3799 push %r14
3800 push %r15
3801 pushfq
3802 sub \$64,%rsp
3803
3804 mov 120($context),%rax # pull context->Rax
3805 mov 248($context),%rbx # pull context->Rip
3806
3807 mov 8($disp),%rsi # disp->ImageBase
3808 mov 56($disp),%r11 # disp->HandlerData
3809
3810 mov 0(%r11),%r10d # HandlerData[0]
3811 lea (%rsi,%r10),%r10 # end of prologue label
3812 cmp %r10,%rbx # context->Rip<end of prologue label
3813 jb .Lcommon_seh_tail
3814
3815 mov 4(%r11),%r10d # HandlerData[1]
3816 lea (%rsi,%r10),%r10 # beginning of body label
3817 cmp %r10,%rbx # context->Rip<body label
3818 jb .Lcommon_pop_regs
3819
3820 mov 152($context),%rax # pull context->Rsp
3821
3822 mov 8(%r11),%r10d # HandlerData[2]
3823 lea (%rsi,%r10),%r10 # epilogue label
3824 cmp %r10,%rbx # context->Rip>=epilogue label
3825 jae .Lcommon_seh_tail
3826
3827 lea .Lmul_epilogue(%rip),%r10
3828 cmp %r10,%rbx
3829 ja .Lbody_40
3830
3831 mov 192($context),%r10 # pull $num
3832 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
3833
3834 jmp .Lcommon_pop_regs
3835
3836.Lbody_40:
3837 mov 40(%rax),%rax # pull saved stack pointer
3838.Lcommon_pop_regs:
3839 mov -8(%rax),%rbx
3840 mov -16(%rax),%rbp
3841 mov -24(%rax),%r12
3842 mov -32(%rax),%r13
3843 mov -40(%rax),%r14
3844 mov -48(%rax),%r15
3845 mov %rbx,144($context) # restore context->Rbx
3846 mov %rbp,160($context) # restore context->Rbp
3847 mov %r12,216($context) # restore context->R12
3848 mov %r13,224($context) # restore context->R13
3849 mov %r14,232($context) # restore context->R14
3850 mov %r15,240($context) # restore context->R15
3851
3852.Lcommon_seh_tail:
3853 mov 8(%rax),%rdi
3854 mov 16(%rax),%rsi
3855 mov %rax,152($context) # restore context->Rsp
3856 mov %rsi,168($context) # restore context->Rsi
3857 mov %rdi,176($context) # restore context->Rdi
3858
3859 mov 40($disp),%rdi # disp->ContextRecord
3860 mov $context,%rsi # context
3861 mov \$154,%ecx # sizeof(CONTEXT)
3862 .long 0xa548f3fc # cld; rep movsq
3863
3864 mov $disp,%rsi
3865 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3866 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3867 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3868 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3869 mov 40(%rsi),%r10 # disp->ContextRecord
3870 lea 56(%rsi),%r11 # &disp->HandlerData
3871 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3872 mov %r10,32(%rsp) # arg5
3873 mov %r11,40(%rsp) # arg6
3874 mov %r12,48(%rsp) # arg7
3875 mov %rcx,56(%rsp) # arg8, (NULL)
3876 call *__imp_RtlVirtualUnwind(%rip)
3877
3878 mov \$1,%eax # ExceptionContinueSearch
3879 add \$64,%rsp
3880 popfq
3881 pop %r15
3882 pop %r14
3883 pop %r13
3884 pop %r12
3885 pop %rbp
3886 pop %rbx
3887 pop %rdi
3888 pop %rsi
3889 ret
3890.size mul_handler,.-mul_handler
3891
3892.section .pdata
3893.align 4
3894 .rva .LSEH_begin_bn_mul_mont_gather5
3895 .rva .LSEH_end_bn_mul_mont_gather5
3896 .rva .LSEH_info_bn_mul_mont_gather5
3897
3898 .rva .LSEH_begin_bn_mul4x_mont_gather5
3899 .rva .LSEH_end_bn_mul4x_mont_gather5
3900 .rva .LSEH_info_bn_mul4x_mont_gather5
3901
3902 .rva .LSEH_begin_bn_power5
3903 .rva .LSEH_end_bn_power5
3904 .rva .LSEH_info_bn_power5
3905
3906 .rva .LSEH_begin_bn_from_mont8x
3907 .rva .LSEH_end_bn_from_mont8x
3908 .rva .LSEH_info_bn_from_mont8x
3909___
3910$code.=<<___ if ($addx);
3911 .rva .LSEH_begin_bn_mulx4x_mont_gather5
3912 .rva .LSEH_end_bn_mulx4x_mont_gather5
3913 .rva .LSEH_info_bn_mulx4x_mont_gather5
3914
3915 .rva .LSEH_begin_bn_powerx5
3916 .rva .LSEH_end_bn_powerx5
3917 .rva .LSEH_info_bn_powerx5
3918___
3919$code.=<<___;
3920 .rva .LSEH_begin_bn_gather5
3921 .rva .LSEH_end_bn_gather5
3922 .rva .LSEH_info_bn_gather5
3923
3924.section .xdata
3925.align 8
3926.LSEH_info_bn_mul_mont_gather5:
3927 .byte 9,0,0,0
3928 .rva mul_handler
3929 .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]
3930.align 8
3931.LSEH_info_bn_mul4x_mont_gather5:
3932 .byte 9,0,0,0
3933 .rva mul_handler
3934 .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
3935.align 8
3936.LSEH_info_bn_power5:
3937 .byte 9,0,0,0
3938 .rva mul_handler
3939 .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
3940.align 8
3941.LSEH_info_bn_from_mont8x:
3942 .byte 9,0,0,0
3943 .rva mul_handler
3944 .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
3945___
3946$code.=<<___ if ($addx);
3947.align 8
3948.LSEH_info_bn_mulx4x_mont_gather5:
3949 .byte 9,0,0,0
3950 .rva mul_handler
3951 .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
3952.align 8
3953.LSEH_info_bn_powerx5:
3954 .byte 9,0,0,0
3955 .rva mul_handler
3956 .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
3957___
3958$code.=<<___;
3959.align 8
3960.LSEH_info_bn_gather5:
3961 .byte 0x01,0x0b,0x03,0x0a
3962 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
3963 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
3964.align 8
3965___
3966}
3967
3968$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3969
3970print $code;
3971close STDOUT or die "error closing STDOUT: $!";