#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a
# modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
# more than twice as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can provide further
# improvement...

# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement
# varies from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. The average performance improvement in comparison
# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize the reduction in the squaring procedure and improve 1024+-bit
# RSA sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add the MULX/ADOX/ADCX code path.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

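# The script is typically driven by the build system roughly as
# "perl x86_64-mont.pl <flavour> <output>" (invocation inferred from the
# argument handling above); the generated perlasm is piped through
# x86_64-xlate.pl, which renders it in the requested assembler dialect.
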
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

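# $addx enables the MULX/ADCX/ADOX code path only when the assembler is
# new enough to encode those instructions; whether the CPU itself has
# ADX/BMI2 is decided at run time via OPENSSL_ia32cap_P (the 0x80100
# mask tested below covers ADX, bit 19, and BMI2, bit 8, of the
# CPUID.7:EBX capability word).
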
# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

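# For orientation, the word-serial Montgomery multiplication computed
# below can be sketched in C pseudo-code (an illustrative sketch only,
# not the actual implementation; tp[] is the temporary vector allocated
# on the stack and n0 = -np[0]^-1 mod 2^64):
#
#	for (i=0; i<num; i++) {
#		m1 = (tp[0] + ap[0]*bp[i]) * n0;	/* mod 2^64 */
#		/* tp = (tp + ap*bp[i] + m1*np) / 2^64, word by word,
#		 * accumulating the final carry in tp[num] */
#	}
#	/* result is tp if tp < np, tp - np otherwise; selected by a
#	 * constant-time masked copy at the end */
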
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
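	# Dispatch: only num divisible by 4 and not less than 8 gets
	# this far; bp==ap indicates squaring, which takes the dedicated
	# 8x path when num is also divisible by 8, and the 4x path
	# otherwise.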
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a
	# stack allocation spans two pages, then a reference to the
	# farthest one can be punished by SEGV. But page walking can do
	# good even on other OSes, because it guarantees that a villain
	# thread hits the guard page before it can do damage to an
	# innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
				$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	mov	$num,$j			# j=num

.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8(%rsp,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	mov	\$-1,%rbx
	xor	%rax,%rbx		# not %rax
	xor	$i,$i
	mov	$num,$j			# j=num

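	# After the subtraction pass, %rax is 0 when tp >= np (keep the
	# difference already stored in rp[]) and all-ones otherwise (keep
	# tp[]); %rbx carries the complementary mask, so the copy below
	# selects the right vector without a data-dependent branch.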
.Lcopy:					# conditional copy
	mov	($rp,$i,8),%rcx
	mov	(%rsp,$i,8),%rdx
	and	%rbx,%rcx
	and	%rax,%rdx
	mov	$num,(%rsp,$i,8)	# zap temporary vector
	or	%rcx,%rdx
	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
				$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	lea	-4($num),$j
	mov	0(%rsp),@ri[0]		# tp[0]
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$j			# j=num/4-1
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]

.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	pxor	%xmm0,%xmm0
	movq	@ri[0],%xmm4
	pcmpeqd	%xmm5,%xmm5
	pshufd	\$0,%xmm4,%xmm4
	mov	$num,$j
	pxor	%xmm4,%xmm5
	shr	\$2,$j			# j=num/4
	xor	%eax,%eax		# i=0

	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# conditional copy
	movdqa	(%rsp,%rax),%xmm1
	movdqu	($rp,%rax),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax),%xmm3
	movdqa	%xmm0,(%rsp,%rax)
	por	%xmm2,%xmm1
	movdqu	16($rp,%rax),%xmm2
	movdqu	%xmm1,($rp,%rax)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16($rp,%rax)
	lea	32(%rax),%rax
	dec	$j
	jnz	.Lcopy4x
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi, 8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}

{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0,
my $num ="%r9";		# int num); has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

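# bn_sqr8x_mont itself only sets up the frame and performs the final
# reduction and conditional copy; the actual squaring is delegated to
# bn_sqr8x_internal (or bn_sqrx8x_internal on ADX-capable processors),
# both provided by the x86_64-mont5 module.
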
$code.=<<___ if ($addx);
.extern	bn_sqrx8x_internal	# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal	# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# Ensure that the stack frame doesn't alias with $aptr modulo
	# 4096. This is done to allow the memory disambiguation logic
	# to do its job.
	#
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

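	# %xmm1 now broadcasts the final borrow: all-ones when the result
	# is still in tp[] (t < np), zero when the subtracted copy in
	# rp[] is the one to keep; pcmpeqd below derives the
	# complementary mask, so the copy stays branch-free.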
.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}

if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
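# This path leans on MULX (BMI2) and ADCX/ADOX (ADX): MULX multiplies
# without touching the flags, while ADCX and ADOX carry through CF and
# OF respectively, so the loops below can interleave two independent
# carry chains without the flag spills a plain ADC sequence would need.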
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	.byte	0x67,0x67
	mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

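# mul_handler recovers the saved stack pointer through $num: the value
# is pulled from the R9 home slot of the CONTEXT record (offset 192),
# which lets the handler rebuild the same 8(%rsp,$num,8) slot the
# prologue stored the original %rsp in.
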
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue	# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT or die "error closing STDOUT: $!";