#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice (>2x) as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen if loop
# unrolling and a dedicated squaring routine can provide further
# improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule inner loops in such a manner that they
# "fall through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
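
# $addx is set only when the assembler (GNU as, nasm, ml64, or clang's
# integrated assembler) is new enough to encode MULX/ADCX/ADOX; the
# run-time check for the ADX and BMI2 CPU feature bits happens
# separately, in the generated code (see the OPENSSL_ia32cap_P tests
# below).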

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

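# The code below implements word-serial (CIOS-style) Montgomery
# multiplication. A minimal sketch of the recurrence it computes, in
# pseudocode with word size w = 2^64 (names for orientation only):
#
#	tp = 0
#	for (i = 0; i < num; i++) {
#		m1 = ((tp[0] + ap[0]*bp[i]) * n0) % w
#		tp = (tp + ap*bp[i] + np*m1) / w	# division is exact
#	}
#	if (tp >= np) tp -= np	# result is a*b*2^(-64*num) mod n
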
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

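# bn_mul_mont is the public entry point; it dispatches to the
# specialized code paths: the 4x-unrolled loop when num is divisible
# by 4, the dedicated squaring code when bp==ap and num is divisible
# by 8, and, where supported, the MULX/ADCX/ADOX path. Small or odd
# sizes fall through to the generic loop below.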
.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
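	# OPENSSL_ia32cap_P[2] caches the CPUID.7.0.EBX feature bits:
	# bit 8 is BMI2 and bit 19 is ADX, together the 0x80100 mask
	# tested at .Lmul4x_enter below.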
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a
	# stack allocation spans two pages, then a reference to the
	# farthest one can be punished with SEGV. But page walking does
	# good even on other OSes, because it guarantees that a villain
	# thread hits the guard page before it can damage an innocent
	# one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
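	# Branchless select: after .Lsub, %rax is 0 if the subtraction
	# did not borrow (the subtracted value in rp is the result) and
	# all-ones if it did (the original tp must be kept). The
	# and/not/or sequence below picks the right source pointer
	# without a data-dependent branch.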
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
	mov	${num}d,${num}d
	mov	%rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

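	# Page-walk the reserved frame one page at a time (see the
	# __chkstk note in bn_mul_mont above).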
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}

{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___ if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
	mov	%rsp,%rax
.Lsqr8x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# Ensure that the stack frame doesn't alias with $aptr modulo
	# 4096. This is done to allow the memory disambiguation logic
	# to do its job.
	#
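	# (This works around 4K aliasing: loads from aptr and stores to
	# the temporary frame whose addresses coincide modulo 4096 would
	# be treated as potential conflicts and serialized.)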
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lsqr8x_cond_copy

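	# Branchless, constant-time select: %xmm1 was broadcast from the
	# top-most borrow (0 or all-ones), pcmpeqd against zero gives the
	# complementary mask in %xmm0, and the pand/por combine keeps the
	# unsubtracted tp on borrow and the subtracted value otherwise,
	# while tp itself is wiped.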
.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lsqr8x_epilogue:
	ret
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}


if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
	mov	%rsp,%rax
.Lmulx4x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

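	# The loop below runs two independent carry chains in parallel:
	# ADCX propagates through CF only and ADOX through OF only, so
	# the a[j]*b[i] partial products and the np[j]*m1 reduction terms
	# can be interleaved without the chains clobbering each other;
	# the dec at the loop bottom leaves CF intact for the next pass.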
.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	.byte	0x67,0x67
	mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lmulx4x_cond_copy

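	# Same branchless, constant-time select as .Lsqr8x_cond_copy
	# above, with the top-most borrow broadcast in %xmm1; tp is
	# wiped as it is consumed.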
.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmulx4x_epilogue:
	ret
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT;