#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", http://eprint.iacr.org/2011/239
# [2] S. Gueron, V. Krasnov, "Speeding up Big-Numbers Squaring",
#     IEEE Proceedings of 9th International Conference on Information
#     Technology: New Generations (ITNG 2012), 821-823 (2012).
# [3] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing
#     RSA1024 and RSA2048 on x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
#
# While the original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign). This is because the improvement for longer
# keys is not high enough to justify the effort; the highest measured was
# ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming at the
# time of this writing.] Nor does this module implement a "monolithic"
# all-in-one exponentiation subroutine; it adheres instead to a more
# modular mixture of C and assembly. It is also optimized for processors
# other than the Intel Core family (see the table below for improvement
# coefficients).
# <[email protected]>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
# ----------------------+---------------+---------------+---------------
# Opteron		+13%		|+5%		+20%
# Bulldozer		-0%		|-1%		+10%
# P4			+11%		|+7%		+8%
# Westmere		+5%		|+14%		+17%
# Sandy Bridge		+2%		|+12%		+29%
# Ivy Bridge		+1%		|+11%		+35%
# Haswell(**)		-0%		|+12%		+39%
# Atom			+13%		|+11%		+4%
# VIA Nano		+70%		|+9%		+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement.
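#
# A rough reference model of what the exported routines compute (a
# documentation-only sketch, not part of the generated code); n0 is
# assumed to be the usual Montgomery constant -mod^-1 mod 2^64:
#
#	use Math::BigInt;
#	my $R = Math::BigInt->new(2)->bpow(512);
#	sub mont_mul {			# result rsaz_512_mul(out,a,b,mod,n0)
#	    my ($a, $b, $m) = @_;	# leaves in out
#	    return ($a * $b * $R->copy()->bmodinv($m)) % $m;
#	}
#
# rsaz_512_sqr(out,inp,mod,n0,times) amounts to "times" back-to-back
# mont_mul(inp,inp,mod) squarings.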

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
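# $addx gates the MULX/ADCX/ADOX code paths below. The probes above only
# establish that the assembler at hand can encode those instructions;
# the choice between the two paths is still made at run time via
# OPENSSL_ia32cap_P.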

($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
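	# Arguments (SysV ABI, per the register map above): %rdi - out,
	# %rsi - inp, %rdx - mod, %rcx - n0 = -mod^-1 mod 2^64,
	# %r8d - number of back-to-back Montgomery squarings to perform.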
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lsqr_body:
	movq	$mod, %xmm1		# common off-load
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	$times,128+8(%rsp)
#first iteration
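	# The scalar squaring accumulates every cross product a[i]*a[j],
	# i<j, exactly once, doubles the partial results one 128-bit chunk
	# at a time (the "<< 1" shift sequences below), and finally folds
	# in the diagonal squares a[i]*a[i]; this needs roughly half the
	# multiplications of a plain 8x8 schoolbook pass.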
	movq	%rdx, %rbx		# 0($inp)
	mov	%rax, %rbp		# 8($inp)
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	adcq	\$0, %rdx

	xorq	%rcx,%rcx		# rcx:r8 = r8 << 1
	addq	%r8, %r8
	movq	%rdx, %r15
	adcq	\$0, %rcx

	mulq	%rax
	addq	%r8, %rdx
	adcq	\$0, %rcx

	movq	%rax, (%rsp)
	movq	%rdx, 8(%rsp)

#second iteration
	movq	16($inp), %rax
	mulq	%rbp
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r15
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r8
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	movq	16($inp), %rbp
	addq	%rax, %r9
	movq	24($inp), %rax
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)

#third iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r8
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r9
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	movq	24($inp), %r10
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)

#fourth iteration
	mov	%rax, %r11		# 32($inp)
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %r12		# 40($inp)
	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %rbp		# 48($inp)
	mulq	%r10
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r14:r13 = r14:r13 << 1
	addq	%r13, %r13
	movq	%rdx, %r10
	adcq	%r14, %r14
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r13
	movq	%r12, %rax		# 40($inp)
	adcq	%rdx, %r14
	adcq	\$0, %rbx

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)

#fifth iteration
	mulq	%r11
	addq	%rax, %r8
	movq	%rbp, %rax		# 48($inp)
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mov	%rax, %r14		# 56($inp)
	mulq	%r11
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r8:r15 = r8:r15 << 1
	addq	%r15, %r15
	movq	%rdx, %r11
	adcq	%r8, %r8
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r15
	movq	%rbp, %rax		# 48($inp)
	adcq	%rdx, %r8
	adcq	\$0, %rcx

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)

#sixth iteration
	mulq	%r12
	addq	%rax, %r10
	movq	%r14, %rax		# 56($inp)
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r12
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r9
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	%rbp, %rax
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r13
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r11
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	xorq	%rbx, %rbx		# rbx:r13 = r13 << 1
	addq	%r13, %r13
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%r13, %rax
	adcq	%rbx, %rdx

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15
	movq	%xmm1, %rbp

	movq	%rax, 112(%rsp)
	movq	%rdx, 120(%rsp)

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
#first iteration
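	# The MULX path keeps two independent carry chains in flight:
	# MULX leaves the flags untouched, ADCX reads/writes only CF and
	# ADOX reads/writes only OF, so sums into adjacent limbs can be
	# interleaved without saving and restoring flags.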
	mulx	%rax, %r8, %r9
	mov	%rax, %rbx

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx 32($inp), %rcx, %r12
	adcx	%rax, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00	# mulx 40($inp), %rax, %r13
	adcx	%rcx, %r11

	mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mulx	%rdx, %rax, $out
	mov	%rbx, %rdx		# 8($inp)
	xor	%rcx, %rcx
	adox	%r8, %r8
	adcx	$out, %r8
	adox	%rbp, %rcx
	adcx	%rbp, %rcx

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00	# mulx 16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	mulx	24($inp), $out, %r8
	adox	$out, %r11
	.byte	0x66
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx 48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx 56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	mulx	%rdx, %rax, $out
	adox	%rbp, %r8
	.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00		# mov 16($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adox	%rbp, %rbx
	adcx	$out, %r10
	adcx	%rbp, %rbx

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov %r10, 24(%rsp)

#third iteration
	mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00	# mulx 40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx 48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	mulx	%rdx, %rax, $out
	adox	%rbp, %r9
	mov	24($inp), %rdx

	xor	%rcx, %rcx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	mov	%r11, 32(%rsp)
	mov	%r12, 40(%rsp)

#fourth iteration
	mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	mulx	%rdx, %rax, $out
	adox	%rbp, %r10
	mov	32($inp), %rdx

	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r14, %r14
	adcx	%rax, %r13
	adox	%rbp, %rbx
	adcx	$out, %r14
	adcx	%rbp, %rbx

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	mulx	%rdx, %rax, $out
	mov	40($inp), %rdx
	adox	%rbp, %r11

	xor	%rcx, %rcx
	adox	%r15, %r15
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r8, %r8
	adcx	%rax, %r15
	adox	%rbp, %rcx
	adcx	$out, %r8
	adcx	%rbp, %rcx

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx 48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx 56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	mulx	%rdx, %rax, $out
	adox	%rbp, %r12
	mov	48($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adcx	$out, %r10
	adox	%rbp, %rbx
	adcx	%rbp, %rbx

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx 56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	mulx	%rdx, %rax, $out
	xor	%rcx, %rcx
	mov	56($inp), %rdx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov %r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov %r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%rbp, %rbx
	adcx	%r13, %rax
	adcx	%rdx, %rbx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	movq	%rax, 112(%rsp)
	movq	%rbx, 120(%rsp)

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lsqr_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$`128+24+($win64?0xb0:0)`, %rsp
.cfi_adjust_cfa_offset	`128+24+($win64?0xb0:0)`
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0xa0(%rsp)
	movaps	%xmm7,0xb0(%rsp)
	movaps	%xmm8,0xc0(%rsp)
	movaps	%xmm9,0xd0(%rsp)
	movaps	%xmm10,0xe0(%rsp)
	movaps	%xmm11,0xf0(%rsp)
	movaps	%xmm12,0x100(%rsp)
	movaps	%xmm13,0x110(%rsp)
	movaps	%xmm14,0x120(%rsp)
	movaps	%xmm15,0x130(%rsp)
___
$code.=<<___;
.Lmul_gather4_body:
	movd	$pwr,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
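# (The gather below then touches all 16 table entries on every pass and
# picks the right one with the pcmpeqd-generated masks, so the memory
# access pattern is independent of $power; this is a cache-timing
# countermeasure.)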
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7

	movdqa	16*0($bp),%xmm8
	movdqa	16*1($bp),%xmm9
	movdqa	16*2($bp),%xmm10
	movdqa	16*3($bp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($bp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($bp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($bp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($bp),%xmm15
	leaq	128($bp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_gather
___
$code.=<<___;
	movq	%xmm8,%rbx

	movq	$n0, 128(%rsp)		# off-load arguments
	movq	$out, 128+8(%rsp)
	movq	$mod, 128+16(%rsp)

	movq	($ap), %rax
	movq	8($ap), %rcx
	mulq	%rbx			# 0 iteration
	movq	%rax, (%rsp)
	movq	%rcx, %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rsp), %rdi
	movl	\$7, %ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rbx

	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	movq	128+8(%rsp), $out
	movq	128+16(%rsp), %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
	movq	%xmm8,%rdx

	mov	$n0, 128(%rsp)		# off-load arguments
	mov	$out, 128+8(%rsp)
	mov	$mod, 128+16(%rsp)

	mulx	($ap), %rbx, %r8	# 0 iteration
	mov	%rbx, (%rsp)
	xor	%edi, %edi		# cf=0, of=0

	mulx	8($ap), %rax, %r9

	mulx	16($ap), %rbx, %r10
	adcx	%rax, %r8

	mulx	24($ap), %rax, %r11
	adcx	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adcx	%rax, %r10

	mulx	40($ap), %rax, %r13
	adcx	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adcx	%rax, %r12

	mulx	56($ap), %rax, %r15
	adcx	%rbx, %r13
	adcx	%rax, %r14
	.byte	0x67
	mov	%r8, %rbx
	adcx	%rdi, %r15		# %rdi is 0

	mov	\$-7, %rcx
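	# %rcx counts -7 up to -1, so the store to 64(%rsp,%rcx,8) in the
	# loop below walks the output slots 8(%rsp)..56(%rsp); limb 0 was
	# already stored at (%rsp) above.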
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rdx

	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx ($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx 24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx 48($ap), %rax, %r14
	adcx	%rax, %r13
	.byte	0x67
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	mov	%rbx, 64(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	%rdi, %r15
	mov	%r8, %rbx
	adcx	%rdi, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx_gather

	mov	%r8, 64(%rsp)
	mov	%r9, 64+8(%rsp)
	mov	%r10, 64+16(%rsp)
	mov	%r11, 64+24(%rsp)
	mov	%r12, 64+32(%rsp)
	mov	%r13, 64+40(%rsp)
	mov	%r14, 64+48(%rsp)
	mov	%r15, 64+56(%rsp)

	mov	128(%rsp), %rdx		# pull arguments
	mov	128+8(%rsp), $out
	mov	128+16(%rsp), %rbp

	mov	(%rsp), %r8
	mov	8(%rsp), %r9
	mov	16(%rsp), %r10
	mov	24(%rsp), %r11
	mov	32(%rsp), %r12
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
	movaps	0xa0-0xc8(%rax),%xmm6
	movaps	0xb0-0xc8(%rax),%xmm7
	movaps	0xc0-0xc8(%rax),%xmm8
	movaps	0xd0-0xc8(%rax),%xmm9
	movaps	0xe0-0xc8(%rax),%xmm10
	movaps	0xf0-0xc8(%rax),%xmm11
	movaps	0x100-0xc8(%rax),%xmm12
	movaps	0x110-0xc8(%rax),%xmm13
	movaps	0x120-0xc8(%rax),%xmm14
	movaps	0x130-0xc8(%rax),%xmm15
	lea	0xb0(%rax),%rax
___
$code.=<<___;
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_gather4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	mov	$pwr, $pwr
	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,8), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, 128*0($inp)	# scatter
	movq	%r9, 128*1($inp)
	movq	%r10, 128*2($inp)
	movq	%r11, 128*3($inp)
	movq	%r12, 128*4($inp)
	movq	%r13, 128*5($inp)
	movq	%r14, 128*6($inp)
	movq	%r15, 128*7($inp)
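	# The 128-byte stride interleaves the limbs of 16 consecutive
	# table entries; this is the layout rsaz_512_mul_gather4 expects
	# when it reads the table back 128 bytes at a time.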

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_scatter4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp		# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_by_one_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
#
# input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
# output: %r8-%r15
# clobbers: everything except %rbp and %rdi
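#
# One word-serial Montgomery pass per iteration: m = x[0]*n0 mod 2^64,
# then x = (x + m*mod) >> 64; after 8 passes the registers hold
# x*2^-512 mod mod, up to one final conditional subtraction done by the
# caller. Note that n0 is read from 128+8(%rsp) inside the function
# because `call' pushed a return address on top of the caller's
# 128(%rsp) slot.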
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
.cfi_startproc
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.cfi_endproc
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
# __rsaz_512_reducex
#
# input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
# output: %r8-%r15
# clobbers: everything except %rbp and %rdi
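#
# Unlike __rsaz_512_reduce, this variant expects the caller to have
# pulled n0 into %rdx beforehand (MULX takes its implicit multiplicand
# there), hence the commented-out load below.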
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
.cfi_startproc
	#movq	128+8(%rsp), %rdx	# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi		# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx 32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx 48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.cfi_endproc
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{	# __rsaz_512_subtract
# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
# output:
# clobbers: everything but %rdi, %rsi and %rbp
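#
# Branchless conditional subtraction: mask is 0 or all-ones (set via
# sbbq from the carry of the preceding addition), and the code adds
# (~mod+1)&mask, i.e. subtracts mod exactly when the mask is all-ones,
# without any data-dependent branch.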
1777 | $code.=<<___;
|
---|
1778 | .type __rsaz_512_subtract,\@abi-omnipotent
|
---|
1779 | .align 32
|
---|
1780 | __rsaz_512_subtract:
|
---|
1781 | .cfi_startproc
|
---|
1782 | movq %r8, ($out)
|
---|
1783 | movq %r9, 8($out)
|
---|
1784 | movq %r10, 16($out)
|
---|
1785 | movq %r11, 24($out)
|
---|
1786 | movq %r12, 32($out)
|
---|
1787 | movq %r13, 40($out)
|
---|
1788 | movq %r14, 48($out)
|
---|
1789 | movq %r15, 56($out)
|
---|
1790 |
|
---|
1791 | movq 0($mod), %r8
|
---|
1792 | movq 8($mod), %r9
|
---|
1793 | negq %r8
|
---|
1794 | notq %r9
|
---|
1795 | andq %rcx, %r8
|
---|
1796 | movq 16($mod), %r10
|
---|
1797 | andq %rcx, %r9
|
---|
1798 | notq %r10
|
---|
1799 | movq 24($mod), %r11
|
---|
1800 | andq %rcx, %r10
|
---|
1801 | notq %r11
|
---|
1802 | movq 32($mod), %r12
|
---|
1803 | andq %rcx, %r11
|
---|
1804 | notq %r12
|
---|
1805 | movq 40($mod), %r13
|
---|
1806 | andq %rcx, %r12
|
---|
1807 | notq %r13
|
---|
1808 | movq 48($mod), %r14
|
---|
1809 | andq %rcx, %r13
|
---|
1810 | notq %r14
|
---|
1811 | movq 56($mod), %r15
|
---|
1812 | andq %rcx, %r14
|
---|
1813 | notq %r15
|
---|
1814 | andq %rcx, %r15
|
---|
1815 |
|
---|
1816 | addq ($out), %r8
|
---|
1817 | adcq 8($out), %r9
|
---|
1818 | adcq 16($out), %r10
|
---|
1819 | adcq 24($out), %r11
|
---|
1820 | adcq 32($out), %r12
|
---|
1821 | adcq 40($out), %r13
|
---|
1822 | adcq 48($out), %r14
|
---|
1823 | adcq 56($out), %r15
|
---|
1824 |
|
---|
1825 | movq %r8, ($out)
|
---|
1826 | movq %r9, 8($out)
|
---|
1827 | movq %r10, 16($out)
|
---|
1828 | movq %r11, 24($out)
|
---|
1829 | movq %r12, 32($out)
|
---|
1830 | movq %r13, 40($out)
|
---|
1831 | movq %r14, 48($out)
|
---|
1832 | movq %r15, 56($out)
|
---|
1833 |
|
---|
1834 | ret
|
---|
1835 | .cfi_endproc
|
---|
1836 | .size __rsaz_512_subtract,.-__rsaz_512_subtract
|
---|
1837 | ___
|
---|
1838 | }
|
---|
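
# A reference sketch of the branch-free conditional subtraction above:
# the two's complement of the modulus is AND-ed with the mask and then
# added, so out = a - mod exactly when the mask is all ones, with the
# same instruction and memory-access sequence either way. (The lone
# negq supplies the "+1" of the two's complement in the bottom limb;
# it cannot carry further because an RSA modulus is odd.) Illustrative
# only, never called; arguments are Math::BigInt values.
use Math::BigInt;
sub cond_sub_ref {
	my ($a, $mod, $mask) = @_;			# $mask: 0 or 2^512-1
	my $w = Math::BigInt->new(1)->blsft(512);	# 2^512
	return ($a + (($w - $mod) & $mask)) % $w;	# a - mod iff mask set
}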
{	# __rsaz_512_mul
#
# input: %rsi - ap, %rbp - bp
# output:
# clobbers: everything
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
.cfi_startproc
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.cfi_endproc
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
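
# Reference sketch of the operand-scanning ("schoolbook") multiply that
# __rsaz_512_mul performs: eight rows of multiply-accumulate, retiring
# the lowest limb after each row just as the code above stores %r8
# through %rdi before reusing it. Plain Perl over Math::BigInt limbs,
# illustrative names, never called by this generator.
use Math::BigInt;
sub mul_ref {
	my ($a, $b) = @_;		# refs to 8 Math::BigInt limbs each
	my $m64 = Math::BigInt->new("0xffffffffffffffff");
	my @lo;				# retired low limbs (the stack above)
	my @acc = (0) x 8;		# running window (%r8-%r15)
	for my $j (0..7) {		# one row per b-limb, as .Loop_mul
		my $c = Math::BigInt->bzero();
		for my $i (0..7) {
			my $t = $acc[$i] + $a->[$i] * $b->[$j] + $c;
			$acc[$i] = $t & $m64;
			$c = $t >> 64;	# always fits one limb
		}
		push @lo, shift @acc;	# movq %r8, (%rdi)
		push @acc, $c;
	}
	return (@lo, @acc);	# 16-limb product, least significant first
}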
if ($addx) {
# __rsaz_512_mulx
#
# input: %rsi - ap, %rbp - bp
# output:
# clobbers: everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
.cfi_startproc
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.cfi_endproc
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
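
# Reference sketch of one .Loop_mulx row with the two carry chains made
# explicit: adcx folds the low halves of the partial products into the
# running limbs through CF while adox folds the high halves through OF,
# so the two recurrences never serialize on a single carry flag. This
# is illustrative Perl over Math::BigInt limbs, never called, and the
# helper name is made up for the sketch.
use Math::BigInt;
sub mulx_row_ref {
	my ($acc, $a, $bj) = @_;	# $acc, $a: 8 limbs; $bj: one b-limb (%rdx)
	my $m64 = Math::BigInt->new("0xffffffffffffffff");
	my @lo = map { ($a->[$_] * $bj) & $m64 } 0..7;	# mulx low halves
	my @hi = map { ($a->[$_] * $bj) >> 64  } 0..7;	# mulx high halves
	my ($cf, $of) = (0, 0);		# the two independent flag chains
	my $t = $acc->[0] + $lo[0];	# adcx %rax, %rbx
	my $out = $t & $m64;		# the limb retired to the stack
	$cf = $t >> 64;
	my @new;
	for my $i (0..6) {
		my $u = $hi[$i] + $acc->[$i+1] + $of;	# adox (OF chain)
		$of = $u >> 64;
		my $v = ($u & $m64) + $lo[$i+1] + $cf;	# adcx (CF chain)
		$cf = $v >> 64;
		$new[$i] = $v & $m64;
	}
	$new[7] = $hi[7] + $of + $cf;	# both chains drain into the top limb
	return ($out, @new);	# acc + a*bj == $out + (@new shifted up one limb)
}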
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
.cfi_startproc
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.cfi_endproc
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
.cfi_startproc
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub	\$0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.cfi_endproc
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}
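
# The scatter/gather pair keeps the table access pattern independent of
# the secret index: rsaz_512_scatter4 stores limb j of entry i at byte
# offset i*8 + j*128, and rsaz_512_gather4 touches every one of the 16
# entries on the way back, zeroing all but the selected one with the
# pcmpeqd-derived masks prepared above. Below is a branching reference
# sketch (the branch stands in for pcmpeqd and is exactly what a real
# constant-time version must avoid); plain Perl with native 64-bit
# limbs assumed, illustrative name, never called.
sub gather_ref {
	my ($tbl, $power) = @_;		# $tbl: ref to 16 entries of 8 limbs
	my @out = (0) x 8;
	for my $i (0..15) {
		my $mask = ($i == $power) ? 0xffffffffffffffff : 0;
		for my $j (0..7) {	# accumulate, as the por chain above
			$out[$j] |= $tbl->[$i][$j] & $mask;
		}
	}
	return @out;
}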

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#	CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub     rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";