#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", http://eprint.iacr.org/2011/239
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
#     IEEE Proceedings of 9th International Conference on Information
#     Technology: New Generations (ITNG 2012), 821-823 (2012).
# [3] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing
#     RSA1024 and RSA2048 on x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
#
# While the original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign). This is because the improvement for longer
# keys is not high enough to justify the effort; the highest measured
# gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# at the time of this writing!] Nor does this module implement a
# "monolithic" complete-exponentiation jumbo subroutine; it adheres to a
# more modular mixture of C and assembly. It is also optimized for
# processors other than the Intel Core family (see the table below for
# improvement coefficients).
#						<appro@openssl.org>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
#			----------------+---------------------------
# Opteron		+13%		|+5%		+20%
# Bulldozer		-0%		|-1%		+10%
# P4			+11%		|+7%		+8%
# Westmere		+5%		|+14%		+17%
# Sandy Bridge		+2%		|+12%		+29%
# Ivy Bridge		+1%		|+11%		+35%
# Haswell(**)		-0%		|+12%		+39%
# Atom			+13%		|+11%		+4%
# VIA Nano		+70%		|+9%		+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement;
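#
# Organization note: each exported routine below accumulates a 1024-bit
# intermediate in a 128-byte stack scratch area, then calls the shared
# helper __rsaz_512_reduce (or its MULX/ADCX/ADOX twin
# __rsaz_512_reducex) for word-wise Montgomery reduction; all but
# rsaz_512_mul_by_one finish with the constant-time conditional
# subtraction in __rsaz_512_subtract.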

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
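
# $addx is set when the assembler is known to support the ADCX/ADOX and
# MULX instructions (GNU as 2.23+, NASM 2.10+, MASM 12+, clang 3.3+);
# only then are the faster "x" code paths below generated at all.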

($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lsqr_body:
	movq	$mod, %xmm1		# common off-load
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
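	# Stack layout: 0..127(%rsp) holds the 16-limb intermediate
	# result, 128(%rsp) the Montgomery constant $n0, and
	# 128+8(%rsp) the remaining iteration count $times.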
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
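	# Each of the eight iterations below computes the cross
	# products of one input limb with the higher limbs, doubles the
	# two finished columns (cross products count twice in a
	# square), adds the diagonal term a[i]^2 via mulq %rax, and
	# stores two more limbs of the result to the stack.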
	movl	$times,128+8(%rsp)
#first iteration
	movq	%rdx, %rbx		# 0($inp)
	mov	%rax, %rbp		# 8($inp)
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	adcq	\$0, %rdx

	xorq	%rcx,%rcx		# rcx:r8 = r8 << 1
	addq	%r8, %r8
	movq	%rdx, %r15
	adcq	\$0, %rcx

	mulq	%rax
	addq	%r8, %rdx
	adcq	\$0, %rcx

	movq	%rax, (%rsp)
	movq	%rdx, 8(%rsp)

#second iteration
	movq	16($inp), %rax
	mulq	%rbp
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r15
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r8
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
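	# (the low limb of a square is 0, 1 or 4 mod 8, so it is at
	# most 0xFFFF..F9; adding a carry of at most 2 therefore
	# cannot wrap around)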
	addq	%rcx, %rax
	movq	16($inp), %rbp
	addq	%rax, %r9
	movq	24($inp), %rax
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)

#third iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r8
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r9
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	movq	24($inp), %r10
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)

#fourth iteration
	mov	%rax, %r11		# 32($inp)
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %r12		# 40($inp)
	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %rbp		# 48($inp)
	mulq	%r10
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r13:r14 = r13:r14 << 1
	addq	%r13, %r13
	movq	%rdx, %r10
	adcq	%r14, %r14
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r13
	movq	%r12, %rax		# 40($inp)
	adcq	%rdx, %r14
	adcq	\$0, %rbx

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)

#fifth iteration
	mulq	%r11
	addq	%rax, %r8
	movq	%rbp, %rax		# 48($inp)
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mov	%rax, %r14		# 56($inp)
	mulq	%r11
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r8:r15 = r8:r15 << 1
	addq	%r15, %r15
	movq	%rdx, %r11
	adcq	%r8, %r8
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r15
	movq	%rbp, %rax		# 48($inp)
	adcq	%rdx, %r8
	adcq	\$0, %rcx

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)

#sixth iteration
	mulq	%r12
	addq	%rax, %r10
	movq	%r14, %rax		# 56($inp)
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r12
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r9
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	%rbp, %rax
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r13
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r11
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	xorq	%rbx, %rbx		# rbx:r13 = r13 << 1
	addq	%r13, %r13
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%r13, %rax
	adcq	%rbx, %rdx

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15
	movq	%xmm1, %rbp

	movq	%rax, 112(%rsp)
	movq	%rdx, 120(%rsp)

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
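	# MULX leaves the flags untouched, while ADCX and ADOX update
	# only CF and OF respectively; the loop below therefore runs
	# two independent carry chains through the same instruction
	# stream instead of serializing on a single carry flag.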
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
#first iteration
	mulx	%rax, %r8, %r9
	mov	%rax, %rbx

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($inp), %rcx, %r12
	adcx	%rax, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00	# mulx	40($inp), %rax, %r13
	adcx	%rcx, %r11

	mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mulx	%rdx, %rax, $out
	mov	%rbx, %rdx		# 8($inp)
	xor	%rcx, %rcx
	adox	%r8, %r8
	adcx	$out, %r8
	adox	%rbp, %rcx
	adcx	%rbp, %rcx

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00	# mulx	16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	mulx	24($inp), $out, %r8
	adox	$out, %r11
	.byte	0x66
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	mulx	%rdx, %rax, $out
	adox	%rbp, %r8
	.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00		# mov	16($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adox	%rbp, %rbx
	adcx	$out, %r10
	adcx	%rbp, %rbx

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)

#third iteration
	mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	mulx	%rdx, %rax, $out
	adox	%rbp, %r9
	mov	24($inp), %rdx

	xor	%rcx, %rcx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	mov	%r11, 32(%rsp)
	mov	%r12, 40(%rsp)

#fourth iteration
	mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	mulx	%rdx, %rax, $out
	adox	%rbp, %r10
	mov	32($inp), %rdx

	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r14, %r14
	adcx	%rax, %r13
	adox	%rbp, %rbx
	adcx	$out, %r14
	adcx	%rbp, %rbx

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	mulx	%rdx, %rax, $out
	mov	40($inp), %rdx
	adox	%rbp, %r11

	xor	%rcx, %rcx
	adox	%r15, %r15
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r8, %r8
	adcx	%rax, %r15
	adox	%rbp, %rcx
	adcx	$out, %r8
	adcx	%rbp, %rcx

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	mulx	%rdx, %rax, $out
	adox	%rbp, %r12
	mov	48($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adcx	$out, %r10
	adox	%rbp, %rbx
	adcx	%rbp, %rbx

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	mulx	%rdx, %rax, $out
	xor	%rcx, %rcx
	mov	56($inp), %rdx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%rbp, %rbx
	adcx	%r13, %rax
	adcx	%rdx, %rbx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	movq	%rax, 112(%rsp)
	movq	%rbx, 120(%rsp)

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lsqr_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$`128+24+($win64?0xb0:0)`, %rsp
.cfi_adjust_cfa_offset	`128+24+($win64?0xb0:0)`
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0xa0(%rsp)
	movaps	%xmm7,0xb0(%rsp)
	movaps	%xmm8,0xc0(%rsp)
	movaps	%xmm9,0xd0(%rsp)
	movaps	%xmm10,0xe0(%rsp)
	movaps	%xmm11,0xf0(%rsp)
	movaps	%xmm12,0x100(%rsp)
	movaps	%xmm13,0x110(%rsp)
	movaps	%xmm14,0x120(%rsp)
	movaps	%xmm15,0x130(%rsp)
___
$code.=<<___;
.Lmul_gather4_body:
	movd	$pwr,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
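# The .Linc constants seed per-dword indices which the paddd chain below
# advances so that, across %xmm0..%xmm7, the sixteen 64-bit lanes hold
# the values 0..15; pcmpeqd then turns exactly the lane equal to the
# requested power into an all-ones mask. Every one of the 16 table
# entries is read and ANDed with its mask on each pass, so the gather
# touches the same cache lines regardless of the index and leaks
# nothing through the memory access pattern.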
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7

	movdqa	16*0($bp),%xmm8
	movdqa	16*1($bp),%xmm9
	movdqa	16*2($bp),%xmm10
	movdqa	16*3($bp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($bp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($bp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($bp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($bp),%xmm15
	leaq	128($bp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
	je	.Lmulx_gather
___
$code.=<<___;
	movq	%xmm8,%rbx

	movq	$n0, 128(%rsp)		# off-load arguments
	movq	$out, 128+8(%rsp)
	movq	$mod, 128+16(%rsp)

	movq	($ap), %rax
	movq	8($ap), %rcx
	mulq	%rbx			# 0 iteration
	movq	%rax, (%rsp)
	movq	%rcx, %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rsp), %rdi
	movl	\$7, %ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rbx

	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	movq	128+8(%rsp), $out
	movq	128+16(%rsp), %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
	movq	%xmm8,%rdx

	mov	$n0, 128(%rsp)		# off-load arguments
	mov	$out, 128+8(%rsp)
	mov	$mod, 128+16(%rsp)

	mulx	($ap), %rbx, %r8	# 0 iteration
	mov	%rbx, (%rsp)
	xor	%edi, %edi		# cf=0, of=0

	mulx	8($ap), %rax, %r9

	mulx	16($ap), %rbx, %r10
	adcx	%rax, %r8

	mulx	24($ap), %rax, %r11
	adcx	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adcx	%rax, %r10

	mulx	40($ap), %rax, %r13
	adcx	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adcx	%rax, %r12

	mulx	56($ap), %rax, %r15
	adcx	%rbx, %r13
	adcx	%rax, %r14
	.byte	0x67
	mov	%r8, %rbx
	adcx	%rdi, %r15		# %rdi is 0

	mov	\$-7, %rcx
	jmp	.Loop_mulx_gather
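	# %rcx runs from -7 up to -1, so 64(%rsp,%rcx,8) in the loop
	# stores finished limbs tp[1]..tp[7]; INC serves as the loop
	# counter because, unlike ADD, it leaves CF (one of the two
	# live carry chains) untouched.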

.align	32
.Loop_mulx_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rdx

	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	.byte	0x67
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	mov	%rbx, 64(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	%rdi, %r15
	mov	%r8, %rbx
	adcx	%rdi, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx_gather

	mov	%r8, 64(%rsp)
	mov	%r9, 64+8(%rsp)
	mov	%r10, 64+16(%rsp)
	mov	%r11, 64+24(%rsp)
	mov	%r12, 64+32(%rsp)
	mov	%r13, 64+40(%rsp)
	mov	%r14, 64+48(%rsp)
	mov	%r15, 64+56(%rsp)

	mov	128(%rsp), %rdx		# pull arguments
	mov	128+8(%rsp), $out
	mov	128+16(%rsp), %rbp

	mov	(%rsp), %r8
	mov	8(%rsp), %r9
	mov	16(%rsp), %r10
	mov	24(%rsp), %r11
	mov	32(%rsp), %r12
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
	movaps	0xa0-0xc8(%rax),%xmm6
	movaps	0xb0-0xc8(%rax),%xmm7
	movaps	0xc0-0xc8(%rax),%xmm8
	movaps	0xd0-0xc8(%rax),%xmm9
	movaps	0xe0-0xc8(%rax),%xmm10
	movaps	0xf0-0xc8(%rax),%xmm11
	movaps	0x100-0xc8(%rax),%xmm12
	movaps	0x110-0xc8(%rax),%xmm13
	movaps	0x120-0xc8(%rax),%xmm14
	movaps	0x130-0xc8(%rax),%xmm15
	lea	0xb0(%rax),%rax
___
$code.=<<___;
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_gather4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	mov	$pwr, $pwr
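	# (writing the 32-bit $pwr to itself zero-extends it, clearing
	# the upper half of %r9 before it is used as an index below)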
	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,8), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, 128*0($inp)	# scatter
	movq	%r9, 128*1($inp)
	movq	%r10, 128*2($inp)
	movq	%r11, 128*3($inp)
	movq	%r12, 128*4($inp)
	movq	%r13, 128*5($inp)
	movq	%r14, 128*6($inp)
	movq	%r15, 128*7($inp)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_scatter4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp	# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
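	# mul_by_one computes a*R^-1 mod m, i.e. the conversion out of
	# Montgomery form: the high half of the notional product a*1 is
	# zero, so a single Montgomery reduction of the loaded limbs
	# suffices and the usual upper-half addition is skipped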
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADCX/ADOX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_by_one_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
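	#
	# This is word-by-word Montgomery reduction: each of the eight
	# iterations picks the factor %rbx = t[0]*n0 mod 2^64 so that
	# t[0] + %rbx*mod[0] is divisible by 2^64, accumulates %rbx*mod
	# into t[], and retires the now-zero low limb by shifting the
	# register window up by one limb.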
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
.cfi_startproc
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
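	# n0 was stored at 128(%rsp) by the caller; the extra +8 skips
	# the return address pushed by the call into this function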
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.cfi_endproc
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
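	#
	# Same reduction as above, but with MULX keeping the factor in
	# %rdx and ADCX/ADOX driving two carry chains; the next
	# iteration's factor is computed mid-loop while %rdx is
	# temporarily swapped to the new t[0].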
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
.cfi_startproc
	#movq	128+8(%rsp), %rdx	# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi		# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.cfi_endproc
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{	# __rsaz_512_subtract
	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers: everything but %rdi, %rsi and %rbp
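	#
	# The mask in %rcx is 0 or all-ones (the borrow from the
	# preceding addition chain); the code adds (mask & -mod) to the
	# stored result, so the modulus is subtracted exactly when the
	# sum overflowed, without a data-dependent branch.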
1779 | $code.=<<___;
|
---|
1780 | .type __rsaz_512_subtract,\@abi-omnipotent
|
---|
1781 | .align 32
|
---|
1782 | __rsaz_512_subtract:
|
---|
1783 | .cfi_startproc
|
---|
1784 | movq %r8, ($out)
|
---|
1785 | movq %r9, 8($out)
|
---|
1786 | movq %r10, 16($out)
|
---|
1787 | movq %r11, 24($out)
|
---|
1788 | movq %r12, 32($out)
|
---|
1789 | movq %r13, 40($out)
|
---|
1790 | movq %r14, 48($out)
|
---|
1791 | movq %r15, 56($out)
|
---|
1792 |
|
---|
1793 | movq 0($mod), %r8
|
---|
1794 | movq 8($mod), %r9
|
---|
1795 | negq %r8
|
---|
1796 | notq %r9
|
---|
1797 | andq %rcx, %r8
|
---|
1798 | movq 16($mod), %r10
|
---|
1799 | andq %rcx, %r9
|
---|
1800 | notq %r10
|
---|
1801 | movq 24($mod), %r11
|
---|
1802 | andq %rcx, %r10
|
---|
1803 | notq %r11
|
---|
1804 | movq 32($mod), %r12
|
---|
1805 | andq %rcx, %r11
|
---|
1806 | notq %r12
|
---|
1807 | movq 40($mod), %r13
|
---|
1808 | andq %rcx, %r12
|
---|
1809 | notq %r13
|
---|
1810 | movq 48($mod), %r14
|
---|
1811 | andq %rcx, %r13
|
---|
1812 | notq %r14
|
---|
1813 | movq 56($mod), %r15
|
---|
1814 | andq %rcx, %r14
|
---|
1815 | notq %r15
|
---|
1816 | andq %rcx, %r15
|
---|
1817 |
|
---|
1818 | addq ($out), %r8
|
---|
1819 | adcq 8($out), %r9
|
---|
1820 | adcq 16($out), %r10
|
---|
1821 | adcq 24($out), %r11
|
---|
1822 | adcq 32($out), %r12
|
---|
1823 | adcq 40($out), %r13
|
---|
1824 | adcq 48($out), %r14
|
---|
1825 | adcq 56($out), %r15
|
---|
1826 |
|
---|
1827 | movq %r8, ($out)
|
---|
1828 | movq %r9, 8($out)
|
---|
1829 | movq %r10, 16($out)
|
---|
1830 | movq %r11, 24($out)
|
---|
1831 | movq %r12, 32($out)
|
---|
1832 | movq %r13, 40($out)
|
---|
1833 | movq %r14, 48($out)
|
---|
1834 | movq %r15, 56($out)
|
---|
1835 |
|
---|
1836 | ret
|
---|
1837 | .cfi_endproc
|
---|
1838 | .size __rsaz_512_subtract,.-__rsaz_512_subtract
|
---|
1839 | ___
|
---|
1840 | }
|
---|
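# __rsaz_512_subtract runs the same instruction sequence whether or not
# the subtraction happens: it adds (-mod) & mask to the result, with mask
# either all-ones or all-zero.  Because an RSA modulus is odd, -mod is
# simply NEG of limb 0 and NOT of limbs 1..7 (no carry can ripple up),
# so the masking needs no branch.  Equivalent C sketch (hypothetical
# helper, not this module's interface):
#
#	#include <stdint.h>
#
#	/* if (mask) r -= n;  mask must be 0 or ~0, n[0] must be odd */
#	static void cond_sub_512(uint64_t r[8], const uint64_t n[8],
#	                         uint64_t mask)
#	{
#		unsigned __int128 acc = 0;
#		for (int i = 0; i < 8; i++) {
#			uint64_t neg = (i ? ~n[i] : 0 - n[0]) & mask;
#			acc += (unsigned __int128)r[i] + neg;
#			r[i] = (uint64_t)acc;
#			acc >>= 64;
#		}
#		/* carry out of the top limb is dropped, as in the asm */
#	}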
{ # __rsaz_512_mul
#
# input: %rsi - ap, %rbp - bp
# output:
# clobbers: everything
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
.cfi_startproc
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.cfi_endproc
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
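# __rsaz_512_mul is plain operand-scanning ("schoolbook") multiplication:
# the unrolled prologue computes a*bp[0], then .Loop_mul makes seven more
# passes, each accumulating a*bp[k] one limb higher and retiring the
# lowest limb to the stack.  The same schedule in C (hypothetical helper,
# not this module's interface):
#
#	#include <stdint.h>
#
#	/* ret[0..15] = a[0..7] * b[0..7] */
#	static void mul_512(uint64_t ret[16], const uint64_t a[8],
#	                    const uint64_t b[8])
#	{
#		for (int i = 0; i < 16; i++) ret[i] = 0;
#		for (int k = 0; k < 8; k++) {	/* one pass per limb of b */
#			uint64_t carry = 0;
#			for (int i = 0; i < 8; i++) {
#				unsigned __int128 acc =
#				    (unsigned __int128)a[i]*b[k]
#				    + ret[k+i] + carry;
#				ret[k+i] = (uint64_t)acc;
#				carry = (uint64_t)(acc >> 64);
#			}
#			ret[k+8] = carry;
#		}
#	}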
if ($addx) {
# __rsaz_512_mulx
#
# input: %rsi - ap, %rbp - bp
# output:
# clobbers: everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
.cfi_startproc
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.cfi_endproc
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
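# __rsaz_512_mulx has the same overall schedule as __rsaz_512_mul, but
# each .Loop_mulx pass is the dual-carry-chain row from the mac_row
# sketch after __rsaz_512_reducex.  The counter runs from -6 up to zero
# so one register both indexes bp (64($bp,%rcx,8)) and terminates the
# loop, and INC leaves CF intact (and, for this range, OF clear) so the
# chains survive the loop edge.  In terms of that mac_row sketch
# (hypothetical, for illustration only):
#
#	/* ret[0..15] = a[0..7] * b[0..7], one mac_row pass per limb of
#	 * b; the lowest limb of the window is final after each pass and
#	 * is streamed out, mirroring the movq to the stack above */
#	static void mulx_512(unsigned long long ret[16],
#	                     const unsigned long long a[8],
#	                     const unsigned long long b[8])
#	{
#		unsigned long long acc[9] = { 0 };
#		for (int k = 0; k < 8; k++) {
#			mac_row(acc, a, b[k]);		/* acc += a*b[k] */
#			ret[k] = acc[0];
#			for (int i = 0; i < 8; i++)	/* slide the window */
#				acc[i] = acc[i+1];
#			acc[8] = 0;
#		}
#		for (int i = 0; i < 8; i++)		/* top half */
#			ret[8+i] = acc[i];
#	}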
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
.cfi_startproc
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.cfi_endproc
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
.cfi_startproc
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub	$0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps	%xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps	%xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps	%xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps	%xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps	%xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps	%xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps	%xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps	%xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps	%xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps	%xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.cfi_endproc
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}
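# rsaz_512_scatter4 interleaves the table so that limb i of entry j sits
# at byte offset i*128 + j*8, and rsaz_512_gather4 reads *all* sixteen
# entries per limb and masks away the fifteen it does not want, so the
# load addresses never depend on the secret window value.  The same
# access pattern in C (hypothetical helper, not this module's interface):
#
#	#include <stdint.h>
#
#	/* out = table entry `power` (0..15); tbl laid out as scatter4
#	 * writes it: limb i of entry j at tbl[16*i + j] */
#	static void gather_512(uint64_t out[8], const uint64_t *tbl,
#	                       unsigned power)
#	{
#		for (int i = 0; i < 8; i++) {
#			uint64_t acc = 0;
#			for (unsigned j = 0; j < 16; j++) {
#				uint64_t mask = 0 - (uint64_t)(j == power);
#				acc |= tbl[16*i + j] & mask; /* all 16 read */
#			}
#			out[i] = acc;
#		}
#	}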

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";