VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/ec/asm/ecp_nistz256-x86_64.pl@91772

Last change on this file: r91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4# Copyright (c) 2015 CloudFlare, Inc.
5#
6# Licensed under the OpenSSL license (the "License"). You may not use
7# this file except in compliance with the License. You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10#
11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12# (1) Intel Corporation, Israel Development Center, Haifa, Israel
13# (2) University of Haifa, Israel
14# (3) CloudFlare, Inc.
15#
16# Reference:
17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
18# 256 Bit Primes"
19
20# Further optimization by <[email protected]>:
21#
22#                     this/original   with/without -DECP_NISTZ256_ASM(*)
23# Opteron             +15-49%         +150-195%
24# Bulldozer           +18-45%         +175-240%
25# P4                  +24-46%         +100-150%
26# Westmere            +18-34%         +87-160%
27# Sandy Bridge        +14-35%         +120-185%
28# Ivy Bridge          +11-35%         +125-180%
29# Haswell             +10-37%         +160-200%
30# Broadwell           +24-58%         +210-270%
31# Atom                +20-50%         +180-240%
32# VIA Nano            +50-160%        +480-480%
33#
34# (*) "without -DECP_NISTZ256_ASM" refers to build with
35# "enable-ec_nistp_64_gcc_128";
36#
37# Ranges denote minimum and maximum improvement coefficients depending
38# on benchmark. In "this/original" column lower coefficient is for
39# ECDSA sign, while in "with/without" - for ECDH key agreement, and
40# higher - for ECDSA sign, relatively fastest server-side operation.
41# Keep in mind that +100% means 2x improvement.
42
43$flavour = shift;
44$output = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
55*STDOUT=*OUT;
56
57if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
58 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
59 $avx = ($1>=2.19) + ($1>=2.22);
60 $addx = ($1>=2.23);
61}
62
63if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65 $avx = ($1>=2.09) + ($1>=2.10);
66 $addx = ($1>=2.10);
67}
68
69if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71 $avx = ($1>=10) + ($1>=11);
72 $addx = ($1>=12);
73}
74
75if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
76 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
77 $avx = ($ver>=3.0) + ($ver>=3.01);
78 $addx = ($ver>=3.03);
79}
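# $avx and $addx gate the AVX2 gather code and the ADCX/ADOX/MULX (ADX+BMI2)
# code paths respectively; they are set here only if the assembler in use is
# new enough to encode those instructions.  Whether the CPU actually has the
# extensions is still decided at run time through OPENSSL_ia32cap_P (the
# 0x80100 checks further down).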
80
81$code.=<<___;
82.text
83.extern OPENSSL_ia32cap_P
84
85# The polynomial
86.align 64
87.Lpoly:
88.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
89
90# 2^512 mod P precomputed for NIST P256 polynomial
91.LRR:
92.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
93
94.LOne:
95.long 1,1,1,1,1,1,1,1
96.LTwo:
97.long 2,2,2,2,2,2,2,2
98.LThree:
99.long 3,3,3,3,3,3,3,3
100.LONE_mont:
101.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
102
103# Constants for computations modulo ord(p256)
104.Lord:
105.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
106.LordK:
107.quad 0xccd1c8aaee00bc4f
108___
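# A quick sanity check of the constants above (an illustrative sketch, not
# part of this file) can be done with core Perl's Math::BigInt:
#
#   use Math::BigInt;
#   my $p   = Math::BigInt->new('0xffffffff00000001000000000000000000000000'
#                              .'ffffffffffffffffffffffff');    # .Lpoly
#   my $rr  = Math::BigInt->new(2)->bpow(512)->bmod($p);        # == .LRR
#   my $one = Math::BigInt->new(2)->bpow(256)->bmod($p);        # == .LONE_mont
#   my $r64 = Math::BigInt->new(2)->bpow(64);
#   my $k   = $r64->copy->bsub(Math::BigInt->new('0xf3b9cac2fc632551')
#                              ->bmodinv($r64));                # == .LordK
#
# i.e. .LRR is the Montgomery factor R^2 mod p with R = 2^256, .LONE_mont is
# 1 in Montgomery form, and .LordK is -ord(p256)^-1 mod 2^64, the per-word
# constant used by the ord_* reduction steps below.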
109
110{
111################################################################################
112# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
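# The pattern below recurs in mul_by_2, mul_by_3 and add: each addition step
# accumulates into $a0..$a3 with the carry kept in $t4, the modulus is then
# subtracted once, and cmovc restores the pre-subtraction copy ($t0..$t3) if
# that subtraction borrowed.  With fully reduced inputs (< p) every such step
# yields a value below 2p, so a single conditional subtraction per step is
# enough to get back into [0, p) without data-dependent branches.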
113
114my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
115my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
116my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
117
118$code.=<<___;
119
120.globl ecp_nistz256_mul_by_2
121.type ecp_nistz256_mul_by_2,\@function,2
122.align 64
123ecp_nistz256_mul_by_2:
124.cfi_startproc
125 push %r12
126.cfi_push %r12
127 push %r13
128.cfi_push %r13
129.Lmul_by_2_body:
130
131 mov 8*0($a_ptr), $a0
132 xor $t4,$t4
133 mov 8*1($a_ptr), $a1
134 add $a0, $a0 # a0:a3+a0:a3
135 mov 8*2($a_ptr), $a2
136 adc $a1, $a1
137 mov 8*3($a_ptr), $a3
138 lea .Lpoly(%rip), $a_ptr
139 mov $a0, $t0
140 adc $a2, $a2
141 adc $a3, $a3
142 mov $a1, $t1
143 adc \$0, $t4
144
145 sub 8*0($a_ptr), $a0
146 mov $a2, $t2
147 sbb 8*1($a_ptr), $a1
148 sbb 8*2($a_ptr), $a2
149 mov $a3, $t3
150 sbb 8*3($a_ptr), $a3
151 sbb \$0, $t4
152
153 cmovc $t0, $a0
154 cmovc $t1, $a1
155 mov $a0, 8*0($r_ptr)
156 cmovc $t2, $a2
157 mov $a1, 8*1($r_ptr)
158 cmovc $t3, $a3
159 mov $a2, 8*2($r_ptr)
160 mov $a3, 8*3($r_ptr)
161
162 mov 0(%rsp),%r13
163.cfi_restore %r13
164 mov 8(%rsp),%r12
165.cfi_restore %r12
166 lea 16(%rsp),%rsp
167.cfi_adjust_cfa_offset -16
168.Lmul_by_2_epilogue:
169 ret
170.cfi_endproc
171.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
172
173################################################################################
174# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
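# Halving relies on p being odd: a+p is even whenever a is odd.  The code
# adds p unconditionally, then tests the low bit of the original value and,
# if it was even, cmovz puts the un-added copy back.  The selected 5-word
# value (top bit in $t4) is then shifted right by one, giving a*2^-1 mod p
# for any fully reduced input.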
175.globl ecp_nistz256_div_by_2
176.type ecp_nistz256_div_by_2,\@function,2
177.align 32
178ecp_nistz256_div_by_2:
179.cfi_startproc
180 push %r12
181.cfi_push %r12
182 push %r13
183.cfi_push %r13
184.Ldiv_by_2_body:
185
186 mov 8*0($a_ptr), $a0
187 mov 8*1($a_ptr), $a1
188 mov 8*2($a_ptr), $a2
189 mov $a0, $t0
190 mov 8*3($a_ptr), $a3
191 lea .Lpoly(%rip), $a_ptr
192
193 mov $a1, $t1
194 xor $t4, $t4
195 add 8*0($a_ptr), $a0
196 mov $a2, $t2
197 adc 8*1($a_ptr), $a1
198 adc 8*2($a_ptr), $a2
199 mov $a3, $t3
200 adc 8*3($a_ptr), $a3
201 adc \$0, $t4
202 xor $a_ptr, $a_ptr # borrow $a_ptr
203 test \$1, $t0
204
205 cmovz $t0, $a0
206 cmovz $t1, $a1
207 cmovz $t2, $a2
208 cmovz $t3, $a3
209 cmovz $a_ptr, $t4
210
211 mov $a1, $t0 # a0:a3>>1
212 shr \$1, $a0
213 shl \$63, $t0
214 mov $a2, $t1
215 shr \$1, $a1
216 or $t0, $a0
217 shl \$63, $t1
218 mov $a3, $t2
219 shr \$1, $a2
220 or $t1, $a1
221 shl \$63, $t2
222 shr \$1, $a3
223 shl \$63, $t4
224 or $t2, $a2
225 or $t4, $a3
226
227 mov $a0, 8*0($r_ptr)
228 mov $a1, 8*1($r_ptr)
229 mov $a2, 8*2($r_ptr)
230 mov $a3, 8*3($r_ptr)
231
232 mov 0(%rsp),%r13
233.cfi_restore %r13
234 mov 8(%rsp),%r12
235.cfi_restore %r12
236 lea 16(%rsp),%rsp
237.cfi_adjust_cfa_offset -16
238.Ldiv_by_2_epilogue:
239 ret
240.cfi_endproc
241.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
242
243################################################################################
244# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
245.globl ecp_nistz256_mul_by_3
246.type ecp_nistz256_mul_by_3,\@function,2
247.align 32
248ecp_nistz256_mul_by_3:
249.cfi_startproc
250 push %r12
251.cfi_push %r12
252 push %r13
253.cfi_push %r13
254.Lmul_by_3_body:
255
256 mov 8*0($a_ptr), $a0
257 xor $t4, $t4
258 mov 8*1($a_ptr), $a1
259 add $a0, $a0 # a0:a3+a0:a3
260 mov 8*2($a_ptr), $a2
261 adc $a1, $a1
262 mov 8*3($a_ptr), $a3
263 mov $a0, $t0
264 adc $a2, $a2
265 adc $a3, $a3
266 mov $a1, $t1
267 adc \$0, $t4
268
269 sub \$-1, $a0
270 mov $a2, $t2
271 sbb .Lpoly+8*1(%rip), $a1
272 sbb \$0, $a2
273 mov $a3, $t3
274 sbb .Lpoly+8*3(%rip), $a3
275 sbb \$0, $t4
276
277 cmovc $t0, $a0
278 cmovc $t1, $a1
279 cmovc $t2, $a2
280 cmovc $t3, $a3
281
282 xor $t4, $t4
283 add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3]
284 adc 8*1($a_ptr), $a1
285 mov $a0, $t0
286 adc 8*2($a_ptr), $a2
287 adc 8*3($a_ptr), $a3
288 mov $a1, $t1
289 adc \$0, $t4
290
291 sub \$-1, $a0
292 mov $a2, $t2
293 sbb .Lpoly+8*1(%rip), $a1
294 sbb \$0, $a2
295 mov $a3, $t3
296 sbb .Lpoly+8*3(%rip), $a3
297 sbb \$0, $t4
298
299 cmovc $t0, $a0
300 cmovc $t1, $a1
301 mov $a0, 8*0($r_ptr)
302 cmovc $t2, $a2
303 mov $a1, 8*1($r_ptr)
304 cmovc $t3, $a3
305 mov $a2, 8*2($r_ptr)
306 mov $a3, 8*3($r_ptr)
307
308 mov 0(%rsp),%r13
309.cfi_restore %r13
310 mov 8(%rsp),%r12
311.cfi_restore %r12
312 lea 16(%rsp),%rsp
313.cfi_adjust_cfa_offset -16
314.Lmul_by_3_epilogue:
315 ret
316.cfi_endproc
317.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
318
319################################################################################
320# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
321.globl ecp_nistz256_add
322.type ecp_nistz256_add,\@function,3
323.align 32
324ecp_nistz256_add:
325.cfi_startproc
326 push %r12
327.cfi_push %r12
328 push %r13
329.cfi_push %r13
330.Ladd_body:
331
332 mov 8*0($a_ptr), $a0
333 xor $t4, $t4
334 mov 8*1($a_ptr), $a1
335 mov 8*2($a_ptr), $a2
336 mov 8*3($a_ptr), $a3
337 lea .Lpoly(%rip), $a_ptr
338
339 add 8*0($b_ptr), $a0
340 adc 8*1($b_ptr), $a1
341 mov $a0, $t0
342 adc 8*2($b_ptr), $a2
343 adc 8*3($b_ptr), $a3
344 mov $a1, $t1
345 adc \$0, $t4
346
347 sub 8*0($a_ptr), $a0
348 mov $a2, $t2
349 sbb 8*1($a_ptr), $a1
350 sbb 8*2($a_ptr), $a2
351 mov $a3, $t3
352 sbb 8*3($a_ptr), $a3
353 sbb \$0, $t4
354
355 cmovc $t0, $a0
356 cmovc $t1, $a1
357 mov $a0, 8*0($r_ptr)
358 cmovc $t2, $a2
359 mov $a1, 8*1($r_ptr)
360 cmovc $t3, $a3
361 mov $a2, 8*2($r_ptr)
362 mov $a3, 8*3($r_ptr)
363
364 mov 0(%rsp),%r13
365.cfi_restore %r13
366 mov 8(%rsp),%r12
367.cfi_restore %r12
368 lea 16(%rsp),%rsp
369.cfi_adjust_cfa_offset -16
370.Ladd_epilogue:
371 ret
372.cfi_endproc
373.size ecp_nistz256_add,.-ecp_nistz256_add
374
375################################################################################
376# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
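# Here the reduction goes the other way round: after a-b the borrow is
# recorded in $t4, p is then added back unconditionally, and cmovz keeps the
# un-adjusted difference (saved in $t0..$t3) when $t4 is zero, i.e. when the
# subtraction did not borrow.  ecp_nistz256_neg below applies the same trick
# to 0-a.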
377.globl ecp_nistz256_sub
378.type ecp_nistz256_sub,\@function,3
379.align 32
380ecp_nistz256_sub:
381.cfi_startproc
382 push %r12
383.cfi_push %r12
384 push %r13
385.cfi_push %r13
386.Lsub_body:
387
388 mov 8*0($a_ptr), $a0
389 xor $t4, $t4
390 mov 8*1($a_ptr), $a1
391 mov 8*2($a_ptr), $a2
392 mov 8*3($a_ptr), $a3
393 lea .Lpoly(%rip), $a_ptr
394
395 sub 8*0($b_ptr), $a0
396 sbb 8*1($b_ptr), $a1
397 mov $a0, $t0
398 sbb 8*2($b_ptr), $a2
399 sbb 8*3($b_ptr), $a3
400 mov $a1, $t1
401 sbb \$0, $t4
402
403 add 8*0($a_ptr), $a0
404 mov $a2, $t2
405 adc 8*1($a_ptr), $a1
406 adc 8*2($a_ptr), $a2
407 mov $a3, $t3
408 adc 8*3($a_ptr), $a3
409 test $t4, $t4
410
411 cmovz $t0, $a0
412 cmovz $t1, $a1
413 mov $a0, 8*0($r_ptr)
414 cmovz $t2, $a2
415 mov $a1, 8*1($r_ptr)
416 cmovz $t3, $a3
417 mov $a2, 8*2($r_ptr)
418 mov $a3, 8*3($r_ptr)
419
420 mov 0(%rsp),%r13
421.cfi_restore %r13
422 mov 8(%rsp),%r12
423.cfi_restore %r12
424 lea 16(%rsp),%rsp
425.cfi_adjust_cfa_offset -16
426.Lsub_epilogue:
427 ret
428.cfi_endproc
429.size ecp_nistz256_sub,.-ecp_nistz256_sub
430
431################################################################################
432# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
433.globl ecp_nistz256_neg
434.type ecp_nistz256_neg,\@function,2
435.align 32
436ecp_nistz256_neg:
437.cfi_startproc
438 push %r12
439.cfi_push %r12
440 push %r13
441.cfi_push %r13
442.Lneg_body:
443
444 xor $a0, $a0
445 xor $a1, $a1
446 xor $a2, $a2
447 xor $a3, $a3
448 xor $t4, $t4
449
450 sub 8*0($a_ptr), $a0
451 sbb 8*1($a_ptr), $a1
452 sbb 8*2($a_ptr), $a2
453 mov $a0, $t0
454 sbb 8*3($a_ptr), $a3
455 lea .Lpoly(%rip), $a_ptr
456 mov $a1, $t1
457 sbb \$0, $t4
458
459 add 8*0($a_ptr), $a0
460 mov $a2, $t2
461 adc 8*1($a_ptr), $a1
462 adc 8*2($a_ptr), $a2
463 mov $a3, $t3
464 adc 8*3($a_ptr), $a3
465 test $t4, $t4
466
467 cmovz $t0, $a0
468 cmovz $t1, $a1
469 mov $a0, 8*0($r_ptr)
470 cmovz $t2, $a2
471 mov $a1, 8*1($r_ptr)
472 cmovz $t3, $a3
473 mov $a2, 8*2($r_ptr)
474 mov $a3, 8*3($r_ptr)
475
476 mov 0(%rsp),%r13
477.cfi_restore %r13
478 mov 8(%rsp),%r12
479.cfi_restore %r12
480 lea 16(%rsp),%rsp
481.cfi_adjust_cfa_offset -16
482.Lneg_epilogue:
483 ret
484.cfi_endproc
485.size ecp_nistz256_neg,.-ecp_nistz256_neg
486___
487}
488{
489my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
490my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
491my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
492my ($poly1,$poly3)=($acc6,$acc7);
493
494$code.=<<___;
495################################################################################
496# void ecp_nistz256_ord_mul_mont(
497# uint64_t res[4],
498# uint64_t a[4],
499# uint64_t b[4]);
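# This is Montgomery multiplication modulo ord(p256): the result is
# a*b*2^-256 mod n with n = .Lord.  Each of the four reduction steps computes
# m = (low accumulator word) * .LordK mod 2^64, where .LordK = -n^-1 mod 2^64,
# and adds m*n to the accumulator; by construction the low word becomes zero,
# so the accumulator effectively shifts down by one word.  Four such steps
# plus at most one final subtraction of n bring the result back to four words.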
500
501.globl ecp_nistz256_ord_mul_mont
502.type ecp_nistz256_ord_mul_mont,\@function,3
503.align 32
504ecp_nistz256_ord_mul_mont:
505.cfi_startproc
506___
507$code.=<<___ if ($addx);
508 mov \$0x80100, %ecx
509 and OPENSSL_ia32cap_P+8(%rip), %ecx
510 cmp \$0x80100, %ecx
511 je .Lecp_nistz256_ord_mul_montx
512___
513$code.=<<___;
514 push %rbp
515.cfi_push %rbp
516 push %rbx
517.cfi_push %rbx
518 push %r12
519.cfi_push %r12
520 push %r13
521.cfi_push %r13
522 push %r14
523.cfi_push %r14
524 push %r15
525.cfi_push %r15
526.Lord_mul_body:
527
528 mov 8*0($b_org), %rax
529 mov $b_org, $b_ptr
530 lea .Lord(%rip), %r14
531 mov .LordK(%rip), %r15
532
533 ################################# * b[0]
534 mov %rax, $t0
535 mulq 8*0($a_ptr)
536 mov %rax, $acc0
537 mov $t0, %rax
538 mov %rdx, $acc1
539
540 mulq 8*1($a_ptr)
541 add %rax, $acc1
542 mov $t0, %rax
543 adc \$0, %rdx
544 mov %rdx, $acc2
545
546 mulq 8*2($a_ptr)
547 add %rax, $acc2
548 mov $t0, %rax
549 adc \$0, %rdx
550
551 mov $acc0, $acc5
552 imulq %r15,$acc0
553
554 mov %rdx, $acc3
555 mulq 8*3($a_ptr)
556 add %rax, $acc3
557 mov $acc0, %rax
558 adc \$0, %rdx
559 mov %rdx, $acc4
560
561 ################################# First reduction step
562 mulq 8*0(%r14)
563 mov $acc0, $t1
564 add %rax, $acc5 # guaranteed to be zero
565 mov $acc0, %rax
566 adc \$0, %rdx
567 mov %rdx, $t0
568
569 sub $acc0, $acc2
570 sbb \$0, $acc0 # can't borrow
571
572 mulq 8*1(%r14)
573 add $t0, $acc1
574 adc \$0, %rdx
575 add %rax, $acc1
576 mov $t1, %rax
577 adc %rdx, $acc2
578 mov $t1, %rdx
579 adc \$0, $acc0 # can't overflow
580
581 shl \$32, %rax
582 shr \$32, %rdx
583 sub %rax, $acc3
584 mov 8*1($b_ptr), %rax
585 sbb %rdx, $t1 # can't borrow
586
587 add $acc0, $acc3
588 adc $t1, $acc4
589 adc \$0, $acc5
590
591 ################################# * b[1]
592 mov %rax, $t0
593 mulq 8*0($a_ptr)
594 add %rax, $acc1
595 mov $t0, %rax
596 adc \$0, %rdx
597 mov %rdx, $t1
598
599 mulq 8*1($a_ptr)
600 add $t1, $acc2
601 adc \$0, %rdx
602 add %rax, $acc2
603 mov $t0, %rax
604 adc \$0, %rdx
605 mov %rdx, $t1
606
607 mulq 8*2($a_ptr)
608 add $t1, $acc3
609 adc \$0, %rdx
610 add %rax, $acc3
611 mov $t0, %rax
612 adc \$0, %rdx
613
614 mov $acc1, $t0
615 imulq %r15, $acc1
616
617 mov %rdx, $t1
618 mulq 8*3($a_ptr)
619 add $t1, $acc4
620 adc \$0, %rdx
621 xor $acc0, $acc0
622 add %rax, $acc4
623 mov $acc1, %rax
624 adc %rdx, $acc5
625 adc \$0, $acc0
626
627 ################################# Second reduction step
628 mulq 8*0(%r14)
629 mov $acc1, $t1
630 add %rax, $t0 # guaranteed to be zero
631 mov $acc1, %rax
632 adc %rdx, $t0
633
634 sub $acc1, $acc3
635 sbb \$0, $acc1 # can't borrow
636
637 mulq 8*1(%r14)
638 add $t0, $acc2
639 adc \$0, %rdx
640 add %rax, $acc2
641 mov $t1, %rax
642 adc %rdx, $acc3
643 mov $t1, %rdx
644 adc \$0, $acc1 # can't overflow
645
646 shl \$32, %rax
647 shr \$32, %rdx
648 sub %rax, $acc4
649 mov 8*2($b_ptr), %rax
650 sbb %rdx, $t1 # can't borrow
651
652 add $acc1, $acc4
653 adc $t1, $acc5
654 adc \$0, $acc0
655
656 ################################## * b[2]
657 mov %rax, $t0
658 mulq 8*0($a_ptr)
659 add %rax, $acc2
660 mov $t0, %rax
661 adc \$0, %rdx
662 mov %rdx, $t1
663
664 mulq 8*1($a_ptr)
665 add $t1, $acc3
666 adc \$0, %rdx
667 add %rax, $acc3
668 mov $t0, %rax
669 adc \$0, %rdx
670 mov %rdx, $t1
671
672 mulq 8*2($a_ptr)
673 add $t1, $acc4
674 adc \$0, %rdx
675 add %rax, $acc4
676 mov $t0, %rax
677 adc \$0, %rdx
678
679 mov $acc2, $t0
680 imulq %r15, $acc2
681
682 mov %rdx, $t1
683 mulq 8*3($a_ptr)
684 add $t1, $acc5
685 adc \$0, %rdx
686 xor $acc1, $acc1
687 add %rax, $acc5
688 mov $acc2, %rax
689 adc %rdx, $acc0
690 adc \$0, $acc1
691
692 ################################# Third reduction step
693 mulq 8*0(%r14)
694 mov $acc2, $t1
695 add %rax, $t0 # guaranteed to be zero
696 mov $acc2, %rax
697 adc %rdx, $t0
698
699 sub $acc2, $acc4
700 sbb \$0, $acc2 # can't borrow
701
702 mulq 8*1(%r14)
703 add $t0, $acc3
704 adc \$0, %rdx
705 add %rax, $acc3
706 mov $t1, %rax
707 adc %rdx, $acc4
708 mov $t1, %rdx
709 adc \$0, $acc2 # can't overflow
710
711 shl \$32, %rax
712 shr \$32, %rdx
713 sub %rax, $acc5
714 mov 8*3($b_ptr), %rax
715 sbb %rdx, $t1 # can't borrow
716
717 add $acc2, $acc5
718 adc $t1, $acc0
719 adc \$0, $acc1
720
721 ################################# * b[3]
722 mov %rax, $t0
723 mulq 8*0($a_ptr)
724 add %rax, $acc3
725 mov $t0, %rax
726 adc \$0, %rdx
727 mov %rdx, $t1
728
729 mulq 8*1($a_ptr)
730 add $t1, $acc4
731 adc \$0, %rdx
732 add %rax, $acc4
733 mov $t0, %rax
734 adc \$0, %rdx
735 mov %rdx, $t1
736
737 mulq 8*2($a_ptr)
738 add $t1, $acc5
739 adc \$0, %rdx
740 add %rax, $acc5
741 mov $t0, %rax
742 adc \$0, %rdx
743
744 mov $acc3, $t0
745 imulq %r15, $acc3
746
747 mov %rdx, $t1
748 mulq 8*3($a_ptr)
749 add $t1, $acc0
750 adc \$0, %rdx
751 xor $acc2, $acc2
752 add %rax, $acc0
753 mov $acc3, %rax
754 adc %rdx, $acc1
755 adc \$0, $acc2
756
757 ################################# Last reduction step
758 mulq 8*0(%r14)
759 mov $acc3, $t1
760 add %rax, $t0 # guaranteed to be zero
761 mov $acc3, %rax
762 adc %rdx, $t0
763
764 sub $acc3, $acc5
765 sbb \$0, $acc3 # can't borrow
766
767 mulq 8*1(%r14)
768 add $t0, $acc4
769 adc \$0, %rdx
770 add %rax, $acc4
771 mov $t1, %rax
772 adc %rdx, $acc5
773 mov $t1, %rdx
774 adc \$0, $acc3 # can't overflow
775
776 shl \$32, %rax
777 shr \$32, %rdx
778 sub %rax, $acc0
779 sbb %rdx, $t1 # can't borrow
780
781 add $acc3, $acc0
782 adc $t1, $acc1
783 adc \$0, $acc2
784
785 ################################# Subtract ord
786 mov $acc4, $a_ptr
787 sub 8*0(%r14), $acc4
788 mov $acc5, $acc3
789 sbb 8*1(%r14), $acc5
790 mov $acc0, $t0
791 sbb 8*2(%r14), $acc0
792 mov $acc1, $t1
793 sbb 8*3(%r14), $acc1
794 sbb \$0, $acc2
795
796 cmovc $a_ptr, $acc4
797 cmovc $acc3, $acc5
798 cmovc $t0, $acc0
799 cmovc $t1, $acc1
800
801 mov $acc4, 8*0($r_ptr)
802 mov $acc5, 8*1($r_ptr)
803 mov $acc0, 8*2($r_ptr)
804 mov $acc1, 8*3($r_ptr)
805
806 mov 0(%rsp),%r15
807.cfi_restore %r15
808 mov 8(%rsp),%r14
809.cfi_restore %r14
810 mov 16(%rsp),%r13
811.cfi_restore %r13
812 mov 24(%rsp),%r12
813.cfi_restore %r12
814 mov 32(%rsp),%rbx
815.cfi_restore %rbx
816 mov 40(%rsp),%rbp
817.cfi_restore %rbp
818 lea 48(%rsp),%rsp
819.cfi_adjust_cfa_offset -48
820.Lord_mul_epilogue:
821 ret
822.cfi_endproc
823.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
824
825################################################################################
826# void ecp_nistz256_ord_sqr_mont(
827# uint64_t res[4],
828# uint64_t a[4],
829# int rep);
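# The rep argument repeats the Montgomery squaring in place, so the result is
# the input raised to 2^rep (still in the Montgomery domain).  Fixed squaring
# chains like this are typically what a constant-time inversion modulo
# ord(p256) is built from.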
830
831.globl ecp_nistz256_ord_sqr_mont
832.type ecp_nistz256_ord_sqr_mont,\@function,3
833.align 32
834ecp_nistz256_ord_sqr_mont:
835.cfi_startproc
836___
837$code.=<<___ if ($addx);
838 mov \$0x80100, %ecx
839 and OPENSSL_ia32cap_P+8(%rip), %ecx
840 cmp \$0x80100, %ecx
841 je .Lecp_nistz256_ord_sqr_montx
842___
843$code.=<<___;
844 push %rbp
845.cfi_push %rbp
846 push %rbx
847.cfi_push %rbx
848 push %r12
849.cfi_push %r12
850 push %r13
851.cfi_push %r13
852 push %r14
853.cfi_push %r14
854 push %r15
855.cfi_push %r15
856.Lord_sqr_body:
857
858 mov 8*0($a_ptr), $acc0
859 mov 8*1($a_ptr), %rax
860 mov 8*2($a_ptr), $acc6
861 mov 8*3($a_ptr), $acc7
862 lea .Lord(%rip), $a_ptr # pointer to modulus
863 mov $b_org, $b_ptr
864 jmp .Loop_ord_sqr
865
866.align 32
867.Loop_ord_sqr:
868 ################################# a[1:] * a[0]
869 mov %rax, $t1 # put aside a[1]
870 mul $acc0 # a[1] * a[0]
871 mov %rax, $acc1
872 movq $t1, %xmm1 # offload a[1]
873 mov $acc6, %rax
874 mov %rdx, $acc2
875
876 mul $acc0 # a[2] * a[0]
877 add %rax, $acc2
878 mov $acc7, %rax
879 movq $acc6, %xmm2 # offload a[2]
880 adc \$0, %rdx
881 mov %rdx, $acc3
882
883 mul $acc0 # a[3] * a[0]
884 add %rax, $acc3
885 mov $acc7, %rax
886 movq $acc7, %xmm3 # offload a[3]
887 adc \$0, %rdx
888 mov %rdx, $acc4
889
890 ################################# a[3] * a[2]
891 mul $acc6 # a[3] * a[2]
892 mov %rax, $acc5
893 mov $acc6, %rax
894 mov %rdx, $acc6
895
896 ################################# a[2:] * a[1]
897 mul $t1 # a[2] * a[1]
898 add %rax, $acc3
899 mov $acc7, %rax
900 adc \$0, %rdx
901 mov %rdx, $acc7
902
903 mul $t1 # a[3] * a[1]
904 add %rax, $acc4
905 adc \$0, %rdx
906
907 add $acc7, $acc4
908 adc %rdx, $acc5
909 adc \$0, $acc6 # can't overflow
910
911 ################################# *2
912 xor $acc7, $acc7
913 mov $acc0, %rax
914 add $acc1, $acc1
915 adc $acc2, $acc2
916 adc $acc3, $acc3
917 adc $acc4, $acc4
918 adc $acc5, $acc5
919 adc $acc6, $acc6
920 adc \$0, $acc7
921
922 ################################# Missing products
923 mul %rax # a[0] * a[0]
924 mov %rax, $acc0
925 movq %xmm1, %rax
926 mov %rdx, $t1
927
928 mul %rax # a[1] * a[1]
929 add $t1, $acc1
930 adc %rax, $acc2
931 movq %xmm2, %rax
932 adc \$0, %rdx
933 mov %rdx, $t1
934
935 mul %rax # a[2] * a[2]
936 add $t1, $acc3
937 adc %rax, $acc4
938 movq %xmm3, %rax
939 adc \$0, %rdx
940 mov %rdx, $t1
941
942 mov $acc0, $t0
943 imulq 8*4($a_ptr), $acc0 # *= .LordK
944
945 mul %rax # a[3] * a[3]
946 add $t1, $acc5
947 adc %rax, $acc6
948 mov 8*0($a_ptr), %rax # modulus[0]
949 adc %rdx, $acc7 # can't overflow
950
951 ################################# First reduction step
952 mul $acc0
953 mov $acc0, $t1
954 add %rax, $t0 # guaranteed to be zero
955 mov 8*1($a_ptr), %rax # modulus[1]
956 adc %rdx, $t0
957
958 sub $acc0, $acc2
959 sbb \$0, $t1 # can't borrow
960
961 mul $acc0
962 add $t0, $acc1
963 adc \$0, %rdx
964 add %rax, $acc1
965 mov $acc0, %rax
966 adc %rdx, $acc2
967 mov $acc0, %rdx
968 adc \$0, $t1 # can't overflow
969
970 mov $acc1, $t0
971 imulq 8*4($a_ptr), $acc1 # *= .LordK
972
973 shl \$32, %rax
974 shr \$32, %rdx
975 sub %rax, $acc3
976 mov 8*0($a_ptr), %rax
977 sbb %rdx, $acc0 # can't borrow
978
979 add $t1, $acc3
980 adc \$0, $acc0 # can't overflow
981
982 ################################# Second reduction step
983 mul $acc1
984 mov $acc1, $t1
985 add %rax, $t0 # guaranteed to be zero
986 mov 8*1($a_ptr), %rax
987 adc %rdx, $t0
988
989 sub $acc1, $acc3
990 sbb \$0, $t1 # can't borrow
991
992 mul $acc1
993 add $t0, $acc2
994 adc \$0, %rdx
995 add %rax, $acc2
996 mov $acc1, %rax
997 adc %rdx, $acc3
998 mov $acc1, %rdx
999 adc \$0, $t1 # can't overflow
1000
1001 mov $acc2, $t0
1002 imulq 8*4($a_ptr), $acc2 # *= .LordK
1003
1004 shl \$32, %rax
1005 shr \$32, %rdx
1006 sub %rax, $acc0
1007 mov 8*0($a_ptr), %rax
1008 sbb %rdx, $acc1 # can't borrow
1009
1010 add $t1, $acc0
1011 adc \$0, $acc1 # can't overflow
1012
1013 ################################# Third reduction step
1014 mul $acc2
1015 mov $acc2, $t1
1016 add %rax, $t0 # guaranteed to be zero
1017 mov 8*1($a_ptr), %rax
1018 adc %rdx, $t0
1019
1020 sub $acc2, $acc0
1021 sbb \$0, $t1 # can't borrow
1022
1023 mul $acc2
1024 add $t0, $acc3
1025 adc \$0, %rdx
1026 add %rax, $acc3
1027 mov $acc2, %rax
1028 adc %rdx, $acc0
1029 mov $acc2, %rdx
1030 adc \$0, $t1 # can't overflow
1031
1032 mov $acc3, $t0
1033 imulq 8*4($a_ptr), $acc3 # *= .LordK
1034
1035 shl \$32, %rax
1036 shr \$32, %rdx
1037 sub %rax, $acc1
1038 mov 8*0($a_ptr), %rax
1039 sbb %rdx, $acc2 # can't borrow
1040
1041 add $t1, $acc1
1042 adc \$0, $acc2 # can't overflow
1043
1044 ################################# Last reduction step
1045 mul $acc3
1046 mov $acc3, $t1
1047 add %rax, $t0 # guaranteed to be zero
1048 mov 8*1($a_ptr), %rax
1049 adc %rdx, $t0
1050
1051 sub $acc3, $acc1
1052 sbb \$0, $t1 # can't borrow
1053
1054 mul $acc3
1055 add $t0, $acc0
1056 adc \$0, %rdx
1057 add %rax, $acc0
1058 mov $acc3, %rax
1059 adc %rdx, $acc1
1060 mov $acc3, %rdx
1061 adc \$0, $t1 # can't overflow
1062
1063 shl \$32, %rax
1064 shr \$32, %rdx
1065 sub %rax, $acc2
1066 sbb %rdx, $acc3 # can't borrow
1067
1068 add $t1, $acc2
1069 adc \$0, $acc3 # can't overflow
1070
1071 ################################# Add bits [511:256] of the sqr result
1072 xor %rdx, %rdx
1073 add $acc4, $acc0
1074 adc $acc5, $acc1
1075 mov $acc0, $acc4
1076 adc $acc6, $acc2
1077 adc $acc7, $acc3
1078 mov $acc1, %rax
1079 adc \$0, %rdx
1080
1081 ################################# Compare to modulus
1082 sub 8*0($a_ptr), $acc0
1083 mov $acc2, $acc6
1084 sbb 8*1($a_ptr), $acc1
1085 sbb 8*2($a_ptr), $acc2
1086 mov $acc3, $acc7
1087 sbb 8*3($a_ptr), $acc3
1088 sbb \$0, %rdx
1089
1090 cmovc $acc4, $acc0
1091 cmovnc $acc1, %rax
1092 cmovnc $acc2, $acc6
1093 cmovnc $acc3, $acc7
1094
1095 dec $b_ptr
1096 jnz .Loop_ord_sqr
1097
1098 mov $acc0, 8*0($r_ptr)
1099 mov %rax, 8*1($r_ptr)
1100 pxor %xmm1, %xmm1
1101 mov $acc6, 8*2($r_ptr)
1102 pxor %xmm2, %xmm2
1103 mov $acc7, 8*3($r_ptr)
1104 pxor %xmm3, %xmm3
1105
1106 mov 0(%rsp),%r15
1107.cfi_restore %r15
1108 mov 8(%rsp),%r14
1109.cfi_restore %r14
1110 mov 16(%rsp),%r13
1111.cfi_restore %r13
1112 mov 24(%rsp),%r12
1113.cfi_restore %r12
1114 mov 32(%rsp),%rbx
1115.cfi_restore %rbx
1116 mov 40(%rsp),%rbp
1117.cfi_restore %rbp
1118 lea 48(%rsp),%rsp
1119.cfi_adjust_cfa_offset -48
1120.Lord_sqr_epilogue:
1121 ret
1122.cfi_endproc
1123.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1124___
1125
1126$code.=<<___ if ($addx);
1127################################################################################
1128.type ecp_nistz256_ord_mul_montx,\@function,3
1129.align 32
1130ecp_nistz256_ord_mul_montx:
1131.cfi_startproc
1132.Lecp_nistz256_ord_mul_montx:
1133 push %rbp
1134.cfi_push %rbp
1135 push %rbx
1136.cfi_push %rbx
1137 push %r12
1138.cfi_push %r12
1139 push %r13
1140.cfi_push %r13
1141 push %r14
1142.cfi_push %r14
1143 push %r15
1144.cfi_push %r15
1145.Lord_mulx_body:
1146
1147 mov $b_org, $b_ptr
1148 mov 8*0($b_org), %rdx
1149 mov 8*0($a_ptr), $acc1
1150 mov 8*1($a_ptr), $acc2
1151 mov 8*2($a_ptr), $acc3
1152 mov 8*3($a_ptr), $acc4
1153 lea -128($a_ptr), $a_ptr # control u-op density
1154 lea .Lord-128(%rip), %r14
1155 mov .LordK(%rip), %r15
1156
1157 ################################# Multiply by b[0]
1158 mulx $acc1, $acc0, $acc1
1159 mulx $acc2, $t0, $acc2
1160 mulx $acc3, $t1, $acc3
1161 add $t0, $acc1
1162 mulx $acc4, $t0, $acc4
1163 mov $acc0, %rdx
1164 mulx %r15, %rdx, %rax
1165 adc $t1, $acc2
1166 adc $t0, $acc3
1167 adc \$0, $acc4
1168
1169 ################################# reduction
1170 xor $acc5, $acc5 # $acc5=0, cf=0, of=0
1171 mulx 8*0+128(%r14), $t0, $t1
1172 adcx $t0, $acc0 # guaranteed to be zero
1173 adox $t1, $acc1
1174
1175 mulx 8*1+128(%r14), $t0, $t1
1176 adcx $t0, $acc1
1177 adox $t1, $acc2
1178
1179 mulx 8*2+128(%r14), $t0, $t1
1180 adcx $t0, $acc2
1181 adox $t1, $acc3
1182
1183 mulx 8*3+128(%r14), $t0, $t1
1184 mov 8*1($b_ptr), %rdx
1185 adcx $t0, $acc3
1186 adox $t1, $acc4
1187 adcx $acc0, $acc4
1188 adox $acc0, $acc5
1189 adc \$0, $acc5 # cf=0, of=0
1190
1191 ################################# Multiply by b[1]
1192 mulx 8*0+128($a_ptr), $t0, $t1
1193 adcx $t0, $acc1
1194 adox $t1, $acc2
1195
1196 mulx 8*1+128($a_ptr), $t0, $t1
1197 adcx $t0, $acc2
1198 adox $t1, $acc3
1199
1200 mulx 8*2+128($a_ptr), $t0, $t1
1201 adcx $t0, $acc3
1202 adox $t1, $acc4
1203
1204 mulx 8*3+128($a_ptr), $t0, $t1
1205 mov $acc1, %rdx
1206 mulx %r15, %rdx, %rax
1207 adcx $t0, $acc4
1208 adox $t1, $acc5
1209
1210 adcx $acc0, $acc5
1211 adox $acc0, $acc0
1212 adc \$0, $acc0 # cf=0, of=0
1213
1214 ################################# reduction
1215 mulx 8*0+128(%r14), $t0, $t1
1216 adcx $t0, $acc1 # guaranteed to be zero
1217 adox $t1, $acc2
1218
1219 mulx 8*1+128(%r14), $t0, $t1
1220 adcx $t0, $acc2
1221 adox $t1, $acc3
1222
1223 mulx 8*2+128(%r14), $t0, $t1
1224 adcx $t0, $acc3
1225 adox $t1, $acc4
1226
1227 mulx 8*3+128(%r14), $t0, $t1
1228 mov 8*2($b_ptr), %rdx
1229 adcx $t0, $acc4
1230 adox $t1, $acc5
1231 adcx $acc1, $acc5
1232 adox $acc1, $acc0
1233 adc \$0, $acc0 # cf=0, of=0
1234
1235 ################################# Multiply by b[2]
1236 mulx 8*0+128($a_ptr), $t0, $t1
1237 adcx $t0, $acc2
1238 adox $t1, $acc3
1239
1240 mulx 8*1+128($a_ptr), $t0, $t1
1241 adcx $t0, $acc3
1242 adox $t1, $acc4
1243
1244 mulx 8*2+128($a_ptr), $t0, $t1
1245 adcx $t0, $acc4
1246 adox $t1, $acc5
1247
1248 mulx 8*3+128($a_ptr), $t0, $t1
1249 mov $acc2, %rdx
1250 mulx %r15, %rdx, %rax
1251 adcx $t0, $acc5
1252 adox $t1, $acc0
1253
1254 adcx $acc1, $acc0
1255 adox $acc1, $acc1
1256 adc \$0, $acc1 # cf=0, of=0
1257
1258 ################################# reduction
1259 mulx 8*0+128(%r14), $t0, $t1
1260 adcx $t0, $acc2 # guaranteed to be zero
1261 adox $t1, $acc3
1262
1263 mulx 8*1+128(%r14), $t0, $t1
1264 adcx $t0, $acc3
1265 adox $t1, $acc4
1266
1267 mulx 8*2+128(%r14), $t0, $t1
1268 adcx $t0, $acc4
1269 adox $t1, $acc5
1270
1271 mulx 8*3+128(%r14), $t0, $t1
1272 mov 8*3($b_ptr), %rdx
1273 adcx $t0, $acc5
1274 adox $t1, $acc0
1275 adcx $acc2, $acc0
1276 adox $acc2, $acc1
1277 adc \$0, $acc1 # cf=0, of=0
1278
1279 ################################# Multiply by b[3]
1280 mulx 8*0+128($a_ptr), $t0, $t1
1281 adcx $t0, $acc3
1282 adox $t1, $acc4
1283
1284 mulx 8*1+128($a_ptr), $t0, $t1
1285 adcx $t0, $acc4
1286 adox $t1, $acc5
1287
1288 mulx 8*2+128($a_ptr), $t0, $t1
1289 adcx $t0, $acc5
1290 adox $t1, $acc0
1291
1292 mulx 8*3+128($a_ptr), $t0, $t1
1293 mov $acc3, %rdx
1294 mulx %r15, %rdx, %rax
1295 adcx $t0, $acc0
1296 adox $t1, $acc1
1297
1298 adcx $acc2, $acc1
1299 adox $acc2, $acc2
1300 adc \$0, $acc2 # cf=0, of=0
1301
1302 ################################# reduction
1303 mulx 8*0+128(%r14), $t0, $t1
1304 adcx $t0, $acc3 # guaranteed to be zero
1305 adox $t1, $acc4
1306
1307 mulx 8*1+128(%r14), $t0, $t1
1308 adcx $t0, $acc4
1309 adox $t1, $acc5
1310
1311 mulx 8*2+128(%r14), $t0, $t1
1312 adcx $t0, $acc5
1313 adox $t1, $acc0
1314
1315 mulx 8*3+128(%r14), $t0, $t1
1316 lea 128(%r14),%r14
1317 mov $acc4, $t2
1318 adcx $t0, $acc0
1319 adox $t1, $acc1
1320 mov $acc5, $t3
1321 adcx $acc3, $acc1
1322 adox $acc3, $acc2
1323 adc \$0, $acc2
1324
1325 #################################
1326 # Branch-less conditional subtraction of P
1327 mov $acc0, $t0
1328 sub 8*0(%r14), $acc4
1329 sbb 8*1(%r14), $acc5
1330 sbb 8*2(%r14), $acc0
1331 mov $acc1, $t1
1332 sbb 8*3(%r14), $acc1
1333 sbb \$0, $acc2
1334
1335 cmovc $t2, $acc4
1336 cmovc $t3, $acc5
1337 cmovc $t0, $acc0
1338 cmovc $t1, $acc1
1339
1340 mov $acc4, 8*0($r_ptr)
1341 mov $acc5, 8*1($r_ptr)
1342 mov $acc0, 8*2($r_ptr)
1343 mov $acc1, 8*3($r_ptr)
1344
1345 mov 0(%rsp),%r15
1346.cfi_restore %r15
1347 mov 8(%rsp),%r14
1348.cfi_restore %r14
1349 mov 16(%rsp),%r13
1350.cfi_restore %r13
1351 mov 24(%rsp),%r12
1352.cfi_restore %r12
1353 mov 32(%rsp),%rbx
1354.cfi_restore %rbx
1355 mov 40(%rsp),%rbp
1356.cfi_restore %rbp
1357 lea 48(%rsp),%rsp
1358.cfi_adjust_cfa_offset -48
1359.Lord_mulx_epilogue:
1360 ret
1361.cfi_endproc
1362.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
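# The *x code paths above and below rely on the ADX/BMI2 extensions: mulx
# returns the full 128-bit product without touching the flags, while adcx and
# adox each drive an independent carry chain (CF and OF respectively).  That
# lets the column additions of the multiplication and of the Montgomery
# reduction be interleaved without saving and restoring carry state.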
1363
1364.type ecp_nistz256_ord_sqr_montx,\@function,3
1365.align 32
1366ecp_nistz256_ord_sqr_montx:
1367.cfi_startproc
1368.Lecp_nistz256_ord_sqr_montx:
1369 push %rbp
1370.cfi_push %rbp
1371 push %rbx
1372.cfi_push %rbx
1373 push %r12
1374.cfi_push %r12
1375 push %r13
1376.cfi_push %r13
1377 push %r14
1378.cfi_push %r14
1379 push %r15
1380.cfi_push %r15
1381.Lord_sqrx_body:
1382
1383 mov $b_org, $b_ptr
1384 mov 8*0($a_ptr), %rdx
1385 mov 8*1($a_ptr), $acc6
1386 mov 8*2($a_ptr), $acc7
1387 mov 8*3($a_ptr), $acc0
1388 lea .Lord(%rip), $a_ptr
1389 jmp .Loop_ord_sqrx
1390
1391.align 32
1392.Loop_ord_sqrx:
1393 mulx $acc6, $acc1, $acc2 # a[0]*a[1]
1394 mulx $acc7, $t0, $acc3 # a[0]*a[2]
1395 mov %rdx, %rax # offload a[0]
1396 movq $acc6, %xmm1 # offload a[1]
1397 mulx $acc0, $t1, $acc4 # a[0]*a[3]
1398 mov $acc6, %rdx
1399 add $t0, $acc2
1400 movq $acc7, %xmm2 # offload a[2]
1401 adc $t1, $acc3
1402 adc \$0, $acc4
1403 xor $acc5, $acc5 # $acc5=0,cf=0,of=0
1404 #################################
1405 mulx $acc7, $t0, $t1 # a[1]*a[2]
1406 adcx $t0, $acc3
1407 adox $t1, $acc4
1408
1409 mulx $acc0, $t0, $t1 # a[1]*a[3]
1410 mov $acc7, %rdx
1411 adcx $t0, $acc4
1412 adox $t1, $acc5
1413 adc \$0, $acc5
1414 #################################
1415 mulx $acc0, $t0, $acc6 # a[2]*a[3]
1416 mov %rax, %rdx
1417 movq $acc0, %xmm3 # offload a[3]
1418 xor $acc7, $acc7 # $acc7=0,cf=0,of=0
1419 adcx $acc1, $acc1 # acc1:6<<1
1420 adox $t0, $acc5
1421 adcx $acc2, $acc2
1422 adox $acc7, $acc6 # of=0
1423
1424 ################################# a[i]*a[i]
1425 mulx %rdx, $acc0, $t1
1426 movq %xmm1, %rdx
1427 adcx $acc3, $acc3
1428 adox $t1, $acc1
1429 adcx $acc4, $acc4
1430 mulx %rdx, $t0, $t4
1431 movq %xmm2, %rdx
1432 adcx $acc5, $acc5
1433 adox $t0, $acc2
1434 adcx $acc6, $acc6
1435 mulx %rdx, $t0, $t1
1436 .byte 0x67
1437 movq %xmm3, %rdx
1438 adox $t4, $acc3
1439 adcx $acc7, $acc7
1440 adox $t0, $acc4
1441 adox $t1, $acc5
1442 mulx %rdx, $t0, $t4
1443 adox $t0, $acc6
1444 adox $t4, $acc7
1445
1446 ################################# reduction
1447 mov $acc0, %rdx
1448 mulx 8*4($a_ptr), %rdx, $t0
1449
1450 xor %rax, %rax # cf=0, of=0
1451 mulx 8*0($a_ptr), $t0, $t1
1452 adcx $t0, $acc0 # guaranteed to be zero
1453 adox $t1, $acc1
1454 mulx 8*1($a_ptr), $t0, $t1
1455 adcx $t0, $acc1
1456 adox $t1, $acc2
1457 mulx 8*2($a_ptr), $t0, $t1
1458 adcx $t0, $acc2
1459 adox $t1, $acc3
1460 mulx 8*3($a_ptr), $t0, $t1
1461 adcx $t0, $acc3
1462 adox $t1, $acc0 # of=0
1463 adcx %rax, $acc0 # cf=0
1464
1465 #################################
1466 mov $acc1, %rdx
1467 mulx 8*4($a_ptr), %rdx, $t0
1468
1469 mulx 8*0($a_ptr), $t0, $t1
1470 adox $t0, $acc1 # guaranteed to be zero
1471 adcx $t1, $acc2
1472 mulx 8*1($a_ptr), $t0, $t1
1473 adox $t0, $acc2
1474 adcx $t1, $acc3
1475 mulx 8*2($a_ptr), $t0, $t1
1476 adox $t0, $acc3
1477 adcx $t1, $acc0
1478 mulx 8*3($a_ptr), $t0, $t1
1479 adox $t0, $acc0
1480 adcx $t1, $acc1 # cf=0
1481 adox %rax, $acc1 # of=0
1482
1483 #################################
1484 mov $acc2, %rdx
1485 mulx 8*4($a_ptr), %rdx, $t0
1486
1487 mulx 8*0($a_ptr), $t0, $t1
1488 adcx $t0, $acc2 # guaranteed to be zero
1489 adox $t1, $acc3
1490 mulx 8*1($a_ptr), $t0, $t1
1491 adcx $t0, $acc3
1492 adox $t1, $acc0
1493 mulx 8*2($a_ptr), $t0, $t1
1494 adcx $t0, $acc0
1495 adox $t1, $acc1
1496 mulx 8*3($a_ptr), $t0, $t1
1497 adcx $t0, $acc1
1498 adox $t1, $acc2 # of=0
1499 adcx %rax, $acc2 # cf=0
1500
1501 #################################
1502 mov $acc3, %rdx
1503 mulx 8*4($a_ptr), %rdx, $t0
1504
1505 mulx 8*0($a_ptr), $t0, $t1
1506 adox $t0, $acc3 # guaranteed to be zero
1507 adcx $t1, $acc0
1508 mulx 8*1($a_ptr), $t0, $t1
1509 adox $t0, $acc0
1510 adcx $t1, $acc1
1511 mulx 8*2($a_ptr), $t0, $t1
1512 adox $t0, $acc1
1513 adcx $t1, $acc2
1514 mulx 8*3($a_ptr), $t0, $t1
1515 adox $t0, $acc2
1516 adcx $t1, $acc3
1517 adox %rax, $acc3
1518
1519 ################################# accumulate upper half
1520 add $acc0, $acc4 # add $acc4, $acc0
1521 adc $acc5, $acc1
1522 mov $acc4, %rdx
1523 adc $acc6, $acc2
1524 adc $acc7, $acc3
1525 mov $acc1, $acc6
1526 adc \$0, %rax
1527
1528 ################################# compare to modulus
1529 sub 8*0($a_ptr), $acc4
1530 mov $acc2, $acc7
1531 sbb 8*1($a_ptr), $acc1
1532 sbb 8*2($a_ptr), $acc2
1533 mov $acc3, $acc0
1534 sbb 8*3($a_ptr), $acc3
1535 sbb \$0, %rax
1536
1537 cmovnc $acc4, %rdx
1538 cmovnc $acc1, $acc6
1539 cmovnc $acc2, $acc7
1540 cmovnc $acc3, $acc0
1541
1542 dec $b_ptr
1543 jnz .Loop_ord_sqrx
1544
1545 mov %rdx, 8*0($r_ptr)
1546 mov $acc6, 8*1($r_ptr)
1547 pxor %xmm1, %xmm1
1548 mov $acc7, 8*2($r_ptr)
1549 pxor %xmm2, %xmm2
1550 mov $acc0, 8*3($r_ptr)
1551 pxor %xmm3, %xmm3
1552
1553 mov 0(%rsp),%r15
1554.cfi_restore %r15
1555 mov 8(%rsp),%r14
1556.cfi_restore %r14
1557 mov 16(%rsp),%r13
1558.cfi_restore %r13
1559 mov 24(%rsp),%r12
1560.cfi_restore %r12
1561 mov 32(%rsp),%rbx
1562.cfi_restore %rbx
1563 mov 40(%rsp),%rbp
1564.cfi_restore %rbp
1565 lea 48(%rsp),%rsp
1566.cfi_adjust_cfa_offset -48
1567.Lord_sqrx_epilogue:
1568 ret
1569.cfi_endproc
1570.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
1571___
1572
1573$code.=<<___;
1574################################################################################
1575# void ecp_nistz256_to_mont(
1576# uint64_t res[4],
1577# uint64_t in[4]);
1578.globl ecp_nistz256_to_mont
1579.type ecp_nistz256_to_mont,\@function,2
1580.align 32
1581ecp_nistz256_to_mont:
1582.cfi_startproc
1583___
1584$code.=<<___ if ($addx);
1585 mov \$0x80100, %ecx
1586 and OPENSSL_ia32cap_P+8(%rip), %ecx
1587___
1588$code.=<<___;
1589 lea .LRR(%rip), $b_org
1590 jmp .Lmul_mont
1591.cfi_endproc
1592.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
1593
1594################################################################################
1595# void ecp_nistz256_mul_mont(
1596# uint64_t res[4],
1597# uint64_t a[4],
1598# uint64_t b[4]);
1599
1600.globl ecp_nistz256_mul_mont
1601.type ecp_nistz256_mul_mont,\@function,3
1602.align 32
1603ecp_nistz256_mul_mont:
1604.cfi_startproc
1605___
1606$code.=<<___ if ($addx);
1607 mov \$0x80100, %ecx
1608 and OPENSSL_ia32cap_P+8(%rip), %ecx
1609___
1610$code.=<<___;
1611.Lmul_mont:
1612 push %rbp
1613.cfi_push %rbp
1614 push %rbx
1615.cfi_push %rbx
1616 push %r12
1617.cfi_push %r12
1618 push %r13
1619.cfi_push %r13
1620 push %r14
1621.cfi_push %r14
1622 push %r15
1623.cfi_push %r15
1624.Lmul_body:
1625___
1626$code.=<<___ if ($addx);
1627 cmp \$0x80100, %ecx
1628 je .Lmul_montx
1629___
1630$code.=<<___;
1631 mov $b_org, $b_ptr
1632 mov 8*0($b_org), %rax
1633 mov 8*0($a_ptr), $acc1
1634 mov 8*1($a_ptr), $acc2
1635 mov 8*2($a_ptr), $acc3
1636 mov 8*3($a_ptr), $acc4
1637
1638 call __ecp_nistz256_mul_montq
1639___
1640$code.=<<___ if ($addx);
1641 jmp .Lmul_mont_done
1642
1643.align 32
1644.Lmul_montx:
1645 mov $b_org, $b_ptr
1646 mov 8*0($b_org), %rdx
1647 mov 8*0($a_ptr), $acc1
1648 mov 8*1($a_ptr), $acc2
1649 mov 8*2($a_ptr), $acc3
1650 mov 8*3($a_ptr), $acc4
1651 lea -128($a_ptr), $a_ptr # control u-op density
1652
1653 call __ecp_nistz256_mul_montx
1654___
1655$code.=<<___;
1656.Lmul_mont_done:
1657 mov 0(%rsp),%r15
1658.cfi_restore %r15
1659 mov 8(%rsp),%r14
1660.cfi_restore %r14
1661 mov 16(%rsp),%r13
1662.cfi_restore %r13
1663 mov 24(%rsp),%r12
1664.cfi_restore %r12
1665 mov 32(%rsp),%rbx
1666.cfi_restore %rbx
1667 mov 40(%rsp),%rbp
1668.cfi_restore %rbp
1669 lea 48(%rsp),%rsp
1670.cfi_adjust_cfa_offset -48
1671.Lmul_epilogue:
1672 ret
1673.cfi_endproc
1674.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
1675
1676.type __ecp_nistz256_mul_montq,\@abi-omnipotent
1677.align 32
1678__ecp_nistz256_mul_montq:
1679.cfi_startproc
1680 ########################################################################
1681 # Multiply a by b[0]
1682 mov %rax, $t1
1683 mulq $acc1
1684 mov .Lpoly+8*1(%rip),$poly1
1685 mov %rax, $acc0
1686 mov $t1, %rax
1687 mov %rdx, $acc1
1688
1689 mulq $acc2
1690 mov .Lpoly+8*3(%rip),$poly3
1691 add %rax, $acc1
1692 mov $t1, %rax
1693 adc \$0, %rdx
1694 mov %rdx, $acc2
1695
1696 mulq $acc3
1697 add %rax, $acc2
1698 mov $t1, %rax
1699 adc \$0, %rdx
1700 mov %rdx, $acc3
1701
1702 mulq $acc4
1703 add %rax, $acc3
1704 mov $acc0, %rax
1705 adc \$0, %rdx
1706 xor $acc5, $acc5
1707 mov %rdx, $acc4
1708
1709 ########################################################################
1710 # First reduction step
1711 # Basically now we want to multiply acc[0] by p256,
1712 # and add the result to the acc.
1713 # Due to the special form of p256 we do some optimizations
1714 #
1715 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
1716 # then we add acc[0] and get acc[0] x 2^96
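	#
	# In more detail: p[0] = 2^64-1 and p[1] = 2^32-1, so the low 128 bits
	# of p are 2^96-1 and acc[0]*(p[1]:p[0]) = acc[0]*2^96 - acc[0].  The
	# subtracted acc[0] cancels the low word already in the accumulator
	# (which is also why the Montgomery multiplier is simply acc[0]:
	# p = -1 mod 2^64, hence -p^-1 = 1 mod 2^64).  What remains to add is
	# acc[0]*2^96 (the shl/shr-by-32 pair below) plus acc[0]*p[3] at word
	# 3 (the single mulq), p[2] being zero.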
1717
1718 mov $acc0, $t1
1719 shl \$32, $acc0
1720 mulq $poly3
1721 shr \$32, $t1
1722 add $acc0, $acc1 # +=acc[0]<<96
1723 adc $t1, $acc2
1724 adc %rax, $acc3
1725 mov 8*1($b_ptr), %rax
1726 adc %rdx, $acc4
1727 adc \$0, $acc5
1728 xor $acc0, $acc0
1729
1730 ########################################################################
1731 # Multiply by b[1]
1732 mov %rax, $t1
1733 mulq 8*0($a_ptr)
1734 add %rax, $acc1
1735 mov $t1, %rax
1736 adc \$0, %rdx
1737 mov %rdx, $t0
1738
1739 mulq 8*1($a_ptr)
1740 add $t0, $acc2
1741 adc \$0, %rdx
1742 add %rax, $acc2
1743 mov $t1, %rax
1744 adc \$0, %rdx
1745 mov %rdx, $t0
1746
1747 mulq 8*2($a_ptr)
1748 add $t0, $acc3
1749 adc \$0, %rdx
1750 add %rax, $acc3
1751 mov $t1, %rax
1752 adc \$0, %rdx
1753 mov %rdx, $t0
1754
1755 mulq 8*3($a_ptr)
1756 add $t0, $acc4
1757 adc \$0, %rdx
1758 add %rax, $acc4
1759 mov $acc1, %rax
1760 adc %rdx, $acc5
1761 adc \$0, $acc0
1762
1763 ########################################################################
1764 # Second reduction step
1765 mov $acc1, $t1
1766 shl \$32, $acc1
1767 mulq $poly3
1768 shr \$32, $t1
1769 add $acc1, $acc2
1770 adc $t1, $acc3
1771 adc %rax, $acc4
1772 mov 8*2($b_ptr), %rax
1773 adc %rdx, $acc5
1774 adc \$0, $acc0
1775 xor $acc1, $acc1
1776
1777 ########################################################################
1778 # Multiply by b[2]
1779 mov %rax, $t1
1780 mulq 8*0($a_ptr)
1781 add %rax, $acc2
1782 mov $t1, %rax
1783 adc \$0, %rdx
1784 mov %rdx, $t0
1785
1786 mulq 8*1($a_ptr)
1787 add $t0, $acc3
1788 adc \$0, %rdx
1789 add %rax, $acc3
1790 mov $t1, %rax
1791 adc \$0, %rdx
1792 mov %rdx, $t0
1793
1794 mulq 8*2($a_ptr)
1795 add $t0, $acc4
1796 adc \$0, %rdx
1797 add %rax, $acc4
1798 mov $t1, %rax
1799 adc \$0, %rdx
1800 mov %rdx, $t0
1801
1802 mulq 8*3($a_ptr)
1803 add $t0, $acc5
1804 adc \$0, %rdx
1805 add %rax, $acc5
1806 mov $acc2, %rax
1807 adc %rdx, $acc0
1808 adc \$0, $acc1
1809
1810 ########################################################################
1811 # Third reduction step
1812 mov $acc2, $t1
1813 shl \$32, $acc2
1814 mulq $poly3
1815 shr \$32, $t1
1816 add $acc2, $acc3
1817 adc $t1, $acc4
1818 adc %rax, $acc5
1819 mov 8*3($b_ptr), %rax
1820 adc %rdx, $acc0
1821 adc \$0, $acc1
1822 xor $acc2, $acc2
1823
1824 ########################################################################
1825 # Multiply by b[3]
1826 mov %rax, $t1
1827 mulq 8*0($a_ptr)
1828 add %rax, $acc3
1829 mov $t1, %rax
1830 adc \$0, %rdx
1831 mov %rdx, $t0
1832
1833 mulq 8*1($a_ptr)
1834 add $t0, $acc4
1835 adc \$0, %rdx
1836 add %rax, $acc4
1837 mov $t1, %rax
1838 adc \$0, %rdx
1839 mov %rdx, $t0
1840
1841 mulq 8*2($a_ptr)
1842 add $t0, $acc5
1843 adc \$0, %rdx
1844 add %rax, $acc5
1845 mov $t1, %rax
1846 adc \$0, %rdx
1847 mov %rdx, $t0
1848
1849 mulq 8*3($a_ptr)
1850 add $t0, $acc0
1851 adc \$0, %rdx
1852 add %rax, $acc0
1853 mov $acc3, %rax
1854 adc %rdx, $acc1
1855 adc \$0, $acc2
1856
1857 ########################################################################
1858 # Final reduction step
1859 mov $acc3, $t1
1860 shl \$32, $acc3
1861 mulq $poly3
1862 shr \$32, $t1
1863 add $acc3, $acc4
1864 adc $t1, $acc5
1865 mov $acc4, $t0
1866 adc %rax, $acc0
1867 adc %rdx, $acc1
1868 mov $acc5, $t1
1869 adc \$0, $acc2
1870
1871 ########################################################################
1872 # Branch-less conditional subtraction of P
1873 sub \$-1, $acc4 # .Lpoly[0]
1874 mov $acc0, $t2
1875 sbb $poly1, $acc5 # .Lpoly[1]
1876 sbb \$0, $acc0 # .Lpoly[2]
1877 mov $acc1, $t3
1878 sbb $poly3, $acc1 # .Lpoly[3]
1879 sbb \$0, $acc2
1880
1881 cmovc $t0, $acc4
1882 cmovc $t1, $acc5
1883 mov $acc4, 8*0($r_ptr)
1884 cmovc $t2, $acc0
1885 mov $acc5, 8*1($r_ptr)
1886 cmovc $t3, $acc1
1887 mov $acc0, 8*2($r_ptr)
1888 mov $acc1, 8*3($r_ptr)
1889
1890 ret
1891.cfi_endproc
1892.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
1893
1894################################################################################
1895# void ecp_nistz256_sqr_mont(
1896# uint64_t res[4],
1897# uint64_t a[4]);
1898
1899# we optimize the square according to S.Gueron and V.Krasnov,
1900# "Speeding up Big-Number Squaring"
1901.globl ecp_nistz256_sqr_mont
1902.type ecp_nistz256_sqr_mont,\@function,2
1903.align 32
1904ecp_nistz256_sqr_mont:
1905.cfi_startproc
1906___
1907$code.=<<___ if ($addx);
1908 mov \$0x80100, %ecx
1909 and OPENSSL_ia32cap_P+8(%rip), %ecx
1910___
1911$code.=<<___;
1912 push %rbp
1913.cfi_push %rbp
1914 push %rbx
1915.cfi_push %rbx
1916 push %r12
1917.cfi_push %r12
1918 push %r13
1919.cfi_push %r13
1920 push %r14
1921.cfi_push %r14
1922 push %r15
1923.cfi_push %r15
1924.Lsqr_body:
1925___
1926$code.=<<___ if ($addx);
1927 cmp \$0x80100, %ecx
1928 je .Lsqr_montx
1929___
1930$code.=<<___;
1931 mov 8*0($a_ptr), %rax
1932 mov 8*1($a_ptr), $acc6
1933 mov 8*2($a_ptr), $acc7
1934 mov 8*3($a_ptr), $acc0
1935
1936 call __ecp_nistz256_sqr_montq
1937___
1938$code.=<<___ if ($addx);
1939 jmp .Lsqr_mont_done
1940
1941.align 32
1942.Lsqr_montx:
1943 mov 8*0($a_ptr), %rdx
1944 mov 8*1($a_ptr), $acc6
1945 mov 8*2($a_ptr), $acc7
1946 mov 8*3($a_ptr), $acc0
1947 lea -128($a_ptr), $a_ptr # control u-op density
1948
1949 call __ecp_nistz256_sqr_montx
1950___
1951$code.=<<___;
1952.Lsqr_mont_done:
1953 mov 0(%rsp),%r15
1954.cfi_restore %r15
1955 mov 8(%rsp),%r14
1956.cfi_restore %r14
1957 mov 16(%rsp),%r13
1958.cfi_restore %r13
1959 mov 24(%rsp),%r12
1960.cfi_restore %r12
1961 mov 32(%rsp),%rbx
1962.cfi_restore %rbx
1963 mov 40(%rsp),%rbp
1964.cfi_restore %rbp
1965 lea 48(%rsp),%rsp
1966.cfi_adjust_cfa_offset -48
1967.Lsqr_epilogue:
1968 ret
1969.cfi_endproc
1970.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
1971
1972.type __ecp_nistz256_sqr_montq,\@abi-omnipotent
1973.align 32
1974__ecp_nistz256_sqr_montq:
1975.cfi_startproc
1976 mov %rax, $acc5
1977 mulq $acc6 # a[1]*a[0]
1978 mov %rax, $acc1
1979 mov $acc7, %rax
1980 mov %rdx, $acc2
1981
1982 mulq $acc5 # a[0]*a[2]
1983 add %rax, $acc2
1984 mov $acc0, %rax
1985 adc \$0, %rdx
1986 mov %rdx, $acc3
1987
1988 mulq $acc5 # a[0]*a[3]
1989 add %rax, $acc3
1990 mov $acc7, %rax
1991 adc \$0, %rdx
1992 mov %rdx, $acc4
1993
1994 #################################
1995 mulq $acc6 # a[1]*a[2]
1996 add %rax, $acc3
1997 mov $acc0, %rax
1998 adc \$0, %rdx
1999 mov %rdx, $t1
2000
2001 mulq $acc6 # a[1]*a[3]
2002 add %rax, $acc4
2003 mov $acc0, %rax
2004 adc \$0, %rdx
2005 add $t1, $acc4
2006 mov %rdx, $acc5
2007 adc \$0, $acc5
2008
2009 #################################
2010 mulq $acc7 # a[2]*a[3]
2011 xor $acc7, $acc7
2012 add %rax, $acc5
2013 mov 8*0($a_ptr), %rax
2014 mov %rdx, $acc6
2015 adc \$0, $acc6
2016
2017 add $acc1, $acc1 # acc1:6<<1
2018 adc $acc2, $acc2
2019 adc $acc3, $acc3
2020 adc $acc4, $acc4
2021 adc $acc5, $acc5
2022 adc $acc6, $acc6
2023 adc \$0, $acc7
2024
2025 mulq %rax
2026 mov %rax, $acc0
2027 mov 8*1($a_ptr), %rax
2028 mov %rdx, $t0
2029
2030 mulq %rax
2031 add $t0, $acc1
2032 adc %rax, $acc2
2033 mov 8*2($a_ptr), %rax
2034 adc \$0, %rdx
2035 mov %rdx, $t0
2036
2037 mulq %rax
2038 add $t0, $acc3
2039 adc %rax, $acc4
2040 mov 8*3($a_ptr), %rax
2041 adc \$0, %rdx
2042 mov %rdx, $t0
2043
2044 mulq %rax
2045 add $t0, $acc5
2046 adc %rax, $acc6
2047 mov $acc0, %rax
2048 adc %rdx, $acc7
2049
2050 mov .Lpoly+8*1(%rip), $a_ptr
2051 mov .Lpoly+8*3(%rip), $t1
2052
2053 ##########################################
2054 # Now the reduction
2055 # First iteration
2056 mov $acc0, $t0
2057 shl \$32, $acc0
2058 mulq $t1
2059 shr \$32, $t0
2060 add $acc0, $acc1 # +=acc[0]<<96
2061 adc $t0, $acc2
2062 adc %rax, $acc3
2063 mov $acc1, %rax
2064 adc \$0, %rdx
2065
2066 ##########################################
2067 # Second iteration
2068 mov $acc1, $t0
2069 shl \$32, $acc1
2070 mov %rdx, $acc0
2071 mulq $t1
2072 shr \$32, $t0
2073 add $acc1, $acc2
2074 adc $t0, $acc3
2075 adc %rax, $acc0
2076 mov $acc2, %rax
2077 adc \$0, %rdx
2078
2079 ##########################################
2080 # Third iteration
2081 mov $acc2, $t0
2082 shl \$32, $acc2
2083 mov %rdx, $acc1
2084 mulq $t1
2085 shr \$32, $t0
2086 add $acc2, $acc3
2087 adc $t0, $acc0
2088 adc %rax, $acc1
2089 mov $acc3, %rax
2090 adc \$0, %rdx
2091
2092 ###########################################
2093 # Last iteration
2094 mov $acc3, $t0
2095 shl \$32, $acc3
2096 mov %rdx, $acc2
2097 mulq $t1
2098 shr \$32, $t0
2099 add $acc3, $acc0
2100 adc $t0, $acc1
2101 adc %rax, $acc2
2102 adc \$0, %rdx
2103 xor $acc3, $acc3
2104
2105 ############################################
2106 # Add the rest of the acc
2107 add $acc0, $acc4
2108 adc $acc1, $acc5
2109 mov $acc4, $acc0
2110 adc $acc2, $acc6
2111 adc %rdx, $acc7
2112 mov $acc5, $acc1
2113 adc \$0, $acc3
2114
2115 sub \$-1, $acc4 # .Lpoly[0]
2116 mov $acc6, $acc2
2117 sbb $a_ptr, $acc5 # .Lpoly[1]
2118 sbb \$0, $acc6 # .Lpoly[2]
2119 mov $acc7, $t0
2120 sbb $t1, $acc7 # .Lpoly[3]
2121 sbb \$0, $acc3
2122
2123 cmovc $acc0, $acc4
2124 cmovc $acc1, $acc5
2125 mov $acc4, 8*0($r_ptr)
2126 cmovc $acc2, $acc6
2127 mov $acc5, 8*1($r_ptr)
2128 cmovc $t0, $acc7
2129 mov $acc6, 8*2($r_ptr)
2130 mov $acc7, 8*3($r_ptr)
2131
2132 ret
2133.cfi_endproc
2134.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
2135___
2136
2137if ($addx) {
2138$code.=<<___;
2139.type __ecp_nistz256_mul_montx,\@abi-omnipotent
2140.align 32
2141__ecp_nistz256_mul_montx:
2142.cfi_startproc
2143 ########################################################################
2144 # Multiply by b[0]
2145 mulx $acc1, $acc0, $acc1
2146 mulx $acc2, $t0, $acc2
2147 mov \$32, $poly1
2148 xor $acc5, $acc5 # cf=0
2149 mulx $acc3, $t1, $acc3
2150 mov .Lpoly+8*3(%rip), $poly3
2151 adc $t0, $acc1
2152 mulx $acc4, $t0, $acc4
2153 mov $acc0, %rdx
2154 adc $t1, $acc2
2155 shlx $poly1,$acc0,$t1
2156 adc $t0, $acc3
2157 shrx $poly1,$acc0,$t0
2158 adc \$0, $acc4
2159
2160 ########################################################################
2161 # First reduction step
2162 add $t1, $acc1
2163 adc $t0, $acc2
2164
2165 mulx $poly3, $t0, $t1
2166 mov 8*1($b_ptr), %rdx
2167 adc $t0, $acc3
2168 adc $t1, $acc4
2169 adc \$0, $acc5
2170 xor $acc0, $acc0 # $acc0=0,cf=0,of=0
2171
2172 ########################################################################
2173 # Multiply by b[1]
2174 mulx 8*0+128($a_ptr), $t0, $t1
2175 adcx $t0, $acc1
2176 adox $t1, $acc2
2177
2178 mulx 8*1+128($a_ptr), $t0, $t1
2179 adcx $t0, $acc2
2180 adox $t1, $acc3
2181
2182 mulx 8*2+128($a_ptr), $t0, $t1
2183 adcx $t0, $acc3
2184 adox $t1, $acc4
2185
2186 mulx 8*3+128($a_ptr), $t0, $t1
2187 mov $acc1, %rdx
2188 adcx $t0, $acc4
2189 shlx $poly1, $acc1, $t0
2190 adox $t1, $acc5
2191 shrx $poly1, $acc1, $t1
2192
2193 adcx $acc0, $acc5
2194 adox $acc0, $acc0
2195 adc \$0, $acc0
2196
2197 ########################################################################
2198 # Second reduction step
2199 add $t0, $acc2
2200 adc $t1, $acc3
2201
2202 mulx $poly3, $t0, $t1
2203 mov 8*2($b_ptr), %rdx
2204 adc $t0, $acc4
2205 adc $t1, $acc5
2206 adc \$0, $acc0
2207 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
2208
2209 ########################################################################
2210 # Multiply by b[2]
2211 mulx 8*0+128($a_ptr), $t0, $t1
2212 adcx $t0, $acc2
2213 adox $t1, $acc3
2214
2215 mulx 8*1+128($a_ptr), $t0, $t1
2216 adcx $t0, $acc3
2217 adox $t1, $acc4
2218
2219 mulx 8*2+128($a_ptr), $t0, $t1
2220 adcx $t0, $acc4
2221 adox $t1, $acc5
2222
2223 mulx 8*3+128($a_ptr), $t0, $t1
2224 mov $acc2, %rdx
2225 adcx $t0, $acc5
2226 shlx $poly1, $acc2, $t0
2227 adox $t1, $acc0
2228 shrx $poly1, $acc2, $t1
2229
2230 adcx $acc1, $acc0
2231 adox $acc1, $acc1
2232 adc \$0, $acc1
2233
2234 ########################################################################
2235 # Third reduction step
2236 add $t0, $acc3
2237 adc $t1, $acc4
2238
2239 mulx $poly3, $t0, $t1
2240 mov 8*3($b_ptr), %rdx
2241 adc $t0, $acc5
2242 adc $t1, $acc0
2243 adc \$0, $acc1
2244 xor $acc2, $acc2 # $acc2=0,cf=0,of=0
2245
2246 ########################################################################
2247 # Multiply by b[3]
2248 mulx 8*0+128($a_ptr), $t0, $t1
2249 adcx $t0, $acc3
2250 adox $t1, $acc4
2251
2252 mulx 8*1+128($a_ptr), $t0, $t1
2253 adcx $t0, $acc4
2254 adox $t1, $acc5
2255
2256 mulx 8*2+128($a_ptr), $t0, $t1
2257 adcx $t0, $acc5
2258 adox $t1, $acc0
2259
2260 mulx 8*3+128($a_ptr), $t0, $t1
2261 mov $acc3, %rdx
2262 adcx $t0, $acc0
2263 shlx $poly1, $acc3, $t0
2264 adox $t1, $acc1
2265 shrx $poly1, $acc3, $t1
2266
2267 adcx $acc2, $acc1
2268 adox $acc2, $acc2
2269 adc \$0, $acc2
2270
2271 ########################################################################
2272 # Fourth reduction step
2273 add $t0, $acc4
2274 adc $t1, $acc5
2275
2276 mulx $poly3, $t0, $t1
2277 mov $acc4, $t2
2278 mov .Lpoly+8*1(%rip), $poly1
2279 adc $t0, $acc0
2280 mov $acc5, $t3
2281 adc $t1, $acc1
2282 adc \$0, $acc2
2283
2284 ########################################################################
2285 # Branch-less conditional subtraction of P
2286 xor %eax, %eax
2287 mov $acc0, $t0
2288 sbb \$-1, $acc4 # .Lpoly[0]
2289 sbb $poly1, $acc5 # .Lpoly[1]
2290 sbb \$0, $acc0 # .Lpoly[2]
2291 mov $acc1, $t1
2292 sbb $poly3, $acc1 # .Lpoly[3]
2293 sbb \$0, $acc2
2294
2295 cmovc $t2, $acc4
2296 cmovc $t3, $acc5
2297 mov $acc4, 8*0($r_ptr)
2298 cmovc $t0, $acc0
2299 mov $acc5, 8*1($r_ptr)
2300 cmovc $t1, $acc1
2301 mov $acc0, 8*2($r_ptr)
2302 mov $acc1, 8*3($r_ptr)
2303
2304 ret
2305.cfi_endproc
2306.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
2307
2308.type __ecp_nistz256_sqr_montx,\@abi-omnipotent
2309.align 32
2310__ecp_nistz256_sqr_montx:
2311.cfi_startproc
2312 mulx $acc6, $acc1, $acc2 # a[0]*a[1]
2313 mulx $acc7, $t0, $acc3 # a[0]*a[2]
2314 xor %eax, %eax
2315 adc $t0, $acc2
2316 mulx $acc0, $t1, $acc4 # a[0]*a[3]
2317 mov $acc6, %rdx
2318 adc $t1, $acc3
2319 adc \$0, $acc4
2320 xor $acc5, $acc5 # $acc5=0,cf=0,of=0
2321
2322 #################################
2323 mulx $acc7, $t0, $t1 # a[1]*a[2]
2324 adcx $t0, $acc3
2325 adox $t1, $acc4
2326
2327 mulx $acc0, $t0, $t1 # a[1]*a[3]
2328 mov $acc7, %rdx
2329 adcx $t0, $acc4
2330 adox $t1, $acc5
2331 adc \$0, $acc5
2332
2333 #################################
2334 mulx $acc0, $t0, $acc6 # a[2]*a[3]
2335 mov 8*0+128($a_ptr), %rdx
2336 xor $acc7, $acc7 # $acc7=0,cf=0,of=0
2337 adcx $acc1, $acc1 # acc1:6<<1
2338 adox $t0, $acc5
2339 adcx $acc2, $acc2
2340 adox $acc7, $acc6 # of=0
2341
2342 mulx %rdx, $acc0, $t1
2343 mov 8*1+128($a_ptr), %rdx
2344 adcx $acc3, $acc3
2345 adox $t1, $acc1
2346 adcx $acc4, $acc4
2347 mulx %rdx, $t0, $t4
2348 mov 8*2+128($a_ptr), %rdx
2349 adcx $acc5, $acc5
2350 adox $t0, $acc2
2351 adcx $acc6, $acc6
2352 .byte 0x67
2353 mulx %rdx, $t0, $t1
2354 mov 8*3+128($a_ptr), %rdx
2355 adox $t4, $acc3
2356 adcx $acc7, $acc7
2357 adox $t0, $acc4
2358 mov \$32, $a_ptr
2359 adox $t1, $acc5
2360 .byte 0x67,0x67
2361 mulx %rdx, $t0, $t4
2362 mov .Lpoly+8*3(%rip), %rdx
2363 adox $t0, $acc6
2364 shlx $a_ptr, $acc0, $t0
2365 adox $t4, $acc7
2366 shrx $a_ptr, $acc0, $t4
2367 mov %rdx,$t1
2368
2369 # reduction step 1
2370 add $t0, $acc1
2371 adc $t4, $acc2
2372
2373 mulx $acc0, $t0, $acc0
2374 adc $t0, $acc3
2375 shlx $a_ptr, $acc1, $t0
2376 adc \$0, $acc0
2377 shrx $a_ptr, $acc1, $t4
2378
2379 # reduction step 2
2380 add $t0, $acc2
2381 adc $t4, $acc3
2382
2383 mulx $acc1, $t0, $acc1
2384 adc $t0, $acc0
2385 shlx $a_ptr, $acc2, $t0
2386 adc \$0, $acc1
2387 shrx $a_ptr, $acc2, $t4
2388
2389 # reduction step 3
2390 add $t0, $acc3
2391 adc $t4, $acc0
2392
2393 mulx $acc2, $t0, $acc2
2394 adc $t0, $acc1
2395 shlx $a_ptr, $acc3, $t0
2396 adc \$0, $acc2
2397 shrx $a_ptr, $acc3, $t4
2398
2399 # reduction step 4
2400 add $t0, $acc0
2401 adc $t4, $acc1
2402
2403 mulx $acc3, $t0, $acc3
2404 adc $t0, $acc2
2405 adc \$0, $acc3
2406
2407 xor $t3, $t3
2408 add $acc0, $acc4 # accumulate upper half
2409 mov .Lpoly+8*1(%rip), $a_ptr
2410 adc $acc1, $acc5
2411 mov $acc4, $acc0
2412 adc $acc2, $acc6
2413 adc $acc3, $acc7
2414 mov $acc5, $acc1
2415 adc \$0, $t3
2416
2417 sub \$-1, $acc4 # .Lpoly[0]
2418 mov $acc6, $acc2
2419 sbb $a_ptr, $acc5 # .Lpoly[1]
2420 sbb \$0, $acc6 # .Lpoly[2]
2421 mov $acc7, $acc3
2422 sbb $t1, $acc7 # .Lpoly[3]
2423 sbb \$0, $t3
2424
2425 cmovc $acc0, $acc4
2426 cmovc $acc1, $acc5
2427 mov $acc4, 8*0($r_ptr)
2428 cmovc $acc2, $acc6
2429 mov $acc5, 8*1($r_ptr)
2430 cmovc $acc3, $acc7
2431 mov $acc6, 8*2($r_ptr)
2432 mov $acc7, 8*3($r_ptr)
2433
2434 ret
2435.cfi_endproc
2436.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
2437___
2438}
2439}
2440{
2441my ($r_ptr,$in_ptr)=("%rdi","%rsi");
2442my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
2443my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
2444
2445$code.=<<___;
2446################################################################################
2447# void ecp_nistz256_from_mont(
2448# uint64_t res[4],
2449# uint64_t in[4]);
2450# This one performs Montgomery multiplication by 1, so we only need the reduction
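# Field elements are kept internally as x*R mod p with R = 2^256, so the
# conversion back is mont_mul(x*R, 1) = x.  Multiplying by 1 changes nothing,
# which leaves just the four word-wise reduction iterations (each folds the
# low word into the top, i.e. divides by 2^64 modulo p) followed by one
# conditional subtraction of p.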
2451
2452.globl ecp_nistz256_from_mont
2453.type ecp_nistz256_from_mont,\@function,2
2454.align 32
2455ecp_nistz256_from_mont:
2456.cfi_startproc
2457 push %r12
2458.cfi_push %r12
2459 push %r13
2460.cfi_push %r13
2461.Lfrom_body:
2462
2463 mov 8*0($in_ptr), %rax
2464 mov .Lpoly+8*3(%rip), $t2
2465 mov 8*1($in_ptr), $acc1
2466 mov 8*2($in_ptr), $acc2
2467 mov 8*3($in_ptr), $acc3
2468 mov %rax, $acc0
2469 mov .Lpoly+8*1(%rip), $t1
2470
2471 #########################################
2472 # First iteration
2473 mov %rax, $t0
2474 shl \$32, $acc0
2475 mulq $t2
2476 shr \$32, $t0
2477 add $acc0, $acc1
2478 adc $t0, $acc2
2479 adc %rax, $acc3
2480 mov $acc1, %rax
2481 adc \$0, %rdx
2482
2483 #########################################
2484 # Second iteration
2485 mov $acc1, $t0
2486 shl \$32, $acc1
2487 mov %rdx, $acc0
2488 mulq $t2
2489 shr \$32, $t0
2490 add $acc1, $acc2
2491 adc $t0, $acc3
2492 adc %rax, $acc0
2493 mov $acc2, %rax
2494 adc \$0, %rdx
2495
2496 ##########################################
2497 # Third iteration
2498 mov $acc2, $t0
2499 shl \$32, $acc2
2500 mov %rdx, $acc1
2501 mulq $t2
2502 shr \$32, $t0
2503 add $acc2, $acc3
2504 adc $t0, $acc0
2505 adc %rax, $acc1
2506 mov $acc3, %rax
2507 adc \$0, %rdx
2508
2509 ###########################################
2510 # Last iteration
2511 mov $acc3, $t0
2512 shl \$32, $acc3
2513 mov %rdx, $acc2
2514 mulq $t2
2515 shr \$32, $t0
2516 add $acc3, $acc0
2517 adc $t0, $acc1
2518 mov $acc0, $t0
2519 adc %rax, $acc2
2520 mov $acc1, $in_ptr
2521 adc \$0, %rdx
2522
2523 ###########################################
2524 # Branch-less conditional subtraction
2525 sub \$-1, $acc0
2526 mov $acc2, %rax
2527 sbb $t1, $acc1
2528 sbb \$0, $acc2
2529 mov %rdx, $acc3
2530 sbb $t2, %rdx
2531 sbb $t2, $t2
2532
2533 cmovnz $t0, $acc0
2534 cmovnz $in_ptr, $acc1
2535 mov $acc0, 8*0($r_ptr)
2536 cmovnz %rax, $acc2
2537 mov $acc1, 8*1($r_ptr)
2538 cmovz %rdx, $acc3
2539 mov $acc2, 8*2($r_ptr)
2540 mov $acc3, 8*3($r_ptr)
2541
2542 mov 0(%rsp),%r13
2543.cfi_restore %r13
2544 mov 8(%rsp),%r12
2545.cfi_restore %r12
2546 lea 16(%rsp),%rsp
2547.cfi_adjust_cfa_offset -16
2548.Lfrom_epilogue:
2549 ret
2550.cfi_endproc
2551.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
2552___
2553}
2554{
2555my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2556my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
2557my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
2558my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
2559
2560$code.=<<___;
2561################################################################################
2562# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
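# Each table entry is a 96-byte projective point (three 32-byte coordinates)
# and the offset computed below is (index-1)*96, which suggests the caller
# only ever scatters indices starting from 1 (index 0, the point at infinity,
# needs no table slot).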
2563.globl ecp_nistz256_scatter_w5
2564.type ecp_nistz256_scatter_w5,\@abi-omnipotent
2565.align 32
2566ecp_nistz256_scatter_w5:
2567.cfi_startproc
2568 lea -3($index,$index,2), $index
2569 movdqa 0x00($in_t), %xmm0
2570 shl \$5, $index
2571 movdqa 0x10($in_t), %xmm1
2572 movdqa 0x20($in_t), %xmm2
2573 movdqa 0x30($in_t), %xmm3
2574 movdqa 0x40($in_t), %xmm4
2575 movdqa 0x50($in_t), %xmm5
2576 movdqa %xmm0, 0x00($val,$index)
2577 movdqa %xmm1, 0x10($val,$index)
2578 movdqa %xmm2, 0x20($val,$index)
2579 movdqa %xmm3, 0x30($val,$index)
2580 movdqa %xmm4, 0x40($val,$index)
2581 movdqa %xmm5, 0x50($val,$index)
2582
2583 ret
2584.cfi_endproc
2585.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2586
2587################################################################################
2588# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
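# An informal sketch of the constant-time selection performed below (C-like
# pseudocode for illustration only; mask() is a hypothetical helper that
# returns all-ones when its argument is true):
#
#	r = 0;
#	for (i = 1; i <= 16; i++)
#		r |= table[i-1] & mask(i == index);
#
# Every entry is touched regardless of index, so the memory access pattern
# does not depend on the secret value; index 0 yields an all-zero result.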
2589.globl ecp_nistz256_gather_w5
2590.type ecp_nistz256_gather_w5,\@abi-omnipotent
2591.align 32
2592ecp_nistz256_gather_w5:
2593.cfi_startproc
2594___
2595$code.=<<___ if ($avx>1);
2596 mov OPENSSL_ia32cap_P+8(%rip), %eax
2597 test \$`1<<5`, %eax
2598 jnz .Lavx2_gather_w5
2599___
2600$code.=<<___ if ($win64);
2601 lea -0x88(%rsp), %rax
2602.LSEH_begin_ecp_nistz256_gather_w5:
2603 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
2604 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
2605 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
2606 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
2607 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
2608 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
2609 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
2610 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
2611 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
2612 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
2613 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
2614___
2615$code.=<<___;
2616 movdqa .LOne(%rip), $ONE
2617 movd $index, $INDEX
2618
2619 pxor $Ra, $Ra
2620 pxor $Rb, $Rb
2621 pxor $Rc, $Rc
2622 pxor $Rd, $Rd
2623 pxor $Re, $Re
2624 pxor $Rf, $Rf
2625
2626 movdqa $ONE, $M0
2627 pshufd \$0, $INDEX, $INDEX
2628
2629 mov \$16, %rax
2630.Lselect_loop_sse_w5:
2631
2632 movdqa $M0, $TMP0
2633 paddd $ONE, $M0
2634 pcmpeqd $INDEX, $TMP0
2635
2636 movdqa 16*0($in_t), $T0a
2637 movdqa 16*1($in_t), $T0b
2638 movdqa 16*2($in_t), $T0c
2639 movdqa 16*3($in_t), $T0d
2640 movdqa 16*4($in_t), $T0e
2641 movdqa 16*5($in_t), $T0f
2642 lea 16*6($in_t), $in_t
2643
2644 pand $TMP0, $T0a
2645 pand $TMP0, $T0b
2646 por $T0a, $Ra
2647 pand $TMP0, $T0c
2648 por $T0b, $Rb
2649 pand $TMP0, $T0d
2650 por $T0c, $Rc
2651 pand $TMP0, $T0e
2652 por $T0d, $Rd
2653 pand $TMP0, $T0f
2654 por $T0e, $Re
2655 por $T0f, $Rf
2656
2657 dec %rax
2658 jnz .Lselect_loop_sse_w5
2659
2660 movdqu $Ra, 16*0($val)
2661 movdqu $Rb, 16*1($val)
2662 movdqu $Rc, 16*2($val)
2663 movdqu $Rd, 16*3($val)
2664 movdqu $Re, 16*4($val)
2665 movdqu $Rf, 16*5($val)
2666___
2667$code.=<<___ if ($win64);
2668 movaps (%rsp), %xmm6
2669 movaps 0x10(%rsp), %xmm7
2670 movaps 0x20(%rsp), %xmm8
2671 movaps 0x30(%rsp), %xmm9
2672 movaps 0x40(%rsp), %xmm10
2673 movaps 0x50(%rsp), %xmm11
2674 movaps 0x60(%rsp), %xmm12
2675 movaps 0x70(%rsp), %xmm13
2676 movaps 0x80(%rsp), %xmm14
2677 movaps 0x90(%rsp), %xmm15
2678 lea 0xa8(%rsp), %rsp
2679___
2680$code.=<<___;
2681 ret
2682.cfi_endproc
2683.LSEH_end_ecp_nistz256_gather_w5:
2684.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2685
2686################################################################################
2687# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
2688.globl ecp_nistz256_scatter_w7
2689.type ecp_nistz256_scatter_w7,\@abi-omnipotent
2690.align 32
2691ecp_nistz256_scatter_w7:
2692.cfi_startproc
2693 movdqu 0x00($in_t), %xmm0
2694 shl \$6, $index
2695 movdqu 0x10($in_t), %xmm1
2696 movdqu 0x20($in_t), %xmm2
2697 movdqu 0x30($in_t), %xmm3
2698 movdqa %xmm0, 0x00($val,$index)
2699 movdqa %xmm1, 0x10($val,$index)
2700 movdqa %xmm2, 0x20($val,$index)
2701 movdqa %xmm3, 0x30($val,$index)
2702
2703 ret
2704.cfi_endproc
2705.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2706
2707################################################################################
2708# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
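# Same constant-time scan as in gather_w5 above, but over 64 affine entries
# of 64 bytes (four xmm words) each; index 0 again yields an all-zero result.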
2709.globl ecp_nistz256_gather_w7
2710.type ecp_nistz256_gather_w7,\@abi-omnipotent
2711.align 32
2712ecp_nistz256_gather_w7:
2713.cfi_startproc
2714___
2715$code.=<<___ if ($avx>1);
2716 mov OPENSSL_ia32cap_P+8(%rip), %eax
2717 test \$`1<<5`, %eax
2718 jnz .Lavx2_gather_w7
2719___
2720$code.=<<___ if ($win64);
2721 lea -0x88(%rsp), %rax
2722.LSEH_begin_ecp_nistz256_gather_w7:
2723 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
2724 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
2725 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
2726 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
2727 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
2728 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
2729 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
2730 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
2731 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
2732 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
2733 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
2734___
2735$code.=<<___;
2736 movdqa .LOne(%rip), $M0
2737 movd $index, $INDEX
2738
2739 pxor $Ra, $Ra
2740 pxor $Rb, $Rb
2741 pxor $Rc, $Rc
2742 pxor $Rd, $Rd
2743
2744 movdqa $M0, $ONE
2745 pshufd \$0, $INDEX, $INDEX
2746 mov \$64, %rax
2747
2748.Lselect_loop_sse_w7:
2749 movdqa $M0, $TMP0
2750 paddd $ONE, $M0
2751 movdqa 16*0($in_t), $T0a
2752 movdqa 16*1($in_t), $T0b
2753 pcmpeqd $INDEX, $TMP0
2754 movdqa 16*2($in_t), $T0c
2755 movdqa 16*3($in_t), $T0d
2756 lea 16*4($in_t), $in_t
2757
2758 pand $TMP0, $T0a
2759 pand $TMP0, $T0b
2760 por $T0a, $Ra
2761 pand $TMP0, $T0c
2762 por $T0b, $Rb
2763 pand $TMP0, $T0d
2764 por $T0c, $Rc
2765 prefetcht0 255($in_t)
2766 por $T0d, $Rd
2767
2768 dec %rax
2769 jnz .Lselect_loop_sse_w7
2770
2771 movdqu $Ra, 16*0($val)
2772 movdqu $Rb, 16*1($val)
2773 movdqu $Rc, 16*2($val)
2774 movdqu $Rd, 16*3($val)
2775___
2776$code.=<<___ if ($win64);
2777 movaps (%rsp), %xmm6
2778 movaps 0x10(%rsp), %xmm7
2779 movaps 0x20(%rsp), %xmm8
2780 movaps 0x30(%rsp), %xmm9
2781 movaps 0x40(%rsp), %xmm10
2782 movaps 0x50(%rsp), %xmm11
2783 movaps 0x60(%rsp), %xmm12
2784 movaps 0x70(%rsp), %xmm13
2785 movaps 0x80(%rsp), %xmm14
2786 movaps 0x90(%rsp), %xmm15
2787 lea 0xa8(%rsp), %rsp
2788___
2789$code.=<<___;
2790 ret
2791.cfi_endproc
2792.LSEH_end_ecp_nistz256_gather_w7:
2793.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
2794___
2795}
2796if ($avx>1) {
2797my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2798my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
2799my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
2800my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
2801
2802$code.=<<___;
2803################################################################################
2804# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
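# AVX2 variant of the w5 gather: two running match masks, seeded with .LOne
# and .LTwo and both stepped by .LTwo, let each of the 8 iterations test and
# conditionally accumulate two 96-byte entries at a time.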
2805.type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
2806.align 32
2807ecp_nistz256_avx2_gather_w5:
2808.cfi_startproc
2809.Lavx2_gather_w5:
2810 vzeroupper
2811___
2812$code.=<<___ if ($win64);
2813 lea -0x88(%rsp), %rax
2814 mov %rsp,%r11
2815.LSEH_begin_ecp_nistz256_avx2_gather_w5:
2816 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp
2817 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax)
2818 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax)
2819	.byte 0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2820 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax)
2821 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax)
2822 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax)
2823 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax)
2824 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax)
2825 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax)
2826 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax)
2827___
2828$code.=<<___;
2829 vmovdqa .LTwo(%rip), $TWO
2830
2831 vpxor $Ra, $Ra, $Ra
2832 vpxor $Rb, $Rb, $Rb
2833 vpxor $Rc, $Rc, $Rc
2834
2835 vmovdqa .LOne(%rip), $M0
2836 vmovdqa .LTwo(%rip), $M1
2837
2838 vmovd $index, %xmm1
2839 vpermd $INDEX, $Ra, $INDEX
2840
2841 mov \$8, %rax
2842.Lselect_loop_avx2_w5:
2843
2844 vmovdqa 32*0($in_t), $T0a
2845 vmovdqa 32*1($in_t), $T0b
2846 vmovdqa 32*2($in_t), $T0c
2847
2848 vmovdqa 32*3($in_t), $T1a
2849 vmovdqa 32*4($in_t), $T1b
2850 vmovdqa 32*5($in_t), $T1c
2851
2852 vpcmpeqd $INDEX, $M0, $TMP0
2853 vpcmpeqd $INDEX, $M1, $TMP1
2854
2855 vpaddd $TWO, $M0, $M0
2856 vpaddd $TWO, $M1, $M1
2857 lea 32*6($in_t), $in_t
2858
2859 vpand $TMP0, $T0a, $T0a
2860 vpand $TMP0, $T0b, $T0b
2861 vpand $TMP0, $T0c, $T0c
2862 vpand $TMP1, $T1a, $T1a
2863 vpand $TMP1, $T1b, $T1b
2864 vpand $TMP1, $T1c, $T1c
2865
2866 vpxor $T0a, $Ra, $Ra
2867 vpxor $T0b, $Rb, $Rb
2868 vpxor $T0c, $Rc, $Rc
2869 vpxor $T1a, $Ra, $Ra
2870 vpxor $T1b, $Rb, $Rb
2871 vpxor $T1c, $Rc, $Rc
2872
2873 dec %rax
2874 jnz .Lselect_loop_avx2_w5
2875
2876 vmovdqu $Ra, 32*0($val)
2877 vmovdqu $Rb, 32*1($val)
2878 vmovdqu $Rc, 32*2($val)
2879 vzeroupper
2880___
2881$code.=<<___ if ($win64);
2882 movaps (%rsp), %xmm6
2883 movaps 0x10(%rsp), %xmm7
2884 movaps 0x20(%rsp), %xmm8
2885 movaps 0x30(%rsp), %xmm9
2886 movaps 0x40(%rsp), %xmm10
2887 movaps 0x50(%rsp), %xmm11
2888 movaps 0x60(%rsp), %xmm12
2889 movaps 0x70(%rsp), %xmm13
2890 movaps 0x80(%rsp), %xmm14
2891 movaps 0x90(%rsp), %xmm15
2892 lea (%r11), %rsp
2893___
2894$code.=<<___;
2895 ret
2896.cfi_endproc
2897.LSEH_end_ecp_nistz256_avx2_gather_w5:
2898.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
2899___
2900}
2901if ($avx>1) {
2902my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2903my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
2904my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
2905my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
2906my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
2907
2908$code.=<<___;
2909
2910################################################################################
2911# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
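# AVX2 variant of the w7 gather: three running match masks stepped by
# .LThree cover three 64-byte entries per iteration, so 21 iterations plus
# the single tail entry handled after the loop account for all 64 entries.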
2912.globl ecp_nistz256_avx2_gather_w7
2913.type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
2914.align 32
2915ecp_nistz256_avx2_gather_w7:
2916.cfi_startproc
2917.Lavx2_gather_w7:
2918 vzeroupper
2919___
2920$code.=<<___ if ($win64);
2921 mov %rsp,%r11
2922 lea -0x88(%rsp), %rax
2923.LSEH_begin_ecp_nistz256_avx2_gather_w7:
2924 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp
2925 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax)
2926 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax)
2927	.byte 0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2928 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax)
2929 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax)
2930 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax)
2931 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax)
2932 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax)
2933 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax)
2934 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax)
2935___
2936$code.=<<___;
2937 vmovdqa .LThree(%rip), $THREE
2938
2939 vpxor $Ra, $Ra, $Ra
2940 vpxor $Rb, $Rb, $Rb
2941
2942 vmovdqa .LOne(%rip), $M0
2943 vmovdqa .LTwo(%rip), $M1
2944 vmovdqa .LThree(%rip), $M2
2945
2946 vmovd $index, %xmm1
2947 vpermd $INDEX, $Ra, $INDEX
2948 # Skip index = 0, because it is implicitly the point at infinity
2949
2950 mov \$21, %rax
2951.Lselect_loop_avx2_w7:
2952
2953 vmovdqa 32*0($in_t), $T0a
2954 vmovdqa 32*1($in_t), $T0b
2955
2956 vmovdqa 32*2($in_t), $T1a
2957 vmovdqa 32*3($in_t), $T1b
2958
2959 vmovdqa 32*4($in_t), $T2a
2960 vmovdqa 32*5($in_t), $T2b
2961
2962 vpcmpeqd $INDEX, $M0, $TMP0
2963 vpcmpeqd $INDEX, $M1, $TMP1
2964 vpcmpeqd $INDEX, $M2, $TMP2
2965
2966 vpaddd $THREE, $M0, $M0
2967 vpaddd $THREE, $M1, $M1
2968 vpaddd $THREE, $M2, $M2
2969 lea 32*6($in_t), $in_t
2970
2971 vpand $TMP0, $T0a, $T0a
2972 vpand $TMP0, $T0b, $T0b
2973 vpand $TMP1, $T1a, $T1a
2974 vpand $TMP1, $T1b, $T1b
2975 vpand $TMP2, $T2a, $T2a
2976 vpand $TMP2, $T2b, $T2b
2977
2978 vpxor $T0a, $Ra, $Ra
2979 vpxor $T0b, $Rb, $Rb
2980 vpxor $T1a, $Ra, $Ra
2981 vpxor $T1b, $Rb, $Rb
2982 vpxor $T2a, $Ra, $Ra
2983 vpxor $T2b, $Rb, $Rb
2984
2985 dec %rax
2986 jnz .Lselect_loop_avx2_w7
2987
2988
2989 vmovdqa 32*0($in_t), $T0a
2990 vmovdqa 32*1($in_t), $T0b
2991
2992 vpcmpeqd $INDEX, $M0, $TMP0
2993
2994 vpand $TMP0, $T0a, $T0a
2995 vpand $TMP0, $T0b, $T0b
2996
2997 vpxor $T0a, $Ra, $Ra
2998 vpxor $T0b, $Rb, $Rb
2999
3000 vmovdqu $Ra, 32*0($val)
3001 vmovdqu $Rb, 32*1($val)
3002 vzeroupper
3003___
3004$code.=<<___ if ($win64);
3005 movaps (%rsp), %xmm6
3006 movaps 0x10(%rsp), %xmm7
3007 movaps 0x20(%rsp), %xmm8
3008 movaps 0x30(%rsp), %xmm9
3009 movaps 0x40(%rsp), %xmm10
3010 movaps 0x50(%rsp), %xmm11
3011 movaps 0x60(%rsp), %xmm12
3012 movaps 0x70(%rsp), %xmm13
3013 movaps 0x80(%rsp), %xmm14
3014 movaps 0x90(%rsp), %xmm15
3015 lea (%r11), %rsp
3016___
3017$code.=<<___;
3018 ret
3019.cfi_endproc
3020.LSEH_end_ecp_nistz256_avx2_gather_w7:
3021.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3022___
3023} else {
3024$code.=<<___;
3025.globl ecp_nistz256_avx2_gather_w7
3026.type ecp_nistz256_avx2_gather_w7,\@function,3
3027.align 32
3028ecp_nistz256_avx2_gather_w7:
3029.cfi_startproc
3030 .byte 0x0f,0x0b # ud2
3031 ret
3032.cfi_endproc
3033.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3034___
3035}
3036{{{
3037########################################################################
3038# This block implements the higher-level point_double, point_add and
3039# point_add_affine. The key to performance in this case is to allow the
3040# out-of-order execution logic to overlap computations from the next step
3041# with tail processing from the current step. By using a tailored calling
3042# sequence we minimize inter-step overhead and give the processor a better
3043# shot at overlapping operations...
3044#
3045# You will notice that input data is copied to the stack. Trouble is that
3046# there are no registers to spare for holding the original pointers, and
3047# reloading the pointers would create undesired dependencies on the
3048# effective-address calculation paths. In other words, it is all done
3049# to favour the out-of-order execution logic.
3050# <[email protected]>
3051
3052my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
3053my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
3054my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
3055my ($poly1,$poly3)=($acc6,$acc7);
3056
3057sub load_for_mul () {
3058my ($a,$b,$src0) = @_;
3059my $bias = $src0 eq "%rax" ? 0 : -128;
3060
3061" mov $b, $src0
3062 lea $b, $b_ptr
3063 mov 8*0+$a, $acc1
3064 mov 8*1+$a, $acc2
3065 lea $bias+$a, $a_ptr
3066 mov 8*2+$a, $acc3
3067 mov 8*3+$a, $acc4"
3068}
3069
3070sub load_for_sqr () {
3071my ($a,$src0) = @_;
3072my $bias = $src0 eq "%rax" ? 0 : -128;
3073
3074" mov 8*0+$a, $src0
3075 mov 8*1+$a, $acc6
3076 lea $bias+$a, $a_ptr
3077 mov 8*2+$a, $acc7
3078 mov 8*3+$a, $acc0"
3079}
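#
# The two helpers above emit the operand-staging part of the tailored calling
# sequence described in the block comment above: one operand's address goes
# to $b_ptr with its first limb in $src0, the other operand's four limbs are
# pre-loaded into the accumulator registers, and $a_ptr is biased by -128 on
# the mulx path so shorter displacements can be used. For illustration only
# (the offsets here are made up for the example),
# &load_for_mul("32(%rsp)", "64(%rsp)", "%rax") returns roughly:
#
#	mov 64(%rsp), %rax
#	lea 64(%rsp), %rbx
#	mov 8*0+32(%rsp), %r9
#	mov 8*1+32(%rsp), %r10
#	lea 0+32(%rsp), %rsi
#	mov 8*2+32(%rsp), %r11
#	mov 8*3+32(%rsp), %r12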
3080
3081 {
3082########################################################################
3083# operate in 4-5-0-1 "name space" that matches multiplication output
3084#
3085my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3086
3087$code.=<<___;
3088.type __ecp_nistz256_add_toq,\@abi-omnipotent
3089.align 32
3090__ecp_nistz256_add_toq:
3091.cfi_startproc
3092 xor $t4,$t4
3093 add 8*0($b_ptr), $a0
3094 adc 8*1($b_ptr), $a1
3095 mov $a0, $t0
3096 adc 8*2($b_ptr), $a2
3097 adc 8*3($b_ptr), $a3
3098 mov $a1, $t1
3099 adc \$0, $t4
3100
3101 sub \$-1, $a0
3102 mov $a2, $t2
3103 sbb $poly1, $a1
3104 sbb \$0, $a2
3105 mov $a3, $t3
3106 sbb $poly3, $a3
3107 sbb \$0, $t4
3108
3109 cmovc $t0, $a0
3110 cmovc $t1, $a1
3111 mov $a0, 8*0($r_ptr)
3112 cmovc $t2, $a2
3113 mov $a1, 8*1($r_ptr)
3114 cmovc $t3, $a3
3115 mov $a2, 8*2($r_ptr)
3116 mov $a3, 8*3($r_ptr)
3117
3118 ret
3119.cfi_endproc
3120.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
3121
3122.type __ecp_nistz256_sub_fromq,\@abi-omnipotent
3123.align 32
3124__ecp_nistz256_sub_fromq:
3125.cfi_startproc
3126 sub 8*0($b_ptr), $a0
3127 sbb 8*1($b_ptr), $a1
3128 mov $a0, $t0
3129 sbb 8*2($b_ptr), $a2
3130 sbb 8*3($b_ptr), $a3
3131 mov $a1, $t1
3132 sbb $t4, $t4
3133
3134 add \$-1, $a0
3135 mov $a2, $t2
3136 adc $poly1, $a1
3137 adc \$0, $a2
3138 mov $a3, $t3
3139 adc $poly3, $a3
3140 test $t4, $t4
3141
3142 cmovz $t0, $a0
3143 cmovz $t1, $a1
3144 mov $a0, 8*0($r_ptr)
3145 cmovz $t2, $a2
3146 mov $a1, 8*1($r_ptr)
3147 cmovz $t3, $a3
3148 mov $a2, 8*2($r_ptr)
3149 mov $a3, 8*3($r_ptr)
3150
3151 ret
3152.cfi_endproc
3153.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
3154
3155.type __ecp_nistz256_subq,\@abi-omnipotent
3156.align 32
3157__ecp_nistz256_subq:
3158.cfi_startproc
3159 sub $a0, $t0
3160 sbb $a1, $t1
3161 mov $t0, $a0
3162 sbb $a2, $t2
3163 sbb $a3, $t3
3164 mov $t1, $a1
3165 sbb $t4, $t4
3166
3167 add \$-1, $t0
3168 mov $t2, $a2
3169 adc $poly1, $t1
3170 adc \$0, $t2
3171 mov $t3, $a3
3172 adc $poly3, $t3
3173 test $t4, $t4
3174
3175 cmovnz $t0, $a0
3176 cmovnz $t1, $a1
3177 cmovnz $t2, $a2
3178 cmovnz $t3, $a3
3179
3180 ret
3181.cfi_endproc
3182.size __ecp_nistz256_subq,.-__ecp_nistz256_subq
3183
3184.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
3185.align 32
3186__ecp_nistz256_mul_by_2q:
3187.cfi_startproc
3188 xor $t4, $t4
3189 add $a0, $a0 # a0:a3+a0:a3
3190 adc $a1, $a1
3191 mov $a0, $t0
3192 adc $a2, $a2
3193 adc $a3, $a3
3194 mov $a1, $t1
3195 adc \$0, $t4
3196
3197 sub \$-1, $a0
3198 mov $a2, $t2
3199 sbb $poly1, $a1
3200 sbb \$0, $a2
3201 mov $a3, $t3
3202 sbb $poly3, $a3
3203 sbb \$0, $t4
3204
3205 cmovc $t0, $a0
3206 cmovc $t1, $a1
3207 mov $a0, 8*0($r_ptr)
3208 cmovc $t2, $a2
3209 mov $a1, 8*1($r_ptr)
3210 cmovc $t3, $a3
3211 mov $a2, 8*2($r_ptr)
3212 mov $a3, 8*3($r_ptr)
3213
3214 ret
3215.cfi_endproc
3216.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
3217___
3218 }
3219sub gen_double () {
3220 my $x = shift;
3221 my ($src0,$sfx,$bias);
3222 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
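	# The frame holds five 32-byte temporaries: S, M, Zsqr, a stashed copy
	# of the input x coordinate and tmp0. The call sequence that follows is
	# the usual a = -3 Jacobian doubling (matching the per-call comments):
	#   S = 4*X*Y^2, M = 3*(X - Z^2)*(X + Z^2),
	#   X3 = M^2 - 2*S, Y3 = M*(S - X3) - 8*Y^4, Z3 = 2*Y*Z.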
3223
3224 if ($x ne "x") {
3225 $src0 = "%rax";
3226 $sfx = "";
3227 $bias = 0;
3228
3229$code.=<<___;
3230.globl ecp_nistz256_point_double
3231.type ecp_nistz256_point_double,\@function,2
3232.align 32
3233ecp_nistz256_point_double:
3234.cfi_startproc
3235___
3236$code.=<<___ if ($addx);
3237 mov \$0x80100, %ecx
3238 and OPENSSL_ia32cap_P+8(%rip), %ecx
3239 cmp \$0x80100, %ecx
3240 je .Lpoint_doublex
3241___
3242 } else {
3243 $src0 = "%rdx";
3244 $sfx = "x";
3245 $bias = 128;
3246
3247$code.=<<___;
3248.type ecp_nistz256_point_doublex,\@function,2
3249.align 32
3250ecp_nistz256_point_doublex:
3251.cfi_startproc
3252.Lpoint_doublex:
3253___
3254 }
3255$code.=<<___;
3256 push %rbp
3257.cfi_push %rbp
3258 push %rbx
3259.cfi_push %rbx
3260 push %r12
3261.cfi_push %r12
3262 push %r13
3263.cfi_push %r13
3264 push %r14
3265.cfi_push %r14
3266 push %r15
3267.cfi_push %r15
3268 sub \$32*5+8, %rsp
3269.cfi_adjust_cfa_offset 32*5+8
3270.Lpoint_double${x}_body:
3271
3272.Lpoint_double_shortcut$x:
3273 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
3274 mov $a_ptr, $b_ptr # backup copy
3275 movdqu 0x10($a_ptr), %xmm1
3276 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
3277 mov 0x20+8*1($a_ptr), $acc5
3278 mov 0x20+8*2($a_ptr), $acc0
3279 mov 0x20+8*3($a_ptr), $acc1
3280 mov .Lpoly+8*1(%rip), $poly1
3281 mov .Lpoly+8*3(%rip), $poly3
3282 movdqa %xmm0, $in_x(%rsp)
3283 movdqa %xmm1, $in_x+0x10(%rsp)
3284 lea 0x20($r_ptr), $acc2
3285 lea 0x40($r_ptr), $acc3
3286 movq $r_ptr, %xmm0
3287 movq $acc2, %xmm1
3288 movq $acc3, %xmm2
3289
3290 lea $S(%rsp), $r_ptr
3291 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
3292
3293 mov 0x40+8*0($a_ptr), $src0
3294 mov 0x40+8*1($a_ptr), $acc6
3295 mov 0x40+8*2($a_ptr), $acc7
3296 mov 0x40+8*3($a_ptr), $acc0
3297 lea 0x40-$bias($a_ptr), $a_ptr
3298 lea $Zsqr(%rsp), $r_ptr
3299 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
3300
3301 `&load_for_sqr("$S(%rsp)", "$src0")`
3302 lea $S(%rsp), $r_ptr
3303 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
3304
3305 mov 0x20($b_ptr), $src0 # $b_ptr is still valid
3306 mov 0x40+8*0($b_ptr), $acc1
3307 mov 0x40+8*1($b_ptr), $acc2
3308 mov 0x40+8*2($b_ptr), $acc3
3309 mov 0x40+8*3($b_ptr), $acc4
3310 lea 0x40-$bias($b_ptr), $a_ptr
3311 lea 0x20($b_ptr), $b_ptr
3312 movq %xmm2, $r_ptr
3313 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
3314 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
3315
3316 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
3317 mov $in_x+8*1(%rsp), $acc5
3318 lea $Zsqr(%rsp), $b_ptr
3319 mov $in_x+8*2(%rsp), $acc0
3320 mov $in_x+8*3(%rsp), $acc1
3321 lea $M(%rsp), $r_ptr
3322 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
3323
3324 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
3325 mov $in_x+8*1(%rsp), $acc5
3326 lea $Zsqr(%rsp), $b_ptr
3327 mov $in_x+8*2(%rsp), $acc0
3328 mov $in_x+8*3(%rsp), $acc1
3329 lea $Zsqr(%rsp), $r_ptr
3330 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
3331
3332 `&load_for_sqr("$S(%rsp)", "$src0")`
3333 movq %xmm1, $r_ptr
3334 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
3335___
3336{
3337######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
3338# operate in 4-5-6-7 "name space" that matches squaring output
3339#
3340my ($poly1,$poly3)=($a_ptr,$t1);
3341my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
3342
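# The inline sequence emitted below is a halving mod p: add .Lpoly once if
# the value is odd (which leaves it unchanged mod p but makes it even), then
# shift the 257-bit intermediate right by one bit across the four limbs.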
3343$code.=<<___;
3344 xor $t4, $t4
3345 mov $a0, $t0
3346 add \$-1, $a0
3347 mov $a1, $t1
3348 adc $poly1, $a1
3349 mov $a2, $t2
3350 adc \$0, $a2
3351 mov $a3, $t3
3352 adc $poly3, $a3
3353 adc \$0, $t4
3354 xor $a_ptr, $a_ptr # borrow $a_ptr
3355 test \$1, $t0
3356
3357 cmovz $t0, $a0
3358 cmovz $t1, $a1
3359 cmovz $t2, $a2
3360 cmovz $t3, $a3
3361 cmovz $a_ptr, $t4
3362
3363 mov $a1, $t0 # a0:a3>>1
3364 shr \$1, $a0
3365 shl \$63, $t0
3366 mov $a2, $t1
3367 shr \$1, $a1
3368 or $t0, $a0
3369 shl \$63, $t1
3370 mov $a3, $t2
3371 shr \$1, $a2
3372 or $t1, $a1
3373 shl \$63, $t2
3374 mov $a0, 8*0($r_ptr)
3375 shr \$1, $a3
3376 mov $a1, 8*1($r_ptr)
3377 shl \$63, $t4
3378 or $t2, $a2
3379 or $t4, $a3
3380 mov $a2, 8*2($r_ptr)
3381 mov $a3, 8*3($r_ptr)
3382___
3383}
3384$code.=<<___;
3385 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
3386 lea $M(%rsp), $r_ptr
3387 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
3388
3389 lea $tmp0(%rsp), $r_ptr
3390 call __ecp_nistz256_mul_by_2$x
3391
3392 lea $M(%rsp), $b_ptr
3393 lea $M(%rsp), $r_ptr
3394 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
3395
3396 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
3397 lea $S(%rsp), $r_ptr
3398 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
3399
3400 lea $tmp0(%rsp), $r_ptr
3401 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
3402
3403 `&load_for_sqr("$M(%rsp)", "$src0")`
3404 movq %xmm0, $r_ptr
3405 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
3406
3407 lea $tmp0(%rsp), $b_ptr
3408 mov $acc6, $acc0 # harmonize sqr output and sub input
3409 mov $acc7, $acc1
3410 mov $a_ptr, $poly1
3411 mov $t1, $poly3
3412 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
3413
3414 mov $S+8*0(%rsp), $t0
3415 mov $S+8*1(%rsp), $t1
3416 mov $S+8*2(%rsp), $t2
3417 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
3418 lea $S(%rsp), $r_ptr
3419 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
3420
3421 mov $M(%rsp), $src0
3422 lea $M(%rsp), $b_ptr
3423 mov $acc4, $acc6 # harmonize sub output and mul input
3424 xor %ecx, %ecx
3425 mov $acc4, $S+8*0(%rsp) # have to save:-(
3426 mov $acc5, $acc2
3427 mov $acc5, $S+8*1(%rsp)
3428 cmovz $acc0, $acc3
3429 mov $acc0, $S+8*2(%rsp)
3430 lea $S-$bias(%rsp), $a_ptr
3431 cmovz $acc1, $acc4
3432 mov $acc1, $S+8*3(%rsp)
3433 mov $acc6, $acc1
3434 lea $S(%rsp), $r_ptr
3435 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
3436
3437 movq %xmm1, $b_ptr
3438 movq %xmm1, $r_ptr
3439 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
3440
3441 lea 32*5+56(%rsp), %rsi
3442.cfi_def_cfa %rsi,8
3443 mov -48(%rsi),%r15
3444.cfi_restore %r15
3445 mov -40(%rsi),%r14
3446.cfi_restore %r14
3447 mov -32(%rsi),%r13
3448.cfi_restore %r13
3449 mov -24(%rsi),%r12
3450.cfi_restore %r12
3451 mov -16(%rsi),%rbx
3452.cfi_restore %rbx
3453 mov -8(%rsi),%rbp
3454.cfi_restore %rbp
3455 lea (%rsi),%rsp
3456.cfi_def_cfa_register %rsp
3457.Lpoint_double${x}_epilogue:
3458 ret
3459.cfi_endproc
3460.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
3461___
3462}
3463&gen_double("q");
3464
3465sub gen_add () {
3466 my $x = shift;
3467 my ($src0,$sfx,$bias);
3468 my ($H,$Hsqr,$R,$Rsqr,$Hcub,
3469 $U1,$U2,$S1,$S2,
3470 $res_x,$res_y,$res_z,
3471 $in1_x,$in1_y,$in1_z,
3472 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
3473 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
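	# The call sequence below is the usual Jacobian addition (matching the
	# per-call comments): with U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3,
	# S2 = Y2*Z1^3, H = U2 - U1 and R = S2 - S1,
	#   X3 = R^2 - H^3 - 2*U1*H^2,
	#   Y3 = R*(U1*H^2 - X3) - S1*H^3,
	#   Z3 = H*Z1*Z2;
	# if H and R are both zero while neither input is the point at infinity,
	# the inputs are equal and control falls through to the doubling code.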
3474
3475 if ($x ne "x") {
3476 $src0 = "%rax";
3477 $sfx = "";
3478 $bias = 0;
3479
3480$code.=<<___;
3481.globl ecp_nistz256_point_add
3482.type ecp_nistz256_point_add,\@function,3
3483.align 32
3484ecp_nistz256_point_add:
3485.cfi_startproc
3486___
3487$code.=<<___ if ($addx);
3488 mov \$0x80100, %ecx
3489 and OPENSSL_ia32cap_P+8(%rip), %ecx
3490 cmp \$0x80100, %ecx
3491 je .Lpoint_addx
3492___
3493 } else {
3494 $src0 = "%rdx";
3495 $sfx = "x";
3496 $bias = 128;
3497
3498$code.=<<___;
3499.type ecp_nistz256_point_addx,\@function,3
3500.align 32
3501ecp_nistz256_point_addx:
3502.cfi_startproc
3503.Lpoint_addx:
3504___
3505 }
3506$code.=<<___;
3507 push %rbp
3508.cfi_push %rbp
3509 push %rbx
3510.cfi_push %rbx
3511 push %r12
3512.cfi_push %r12
3513 push %r13
3514.cfi_push %r13
3515 push %r14
3516.cfi_push %r14
3517 push %r15
3518.cfi_push %r15
3519 sub \$32*18+8, %rsp
3520.cfi_adjust_cfa_offset 32*18+8
3521.Lpoint_add${x}_body:
3522
3523 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
3524 movdqu 0x10($a_ptr), %xmm1
3525 movdqu 0x20($a_ptr), %xmm2
3526 movdqu 0x30($a_ptr), %xmm3
3527 movdqu 0x40($a_ptr), %xmm4
3528 movdqu 0x50($a_ptr), %xmm5
3529 mov $a_ptr, $b_ptr # reassign
3530 mov $b_org, $a_ptr # reassign
3531 movdqa %xmm0, $in1_x(%rsp)
3532 movdqa %xmm1, $in1_x+0x10(%rsp)
3533 movdqa %xmm2, $in1_y(%rsp)
3534 movdqa %xmm3, $in1_y+0x10(%rsp)
3535 movdqa %xmm4, $in1_z(%rsp)
3536 movdqa %xmm5, $in1_z+0x10(%rsp)
3537 por %xmm4, %xmm5
3538
3539 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
3540 pshufd \$0xb1, %xmm5, %xmm3
3541 movdqu 0x10($a_ptr), %xmm1
3542 movdqu 0x20($a_ptr), %xmm2
3543 por %xmm3, %xmm5
3544 movdqu 0x30($a_ptr), %xmm3
3545 mov 0x40+8*0($a_ptr), $src0 # load original in2_z
3546 mov 0x40+8*1($a_ptr), $acc6
3547 mov 0x40+8*2($a_ptr), $acc7
3548 mov 0x40+8*3($a_ptr), $acc0
3549 movdqa %xmm0, $in2_x(%rsp)
3550 pshufd \$0x1e, %xmm5, %xmm4
3551 movdqa %xmm1, $in2_x+0x10(%rsp)
3552 movdqu 0x40($a_ptr),%xmm0 # in2_z again
3553 movdqu 0x50($a_ptr),%xmm1
3554 movdqa %xmm2, $in2_y(%rsp)
3555 movdqa %xmm3, $in2_y+0x10(%rsp)
3556 por %xmm4, %xmm5
3557 pxor %xmm4, %xmm4
3558 por %xmm0, %xmm1
3559 movq $r_ptr, %xmm0 # save $r_ptr
3560
3561 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
3562 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
3563 mov $acc6, $in2_z+8*1(%rsp)
3564 mov $acc7, $in2_z+8*2(%rsp)
3565 mov $acc0, $in2_z+8*3(%rsp)
3566 lea $Z2sqr(%rsp), $r_ptr # Z2^2
3567 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
3568
3569 pcmpeqd %xmm4, %xmm5
3570 pshufd \$0xb1, %xmm1, %xmm4
3571 por %xmm1, %xmm4
3572 pshufd \$0, %xmm5, %xmm5 # in1infty
3573 pshufd \$0x1e, %xmm4, %xmm3
3574 por %xmm3, %xmm4
3575 pxor %xmm3, %xmm3
3576 pcmpeqd %xmm3, %xmm4
3577 pshufd \$0, %xmm4, %xmm4 # in2infty
3578 mov 0x40+8*0($b_ptr), $src0 # load original in1_z
3579 mov 0x40+8*1($b_ptr), $acc6
3580 mov 0x40+8*2($b_ptr), $acc7
3581 mov 0x40+8*3($b_ptr), $acc0
3582 movq $b_ptr, %xmm1
3583
3584 lea 0x40-$bias($b_ptr), $a_ptr
3585 lea $Z1sqr(%rsp), $r_ptr # Z1^2
3586 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
3587
3588 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
3589 lea $S1(%rsp), $r_ptr # S1 = Z2^3
3590 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
3591
3592 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3593 lea $S2(%rsp), $r_ptr # S2 = Z1^3
3594 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
3595
3596 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
3597 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
3598 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
3599
3600 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3601 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
3602 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
3603
3604 lea $S1(%rsp), $b_ptr
3605 lea $R(%rsp), $r_ptr # R = S2 - S1
3606 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
3607
3608 or $acc5, $acc4 # see if result is zero
3609 movdqa %xmm4, %xmm2
3610 or $acc0, $acc4
3611 or $acc1, $acc4
3612 por %xmm5, %xmm2 # in1infty || in2infty
3613 movq $acc4, %xmm3
3614
3615 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3616 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
3617 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
3618
3619 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
3620 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
3621 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
3622
3623 lea $U1(%rsp), $b_ptr
3624 lea $H(%rsp), $r_ptr # H = U2 - U1
3625 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
3626
3627 or $acc5, $acc4 # see if result is zero
3628 or $acc0, $acc4
3629 or $acc1, $acc4 # !is_equal(U1, U2)
3630
3631 movq %xmm2, $acc0 # in1infty | in2infty
3632 movq %xmm3, $acc1 # !is_equal(S1, S2)
3633
3634 or $acc0, $acc4
3635 or $acc1, $acc4
3636
3637 # if (!is_equal(U1, U2) | in1infty | in2infty | !is_equal(S1, S2))
3638 .byte 0x3e # predict taken
3639 jnz .Ladd_proceed$x
3640
3641.Ladd_double$x:
3642 movq %xmm1, $a_ptr # restore $a_ptr
3643 movq %xmm0, $r_ptr # restore $r_ptr
3644 add \$`32*(18-5)`, %rsp # difference in frame sizes
3645.cfi_adjust_cfa_offset `-32*(18-5)`
3646 jmp .Lpoint_double_shortcut$x
3647.cfi_adjust_cfa_offset `32*(18-5)`
3648
3649.align 32
3650.Ladd_proceed$x:
3651 `&load_for_sqr("$R(%rsp)", "$src0")`
3652 lea $Rsqr(%rsp), $r_ptr # R^2
3653 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
3654
3655 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3656 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
3657 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
3658
3659 `&load_for_sqr("$H(%rsp)", "$src0")`
3660 lea $Hsqr(%rsp), $r_ptr # H^2
3661 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
3662
3663 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
3664 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
3665 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
3666
3667 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
3668 lea $Hcub(%rsp), $r_ptr # H^3
3669 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
3670
3671 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
3672 lea $U2(%rsp), $r_ptr # U1*H^2
3673 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
3674___
3675{
3676#######################################################################
3677# operate in 4-5-0-1 "name space" that matches multiplication output
3678#
3679my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3680my ($poly1, $poly3)=($acc6,$acc7);
3681
3682$code.=<<___;
3683 #lea $U2(%rsp), $a_ptr
3684 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
3685 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
3686
3687 xor $t4, $t4
3688 add $acc0, $acc0 # a0:a3+a0:a3
3689 lea $Rsqr(%rsp), $a_ptr
3690 adc $acc1, $acc1
3691 mov $acc0, $t0
3692 adc $acc2, $acc2
3693 adc $acc3, $acc3
3694 mov $acc1, $t1
3695 adc \$0, $t4
3696
3697 sub \$-1, $acc0
3698 mov $acc2, $t2
3699 sbb $poly1, $acc1
3700 sbb \$0, $acc2
3701 mov $acc3, $t3
3702 sbb $poly3, $acc3
3703 sbb \$0, $t4
3704
3705 cmovc $t0, $acc0
3706 mov 8*0($a_ptr), $t0
3707 cmovc $t1, $acc1
3708 mov 8*1($a_ptr), $t1
3709 cmovc $t2, $acc2
3710 mov 8*2($a_ptr), $t2
3711 cmovc $t3, $acc3
3712 mov 8*3($a_ptr), $t3
3713
3714 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
3715
3716 lea $Hcub(%rsp), $b_ptr
3717 lea $res_x(%rsp), $r_ptr
3718 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
3719
3720 mov $U2+8*0(%rsp), $t0
3721 mov $U2+8*1(%rsp), $t1
3722 mov $U2+8*2(%rsp), $t2
3723 mov $U2+8*3(%rsp), $t3
3724 lea $res_y(%rsp), $r_ptr
3725
3726 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
3727
3728	mov $acc0, 8*0($r_ptr)		# save the result, as
3729	mov $acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
3730 mov $acc2, 8*2($r_ptr)
3731 mov $acc3, 8*3($r_ptr)
3732___
3733}
3734$code.=<<___;
3735 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
3736 lea $S2(%rsp), $r_ptr
3737 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
3738
3739 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
3740 lea $res_y(%rsp), $r_ptr
3741 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
3742
3743 lea $S2(%rsp), $b_ptr
3744 lea $res_y(%rsp), $r_ptr
3745 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
3746
3747 movq %xmm0, $r_ptr # restore $r_ptr
3748
3749 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
3750 movdqa %xmm5, %xmm1
3751 pandn $res_z(%rsp), %xmm0
3752 movdqa %xmm5, %xmm2
3753 pandn $res_z+0x10(%rsp), %xmm1
3754 movdqa %xmm5, %xmm3
3755 pand $in2_z(%rsp), %xmm2
3756 pand $in2_z+0x10(%rsp), %xmm3
3757 por %xmm0, %xmm2
3758 por %xmm1, %xmm3
3759
3760 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
3761 movdqa %xmm4, %xmm1
3762 pandn %xmm2, %xmm0
3763 movdqa %xmm4, %xmm2
3764 pandn %xmm3, %xmm1
3765 movdqa %xmm4, %xmm3
3766 pand $in1_z(%rsp), %xmm2
3767 pand $in1_z+0x10(%rsp), %xmm3
3768 por %xmm0, %xmm2
3769 por %xmm1, %xmm3
3770 movdqu %xmm2, 0x40($r_ptr)
3771 movdqu %xmm3, 0x50($r_ptr)
3772
3773 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
3774 movdqa %xmm5, %xmm1
3775 pandn $res_x(%rsp), %xmm0
3776 movdqa %xmm5, %xmm2
3777 pandn $res_x+0x10(%rsp), %xmm1
3778 movdqa %xmm5, %xmm3
3779 pand $in2_x(%rsp), %xmm2
3780 pand $in2_x+0x10(%rsp), %xmm3
3781 por %xmm0, %xmm2
3782 por %xmm1, %xmm3
3783
3784 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
3785 movdqa %xmm4, %xmm1
3786 pandn %xmm2, %xmm0
3787 movdqa %xmm4, %xmm2
3788 pandn %xmm3, %xmm1
3789 movdqa %xmm4, %xmm3
3790 pand $in1_x(%rsp), %xmm2
3791 pand $in1_x+0x10(%rsp), %xmm3
3792 por %xmm0, %xmm2
3793 por %xmm1, %xmm3
3794 movdqu %xmm2, 0x00($r_ptr)
3795 movdqu %xmm3, 0x10($r_ptr)
3796
3797 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
3798 movdqa %xmm5, %xmm1
3799 pandn $res_y(%rsp), %xmm0
3800 movdqa %xmm5, %xmm2
3801 pandn $res_y+0x10(%rsp), %xmm1
3802 movdqa %xmm5, %xmm3
3803 pand $in2_y(%rsp), %xmm2
3804 pand $in2_y+0x10(%rsp), %xmm3
3805 por %xmm0, %xmm2
3806 por %xmm1, %xmm3
3807
3808 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
3809 movdqa %xmm4, %xmm1
3810 pandn %xmm2, %xmm0
3811 movdqa %xmm4, %xmm2
3812 pandn %xmm3, %xmm1
3813 movdqa %xmm4, %xmm3
3814 pand $in1_y(%rsp), %xmm2
3815 pand $in1_y+0x10(%rsp), %xmm3
3816 por %xmm0, %xmm2
3817 por %xmm1, %xmm3
3818 movdqu %xmm2, 0x20($r_ptr)
3819 movdqu %xmm3, 0x30($r_ptr)
3820
3821.Ladd_done$x:
3822 lea 32*18+56(%rsp), %rsi
3823.cfi_def_cfa %rsi,8
3824 mov -48(%rsi),%r15
3825.cfi_restore %r15
3826 mov -40(%rsi),%r14
3827.cfi_restore %r14
3828 mov -32(%rsi),%r13
3829.cfi_restore %r13
3830 mov -24(%rsi),%r12
3831.cfi_restore %r12
3832 mov -16(%rsi),%rbx
3833.cfi_restore %rbx
3834 mov -8(%rsi),%rbp
3835.cfi_restore %rbp
3836 lea (%rsi),%rsp
3837.cfi_def_cfa_register %rsp
3838.Lpoint_add${x}_epilogue:
3839 ret
3840.cfi_endproc
3841.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
3842___
3843}
3844&gen_add("q");
3845
3846sub gen_add_affine () {
3847 my $x = shift;
3848 my ($src0,$sfx,$bias);
3849 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
3850 $res_x,$res_y,$res_z,
3851 $in1_x,$in1_y,$in1_z,
3852 $in2_x,$in2_y)=map(32*$_,(0..14));
3853 my $Z1sqr = $S2;
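	# Mixed Jacobian + affine addition: the second input carries an implicit
	# Z2 = 1, so U1 = X1 and S1 = Y1 come for free and only U2 = X2*Z1^2,
	# S2 = Y2*Z1^3, H = U2 - X1 and R = S2 - Y1 are computed; when the
	# Jacobian input is the point at infinity the result Z coordinate is
	# taken from .LONE_mont instead.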
3854
3855 if ($x ne "x") {
3856 $src0 = "%rax";
3857 $sfx = "";
3858 $bias = 0;
3859
3860$code.=<<___;
3861.globl ecp_nistz256_point_add_affine
3862.type ecp_nistz256_point_add_affine,\@function,3
3863.align 32
3864ecp_nistz256_point_add_affine:
3865.cfi_startproc
3866___
3867$code.=<<___ if ($addx);
3868 mov \$0x80100, %ecx
3869 and OPENSSL_ia32cap_P+8(%rip), %ecx
3870 cmp \$0x80100, %ecx
3871 je .Lpoint_add_affinex
3872___
3873 } else {
3874 $src0 = "%rdx";
3875 $sfx = "x";
3876 $bias = 128;
3877
3878$code.=<<___;
3879.type ecp_nistz256_point_add_affinex,\@function,3
3880.align 32
3881ecp_nistz256_point_add_affinex:
3882.cfi_startproc
3883.Lpoint_add_affinex:
3884___
3885 }
3886$code.=<<___;
3887 push %rbp
3888.cfi_push %rbp
3889 push %rbx
3890.cfi_push %rbx
3891 push %r12
3892.cfi_push %r12
3893 push %r13
3894.cfi_push %r13
3895 push %r14
3896.cfi_push %r14
3897 push %r15
3898.cfi_push %r15
3899 sub \$32*15+8, %rsp
3900.cfi_adjust_cfa_offset 32*15+8
3901.Ladd_affine${x}_body:
3902
3903 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
3904 mov $b_org, $b_ptr # reassign
3905 movdqu 0x10($a_ptr), %xmm1
3906 movdqu 0x20($a_ptr), %xmm2
3907 movdqu 0x30($a_ptr), %xmm3
3908 movdqu 0x40($a_ptr), %xmm4
3909 movdqu 0x50($a_ptr), %xmm5
3910 mov 0x40+8*0($a_ptr), $src0 # load original in1_z
3911 mov 0x40+8*1($a_ptr), $acc6
3912 mov 0x40+8*2($a_ptr), $acc7
3913 mov 0x40+8*3($a_ptr), $acc0
3914 movdqa %xmm0, $in1_x(%rsp)
3915 movdqa %xmm1, $in1_x+0x10(%rsp)
3916 movdqa %xmm2, $in1_y(%rsp)
3917 movdqa %xmm3, $in1_y+0x10(%rsp)
3918 movdqa %xmm4, $in1_z(%rsp)
3919 movdqa %xmm5, $in1_z+0x10(%rsp)
3920 por %xmm4, %xmm5
3921
3922 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
3923 pshufd \$0xb1, %xmm5, %xmm3
3924 movdqu 0x10($b_ptr), %xmm1
3925 movdqu 0x20($b_ptr), %xmm2
3926 por %xmm3, %xmm5
3927 movdqu 0x30($b_ptr), %xmm3
3928 movdqa %xmm0, $in2_x(%rsp)
3929 pshufd \$0x1e, %xmm5, %xmm4
3930 movdqa %xmm1, $in2_x+0x10(%rsp)
3931 por %xmm0, %xmm1
3932 movq $r_ptr, %xmm0 # save $r_ptr
3933 movdqa %xmm2, $in2_y(%rsp)
3934 movdqa %xmm3, $in2_y+0x10(%rsp)
3935 por %xmm2, %xmm3
3936 por %xmm4, %xmm5
3937 pxor %xmm4, %xmm4
3938 por %xmm1, %xmm3
3939
3940 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
3941 lea $Z1sqr(%rsp), $r_ptr # Z1^2
3942 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
3943
3944 pcmpeqd %xmm4, %xmm5
3945 pshufd \$0xb1, %xmm3, %xmm4
3946 mov 0x00($b_ptr), $src0 # $b_ptr is still valid
3947 #lea 0x00($b_ptr), $b_ptr
3948 mov $acc4, $acc1 # harmonize sqr output and mul input
3949 por %xmm3, %xmm4
3950 pshufd \$0, %xmm5, %xmm5 # in1infty
3951 pshufd \$0x1e, %xmm4, %xmm3
3952 mov $acc5, $acc2
3953 por %xmm3, %xmm4
3954 pxor %xmm3, %xmm3
3955 mov $acc6, $acc3
3956 pcmpeqd %xmm3, %xmm4
3957 pshufd \$0, %xmm4, %xmm4 # in2infty
3958
3959 lea $Z1sqr-$bias(%rsp), $a_ptr
3960 mov $acc7, $acc4
3961 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
3962 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
3963
3964 lea $in1_x(%rsp), $b_ptr
3965 lea $H(%rsp), $r_ptr # H = U2 - U1
3966 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
3967
3968 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3969 lea $S2(%rsp), $r_ptr # S2 = Z1^3
3970 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
3971
3972 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3973 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
3974 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
3975
3976 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3977 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
3978 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
3979
3980 lea $in1_y(%rsp), $b_ptr
3981 lea $R(%rsp), $r_ptr # R = S2 - S1
3982 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
3983
3984 `&load_for_sqr("$H(%rsp)", "$src0")`
3985 lea $Hsqr(%rsp), $r_ptr # H^2
3986 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
3987
3988 `&load_for_sqr("$R(%rsp)", "$src0")`
3989 lea $Rsqr(%rsp), $r_ptr # R^2
3990 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
3991
3992 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
3993 lea $Hcub(%rsp), $r_ptr # H^3
3994 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
3995
3996 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3997 lea $U2(%rsp), $r_ptr # U1*H^2
3998 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
3999___
4000{
4001#######################################################################
4002# operate in 4-5-0-1 "name space" that matches multiplication output
4003#
4004my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4005my ($poly1, $poly3)=($acc6,$acc7);
4006
4007$code.=<<___;
4008 #lea $U2(%rsp), $a_ptr
4009 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
4010 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
4011
4012 xor $t4, $t4
4013 add $acc0, $acc0 # a0:a3+a0:a3
4014 lea $Rsqr(%rsp), $a_ptr
4015 adc $acc1, $acc1
4016 mov $acc0, $t0
4017 adc $acc2, $acc2
4018 adc $acc3, $acc3
4019 mov $acc1, $t1
4020 adc \$0, $t4
4021
4022 sub \$-1, $acc0
4023 mov $acc2, $t2
4024 sbb $poly1, $acc1
4025 sbb \$0, $acc2
4026 mov $acc3, $t3
4027 sbb $poly3, $acc3
4028 sbb \$0, $t4
4029
4030 cmovc $t0, $acc0
4031 mov 8*0($a_ptr), $t0
4032 cmovc $t1, $acc1
4033 mov 8*1($a_ptr), $t1
4034 cmovc $t2, $acc2
4035 mov 8*2($a_ptr), $t2
4036 cmovc $t3, $acc3
4037 mov 8*3($a_ptr), $t3
4038
4039 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
4040
4041 lea $Hcub(%rsp), $b_ptr
4042 lea $res_x(%rsp), $r_ptr
4043 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
4044
4045 mov $U2+8*0(%rsp), $t0
4046 mov $U2+8*1(%rsp), $t1
4047 mov $U2+8*2(%rsp), $t2
4048 mov $U2+8*3(%rsp), $t3
4049 lea $H(%rsp), $r_ptr
4050
4051 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
4052
4053	mov $acc0, 8*0($r_ptr)		# save the result, as
4054	mov $acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
4055 mov $acc2, 8*2($r_ptr)
4056 mov $acc3, 8*3($r_ptr)
4057___
4058}
4059$code.=<<___;
4060 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
4061 lea $S2(%rsp), $r_ptr
4062 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
4063
4064 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
4065 lea $H(%rsp), $r_ptr
4066 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
4067
4068 lea $S2(%rsp), $b_ptr
4069 lea $res_y(%rsp), $r_ptr
4070 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
4071
4072 movq %xmm0, $r_ptr # restore $r_ptr
4073
4074 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
4075 movdqa %xmm5, %xmm1
4076 pandn $res_z(%rsp), %xmm0
4077 movdqa %xmm5, %xmm2
4078 pandn $res_z+0x10(%rsp), %xmm1
4079 movdqa %xmm5, %xmm3
4080 pand .LONE_mont(%rip), %xmm2
4081 pand .LONE_mont+0x10(%rip), %xmm3
4082 por %xmm0, %xmm2
4083 por %xmm1, %xmm3
4084
4085 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
4086 movdqa %xmm4, %xmm1
4087 pandn %xmm2, %xmm0
4088 movdqa %xmm4, %xmm2
4089 pandn %xmm3, %xmm1
4090 movdqa %xmm4, %xmm3
4091 pand $in1_z(%rsp), %xmm2
4092 pand $in1_z+0x10(%rsp), %xmm3
4093 por %xmm0, %xmm2
4094 por %xmm1, %xmm3
4095 movdqu %xmm2, 0x40($r_ptr)
4096 movdqu %xmm3, 0x50($r_ptr)
4097
4098 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
4099 movdqa %xmm5, %xmm1
4100 pandn $res_x(%rsp), %xmm0
4101 movdqa %xmm5, %xmm2
4102 pandn $res_x+0x10(%rsp), %xmm1
4103 movdqa %xmm5, %xmm3
4104 pand $in2_x(%rsp), %xmm2
4105 pand $in2_x+0x10(%rsp), %xmm3
4106 por %xmm0, %xmm2
4107 por %xmm1, %xmm3
4108
4109 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
4110 movdqa %xmm4, %xmm1
4111 pandn %xmm2, %xmm0
4112 movdqa %xmm4, %xmm2
4113 pandn %xmm3, %xmm1
4114 movdqa %xmm4, %xmm3
4115 pand $in1_x(%rsp), %xmm2
4116 pand $in1_x+0x10(%rsp), %xmm3
4117 por %xmm0, %xmm2
4118 por %xmm1, %xmm3
4119 movdqu %xmm2, 0x00($r_ptr)
4120 movdqu %xmm3, 0x10($r_ptr)
4121
4122 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
4123 movdqa %xmm5, %xmm1
4124 pandn $res_y(%rsp), %xmm0
4125 movdqa %xmm5, %xmm2
4126 pandn $res_y+0x10(%rsp), %xmm1
4127 movdqa %xmm5, %xmm3
4128 pand $in2_y(%rsp), %xmm2
4129 pand $in2_y+0x10(%rsp), %xmm3
4130 por %xmm0, %xmm2
4131 por %xmm1, %xmm3
4132
4133 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
4134 movdqa %xmm4, %xmm1
4135 pandn %xmm2, %xmm0
4136 movdqa %xmm4, %xmm2
4137 pandn %xmm3, %xmm1
4138 movdqa %xmm4, %xmm3
4139 pand $in1_y(%rsp), %xmm2
4140 pand $in1_y+0x10(%rsp), %xmm3
4141 por %xmm0, %xmm2
4142 por %xmm1, %xmm3
4143 movdqu %xmm2, 0x20($r_ptr)
4144 movdqu %xmm3, 0x30($r_ptr)
4145
4146 lea 32*15+56(%rsp), %rsi
4147.cfi_def_cfa %rsi,8
4148 mov -48(%rsi),%r15
4149.cfi_restore %r15
4150 mov -40(%rsi),%r14
4151.cfi_restore %r14
4152 mov -32(%rsi),%r13
4153.cfi_restore %r13
4154 mov -24(%rsi),%r12
4155.cfi_restore %r12
4156 mov -16(%rsi),%rbx
4157.cfi_restore %rbx
4158 mov -8(%rsi),%rbp
4159.cfi_restore %rbp
4160 lea (%rsi),%rsp
4161.cfi_def_cfa_register %rsp
4162.Ladd_affine${x}_epilogue:
4163 ret
4164.cfi_endproc
4165.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
4166___
4167}
4168&gen_add_affine("q");
4169
4170########################################################################
4171# AD*X magic
4172#
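# The x-suffixed subroutines generated below pair with the mulx/adcx/adox
# based Montgomery code; the q-flavoured entry points dispatch to them only
# when both the BMI2 and ADX feature bits are set, roughly:
#
#	if ((OPENSSL_ia32cap_P[2] & 0x80100) == 0x80100)	# BMI2 | ADX
#		goto point_doublex / point_addx / point_add_affinex;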
4173if ($addx) { {
4174########################################################################
4175# operate in 4-5-0-1 "name space" that matches multiplication output
4176#
4177my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4178
4179$code.=<<___;
4180.type __ecp_nistz256_add_tox,\@abi-omnipotent
4181.align 32
4182__ecp_nistz256_add_tox:
4183.cfi_startproc
4184 xor $t4, $t4
4185 adc 8*0($b_ptr), $a0
4186 adc 8*1($b_ptr), $a1
4187 mov $a0, $t0
4188 adc 8*2($b_ptr), $a2
4189 adc 8*3($b_ptr), $a3
4190 mov $a1, $t1
4191 adc \$0, $t4
4192
4193 xor $t3, $t3
4194 sbb \$-1, $a0
4195 mov $a2, $t2
4196 sbb $poly1, $a1
4197 sbb \$0, $a2
4198 mov $a3, $t3
4199 sbb $poly3, $a3
4200 sbb \$0, $t4
4201
4202 cmovc $t0, $a0
4203 cmovc $t1, $a1
4204 mov $a0, 8*0($r_ptr)
4205 cmovc $t2, $a2
4206 mov $a1, 8*1($r_ptr)
4207 cmovc $t3, $a3
4208 mov $a2, 8*2($r_ptr)
4209 mov $a3, 8*3($r_ptr)
4210
4211 ret
4212.cfi_endproc
4213.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
4214
4215.type __ecp_nistz256_sub_fromx,\@abi-omnipotent
4216.align 32
4217__ecp_nistz256_sub_fromx:
4218.cfi_startproc
4219 xor $t4, $t4
4220 sbb 8*0($b_ptr), $a0
4221 sbb 8*1($b_ptr), $a1
4222 mov $a0, $t0
4223 sbb 8*2($b_ptr), $a2
4224 sbb 8*3($b_ptr), $a3
4225 mov $a1, $t1
4226 sbb \$0, $t4
4227
4228 xor $t3, $t3
4229 adc \$-1, $a0
4230 mov $a2, $t2
4231 adc $poly1, $a1
4232 adc \$0, $a2
4233 mov $a3, $t3
4234 adc $poly3, $a3
4235
4236 bt \$0, $t4
4237 cmovnc $t0, $a0
4238 cmovnc $t1, $a1
4239 mov $a0, 8*0($r_ptr)
4240 cmovnc $t2, $a2
4241 mov $a1, 8*1($r_ptr)
4242 cmovnc $t3, $a3
4243 mov $a2, 8*2($r_ptr)
4244 mov $a3, 8*3($r_ptr)
4245
4246 ret
4247.cfi_endproc
4248.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
4249
4250.type __ecp_nistz256_subx,\@abi-omnipotent
4251.align 32
4252__ecp_nistz256_subx:
4253.cfi_startproc
4254 xor $t4, $t4
4255 sbb $a0, $t0
4256 sbb $a1, $t1
4257 mov $t0, $a0
4258 sbb $a2, $t2
4259 sbb $a3, $t3
4260 mov $t1, $a1
4261 sbb \$0, $t4
4262
4263	xor $a3, $a3
4264 adc \$-1, $t0
4265 mov $t2, $a2
4266 adc $poly1, $t1
4267 adc \$0, $t2
4268 mov $t3, $a3
4269 adc $poly3, $t3
4270
4271 bt \$0, $t4
4272 cmovc $t0, $a0
4273 cmovc $t1, $a1
4274 cmovc $t2, $a2
4275 cmovc $t3, $a3
4276
4277 ret
4278.cfi_endproc
4279.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
4280
4281.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent
4282.align 32
4283__ecp_nistz256_mul_by_2x:
4284.cfi_startproc
4285 xor $t4, $t4
4286 adc $a0, $a0 # a0:a3+a0:a3
4287 adc $a1, $a1
4288 mov $a0, $t0
4289 adc $a2, $a2
4290 adc $a3, $a3
4291 mov $a1, $t1
4292 adc \$0, $t4
4293
4294 xor $t3, $t3
4295 sbb \$-1, $a0
4296 mov $a2, $t2
4297 sbb $poly1, $a1
4298 sbb \$0, $a2
4299 mov $a3, $t3
4300 sbb $poly3, $a3
4301 sbb \$0, $t4
4302
4303 cmovc $t0, $a0
4304 cmovc $t1, $a1
4305 mov $a0, 8*0($r_ptr)
4306 cmovc $t2, $a2
4307 mov $a1, 8*1($r_ptr)
4308 cmovc $t3, $a3
4309 mov $a2, 8*2($r_ptr)
4310 mov $a3, 8*3($r_ptr)
4311
4312 ret
4313.cfi_endproc
4314.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
4315___
4316 }
4317&gen_double("x");
4318&gen_add("x");
4319&gen_add_affine("x");
4320}
4321}}}
4322
4323# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4324# CONTEXT *context,DISPATCHER_CONTEXT *disp)
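# Two flavours of unwind handler are registered below: short_handler for the
# routines whose prologue only pushes %r12 and %r13, and full_handler for
# those that push %rbp, %rbx and %r12-%r15 and allocate a frame whose size is
# supplied as HandlerData[2]; both share the common tail that copies the
# adjusted context out and calls RtlVirtualUnwind.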
4325if ($win64) {
4326$rec="%rcx";
4327$frame="%rdx";
4328$context="%r8";
4329$disp="%r9";
4330
4331$code.=<<___;
4332.extern __imp_RtlVirtualUnwind
4333
4334.type short_handler,\@abi-omnipotent
4335.align 16
4336short_handler:
4337 push %rsi
4338 push %rdi
4339 push %rbx
4340 push %rbp
4341 push %r12
4342 push %r13
4343 push %r14
4344 push %r15
4345 pushfq
4346 sub \$64,%rsp
4347
4348 mov 120($context),%rax # pull context->Rax
4349 mov 248($context),%rbx # pull context->Rip
4350
4351 mov 8($disp),%rsi # disp->ImageBase
4352 mov 56($disp),%r11 # disp->HandlerData
4353
4354 mov 0(%r11),%r10d # HandlerData[0]
4355 lea (%rsi,%r10),%r10 # end of prologue label
4356 cmp %r10,%rbx # context->Rip<end of prologue label
4357 jb .Lcommon_seh_tail
4358
4359 mov 152($context),%rax # pull context->Rsp
4360
4361 mov 4(%r11),%r10d # HandlerData[1]
4362 lea (%rsi,%r10),%r10 # epilogue label
4363 cmp %r10,%rbx # context->Rip>=epilogue label
4364 jae .Lcommon_seh_tail
4365
4366 lea 16(%rax),%rax
4367
4368 mov -8(%rax),%r12
4369 mov -16(%rax),%r13
4370 mov %r12,216($context) # restore context->R12
4371 mov %r13,224($context) # restore context->R13
4372
4373 jmp .Lcommon_seh_tail
4374.size short_handler,.-short_handler
4375
4376.type full_handler,\@abi-omnipotent
4377.align 16
4378full_handler:
4379 push %rsi
4380 push %rdi
4381 push %rbx
4382 push %rbp
4383 push %r12
4384 push %r13
4385 push %r14
4386 push %r15
4387 pushfq
4388 sub \$64,%rsp
4389
4390 mov 120($context),%rax # pull context->Rax
4391 mov 248($context),%rbx # pull context->Rip
4392
4393 mov 8($disp),%rsi # disp->ImageBase
4394 mov 56($disp),%r11 # disp->HandlerData
4395
4396 mov 0(%r11),%r10d # HandlerData[0]
4397 lea (%rsi,%r10),%r10 # end of prologue label
4398 cmp %r10,%rbx # context->Rip<end of prologue label
4399 jb .Lcommon_seh_tail
4400
4401 mov 152($context),%rax # pull context->Rsp
4402
4403 mov 4(%r11),%r10d # HandlerData[1]
4404 lea (%rsi,%r10),%r10 # epilogue label
4405 cmp %r10,%rbx # context->Rip>=epilogue label
4406 jae .Lcommon_seh_tail
4407
4408 mov 8(%r11),%r10d # HandlerData[2]
4409 lea (%rax,%r10),%rax
4410
4411 mov -8(%rax),%rbp
4412 mov -16(%rax),%rbx
4413 mov -24(%rax),%r12
4414 mov -32(%rax),%r13
4415 mov -40(%rax),%r14
4416 mov -48(%rax),%r15
4417 mov %rbx,144($context) # restore context->Rbx
4418 mov %rbp,160($context) # restore context->Rbp
4419 mov %r12,216($context) # restore context->R12
4420 mov %r13,224($context) # restore context->R13
4421 mov %r14,232($context) # restore context->R14
4422 mov %r15,240($context) # restore context->R15
4423
4424.Lcommon_seh_tail:
4425 mov 8(%rax),%rdi
4426 mov 16(%rax),%rsi
4427 mov %rax,152($context) # restore context->Rsp
4428 mov %rsi,168($context) # restore context->Rsi
4429 mov %rdi,176($context) # restore context->Rdi
4430
4431 mov 40($disp),%rdi # disp->ContextRecord
4432 mov $context,%rsi # context
4433 mov \$154,%ecx # sizeof(CONTEXT)
4434 .long 0xa548f3fc # cld; rep movsq
4435
4436 mov $disp,%rsi
4437 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4438 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4439 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4440 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4441 mov 40(%rsi),%r10 # disp->ContextRecord
4442 lea 56(%rsi),%r11 # &disp->HandlerData
4443 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4444 mov %r10,32(%rsp) # arg5
4445 mov %r11,40(%rsp) # arg6
4446 mov %r12,48(%rsp) # arg7
4447 mov %rcx,56(%rsp) # arg8, (NULL)
4448 call *__imp_RtlVirtualUnwind(%rip)
4449
4450 mov \$1,%eax # ExceptionContinueSearch
4451 add \$64,%rsp
4452 popfq
4453 pop %r15
4454 pop %r14
4455 pop %r13
4456 pop %r12
4457 pop %rbp
4458 pop %rbx
4459 pop %rdi
4460 pop %rsi
4461 ret
4462.size full_handler,.-full_handler
4463
4464.section .pdata
4465.align 4
4466 .rva .LSEH_begin_ecp_nistz256_mul_by_2
4467 .rva .LSEH_end_ecp_nistz256_mul_by_2
4468 .rva .LSEH_info_ecp_nistz256_mul_by_2
4469
4470 .rva .LSEH_begin_ecp_nistz256_div_by_2
4471 .rva .LSEH_end_ecp_nistz256_div_by_2
4472 .rva .LSEH_info_ecp_nistz256_div_by_2
4473
4474 .rva .LSEH_begin_ecp_nistz256_mul_by_3
4475 .rva .LSEH_end_ecp_nistz256_mul_by_3
4476 .rva .LSEH_info_ecp_nistz256_mul_by_3
4477
4478 .rva .LSEH_begin_ecp_nistz256_add
4479 .rva .LSEH_end_ecp_nistz256_add
4480 .rva .LSEH_info_ecp_nistz256_add
4481
4482 .rva .LSEH_begin_ecp_nistz256_sub
4483 .rva .LSEH_end_ecp_nistz256_sub
4484 .rva .LSEH_info_ecp_nistz256_sub
4485
4486 .rva .LSEH_begin_ecp_nistz256_neg
4487 .rva .LSEH_end_ecp_nistz256_neg
4488 .rva .LSEH_info_ecp_nistz256_neg
4489
4490 .rva .LSEH_begin_ecp_nistz256_ord_mul_mont
4491 .rva .LSEH_end_ecp_nistz256_ord_mul_mont
4492 .rva .LSEH_info_ecp_nistz256_ord_mul_mont
4493
4494 .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont
4495 .rva .LSEH_end_ecp_nistz256_ord_sqr_mont
4496 .rva .LSEH_info_ecp_nistz256_ord_sqr_mont
4497___
4498$code.=<<___ if ($addx);
4499 .rva .LSEH_begin_ecp_nistz256_ord_mul_montx
4500 .rva .LSEH_end_ecp_nistz256_ord_mul_montx
4501 .rva .LSEH_info_ecp_nistz256_ord_mul_montx
4502
4503 .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx
4504 .rva .LSEH_end_ecp_nistz256_ord_sqr_montx
4505 .rva .LSEH_info_ecp_nistz256_ord_sqr_montx
4506___
4507$code.=<<___;
4508 .rva .LSEH_begin_ecp_nistz256_to_mont
4509 .rva .LSEH_end_ecp_nistz256_to_mont
4510 .rva .LSEH_info_ecp_nistz256_to_mont
4511
4512 .rva .LSEH_begin_ecp_nistz256_mul_mont
4513 .rva .LSEH_end_ecp_nistz256_mul_mont
4514 .rva .LSEH_info_ecp_nistz256_mul_mont
4515
4516 .rva .LSEH_begin_ecp_nistz256_sqr_mont
4517 .rva .LSEH_end_ecp_nistz256_sqr_mont
4518 .rva .LSEH_info_ecp_nistz256_sqr_mont
4519
4520 .rva .LSEH_begin_ecp_nistz256_from_mont
4521 .rva .LSEH_end_ecp_nistz256_from_mont
4522 .rva .LSEH_info_ecp_nistz256_from_mont
4523
4524 .rva .LSEH_begin_ecp_nistz256_gather_w5
4525 .rva .LSEH_end_ecp_nistz256_gather_w5
4526 .rva .LSEH_info_ecp_nistz256_gather_wX
4527
4528 .rva .LSEH_begin_ecp_nistz256_gather_w7
4529 .rva .LSEH_end_ecp_nistz256_gather_w7
4530 .rva .LSEH_info_ecp_nistz256_gather_wX
4531___
4532$code.=<<___ if ($avx>1);
4533 .rva .LSEH_begin_ecp_nistz256_avx2_gather_w5
4534 .rva .LSEH_end_ecp_nistz256_avx2_gather_w5
4535 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX
4536
4537 .rva .LSEH_begin_ecp_nistz256_avx2_gather_w7
4538 .rva .LSEH_end_ecp_nistz256_avx2_gather_w7
4539 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX
4540___
4541$code.=<<___;
4542 .rva .LSEH_begin_ecp_nistz256_point_double
4543 .rva .LSEH_end_ecp_nistz256_point_double
4544 .rva .LSEH_info_ecp_nistz256_point_double
4545
4546 .rva .LSEH_begin_ecp_nistz256_point_add
4547 .rva .LSEH_end_ecp_nistz256_point_add
4548 .rva .LSEH_info_ecp_nistz256_point_add
4549
4550 .rva .LSEH_begin_ecp_nistz256_point_add_affine
4551 .rva .LSEH_end_ecp_nistz256_point_add_affine
4552 .rva .LSEH_info_ecp_nistz256_point_add_affine
4553___
4554$code.=<<___ if ($addx);
4555 .rva .LSEH_begin_ecp_nistz256_point_doublex
4556 .rva .LSEH_end_ecp_nistz256_point_doublex
4557 .rva .LSEH_info_ecp_nistz256_point_doublex
4558
4559 .rva .LSEH_begin_ecp_nistz256_point_addx
4560 .rva .LSEH_end_ecp_nistz256_point_addx
4561 .rva .LSEH_info_ecp_nistz256_point_addx
4562
4563 .rva .LSEH_begin_ecp_nistz256_point_add_affinex
4564 .rva .LSEH_end_ecp_nistz256_point_add_affinex
4565 .rva .LSEH_info_ecp_nistz256_point_add_affinex
4566___
4567$code.=<<___;
4568
4569.section .xdata
4570.align 8
4571.LSEH_info_ecp_nistz256_mul_by_2:
4572 .byte 9,0,0,0
4573 .rva short_handler
4574 .rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[]
4575.LSEH_info_ecp_nistz256_div_by_2:
4576 .byte 9,0,0,0
4577 .rva short_handler
4578 .rva .Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[]
4579.LSEH_info_ecp_nistz256_mul_by_3:
4580 .byte 9,0,0,0
4581 .rva short_handler
4582 .rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[]
4583.LSEH_info_ecp_nistz256_add:
4584 .byte 9,0,0,0
4585 .rva short_handler
4586 .rva .Ladd_body,.Ladd_epilogue # HandlerData[]
4587.LSEH_info_ecp_nistz256_sub:
4588 .byte 9,0,0,0
4589 .rva short_handler
4590 .rva .Lsub_body,.Lsub_epilogue # HandlerData[]
4591.LSEH_info_ecp_nistz256_neg:
4592 .byte 9,0,0,0
4593 .rva short_handler
4594 .rva .Lneg_body,.Lneg_epilogue # HandlerData[]
4595.LSEH_info_ecp_nistz256_ord_mul_mont:
4596 .byte 9,0,0,0
4597 .rva full_handler
4598 .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[]
4599 .long 48,0
4600.LSEH_info_ecp_nistz256_ord_sqr_mont:
4601 .byte 9,0,0,0
4602 .rva full_handler
4603 .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[]
4604 .long 48,0
4605___
4606$code.=<<___ if ($addx);
4607.LSEH_info_ecp_nistz256_ord_mul_montx:
4608 .byte 9,0,0,0
4609 .rva full_handler
4610 .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[]
4611 .long 48,0
4612.LSEH_info_ecp_nistz256_ord_sqr_montx:
4613 .byte 9,0,0,0
4614 .rva full_handler
4615 .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[]
4616 .long 48,0
4617___
4618$code.=<<___;
4619.LSEH_info_ecp_nistz256_to_mont:
4620 .byte 9,0,0,0
4621 .rva full_handler
4622 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
4623 .long 48,0
4624.LSEH_info_ecp_nistz256_mul_mont:
4625 .byte 9,0,0,0
4626 .rva full_handler
4627 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
4628 .long 48,0
4629.LSEH_info_ecp_nistz256_sqr_mont:
4630 .byte 9,0,0,0
4631 .rva full_handler
4632 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
4633 .long 48,0
4634.LSEH_info_ecp_nistz256_from_mont:
4635 .byte 9,0,0,0
4636 .rva short_handler
4637 .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
4638.LSEH_info_ecp_nistz256_gather_wX:
4639 .byte 0x01,0x33,0x16,0x00
4640 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
4641 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
4642 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
4643 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
4644 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
4645 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
4646 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
4647 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
4648 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
4649 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
4650 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
4651 .align 8
4652___
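# The four-byte records above appear to be raw Windows x64 UNWIND_CODE
# entries: byte 0 is the instruction's offset within the prologue, the low
# nibble of byte 1 is the unwind opcode, the high nibble its operand info,
# and the trailing 16-bit word a scaled offset. For instance
# 0x33,0xf8,0x09,0x00 decodes as UWOP_SAVE_XMM128 of xmm15 at rsp+0x90, and
# 0x04,0x01,0x15,0x00 as UWOP_ALLOC_LARGE of 0x15*8 = 0xa8 bytes, which is
# consistent with the inline comments.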
4653$code.=<<___ if ($avx>1);
4654.LSEH_info_ecp_nistz256_avx2_gather_wX:
4655 .byte 0x01,0x36,0x17,0x0b
4656 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
4657 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
4658 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
4659 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
4660 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
4661 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
4662 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
4663 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
4664 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
4665 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
4666 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
4667 .byte 0x00,0xb3,0x00,0x00 # set_frame r11
4668 .align 8
4669___
4670$code.=<<___;
4671.LSEH_info_ecp_nistz256_point_double:
4672 .byte 9,0,0,0
4673 .rva full_handler
4674 .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[]
4675 .long 32*5+56,0
4676.LSEH_info_ecp_nistz256_point_add:
4677 .byte 9,0,0,0
4678 .rva full_handler
4679 .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[]
4680 .long 32*18+56,0
4681.LSEH_info_ecp_nistz256_point_add_affine:
4682 .byte 9,0,0,0
4683 .rva full_handler
4684 .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[]
4685 .long 32*15+56,0
4686___
4687$code.=<<___ if ($addx);
4688.align 8
4689.LSEH_info_ecp_nistz256_point_doublex:
4690 .byte 9,0,0,0
4691 .rva full_handler
4692 .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[]
4693 .long 32*5+56,0
4694.LSEH_info_ecp_nistz256_point_addx:
4695 .byte 9,0,0,0
4696 .rva full_handler
4697 .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[]
4698 .long 32*18+56,0
4699.LSEH_info_ecp_nistz256_point_add_affinex:
4700 .byte 9,0,0,0
4701 .rva full_handler
4702 .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[]
4703 .long 32*15+56,0
4704___
4705}
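# A note on the .xdata records emitted above: the leading ".byte 9,0,0,0"
# encodes UNWIND_INFO version 1 with UNW_FLAG_EHANDLER set and no unwind
# codes, the first .rva names the language-specific handler (short_handler
# or full_handler), and the .rva/.long words that follow are the HandlerData
# those handlers read: the body and epilogue label offsets plus, for
# full_handler, the displacement added to Rsp to locate the saved registers.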
4706
4707########################################################################
4708# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
4709#
4710open TABLE,"<ecp_nistz256_table.c" or
4711open TABLE,"<${dir}../ecp_nistz256_table.c" or
4712die "failed to open ecp_nistz256_table.c:",$!;
4713
4714use integer;
4715
4716foreach(<TABLE>) {
4717 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
4718}
4719close TABLE;
4720
4721die "insane number of elements" if ($#arr != 64*16*37-1);
4722
4723print <<___;
4724.text
4725.globl ecp_nistz256_precomputed
4726.type ecp_nistz256_precomputed,\@object
4727.align 4096
4728ecp_nistz256_precomputed:
4729___
4730while (@line=splice(@arr,0,16)) {
4731 print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
4732}
4733print <<___;
4734.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
4735___
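# To illustrate the conversion above with made-up values: for an entry
# written as TOBN(0x11111111, 0x22222222) in ecp_nistz256_table.c, the
# substitution pushes the second argument ahead of the first, so @arr
# receives 0x22222222, 0x11111111 and the emitted row begins
# ".long 0x22222222,0x11111111,...", i.e. the 64-bit limbs land in
# little-endian 32-bit word order. The sanity check expects 64*16*37 words:
# 37 tables of 64 precomputed points, each point two 256-bit affine
# coordinates = 16 32-bit words.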
4736
4737$code =~ s/\`([^\`]*)\`/eval $1/gem;
4738print $code;
4739close STDOUT or die "error closing STDOUT: $!";