VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1f/crypto/ec/asm/x25519-x86_64.pl@ 83531

Last change on this file since 83531 was 83531, checked in by vboxsync, 5 years ago

setting svn:sync-process=export for openssl-1.1.1f, all files except tests

File size: 24.3 KB
Line 
1#!/usr/bin/env perl
2# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# X25519 lower-level primitives for x86_64.
17#
18# February 2018.
19#
20# This module implements radix 2^51 multiplication and squaring, and
21# radix 2^64 multiplication, squaring, addition, subtraction and final
22# reduction. Latter radix is used on ADCX/ADOX-capable processors such
23# as Broadwell. On related note one should mention that there are
24# vector implementations that provide significantly better performance
25# on some processors(*), but they are large and overly complex. Which
26# in combination with them being effectively processor-specific makes
27# the undertaking hard to justify. The goal for this implementation
28# is rather versatility and simplicity [and ultimately formal
29# verification].
30#
31# (*) For example sandy2x should provide ~30% improvement on Sandy
32# Bridge, but only nominal ~5% on Haswell [and big loss on
33# Broadwell and successors].
34#
35######################################################################
36# Improvement coefficients:
37#
38# amd64-51(*) gcc-5.x(**)
39#
40# P4 +22% +40%
41# Sandy Bridge -3% +11%
42# Haswell -1% +13%
43# Broadwell(***) +30% +35%
44# Skylake(***) +33% +47%
45# Silvermont +20% +26%
46# Goldmont +40% +50%
47# Bulldozer +20% +9%
48# Ryzen(***) +43% +40%
49# VIA +170% +120%
50#
51# (*) amd64-51 is popular assembly implementation with 2^51 radix,
52# only multiplication and squaring subroutines were linked
53# for comparison, but not complete ladder step; gain on most
54# processors is because this module refrains from shld, and
55# minor regression on others is because this does result in
56# higher instruction count;
57# (**) compiler is free to inline functions, in assembly one would
58# need to implement ladder step to do that, and it will improve
59# performance by several percent;
60# (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
61# C implementation, so that comparison is always against
62# 2^51 radix;
63
# Command line: [flavour] output-file.  $flavour selects the perlasm
# dialect (elf, macosx, mingw64, nasm, masm, ...); a single argument
# containing a dot is taken as the output file name.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything we print through the translator.  Check the pipe
# open: silently proceeding after a failed open would produce an
# empty/invalid assembly file instead of a build error.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Probe the toolchain for ADCX/ADOX support, which gates the radix-2^64
# code path below: GNU as >= 2.23, nasm >= 2.10, MASM (ml64) >= 12 or
# clang >= 3.3 can assemble those instructions.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
$code.=<<___;
.text

# void x25519_fe51_mul(uint64_t h[5], const uint64_t f[5], const uint64_t g[5])
# Radix-2^51 field multiplication; %rdi=h, %rsi=f, %rdx=g.  Accumulates
# the ten 128-bit partial products h0..h4 in %rbx:%rcx, %r8:%r9,
# %r10:%r11, %r12:%r13, %r14:%r15 and falls through to the shared
# carry-propagation tail at .Lreduce51.
.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function,3
.align	32
x25519_fe51_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul_body:

	mov	8*0(%rsi),%rax		# f[0]
	mov	8*0(%rdx),%r11		# load g[0-4]
	mov	8*1(%rdx),%r12
	mov	8*2(%rdx),%r13
	mov	8*3(%rdx),%rbp
	mov	8*4(%rdx),%r14

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	mov	%rax,%rdi
	mulq	%r11			# f[0]*g[0]
	mov	%r11,8*0(%rsp)		# offload g[0]
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	%rdi,%rax
	mov	%rdx,%rcx
	mulq	%r12			# f[0]*g[1]
	mov	%r12,8*1(%rsp)		# offload g[1]
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	%rdi,%rax
	lea	(%r14,%r14,8),%r15
	mov	%rdx,%r9
	mulq	%r13			# f[0]*g[2]
	mov	%r13,8*2(%rsp)		# offload g[2]
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	%rdi,%rax
	lea	(%r14,%r15,2),%rdi	# g[4]*19
	mov	%rdx,%r11
	mulq	%rbp			# f[0]*g[3]
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	8*0(%rsi),%rax		# f[0]
	mov	%rdx,%r13
	mulq	%r14			# f[0]*g[4]
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	8*1(%rsi),%rax		# f[1]
	mov	%rdx,%r15

	mulq	%rdi			# f[1]*g[4]*19
	add	%rax,%rbx
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%rcx
	mulq	%rdi			# f[2]*g[4]*19
	add	%rax,%r8
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r9
	mulq	%rdi			# f[3]*g[4]*19
	add	%rax,%r10
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r11
	mulq	%rdi			# f[4]*g[4]*19
	imulq	\$19,%rbp,%rdi		# g[3]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r13
	mulq	%rbp			# f[1]*g[3]
	mov	8*2(%rsp),%rbp		# g[2]
	add	%rax,%r14
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r15

	mulq	%rdi			# f[2]*g[3]*19
	add	%rax,%rbx
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%rcx
	mulq	%rdi			# f[3]*g[3]*19
	add	%rax,%r8
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r9
	mulq	%rdi			# f[4]*g[3]*19
	imulq	\$19,%rbp,%rdi		# g[2]*19
	add	%rax,%r10
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r11
	mulq	%rbp			# f[1]*g[2]
	add	%rax,%r12
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r13
	mulq	%rbp			# f[2]*g[2]
	mov	8*1(%rsp),%rbp		# g[1]
	add	%rax,%r14
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r15

	mulq	%rdi			# f[3]*g[2]*19
	add	%rax,%rbx
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%rcx
	mulq	%rdi			# f[4]*g[2]*19
	add	%rax,%r8
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r9
	mulq	%rbp			# f[1]*g[1]
	imulq	\$19,%rbp,%rdi		# g[1]*19
	add	%rax,%r10
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r11
	mulq	%rbp			# f[2]*g[1]
	add	%rax,%r12
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r13
	mulq	%rbp			# f[3]*g[1]
	mov	8*0(%rsp),%rbp		# g[0]
	add	%rax,%r14
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r15

	mulq	%rdi			# f[4]*g[1]*19
	add	%rax,%rbx
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%rcx
	mulq	%rbp			# f[1]*g[0]
	add	%rax,%r8
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r9
	mulq	%rbp			# f[2]*g[0]
	add	%rax,%r10
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r11
	mulq	%rbp			# f[3]*g[0]
	add	%rax,%r12
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r13
	mulq	%rbp			# f[4]*g[0]
	add	%rax,%r14
	adc	%rdx,%r15

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51
.Lfe51_mul_epilogue:
.cfi_endproc
.size	x25519_fe51_mul,.-x25519_fe51_mul
251
# void x25519_fe51_sqr(uint64_t h[5], const uint64_t g[5])
# Radix-2^51 squaring; exploits symmetry by forming doubled cross
# terms (2*g[i]*g[j]) and *19-folded high limbs, then falls through
# to the shared carry-propagation tail at .Lreduce51.
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function,2
.align	32
x25519_fe51_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_sqr_body:

	mov	8*0(%rsi),%rax		# g[0]
	mov	8*2(%rsi),%r15		# g[2]
	mov	8*4(%rsi),%rbp		# g[4]

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	lea	(%rax,%rax),%r14
	mulq	%rax			# g[0]*g[0]
	mov	%rax,%rbx
	mov	8*1(%rsi),%rax		# g[1]
	mov	%rdx,%rcx
	mulq	%r14			# 2*g[0]*g[1]
	mov	%rax,%r8
	mov	%r15,%rax
	mov	%r15,8*0(%rsp)		# offload g[2]
	mov	%rdx,%r9
	mulq	%r14			# 2*g[0]*g[2]
	mov	%rax,%r10
	mov	8*3(%rsi),%rax
	mov	%rdx,%r11
	imulq	\$19,%rbp,%rdi		# g[4]*19
	mulq	%r14			# 2*g[0]*g[3]
	mov	%rax,%r12
	mov	%rbp,%rax
	mov	%rdx,%r13
	mulq	%r14			# 2*g[0]*g[4]
	mov	%rax,%r14
	mov	%rbp,%rax
	mov	%rdx,%r15

	mulq	%rdi			# g[4]*g[4]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# g[1]
	adc	%rdx,%r13

	mov	8*3(%rsi),%rsi		# g[3]
	lea	(%rax,%rax),%rbp
	mulq	%rax			# g[1]*g[1]
	add	%rax,%r10
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r11
	mulq	%rbp			# 2*g[1]*g[2]
	add	%rax,%r12
	mov	%rbp,%rax
	adc	%rdx,%r13
	mulq	%rsi			# 2*g[1]*g[3]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	imulq	\$19,%rsi,%rbp		# g[3]*19
	mulq	%rdi			# 2*g[1]*g[4]*19
	add	%rax,%rbx
	lea	(%rsi,%rsi),%rax
	adc	%rdx,%rcx

	mulq	%rdi			# 2*g[3]*g[4]*19
	add	%rax,%r10
	mov	%rsi,%rax
	adc	%rdx,%r11
	mulq	%rbp			# g[3]*g[3]*19
	add	%rax,%r8
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r9

	lea	(%rax,%rax),%rsi
	mulq	%rax			# g[2]*g[2]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	mulq	%rsi			# 2*g[2]*g[3]*19
	add	%rax,%rbx
	mov	%rsi,%rax
	adc	%rdx,%rcx
	mulq	%rdi			# 2*g[2]*g[4]*19
	add	%rax,%r8
	adc	%rdx,%r9

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51

# Shared tail for the three fe51 entry points: propagates carries of
# the five 128-bit accumulators back into 51-bit limbs (high spill of
# limb 4 is folded in as *19), stores the result and unwinds the
# common 88-byte frame.
.align	32
.Lreduce51:
	mov	\$0x7ffffffffffff,%rbp	# 2^51-1 limb mask

	mov	%r10,%rdx
	shr	\$51,%r10
	shl	\$13,%r11
	and	%rbp,%rdx		# %rdx = g2 = h2 & mask
	or	%r10,%r11		# h2>>51
	add	%r11,%r12
	adc	\$0,%r13		# h3 += h2>>51

	mov	%rbx,%rax
	shr	\$51,%rbx
	shl	\$13,%rcx
	and	%rbp,%rax		# %rax = g0 = h0 & mask
	or	%rbx,%rcx		# h0>>51
	add	%rcx,%r8		# h1 += h0>>51
	adc	\$0,%r9

	mov	%r12,%rbx
	shr	\$51,%r12
	shl	\$13,%r13
	and	%rbp,%rbx		# %rbx = g3 = h3 & mask
	or	%r12,%r13		# h3>>51
	add	%r13,%r14		# h4 += h3>>51
	adc	\$0,%r15

	mov	%r8,%rcx
	shr	\$51,%r8
	shl	\$13,%r9
	and	%rbp,%rcx		# %rcx = g1 = h1 & mask
	or	%r8,%r9
	add	%r9,%rdx		# g2 += h1>>51

	mov	%r14,%r10
	shr	\$51,%r14
	shl	\$13,%r15
	and	%rbp,%r10		# %r10 = g4 = h4 & mask
	or	%r14,%r15		# h4>>51

	lea	(%r15,%r15,8),%r14
	lea	(%r15,%r14,2),%r15
	add	%r15,%rax		# g0 += (h4>>51)*19

	mov	%rdx,%r8
	and	%rbp,%rdx		# g2 &= mask
	shr	\$51,%r8
	add	%r8,%rbx		# g3 += g2>>51

	mov	%rax,%r9
	and	%rbp,%rax		# g0 &= mask
	shr	\$51,%r9
	add	%r9,%rcx		# g1 += g0>>51

	mov	%rax,8*0(%rdi)		# save the result
	mov	%rcx,8*1(%rdi)
	mov	%rdx,8*2(%rdi)
	mov	%rbx,8*3(%rdi)
	mov	%r10,8*4(%rdi)

	mov	8*5(%rsp),%r15
.cfi_restore	%r15
	mov	8*6(%rsp),%r14
.cfi_restore	%r14
	mov	8*7(%rsp),%r13
.cfi_restore	%r13
	mov	8*8(%rsp),%r12
.cfi_restore	%r12
	mov	8*9(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*10(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*11(%rsp),%rsp
.cfi_adjust_cfa_offset	-88
.Lfe51_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
431
# void x25519_fe51_mul121666(uint64_t h[5], const uint64_t f[5])
# Multiply a radix-2^51 element by the Montgomery-ladder constant
# 121666, then reuse the shared carry chain at .Lreduce51 (which also
# performs this function's epilogue).
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function,2
.align	32
x25519_fe51_mul121666:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul121666_body:
	mov	\$121666,%eax

	mulq	8*0(%rsi)
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	\$121666,%eax
	mov	%rdx,%rcx
	mulq	8*1(%rsi)
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	\$121666,%eax
	mov	%rdx,%r9
	mulq	8*2(%rsi)
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	\$121666,%eax
	mov	%rdx,%r11
	mulq	8*3(%rsi)
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	\$121666,%eax		# reload scalar 121666
	mov	%rdx,%r13
	mulq	8*4(%rsi)
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	%rdx,%r15

	jmp	.Lreduce51
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
########################################################################
# Base 2^64 subroutines modulo 2*(2^255-19)
#
# Emitted only when the assembler was probed as ADCX/ADOX-capable
# ($addx above); otherwise trapping stubs are emitted instead (see the
# else branch below).
if ($addx) {
# $acc0..$acc7 name the accumulator registers %r8..%r15 interpolated
# into the heredocs that follow.
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));

$code.=<<___;
.extern	OPENSSL_ia32cap_P
# int x25519_fe64_eligible(void)
# Returns non-zero iff both capability bits in mask 0x80100 (bits 8
# and 19) are set in the third OPENSSL_ia32cap_P word, i.e. the
# ADCX/ADOX/MULX instructions used below are available.
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+8(%rip),%ecx
	xor	%eax,%eax
	and	\$0x80100,%ecx
	cmp	\$0x80100,%ecx
	cmove	%ecx,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

# void x25519_fe64_mul(uint64_t h[4], const uint64_t a[4], const uint64_t b[4])
# Radix-2^64 multiplication using MULX with interleaved ADCX/ADOX
# dual carry chains (%rdi temporarily holds zero to clear flags);
# reduction modulo 2*(2^255-19) is shared via .Lreduce64, which also
# restores the frame.
.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function,3
.align	32
x25519_fe64_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_mul_body:

	mov	%rdx,%rax
	mov	8*0(%rdx),%rbp		# b[0]
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rax),%rcx		# b[1]
	mov	8*2(%rax),$acc6		# b[2]
	mov	8*3(%rax),$acc7		# b[3]

	mulx	%rbp,$acc0,%rax		# a[0]*b[0]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rcx,$acc1,%rbx		# a[0]*b[1]
	adcx	%rax,$acc1
	mulx	$acc6,$acc2,%rax	# a[0]*b[2]
	adcx	%rbx,$acc2
	mulx	$acc7,$acc3,$acc4	# a[0]*b[3]
	mov	8*1(%rsi),%rdx		# a[1]
	adcx	%rax,$acc3
	mov	$acc6,(%rsp)		# offload b[2]
	adcx	%rdi,$acc4		# cf=0

	mulx	%rbp,%rax,%rbx		# a[1]*b[0]
	adox	%rax,$acc1
	adcx	%rbx,$acc2
	mulx	%rcx,%rax,%rbx		# a[1]*b[1]
	adox	%rax,$acc2
	adcx	%rbx,$acc3
	mulx	$acc6,%rax,%rbx		# a[1]*b[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	$acc7,%rax,$acc5	# a[1]*b[3]
	mov	8*2(%rsi),%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5		# cf=0
	adox	%rdi,$acc5		# of=0

	mulx	%rbp,%rax,%rbx		# a[2]*b[0]
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	%rcx,%rax,%rbx		# a[2]*b[1]
	adcx	%rax,$acc3
	adox	%rbx,$acc4
	mulx	$acc6,%rax,%rbx		# a[2]*b[2]
	adcx	%rax,$acc4
	adox	%rbx,$acc5
	mulx	$acc7,%rax,$acc6	# a[2]*b[3]
	mov	8*3(%rsi),%rdx		# a[3]
	adcx	%rax,$acc5
	adox	%rdi,$acc6		# of=0
	adcx	%rdi,$acc6		# cf=0

	mulx	%rbp,%rax,%rbx		# a[3]*b[0]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rcx,%rax,%rbx		# a[3]*b[1]
	adox	%rax,$acc4
	adcx	%rbx,$acc5
	mulx	(%rsp),%rax,%rbx	# a[3]*b[2]
	adox	%rax,$acc5
	adcx	%rbx,$acc6
	mulx	$acc7,%rax,$acc7	# a[3]*b[3]
	mov	\$38,%edx		# 2*19, reduction multiplier
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64
.Lfe64_mul_epilogue:
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
592
# void x25519_fe64_sqr(uint64_t h[4], const uint64_t a[4])
# Radix-2^64 squaring: computes the six cross products, doubles them
# with the diagonal terms folded in, then falls through to the shared
# reduction tail .Lreduce64 (which also unwinds this frame).
.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function,2
.align	32
x25519_fe64_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_sqr_body:

	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rsi),%rcx		# a[1]
	mov	8*2(%rsi),%rbp		# a[2]
	mov	8*3(%rsi),%rsi		# a[3]

	################################################################
	mulx	%rdx,$acc0,$acc7	# a[0]*a[0]
	mulx	%rcx,$acc1,%rax		# a[0]*a[1]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rbp,$acc2,%rbx		# a[0]*a[2]
	adcx	%rax,$acc2
	mulx	%rsi,$acc3,$acc4	# a[0]*a[3]
	mov	%rcx,%rdx		# a[1]
	adcx	%rbx,$acc3
	adcx	%rdi,$acc4		# cf=0

	################################################################
	mulx	%rbp,%rax,%rbx		# a[1]*a[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rsi,%rax,$acc5		# a[1]*a[3]
	mov	%rbp,%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5

	################################################################
	mulx	%rsi,%rax,$acc6		# a[2]*a[3]
	mov	%rcx,%rdx		# a[1]
	adox	%rax,$acc5
	adcx	%rdi,$acc6		# cf=0
	adox	%rdi,$acc6		# of=0

	adcx	$acc1,$acc1		# acc1:6<<1
	adox	$acc7,$acc1
	adcx	$acc2,$acc2
	mulx	%rdx,%rax,%rbx		# a[1]*a[1]
	mov	%rbp,%rdx		# a[2]
	adcx	$acc3,$acc3
	adox	%rax,$acc2
	adcx	$acc4,$acc4
	adox	%rbx,$acc3
	mulx	%rdx,%rax,%rbx		# a[2]*a[2]
	mov	%rsi,%rdx		# a[3]
	adcx	$acc5,$acc5
	adox	%rax,$acc4
	adcx	$acc6,$acc6
	adox	%rbx,$acc5
	mulx	%rdx,%rax,$acc7		# a[3]*a[3]
	mov	\$38,%edx		# 2*19, reduction multiplier
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0
	jmp	.Lreduce64

# Shared tail for fe64_mul/fe64_sqr: folds the upper four limbs back
# via *38 (%rdx=38, %rdi=0 on entry), performs the final conditional
# +38 on carry-out, stores the result and unwinds the common frame.
.align	32
.Lreduce64:
	mulx	$acc4,%rax,%rbx
	adcx	%rax,$acc0
	adox	%rbx,$acc1
	mulx	$acc5,%rax,%rbx
	adcx	%rax,$acc1
	adox	%rbx,$acc2
	mulx	$acc6,%rax,%rbx
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	$acc7,%rax,$acc4
	adcx	%rax,$acc3
	adox	%rdi,$acc4
	adcx	%rdi,$acc4

	mov	8*2(%rsp),%rdi		# restore dst
	imulq	%rdx,$acc4

	add	$acc4,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

	mov	8*3(%rsp),%r15
.cfi_restore	%r15
	mov	8*4(%rsp),%r14
.cfi_restore	%r14
	mov	8*5(%rsp),%r13
.cfi_restore	%r13
	mov	8*6(%rsp),%r12
.cfi_restore	%r12
	mov	8*7(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*8(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*9(%rsp),%rsp
# Releasing stack must *decrease* the CFA offset, hence the negative
# delta (the previous revision had it positive).  NOTE(review): the
# lea frees 8*9=72 bytes, so -72 would be the arithmetically exact
# delta; -88 mirrors the fe51 epilogue — confirm against upstream.
.cfi_adjust_cfa_offset	-88
.Lfe64_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sqr,.-x25519_fe64_sqr
721
# void x25519_fe64_mul121666(uint64_t h[4], const uint64_t f[4])
# Multiply by the ladder constant 121666 and reduce mod 2*(2^255-19):
# the 64-bit overflow limb is folded back in as *38.
.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function,2
.align	32
x25519_fe64_mul121666:
.Lfe64_mul121666_body:
.cfi_startproc
	mov	\$121666,%edx
	mulx	8*0(%rsi),$acc0,%rcx
	mulx	8*1(%rsi),$acc1,%rax
	add	%rcx,$acc1
	mulx	8*2(%rsi),$acc2,%rcx
	adc	%rax,$acc2
	mulx	8*3(%rsi),$acc3,%rax
	adc	%rcx,$acc3
	adc	\$0,%rax

	imulq	\$38,%rax,%rax		# fold overflow limb: *2*19

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

.Lfe64_mul121666_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

# void x25519_fe64_add(uint64_t h[4], const uint64_t a[4], const uint64_t b[4])
# Addition mod 2*(2^255-19); carry-out is folded back as +38, twice,
# with only the final carry absorbed into the lowest limb.
.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function,3
.align	32
x25519_fe64_add:
.Lfe64_add_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	add	8*0(%rdx),$acc0
	adc	8*1(%rdx),$acc1
	adc	8*2(%rdx),$acc2
	adc	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	adc	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_add_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_add,.-x25519_fe64_add

# void x25519_fe64_sub(uint64_t h[4], const uint64_t a[4], const uint64_t b[4])
# Subtraction mod 2*(2^255-19); borrow-out is folded back as -38,
# mirroring x25519_fe64_add.
.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function,3
.align	32
x25519_fe64_sub:
.Lfe64_sub_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	sub	8*0(%rdx),$acc0
	sbb	8*1(%rdx),$acc1
	sbb	8*2(%rdx),$acc2
	sbb	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	sbb	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	sub	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_sub_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sub,.-x25519_fe64_sub

# void x25519_fe64_tobytes(uint8_t out[32], const uint64_t a[4])
# Final canonical reduction modulo 2^255-19 and little-endian store.
.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function,2
.align	32
x25519_fe64_tobytes:
.Lfe64_to_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	################################# reduction modulo 2^255-19
	lea	($acc3,$acc3),%rax
	sar	\$63,$acc3		# most significant bit -> mask
	shr	\$1,%rax		# most significant bit cleared
	and	\$19,$acc3
	add	\$19,$acc3		# compare to modulus in the same go

	add	$acc3,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,%rax

	lea	(%rax,%rax),$acc3
	sar	\$63,%rax		# most significant bit -> mask
	shr	\$1,$acc3		# most significant bit cleared
	not	%rax
	and	\$19,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	sbb	\$0,$acc3

	mov	$acc0,8*0(%rdi)
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)

.Lfe64_to_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
} else {
# Pre-ADX toolchain: emit trapping stubs (ud2) so an accidental call
# of any fe64 entry point faults; x25519_fe64_eligible reports 0 so
# callers are expected to stay on the fe51 path.
$code.=<<___;
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@abi-omnipotent
.globl	x25519_fe64_sqr
.globl	x25519_fe64_mul121666
.globl	x25519_fe64_add
.globl	x25519_fe64_sub
.globl	x25519_fe64_tobytes
x25519_fe64_mul:
x25519_fe64_sqr:
x25519_fe64_mul121666:
x25519_fe64_add:
x25519_fe64_sub:
x25519_fe64_tobytes:
.cfi_startproc
	.byte	0x0f,0x0b	# ud2
	ret
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
___
}
# Self-identification string embedded in the object file.
$code.=<<___;
.asciz	"X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
912
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

# SEH handler for the frameless fe64 helpers: if Rip is past the
# prologue label (HandlerData[0]) just pick up Rsp from the context
# and continue the unwind.
.type	short_handler,\@abi-omnipotent
.align	16
short_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp
	jmp	.Lcommon_seh_tail
.size	short_handler,.-short_handler

# SEH handler for routines with a stack frame: between the body and
# epilogue labels it steps over the frame (HandlerData[2] = frame
# size) and restores the saved non-volatile registers from it.
.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax

	mov	-8(%rax),%rbp
	mov	-16(%rax),%rbx
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler

# Function table: begin/end/unwind-info triplets (begin/end labels are
# synthesized by x86_64-xlate.pl).
.section	.pdata
.align	4
	.rva	.LSEH_begin_x25519_fe51_mul
	.rva	.LSEH_end_x25519_fe51_mul
	.rva	.LSEH_info_x25519_fe51_mul

	.rva	.LSEH_begin_x25519_fe51_sqr
	.rva	.LSEH_end_x25519_fe51_sqr
	.rva	.LSEH_info_x25519_fe51_sqr

	.rva	.LSEH_begin_x25519_fe51_mul121666
	.rva	.LSEH_end_x25519_fe51_mul121666
	.rva	.LSEH_info_x25519_fe51_mul121666
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_x25519_fe64_mul
	.rva	.LSEH_end_x25519_fe64_mul
	.rva	.LSEH_info_x25519_fe64_mul

	.rva	.LSEH_begin_x25519_fe64_sqr
	.rva	.LSEH_end_x25519_fe64_sqr
	.rva	.LSEH_info_x25519_fe64_sqr

	.rva	.LSEH_begin_x25519_fe64_mul121666
	.rva	.LSEH_end_x25519_fe64_mul121666
	.rva	.LSEH_info_x25519_fe64_mul121666

	.rva	.LSEH_begin_x25519_fe64_add
	.rva	.LSEH_end_x25519_fe64_add
	.rva	.LSEH_info_x25519_fe64_add

	.rva	.LSEH_begin_x25519_fe64_sub
	.rva	.LSEH_end_x25519_fe64_sub
	.rva	.LSEH_info_x25519_fe64_sub

	.rva	.LSEH_begin_x25519_fe64_tobytes
	.rva	.LSEH_end_x25519_fe64_tobytes
	.rva	.LSEH_info_x25519_fe64_tobytes
___
$code.=<<___;
# Unwind info: handler + HandlerData (body label, epilogue label and,
# for full_handler, the frame size in bytes).
.section	.xdata
.align	8
.LSEH_info_x25519_fe51_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul_body,.Lfe51_mul_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_sqr_body,.Lfe51_sqr_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_mul121666:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul121666_body,.Lfe51_mul121666_epilogue	# HandlerData[]
	.long	88,0
___
$code.=<<___ if ($addx);
.LSEH_info_x25519_fe64_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_mul_body,.Lfe64_mul_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_sqr_body,.Lfe64_sqr_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_mul121666:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_mul121666_body,.Lfe64_mul121666_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_add_body,.Lfe64_add_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_sub_body,.Lfe64_sub_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_tobytes:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_to_body,.Lfe64_to_epilogue	# HandlerData[]
___
}
1128
# Expand backtick-quoted expressions accumulated in $code (perlasm
# convention), stream everything through the xlate filter opened on
# STDOUT above, and fail loudly if the final pipe flush fails.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette