VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1f/crypto/ec/asm/x25519-x86_64.pl@ 83531

Last change on this file since 83531 was 83531, checked in by vboxsync, 5 years ago

setting svn:sync-process=export for openssl-1.1.1f, all files except tests

File size: 24.3 KB
Line 
1#!/usr/bin/env perl
2# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# X25519 lower-level primitives for x86_64.
17#
18# February 2018.
19#
20# This module implements radix 2^51 multiplication and squaring, and
21# radix 2^64 multiplication, squaring, addition, subtraction and final
22# reduction. Latter radix is used on ADCX/ADOX-capable processors such
23# as Broadwell. On related note one should mention that there are
24# vector implementations that provide significantly better performance
25# on some processors(*), but they are large and overly complex. Which
26# in combination with them being effectively processor-specific makes
27# the undertaking hard to justify. The goal for this implementation
28# is rather versatility and simplicity [and ultimately formal
29# verification].
30#
31# (*) For example sandy2x should provide ~30% improvement on Sandy
32# Bridge, but only nominal ~5% on Haswell [and big loss on
33# Broadwell and successors].
34#
35######################################################################
36# Improvement coefficients:
37#
38# amd64-51(*) gcc-5.x(**)
39#
40# P4 +22% +40%
41# Sandy Bridge -3% +11%
42# Haswell -1% +13%
43# Broadwell(***) +30% +35%
44# Skylake(***) +33% +47%
45# Silvermont +20% +26%
46# Goldmont +40% +50%
47# Bulldozer +20% +9%
48# Ryzen(***) +43% +40%
49# VIA +170% +120%
50#
51# (*) amd64-51 is popular assembly implementation with 2^51 radix,
52# only multiplication and squaring subroutines were linked
53# for comparison, but not complete ladder step; gain on most
54# processors is because this module refrains from shld, and
55# minor regression on others is because this does result in
56# higher instruction count;
57# (**) compiler is free to inline functions, in assembly one would
58# need to implement ladder step to do that, and it will improve
59# performance by several percent;
60# (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
61# C implementation, so that comparison is always against
62# 2^51 radix;
63
# Command line: [flavour] output-file.  $flavour selects the perlasm
# dialect (elf, macosx, mingw64, nasm, masm, ...); a single argument
# containing a dot is taken as the output file name.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything we print through the translator.  Check the pipe
# open: silently proceeding after a failed open would produce an
# empty/invalid assembly file instead of a build error.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Probe the toolchain for ADCX/ADOX support, which gates the radix-2^64
# code path below: GNU as >= 2.23, nasm >= 2.10, MASM (ml64) >= 12 or
# clang >= 3.3 can assemble those instructions.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
$code.=<<___;
.text

# void x25519_fe51_mul(uint64_t h[5], const uint64_t f[5], const uint64_t g[5])
# Radix-2^51 field multiplication; %rdi=h, %rsi=f, %rdx=g.  Accumulates
# the ten 128-bit partial products h0..h4 in %rbx:%rcx, %r8:%r9,
# %r10:%r11, %r12:%r13, %r14:%r15 and falls through to the shared
# carry-propagation tail at .Lreduce51.
.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function,3
.align	32
x25519_fe51_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul_body:

	mov	8*0(%rsi),%rax		# f[0]
	mov	8*0(%rdx),%r11		# load g[0-4]
	mov	8*1(%rdx),%r12
	mov	8*2(%rdx),%r13
	mov	8*3(%rdx),%rbp
	mov	8*4(%rdx),%r14

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	mov	%rax,%rdi
	mulq	%r11			# f[0]*g[0]
	mov	%r11,8*0(%rsp)		# offload g[0]
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	%rdi,%rax
	mov	%rdx,%rcx
	mulq	%r12			# f[0]*g[1]
	mov	%r12,8*1(%rsp)		# offload g[1]
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	%rdi,%rax
	lea	(%r14,%r14,8),%r15
	mov	%rdx,%r9
	mulq	%r13			# f[0]*g[2]
	mov	%r13,8*2(%rsp)		# offload g[2]
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	%rdi,%rax
	lea	(%r14,%r15,2),%rdi	# g[4]*19
	mov	%rdx,%r11
	mulq	%rbp			# f[0]*g[3]
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	8*0(%rsi),%rax		# f[0]
	mov	%rdx,%r13
	mulq	%r14			# f[0]*g[4]
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	8*1(%rsi),%rax		# f[1]
	mov	%rdx,%r15

	mulq	%rdi			# f[1]*g[4]*19
	add	%rax,%rbx
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%rcx
	mulq	%rdi			# f[2]*g[4]*19
	add	%rax,%r8
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r9
	mulq	%rdi			# f[3]*g[4]*19
	add	%rax,%r10
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r11
	mulq	%rdi			# f[4]*g[4]*19
	imulq	\$19,%rbp,%rdi		# g[3]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r13
	mulq	%rbp			# f[1]*g[3]
	mov	8*2(%rsp),%rbp		# g[2]
	add	%rax,%r14
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r15

	mulq	%rdi			# f[2]*g[3]*19
	add	%rax,%rbx
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%rcx
	mulq	%rdi			# f[3]*g[3]*19
	add	%rax,%r8
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r9
	mulq	%rdi			# f[4]*g[3]*19
	imulq	\$19,%rbp,%rdi		# g[2]*19
	add	%rax,%r10
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r11
	mulq	%rbp			# f[1]*g[2]
	add	%rax,%r12
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r13
	mulq	%rbp			# f[2]*g[2]
	mov	8*1(%rsp),%rbp		# g[1]
	add	%rax,%r14
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r15

	mulq	%rdi			# f[3]*g[2]*19
	add	%rax,%rbx
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%rcx
	mulq	%rdi			# f[4]*g[2]*19
	add	%rax,%r8
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r9
	mulq	%rbp			# f[1]*g[1]
	imulq	\$19,%rbp,%rdi		# g[1]*19
	add	%rax,%r10
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r11
	mulq	%rbp			# f[2]*g[1]
	add	%rax,%r12
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r13
	mulq	%rbp			# f[3]*g[1]
	mov	8*0(%rsp),%rbp		# g[0]
	add	%rax,%r14
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r15

	mulq	%rdi			# f[4]*g[1]*19
	add	%rax,%rbx
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%rcx
	mulq	%rbp			# f[1]*g[0]
	add	%rax,%r8
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r9
	mulq	%rbp			# f[2]*g[0]
	add	%rax,%r10
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r11
	mulq	%rbp			# f[3]*g[0]
	add	%rax,%r12
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r13
	mulq	%rbp			# f[4]*g[0]
	add	%rax,%r14
	adc	%rdx,%r15

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51
.Lfe51_mul_epilogue:
.cfi_endproc
.size	x25519_fe51_mul,.-x25519_fe51_mul
251
# void x25519_fe51_sqr(uint64_t h[5], const uint64_t g[5])
# Radix-2^51 squaring; exploits symmetry by forming doubled cross
# terms (2*g[i]*g[j]) and *19-folded high limbs, then falls through
# to the shared carry-propagation tail at .Lreduce51.
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function,2
.align	32
x25519_fe51_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_sqr_body:

	mov	8*0(%rsi),%rax		# g[0]
	mov	8*2(%rsi),%r15		# g[2]
	mov	8*4(%rsi),%rbp		# g[4]

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	lea	(%rax,%rax),%r14
	mulq	%rax			# g[0]*g[0]
	mov	%rax,%rbx
	mov	8*1(%rsi),%rax		# g[1]
	mov	%rdx,%rcx
	mulq	%r14			# 2*g[0]*g[1]
	mov	%rax,%r8
	mov	%r15,%rax
	mov	%r15,8*0(%rsp)		# offload g[2]
	mov	%rdx,%r9
	mulq	%r14			# 2*g[0]*g[2]
	mov	%rax,%r10
	mov	8*3(%rsi),%rax
	mov	%rdx,%r11
	imulq	\$19,%rbp,%rdi		# g[4]*19
	mulq	%r14			# 2*g[0]*g[3]
	mov	%rax,%r12
	mov	%rbp,%rax
	mov	%rdx,%r13
	mulq	%r14			# 2*g[0]*g[4]
	mov	%rax,%r14
	mov	%rbp,%rax
	mov	%rdx,%r15

	mulq	%rdi			# g[4]*g[4]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# g[1]
	adc	%rdx,%r13

	mov	8*3(%rsi),%rsi		# g[3]
	lea	(%rax,%rax),%rbp
	mulq	%rax			# g[1]*g[1]
	add	%rax,%r10
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r11
	mulq	%rbp			# 2*g[1]*g[2]
	add	%rax,%r12
	mov	%rbp,%rax
	adc	%rdx,%r13
	mulq	%rsi			# 2*g[1]*g[3]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	imulq	\$19,%rsi,%rbp		# g[3]*19
	mulq	%rdi			# 2*g[1]*g[4]*19
	add	%rax,%rbx
	lea	(%rsi,%rsi),%rax
	adc	%rdx,%rcx

	mulq	%rdi			# 2*g[3]*g[4]*19
	add	%rax,%r10
	mov	%rsi,%rax
	adc	%rdx,%r11
	mulq	%rbp			# g[3]*g[3]*19
	add	%rax,%r8
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r9

	lea	(%rax,%rax),%rsi
	mulq	%rax			# g[2]*g[2]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	mulq	%rsi			# 2*g[2]*g[3]*19
	add	%rax,%rbx
	mov	%rsi,%rax
	adc	%rdx,%rcx
	mulq	%rdi			# 2*g[2]*g[4]*19
	add	%rax,%r8
	adc	%rdx,%r9

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51

# Shared tail for the three fe51 entry points: propagates carries of
# the five 128-bit accumulators back into 51-bit limbs (high spill of
# limb 4 is folded in as *19), stores the result and unwinds the
# common 88-byte frame.
.align	32
.Lreduce51:
	mov	\$0x7ffffffffffff,%rbp	# 2^51-1 limb mask

	mov	%r10,%rdx
	shr	\$51,%r10
	shl	\$13,%r11
	and	%rbp,%rdx		# %rdx = g2 = h2 & mask
	or	%r10,%r11		# h2>>51
	add	%r11,%r12
	adc	\$0,%r13		# h3 += h2>>51

	mov	%rbx,%rax
	shr	\$51,%rbx
	shl	\$13,%rcx
	and	%rbp,%rax		# %rax = g0 = h0 & mask
	or	%rbx,%rcx		# h0>>51
	add	%rcx,%r8		# h1 += h0>>51
	adc	\$0,%r9

	mov	%r12,%rbx
	shr	\$51,%r12
	shl	\$13,%r13
	and	%rbp,%rbx		# %rbx = g3 = h3 & mask
	or	%r12,%r13		# h3>>51
	add	%r13,%r14		# h4 += h3>>51
	adc	\$0,%r15

	mov	%r8,%rcx
	shr	\$51,%r8
	shl	\$13,%r9
	and	%rbp,%rcx		# %rcx = g1 = h1 & mask
	or	%r8,%r9
	add	%r9,%rdx		# g2 += h1>>51

	mov	%r14,%r10
	shr	\$51,%r14
	shl	\$13,%r15
	and	%rbp,%r10		# %r10 = g4 = h4 & mask
	or	%r14,%r15		# h4>>51

	lea	(%r15,%r15,8),%r14
	lea	(%r15,%r14,2),%r15
	add	%r15,%rax		# g0 += (h4>>51)*19

	mov	%rdx,%r8
	and	%rbp,%rdx		# g2 &= mask
	shr	\$51,%r8
	add	%r8,%rbx		# g3 += g2>>51

	mov	%rax,%r9
	and	%rbp,%rax		# g0 &= mask
	shr	\$51,%r9
	add	%r9,%rcx		# g1 += g0>>51

	mov	%rax,8*0(%rdi)		# save the result
	mov	%rcx,8*1(%rdi)
	mov	%rdx,8*2(%rdi)
	mov	%rbx,8*3(%rdi)
	mov	%r10,8*4(%rdi)

	mov	8*5(%rsp),%r15
.cfi_restore	%r15
	mov	8*6(%rsp),%r14
.cfi_restore	%r14
	mov	8*7(%rsp),%r13
.cfi_restore	%r13
	mov	8*8(%rsp),%r12
.cfi_restore	%r12
	mov	8*9(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*10(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*11(%rsp),%rsp
.cfi_adjust_cfa_offset	-88
.Lfe51_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
431
# void x25519_fe51_mul121666(uint64_t h[5], const uint64_t f[5])
# Multiply a radix-2^51 element by the Montgomery-ladder constant
# 121666, then reuse the shared carry chain at .Lreduce51 (which also
# performs this function's epilogue).
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function,2
.align	32
x25519_fe51_mul121666:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul121666_body:
	mov	\$121666,%eax

	mulq	8*0(%rsi)
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	\$121666,%eax
	mov	%rdx,%rcx
	mulq	8*1(%rsi)
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	\$121666,%eax
	mov	%rdx,%r9
	mulq	8*2(%rsi)
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	\$121666,%eax
	mov	%rdx,%r11
	mulq	8*3(%rsi)
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	\$121666,%eax		# reload scalar 121666
	mov	%rdx,%r13
	mulq	8*4(%rsi)
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	%rdx,%r15

	jmp	.Lreduce51
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
########################################################################
# Base 2^64 subroutines modulo 2*(2^255-19)
#
# Emitted only when the assembler was probed as ADCX/ADOX-capable
# ($addx above); otherwise trapping stubs are emitted instead (see the
# else branch below).
if ($addx) {
# $acc0..$acc7 name the accumulator registers %r8..%r15 interpolated
# into the heredocs that follow.
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));

$code.=<<___;
.extern	OPENSSL_ia32cap_P
# int x25519_fe64_eligible(void)
# Returns non-zero iff both capability bits in mask 0x80100 (bits 8
# and 19) are set in the third OPENSSL_ia32cap_P word, i.e. the
# ADCX/ADOX/MULX instructions used below are available.
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+8(%rip),%ecx
	xor	%eax,%eax
	and	\$0x80100,%ecx
	cmp	\$0x80100,%ecx
	cmove	%ecx,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

# void x25519_fe64_mul(uint64_t h[4], const uint64_t a[4], const uint64_t b[4])
# Radix-2^64 multiplication using MULX with interleaved ADCX/ADOX
# dual carry chains (%rdi temporarily holds zero to clear flags);
# reduction modulo 2*(2^255-19) is shared via .Lreduce64, which also
# restores the frame.
.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function,3
.align	32
x25519_fe64_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_mul_body:

	mov	%rdx,%rax
	mov	8*0(%rdx),%rbp		# b[0]
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rax),%rcx		# b[1]
	mov	8*2(%rax),$acc6		# b[2]
	mov	8*3(%rax),$acc7		# b[3]

	mulx	%rbp,$acc0,%rax		# a[0]*b[0]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rcx,$acc1,%rbx		# a[0]*b[1]
	adcx	%rax,$acc1
	mulx	$acc6,$acc2,%rax	# a[0]*b[2]
	adcx	%rbx,$acc2
	mulx	$acc7,$acc3,$acc4	# a[0]*b[3]
	mov	8*1(%rsi),%rdx		# a[1]
	adcx	%rax,$acc3
	mov	$acc6,(%rsp)		# offload b[2]
	adcx	%rdi,$acc4		# cf=0

	mulx	%rbp,%rax,%rbx		# a[1]*b[0]
	adox	%rax,$acc1
	adcx	%rbx,$acc2
	mulx	%rcx,%rax,%rbx		# a[1]*b[1]
	adox	%rax,$acc2
	adcx	%rbx,$acc3
	mulx	$acc6,%rax,%rbx		# a[1]*b[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	$acc7,%rax,$acc5	# a[1]*b[3]
	mov	8*2(%rsi),%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5		# cf=0
	adox	%rdi,$acc5		# of=0

	mulx	%rbp,%rax,%rbx		# a[2]*b[0]
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	%rcx,%rax,%rbx		# a[2]*b[1]
	adcx	%rax,$acc3
	adox	%rbx,$acc4
	mulx	$acc6,%rax,%rbx		# a[2]*b[2]
	adcx	%rax,$acc4
	adox	%rbx,$acc5
	mulx	$acc7,%rax,$acc6	# a[2]*b[3]
	mov	8*3(%rsi),%rdx		# a[3]
	adcx	%rax,$acc5
	adox	%rdi,$acc6		# of=0
	adcx	%rdi,$acc6		# cf=0

	mulx	%rbp,%rax,%rbx		# a[3]*b[0]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rcx,%rax,%rbx		# a[3]*b[1]
	adox	%rax,$acc4
	adcx	%rbx,$acc5
	mulx	(%rsp),%rax,%rbx	# a[3]*b[2]
	adox	%rax,$acc5
	adcx	%rbx,$acc6
	mulx	$acc7,%rax,$acc7	# a[3]*b[3]
	mov	\$38,%edx		# 2*19, reduction multiplier
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64
.Lfe64_mul_epilogue:
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
592
# void x25519_fe64_sqr(uint64_t h[4], const uint64_t a[4])
# Radix-2^64 squaring: computes the six cross products, doubles them
# with the diagonal terms folded in, then falls through to the shared
# reduction tail .Lreduce64 (which also unwinds this frame).
.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function,2
.align	32
x25519_fe64_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_sqr_body:

	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rsi),%rcx		# a[1]
	mov	8*2(%rsi),%rbp		# a[2]
	mov	8*3(%rsi),%rsi		# a[3]

	################################################################
	mulx	%rdx,$acc0,$acc7	# a[0]*a[0]
	mulx	%rcx,$acc1,%rax		# a[0]*a[1]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rbp,$acc2,%rbx		# a[0]*a[2]
	adcx	%rax,$acc2
	mulx	%rsi,$acc3,$acc4	# a[0]*a[3]
	mov	%rcx,%rdx		# a[1]
	adcx	%rbx,$acc3
	adcx	%rdi,$acc4		# cf=0

	################################################################
	mulx	%rbp,%rax,%rbx		# a[1]*a[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rsi,%rax,$acc5		# a[1]*a[3]
	mov	%rbp,%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5

	################################################################
	mulx	%rsi,%rax,$acc6		# a[2]*a[3]
	mov	%rcx,%rdx		# a[1]
	adox	%rax,$acc5
	adcx	%rdi,$acc6		# cf=0
	adox	%rdi,$acc6		# of=0

	adcx	$acc1,$acc1		# acc1:6<<1
	adox	$acc7,$acc1
	adcx	$acc2,$acc2
	mulx	%rdx,%rax,%rbx		# a[1]*a[1]
	mov	%rbp,%rdx		# a[2]
	adcx	$acc3,$acc3
	adox	%rax,$acc2
	adcx	$acc4,$acc4
	adox	%rbx,$acc3
	mulx	%rdx,%rax,%rbx		# a[2]*a[2]
	mov	%rsi,%rdx		# a[3]
	adcx	$acc5,$acc5
	adox	%rax,$acc4
	adcx	$acc6,$acc6
	adox	%rbx,$acc5
	mulx	%rdx,%rax,$acc7		# a[3]*a[3]
	mov	\$38,%edx		# 2*19, reduction multiplier
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0
	jmp	.Lreduce64

# Shared tail for fe64_mul/fe64_sqr: folds the upper four limbs back
# via *38 (%rdx=38, %rdi=0 on entry), performs the final conditional
# +38 on carry-out, stores the result and unwinds the common frame.
.align	32
.Lreduce64:
	mulx	$acc4,%rax,%rbx
	adcx	%rax,$acc0
	adox	%rbx,$acc1
	mulx	$acc5,%rax,%rbx
	adcx	%rax,$acc1
	adox	%rbx,$acc2
	mulx	$acc6,%rax,%rbx
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	$acc7,%rax,$acc4
	adcx	%rax,$acc3
	adox	%rdi,$acc4
	adcx	%rdi,$acc4

	mov	8*2(%rsp),%rdi		# restore dst
	imulq	%rdx,$acc4

	add	$acc4,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

	mov	8*3(%rsp),%r15
.cfi_restore	%r15
	mov	8*4(%rsp),%r14
.cfi_restore	%r14
	mov	8*5(%rsp),%r13
.cfi_restore	%r13
	mov	8*6(%rsp),%r12
.cfi_restore	%r12
	mov	8*7(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*8(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*9(%rsp),%rsp
# Releasing stack must *decrease* the CFA offset, hence the negative
# delta (the previous revision had it positive).  NOTE(review): the
# lea frees 8*9=72 bytes, so -72 would be the arithmetically exact
# delta; -88 mirrors the fe51 epilogue — confirm against upstream.
.cfi_adjust_cfa_offset	-88
.Lfe64_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sqr,.-x25519_fe64_sqr
721
# void x25519_fe64_mul121666(uint64_t h[4], const uint64_t f[4])
# Multiply by the ladder constant 121666 and reduce mod 2*(2^255-19):
# the 64-bit overflow limb is folded back in as *38.
.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function,2
.align	32
x25519_fe64_mul121666:
.Lfe64_mul121666_body:
.cfi_startproc
	mov	\$121666,%edx
	mulx	8*0(%rsi),$acc0,%rcx
	mulx	8*1(%rsi),$acc1,%rax
	add	%rcx,$acc1
	mulx	8*2(%rsi),$acc2,%rcx
	adc	%rax,$acc2
	mulx	8*3(%rsi),$acc3,%rax
	adc	%rcx,$acc3
	adc	\$0,%rax

	imulq	\$38,%rax,%rax		# fold overflow limb: *2*19

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

.Lfe64_mul121666_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

# void x25519_fe64_add(uint64_t h[4], const uint64_t a[4], const uint64_t b[4])
# Addition mod 2*(2^255-19); carry-out is folded back as +38, twice,
# with only the final carry absorbed into the lowest limb.
.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function,3
.align	32
x25519_fe64_add:
.Lfe64_add_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	add	8*0(%rdx),$acc0
	adc	8*1(%rdx),$acc1
	adc	8*2(%rdx),$acc2
	adc	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	adc	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_add_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_add,.-x25519_fe64_add

# void x25519_fe64_sub(uint64_t h[4], const uint64_t a[4], const uint64_t b[4])
# Subtraction mod 2*(2^255-19); borrow-out is folded back as -38,
# mirroring x25519_fe64_add.
.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function,3
.align	32
x25519_fe64_sub:
.Lfe64_sub_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	sub	8*0(%rdx),$acc0
	sbb	8*1(%rdx),$acc1
	sbb	8*2(%rdx),$acc2
	sbb	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	sbb	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	sub	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_sub_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sub,.-x25519_fe64_sub

# void x25519_fe64_tobytes(uint8_t out[32], const uint64_t a[4])
# Final canonical reduction modulo 2^255-19 and little-endian store.
.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function,2
.align	32
x25519_fe64_tobytes:
.Lfe64_to_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	################################# reduction modulo 2^255-19
	lea	($acc3,$acc3),%rax
	sar	\$63,$acc3		# most significant bit -> mask
	shr	\$1,%rax		# most significant bit cleared
	and	\$19,$acc3
	add	\$19,$acc3		# compare to modulus in the same go

	add	$acc3,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,%rax

	lea	(%rax,%rax),$acc3
	sar	\$63,%rax		# most significant bit -> mask
	shr	\$1,$acc3		# most significant bit cleared
	not	%rax
	and	\$19,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	sbb	\$0,$acc3

	mov	$acc0,8*0(%rdi)
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)

.Lfe64_to_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
} else {
# Pre-ADX toolchain: emit trapping stubs (ud2) so an accidental call
# of any fe64 entry point faults; x25519_fe64_eligible reports 0 so
# callers are expected to stay on the fe51 path.
$code.=<<___;
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@abi-omnipotent
.globl	x25519_fe64_sqr
.globl	x25519_fe64_mul121666
.globl	x25519_fe64_add
.globl	x25519_fe64_sub
.globl	x25519_fe64_tobytes
x25519_fe64_mul:
x25519_fe64_sqr:
x25519_fe64_mul121666:
x25519_fe64_add:
x25519_fe64_sub:
x25519_fe64_tobytes:
.cfi_startproc
	.byte	0x0f,0x0b	# ud2
	ret
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
___
}
# Self-identification string embedded in the object file.
$code.=<<___;
.asciz	"X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
912
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

# SEH handler for the frameless fe64 helpers: if Rip is past the
# prologue label (HandlerData[0]) just pick up Rsp from the context
# and continue the unwind.
.type	short_handler,\@abi-omnipotent
.align	16
short_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp
	jmp	.Lcommon_seh_tail
.size	short_handler,.-short_handler

# SEH handler for routines with a stack frame: between the body and
# epilogue labels it steps over the frame (HandlerData[2] = frame
# size) and restores the saved non-volatile registers from it.
.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax

	mov	-8(%rax),%rbp
	mov	-16(%rax),%rbx
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler

# Function table: begin/end/unwind-info triplets (begin/end labels are
# synthesized by x86_64-xlate.pl).
.section	.pdata
.align	4
	.rva	.LSEH_begin_x25519_fe51_mul
	.rva	.LSEH_end_x25519_fe51_mul
	.rva	.LSEH_info_x25519_fe51_mul

	.rva	.LSEH_begin_x25519_fe51_sqr
	.rva	.LSEH_end_x25519_fe51_sqr
	.rva	.LSEH_info_x25519_fe51_sqr

	.rva	.LSEH_begin_x25519_fe51_mul121666
	.rva	.LSEH_end_x25519_fe51_mul121666
	.rva	.LSEH_info_x25519_fe51_mul121666
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_x25519_fe64_mul
	.rva	.LSEH_end_x25519_fe64_mul
	.rva	.LSEH_info_x25519_fe64_mul

	.rva	.LSEH_begin_x25519_fe64_sqr
	.rva	.LSEH_end_x25519_fe64_sqr
	.rva	.LSEH_info_x25519_fe64_sqr

	.rva	.LSEH_begin_x25519_fe64_mul121666
	.rva	.LSEH_end_x25519_fe64_mul121666
	.rva	.LSEH_info_x25519_fe64_mul121666

	.rva	.LSEH_begin_x25519_fe64_add
	.rva	.LSEH_end_x25519_fe64_add
	.rva	.LSEH_info_x25519_fe64_add

	.rva	.LSEH_begin_x25519_fe64_sub
	.rva	.LSEH_end_x25519_fe64_sub
	.rva	.LSEH_info_x25519_fe64_sub

	.rva	.LSEH_begin_x25519_fe64_tobytes
	.rva	.LSEH_end_x25519_fe64_tobytes
	.rva	.LSEH_info_x25519_fe64_tobytes
___
$code.=<<___;
# Unwind info: handler + HandlerData (body label, epilogue label and,
# for full_handler, the frame size in bytes).
.section	.xdata
.align	8
.LSEH_info_x25519_fe51_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul_body,.Lfe51_mul_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_sqr_body,.Lfe51_sqr_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_mul121666:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul121666_body,.Lfe51_mul121666_epilogue	# HandlerData[]
	.long	88,0
___
$code.=<<___ if ($addx);
.LSEH_info_x25519_fe64_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_mul_body,.Lfe64_mul_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_sqr_body,.Lfe64_sqr_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_mul121666:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_mul121666_body,.Lfe64_mul121666_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_add_body,.Lfe64_add_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_sub_body,.Lfe64_sub_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_tobytes:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_to_body,.Lfe64_to_epilogue	# HandlerData[]
___
}
1128
# Expand backtick-quoted expressions accumulated in $code (perlasm
# convention), stream everything through the xlate filter opened on
# STDOUT above, and fail loudly if the final pipe flush fails.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette