VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/ec/asm/ecp_nistz256-avx2.pl@69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago

Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified.
bugref:8070: src/libs maintenance

1#! /usr/bin/env perl
2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10##############################################################################
11# #
12# Copyright 2014 Intel Corporation #
13# #
14# Licensed under the Apache License, Version 2.0 (the "License"); #
15# you may not use this file except in compliance with the License. #
16# You may obtain a copy of the License at #
17# #
18# http://www.apache.org/licenses/LICENSE-2.0 #
19# #
20# Unless required by applicable law or agreed to in writing, software #
21# distributed under the License is distributed on an "AS IS" BASIS, #
22# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
23# See the License for the specific language governing permissions and #
24# limitations under the License. #
25# #
26##############################################################################
27# #
28# Developers and authors: #
29# Shay Gueron (1, 2), and Vlad Krasnov (1) #
30# (1) Intel Corporation, Israel Development Center #
31# (2) University of Haifa #
32# Reference: #
33# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with #
34# 256 Bit Primes" #
35# #
36##############################################################################
37
38$flavour = shift;
39$output = shift;
40if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
41
42$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
43
44$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
45( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
46( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
47die "can't locate x86_64-xlate.pl";
48
49open OUT,"| \"$^X\" $xlate $flavour $output";
50*STDOUT=*OUT;
51
52if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
53 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
54 $avx = ($1>=2.19) + ($1>=2.22);
55 $addx = ($1>=2.23);
56}
57
58if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
59 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
60 $avx = ($1>=2.09) + ($1>=2.10);
61 $addx = ($1>=2.10);
62}
63
64if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
65 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
66 $avx = ($1>=10) + ($1>=11);
67 $addx = ($1>=12);
68}
69
70if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
71 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
72 $avx = ($ver>=3.0) + ($ver>=3.01);
73 $addx = ($ver>=3.03);
74}
75
76if ($avx>=2) {{
77$digit_size = "\$29";
78$n_digits = "\$9";
79
80$code.=<<___;
81.text
82
83.align 64
84.LAVX2_AND_MASK:
85.LAVX2_POLY:
86.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
87.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
88.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
89.quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
90.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
91.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
92.quad 0x00040000, 0x00040000, 0x00040000, 0x00040000
93.quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
94.quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
95
96.LAVX2_POLY_x2:
97.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
98.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
99.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
100.quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
101.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
102.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
103.quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
104.quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
105.quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
106
107.LAVX2_POLY_x8:
108.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
109.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
110.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
111.quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
112.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
113.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
114.quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
115.quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
116.quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8
117
118.LONE:
119.quad 0x00000020, 0x00000020, 0x00000020, 0x00000020
120.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
121.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
122.quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
123.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
124.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
125.quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
126.quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
127.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
128
129# RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
130# Montgomery form (*2^256) to our format (*2^261)
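# [Illustrative note, not in the upstream source] With nine 29-bit digits the
# Montgomery radix here is R = 2^261, so a Montgomery product computes
# a*b*2^-261 mod p. Multiplying a value that is already in OpenSSL's a*2^256
# form by RR = 2^266 therefore gives
#     (a*2^256) * 2^266 * 2^-261 = a*2^261 (mod p),
# i.e. the *2^261 format referred to above.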
131
132.LTO_MONT_AVX2:
133.quad 0x00000400, 0x00000400, 0x00000400, 0x00000400
134.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
135.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
136.quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
137.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
138.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
139.quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
140.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
141.quad 0x00000003, 0x00000003, 0x00000003, 0x00000003
142
143.LFROM_MONT_AVX2:
144.quad 0x00000001, 0x00000001, 0x00000001, 0x00000001
145.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
146.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
147.quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
148.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
149.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
150.quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
151.quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
152.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
153
154.LIntOne:
155.long 1,1,1,1,1,1,1,1
156___
157
158{
159# This function receives a pointer to an array of four affine points
160# (X, Y, <1>) and rearranges the data for AVX2 execution, while
161# converting it to 2^29 radix redundant form
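# Illustrative sketch (not part of the upstream source): a scalar Perl model of
# the radix-2^29 split that the vector code below performs for four points at
# once. The helper name _radix29_split_ref is made up for illustration and the
# sub is never called; it assumes a perl built with 64-bit integers.
sub _radix29_split_ref {
    my @in   = @_;                  # four 64-bit limbs, least significant first
    my $mask = (1<<29) - 1;         # same value as .LAVX2_AND_MASK
    my @out;
    for my $i (0..8) {
        my $bit  = 29*$i;           # starting bit of digit $i
        my $word = $bit >> 6;       # 64-bit limb the digit starts in
        my $off  = $bit & 63;
        my $v    = $in[$word] >> $off;
        # digits that straddle a limb boundary pull in bits from the next limb
        $v |= $in[$word+1] << (64-$off) if ($off > 64-29 && $word < 3);
        push @out, $v & $mask;
    }
    return @out;                    # nine digits, each below 2^29
}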
162
163my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
164 $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));
165
166$code.=<<___;
167.globl ecp_nistz256_avx2_transpose_convert
168.type ecp_nistz256_avx2_transpose_convert,\@function,2
169.align 64
170ecp_nistz256_avx2_transpose_convert:
171 vzeroupper
172___
173$code.=<<___ if ($win64);
174 lea -8-16*10(%rsp), %rsp
175 vmovaps %xmm6, -8-16*10(%rax)
176 vmovaps %xmm7, -8-16*9(%rax)
177 vmovaps %xmm8, -8-16*8(%rax)
178 vmovaps %xmm9, -8-16*7(%rax)
179 vmovaps %xmm10, -8-16*6(%rax)
180 vmovaps %xmm11, -8-16*5(%rax)
181 vmovaps %xmm12, -8-16*4(%rax)
182 vmovaps %xmm13, -8-16*3(%rax)
183 vmovaps %xmm14, -8-16*2(%rax)
184 vmovaps %xmm15, -8-16*1(%rax)
185___
186$code.=<<___;
187 # Load the data
188 vmovdqa 32*0(%rsi), $X0
189 lea 112(%rsi), %rax # size optimization
190 vmovdqa 32*1(%rsi), $Y0
191 lea .LAVX2_AND_MASK(%rip), %rdx
192 vmovdqa 32*2(%rsi), $X1
193 vmovdqa 32*3(%rsi), $Y1
194 vmovdqa 32*4-112(%rax), $X2
195 vmovdqa 32*5-112(%rax), $Y2
196 vmovdqa 32*6-112(%rax), $X3
197 vmovdqa 32*7-112(%rax), $Y3
198
199 # Transpose X and Y independently
200 vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0]
201 vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0]
202 vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1]
203 vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1]
204
205 vpunpcklqdq $Y1, $Y0, $T4
206 vpunpcklqdq $Y3, $Y2, $T5
207 vpunpckhqdq $Y1, $Y0, $T6
208 vpunpckhqdq $Y3, $Y2, $T7
209
210 vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0]
211 vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1]
212 vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2]
213 vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3]
214
215 vperm2i128 \$0x20, $T5, $T4, $Y0
216 vperm2i128 \$0x20, $T7, $T6, $Y1
217 vperm2i128 \$0x31, $T5, $T4, $Y2
218 vperm2i128 \$0x31, $T7, $T6, $Y3
219 vmovdqa (%rdx), $T7
220
221 vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask;
222 vpsrlq \$29, $X0, $X0
223 vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask;
224 vpsrlq \$29, $X0, $X0
225 vpsllq \$6, $X1, $T2
226 vpxor $X0, $T2, $T2
227 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
228 vpsrlq \$23, $X1, $X1
229 vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
230 vpsrlq \$29, $X1, $X1
231 vpsllq \$12, $X2, $T4
232 vpxor $X1, $T4, $T4
233 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
234 vpsrlq \$17, $X2, $X2
235 vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
236 vpsrlq \$29, $X2, $X2
237 vpsllq \$18, $X3, $T6
238 vpxor $X2, $T6, $T6
239 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
240 vpsrlq \$11, $X3, $X3
241 vmovdqa $T0, 32*0(%rdi)
242 lea 112(%rdi), %rax # size optimization
243 vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
244 vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
245
246 vmovdqa $T1, 32*1(%rdi)
247 vmovdqa $T2, 32*2(%rdi)
248 vmovdqa $T3, 32*3(%rdi)
249 vmovdqa $T4, 32*4-112(%rax)
250 vmovdqa $T5, 32*5-112(%rax)
251 vmovdqa $T6, 32*6-112(%rax)
252 vmovdqa $T0, 32*7-112(%rax)
253 vmovdqa $X3, 32*8-112(%rax)
254 lea 448(%rdi), %rax # size optimization
255
256 vpand $T7, $Y0, $T0 # out[0] = in[0] & mask;
257 vpsrlq \$29, $Y0, $Y0
258 vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask;
259 vpsrlq \$29, $Y0, $Y0
260 vpsllq \$6, $Y1, $T2
261 vpxor $Y0, $T2, $T2
262 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
263 vpsrlq \$23, $Y1, $Y1
264 vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
265 vpsrlq \$29, $Y1, $Y1
266 vpsllq \$12, $Y2, $T4
267 vpxor $Y1, $T4, $T4
268 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
269 vpsrlq \$17, $Y2, $Y2
270 vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
271 vpsrlq \$29, $Y2, $Y2
272 vpsllq \$18, $Y3, $T6
273 vpxor $Y2, $T6, $T6
274 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
275 vpsrlq \$11, $Y3, $Y3
276 vmovdqa $T0, 32*9-448(%rax)
277 vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
278 vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
279
280 vmovdqa $T1, 32*10-448(%rax)
281 vmovdqa $T2, 32*11-448(%rax)
282 vmovdqa $T3, 32*12-448(%rax)
283 vmovdqa $T4, 32*13-448(%rax)
284 vmovdqa $T5, 32*14-448(%rax)
285 vmovdqa $T6, 32*15-448(%rax)
286 vmovdqa $T0, 32*16-448(%rax)
287 vmovdqa $Y3, 32*17-448(%rax)
288
289 vzeroupper
290___
291$code.=<<___ if ($win64);
292 movaps 16*0(%rsp), %xmm6
293 movaps 16*1(%rsp), %xmm7
294 movaps 16*2(%rsp), %xmm8
295 movaps 16*3(%rsp), %xmm9
296 movaps 16*4(%rsp), %xmm10
297 movaps 16*5(%rsp), %xmm11
298 movaps 16*6(%rsp), %xmm12
299 movaps 16*7(%rsp), %xmm13
300 movaps 16*8(%rsp), %xmm14
301 movaps 16*9(%rsp), %xmm15
302 lea 8+16*10(%rsp), %rsp
303___
304$code.=<<___;
305 ret
306.size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
307___
308}
309{
310################################################################################
311# This function receives a pointer to an array of four AVX2-formatted points
312# (X, Y, Z), converts the data back to normal representation, and rearranges it
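# [Illustrative note, not in the upstream source] This is the inverse of the
# radix-2^29 split used on input. Writing the nine digits of one coordinate as
# d[0..8], each 64-bit output limb is rebuilt as
#   limb[j] = (d[2j] >> 6j) + (d[2j+1] << (29 - 6j)) + (d[2j+2] << (58 - 6j))
# for j = 0..3, which is the shift/add sequence in .Lconv_loop below.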
313
314my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
315my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));
316
317$code.=<<___;
318
319.globl ecp_nistz256_avx2_convert_transpose_back
320.type ecp_nistz256_avx2_convert_transpose_back,\@function,2
321.align 32
322ecp_nistz256_avx2_convert_transpose_back:
323 vzeroupper
324___
325$code.=<<___ if ($win64);
326 lea -8-16*10(%rsp), %rsp
327 vmovaps %xmm6, -8-16*10(%rax)
328 vmovaps %xmm7, -8-16*9(%rax)
329 vmovaps %xmm8, -8-16*8(%rax)
330 vmovaps %xmm9, -8-16*7(%rax)
331 vmovaps %xmm10, -8-16*6(%rax)
332 vmovaps %xmm11, -8-16*5(%rax)
333 vmovaps %xmm12, -8-16*4(%rax)
334 vmovaps %xmm13, -8-16*3(%rax)
335 vmovaps %xmm14, -8-16*2(%rax)
336 vmovaps %xmm15, -8-16*1(%rax)
337___
338$code.=<<___;
339 mov \$3, %ecx
340
341.Lconv_loop:
342 vmovdqa 32*0(%rsi), $D0
343 lea 160(%rsi), %rax # size optimization
344 vmovdqa 32*1(%rsi), $D1
345 vmovdqa 32*2(%rsi), $D2
346 vmovdqa 32*3(%rsi), $D3
347 vmovdqa 32*4-160(%rax), $D4
348 vmovdqa 32*5-160(%rax), $D5
349 vmovdqa 32*6-160(%rax), $D6
350 vmovdqa 32*7-160(%rax), $D7
351 vmovdqa 32*8-160(%rax), $D8
352
353 vpsllq \$29, $D1, $D1
354 vpsllq \$58, $D2, $T0
355 vpaddq $D1, $D0, $D0
356 vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);
357
358 vpsrlq \$6, $D2, $D2
359 vpsllq \$23, $D3, $D3
360 vpsllq \$52, $D4, $T1
361 vpaddq $D2, $D3, $D3
362 vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);
363
364 vpsrlq \$12, $D4, $D4
365 vpsllq \$17, $D5, $D5
366 vpsllq \$46, $D6, $T2
367 vpaddq $D4, $D5, $D5
368 vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);
369
370 vpsrlq \$18, $D6, $D6
371 vpsllq \$11, $D7, $D7
372 vpsllq \$40, $D8, $T3
373 vpaddq $D6, $D7, $D7
374 vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
375
376 vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0]
377 vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0]
378 vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1]
379 vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1]
380
381 vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0]
382 vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1]
383 vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2]
384 vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3]
385
386 vmovdqa $D0, 32*0(%rdi)
387 vmovdqa $D1, 32*3(%rdi)
388 vmovdqa $D2, 32*6(%rdi)
389 vmovdqa $D3, 32*9(%rdi)
390
391 lea 32*9(%rsi), %rsi
392 lea 32*1(%rdi), %rdi
393
394 dec %ecx
395 jnz .Lconv_loop
396
397 vzeroupper
398___
399$code.=<<___ if ($win64);
400 movaps 16*0(%rsp), %xmm6
401 movaps 16*1(%rsp), %xmm7
402 movaps 16*2(%rsp), %xmm8
403 movaps 16*3(%rsp), %xmm9
404 movaps 16*4(%rsp), %xmm10
405 movaps 16*5(%rsp), %xmm11
406 movaps 16*6(%rsp), %xmm12
407 movaps 16*7(%rsp), %xmm13
408 movaps 16*8(%rsp), %xmm14
409 movaps 16*9(%rsp), %xmm15
410 lea 8+16*10(%rsp), %rsp
411___
412$code.=<<___;
413 ret
414.size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
415___
416}
417{
418my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
419my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
420my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));
421
422sub NORMALIZE {
423my $ret=<<___;
424 vpsrlq $digit_size, $ACC0, $T0
425 vpand $AND_MASK, $ACC0, $ACC0
426 vpaddq $T0, $ACC1, $ACC1
427
428 vpsrlq $digit_size, $ACC1, $T0
429 vpand $AND_MASK, $ACC1, $ACC1
430 vpaddq $T0, $ACC2, $ACC2
431
432 vpsrlq $digit_size, $ACC2, $T0
433 vpand $AND_MASK, $ACC2, $ACC2
434 vpaddq $T0, $ACC3, $ACC3
435
436 vpsrlq $digit_size, $ACC3, $T0
437 vpand $AND_MASK, $ACC3, $ACC3
438 vpaddq $T0, $ACC4, $ACC4
439
440 vpsrlq $digit_size, $ACC4, $T0
441 vpand $AND_MASK, $ACC4, $ACC4
442 vpaddq $T0, $ACC5, $ACC5
443
444 vpsrlq $digit_size, $ACC5, $T0
445 vpand $AND_MASK, $ACC5, $ACC5
446 vpaddq $T0, $ACC6, $ACC6
447
448 vpsrlq $digit_size, $ACC6, $T0
449 vpand $AND_MASK, $ACC6, $ACC6
450 vpaddq $T0, $ACC7, $ACC7
451
452 vpsrlq $digit_size, $ACC7, $T0
453 vpand $AND_MASK, $ACC7, $ACC7
454 vpaddq $T0, $ACC8, $ACC8
455 #vpand $AND_MASK, $ACC8, $ACC8
456___
457 $ret;
458}
459
460sub STORE {
461my $ret=<<___;
462 vmovdqa $ACC0, 32*0(%rdi)
463 lea 160(%rdi), %rax # size optimization
464 vmovdqa $ACC1, 32*1(%rdi)
465 vmovdqa $ACC2, 32*2(%rdi)
466 vmovdqa $ACC3, 32*3(%rdi)
467 vmovdqa $ACC4, 32*4-160(%rax)
468 vmovdqa $ACC5, 32*5-160(%rax)
469 vmovdqa $ACC6, 32*6-160(%rax)
470 vmovdqa $ACC7, 32*7-160(%rax)
471 vmovdqa $ACC8, 32*8-160(%rax)
472___
473 $ret;
474}
475
476$code.=<<___;
477.type avx2_normalize,\@abi-omnipotent
478.align 32
479avx2_normalize:
480 vpsrlq $digit_size, $ACC0, $T0
481 vpand $AND_MASK, $ACC0, $ACC0
482 vpaddq $T0, $ACC1, $ACC1
483
484 vpsrlq $digit_size, $ACC1, $T0
485 vpand $AND_MASK, $ACC1, $ACC1
486 vpaddq $T0, $ACC2, $ACC2
487
488 vpsrlq $digit_size, $ACC2, $T0
489 vpand $AND_MASK, $ACC2, $ACC2
490 vpaddq $T0, $ACC3, $ACC3
491
492 vpsrlq $digit_size, $ACC3, $T0
493 vpand $AND_MASK, $ACC3, $ACC3
494 vpaddq $T0, $ACC4, $ACC4
495
496 vpsrlq $digit_size, $ACC4, $T0
497 vpand $AND_MASK, $ACC4, $ACC4
498 vpaddq $T0, $ACC5, $ACC5
499
500 vpsrlq $digit_size, $ACC5, $T0
501 vpand $AND_MASK, $ACC5, $ACC5
502 vpaddq $T0, $ACC6, $ACC6
503
504 vpsrlq $digit_size, $ACC6, $T0
505 vpand $AND_MASK, $ACC6, $ACC6
506 vpaddq $T0, $ACC7, $ACC7
507
508 vpsrlq $digit_size, $ACC7, $T0
509 vpand $AND_MASK, $ACC7, $ACC7
510 vpaddq $T0, $ACC8, $ACC8
511 #vpand $AND_MASK, $ACC8, $ACC8
512
513 ret
514.size avx2_normalize,.-avx2_normalize
515
516.type avx2_normalize_n_store,\@abi-omnipotent
517.align 32
518avx2_normalize_n_store:
519 vpsrlq $digit_size, $ACC0, $T0
520 vpand $AND_MASK, $ACC0, $ACC0
521 vpaddq $T0, $ACC1, $ACC1
522
523 vpsrlq $digit_size, $ACC1, $T0
524 vpand $AND_MASK, $ACC1, $ACC1
525 vmovdqa $ACC0, 32*0(%rdi)
526 lea 160(%rdi), %rax # size optimization
527 vpaddq $T0, $ACC2, $ACC2
528
529 vpsrlq $digit_size, $ACC2, $T0
530 vpand $AND_MASK, $ACC2, $ACC2
531 vmovdqa $ACC1, 32*1(%rdi)
532 vpaddq $T0, $ACC3, $ACC3
533
534 vpsrlq $digit_size, $ACC3, $T0
535 vpand $AND_MASK, $ACC3, $ACC3
536 vmovdqa $ACC2, 32*2(%rdi)
537 vpaddq $T0, $ACC4, $ACC4
538
539 vpsrlq $digit_size, $ACC4, $T0
540 vpand $AND_MASK, $ACC4, $ACC4
541 vmovdqa $ACC3, 32*3(%rdi)
542 vpaddq $T0, $ACC5, $ACC5
543
544 vpsrlq $digit_size, $ACC5, $T0
545 vpand $AND_MASK, $ACC5, $ACC5
546 vmovdqa $ACC4, 32*4-160(%rax)
547 vpaddq $T0, $ACC6, $ACC6
548
549 vpsrlq $digit_size, $ACC6, $T0
550 vpand $AND_MASK, $ACC6, $ACC6
551 vmovdqa $ACC5, 32*5-160(%rax)
552 vpaddq $T0, $ACC7, $ACC7
553
554 vpsrlq $digit_size, $ACC7, $T0
555 vpand $AND_MASK, $ACC7, $ACC7
556 vmovdqa $ACC6, 32*6-160(%rax)
557 vpaddq $T0, $ACC8, $ACC8
558 #vpand $AND_MASK, $ACC8, $ACC8
559 vmovdqa $ACC7, 32*7-160(%rax)
560 vmovdqa $ACC8, 32*8-160(%rax)
561
562 ret
563.size avx2_normalize_n_store,.-avx2_normalize_n_store
564
565################################################################################
566# void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
567.type avx2_mul_x4,\@abi-omnipotent
568.align 32
569avx2_mul_x4:
570 lea .LAVX2_POLY(%rip), %rax
571
572 vpxor $ACC0, $ACC0, $ACC0
573 vpxor $ACC1, $ACC1, $ACC1
574 vpxor $ACC2, $ACC2, $ACC2
575 vpxor $ACC3, $ACC3, $ACC3
576 vpxor $ACC4, $ACC4, $ACC4
577 vpxor $ACC5, $ACC5, $ACC5
578 vpxor $ACC6, $ACC6, $ACC6
579 vpxor $ACC7, $ACC7, $ACC7
580
581 vmovdqa 32*7(%rax), %ymm14
582 vmovdqa 32*8(%rax), %ymm15
583
584 mov $n_digits, $itr
585 lea -512($a_ptr), $a_ptr # strategic bias to control u-op density
586 jmp .Lavx2_mul_x4_loop
587
588.align 32
589.Lavx2_mul_x4_loop:
590 vmovdqa 32*0($b_ptr), $B
591 lea 32*1($b_ptr), $b_ptr
592
593 vpmuludq 32*0+512($a_ptr), $B, $T0
594 vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW
595 vpaddq $T0, $ACC0, $ACC0
596 vpmuludq 32*2+512($a_ptr), $B, $T0
597 vpaddq $OVERFLOW, $ACC1, $ACC1
598 vpand $AND_MASK, $ACC0, $Y
599 vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW
600 vpaddq $T0, $ACC2, $ACC2
601 vpmuludq 32*4+512($a_ptr), $B, $T0
602 vpaddq $OVERFLOW, $ACC3, $ACC3
603 vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW
604 vpaddq $T0, $ACC4, $ACC4
605 vpmuludq 32*6+512($a_ptr), $B, $T0
606 vpaddq $OVERFLOW, $ACC5, $ACC5
607 vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW
608 vpaddq $T0, $ACC6, $ACC6
609
610 # Skip some multiplications, optimizing for the constant poly
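 # [added note] In radix 2^29 the .LAVX2_POLY digits of p are
 # {0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1ff, 0, 0, 0x40000, 0x1fe00000, 0xffffff}:
 # digits 4 and 5 are zero so their products vanish, digits 0-2 equal the
 # 29-bit mask so $AND_MASK doubles as the multiplicand, digit 6 is 2^18 so a
 # shift replaces the multiply, and only digits 3, 7 and 8 (the last two
 # preloaded in %ymm14/%ymm15) need real vpmuludq instructions.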
611 vpmuludq $AND_MASK, $Y, $T0
612 vpaddq $OVERFLOW, $ACC7, $ACC7
613 vpmuludq 32*8+512($a_ptr), $B, $ACC8
614 vpaddq $T0, $ACC0, $OVERFLOW
615 vpaddq $T0, $ACC1, $ACC0
616 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
617 vpaddq $T0, $ACC2, $ACC1
618 vpmuludq 32*3(%rax), $Y, $T0
619 vpaddq $OVERFLOW, $ACC0, $ACC0
620 vpaddq $T0, $ACC3, $ACC2
621 .byte 0x67
622 vmovdqa $ACC4, $ACC3
623 vpsllq \$18, $Y, $OVERFLOW
624 .byte 0x67
625 vmovdqa $ACC5, $ACC4
626 vpmuludq %ymm14, $Y, $T0
627 vpaddq $OVERFLOW, $ACC6, $ACC5
628 vpmuludq %ymm15, $Y, $OVERFLOW
629 vpaddq $T0, $ACC7, $ACC6
630 vpaddq $OVERFLOW, $ACC8, $ACC7
631
632 dec $itr
633 jnz .Lavx2_mul_x4_loop
634
635 vpxor $ACC8, $ACC8, $ACC8
636
637 ret
638.size avx2_mul_x4,.-avx2_mul_x4
639
640# Function optimized for the constant 1
641################################################################################
642# void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
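# [added note] .LONE holds 1 in the *2^261 Montgomery form (2^261 mod p), so
# multiplying by it leaves the represented residue unchanged while pushing the
# operand through a full reduction pass; callers below use it as a cheap
# renormalization (e.g. Z3 = H in the add_affines variant and the final
# Result.Y in both point-addition routines).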
643.type avx2_mul_by1_x4,\@abi-omnipotent
644.align 32
645avx2_mul_by1_x4:
646 lea .LAVX2_POLY(%rip), %rax
647
648 vpxor $ACC0, $ACC0, $ACC0
649 vpxor $ACC1, $ACC1, $ACC1
650 vpxor $ACC2, $ACC2, $ACC2
651 vpxor $ACC3, $ACC3, $ACC3
652 vpxor $ACC4, $ACC4, $ACC4
653 vpxor $ACC5, $ACC5, $ACC5
654 vpxor $ACC6, $ACC6, $ACC6
655 vpxor $ACC7, $ACC7, $ACC7
656 vpxor $ACC8, $ACC8, $ACC8
657
658 vmovdqa 32*3+.LONE(%rip), %ymm14
659 vmovdqa 32*7+.LONE(%rip), %ymm15
660
661 mov $n_digits, $itr
662 jmp .Lavx2_mul_by1_x4_loop
663
664.align 32
665.Lavx2_mul_by1_x4_loop:
666 vmovdqa 32*0($a_ptr), $B
667 .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr
668
669 vpsllq \$5, $B, $OVERFLOW
670 vpmuludq %ymm14, $B, $T0
671 vpaddq $OVERFLOW, $ACC0, $ACC0
672 vpaddq $T0, $ACC3, $ACC3
673 .byte 0x67
674 vpmuludq $AND_MASK, $B, $T0
675 vpand $AND_MASK, $ACC0, $Y
676 vpaddq $T0, $ACC4, $ACC4
677 vpaddq $T0, $ACC5, $ACC5
678 vpaddq $T0, $ACC6, $ACC6
679 vpsllq \$23, $B, $T0
680
681 .byte 0x67,0x67
682 vpmuludq %ymm15, $B, $OVERFLOW
683 vpsubq $T0, $ACC6, $ACC6
684
685 vpmuludq $AND_MASK, $Y, $T0
686 vpaddq $OVERFLOW, $ACC7, $ACC7
687 vpaddq $T0, $ACC0, $OVERFLOW
688 vpaddq $T0, $ACC1, $ACC0
689 .byte 0x67,0x67
690 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
691 vpaddq $T0, $ACC2, $ACC1
692 vpmuludq 32*3(%rax), $Y, $T0
693 vpaddq $OVERFLOW, $ACC0, $ACC0
694 vpaddq $T0, $ACC3, $ACC2
695 vmovdqa $ACC4, $ACC3
696 vpsllq \$18, $Y, $OVERFLOW
697 vmovdqa $ACC5, $ACC4
698 vpmuludq 32*7(%rax), $Y, $T0
699 vpaddq $OVERFLOW, $ACC6, $ACC5
700 vpaddq $T0, $ACC7, $ACC6
701 vpmuludq 32*8(%rax), $Y, $ACC7
702
703 dec $itr
704 jnz .Lavx2_mul_by1_x4_loop
705
706 ret
707.size avx2_mul_by1_x4,.-avx2_mul_by1_x4
708
709################################################################################
710# void avx2_sqr_x4(void* RESULTx4, void *Ax4); (%rcx: caller-supplied temporary vector)
711.type avx2_sqr_x4,\@abi-omnipotent
712.align 32
713avx2_sqr_x4:
714 lea .LAVX2_POLY(%rip), %rax
715
716 vmovdqa 32*7(%rax), %ymm14
717 vmovdqa 32*8(%rax), %ymm15
718
719 vmovdqa 32*0($a_ptr), $B
720 vmovdqa 32*1($a_ptr), $ACC1
721 vmovdqa 32*2($a_ptr), $ACC2
722 vmovdqa 32*3($a_ptr), $ACC3
723 vmovdqa 32*4($a_ptr), $ACC4
724 vmovdqa 32*5($a_ptr), $ACC5
725 vmovdqa 32*6($a_ptr), $ACC6
726 vmovdqa 32*7($a_ptr), $ACC7
727 vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7
728 vmovdqa 32*8($a_ptr), $ACC8
729 vpaddq $ACC2, $ACC2, $ACC2
730 vmovdqa $ACC1, 32*0(%rcx)
731 vpaddq $ACC3, $ACC3, $ACC3
732 vmovdqa $ACC2, 32*1(%rcx)
733 vpaddq $ACC4, $ACC4, $ACC4
734 vmovdqa $ACC3, 32*2(%rcx)
735 vpaddq $ACC5, $ACC5, $ACC5
736 vmovdqa $ACC4, 32*3(%rcx)
737 vpaddq $ACC6, $ACC6, $ACC6
738 vmovdqa $ACC5, 32*4(%rcx)
739 vpaddq $ACC7, $ACC7, $ACC7
740 vmovdqa $ACC6, 32*5(%rcx)
741 vpaddq $ACC8, $ACC8, $ACC8
742 vmovdqa $ACC7, 32*6(%rcx)
743 vmovdqa $ACC8, 32*7(%rcx)
744
745 #itr 1
746 vpmuludq $B, $B, $ACC0
747 vpmuludq $B, $ACC1, $ACC1
748 vpand $AND_MASK, $ACC0, $Y
749 vpmuludq $B, $ACC2, $ACC2
750 vpmuludq $B, $ACC3, $ACC3
751 vpmuludq $B, $ACC4, $ACC4
752 vpmuludq $B, $ACC5, $ACC5
753 vpmuludq $B, $ACC6, $ACC6
754 vpmuludq $AND_MASK, $Y, $T0
755 vpmuludq $B, $ACC7, $ACC7
756 vpmuludq $B, $ACC8, $ACC8
757 vmovdqa 32*1($a_ptr), $B
758
759 vpaddq $T0, $ACC0, $OVERFLOW
760 vpaddq $T0, $ACC1, $ACC0
761 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
762 vpaddq $T0, $ACC2, $ACC1
763 vpmuludq 32*3(%rax), $Y, $T0
764 vpaddq $OVERFLOW, $ACC0, $ACC0
765 vpaddq $T0, $ACC3, $ACC2
766 vmovdqa $ACC4, $ACC3
767 vpsllq \$18, $Y, $T0
768 vmovdqa $ACC5, $ACC4
769 vpmuludq %ymm14, $Y, $OVERFLOW
770 vpaddq $T0, $ACC6, $ACC5
771 vpmuludq %ymm15, $Y, $T0
772 vpaddq $OVERFLOW, $ACC7, $ACC6
773 vpaddq $T0, $ACC8, $ACC7
774
775 #itr 2
776 vpmuludq $B, $B, $OVERFLOW
777 vpand $AND_MASK, $ACC0, $Y
778 vpmuludq 32*1(%rcx), $B, $T0
779 vpaddq $OVERFLOW, $ACC1, $ACC1
780 vpmuludq 32*2(%rcx), $B, $OVERFLOW
781 vpaddq $T0, $ACC2, $ACC2
782 vpmuludq 32*3(%rcx), $B, $T0
783 vpaddq $OVERFLOW, $ACC3, $ACC3
784 vpmuludq 32*4(%rcx), $B, $OVERFLOW
785 vpaddq $T0, $ACC4, $ACC4
786 vpmuludq 32*5(%rcx), $B, $T0
787 vpaddq $OVERFLOW, $ACC5, $ACC5
788 vpmuludq 32*6(%rcx), $B, $OVERFLOW
789 vpaddq $T0, $ACC6, $ACC6
790
791 vpmuludq $AND_MASK, $Y, $T0
792 vpaddq $OVERFLOW, $ACC7, $ACC7
793 vpmuludq 32*7(%rcx), $B, $ACC8
794 vmovdqa 32*2($a_ptr), $B
795 vpaddq $T0, $ACC0, $OVERFLOW
796 vpaddq $T0, $ACC1, $ACC0
797 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
798 vpaddq $T0, $ACC2, $ACC1
799 vpmuludq 32*3(%rax), $Y, $T0
800 vpaddq $OVERFLOW, $ACC0, $ACC0
801 vpaddq $T0, $ACC3, $ACC2
802 vmovdqa $ACC4, $ACC3
803 vpsllq \$18, $Y, $T0
804 vmovdqa $ACC5, $ACC4
805 vpmuludq %ymm14, $Y, $OVERFLOW
806 vpaddq $T0, $ACC6, $ACC5
807 vpmuludq %ymm15, $Y, $T0
808 vpaddq $OVERFLOW, $ACC7, $ACC6
809 vpaddq $T0, $ACC8, $ACC7
810
811 #itr 3
812 vpmuludq $B, $B, $T0
813 vpand $AND_MASK, $ACC0, $Y
814 vpmuludq 32*2(%rcx), $B, $OVERFLOW
815 vpaddq $T0, $ACC2, $ACC2
816 vpmuludq 32*3(%rcx), $B, $T0
817 vpaddq $OVERFLOW, $ACC3, $ACC3
818 vpmuludq 32*4(%rcx), $B, $OVERFLOW
819 vpaddq $T0, $ACC4, $ACC4
820 vpmuludq 32*5(%rcx), $B, $T0
821 vpaddq $OVERFLOW, $ACC5, $ACC5
822 vpmuludq 32*6(%rcx), $B, $OVERFLOW
823 vpaddq $T0, $ACC6, $ACC6
824
825 vpmuludq $AND_MASK, $Y, $T0
826 vpaddq $OVERFLOW, $ACC7, $ACC7
827 vpmuludq 32*7(%rcx), $B, $ACC8
828 vmovdqa 32*3($a_ptr), $B
829 vpaddq $T0, $ACC0, $OVERFLOW
830 vpaddq $T0, $ACC1, $ACC0
831 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
832 vpaddq $T0, $ACC2, $ACC1
833 vpmuludq 32*3(%rax), $Y, $T0
834 vpaddq $OVERFLOW, $ACC0, $ACC0
835 vpaddq $T0, $ACC3, $ACC2
836 vmovdqa $ACC4, $ACC3
837 vpsllq \$18, $Y, $T0
838 vmovdqa $ACC5, $ACC4
839 vpmuludq %ymm14, $Y, $OVERFLOW
840 vpaddq $T0, $ACC6, $ACC5
841 vpmuludq %ymm15, $Y, $T0
842 vpand $AND_MASK, $ACC0, $Y
843 vpaddq $OVERFLOW, $ACC7, $ACC6
844 vpaddq $T0, $ACC8, $ACC7
845
846 #itr 4
847 vpmuludq $B, $B, $OVERFLOW
848 vpmuludq 32*3(%rcx), $B, $T0
849 vpaddq $OVERFLOW, $ACC3, $ACC3
850 vpmuludq 32*4(%rcx), $B, $OVERFLOW
851 vpaddq $T0, $ACC4, $ACC4
852 vpmuludq 32*5(%rcx), $B, $T0
853 vpaddq $OVERFLOW, $ACC5, $ACC5
854 vpmuludq 32*6(%rcx), $B, $OVERFLOW
855 vpaddq $T0, $ACC6, $ACC6
856
857 vpmuludq $AND_MASK, $Y, $T0
858 vpaddq $OVERFLOW, $ACC7, $ACC7
859 vpmuludq 32*7(%rcx), $B, $ACC8
860 vmovdqa 32*4($a_ptr), $B
861 vpaddq $T0, $ACC0, $OVERFLOW
862 vpaddq $T0, $ACC1, $ACC0
863 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
864 vpaddq $T0, $ACC2, $ACC1
865 vpmuludq 32*3(%rax), $Y, $T0
866 vpaddq $OVERFLOW, $ACC0, $ACC0
867 vpaddq $T0, $ACC3, $ACC2
868 vmovdqa $ACC4, $ACC3
869 vpsllq \$18, $Y, $T0
870 vmovdqa $ACC5, $ACC4
871 vpmuludq %ymm14, $Y, $OVERFLOW
872 vpaddq $T0, $ACC6, $ACC5
873 vpmuludq %ymm15, $Y, $T0
874 vpand $AND_MASK, $ACC0, $Y
875 vpaddq $OVERFLOW, $ACC7, $ACC6
876 vpaddq $T0, $ACC8, $ACC7
877
878 #itr 5
879 vpmuludq $B, $B, $T0
880 vpmuludq 32*4(%rcx), $B, $OVERFLOW
881 vpaddq $T0, $ACC4, $ACC4
882 vpmuludq 32*5(%rcx), $B, $T0
883 vpaddq $OVERFLOW, $ACC5, $ACC5
884 vpmuludq 32*6(%rcx), $B, $OVERFLOW
885 vpaddq $T0, $ACC6, $ACC6
886
887 vpmuludq $AND_MASK, $Y, $T0
888 vpaddq $OVERFLOW, $ACC7, $ACC7
889 vpmuludq 32*7(%rcx), $B, $ACC8
890 vmovdqa 32*5($a_ptr), $B
891 vpaddq $T0, $ACC0, $OVERFLOW
892 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
893 vpaddq $T0, $ACC1, $ACC0
894 vpaddq $T0, $ACC2, $ACC1
895 vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0
896 vpaddq $OVERFLOW, $ACC0, $ACC0
897 vpaddq $T0, $ACC3, $ACC2
898 vmovdqa $ACC4, $ACC3
899 vpsllq \$18, $Y, $T0
900 vmovdqa $ACC5, $ACC4
901 vpmuludq %ymm14, $Y, $OVERFLOW
902 vpaddq $T0, $ACC6, $ACC5
903 vpmuludq %ymm15, $Y, $T0
904 vpand $AND_MASK, $ACC0, $Y
905 vpaddq $OVERFLOW, $ACC7, $ACC6
906 vpaddq $T0, $ACC8, $ACC7
907
908 #itr 6
909 vpmuludq $B, $B, $OVERFLOW
910 vpmuludq 32*5(%rcx), $B, $T0
911 vpaddq $OVERFLOW, $ACC5, $ACC5
912 vpmuludq 32*6(%rcx), $B, $OVERFLOW
913 vpaddq $T0, $ACC6, $ACC6
914
915 vpmuludq $AND_MASK, $Y, $T0
916 vpaddq $OVERFLOW, $ACC7, $ACC7
917 vpmuludq 32*7(%rcx), $B, $ACC8
918 vmovdqa 32*6($a_ptr), $B
919 vpaddq $T0, $ACC0, $OVERFLOW
920 vpaddq $T0, $ACC1, $ACC0
921 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
922 vpaddq $T0, $ACC2, $ACC1
923 vpmuludq 32*3(%rax), $Y, $T0
924 vpaddq $OVERFLOW, $ACC0, $ACC0
925 vpaddq $T0, $ACC3, $ACC2
926 vmovdqa $ACC4, $ACC3
927 vpsllq \$18, $Y, $T0
928 vmovdqa $ACC5, $ACC4
929 vpmuludq %ymm14, $Y, $OVERFLOW
930 vpaddq $T0, $ACC6, $ACC5
931 vpmuludq %ymm15, $Y, $T0
932 vpand $AND_MASK, $ACC0, $Y
933 vpaddq $OVERFLOW, $ACC7, $ACC6
934 vpaddq $T0, $ACC8, $ACC7
935
936 #itr 7
937 vpmuludq $B, $B, $T0
938 vpmuludq 32*6(%rcx), $B, $OVERFLOW
939 vpaddq $T0, $ACC6, $ACC6
940
941 vpmuludq $AND_MASK, $Y, $T0
942 vpaddq $OVERFLOW, $ACC7, $ACC7
943 vpmuludq 32*7(%rcx), $B, $ACC8
944 vmovdqa 32*7($a_ptr), $B
945 vpaddq $T0, $ACC0, $OVERFLOW
946 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
947 vpaddq $T0, $ACC1, $ACC0
948 vpaddq $T0, $ACC2, $ACC1
949 vpmuludq 32*3(%rax), $Y, $T0
950 vpaddq $OVERFLOW, $ACC0, $ACC0
951 vpaddq $T0, $ACC3, $ACC2
952 vmovdqa $ACC4, $ACC3
953 vpsllq \$18, $Y, $T0
954 vmovdqa $ACC5, $ACC4
955 vpmuludq %ymm14, $Y, $OVERFLOW
956 vpaddq $T0, $ACC6, $ACC5
957 vpmuludq %ymm15, $Y, $T0
958 vpand $AND_MASK, $ACC0, $Y
959 vpaddq $OVERFLOW, $ACC7, $ACC6
960 vpaddq $T0, $ACC8, $ACC7
961
962 #itr 8
963 vpmuludq $B, $B, $OVERFLOW
964
965 vpmuludq $AND_MASK, $Y, $T0
966 vpaddq $OVERFLOW, $ACC7, $ACC7
967 vpmuludq 32*7(%rcx), $B, $ACC8
968 vmovdqa 32*8($a_ptr), $B
969 vpaddq $T0, $ACC0, $OVERFLOW
970 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
971 vpaddq $T0, $ACC1, $ACC0
972 vpaddq $T0, $ACC2, $ACC1
973 vpmuludq 32*3(%rax), $Y, $T0
974 vpaddq $OVERFLOW, $ACC0, $ACC0
975 vpaddq $T0, $ACC3, $ACC2
976 vmovdqa $ACC4, $ACC3
977 vpsllq \$18, $Y, $T0
978 vmovdqa $ACC5, $ACC4
979 vpmuludq %ymm14, $Y, $OVERFLOW
980 vpaddq $T0, $ACC6, $ACC5
981 vpmuludq %ymm15, $Y, $T0
982 vpand $AND_MASK, $ACC0, $Y
983 vpaddq $OVERFLOW, $ACC7, $ACC6
984 vpaddq $T0, $ACC8, $ACC7
985
986 #itr 9
987 vpmuludq $B, $B, $ACC8
988
989 vpmuludq $AND_MASK, $Y, $T0
990 vpaddq $T0, $ACC0, $OVERFLOW
991 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
992 vpaddq $T0, $ACC1, $ACC0
993 vpaddq $T0, $ACC2, $ACC1
994 vpmuludq 32*3(%rax), $Y, $T0
995 vpaddq $OVERFLOW, $ACC0, $ACC0
996 vpaddq $T0, $ACC3, $ACC2
997 vmovdqa $ACC4, $ACC3
998 vpsllq \$18, $Y, $T0
999 vmovdqa $ACC5, $ACC4
1000 vpmuludq %ymm14, $Y, $OVERFLOW
1001 vpaddq $T0, $ACC6, $ACC5
1002 vpmuludq %ymm15, $Y, $T0
1003 vpaddq $OVERFLOW, $ACC7, $ACC6
1004 vpaddq $T0, $ACC8, $ACC7
1005
1006 vpxor $ACC8, $ACC8, $ACC8
1007
1008 ret
1009.size avx2_sqr_x4,.-avx2_sqr_x4
1010
1011################################################################################
1012# void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
1013.type avx2_sub_x4,\@abi-omnipotent
1014.align 32
1015avx2_sub_x4:
1016 vmovdqa 32*0($a_ptr), $ACC0
1017 lea 160($a_ptr), $a_ptr
1018 lea .LAVX2_POLY_x8+128(%rip), %rax
1019 lea 128($b_ptr), $b_ptr
1020 vmovdqa 32*1-160($a_ptr), $ACC1
1021 vmovdqa 32*2-160($a_ptr), $ACC2
1022 vmovdqa 32*3-160($a_ptr), $ACC3
1023 vmovdqa 32*4-160($a_ptr), $ACC4
1024 vmovdqa 32*5-160($a_ptr), $ACC5
1025 vmovdqa 32*6-160($a_ptr), $ACC6
1026 vmovdqa 32*7-160($a_ptr), $ACC7
1027 vmovdqa 32*8-160($a_ptr), $ACC8
1028
1029 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1030 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1031 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1032 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1033 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1034 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1035 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1036 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1037 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1038
1039 vpsubq 32*0-128($b_ptr), $ACC0, $ACC0
1040 vpsubq 32*1-128($b_ptr), $ACC1, $ACC1
1041 vpsubq 32*2-128($b_ptr), $ACC2, $ACC2
1042 vpsubq 32*3-128($b_ptr), $ACC3, $ACC3
1043 vpsubq 32*4-128($b_ptr), $ACC4, $ACC4
1044 vpsubq 32*5-128($b_ptr), $ACC5, $ACC5
1045 vpsubq 32*6-128($b_ptr), $ACC6, $ACC6
1046 vpsubq 32*7-128($b_ptr), $ACC7, $ACC7
1047 vpsubq 32*8-128($b_ptr), $ACC8, $ACC8
1048
1049 ret
1050.size avx2_sub_x4,.-avx2_sub_x4
1051
1052.type avx2_select_n_store,\@abi-omnipotent
1053.align 32
1054avx2_select_n_store:
1055 vmovdqa `8+32*9*8`(%rsp), $Y
1056 vpor `8+32*9*8+32`(%rsp), $Y, $Y
1057
1058 vpandn $ACC0, $Y, $ACC0
1059 vpandn $ACC1, $Y, $ACC1
1060 vpandn $ACC2, $Y, $ACC2
1061 vpandn $ACC3, $Y, $ACC3
1062 vpandn $ACC4, $Y, $ACC4
1063 vpandn $ACC5, $Y, $ACC5
1064 vpandn $ACC6, $Y, $ACC6
1065 vmovdqa `8+32*9*8+32`(%rsp), $B
1066 vpandn $ACC7, $Y, $ACC7
1067 vpandn `8+32*9*8`(%rsp), $B, $B
1068 vpandn $ACC8, $Y, $ACC8
1069
1070 vpand 32*0(%rsi), $B, $T0
1071 lea 160(%rsi), %rax
1072 vpand 32*1(%rsi), $B, $Y
1073 vpxor $T0, $ACC0, $ACC0
1074 vpand 32*2(%rsi), $B, $T0
1075 vpxor $Y, $ACC1, $ACC1
1076 vpand 32*3(%rsi), $B, $Y
1077 vpxor $T0, $ACC2, $ACC2
1078 vpand 32*4-160(%rax), $B, $T0
1079 vpxor $Y, $ACC3, $ACC3
1080 vpand 32*5-160(%rax), $B, $Y
1081 vpxor $T0, $ACC4, $ACC4
1082 vpand 32*6-160(%rax), $B, $T0
1083 vpxor $Y, $ACC5, $ACC5
1084 vpand 32*7-160(%rax), $B, $Y
1085 vpxor $T0, $ACC6, $ACC6
1086 vpand 32*8-160(%rax), $B, $T0
1087 vmovdqa `8+32*9*8+32`(%rsp), $B
1088 vpxor $Y, $ACC7, $ACC7
1089
1090 vpand 32*0(%rdx), $B, $Y
1091 lea 160(%rdx), %rax
1092 vpxor $T0, $ACC8, $ACC8
1093 vpand 32*1(%rdx), $B, $T0
1094 vpxor $Y, $ACC0, $ACC0
1095 vpand 32*2(%rdx), $B, $Y
1096 vpxor $T0, $ACC1, $ACC1
1097 vpand 32*3(%rdx), $B, $T0
1098 vpxor $Y, $ACC2, $ACC2
1099 vpand 32*4-160(%rax), $B, $Y
1100 vpxor $T0, $ACC3, $ACC3
1101 vpand 32*5-160(%rax), $B, $T0
1102 vpxor $Y, $ACC4, $ACC4
1103 vpand 32*6-160(%rax), $B, $Y
1104 vpxor $T0, $ACC5, $ACC5
1105 vpand 32*7-160(%rax), $B, $T0
1106 vpxor $Y, $ACC6, $ACC6
1107 vpand 32*8-160(%rax), $B, $Y
1108 vpxor $T0, $ACC7, $ACC7
1109 vpxor $Y, $ACC8, $ACC8
1110 `&STORE`
1111
1112 ret
1113.size avx2_select_n_store,.-avx2_select_n_store
1114___
1115$code.=<<___ if (0); # inlined
1116################################################################################
1117# void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
1118.type avx2_mul_by2_x4,\@abi-omnipotent
1119.align 32
1120avx2_mul_by2_x4:
1121 vmovdqa 32*0($a_ptr), $ACC0
1122 lea 160($a_ptr), %rax
1123 vmovdqa 32*1($a_ptr), $ACC1
1124 vmovdqa 32*2($a_ptr), $ACC2
1125 vmovdqa 32*3($a_ptr), $ACC3
1126 vmovdqa 32*4-160(%rax), $ACC4
1127 vmovdqa 32*5-160(%rax), $ACC5
1128 vmovdqa 32*6-160(%rax), $ACC6
1129 vmovdqa 32*7-160(%rax), $ACC7
1130 vmovdqa 32*8-160(%rax), $ACC8
1131
1132 vpaddq $ACC0, $ACC0, $ACC0
1133 vpaddq $ACC1, $ACC1, $ACC1
1134 vpaddq $ACC2, $ACC2, $ACC2
1135 vpaddq $ACC3, $ACC3, $ACC3
1136 vpaddq $ACC4, $ACC4, $ACC4
1137 vpaddq $ACC5, $ACC5, $ACC5
1138 vpaddq $ACC6, $ACC6, $ACC6
1139 vpaddq $ACC7, $ACC7, $ACC7
1140 vpaddq $ACC8, $ACC8, $ACC8
1141
1142 ret
1143.size avx2_mul_by2_x4,.-avx2_mul_by2_x4
1144___
1145my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
1146my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
1147
1148$code.=<<___;
1149################################################################################
1150# void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
1151.globl ecp_nistz256_avx2_point_add_affine_x4
1152.type ecp_nistz256_avx2_point_add_affine_x4,\@function,3
1153.align 32
1154ecp_nistz256_avx2_point_add_affine_x4:
1155 mov %rsp, %rax
1156 push %rbp
1157 vzeroupper
1158___
1159$code.=<<___ if ($win64);
1160 lea -16*10(%rsp), %rsp
1161 vmovaps %xmm6, -8-16*10(%rax)
1162 vmovaps %xmm7, -8-16*9(%rax)
1163 vmovaps %xmm8, -8-16*8(%rax)
1164 vmovaps %xmm9, -8-16*7(%rax)
1165 vmovaps %xmm10, -8-16*6(%rax)
1166 vmovaps %xmm11, -8-16*5(%rax)
1167 vmovaps %xmm12, -8-16*4(%rax)
1168 vmovaps %xmm13, -8-16*3(%rax)
1169 vmovaps %xmm14, -8-16*2(%rax)
1170 vmovaps %xmm15, -8-16*1(%rax)
1171___
1172$code.=<<___;
1173 lea -8(%rax), %rbp
1174
1175# Result + 32*0 = Result.X
1176# Result + 32*9 = Result.Y
1177# Result + 32*18 = Result.Z
1178
1179# A + 32*0 = A.X
1180# A + 32*9 = A.Y
1181# A + 32*18 = A.Z
1182
1183# B + 32*0 = B.X
1184# B + 32*9 = B.Y
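# [added note] Each of X, Y, Z is nine 256-bit vectors of 29-bit digits
# (32*9 = 288 bytes), holding that coordinate of four points in parallel,
# hence the 32*9 strides. The steps below follow the usual Jacobian mixed
# addition with Z2 = 1: U2 = X2*Z1^2, S2 = Y2*Z1^3, H = U2 - X1, R = S2 - Y1,
# X3 = R^2 - H^3 - 2*X1*H^2, Y3 = R*(X1*H^2 - X3) - Y1*H^3, Z3 = H*Z1.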
1185
1186 sub \$`32*9*8+32*2+32*8`, %rsp
1187 and \$-64, %rsp
1188
1189 mov $r_ptr_in, $r_ptr
1190 mov $a_ptr_in, $a_ptr
1191 mov $b_ptr_in, $b_ptr
1192
1193 vmovdqa 32*0($a_ptr_in), %ymm0
1194 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1195 vpxor %ymm1, %ymm1, %ymm1
1196 lea 256($a_ptr_in), %rax # size optimization
1197 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1198 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1199 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1200 vpor 32*4-256(%rax), %ymm0, %ymm0
1201 lea 256(%rax), %rcx # size optimization
1202 vpor 32*5-256(%rax), %ymm0, %ymm0
1203 vpor 32*6-256(%rax), %ymm0, %ymm0
1204 vpor 32*7-256(%rax), %ymm0, %ymm0
1205 vpor 32*8-256(%rax), %ymm0, %ymm0
1206 vpor 32*9-256(%rax), %ymm0, %ymm0
1207 vpor 32*10-256(%rax), %ymm0, %ymm0
1208 vpor 32*11-256(%rax), %ymm0, %ymm0
1209 vpor 32*12-512(%rcx), %ymm0, %ymm0
1210 vpor 32*13-512(%rcx), %ymm0, %ymm0
1211 vpor 32*14-512(%rcx), %ymm0, %ymm0
1212 vpor 32*15-512(%rcx), %ymm0, %ymm0
1213 vpor 32*16-512(%rcx), %ymm0, %ymm0
1214 vpor 32*17-512(%rcx), %ymm0, %ymm0
1215 vpcmpeqq %ymm1, %ymm0, %ymm0
1216 vmovdqa %ymm0, `32*9*8`(%rsp)
1217
1218 vpxor %ymm1, %ymm1, %ymm1
1219 vmovdqa 32*0($b_ptr), %ymm0
1220 lea 256($b_ptr), %rax # size optimization
1221 vpor 32*1($b_ptr), %ymm0, %ymm0
1222 vpor 32*2($b_ptr), %ymm0, %ymm0
1223 vpor 32*3($b_ptr), %ymm0, %ymm0
1224 vpor 32*4-256(%rax), %ymm0, %ymm0
1225 lea 256(%rax), %rcx # size optimization
1226 vpor 32*5-256(%rax), %ymm0, %ymm0
1227 vpor 32*6-256(%rax), %ymm0, %ymm0
1228 vpor 32*7-256(%rax), %ymm0, %ymm0
1229 vpor 32*8-256(%rax), %ymm0, %ymm0
1230 vpor 32*9-256(%rax), %ymm0, %ymm0
1231 vpor 32*10-256(%rax), %ymm0, %ymm0
1232 vpor 32*11-256(%rax), %ymm0, %ymm0
1233 vpor 32*12-512(%rcx), %ymm0, %ymm0
1234 vpor 32*13-512(%rcx), %ymm0, %ymm0
1235 vpor 32*14-512(%rcx), %ymm0, %ymm0
1236 vpor 32*15-512(%rcx), %ymm0, %ymm0
1237 vpor 32*16-512(%rcx), %ymm0, %ymm0
1238 vpor 32*17-512(%rcx), %ymm0, %ymm0
1239 vpcmpeqq %ymm1, %ymm0, %ymm0
1240 vmovdqa %ymm0, `32*9*8+32`(%rsp)
1241
1242 # Z1^2 = Z1*Z1
1243 lea `32*9*2`($a_ptr), %rsi
1244 lea `32*9*2`(%rsp), %rdi
1245 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1246 call avx2_sqr_x4
1247 call avx2_normalize_n_store
1248
1249 # U2 = X2*Z1^2
1250 lea `32*9*0`($b_ptr), %rsi
1251 lea `32*9*2`(%rsp), %rdx
1252 lea `32*9*0`(%rsp), %rdi
1253 call avx2_mul_x4
1254 #call avx2_normalize
1255 `&STORE`
1256
1257 # S2 = Z1*Z1^2 = Z1^3
1258 lea `32*9*2`($a_ptr), %rsi
1259 lea `32*9*2`(%rsp), %rdx
1260 lea `32*9*1`(%rsp), %rdi
1261 call avx2_mul_x4
1262 call avx2_normalize_n_store
1263
1264 # S2 = S2*Y2 = Y2*Z1^3
1265 lea `32*9*1`($b_ptr), %rsi
1266 lea `32*9*1`(%rsp), %rdx
1267 lea `32*9*1`(%rsp), %rdi
1268 call avx2_mul_x4
1269 call avx2_normalize_n_store
1270
1271 # H = U2 - U1 = U2 - X1
1272 lea `32*9*0`(%rsp), %rsi
1273 lea `32*9*0`($a_ptr), %rdx
1274 lea `32*9*3`(%rsp), %rdi
1275 call avx2_sub_x4
1276 call avx2_normalize_n_store
1277
1278 # R = S2 - S1 = S2 - Y1
1279 lea `32*9*1`(%rsp), %rsi
1280 lea `32*9*1`($a_ptr), %rdx
1281 lea `32*9*4`(%rsp), %rdi
1282 call avx2_sub_x4
1283 call avx2_normalize_n_store
1284
1285 # Z3 = H*Z1*Z2
1286 lea `32*9*3`(%rsp), %rsi
1287 lea `32*9*2`($a_ptr), %rdx
1288 lea `32*9*2`($r_ptr), %rdi
1289 call avx2_mul_x4
1290 call avx2_normalize
1291
1292 lea .LONE(%rip), %rsi
1293 lea `32*9*2`($a_ptr), %rdx
1294 call avx2_select_n_store
1295
1296 # R^2 = R^2
1297 lea `32*9*4`(%rsp), %rsi
1298 lea `32*9*6`(%rsp), %rdi
1299 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1300 call avx2_sqr_x4
1301 call avx2_normalize_n_store
1302
1303 # H^2 = H^2
1304 lea `32*9*3`(%rsp), %rsi
1305 lea `32*9*5`(%rsp), %rdi
1306 call avx2_sqr_x4
1307 call avx2_normalize_n_store
1308
1309 # H^3 = H^2*H
1310 lea `32*9*3`(%rsp), %rsi
1311 lea `32*9*5`(%rsp), %rdx
1312 lea `32*9*7`(%rsp), %rdi
1313 call avx2_mul_x4
1314 call avx2_normalize_n_store
1315
1316 # U2 = U1*H^2
1317 lea `32*9*0`($a_ptr), %rsi
1318 lea `32*9*5`(%rsp), %rdx
1319 lea `32*9*0`(%rsp), %rdi
1320 call avx2_mul_x4
1321 #call avx2_normalize
1322 `&STORE`
1323
1324 # Hsqr = U2*2
1325 #lea 32*9*0(%rsp), %rsi
1326 #lea 32*9*5(%rsp), %rdi
1327 #call avx2_mul_by2_x4
1328
1329 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1330 lea `32*9*5`(%rsp), %rdi
1331 vpaddq $ACC1, $ACC1, $ACC1
1332 vpaddq $ACC2, $ACC2, $ACC2
1333 vpaddq $ACC3, $ACC3, $ACC3
1334 vpaddq $ACC4, $ACC4, $ACC4
1335 vpaddq $ACC5, $ACC5, $ACC5
1336 vpaddq $ACC6, $ACC6, $ACC6
1337 vpaddq $ACC7, $ACC7, $ACC7
1338 vpaddq $ACC8, $ACC8, $ACC8
1339 call avx2_normalize_n_store
1340
1341 # X3 = R^2 - H^3
1342 #lea 32*9*6(%rsp), %rsi
1343 #lea 32*9*7(%rsp), %rdx
1344 #lea 32*9*5(%rsp), %rcx
1345 #lea 32*9*0($r_ptr), %rdi
1346 #call avx2_sub_x4
1347 #NORMALIZE
1348 #STORE
1349
1350 # X3 = X3 - U2*2
1351 #lea 32*9*0($r_ptr), %rsi
1352 #lea 32*9*0($r_ptr), %rdi
1353 #call avx2_sub_x4
1354 #NORMALIZE
1355 #STORE
1356
1357 lea `32*9*6+128`(%rsp), %rsi
1358 lea .LAVX2_POLY_x2+128(%rip), %rax
1359 lea `32*9*7+128`(%rsp), %rdx
1360 lea `32*9*5+128`(%rsp), %rcx
1361 lea `32*9*0`($r_ptr), %rdi
1362
1363 vmovdqa 32*0-128(%rsi), $ACC0
1364 vmovdqa 32*1-128(%rsi), $ACC1
1365 vmovdqa 32*2-128(%rsi), $ACC2
1366 vmovdqa 32*3-128(%rsi), $ACC3
1367 vmovdqa 32*4-128(%rsi), $ACC4
1368 vmovdqa 32*5-128(%rsi), $ACC5
1369 vmovdqa 32*6-128(%rsi), $ACC6
1370 vmovdqa 32*7-128(%rsi), $ACC7
1371 vmovdqa 32*8-128(%rsi), $ACC8
1372
1373 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1374 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1375 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1376 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1377 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1378 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1379 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1380 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1381 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1382
1383 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1384 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1385 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1386 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1387 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1388 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1389 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1390 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1391 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1392
1393 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1394 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1395 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1396 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1397 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1398 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1399 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1400 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1401 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
1402 call avx2_normalize
1403
1404 lea 32*0($b_ptr), %rsi
1405 lea 32*0($a_ptr), %rdx
1406 call avx2_select_n_store
1407
1408 # H = U2 - X3
1409 lea `32*9*0`(%rsp), %rsi
1410 lea `32*9*0`($r_ptr), %rdx
1411 lea `32*9*3`(%rsp), %rdi
1412 call avx2_sub_x4
1413 call avx2_normalize_n_store
1414
1415 # H = H*R
1416 lea `32*9*3`(%rsp), %rsi
1417 lea `32*9*4`(%rsp), %rdx
1418 lea `32*9*3`(%rsp), %rdi
1419 call avx2_mul_x4
1420 call avx2_normalize_n_store
1421
1422 # S2 = S1 * H^3
1423 lea `32*9*7`(%rsp), %rsi
1424 lea `32*9*1`($a_ptr), %rdx
1425 lea `32*9*1`(%rsp), %rdi
1426 call avx2_mul_x4
1427 call avx2_normalize_n_store
1428
1429 # Y3 = H - S2
1430 lea `32*9*3`(%rsp), %rsi
1431 lea `32*9*1`(%rsp), %rdx
1432 lea `32*9*1`($r_ptr), %rdi
1433 call avx2_sub_x4
1434 call avx2_normalize
1435
1436 lea 32*9($b_ptr), %rsi
1437 lea 32*9($a_ptr), %rdx
1438 call avx2_select_n_store
1439
1440 #lea 32*9*0($r_ptr), %rsi
1441 #lea 32*9*0($r_ptr), %rdi
1442 #call avx2_mul_by1_x4
1443 #NORMALIZE
1444 #STORE
1445
1446 lea `32*9*1`($r_ptr), %rsi
1447 lea `32*9*1`($r_ptr), %rdi
1448 call avx2_mul_by1_x4
1449 call avx2_normalize_n_store
1450
1451 vzeroupper
1452___
1453$code.=<<___ if ($win64);
1454 movaps %xmm6, -16*10(%rbp)
1455 movaps %xmm7, -16*9(%rbp)
1456 movaps %xmm8, -16*8(%rbp)
1457 movaps %xmm9, -16*7(%rbp)
1458 movaps %xmm10, -16*6(%rbp)
1459 movaps %xmm11, -16*5(%rbp)
1460 movaps %xmm12, -16*4(%rbp)
1461 movaps %xmm13, -16*3(%rbp)
1462 movaps %xmm14, -16*2(%rbp)
1463 movaps %xmm15, -16*1(%rbp)
1464___
1465$code.=<<___;
1466 mov %rbp, %rsp
1467 pop %rbp
1468 ret
1469.size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
1470
1471################################################################################
1472# void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
1473.globl ecp_nistz256_avx2_point_add_affines_x4
1474.type ecp_nistz256_avx2_point_add_affines_x4,\@function,3
1475.align 32
1476ecp_nistz256_avx2_point_add_affines_x4:
1477 mov %rsp, %rax
1478 push %rbp
1479 vzeroupper
1480___
1481$code.=<<___ if ($win64);
1482 lea -16*10(%rsp), %rsp
1483 vmovaps %xmm6, -8-16*10(%rax)
1484 vmovaps %xmm7, -8-16*9(%rax)
1485 vmovaps %xmm8, -8-16*8(%rax)
1486 vmovaps %xmm9, -8-16*7(%rax)
1487 vmovaps %xmm10, -8-16*6(%rax)
1488 vmovaps %xmm11, -8-16*5(%rax)
1489 vmovaps %xmm12, -8-16*4(%rax)
1490 vmovaps %xmm13, -8-16*3(%rax)
1491 vmovaps %xmm14, -8-16*2(%rax)
1492 vmovaps %xmm15, -8-16*1(%rax)
1493___
1494$code.=<<___;
1495 lea -8(%rax), %rbp
1496
1497# Result + 32*0 = Result.X
1498# Result + 32*9 = Result.Y
1499# Result + 32*18 = Result.Z
1500
1501# A + 32*0 = A.X
1502# A + 32*9 = A.Y
1503
1504# B + 32*0 = B.X
1505# B + 32*9 = B.Y
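# [added note] Here both inputs are affine (Z1 = Z2 = 1), so U1 = X1, S1 = Y1,
# U2 = X2, S2 = Y2; the steps below reduce to H = X2 - X1, R = Y2 - Y1 and
# Z3 = H, with the rest of the computation matching
# ecp_nistz256_avx2_point_add_affine_x4 above.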
1506
1507 sub \$`32*9*8+32*2+32*8`, %rsp
1508 and \$-64, %rsp
1509
1510 mov $r_ptr_in, $r_ptr
1511 mov $a_ptr_in, $a_ptr
1512 mov $b_ptr_in, $b_ptr
1513
1514 vmovdqa 32*0($a_ptr_in), %ymm0
1515 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1516 vpxor %ymm1, %ymm1, %ymm1
1517 lea 256($a_ptr_in), %rax # size optimization
1518 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1519 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1520 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1521 vpor 32*4-256(%rax), %ymm0, %ymm0
1522 lea 256(%rax), %rcx # size optimization
1523 vpor 32*5-256(%rax), %ymm0, %ymm0
1524 vpor 32*6-256(%rax), %ymm0, %ymm0
1525 vpor 32*7-256(%rax), %ymm0, %ymm0
1526 vpor 32*8-256(%rax), %ymm0, %ymm0
1527 vpor 32*9-256(%rax), %ymm0, %ymm0
1528 vpor 32*10-256(%rax), %ymm0, %ymm0
1529 vpor 32*11-256(%rax), %ymm0, %ymm0
1530 vpor 32*12-512(%rcx), %ymm0, %ymm0
1531 vpor 32*13-512(%rcx), %ymm0, %ymm0
1532 vpor 32*14-512(%rcx), %ymm0, %ymm0
1533 vpor 32*15-512(%rcx), %ymm0, %ymm0
1534 vpor 32*16-512(%rcx), %ymm0, %ymm0
1535 vpor 32*17-512(%rcx), %ymm0, %ymm0
1536 vpcmpeqq %ymm1, %ymm0, %ymm0
1537 vmovdqa %ymm0, `32*9*8`(%rsp)
1538
1539 vpxor %ymm1, %ymm1, %ymm1
1540 vmovdqa 32*0($b_ptr), %ymm0
1541 lea 256($b_ptr), %rax # size optimization
1542 vpor 32*1($b_ptr), %ymm0, %ymm0
1543 vpor 32*2($b_ptr), %ymm0, %ymm0
1544 vpor 32*3($b_ptr), %ymm0, %ymm0
1545 vpor 32*4-256(%rax), %ymm0, %ymm0
1546 lea 256(%rax), %rcx # size optimization
1547 vpor 32*5-256(%rax), %ymm0, %ymm0
1548 vpor 32*6-256(%rax), %ymm0, %ymm0
1549 vpor 32*7-256(%rax), %ymm0, %ymm0
1550 vpor 32*8-256(%rax), %ymm0, %ymm0
1551 vpor 32*9-256(%rax), %ymm0, %ymm0
1552 vpor 32*10-256(%rax), %ymm0, %ymm0
1553 vpor 32*11-256(%rax), %ymm0, %ymm0
1554 vpor 32*12-512(%rcx), %ymm0, %ymm0
1555 vpor 32*13-512(%rcx), %ymm0, %ymm0
1556 vpor 32*14-512(%rcx), %ymm0, %ymm0
1557 vpor 32*15-512(%rcx), %ymm0, %ymm0
1558 vpor 32*16-512(%rcx), %ymm0, %ymm0
1559 vpor 32*17-512(%rcx), %ymm0, %ymm0
1560 vpcmpeqq %ymm1, %ymm0, %ymm0
1561 vmovdqa %ymm0, `32*9*8+32`(%rsp)
1562
1563 # H = U2 - U1 = X2 - X1
1564 lea `32*9*0`($b_ptr), %rsi
1565 lea `32*9*0`($a_ptr), %rdx
1566 lea `32*9*3`(%rsp), %rdi
1567 call avx2_sub_x4
1568 call avx2_normalize_n_store
1569
1570 # R = S2 - S1 = Y2 - Y1
1571 lea `32*9*1`($b_ptr), %rsi
1572 lea `32*9*1`($a_ptr), %rdx
1573 lea `32*9*4`(%rsp), %rdi
1574 call avx2_sub_x4
1575 call avx2_normalize_n_store
1576
1577 # Z3 = H*Z1*Z2 = H
1578 lea `32*9*3`(%rsp), %rsi
1579 lea `32*9*2`($r_ptr), %rdi
1580 call avx2_mul_by1_x4
1581 call avx2_normalize
1582
1583 vmovdqa `32*9*8`(%rsp), $B
1584 vpor `32*9*8+32`(%rsp), $B, $B
1585
1586 vpandn $ACC0, $B, $ACC0
1587 lea .LONE+128(%rip), %rax
1588 vpandn $ACC1, $B, $ACC1
1589 vpandn $ACC2, $B, $ACC2
1590 vpandn $ACC3, $B, $ACC3
1591 vpandn $ACC4, $B, $ACC4
1592 vpandn $ACC5, $B, $ACC5
1593 vpandn $ACC6, $B, $ACC6
1594 vpandn $ACC7, $B, $ACC7
1595
1596 vpand 32*0-128(%rax), $B, $T0
1597 vpandn $ACC8, $B, $ACC8
1598 vpand 32*1-128(%rax), $B, $Y
1599 vpxor $T0, $ACC0, $ACC0
1600 vpand 32*2-128(%rax), $B, $T0
1601 vpxor $Y, $ACC1, $ACC1
1602 vpand 32*3-128(%rax), $B, $Y
1603 vpxor $T0, $ACC2, $ACC2
1604 vpand 32*4-128(%rax), $B, $T0
1605 vpxor $Y, $ACC3, $ACC3
1606 vpand 32*5-128(%rax), $B, $Y
1607 vpxor $T0, $ACC4, $ACC4
1608 vpand 32*6-128(%rax), $B, $T0
1609 vpxor $Y, $ACC5, $ACC5
1610 vpand 32*7-128(%rax), $B, $Y
1611 vpxor $T0, $ACC6, $ACC6
1612 vpand 32*8-128(%rax), $B, $T0
1613 vpxor $Y, $ACC7, $ACC7
1614 vpxor $T0, $ACC8, $ACC8
1615 `&STORE`
1616
1617 # R^2 = R^2
1618 lea `32*9*4`(%rsp), %rsi
1619 lea `32*9*6`(%rsp), %rdi
1620 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1621 call avx2_sqr_x4
1622 call avx2_normalize_n_store
1623
1624 # H^2 = H^2
1625 lea `32*9*3`(%rsp), %rsi
1626 lea `32*9*5`(%rsp), %rdi
1627 call avx2_sqr_x4
1628 call avx2_normalize_n_store
1629
1630 # H^3 = H^2*H
1631 lea `32*9*3`(%rsp), %rsi
1632 lea `32*9*5`(%rsp), %rdx
1633 lea `32*9*7`(%rsp), %rdi
1634 call avx2_mul_x4
1635 call avx2_normalize_n_store
1636
1637 # U2 = U1*H^2
1638 lea `32*9*0`($a_ptr), %rsi
1639 lea `32*9*5`(%rsp), %rdx
1640 lea `32*9*0`(%rsp), %rdi
1641 call avx2_mul_x4
1642 #call avx2_normalize
1643 `&STORE`
1644
1645 # Hsqr = U2*2
1646 #lea 32*9*0(%rsp), %rsi
1647 #lea 32*9*5(%rsp), %rdi
1648 #call avx2_mul_by2_x4
1649
1650 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1651 lea `32*9*5`(%rsp), %rdi
1652 vpaddq $ACC1, $ACC1, $ACC1
1653 vpaddq $ACC2, $ACC2, $ACC2
1654 vpaddq $ACC3, $ACC3, $ACC3
1655 vpaddq $ACC4, $ACC4, $ACC4
1656 vpaddq $ACC5, $ACC5, $ACC5
1657 vpaddq $ACC6, $ACC6, $ACC6
1658 vpaddq $ACC7, $ACC7, $ACC7
1659 vpaddq $ACC8, $ACC8, $ACC8
1660 call avx2_normalize_n_store
1661
1662 # X3 = R^2 - H^3
1663 #lea 32*9*6(%rsp), %rsi
1664 #lea 32*9*7(%rsp), %rdx
1665 #lea 32*9*5(%rsp), %rcx
1666 #lea 32*9*0($r_ptr), %rdi
1667 #call avx2_sub_x4
1668 #NORMALIZE
1669 #STORE
1670
1671 # X3 = X3 - U2*2
1672 #lea 32*9*0($r_ptr), %rsi
1673 #lea 32*9*0($r_ptr), %rdi
1674 #call avx2_sub_x4
1675 #NORMALIZE
1676 #STORE
1677
1678 lea `32*9*6+128`(%rsp), %rsi
1679 lea .LAVX2_POLY_x2+128(%rip), %rax
1680 lea `32*9*7+128`(%rsp), %rdx
1681 lea `32*9*5+128`(%rsp), %rcx
1682 lea `32*9*0`($r_ptr), %rdi
1683
1684 vmovdqa 32*0-128(%rsi), $ACC0
1685 vmovdqa 32*1-128(%rsi), $ACC1
1686 vmovdqa 32*2-128(%rsi), $ACC2
1687 vmovdqa 32*3-128(%rsi), $ACC3
1688 vmovdqa 32*4-128(%rsi), $ACC4
1689 vmovdqa 32*5-128(%rsi), $ACC5
1690 vmovdqa 32*6-128(%rsi), $ACC6
1691 vmovdqa 32*7-128(%rsi), $ACC7
1692 vmovdqa 32*8-128(%rsi), $ACC8
1693
1694 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1695 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1696 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1697 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1698 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1699 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1700 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1701 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1702 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1703
1704 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1705 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1706 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1707 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1708 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1709 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1710 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1711 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1712 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1713
1714 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1715 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1716 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1717 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1718 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1719 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1720 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1721 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1722 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
1723 call avx2_normalize
1724
1725 lea 32*0($b_ptr), %rsi
1726 lea 32*0($a_ptr), %rdx
1727 call avx2_select_n_store
1728
1729 # H = U2 - X3
1730 lea `32*9*0`(%rsp), %rsi
1731 lea `32*9*0`($r_ptr), %rdx
1732 lea `32*9*3`(%rsp), %rdi
1733 call avx2_sub_x4
1734 call avx2_normalize_n_store
1735
1736 # H = H*R
1737 lea `32*9*3`(%rsp), %rsi
1738 lea `32*9*4`(%rsp), %rdx
1739 lea `32*9*3`(%rsp), %rdi
1740 call avx2_mul_x4
1741 call avx2_normalize_n_store
1742
1743 # S2 = S1 * H^3
1744 lea `32*9*7`(%rsp), %rsi
1745 lea `32*9*1`($a_ptr), %rdx
1746 lea `32*9*1`(%rsp), %rdi
1747 call avx2_mul_x4
1748 call avx2_normalize_n_store
1749
1750 # Y3 = H - S2
1751 lea `32*9*3`(%rsp), %rsi
1752 lea `32*9*1`(%rsp), %rdx
1753 lea `32*9*1`($r_ptr), %rdi
1754 call avx2_sub_x4
1755 call avx2_normalize
1756
1757 lea 32*9($b_ptr), %rsi
1758 lea 32*9($a_ptr), %rdx
1759 call avx2_select_n_store
1760
1761 #lea 32*9*0($r_ptr), %rsi
1762 #lea 32*9*0($r_ptr), %rdi
1763 #call avx2_mul_by1_x4
1764 #NORMALIZE
1765 #STORE
1766
1767 lea `32*9*1`($r_ptr), %rsi
1768 lea `32*9*1`($r_ptr), %rdi
1769 call avx2_mul_by1_x4
1770 call avx2_normalize_n_store
1771
1772 vzeroupper
1773___
1774$code.=<<___ if ($win64);
1775 movaps %xmm6, -16*10(%rbp)
1776 movaps %xmm7, -16*9(%rbp)
1777 movaps %xmm8, -16*8(%rbp)
1778 movaps %xmm9, -16*7(%rbp)
1779 movaps %xmm10, -16*6(%rbp)
1780 movaps %xmm11, -16*5(%rbp)
1781 movaps %xmm12, -16*4(%rbp)
1782 movaps %xmm13, -16*3(%rbp)
1783 movaps %xmm14, -16*2(%rbp)
1784 movaps %xmm15, -16*1(%rbp)
1785___
1786$code.=<<___;
1787 mov %rbp, %rsp
1788 pop %rbp
1789 ret
1790.size ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
1791
1792################################################################################
1793# void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
1794.globl ecp_nistz256_avx2_to_mont
1795.type ecp_nistz256_avx2_to_mont,\@function,2
1796.align 32
1797ecp_nistz256_avx2_to_mont:
1798 vzeroupper
1799___
1800$code.=<<___ if ($win64);
1801 lea -8-16*10(%rsp), %rsp
1802 vmovaps %xmm6, -8-16*10(%rax)
1803 vmovaps %xmm7, -8-16*9(%rax)
1804 vmovaps %xmm8, -8-16*8(%rax)
1805 vmovaps %xmm9, -8-16*7(%rax)
1806 vmovaps %xmm10, -8-16*6(%rax)
1807 vmovaps %xmm11, -8-16*5(%rax)
1808 vmovaps %xmm12, -8-16*4(%rax)
1809 vmovaps %xmm13, -8-16*3(%rax)
1810 vmovaps %xmm14, -8-16*2(%rax)
1811 vmovaps %xmm15, -8-16*1(%rax)
1812___
1813$code.=<<___;
1814 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1815 lea .LTO_MONT_AVX2(%rip), %rdx
1816 call avx2_mul_x4
1817 call avx2_normalize_n_store
1818
1819 vzeroupper
1820___
1821$code.=<<___ if ($win64);
1822 movaps 16*0(%rsp), %xmm6
1823 movaps 16*1(%rsp), %xmm7
1824 movaps 16*2(%rsp), %xmm8
1825 movaps 16*3(%rsp), %xmm9
1826 movaps 16*4(%rsp), %xmm10
1827 movaps 16*5(%rsp), %xmm11
1828 movaps 16*6(%rsp), %xmm12
1829 movaps 16*7(%rsp), %xmm13
1830 movaps 16*8(%rsp), %xmm14
1831 movaps 16*9(%rsp), %xmm15
1832 lea 8+16*10(%rsp), %rsp
1833___
1834$code.=<<___;
1835 ret
1836.size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
1837
1838################################################################################
1839# void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
1840.globl ecp_nistz256_avx2_from_mont
1841.type ecp_nistz256_avx2_from_mont,\@function,2
1842.align 32
1843ecp_nistz256_avx2_from_mont:
1844 vzeroupper
1845___
1846$code.=<<___ if ($win64);
1847 lea -8-16*10(%rsp), %rsp
1848 vmovaps %xmm6, -8-16*10(%rax)
1849 vmovaps %xmm7, -8-16*9(%rax)
1850 vmovaps %xmm8, -8-16*8(%rax)
1851 vmovaps %xmm9, -8-16*7(%rax)
1852 vmovaps %xmm10, -8-16*6(%rax)
1853 vmovaps %xmm11, -8-16*5(%rax)
1854 vmovaps %xmm12, -8-16*4(%rax)
1855 vmovaps %xmm13, -8-16*3(%rax)
1856 vmovaps %xmm14, -8-16*2(%rax)
1857 vmovaps %xmm15, -8-16*1(%rax)
1858___
$code.=<<___;
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	lea	.LFROM_MONT_AVX2(%rip), %rdx
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont

################################################################################
# void ecp_nistz256_avx2_set1(void* RESULTx4);
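#
# The body below copies the nine 32-byte digit vectors of .LONE into RESULTx4,
# i.e. it sets all four lanes to the constant one (presumably the Montgomery
# form of 1, as is usual for such tables).  Hedged C view, buffer name assumed:
#
#	static __attribute__((aligned(32))) unsigned char onex4[32*9];	/* hypothetical */
#	ecp_nistz256_avx2_set1(onex4);		/* every lane := 1 */
#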
.globl	ecp_nistz256_avx2_set1
.type	ecp_nistz256_avx2_set1,\@function,1
.align	32
ecp_nistz256_avx2_set1:
	lea	.LONE+128(%rip), %rax
	lea	128(%rdi), %rdi
	vzeroupper
	vmovdqa	32*0-128(%rax), %ymm0
	vmovdqa	32*1-128(%rax), %ymm1
	vmovdqa	32*2-128(%rax), %ymm2
	vmovdqa	32*3-128(%rax), %ymm3
	vmovdqa	32*4-128(%rax), %ymm4
	vmovdqa	32*5-128(%rax), %ymm5
	vmovdqa	%ymm0, 32*0-128(%rdi)
	vmovdqa	32*6-128(%rax), %ymm0
	vmovdqa	%ymm1, 32*1-128(%rdi)
	vmovdqa	32*7-128(%rax), %ymm1
	vmovdqa	%ymm2, 32*2-128(%rdi)
	vmovdqa	32*8-128(%rax), %ymm2
	vmovdqa	%ymm3, 32*3-128(%rdi)
	vmovdqa	%ymm4, 32*4-128(%rdi)
	vmovdqa	%ymm5, 32*5-128(%rdi)
	vmovdqa	%ymm0, 32*6-128(%rdi)
	vmovdqa	%ymm1, 32*7-128(%rdi)
	vmovdqa	%ymm2, 32*8-128(%rdi)

	vzeroupper
	ret
.size	ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
___
}
{
################################################################################
# void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
#				int index0, int index1, int index2, int index3);
################################################################################
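#
# Layout assumed by the gather loop below, as derived from its addressing
# (hedged): each lane has its own table of 64 entries of 2*32 bytes, the four
# per-lane tables are back to back (stride 32*64*2 bytes), and an index of 0
# matches nothing, leaving the all-zero point at infinity.  A rough C model of
# the constant-time selection (names hypothetical; the real code uses masks,
# never data-dependent branches):
#
#	for (i = 1; i <= 64; i++)			/* scan every entry         */
#		for (lane = 0; lane < 4; lane++)	/* four independent gathers */
#			if (index[lane] == (int)i)	/* realized with vpcmpeqd   */
#				memcpy(result + 64*lane,
#				       table + 32*64*2*lane + 64*(i - 1), 64);
#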

my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));

$code.=<<___;
.globl	ecp_nistz256_avx2_multi_gather_w7
.type	ecp_nistz256_avx2_multi_gather_w7,\@function,6
.align	32
ecp_nistz256_avx2_multi_gather_w7:
	vzeroupper
___
$code.=<<___	if ($win64);
	mov	%rsp, %rax		# keep the incoming %rsp for the xmm save area
	lea	-8-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	lea	.LIntOne(%rip), %rax

	vmovd	$index0, %xmm0
	vmovd	$index1, %xmm1
	vmovd	$index2, %xmm2
	vmovd	$index3, %xmm3

	vpxor	$R0a, $R0a, $R0a
	vpxor	$R0b, $R0b, $R0b
	vpxor	$R1a, $R1a, $R1a
	vpxor	$R1b, $R1b, $R1b
	vpxor	$R2a, $R2a, $R2a
	vpxor	$R2b, $R2b, $R2b
	vpxor	$R3a, $R3a, $R3a
	vpxor	$R3b, $R3b, $R3b
	vmovdqa	(%rax), $M0		# running index, starts at 1 in every dword lane

	# broadcast each 32-bit index to all eight dword lanes (permute with all-zero indices)
	vpermd	$INDEX0, $R0a, $INDEX0
	vpermd	$INDEX1, $R0a, $INDEX1
	vpermd	$INDEX2, $R0a, $INDEX2
	vpermd	$INDEX3, $R0a, $INDEX3

	mov	\$64, %ecx		# 64 table entries per lane
	lea	112($val), $val		# size optimization
	jmp	.Lmulti_select_loop_avx2

# INDEX=0 corresponds to the point at infinity (0,0); it matches no table
# entry, so the accumulators simply stay zero for that lane.
.align	32
.Lmulti_select_loop_avx2:
	vpcmpeqd	$INDEX0, $M0, $TMP0	# all-ones mask iff this is the requested entry

	vmovdqa	`32*0+32*64*2*0`($in_t), $T0
	vmovdqa	`32*1+32*64*2*0`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R0a, $R0a
	vpxor	$T1, $R0b, $R0b

	vpcmpeqd	$INDEX1, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*1`($in_t), $T0
	vmovdqa	`32*1+32*64*2*1`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R1a, $R1a
	vpxor	$T1, $R1b, $R1b

	vpcmpeqd	$INDEX2, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*2`($in_t), $T0
	vmovdqa	`32*1+32*64*2*2`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R2a, $R2a
	vpxor	$T1, $R2b, $R2b

	vpcmpeqd	$INDEX3, $M0, $TMP0

	vmovdqa	`32*0+32*64*2*3`($in_t), $T0
	vmovdqa	`32*1+32*64*2*3`($in_t), $T1
	vpand	$TMP0, $T0, $T0
	vpand	$TMP0, $T1, $T1
	vpxor	$T0, $R3a, $R3a
	vpxor	$T1, $R3b, $R3b

	vpaddd	(%rax), $M0, $M0	# increment the running index
	lea	32*2($in_t), $in_t

	dec	%ecx
	jnz	.Lmulti_select_loop_avx2

	vmovdqu	$R0a, 32*0-112($val)
	vmovdqu	$R0b, 32*1-112($val)
	vmovdqu	$R1a, 32*2-112($val)
	vmovdqu	$R1b, 32*3-112($val)
	vmovdqu	$R2a, 32*4-112($val)
	vmovdqu	$R2b, 32*5-112($val)
	vmovdqu	$R3a, 32*6-112($val)
	vmovdqu	$R3b, 32*7-112($val)

	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	16*0(%rsp), %xmm6
	movaps	16*1(%rsp), %xmm7
	movaps	16*2(%rsp), %xmm8
	movaps	16*3(%rsp), %xmm9
	movaps	16*4(%rsp), %xmm10
	movaps	16*5(%rsp), %xmm11
	movaps	16*6(%rsp), %xmm12
	movaps	16*7(%rsp), %xmm13
	movaps	16*8(%rsp), %xmm14
	movaps	16*9(%rsp), %xmm15
	lea	8+16*10(%rsp), %rsp
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7

.extern	OPENSSL_ia32cap_P
.globl	ecp_nistz_avx2_eligible
.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
.align	32
ecp_nistz_avx2_eligible:
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	shr	\$5,%eax		# bit 5 of the third capability word is the AVX2 flag
	and	\$1,%eax
	ret
.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
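#
# Hedged usage sketch (C): the glue that chooses between the AVX2 and the
# scalar ecp_nistz256 code is not part of this file, but a caller would
# typically gate on this predicate:
#
#	extern int ecp_nistz_avx2_eligible(void);
#
#	if (ecp_nistz_avx2_eligible())
#		/* use the ecp_nistz256_avx2_* 4-way routines */;
#	else
#		/* fall back to the non-AVX2 implementation */;
#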
___
}
}} else {{	# assembler is too old
$code.=<<___;
.text

.globl	ecp_nistz256_avx2_transpose_convert
.globl	ecp_nistz256_avx2_convert_transpose_back
.globl	ecp_nistz256_avx2_point_add_affine_x4
.globl	ecp_nistz256_avx2_point_add_affines_x4
.globl	ecp_nistz256_avx2_to_mont
.globl	ecp_nistz256_avx2_from_mont
.globl	ecp_nistz256_avx2_set1
.globl	ecp_nistz256_avx2_multi_gather_w7
.type	ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
ecp_nistz256_avx2_transpose_convert:
ecp_nistz256_avx2_convert_transpose_back:
ecp_nistz256_avx2_point_add_affine_x4:
ecp_nistz256_avx2_point_add_affines_x4:
ecp_nistz256_avx2_to_mont:
ecp_nistz256_avx2_from_mont:
ecp_nistz256_avx2_set1:
ecp_nistz256_avx2_multi_gather_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7

.globl	ecp_nistz_avx2_eligible
.type	ecp_nistz_avx2_eligible,\@abi-omnipotent
ecp_nistz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
___
}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	print $_,"\n";
}

close STDOUT;