#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes of shared
# table]. The GHASH function also features a so-called "528B" variant
# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
# of shared table]. Performance results are for this streamed GHASH
# subroutine and are expressed in cycles per processed byte, less is
# better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same
#	as Opteron's;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse <[email protected]> for
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter: ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay
# down aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# less sense to aggregate more multiplications than it takes to
# perform the remaining non-multiplication operations. 2x is the
# near-optimal coefficient for contemporary Intel CPUs (hence the
# modest improvement coefficient), but not for Bulldozer. The latter
# is because its logical SIMD operations are twice as slow as
# Intel's, so the critical path is longer. A CPU with a higher
# pclmulqdq issue rate would also benefit from a higher aggregate
# factor...
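#
# For reference, the 4x-aggregated code path below computes, per
# 64 bytes of input,
#
#	Xi+4 = [H^4*(Ii+Xi) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3] mod P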
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Skylake	0.44(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Goldmont	1.08(+24%)

# March 2013
#
# The 8x aggregate factor AVX code path uses the reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to the above-mentioned version. But
# thanks to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know
# that it performs in 0.41 cycles per byte on a Haswell processor, in
# 0.29 on Broadwell, and in 0.36 on Skylake.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

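# For reference, a minimal bit-at-a-time GF(2^128) multiplication in
# plain Perl, following the textbook GCM definition (Z = X*Y with the
# 0xE1... reduction polynomial). This is an illustrative sketch only:
# it assumes a 64-bit perl, is not used by the generator below, and is
# many orders of magnitude slower than the table-driven and PCLMULQDQ
# paths this module emits.
sub gf128_mul_ref {
my ($Xhi,$Xlo,$Yhi,$Ylo) = @_;		# big-endian 64-bit halves
my ($Zhi,$Zlo) = (0,0);
my ($Vhi,$Vlo) = ($Xhi,$Xlo);
    for my $i (0..127) {
	my $bit = $i<64 ? ($Yhi>>(63-$i))&1 : ($Ylo>>(127-$i))&1;
	if ($bit) { $Zhi ^= $Vhi; $Zlo ^= $Vlo; }
	my $carry = $Vlo & 1;			# bit shifted out of V
	$Vlo = ($Vlo>>1) | (($Vhi&1)<<63);
	$Vhi >>= 1;
	$Vhi ^= 0xe100000000000000 if ($carry);	# reduce by the polynomial
    }
    return ($Zhi,$Zlo);
}
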
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.20) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;

# common register layout
$nlo="%rax";
$nhi="%rbx";
$Zlo="%r8";
$Zhi="%r9";
$tmp="%r10";
$rem_4bit = "%r11";

$Xi="%rdi";
$Htbl="%rsi";

# per-function register layout
$cnt="%rcx";
$rem="%rdx";

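# LB() maps a general-purpose register name to its byte-register
# alias, e.g. %rax->%al, %rsi->%sil, %r10->%r10b.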
sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/	or
			$r =~ s/%[er]([sd]i)/%\1l/	or
			$r =~ s/%[er](bp)/%\1l/		or
			$r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }

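# Calls to otherwise-undefined subs, e.g. &mov(...), fall through to
# AUTOLOAD below, which appends the corresponding single instruction
# to $code; the last Perl argument becomes the first assembler
# operand, and a purely numeric argument is emitted as an immediate.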
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

{ my $N;
  sub loop() {
  my $inp = shift;

	$N++;
$code.=<<___;
	xor	$nlo,$nlo
	xor	$nhi,$nhi
	mov	`&LB("$Zlo")`,`&LB("$nlo")`
	mov	`&LB("$Zlo")`,`&LB("$nhi")`
	shl	\$4,`&LB("$nlo")`
	mov	\$14,$cnt
	mov	8($Htbl,$nlo),$Zlo
	mov	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	mov	$Zlo,$rem
	jmp	.Loop$N

.align	16
.Loop$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	mov	($inp,$cnt),`&LB("$nlo")`
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	mov	`&LB("$nlo")`,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	shl	\$4,`&LB("$nlo")`
	xor	$tmp,$Zlo
	dec	$cnt
	js	.Lbreak$N

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo
	jmp	.Loop$N

.align	16
.Lbreak$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	xor	$tmp,$Zlo
	xor	($rem_4bit,$rem,8),$Zhi

	bswap	$Zlo
	bswap	$Zhi
___
}}

$code=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,\@function,2
.align	16
gcm_gmult_4bit:
	push	%rbx
	push	%rbp		# %rbp and %r12 are pushed exclusively in
	push	%r12		# order to reuse Win64 exception handler...
.Lgmult_prologue:

	movzb	15($Xi),$Zlo
	lea	.Lrem_4bit(%rip),$rem_4bit
___
	&loop	($Xi);
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	mov	16(%rsp),%rbx
	lea	24(%rsp),%rsp
.Lgmult_epilogue:
	ret
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___

# per-function register layout
$inp="%rdx";
$len="%rcx";
$rem_8bit=$rem_4bit;

$code.=<<___;
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,\@function,4
.align	16
gcm_ghash_4bit:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$280,%rsp
.Lghash_prologue:
	mov	$inp,%r14		# reassign couple of args
	mov	$len,%r15
___
{ my $inp="%r14";
  my $dat="%edx";
  my $len="%r15";
  my @nhi=("%ebx","%ecx");
  my @rem=("%r12","%r13");
  my $Hshr4="%rbp";

	&sub	($Htbl,-128);		# size optimization
	&lea	($Hshr4,"16+128(%rsp)");
	{ my @lo =($nlo,$nhi);
	  my @hi =($Zlo,$Zhi);

	  &xor	($dat,$dat);
	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
	    &or		($lo[0],$tmp)			if ($i>1);
	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
	    &shr	($lo[1],4)			if ($i>0 && $i<17);
	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
	    &shr	($hi[1],4)			if ($i>0 && $i<17);
	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
	    &shl	($tmp,60)			if ($i>0 && $i<17);

	    push	(@lo,shift(@lo));
	    push	(@hi,shift(@hi));
	  }
	}
	&add	($Htbl,-128);
	&mov	($Zlo,"8($Xi)");
	&mov	($Zhi,"0($Xi)");
	&add	($len,$inp);		# pointer to the end of data
	&lea	($rem_8bit,".Lrem_8bit(%rip)");
	&jmp	(".Louter_loop");

$code.=".align	16\n.Louter_loop:\n";
	&xor	($Zhi,"($inp)");
	&mov	("%rdx","8($inp)");
	&lea	($inp,"16($inp)");
	&xor	("%rdx",$Zlo);
	&mov	("($Xi)",$Zhi);
	&mov	("8($Xi)","%rdx");
	&shr	("%rdx",32);

	&xor	($nlo,$nlo);
	&rol	($dat,8);
	&mov	(&LB($nlo),&LB($dat));
	&movz	($nhi[0],&LB($dat));
	&shl	(&LB($nlo),4);
	&shr	($nhi[0],4);

	for ($j=11,$i=0;$i<15;$i++) {
	    &rol	($dat,8);
	    &xor	($Zlo,"8($Htbl,$nlo)")			if ($i>0);
	    &xor	($Zhi,"($Htbl,$nlo)")			if ($i>0);
	    &mov	($Zlo,"8($Htbl,$nlo)")			if ($i==0);
	    &mov	($Zhi,"($Htbl,$nlo)")			if ($i==0);

	    &mov	(&LB($nlo),&LB($dat));
	    &xor	($Zlo,$tmp)				if ($i>0);
	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);

	    &movz	($nhi[1],&LB($dat));
	    &shl	(&LB($nlo),4);
	    &movzb	($rem[0],"(%rsp,$nhi[0])");

	    &shr	($nhi[1],4)				if ($i<14);
	    &and	($nhi[1],0xf0)				if ($i==14);
	    &shl	($rem[1],48)				if ($i>0);
	    &xor	($rem[0],$Zlo);

	    &mov	($tmp,$Zhi);
	    &xor	($Zhi,$rem[1])				if ($i>0);
	    &shr	($Zlo,8);

	    &movz	($rem[0],&LB($rem[0]));
	    &mov	($dat,"$j($Xi)")			if (--$j%4==0);
	    &shr	($Zhi,8);

	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
	    &shl	($tmp,56);
	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");

	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
	    unshift	(@rem,pop(@rem));
	}
	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
	&xor	($Zlo,"8($Htbl,$nlo)");
	&xor	($Zhi,"($Htbl,$nlo)");

	&shl	($rem[1],48);
	&xor	($Zlo,$tmp);

	&xor	($Zhi,$rem[1]);
	&movz	($rem[0],&LB($Zlo));
	&shr	($Zlo,4);

	&mov	($tmp,$Zhi);
	&shl	(&LB($rem[0]),4);
	&shr	($Zhi,4);

	&xor	($Zlo,"8($Htbl,$nhi[0])");
	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
	&shl	($tmp,60);

	&xor	($Zhi,"($Htbl,$nhi[0])");
	&xor	($Zlo,$tmp);
	&shl	($rem[0],48);

	&bswap	($Zlo);
	&xor	($Zhi,$rem[0]);

	&bswap	($Zhi);
	&cmp	($inp,$len);
	&jb	(".Louter_loop");
}
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	lea	280(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lghash_epilogue:
	ret
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
___

######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

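# clmul64x64_T2 performs one Karatsuba 128x128->256-bit carry-less
# multiplication with just three pclmulqdq's: lo = Xi.lo*Hkey.lo,
# hi = Xi.hi*Hkey.hi and mid = (Xi.lo^Xi.hi)*(Hkey.lo^Hkey.hi); the
# middle term is then folded into ($Xhi,$Xi). $HK, when provided,
# caches the precomputed Hkey.lo^Hkey.hi.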
sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}

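# reduction_alg9 folds the 256-bit product in ($Xhi,$Xi) back to 128
# bits modulo the GHASH polynomial x^128+x^7+x^2+x+1 (in the
# bit-reflected representation) using only shifts and xors, in two
# phases; see ghash-x86.pl for the derivation.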
sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}

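# gcm_init_clmul populates the per-key table at $Htbl: H at 0x00,
# H^2 at 0x10, their Karatsuba "salt" (lo^hi halves) at 0x20 and,
# with $do4xaggr, H^3 at 0x30, H^4 at 0x40 and their salt at 0x50.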
{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.L_init_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	lea		0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
$code.=<<___;
	ret
.size	gcm_init_clmul,.-gcm_init_clmul
___
}

{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. special thing about it is that
	# there is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	mov		OPENSSL_ia32cap_P+4(%rip),%eax
	cmp		\$0x30,$len
	jb		.Lskip4x

	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
	je		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jc	.Ltail4x

	jmp	.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	movdqu		0x30($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	movdqu		0x20($inp),$Xln
	movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	pxor		$Xl,$Xm
	pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	pclmulqdq	\$0x00,$Hkey,$Xl
	pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhn
	xorps		$Xl,$Xln
	movdqu		0x10($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x10,$HK,$Xmn
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	movdqa		$Xl,$Xh
	pxor		$Xm,$Xmn
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jnc	.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add	\$0x40,$len
	jz	.Ldone
	movdqu	0x20($Htbl),$HK
	sub	\$0x10,$len
	jz	.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	movdqu		($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	pshufb		$T3,$T2
	movdqu		16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	movdqa		$Xi,$T2			# 1st phase
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	lea		32($inp),$inp
	psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	movaps		0x10(%rsp),%xmm7
	movaps		0x20(%rsp),%xmm8
	movaps		0x30(%rsp),%xmm9
	movaps		0x40(%rsp),%xmm10
	movaps		0x50(%rsp),%xmm11
	movaps		0x60(%rsp),%xmm12
	movaps		0x70(%rsp),%xmm13
	movaps		0x80(%rsp),%xmm14
	movaps		0x90(%rsp),%xmm15
	lea		0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
	ret
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

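# gcm_init_avx precomputes H^1 through H^8 together with their
# Karatsuba "salts" for the 8x aggregated AVX GHASH loop below; on a
# build without AVX support it falls through to the CLMUL version.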
$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

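# clmul64x64_avx is the AVX analogue of clmul64x64_T2 above: one
# Karatsuba 128x128-bit carry-less multiplication via three
# vpclmulqdq's.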
sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

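# reduction_avx is the AVX analogue of reduction_alg9 above.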
sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}

$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	lea		0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
$code.=<<___;
	ret
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

$code.=<<___;
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,\@abi-omnipotent
.align	32
gcm_gmult_avx:
	jmp	.L_gmult_clmul
.size	gcm_gmult_avx,.-gcm_gmult_avx
___

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpxor		$Ii,$T2,$T2
	vmovdqu		0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK

	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	vmovdqu		0x20-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	vxorps		$Ij,$T1,$T1

	vmovdqu		0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vxorps		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		($inp),$Ij		# I[0]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	vpclmulqdq	\$0x10,$HK, $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	movaps		0x10(%rsp),%xmm7
	movaps		0x20(%rsp),%xmm8
	movaps		0x30(%rsp),%xmm9
	movaps		0x40(%rsp),%xmm10
	movaps		0x50(%rsp),%xmm11
	movaps		0x60(%rsp),%xmm12
	movaps		0x70(%rsp),%xmm13
	movaps		0x80(%rsp),%xmm14
	movaps		0x90(%rsp),%xmm15
	lea		0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_avx:
___
$code.=<<___;
	ret
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.L7_mask_poly:
	.long	7,0,`0xE1<<1`,0
.align	64
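# .Lrem_4bit and .Lrem_8bit hold precomputed remainders (carry-less
# multiples of 0x1C2, pre-shifted into position) that fold the 4,
# resp. 8, bits shifted out of Z on each step of the table-driven
# GHASH paths above back into the result.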
1572 | .type .Lrem_4bit,\@object
|
---|
1573 | .Lrem_4bit:
|
---|
1574 | .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
|
---|
1575 | .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
|
---|
1576 | .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
|
---|
1577 | .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
|
---|
.type	.Lrem_8bit,\@object
.Lrem_8bit:
	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
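# Likewise, entry n of .Lrem_8bit is the carry-less product of n and
# 0x01C2, i.e. a full byte's worth of reduction folded per table lookup.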

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
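	# 64 bytes of scratch: 32 bytes of Win64 shadow space plus four
	# stack slots for arguments 5-8 of the RtlVirtualUnwind call below.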

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	24(%rax),%rax		# adjust "rsp"
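	# Rip falls within the function body: the non-volatile registers
	# saved by the prologue (rbx, rbp, r12) still sit at the recorded
	# stack pointer, so step over their 24 bytes and write the saved
	# values back into the CONTEXT for the unwinder.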

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq
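	# Copy the fixed-up CONTEXT over disp->ContextRecord, so that the
	# RtlVirtualUnwind call below continues from the adjusted state.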

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_gcm_gmult_4bit
	.rva	.LSEH_end_gcm_gmult_4bit
	.rva	.LSEH_info_gcm_gmult_4bit

	.rva	.LSEH_begin_gcm_ghash_4bit
	.rva	.LSEH_end_gcm_ghash_4bit
	.rva	.LSEH_info_gcm_ghash_4bit

	.rva	.LSEH_begin_gcm_init_clmul
	.rva	.LSEH_end_gcm_init_clmul
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_clmul
	.rva	.LSEH_end_gcm_ghash_clmul
	.rva	.LSEH_info_gcm_ghash_clmul
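# Each .rva triplet above is a RUNTIME_FUNCTION entry: start and end of
# the covered code, followed by the RVA of its unwind info in .xdata.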
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_gcm_init_avx
	.rva	.LSEH_end_gcm_init_avx
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_avx
	.rva	.LSEH_end_gcm_ghash_avx
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
.LSEH_info_gcm_ghash_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_init_clmul:
	.byte	0x01,0x08,0x03,0x00
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
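# The hand-assembled UNWIND_INFO entries above follow the Win64 layout:
# byte 0 packs version and flags, byte 1 is the prologue size, byte 2
# the number of unwind code slots, byte 3 the frame register. Each
# 4-byte group that follows encodes one prologue operation as offset,
# opcode/op-info and a 16-bit operand; e.g. 0x33,0xf8,0x09,0x00 reads
# "at prologue offset 0x33, UWOP_SAVE_XMM128 of xmm15 at scaled offset
# 0x0009*16 = 0x90(rsp)". The ".byte 9,0,0,0" variants instead set
# UNW_FLAG_EHANDLER and point the unwinder at se_handler.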
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;