VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/modes/asm/ghash-x86_64.pl@69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago

Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified.
bugref:8070: src/libs maintenance

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
File size: 42.2 KB
1#! /usr/bin/env perl
2# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# March, June 2010
18#
19# The module implements "4-bit" GCM GHASH function and underlying
20# single multiplication operation in GF(2^128). "4-bit" means that
21# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
22# function features so called "528B" variant utilizing additional
23# 256+16 bytes of per-key storage [+512 bytes shared table].
24# Performance results are for this streamed GHASH subroutine and are
25# expressed in cycles per processed byte, less is better:
26#
27#              gcc 3.4.x(*)  assembler
28#
29# P4           28.6          14.0        +100%
30# Opteron      19.3           7.7        +150%
31# Core2        17.8           8.1(**)    +120%
32# Atom         31.6          16.8        +88%
33# VIA Nano     21.8          10.1        +115%
34#
35# (*)  comparison is not completely fair, because C results are
36#      for vanilla "256B" implementation, while assembler results
37#      are for "528B";-)
38# (**) it's a mystery [to me] why the Core2 result is not the same
39#      as for Opteron;
40
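# For cross-checking the table-driven and PCLMULQDQ paths, a bit-by-bit
# reference of the underlying GF(2^128) multiplication (per NIST SP 800-38D)
# can be kept around. The sketch below is illustrative only, is not used by
# this module, and assumes a 64-bit Perl build; blocks are passed as two
# 64-bit halves, most significant half first.

sub gf128_mul {				# Z = X*Y in GF(2^128), GHASH bit order
    my ($xh,$xl,$yh,$yl) = @_;
    my ($zh,$zl) = (0,0);
    my ($vh,$vl) = ($yh,$yl);
    for my $i (0..127) {
	my $bit = $i<64 ? ($xh>>(63-$i))&1 : ($xl>>(127-$i))&1;
	if ($bit) { $zh ^= $vh; $zl ^= $vl; }
	my $carry = $vl & 1;			# shift V right one bit and
	$vl = ($vl>>1) | (($vh&1)<<63);		# fold in the reduction
	$vh >>= 1;				# polynomial whenever a set
	$vh ^= 0xe100000000000000 if ($carry);	# bit falls off: R = 0xE1||0^120
    }
    return ($zh,$zl);
}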
41# May 2010
42#
43# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
44# See ghash-x86.pl for background information and details about coding
45# techniques.
46#
47# Special thanks to David Woodhouse <[email protected]> for
48# providing access to a Westmere-based system on behalf of Intel
49# Open Source Technology Centre.
50
51# December 2012
52#
53# Overhaul: aggregate Karatsuba post-processing, improve ILP in
54# reduction_alg9, increase reduction aggregate factor to 4x. As for
55# the latter, ghash-x86.pl discusses why it makes less sense to
56# increase the aggregate factor. Then why increase it here? Critical path
57# consists of 3 independent pclmulqdq instructions, Karatsuba post-
58# processing and reduction. "On top" of this we lay down aggregated
59# multiplication operations, triplets of independent pclmulqdq's. As
60# issue rate for pclmulqdq is limited, it makes little sense to
61# aggregate more multiplications than it takes to perform remaining
62# non-multiplication operations. 2x is near-optimal coefficient for
63# contemporary Intel CPUs (therefore modest improvement coefficient),
64# but not for Bulldozer. Latter is because logical SIMD operations
65# are twice as slow in comparison to Intel, so that critical path is
66# longer. A CPU with higher pclmulqdq issue rate would also benefit
67# from higher aggregate factor...
68#
69# Westmere 1.78(+13%)
70# Sandy Bridge 1.80(+8%)
71# Ivy Bridge 1.80(+7%)
72# Haswell 0.55(+93%) (if system doesn't support AVX)
73# Broadwell 0.45(+110%)(if system doesn't support AVX)
74# Skylake 0.44(+110%)(if system doesn't support AVX)
75# Bulldozer 1.49(+27%)
76# Silvermont 2.88(+13%)
77# Goldmont 1.08(+24%)
78
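#
# For reference, the 4x aggregation evaluates four blocks per reduction by
# pairing them with powers of H (the same identity is spelled out again in
# the comments of the PCLMULQDQ path below):
#
#	Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
#
# so one modular reduction is amortized over four Karatsuba multiplication
# triplets.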
79# March 2013
80#
81# ... 8x aggregate factor AVX code path is using reduction algorithm
82# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
83# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
84# sub-optimally in comparison to above mentioned version. But thanks
85# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
86# it performs in 0.41 cycles per byte on Haswell processor, in
87# 0.29 on Broadwell, and in 0.36 on Skylake.
88#
89# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
90
91$flavour = shift;
92$output = shift;
93if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
94
95$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
96
97$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
98( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
99( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
100die "can't locate x86_64-xlate.pl";
101
102if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
103 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
104 $avx = ($1>=2.20) + ($1>=2.22);
105}
106
107if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
108 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
109 $avx = ($1>=2.09) + ($1>=2.10);
110}
111
112if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
113 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
114 $avx = ($1>=10) + ($1>=11);
115}
116
117if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
118 $avx = ($2>=3.0) + ($2>3.0);
119}
120
121open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
122*STDOUT=*OUT;
123
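# Like the other x86_64 perlasm modules, this script expects a "flavour"
# (elf, macosx, mingw64, nasm, ...) and an output file name; everything it
# prints is piped through x86_64-xlate.pl (opened above), which writes the
# requested assembler dialect. An illustrative invocation would be:
#
#	perl ghash-x86_64.pl elf ghash-x86_64.s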
124$do4xaggr=1;
125
126# common register layout
127$nlo="%rax";
128$nhi="%rbx";
129$Zlo="%r8";
130$Zhi="%r9";
131$tmp="%r10";
132$rem_4bit = "%r11";
133
134$Xi="%rdi";
135$Htbl="%rsi";
136
137# per-function register layout
138$cnt="%rcx";
139$rem="%rdx";
140
141sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
142 $r =~ s/%[er]([sd]i)/%\1l/ or
143 $r =~ s/%[er](bp)/%\1l/ or
144 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
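# (e.g. &LB("%rax") yields "%al", &LB("%r9") yields "%r9b")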
145
146sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
147{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
148 my $arg = pop;
149 $arg = "\$$arg" if ($arg*1 eq $arg);
150 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
151}
152
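# The AUTOLOAD thunk lets the generators below emit instructions as plain
# Perl calls: the last argument becomes the first (source) operand in AT&T
# order and a bare numeric last argument gets an immediate "$" prefix.
# With the register assignments above, for example:
#
#	&mov	($Zlo,"8($Xi)");	# appends:	mov	8(%rdi),%r8
#	&shr	($Zlo,4);		# appends:	shr	$4,%r8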
153
154{ my $N;
155 sub loop() {
156 my $inp = shift;
157
158 $N++;
159$code.=<<___;
160 xor $nlo,$nlo
161 xor $nhi,$nhi
162 mov `&LB("$Zlo")`,`&LB("$nlo")`
163 mov `&LB("$Zlo")`,`&LB("$nhi")`
164 shl \$4,`&LB("$nlo")`
165 mov \$14,$cnt
166 mov 8($Htbl,$nlo),$Zlo
167 mov ($Htbl,$nlo),$Zhi
168 and \$0xf0,`&LB("$nhi")`
169 mov $Zlo,$rem
170 jmp .Loop$N
171
172.align 16
173.Loop$N:
174 shr \$4,$Zlo
175 and \$0xf,$rem
176 mov $Zhi,$tmp
177 mov ($inp,$cnt),`&LB("$nlo")`
178 shr \$4,$Zhi
179 xor 8($Htbl,$nhi),$Zlo
180 shl \$60,$tmp
181 xor ($Htbl,$nhi),$Zhi
182 mov `&LB("$nlo")`,`&LB("$nhi")`
183 xor ($rem_4bit,$rem,8),$Zhi
184 mov $Zlo,$rem
185 shl \$4,`&LB("$nlo")`
186 xor $tmp,$Zlo
187 dec $cnt
188 js .Lbreak$N
189
190 shr \$4,$Zlo
191 and \$0xf,$rem
192 mov $Zhi,$tmp
193 shr \$4,$Zhi
194 xor 8($Htbl,$nlo),$Zlo
195 shl \$60,$tmp
196 xor ($Htbl,$nlo),$Zhi
197 and \$0xf0,`&LB("$nhi")`
198 xor ($rem_4bit,$rem,8),$Zhi
199 mov $Zlo,$rem
200 xor $tmp,$Zlo
201 jmp .Loop$N
202
203.align 16
204.Lbreak$N:
205 shr \$4,$Zlo
206 and \$0xf,$rem
207 mov $Zhi,$tmp
208 shr \$4,$Zhi
209 xor 8($Htbl,$nlo),$Zlo
210 shl \$60,$tmp
211 xor ($Htbl,$nlo),$Zhi
212 and \$0xf0,`&LB("$nhi")`
213 xor ($rem_4bit,$rem,8),$Zhi
214 mov $Zlo,$rem
215 xor $tmp,$Zlo
216
217 shr \$4,$Zlo
218 and \$0xf,$rem
219 mov $Zhi,$tmp
220 shr \$4,$Zhi
221 xor 8($Htbl,$nhi),$Zlo
222 shl \$60,$tmp
223 xor ($Htbl,$nhi),$Zhi
224 xor $tmp,$Zlo
225 xor ($rem_4bit,$rem,8),$Zhi
226
227 bswap $Zlo
228 bswap $Zhi
229___
230}}
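# loop() above generates the core of the 4-bit algorithm: Xi is consumed a
# nibble at a time starting from its last byte, each nibble (scaled by 16)
# indexes the 256-byte per-key table at $Htbl, and the four bits shifted out
# of Z on every step are folded back in through the shared .Lrem_4bit table.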
231
232$code=<<___;
233.text
234.extern OPENSSL_ia32cap_P
235
236.globl gcm_gmult_4bit
237.type gcm_gmult_4bit,\@function,2
238.align 16
239gcm_gmult_4bit:
240 push %rbx
241 push %rbp # %rbp and %r12 are pushed exclusively in
242 push %r12 # order to reuse Win64 exception handler...
243.Lgmult_prologue:
244
245 movzb 15($Xi),$Zlo
246 lea .Lrem_4bit(%rip),$rem_4bit
247___
248 &loop ($Xi);
249$code.=<<___;
250 mov $Zlo,8($Xi)
251 mov $Zhi,($Xi)
252
253 mov 16(%rsp),%rbx
254 lea 24(%rsp),%rsp
255.Lgmult_epilogue:
256 ret
257.size gcm_gmult_4bit,.-gcm_gmult_4bit
258___
259
260
261# per-function register layout
262$inp="%rdx";
263$len="%rcx";
264$rem_8bit=$rem_4bit;
265
266$code.=<<___;
267.globl gcm_ghash_4bit
268.type gcm_ghash_4bit,\@function,4
269.align 16
270gcm_ghash_4bit:
271 push %rbx
272 push %rbp
273 push %r12
274 push %r13
275 push %r14
276 push %r15
277 sub \$280,%rsp
278.Lghash_prologue:
279 mov $inp,%r14 # reassign couple of args
280 mov $len,%r15
281___
282{ my $inp="%r14";
283 my $dat="%edx";
284 my $len="%r15";
285 my @nhi=("%ebx","%ecx");
286 my @rem=("%r12","%r13");
287 my $Hshr4="%rbp";
288
289 &sub ($Htbl,-128); # size optimization
290 &lea ($Hshr4,"16+128(%rsp)");
291 { my @lo =($nlo,$nhi);
292 my @hi =($Zlo,$Zhi);
293
294 &xor ($dat,$dat);
295 for ($i=0,$j=-2;$i<18;$i++,$j++) {
296 &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
297 &or ($lo[0],$tmp) if ($i>1);
298 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
299 &shr ($lo[1],4) if ($i>0 && $i<17);
300 &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
301 &shr ($hi[1],4) if ($i>0 && $i<17);
302 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
303 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
304 &shl (&LB($dat),4) if ($i>0 && $i<17);
305 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
306 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
307 &shl ($tmp,60) if ($i>0 && $i<17);
308
309 push (@lo,shift(@lo));
310 push (@hi,shift(@hi));
311 }
312 }
313 &add ($Htbl,-128);
314 &mov ($Zlo,"8($Xi)");
315 &mov ($Zhi,"0($Xi)");
316 &add ($len,$inp); # pointer to the end of data
317 &lea ($rem_8bit,".Lrem_8bit(%rip)");
318 &jmp (".Louter_loop");
319
320$code.=".align 16\n.Louter_loop:\n";
321 &xor ($Zhi,"($inp)");
322 &mov ("%rdx","8($inp)");
323 &lea ($inp,"16($inp)");
324 &xor ("%rdx",$Zlo);
325 &mov ("($Xi)",$Zhi);
326 &mov ("8($Xi)","%rdx");
327 &shr ("%rdx",32);
328
329 &xor ($nlo,$nlo);
330 &rol ($dat,8);
331 &mov (&LB($nlo),&LB($dat));
332 &movz ($nhi[0],&LB($dat));
333 &shl (&LB($nlo),4);
334 &shr ($nhi[0],4);
335
336 for ($j=11,$i=0;$i<15;$i++) {
337 &rol ($dat,8);
338 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
339 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
340 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
341 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
342
343 &mov (&LB($nlo),&LB($dat));
344 &xor ($Zlo,$tmp) if ($i>0);
345 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
346
347 &movz ($nhi[1],&LB($dat));
348 &shl (&LB($nlo),4);
349 &movzb ($rem[0],"(%rsp,$nhi[0])");
350
351 &shr ($nhi[1],4) if ($i<14);
352 &and ($nhi[1],0xf0) if ($i==14);
353 &shl ($rem[1],48) if ($i>0);
354 &xor ($rem[0],$Zlo);
355
356 &mov ($tmp,$Zhi);
357 &xor ($Zhi,$rem[1]) if ($i>0);
358 &shr ($Zlo,8);
359
360 &movz ($rem[0],&LB($rem[0]));
361 &mov ($dat,"$j($Xi)") if (--$j%4==0);
362 &shr ($Zhi,8);
363
364 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
365 &shl ($tmp,56);
366 &xor ($Zhi,"($Hshr4,$nhi[0],8)");
367
368 unshift (@nhi,pop(@nhi)); # "rotate" registers
369 unshift (@rem,pop(@rem));
370 }
371 &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
372 &xor ($Zlo,"8($Htbl,$nlo)");
373 &xor ($Zhi,"($Htbl,$nlo)");
374
375 &shl ($rem[1],48);
376 &xor ($Zlo,$tmp);
377
378 &xor ($Zhi,$rem[1]);
379 &movz ($rem[0],&LB($Zlo));
380 &shr ($Zlo,4);
381
382 &mov ($tmp,$Zhi);
383 &shl (&LB($rem[0]),4);
384 &shr ($Zhi,4);
385
386 &xor ($Zlo,"8($Htbl,$nhi[0])");
387 &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
388 &shl ($tmp,60);
389
390 &xor ($Zhi,"($Htbl,$nhi[0])");
391 &xor ($Zlo,$tmp);
392 &shl ($rem[0],48);
393
394 &bswap ($Zlo);
395 &xor ($Zhi,$rem[0]);
396
397 &bswap ($Zhi);
398 &cmp ($inp,$len);
399 &jb (".Louter_loop");
400}
401$code.=<<___;
402 mov $Zlo,8($Xi)
403 mov $Zhi,($Xi)
404
405 lea 280(%rsp),%rsi
406 mov 0(%rsi),%r15
407 mov 8(%rsi),%r14
408 mov 16(%rsi),%r13
409 mov 24(%rsi),%r12
410 mov 32(%rsi),%rbp
411 mov 40(%rsi),%rbx
412 lea 48(%rsi),%rsp
413.Lghash_epilogue:
414 ret
415.size gcm_ghash_4bit,.-gcm_ghash_4bit
416___
417
418
419######################################################################
420# PCLMULQDQ version.
421
422@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
423 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
424
425($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
426($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
427
428sub clmul64x64_T2 { # minimal register pressure
429my ($Xhi,$Xi,$Hkey,$HK)=@_;
430
431if (!defined($HK)) { $HK = $T2;
432$code.=<<___;
433 movdqa $Xi,$Xhi #
434 pshufd \$0b01001110,$Xi,$T1
435 pshufd \$0b01001110,$Hkey,$T2
436 pxor $Xi,$T1 #
437 pxor $Hkey,$T2
438___
439} else {
440$code.=<<___;
441 movdqa $Xi,$Xhi #
442 pshufd \$0b01001110,$Xi,$T1
443 pxor $Xi,$T1 #
444___
445}
446$code.=<<___;
447 pclmulqdq \$0x00,$Hkey,$Xi #######
448 pclmulqdq \$0x11,$Hkey,$Xhi #######
449 pclmulqdq \$0x00,$HK,$T1 #######
450 pxor $Xi,$T1 #
451 pxor $Xhi,$T1 #
452
453 movdqa $T1,$T2 #
454 psrldq \$8,$T1
455 pslldq \$8,$T2 #
456 pxor $T1,$Xhi
457 pxor $T2,$Xi #
458___
459}
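# The three pclmulqdq's above are one Karatsuba step: writing the 128-bit
# inputs as X = X1*2^64 + X0 and H = H1*2^64 + H0 (carry-less arithmetic),
#
#	X*H = X1*H1*2^128
#	    + [ (X1^X0)*(H1^H0) ^ X1*H1 ^ X0*H0 ]*2^64
#	    + X0*H0
#
# which is why a single extra multiplication by the pre-xored $HK "salt"
# plus two pxor's recovers the middle term before it is split back into the
# ($Xhi,$Xi) halves.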
460
461sub reduction_alg9 { # 17/11 times faster than Intel version
462my ($Xhi,$Xi) = @_;
463
464$code.=<<___;
465 # 1st phase
466 movdqa $Xi,$T2 #
467 movdqa $Xi,$T1
468 psllq \$5,$Xi
469 pxor $Xi,$T1 #
470 psllq \$1,$Xi
471 pxor $T1,$Xi #
472 psllq \$57,$Xi #
473 movdqa $Xi,$T1 #
474 pslldq \$8,$Xi
475 psrldq \$8,$T1 #
476 pxor $T2,$Xi
477 pxor $T1,$Xhi #
478
479 # 2nd phase
480 movdqa $Xi,$T2
481 psrlq \$1,$Xi
482 pxor $T2,$Xhi #
483 pxor $Xi,$T2
484 psrlq \$5,$Xi
485 pxor $T2,$Xi #
486 psrlq \$1,$Xi #
487 pxor $Xhi,$Xi #
488___
489}
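# Both phases fold the 256-bit product in ($Xhi,$Xi) back modulo the GHASH
# polynomial
#
#	p(x) = x^128 + x^7 + x^2 + x + 1
#
# in GCM's bit-reflected representation, hence the net shift amounts of
# 1, 2 and 7 (right) and their 64-bit complements 63, 62 and 57 (left),
# composed out of shorter psllq/psrlq chains for better ILP; the same
# polynomial, in its 0xE1<<1 form, appears as .L0x1c2_polynomial at the
# bottom of the file.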
490
491
492{ my ($Htbl,$Xip)=@_4args;
493 my $HK="%xmm6";
494
495$code.=<<___;
496.globl gcm_init_clmul
497.type gcm_init_clmul,\@abi-omnipotent
498.align 16
499gcm_init_clmul:
500.L_init_clmul:
501___
502$code.=<<___ if ($win64);
503.LSEH_begin_gcm_init_clmul:
504 # I can't trust assembler to use specific encoding:-(
505 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
506 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
507___
508$code.=<<___;
509 movdqu ($Xip),$Hkey
510 pshufd \$0b01001110,$Hkey,$Hkey # dword swap
511
512 # <<1 twist
513 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
514 movdqa $Hkey,$T1
515 psllq \$1,$Hkey
516 pxor $T3,$T3 #
517 psrlq \$63,$T1
518 pcmpgtd $T2,$T3 # broadcast carry bit
519 pslldq \$8,$T1
520 por $T1,$Hkey # H<<=1
521
522 # magic reduction
523 pand .L0x1c2_polynomial(%rip),$T3
524 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
525
526 # calculate H^2
527 pshufd \$0b01001110,$Hkey,$HK
528 movdqa $Hkey,$Xi
529 pxor $Hkey,$HK
530___
531 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
532 &reduction_alg9 ($Xhi,$Xi);
533$code.=<<___;
534 pshufd \$0b01001110,$Hkey,$T1
535 pshufd \$0b01001110,$Xi,$T2
536 pxor $Hkey,$T1 # Karatsuba pre-processing
537 movdqu $Hkey,0x00($Htbl) # save H
538 pxor $Xi,$T2 # Karatsuba pre-processing
539 movdqu $Xi,0x10($Htbl) # save H^2
540 palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
541 movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
542___
543if ($do4xaggr) {
544 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
545 &reduction_alg9 ($Xhi,$Xi);
546$code.=<<___;
547 movdqa $Xi,$T3
548___
549 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
550 &reduction_alg9 ($Xhi,$Xi);
551$code.=<<___;
552 pshufd \$0b01001110,$T3,$T1
553 pshufd \$0b01001110,$Xi,$T2
554 pxor $T3,$T1 # Karatsuba pre-processing
555 movdqu $T3,0x30($Htbl) # save H^3
556 pxor $Xi,$T2 # Karatsuba pre-processing
557 movdqu $Xi,0x40($Htbl) # save H^4
558 palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
559 movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
560___
561}
562$code.=<<___ if ($win64);
563 movaps (%rsp),%xmm6
564 lea 0x18(%rsp),%rsp
565.LSEH_end_gcm_init_clmul:
566___
567$code.=<<___;
568 ret
569.size gcm_init_clmul,.-gcm_init_clmul
570___
571}
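# The table gcm_init_clmul writes for the CLMUL (non-AVX) paths is, per the
# stores above:
#
#	0x00	H	0x10	H^2	0x30	H^3	0x40	H^4
#	0x20	Karatsuba "salt": H.lo^H.hi (low qword), H^2.lo^H^2.hi (high)
#	0x50	Karatsuba "salt" for H^3 (low qword) and H^4 (high qword)
#
# so the ghash loop can fetch each power together with its pre-xored halves
# instead of recomputing them per block.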
572
573{ my ($Xip,$Htbl)=@_4args;
574
575$code.=<<___;
576.globl gcm_gmult_clmul
577.type gcm_gmult_clmul,\@abi-omnipotent
578.align 16
579gcm_gmult_clmul:
580.L_gmult_clmul:
581 movdqu ($Xip),$Xi
582 movdqa .Lbswap_mask(%rip),$T3
583 movdqu ($Htbl),$Hkey
584 movdqu 0x20($Htbl),$T2
585 pshufb $T3,$Xi
586___
587 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
588$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
589	# experimental alternative. The special thing about it is that there
590	# is no dependency between the two multiplications...
591 mov \$`0xE1<<1`,%eax
592 mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
593 mov \$0x07,%r11d
594 movq %rax,$T1
595 movq %r10,$T2
596 movq %r11,$T3 # borrow $T3
597 pand $Xi,$T3
598 pshufb $T3,$T2 # ($Xi&7)·0xE0
599 movq %rax,$T3
600 pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
601 pxor $Xi,$T2
602 pslldq \$15,$T2
603 paddd $T2,$T2 # <<(64+56+1)
604 pxor $T2,$Xi
605 pclmulqdq \$0x01,$T3,$Xi
606 movdqa .Lbswap_mask(%rip),$T3 # reload $T3
607 psrldq \$1,$T1
608 pxor $T1,$Xhi
609 pslldq \$7,$Xi
610 pxor $Xhi,$Xi
611___
612$code.=<<___;
613 pshufb $T3,$Xi
614 movdqu $Xi,($Xip)
615 ret
616.size gcm_gmult_clmul,.-gcm_gmult_clmul
617___
618}
619
620
621{ my ($Xip,$Htbl,$inp,$len)=@_4args;
622 my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
623 my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
624
625$code.=<<___;
626.globl gcm_ghash_clmul
627.type gcm_ghash_clmul,\@abi-omnipotent
628.align 32
629gcm_ghash_clmul:
630.L_ghash_clmul:
631___
632$code.=<<___ if ($win64);
633 lea -0x88(%rsp),%rax
634.LSEH_begin_gcm_ghash_clmul:
635 # I can't trust assembler to use specific encoding:-(
636 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
637 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
638 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
639 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
640 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
641 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
642 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
643 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
644 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
645 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
646 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
647___
648$code.=<<___;
649 movdqa .Lbswap_mask(%rip),$T3
650
651 movdqu ($Xip),$Xi
652 movdqu ($Htbl),$Hkey
653 movdqu 0x20($Htbl),$HK
654 pshufb $T3,$Xi
655
656 sub \$0x10,$len
657 jz .Lodd_tail
658
659 movdqu 0x10($Htbl),$Hkey2
660___
661if ($do4xaggr) {
662my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
663
664$code.=<<___;
665 mov OPENSSL_ia32cap_P+4(%rip),%eax
666 cmp \$0x30,$len
667 jb .Lskip4x
668
669 and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
670 cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
671 je .Lskip4x
672
673 sub \$0x30,$len
674 mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
675 movdqu 0x30($Htbl),$Hkey3
676 movdqu 0x40($Htbl),$Hkey4
677
678 #######
679 # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
680 #
681 movdqu 0x30($inp),$Xln
682 movdqu 0x20($inp),$Xl
683 pshufb $T3,$Xln
684 pshufb $T3,$Xl
685 movdqa $Xln,$Xhn
686 pshufd \$0b01001110,$Xln,$Xmn
687 pxor $Xln,$Xmn
688 pclmulqdq \$0x00,$Hkey,$Xln
689 pclmulqdq \$0x11,$Hkey,$Xhn
690 pclmulqdq \$0x00,$HK,$Xmn
691
692 movdqa $Xl,$Xh
693 pshufd \$0b01001110,$Xl,$Xm
694 pxor $Xl,$Xm
695 pclmulqdq \$0x00,$Hkey2,$Xl
696 pclmulqdq \$0x11,$Hkey2,$Xh
697 pclmulqdq \$0x10,$HK,$Xm
698 xorps $Xl,$Xln
699 xorps $Xh,$Xhn
700 movups 0x50($Htbl),$HK
701 xorps $Xm,$Xmn
702
703 movdqu 0x10($inp),$Xl
704 movdqu 0($inp),$T1
705 pshufb $T3,$Xl
706 pshufb $T3,$T1
707 movdqa $Xl,$Xh
708 pshufd \$0b01001110,$Xl,$Xm
709 pxor $T1,$Xi
710 pxor $Xl,$Xm
711 pclmulqdq \$0x00,$Hkey3,$Xl
712 movdqa $Xi,$Xhi
713 pshufd \$0b01001110,$Xi,$T1
714 pxor $Xi,$T1
715 pclmulqdq \$0x11,$Hkey3,$Xh
716 pclmulqdq \$0x00,$HK,$Xm
717 xorps $Xl,$Xln
718 xorps $Xh,$Xhn
719
720 lea 0x40($inp),$inp
721 sub \$0x40,$len
722 jc .Ltail4x
723
724 jmp .Lmod4_loop
725.align 32
726.Lmod4_loop:
727 pclmulqdq \$0x00,$Hkey4,$Xi
728 xorps $Xm,$Xmn
729 movdqu 0x30($inp),$Xl
730 pshufb $T3,$Xl
731 pclmulqdq \$0x11,$Hkey4,$Xhi
732 xorps $Xln,$Xi
733 movdqu 0x20($inp),$Xln
734 movdqa $Xl,$Xh
735 pclmulqdq \$0x10,$HK,$T1
736 pshufd \$0b01001110,$Xl,$Xm
737 xorps $Xhn,$Xhi
738 pxor $Xl,$Xm
739 pshufb $T3,$Xln
740 movups 0x20($Htbl),$HK
741 xorps $Xmn,$T1
742 pclmulqdq \$0x00,$Hkey,$Xl
743 pshufd \$0b01001110,$Xln,$Xmn
744
745 pxor $Xi,$T1 # aggregated Karatsuba post-processing
746 movdqa $Xln,$Xhn
747 pxor $Xhi,$T1 #
748 pxor $Xln,$Xmn
749 movdqa $T1,$T2 #
750 pclmulqdq \$0x11,$Hkey,$Xh
751 pslldq \$8,$T1
752 psrldq \$8,$T2 #
753 pxor $T1,$Xi
754 movdqa .L7_mask(%rip),$T1
755 pxor $T2,$Xhi #
756 movq %rax,$T2
757
758 pand $Xi,$T1 # 1st phase
759 pshufb $T1,$T2 #
760 pxor $Xi,$T2 #
761 pclmulqdq \$0x00,$HK,$Xm
762 psllq \$57,$T2 #
763 movdqa $T2,$T1 #
764 pslldq \$8,$T2
765 pclmulqdq \$0x00,$Hkey2,$Xln
766 psrldq \$8,$T1 #
767 pxor $T2,$Xi
768 pxor $T1,$Xhi #
769 movdqu 0($inp),$T1
770
771 movdqa $Xi,$T2 # 2nd phase
772 psrlq \$1,$Xi
773 pclmulqdq \$0x11,$Hkey2,$Xhn
774 xorps $Xl,$Xln
775 movdqu 0x10($inp),$Xl
776 pshufb $T3,$Xl
777 pclmulqdq \$0x10,$HK,$Xmn
778 xorps $Xh,$Xhn
779 movups 0x50($Htbl),$HK
780 pshufb $T3,$T1
781 pxor $T2,$Xhi #
782 pxor $Xi,$T2
783 psrlq \$5,$Xi
784
785 movdqa $Xl,$Xh
786 pxor $Xm,$Xmn
787 pshufd \$0b01001110,$Xl,$Xm
788 pxor $T2,$Xi #
789 pxor $T1,$Xhi
790 pxor $Xl,$Xm
791 pclmulqdq \$0x00,$Hkey3,$Xl
792 psrlq \$1,$Xi #
793 pxor $Xhi,$Xi #
794 movdqa $Xi,$Xhi
795 pclmulqdq \$0x11,$Hkey3,$Xh
796 xorps $Xl,$Xln
797 pshufd \$0b01001110,$Xi,$T1
798 pxor $Xi,$T1
799
800 pclmulqdq \$0x00,$HK,$Xm
801 xorps $Xh,$Xhn
802
803 lea 0x40($inp),$inp
804 sub \$0x40,$len
805 jnc .Lmod4_loop
806
807.Ltail4x:
808 pclmulqdq \$0x00,$Hkey4,$Xi
809 pclmulqdq \$0x11,$Hkey4,$Xhi
810 pclmulqdq \$0x10,$HK,$T1
811 xorps $Xm,$Xmn
812 xorps $Xln,$Xi
813 xorps $Xhn,$Xhi
814 pxor $Xi,$Xhi # aggregated Karatsuba post-processing
815 pxor $Xmn,$T1
816
817 pxor $Xhi,$T1 #
818 pxor $Xi,$Xhi
819
820 movdqa $T1,$T2 #
821 psrldq \$8,$T1
822 pslldq \$8,$T2 #
823 pxor $T1,$Xhi
824 pxor $T2,$Xi #
825___
826 &reduction_alg9($Xhi,$Xi);
827$code.=<<___;
828 add \$0x40,$len
829 jz .Ldone
830 movdqu 0x20($Htbl),$HK
831 sub \$0x10,$len
832 jz .Lodd_tail
833.Lskip4x:
834___
835}
836$code.=<<___;
837 #######
838 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
839 # [(H*Ii+1) + (H*Xi+1)] mod P =
840 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
841 #
842 movdqu ($inp),$T1 # Ii
843 movdqu 16($inp),$Xln # Ii+1
844 pshufb $T3,$T1
845 pshufb $T3,$Xln
846 pxor $T1,$Xi # Ii+Xi
847
848 movdqa $Xln,$Xhn
849 pshufd \$0b01001110,$Xln,$Xmn
850 pxor $Xln,$Xmn
851 pclmulqdq \$0x00,$Hkey,$Xln
852 pclmulqdq \$0x11,$Hkey,$Xhn
853 pclmulqdq \$0x00,$HK,$Xmn
854
855 lea 32($inp),$inp # i+=2
856 nop
857 sub \$0x20,$len
858 jbe .Leven_tail
859 nop
860 jmp .Lmod_loop
861
862.align 32
863.Lmod_loop:
864 movdqa $Xi,$Xhi
865 movdqa $Xmn,$T1
866 pshufd \$0b01001110,$Xi,$Xmn #
867 pxor $Xi,$Xmn #
868
869 pclmulqdq \$0x00,$Hkey2,$Xi
870 pclmulqdq \$0x11,$Hkey2,$Xhi
871 pclmulqdq \$0x10,$HK,$Xmn
872
873 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
874 pxor $Xhn,$Xhi
875 movdqu ($inp),$T2 # Ii
876 pxor $Xi,$T1 # aggregated Karatsuba post-processing
877 pshufb $T3,$T2
878 movdqu 16($inp),$Xln # Ii+1
879
880 pxor $Xhi,$T1
881 pxor $T2,$Xhi # "Ii+Xi", consume early
882 pxor $T1,$Xmn
883 pshufb $T3,$Xln
884 movdqa $Xmn,$T1 #
885 psrldq \$8,$T1
886 pslldq \$8,$Xmn #
887 pxor $T1,$Xhi
888 pxor $Xmn,$Xi #
889
890 movdqa $Xln,$Xhn #
891
892 movdqa $Xi,$T2 # 1st phase
893 movdqa $Xi,$T1
894 psllq \$5,$Xi
895 pxor $Xi,$T1 #
896 pclmulqdq \$0x00,$Hkey,$Xln #######
897 psllq \$1,$Xi
898 pxor $T1,$Xi #
899 psllq \$57,$Xi #
900 movdqa $Xi,$T1 #
901 pslldq \$8,$Xi
902 psrldq \$8,$T1 #
903 pxor $T2,$Xi
904 pshufd \$0b01001110,$Xhn,$Xmn
905 pxor $T1,$Xhi #
906 pxor $Xhn,$Xmn #
907
908 movdqa $Xi,$T2 # 2nd phase
909 psrlq \$1,$Xi
910 pclmulqdq \$0x11,$Hkey,$Xhn #######
911 pxor $T2,$Xhi #
912 pxor $Xi,$T2
913 psrlq \$5,$Xi
914 pxor $T2,$Xi #
915 lea 32($inp),$inp
916 psrlq \$1,$Xi #
917 pclmulqdq \$0x00,$HK,$Xmn #######
918 pxor $Xhi,$Xi #
919
920 sub \$0x20,$len
921 ja .Lmod_loop
922
923.Leven_tail:
924 movdqa $Xi,$Xhi
925 movdqa $Xmn,$T1
926 pshufd \$0b01001110,$Xi,$Xmn #
927 pxor $Xi,$Xmn #
928
929 pclmulqdq \$0x00,$Hkey2,$Xi
930 pclmulqdq \$0x11,$Hkey2,$Xhi
931 pclmulqdq \$0x10,$HK,$Xmn
932
933 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
934 pxor $Xhn,$Xhi
935 pxor $Xi,$T1
936 pxor $Xhi,$T1
937 pxor $T1,$Xmn
938 movdqa $Xmn,$T1 #
939 psrldq \$8,$T1
940 pslldq \$8,$Xmn #
941 pxor $T1,$Xhi
942 pxor $Xmn,$Xi #
943___
944 &reduction_alg9 ($Xhi,$Xi);
945$code.=<<___;
946 test $len,$len
947 jnz .Ldone
948
949.Lodd_tail:
950 movdqu ($inp),$T1 # Ii
951 pshufb $T3,$T1
952 pxor $T1,$Xi # Ii+Xi
953___
954 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
955 &reduction_alg9 ($Xhi,$Xi);
956$code.=<<___;
957.Ldone:
958 pshufb $T3,$Xi
959 movdqu $Xi,($Xip)
960___
961$code.=<<___ if ($win64);
962 movaps (%rsp),%xmm6
963 movaps 0x10(%rsp),%xmm7
964 movaps 0x20(%rsp),%xmm8
965 movaps 0x30(%rsp),%xmm9
966 movaps 0x40(%rsp),%xmm10
967 movaps 0x50(%rsp),%xmm11
968 movaps 0x60(%rsp),%xmm12
969 movaps 0x70(%rsp),%xmm13
970 movaps 0x80(%rsp),%xmm14
971 movaps 0x90(%rsp),%xmm15
972 lea 0xa8(%rsp),%rsp
973.LSEH_end_gcm_ghash_clmul:
974___
975$code.=<<___;
976 ret
977.size gcm_ghash_clmul,.-gcm_ghash_clmul
978___
979}
980
981
982$code.=<<___;
983.globl gcm_init_avx
984.type gcm_init_avx,\@abi-omnipotent
985.align 32
986gcm_init_avx:
987___
988if ($avx) {
989my ($Htbl,$Xip)=@_4args;
990my $HK="%xmm6";
991
992$code.=<<___ if ($win64);
993.LSEH_begin_gcm_init_avx:
994 # I can't trust assembler to use specific encoding:-(
995 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
996 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
997___
998$code.=<<___;
999 vzeroupper
1000
1001 vmovdqu ($Xip),$Hkey
1002 vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
1003
1004 # <<1 twist
1005 vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
1006 vpsrlq \$63,$Hkey,$T1
1007 vpsllq \$1,$Hkey,$Hkey
1008 vpxor $T3,$T3,$T3 #
1009 vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
1010 vpslldq \$8,$T1,$T1
1011 vpor $T1,$Hkey,$Hkey # H<<=1
1012
1013 # magic reduction
1014 vpand .L0x1c2_polynomial(%rip),$T3,$T3
1015 vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
1016
1017 vpunpckhqdq $Hkey,$Hkey,$HK
1018 vmovdqa $Hkey,$Xi
1019 vpxor $Hkey,$HK,$HK
1020 mov \$4,%r10 # up to H^8
1021 jmp .Linit_start_avx
1022___
1023
1024sub clmul64x64_avx {
1025my ($Xhi,$Xi,$Hkey,$HK)=@_;
1026
1027if (!defined($HK)) { $HK = $T2;
1028$code.=<<___;
1029 vpunpckhqdq $Xi,$Xi,$T1
1030 vpunpckhqdq $Hkey,$Hkey,$T2
1031 vpxor $Xi,$T1,$T1 #
1032 vpxor $Hkey,$T2,$T2
1033___
1034} else {
1035$code.=<<___;
1036 vpunpckhqdq $Xi,$Xi,$T1
1037 vpxor $Xi,$T1,$T1 #
1038___
1039}
1040$code.=<<___;
1041 vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
1042 vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
1043 vpclmulqdq \$0x00,$HK,$T1,$T1 #######
1044 vpxor $Xi,$Xhi,$T2 #
1045 vpxor $T2,$T1,$T1 #
1046
1047 vpslldq \$8,$T1,$T2 #
1048 vpsrldq \$8,$T1,$T1
1049 vpxor $T2,$Xi,$Xi #
1050 vpxor $T1,$Xhi,$Xhi
1051___
1052}
1053
1054sub reduction_avx {
1055my ($Xhi,$Xi) = @_;
1056
1057$code.=<<___;
1058 vpsllq \$57,$Xi,$T1 # 1st phase
1059 vpsllq \$62,$Xi,$T2
1060 vpxor $T1,$T2,$T2 #
1061 vpsllq \$63,$Xi,$T1
1062 vpxor $T1,$T2,$T2 #
1063 vpslldq \$8,$T2,$T1 #
1064 vpsrldq \$8,$T2,$T2
1065 vpxor $T1,$Xi,$Xi #
1066 vpxor $T2,$Xhi,$Xhi
1067
1068 vpsrlq \$1,$Xi,$T2 # 2nd phase
1069 vpxor $Xi,$Xhi,$Xhi
1070 vpxor $T2,$Xi,$Xi #
1071 vpsrlq \$5,$T2,$T2
1072 vpxor $T2,$Xi,$Xi #
1073 vpsrlq \$1,$Xi,$Xi #
1074 vpxor $Xhi,$Xi,$Xi #
1075___
1076}
1077
1078$code.=<<___;
1079.align 32
1080.Linit_loop_avx:
1081 vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
1082 vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
1083___
1084 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
1085 &reduction_avx ($Xhi,$Xi);
1086$code.=<<___;
1087.Linit_start_avx:
1088 vmovdqa $Xi,$T3
1089___
1090 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
1091 &reduction_avx ($Xhi,$Xi);
1092$code.=<<___;
1093 vpshufd \$0b01001110,$T3,$T1
1094 vpshufd \$0b01001110,$Xi,$T2
1095 vpxor $T3,$T1,$T1 # Karatsuba pre-processing
1096 vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
1097 vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
1098 vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
1099 lea 0x30($Htbl),$Htbl
1100 sub \$1,%r10
1101 jnz .Linit_loop_avx
1102
1103 vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
1104 vmovdqu $T3,-0x10($Htbl)
1105
1106 vzeroupper
1107___
1108$code.=<<___ if ($win64);
1109 movaps (%rsp),%xmm6
1110 lea 0x18(%rsp),%rsp
1111.LSEH_end_gcm_init_avx:
1112___
1113$code.=<<___;
1114 ret
1115.size gcm_init_avx,.-gcm_init_avx
1116___
1117} else {
1118$code.=<<___;
1119 jmp .L_init_clmul
1120.size gcm_init_avx,.-gcm_init_avx
1121___
1122}
1123
1124$code.=<<___;
1125.globl gcm_gmult_avx
1126.type gcm_gmult_avx,\@abi-omnipotent
1127.align 32
1128gcm_gmult_avx:
1129 jmp .L_gmult_clmul
1130.size gcm_gmult_avx,.-gcm_gmult_avx
1131___
1132
1133
1134$code.=<<___;
1135.globl gcm_ghash_avx
1136.type gcm_ghash_avx,\@abi-omnipotent
1137.align 32
1138gcm_ghash_avx:
1139___
1140if ($avx) {
1141my ($Xip,$Htbl,$inp,$len)=@_4args;
1142my ($Xlo,$Xhi,$Xmi,
1143 $Zlo,$Zhi,$Zmi,
1144 $Hkey,$HK,$T1,$T2,
1145 $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
1146
1147$code.=<<___ if ($win64);
1148 lea -0x88(%rsp),%rax
1149.LSEH_begin_gcm_ghash_avx:
1150 # I can't trust assembler to use specific encoding:-(
1151 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
1152 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
1153 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
1154 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
1155 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
1156 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
1157 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
1158 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
1159 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
1160 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
1161 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
1162___
1163$code.=<<___;
1164 vzeroupper
1165
1166 vmovdqu ($Xip),$Xi # load $Xi
1167 lea .L0x1c2_polynomial(%rip),%r10
1168 lea 0x40($Htbl),$Htbl # size optimization
1169 vmovdqu .Lbswap_mask(%rip),$bswap
1170 vpshufb $bswap,$Xi,$Xi
1171 cmp \$0x80,$len
1172 jb .Lshort_avx
1173 sub \$0x80,$len
1174
1175 vmovdqu 0x70($inp),$Ii # I[7]
1176 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1177 vpshufb $bswap,$Ii,$Ii
1178 vmovdqu 0x20-0x40($Htbl),$HK
1179
1180 vpunpckhqdq $Ii,$Ii,$T2
1181 vmovdqu 0x60($inp),$Ij # I[6]
1182 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1183 vpxor $Ii,$T2,$T2
1184 vpshufb $bswap,$Ij,$Ij
1185 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1186 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1187 vpunpckhqdq $Ij,$Ij,$T1
1188 vmovdqu 0x50($inp),$Ii # I[5]
1189 vpclmulqdq \$0x00,$HK,$T2,$Xmi
1190 vpxor $Ij,$T1,$T1
1191
1192 vpshufb $bswap,$Ii,$Ii
1193 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1194 vpunpckhqdq $Ii,$Ii,$T2
1195 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1196 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1197 vpxor $Ii,$T2,$T2
1198 vmovdqu 0x40($inp),$Ij # I[4]
1199 vpclmulqdq \$0x10,$HK,$T1,$Zmi
1200 vmovdqu 0x50-0x40($Htbl),$HK
1201
1202 vpshufb $bswap,$Ij,$Ij
1203 vpxor $Xlo,$Zlo,$Zlo
1204 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1205 vpxor $Xhi,$Zhi,$Zhi
1206 vpunpckhqdq $Ij,$Ij,$T1
1207 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1208 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1209 vpxor $Xmi,$Zmi,$Zmi
1210 vpclmulqdq \$0x00,$HK,$T2,$Xmi
1211 vpxor $Ij,$T1,$T1
1212
1213 vmovdqu 0x30($inp),$Ii # I[3]
1214 vpxor $Zlo,$Xlo,$Xlo
1215 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1216 vpxor $Zhi,$Xhi,$Xhi
1217 vpshufb $bswap,$Ii,$Ii
1218 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1219 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1220 vpxor $Zmi,$Xmi,$Xmi
1221 vpunpckhqdq $Ii,$Ii,$T2
1222 vpclmulqdq \$0x10,$HK,$T1,$Zmi
1223 vmovdqu 0x80-0x40($Htbl),$HK
1224 vpxor $Ii,$T2,$T2
1225
1226 vmovdqu 0x20($inp),$Ij # I[2]
1227 vpxor $Xlo,$Zlo,$Zlo
1228 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1229 vpxor $Xhi,$Zhi,$Zhi
1230 vpshufb $bswap,$Ij,$Ij
1231 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1232 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1233 vpxor $Xmi,$Zmi,$Zmi
1234 vpunpckhqdq $Ij,$Ij,$T1
1235 vpclmulqdq \$0x00,$HK,$T2,$Xmi
1236 vpxor $Ij,$T1,$T1
1237
1238 vmovdqu 0x10($inp),$Ii # I[1]
1239 vpxor $Zlo,$Xlo,$Xlo
1240 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1241 vpxor $Zhi,$Xhi,$Xhi
1242 vpshufb $bswap,$Ii,$Ii
1243 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1244 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1245 vpxor $Zmi,$Xmi,$Xmi
1246 vpunpckhqdq $Ii,$Ii,$T2
1247 vpclmulqdq \$0x10,$HK,$T1,$Zmi
1248 vmovdqu 0xb0-0x40($Htbl),$HK
1249 vpxor $Ii,$T2,$T2
1250
1251 vmovdqu ($inp),$Ij # I[0]
1252 vpxor $Xlo,$Zlo,$Zlo
1253 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1254 vpxor $Xhi,$Zhi,$Zhi
1255 vpshufb $bswap,$Ij,$Ij
1256 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1257 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
1258 vpxor $Xmi,$Zmi,$Zmi
1259 vpclmulqdq \$0x10,$HK,$T2,$Xmi
1260
1261 lea 0x80($inp),$inp
1262 cmp \$0x80,$len
1263 jb .Ltail_avx
1264
1265 vpxor $Xi,$Ij,$Ij # accumulate $Xi
1266 sub \$0x80,$len
1267 jmp .Loop8x_avx
1268
1269.align 32
1270.Loop8x_avx:
1271 vpunpckhqdq $Ij,$Ij,$T1
1272 vmovdqu 0x70($inp),$Ii # I[7]
1273 vpxor $Xlo,$Zlo,$Zlo
1274 vpxor $Ij,$T1,$T1
1275 vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
1276 vpshufb $bswap,$Ii,$Ii
1277 vpxor $Xhi,$Zhi,$Zhi
1278 vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
1279 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1280 vpunpckhqdq $Ii,$Ii,$T2
1281 vpxor $Xmi,$Zmi,$Zmi
1282 vpclmulqdq \$0x00,$HK,$T1,$Tred
1283 vmovdqu 0x20-0x40($Htbl),$HK
1284 vpxor $Ii,$T2,$T2
1285
1286 vmovdqu 0x60($inp),$Ij # I[6]
1287 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1288 vpxor $Zlo,$Xi,$Xi # collect result
1289 vpshufb $bswap,$Ij,$Ij
1290 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1291 vxorps $Zhi,$Xo,$Xo
1292 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1293 vpunpckhqdq $Ij,$Ij,$T1
1294 vpclmulqdq \$0x00,$HK, $T2,$Xmi
1295 vpxor $Zmi,$Tred,$Tred
1296 vxorps $Ij,$T1,$T1
1297
1298 vmovdqu 0x50($inp),$Ii # I[5]
1299 vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
1300 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1301 vpxor $Xo,$Tred,$Tred
1302 vpslldq \$8,$Tred,$T2
1303 vpxor $Xlo,$Zlo,$Zlo
1304 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1305 vpsrldq \$8,$Tred,$Tred
1306 vpxor $T2, $Xi, $Xi
1307 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1308 vpshufb $bswap,$Ii,$Ii
1309 vxorps $Tred,$Xo, $Xo
1310 vpxor $Xhi,$Zhi,$Zhi
1311 vpunpckhqdq $Ii,$Ii,$T2
1312 vpclmulqdq \$0x10,$HK, $T1,$Zmi
1313 vmovdqu 0x50-0x40($Htbl),$HK
1314 vpxor $Ii,$T2,$T2
1315 vpxor $Xmi,$Zmi,$Zmi
1316
1317 vmovdqu 0x40($inp),$Ij # I[4]
1318 vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
1319 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1320 vpshufb $bswap,$Ij,$Ij
1321 vpxor $Zlo,$Xlo,$Xlo
1322 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1323 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1324 vpunpckhqdq $Ij,$Ij,$T1
1325 vpxor $Zhi,$Xhi,$Xhi
1326 vpclmulqdq \$0x00,$HK, $T2,$Xmi
1327 vxorps $Ij,$T1,$T1
1328 vpxor $Zmi,$Xmi,$Xmi
1329
1330 vmovdqu 0x30($inp),$Ii # I[3]
1331 vpclmulqdq \$0x10,(%r10),$Xi,$Xi
1332 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1333 vpshufb $bswap,$Ii,$Ii
1334 vpxor $Xlo,$Zlo,$Zlo
1335 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1336 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1337 vpunpckhqdq $Ii,$Ii,$T2
1338 vpxor $Xhi,$Zhi,$Zhi
1339 vpclmulqdq \$0x10,$HK, $T1,$Zmi
1340 vmovdqu 0x80-0x40($Htbl),$HK
1341 vpxor $Ii,$T2,$T2
1342 vpxor $Xmi,$Zmi,$Zmi
1343
1344 vmovdqu 0x20($inp),$Ij # I[2]
1345 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1346 vpshufb $bswap,$Ij,$Ij
1347 vpxor $Zlo,$Xlo,$Xlo
1348 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1349 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1350 vpunpckhqdq $Ij,$Ij,$T1
1351 vpxor $Zhi,$Xhi,$Xhi
1352 vpclmulqdq \$0x00,$HK, $T2,$Xmi
1353 vpxor $Ij,$T1,$T1
1354 vpxor $Zmi,$Xmi,$Xmi
1355 vxorps $Tred,$Xi,$Xi
1356
1357 vmovdqu 0x10($inp),$Ii # I[1]
1358 vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
1359 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1360 vpshufb $bswap,$Ii,$Ii
1361 vpxor $Xlo,$Zlo,$Zlo
1362 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1363 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1364 vpclmulqdq \$0x10,(%r10),$Xi,$Xi
1365 vxorps $Xo,$Tred,$Tred
1366 vpunpckhqdq $Ii,$Ii,$T2
1367 vpxor $Xhi,$Zhi,$Zhi
1368 vpclmulqdq \$0x10,$HK, $T1,$Zmi
1369 vmovdqu 0xb0-0x40($Htbl),$HK
1370 vpxor $Ii,$T2,$T2
1371 vpxor $Xmi,$Zmi,$Zmi
1372
1373 vmovdqu ($inp),$Ij # I[0]
1374 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1375 vpshufb $bswap,$Ij,$Ij
1376 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1377 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
1378 vpxor $Tred,$Ij,$Ij
1379 vpclmulqdq \$0x10,$HK, $T2,$Xmi
1380 vpxor $Xi,$Ij,$Ij # accumulate $Xi
1381
1382 lea 0x80($inp),$inp
1383 sub \$0x80,$len
1384 jnc .Loop8x_avx
1385
1386 add \$0x80,$len
1387 jmp .Ltail_no_xor_avx
1388
1389.align 32
1390.Lshort_avx:
1391 vmovdqu -0x10($inp,$len),$Ii # very last word
1392 lea ($inp,$len),$inp
1393 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1394 vmovdqu 0x20-0x40($Htbl),$HK
1395 vpshufb $bswap,$Ii,$Ij
1396
1397 vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
1398 vmovdqa $Xhi,$Zhi # $Zhi and
1399 vmovdqa $Xmi,$Zmi # $Zmi
1400 sub \$0x10,$len
1401 jz .Ltail_avx
1402
1403 vpunpckhqdq $Ij,$Ij,$T1
1404 vpxor $Xlo,$Zlo,$Zlo
1405 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1406 vpxor $Ij,$T1,$T1
1407 vmovdqu -0x20($inp),$Ii
1408 vpxor $Xhi,$Zhi,$Zhi
1409 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1410 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1411 vpshufb $bswap,$Ii,$Ij
1412 vpxor $Xmi,$Zmi,$Zmi
1413 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1414 vpsrldq \$8,$HK,$HK
1415 sub \$0x10,$len
1416 jz .Ltail_avx
1417
1418 vpunpckhqdq $Ij,$Ij,$T1
1419 vpxor $Xlo,$Zlo,$Zlo
1420 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1421 vpxor $Ij,$T1,$T1
1422 vmovdqu -0x30($inp),$Ii
1423 vpxor $Xhi,$Zhi,$Zhi
1424 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1425 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1426 vpshufb $bswap,$Ii,$Ij
1427 vpxor $Xmi,$Zmi,$Zmi
1428 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1429 vmovdqu 0x50-0x40($Htbl),$HK
1430 sub \$0x10,$len
1431 jz .Ltail_avx
1432
1433 vpunpckhqdq $Ij,$Ij,$T1
1434 vpxor $Xlo,$Zlo,$Zlo
1435 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1436 vpxor $Ij,$T1,$T1
1437 vmovdqu -0x40($inp),$Ii
1438 vpxor $Xhi,$Zhi,$Zhi
1439 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1440 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1441 vpshufb $bswap,$Ii,$Ij
1442 vpxor $Xmi,$Zmi,$Zmi
1443 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1444 vpsrldq \$8,$HK,$HK
1445 sub \$0x10,$len
1446 jz .Ltail_avx
1447
1448 vpunpckhqdq $Ij,$Ij,$T1
1449 vpxor $Xlo,$Zlo,$Zlo
1450 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1451 vpxor $Ij,$T1,$T1
1452 vmovdqu -0x50($inp),$Ii
1453 vpxor $Xhi,$Zhi,$Zhi
1454 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1455 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1456 vpshufb $bswap,$Ii,$Ij
1457 vpxor $Xmi,$Zmi,$Zmi
1458 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1459 vmovdqu 0x80-0x40($Htbl),$HK
1460 sub \$0x10,$len
1461 jz .Ltail_avx
1462
1463 vpunpckhqdq $Ij,$Ij,$T1
1464 vpxor $Xlo,$Zlo,$Zlo
1465 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1466 vpxor $Ij,$T1,$T1
1467 vmovdqu -0x60($inp),$Ii
1468 vpxor $Xhi,$Zhi,$Zhi
1469 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1470 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1471 vpshufb $bswap,$Ii,$Ij
1472 vpxor $Xmi,$Zmi,$Zmi
1473 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1474 vpsrldq \$8,$HK,$HK
1475 sub \$0x10,$len
1476 jz .Ltail_avx
1477
1478 vpunpckhqdq $Ij,$Ij,$T1
1479 vpxor $Xlo,$Zlo,$Zlo
1480 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1481 vpxor $Ij,$T1,$T1
1482 vmovdqu -0x70($inp),$Ii
1483 vpxor $Xhi,$Zhi,$Zhi
1484 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1485 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1486 vpshufb $bswap,$Ii,$Ij
1487 vpxor $Xmi,$Zmi,$Zmi
1488 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1489 vmovq 0xb8-0x40($Htbl),$HK
1490 sub \$0x10,$len
1491 jmp .Ltail_avx
1492
1493.align 32
1494.Ltail_avx:
1495 vpxor $Xi,$Ij,$Ij # accumulate $Xi
1496.Ltail_no_xor_avx:
1497 vpunpckhqdq $Ij,$Ij,$T1
1498 vpxor $Xlo,$Zlo,$Zlo
1499 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1500 vpxor $Ij,$T1,$T1
1501 vpxor $Xhi,$Zhi,$Zhi
1502 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1503 vpxor $Xmi,$Zmi,$Zmi
1504 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1505
1506 vmovdqu (%r10),$Tred
1507
1508 vpxor $Xlo,$Zlo,$Xi
1509 vpxor $Xhi,$Zhi,$Xo
1510 vpxor $Xmi,$Zmi,$Zmi
1511
1512 vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
1513 vpxor $Xo, $Zmi,$Zmi
1514 vpslldq \$8, $Zmi,$T2
1515 vpsrldq \$8, $Zmi,$Zmi
1516 vpxor $T2, $Xi, $Xi
1517 vpxor $Zmi,$Xo, $Xo
1518
1519 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
1520 vpalignr \$8,$Xi,$Xi,$Xi
1521 vpxor $T2,$Xi,$Xi
1522
1523 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
1524 vpalignr \$8,$Xi,$Xi,$Xi
1525 vpxor $Xo,$Xi,$Xi
1526 vpxor $T2,$Xi,$Xi
1527
1528 cmp \$0,$len
1529 jne .Lshort_avx
1530
1531 vpshufb $bswap,$Xi,$Xi
1532 vmovdqu $Xi,($Xip)
1533 vzeroupper
1534___
1535$code.=<<___ if ($win64);
1536 movaps (%rsp),%xmm6
1537 movaps 0x10(%rsp),%xmm7
1538 movaps 0x20(%rsp),%xmm8
1539 movaps 0x30(%rsp),%xmm9
1540 movaps 0x40(%rsp),%xmm10
1541 movaps 0x50(%rsp),%xmm11
1542 movaps 0x60(%rsp),%xmm12
1543 movaps 0x70(%rsp),%xmm13
1544 movaps 0x80(%rsp),%xmm14
1545 movaps 0x90(%rsp),%xmm15
1546 lea 0xa8(%rsp),%rsp
1547.LSEH_end_gcm_ghash_avx:
1548___
1549$code.=<<___;
1550 ret
1551.size gcm_ghash_avx,.-gcm_ghash_avx
1552___
1553} else {
1554$code.=<<___;
1555 jmp .L_ghash_clmul
1556.size gcm_ghash_avx,.-gcm_ghash_avx
1557___
1558}
1559
1560
1561$code.=<<___;
1562.align 64
1563.Lbswap_mask:
1564 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1565.L0x1c2_polynomial:
1566 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
1567.L7_mask:
1568 .long 7,0,7,0
1569.L7_mask_poly:
1570 .long 7,0,`0xE1<<1`,0
1571.align 64
1572.type .Lrem_4bit,\@object
1573.Lrem_4bit:
1574 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
1575 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
1576 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
1577 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
1578.type .Lrem_8bit,\@object
1579.Lrem_8bit:
1580 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1581 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1582 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1583 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1584 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1585 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1586 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1587 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1588 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1589 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1590 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1591 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1592 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1593 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1594 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1595 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1596 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1597 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1598 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1599 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1600 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1601 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1602 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1603 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1604 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1605 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1606 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1607 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1608 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1609 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1610 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1611 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1612
1613.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1614.align 64
1615___
1616
1617
1618# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1619# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1620if ($win64) {
1621$rec="%rcx";
1622$frame="%rdx";
1623$context="%r8";
1624$disp="%r9";
1625
1626$code.=<<___;
1627.extern __imp_RtlVirtualUnwind
1628.type se_handler,\@abi-omnipotent
1629.align 16
1630se_handler:
1631 push %rsi
1632 push %rdi
1633 push %rbx
1634 push %rbp
1635 push %r12
1636 push %r13
1637 push %r14
1638 push %r15
1639 pushfq
1640 sub \$64,%rsp
1641
1642 mov 120($context),%rax # pull context->Rax
1643 mov 248($context),%rbx # pull context->Rip
1644
1645 mov 8($disp),%rsi # disp->ImageBase
1646 mov 56($disp),%r11 # disp->HandlerData
1647
1648 mov 0(%r11),%r10d # HandlerData[0]
1649 lea (%rsi,%r10),%r10 # prologue label
1650 cmp %r10,%rbx # context->Rip<prologue label
1651 jb .Lin_prologue
1652
1653 mov 152($context),%rax # pull context->Rsp
1654
1655 mov 4(%r11),%r10d # HandlerData[1]
1656 lea (%rsi,%r10),%r10 # epilogue label
1657 cmp %r10,%rbx # context->Rip>=epilogue label
1658 jae .Lin_prologue
1659
1660 lea 24(%rax),%rax # adjust "rsp"
1661
1662 mov -8(%rax),%rbx
1663 mov -16(%rax),%rbp
1664 mov -24(%rax),%r12
1665 mov %rbx,144($context) # restore context->Rbx
1666 mov %rbp,160($context) # restore context->Rbp
1667 mov %r12,216($context) # restore context->R12
1668
1669.Lin_prologue:
1670 mov 8(%rax),%rdi
1671 mov 16(%rax),%rsi
1672 mov %rax,152($context) # restore context->Rsp
1673 mov %rsi,168($context) # restore context->Rsi
1674 mov %rdi,176($context) # restore context->Rdi
1675
1676 mov 40($disp),%rdi # disp->ContextRecord
1677 mov $context,%rsi # context
1678 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1679 .long 0xa548f3fc # cld; rep movsq
1680
1681 mov $disp,%rsi
1682 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1683 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1684 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1685 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1686 mov 40(%rsi),%r10 # disp->ContextRecord
1687 lea 56(%rsi),%r11 # &disp->HandlerData
1688 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1689 mov %r10,32(%rsp) # arg5
1690 mov %r11,40(%rsp) # arg6
1691 mov %r12,48(%rsp) # arg7
1692 mov %rcx,56(%rsp) # arg8, (NULL)
1693 call *__imp_RtlVirtualUnwind(%rip)
1694
1695 mov \$1,%eax # ExceptionContinueSearch
1696 add \$64,%rsp
1697 popfq
1698 pop %r15
1699 pop %r14
1700 pop %r13
1701 pop %r12
1702 pop %rbp
1703 pop %rbx
1704 pop %rdi
1705 pop %rsi
1706 ret
1707.size se_handler,.-se_handler
1708
1709.section .pdata
1710.align 4
1711 .rva .LSEH_begin_gcm_gmult_4bit
1712 .rva .LSEH_end_gcm_gmult_4bit
1713 .rva .LSEH_info_gcm_gmult_4bit
1714
1715 .rva .LSEH_begin_gcm_ghash_4bit
1716 .rva .LSEH_end_gcm_ghash_4bit
1717 .rva .LSEH_info_gcm_ghash_4bit
1718
1719 .rva .LSEH_begin_gcm_init_clmul
1720 .rva .LSEH_end_gcm_init_clmul
1721 .rva .LSEH_info_gcm_init_clmul
1722
1723 .rva .LSEH_begin_gcm_ghash_clmul
1724 .rva .LSEH_end_gcm_ghash_clmul
1725 .rva .LSEH_info_gcm_ghash_clmul
1726___
1727$code.=<<___ if ($avx);
1728 .rva .LSEH_begin_gcm_init_avx
1729 .rva .LSEH_end_gcm_init_avx
1730 .rva .LSEH_info_gcm_init_clmul
1731
1732 .rva .LSEH_begin_gcm_ghash_avx
1733 .rva .LSEH_end_gcm_ghash_avx
1734 .rva .LSEH_info_gcm_ghash_clmul
1735___
1736$code.=<<___;
1737.section .xdata
1738.align 8
1739.LSEH_info_gcm_gmult_4bit:
1740 .byte 9,0,0,0
1741 .rva se_handler
1742 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
1743.LSEH_info_gcm_ghash_4bit:
1744 .byte 9,0,0,0
1745 .rva se_handler
1746 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
1747.LSEH_info_gcm_init_clmul:
1748 .byte 0x01,0x08,0x03,0x00
1749 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
1750 .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
1751.LSEH_info_gcm_ghash_clmul:
1752 .byte 0x01,0x33,0x16,0x00
1753 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
1754 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
1755 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
1756 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
1757 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
1758 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
1759 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
1760 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
1761 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
1762 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
1763 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
1764___
1765}
1766
1767
1768$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1769
1770print $code;
1771
1772close STDOUT;