#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because the
# available setup is always virtualized, with a possibly throttled
# processor. Relative comparison is therefore more informative. This
# initial version is ~2.1x slower than hardware-assisted AES-128-CTR
# and ~12x faster than "4-bit" integer-only compiler-generated 64-bit
# code. "Initial version" means that there is room for further
# improvement.

# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
# POWER9 delivers 0.51 cpb.
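#
# The aggregation trick works by amortizing the modular reduction over
# several multiplications: a plain Horner update folds one block per
# reduction, Xi = (Xi^Ii)*H, while 2x aggregation computes
# Xi = ((Xi^Ii)*H^2) ^ (Ii+1)*H, and 4x aggregation computes
# Xi = ((Xi^Ii)*H^4) ^ (Ii+1)*H^3 ^ (Ii+2)*H^2 ^ (Ii+3)*H,
# so one reduction serves 2 or 4 multiplication groups. This is why
# gcm_init_p8 below precomputes and stores the powers of H up to H^4.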

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
	$UCMP="cmpld";
	$SHRI="srdi";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
	$UCMP="cmplw";
	$SHRI="srwi";
} else { die "nonsense $flavour"; }
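
# For reference, a typical invocation (file names are hypothetical)
# for a 64-bit little-endian target would be
#
#	perl ghashp8-ppc.pl linux64le ghashp8-ppc.s
#
# where "linux64le" selects the 64-bit settings above and the trailing
# argument, having an extension, is taken as $output.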

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";

$code=<<___;
.machine	"any"

.text

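# The Htbl table written by gcm_init_p8 is laid out as can be read off
# the stores below: the reduction constant at offset 0x00, the
# H.lo|H|H.hi triplet at 0x10-0x30, and the same triplets for H^2 at
# 0x40-0x60, H^3 at 0x70-0x90 and H^4 at 0xa0-0xc0, all in the
# "twisted" doubleword-swapped representation set up below.
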
.globl	.gcm_init_p8
.align	5
.gcm_init_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H
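	# The sequence above amounts to multiplying H by x in the field
	# representation used here: H is shifted left by one bit, while
	# vspltb+vsrab broadcast its former top bit into an all-ones or
	# all-zero mask that conditionally folds the 0xc2...01 reduction
	# constant back in.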

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi
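
	# The three vpmsumd above implement one 128x128-bit carry-less
	# multiplication schoolbook-style on 64-bit halves: a low part,
	# a middle part holding both cross products, and a high part.
	# The two multiplications by the 0xc2... constant below fold the
	# 256-bit result back to 128 bits modulo the GHASH polynomial.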

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90
___
{
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	vpmsumd		$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	vpmsumd		$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	vpmsumd		$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	vpmsumd		$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vsldoi		$t4,$Xm1,$zero,8
	vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	vxor		$Xl1,$Xl1,$t4
	vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vpmsumd		$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	stvx_u		$H2l,r8,r3		# save H^4
	stvx_u		$H2,r9,r3
	stvx_u		$H2h,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8
___
}
$code.=<<___;
.globl	.gcm_gmult_p8
.align	5
.gcm_gmult_p8:
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
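	# On little-endian systems lvsl above yields the identity byte
	# permutation 0x00..0f, and xor-ing each index with 0x07 reverses
	# the byte order within each doubleword, so that the vperm with
	# this mask puts loaded data into the layout vpmsumd expects.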
	lvx_u		$xC2,0,$Htbl
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
.align	5
.gcm_ghash_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	li		r8,0x40
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	li		r9,0x50
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	li		r10,0x60
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	${UCMP}i	$len,64
	bge		Lgcm_ghash_p8_4x

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subic.		$len,$len,16
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	beq		Lshort

	lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,16
	lvx_u		$H2, r9,$Htbl
	add		r9,$inp,$len		# end of input
	lvx_u		$H2h,r10,$Htbl
	be?b		Loop_2x

.align	5
Loop_2x:
	lvx_u		$IN1,0,$inp
	le?vperm	$IN1,$IN1,$IN1,$lemask

	subic		$len,$len,32
	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
	vpmsumd		$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
	subfe		r0,r0,r0		# borrow?-1:0
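	# subic set the carry bit, and subfe r0,r0,r0 turns it into an
	# all-ones mask if the subtraction borrowed, i.e. if fewer than
	# 32 bytes remained, and into zero otherwise; masked with the
	# remaining length and added to the input pointer below, it
	# nudges the pointer back branchlessly so that the lookahead
	# load never runs past the end of the input.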
	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
	vpmsumd		$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	and		r0,r0,$len
	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
	vpmsumd		$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
	add		$inp,$inp,r0

	vxor		$Xl,$Xl,$Xl1
	vxor		$Xm,$Xm,$Xm1

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xh,$Xh,$Xh1
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	lvx_u		$IN,r8,$inp
	addi		$inp,$inp,32

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	$UCMP		r9,$inp
	bgt		Loop_2x			# done yet?

	cmplwi		$len,0
	bne		Leven

Lshort:
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh

Leven:
	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
___
{
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN;
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);

$code.=<<___;
.align	5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
	$STU		$sp,-$FRAME($sp)
	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	stvx		v20,r10,$sp
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	li		r10,0x60
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
	mtspr		256,r0			# preserve all AltiVec registers

	lvsl		$t0,0,r8		# 0x0001..0e0f
	#lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,0x70
	lvx_u		$H2, r9,$Htbl
	li		r9,0x80
	vspltisb	$t1,8			# 0x0808..0808
	#lvx_u		$H2h,r10,$Htbl
	li		r10,0x90
	lvx_u		$H3l,r8,$Htbl		# load H^3
	li		r8,0xa0
	lvx_u		$H3, r9,$Htbl
	li		r9,0xb0
	lvx_u		$H3h,r10,$Htbl
	li		r10,0xc0
	lvx_u		$H4l,r8,$Htbl		# load H^4
	li		r8,0x10
	lvx_u		$H4, r9,$Htbl
	li		r9,0x20
	lvx_u		$H4h,r10,$Htbl
	li		r10,0x30

	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f
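	# The hiperm mask gathers the two high doublewords of its source
	# pair and loperm the two low ones, so a single vperm packs the
	# matching halves of H^2 and H (or of two input blocks) into one
	# register, and one vpmsumd then computes and xors two 64x64-bit
	# products at once, exactly the accumulation 4x aggregation needs.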

	$SHRI		$len,$len,4		# so that the sign bit can
						# be used as carry
	lvx_u		$IN0,0,$inp		# load input
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,8
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask

	vxor		$Xh,$IN0,$Xl

	vpmsumd		$Xl1,$IN1,$H3l
	vpmsumd		$Xm1,$IN1,$H3
	vpmsumd		$Xh1,$IN1,$H3h

	vperm		$H21l,$H2,$H,$hiperm
	vperm		$t0,$IN2,$IN3,$loperm
	vperm		$H21h,$H2,$H,$loperm
	vperm		$t1,$IN2,$IN3,$hiperm
	vpmsumd		$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	vpmsumd		$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	vxor		$Xm2,$Xm2,$Xm1
	vxor		$Xl3,$Xl3,$Xl1
	vxor		$Xm3,$Xm3,$Xm2
	vxor		$Xh3,$Xh3,$Xh1

	blt		Ltail_4x

Loop_4x:
	lvx_u		$IN0,0,$inp
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,4
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
	vpmsumd		$Xl1,$IN1,$H3l
	vpmsumd		$Xm1,$IN1,$H3
	vpmsumd		$Xh1,$IN1,$H3h

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3
	vxor		$Xh,$Xh,$Xh3
	vperm		$t0,$IN2,$IN3,$loperm
	vperm		$t1,$IN2,$IN3,$hiperm

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	vpmsumd		$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
	vpmsumd		$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	vpmsumd		$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
	vpmsumd		$Xl,$Xl,$xC2

	vxor		$Xl3,$Xl3,$Xl1
	vxor		$Xh3,$Xh3,$Xh1
	vxor		$Xh,$Xh,$IN0
	vxor		$Xm2,$Xm2,$Xm1
	vxor		$Xh,$Xh,$t1
	vxor		$Xm3,$Xm3,$Xm2
	vxor		$Xh,$Xh,$Xl
	bge		Loop_4x

Ltail_4x:
	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xh,$Xh,$Xh3
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	addic.		$len,$len,4
	beq		Ldone_4x

	lvx_u		$IN0,0,$inp
	${UCMP}i	$len,2
	li		$len,-4
	blt		Lone
	lvx_u		$IN1,r8,$inp
	beq		Ltwo

Lthree:
	lvx_u		$IN2,r9,$inp
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask

	vxor		$Xh,$IN0,$Xl
	vmr		$H4l,$H3l
	vmr		$H4, $H3
	vmr		$H4h,$H3h

	vperm		$t0,$IN1,$IN2,$loperm
	vperm		$t1,$IN1,$IN2,$hiperm
	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	vxor		$Xm3,$Xm3,$Xm2
	b		Ltail_4x

.align	4
Ltwo:
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask

	vxor		$Xh,$IN0,$Xl
	vperm		$t0,$zero,$IN1,$loperm
	vperm		$t1,$zero,$IN1,$hiperm

	vsldoi		$H4l,$zero,$H2,8
	vmr		$H4, $H2
	vsldoi		$H4h,$H2,$zero,8

	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi

	b		Ltail_4x

.align	4
Lone:
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vsldoi		$H4l,$zero,$H,8
	vmr		$H4, $H
	vsldoi		$H4h,$H,$zero,8

	vxor		$Xh,$IN0,$Xl
	vxor		$Xl3,$Xl3,$Xl3
	vxor		$Xm3,$Xm3,$Xm3
	vxor		$Xh3,$Xh3,$Xh3

	b		Ltail_4x

Ldone_4x:
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	mtspr		256,$vrsave
	lvx		v20,r10,$sp
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,0,4,0
	.long		0
___
}
$code.=<<___;
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz	"GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o;
	} else {
	    s/le\?/#le#/o	or
	    s/be\?//o;
	}
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush