VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/modes/asm/ghashp8-ppc.pl@ 91772

Last change on this file since 91772 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

  • Property svn:executable set to *
File size: 14.5 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro\@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# GHASH for PowerISA v2.07.
18#
19# July 2014
20#
21# Accurate performance measurements are problematic, because it's
22# always virtualized setup with possibly throttled processor.
23# Relative comparison is therefore more informative. This initial
24# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
25# faster than "4-bit" integer-only compiler-generated 64-bit code.
26# "Initial version" means that there is room for further improvement.
27
28# May 2016
29#
30# 2x aggregated reduction improves performance by 50% (resulting
31# performance on POWER8 is 1 cycle per processed byte), and 4x
32# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
33# POWER9 delivers 0.51 cpb.
34
# Command line: <flavour> [<output>].  $flavour selects the 32-/64-bit ABI
# (and, via a trailing "le", endianness — see the output loop at the bottom);
# everything is piped through the ppc-xlate.pl perlasm translator.
$flavour=shift;
$output =shift;

# ABI-dependent mnemonics/constants: doubleword forms for 64-bit flavours,
# word forms for 32-bit ones.
if ($flavour =~ /64/) {
 $SIZE_T=8;
 $LRSAVE=2*$SIZE_T;	# link-register save slot offset in caller's frame
 $STU="stdu";
 $POP="ld";
 $PUSH="std";
 $UCMP="cmpld";
 $SHRI="srdi";
} elsif ($flavour =~ /32/) {
 $SIZE_T=4;
 $LRSAVE=$SIZE_T;
 $STU="stwu";
 $POP="lwz";
 $PUSH="stw";
 $UCMP="cmplw";
 $SHRI="srwi";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload

# Locate the translator next to this script, or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# BUGFIX: use low-precedence "or" here.  With "||" the test bound to the
# command string (always true), so a failed piped open went undetected.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
65
# Symbolic register names used throughout the generated assembly.
# GPRs r3..r6 carry the C-level arguments (Xi, Htable, inp, len).
my ($Xip,$Htbl,$inp,$len) = ("r3","r4","r5","r6");	# argument block

# Vector registers: accumulators/input, scratch + key halves + LE mask,
# and the second lane used by the 2x-aggregated path.
my ($Xl,$Xm,$Xh,$IN) = ("v0","v1","v2","v3");
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask) =
	("v4","v5","v6","v7","v8","v9","v10","v11","v12");
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l) =
	("v13","v14","v15","v16","v17","v18","v19");
my $vrsave = "r12";	# GPR used to save/restore the VRSAVE SPR
72
# Emit gcm_init_p8(Htable, H): build the key-dependent table at r3 from the
# raw hash key H loaded from r4.  The emitted code stores, at 16-byte
# offsets into Htable: the 0xc2... reduction constant (0x00), the "twisted"
# key split for vpmsumd as Hl/H/Hh (0x10-0x30), then squares the key and
# stores H^2 in the same split form (0x40-0x60); r8/r9/r10 are left
# pointing at 0x70/0x80/0x90 for the H^3/H^4 block that follows.
# NOTE: the heredoc body is the emitted assembly (a string literal) and is
# left untouched here.
$code=<<___;
.machine "any"

.text

.globl .gcm_init_p8
.align 5
.gcm_init_p8:
 li r0,-4096
 li r8,0x10
 mfspr $vrsave,256
 li r9,0x20
 mtspr 256,r0
 li r10,0x30
 lvx_u $H,0,r4 # load H

 vspltisb $xC2,-16 # 0xf0
 vspltisb $t0,1 # one
 vaddubm $xC2,$xC2,$xC2 # 0xe0
 vxor $zero,$zero,$zero
 vor $xC2,$xC2,$t0 # 0xe1
 vsldoi $xC2,$xC2,$zero,15 # 0xe1...
 vsldoi $t1,$zero,$t0,1 # ...1
 vaddubm $xC2,$xC2,$xC2 # 0xc2...
 vspltisb $t2,7
 vor $xC2,$xC2,$t1 # 0xc2....01
 vspltb $t1,$H,0 # most significant byte
 vsl $H,$H,$t0 # H<<=1
 vsrab $t1,$t1,$t2 # broadcast carry bit
 vand $t1,$t1,$xC2
 vxor $IN,$H,$t1 # twisted H

 vsldoi $H,$IN,$IN,8 # twist even more ...
 vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
 vsldoi $Hl,$zero,$H,8 # ... and split
 vsldoi $Hh,$H,$zero,8

 stvx_u $xC2,0,r3 # save pre-computed table
 stvx_u $Hl,r8,r3
 li r8,0x40
 stvx_u $H, r9,r3
 li r9,0x50
 stvx_u $Hh,r10,r3
 li r10,0x60

 vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
 vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
 vpmsumd $Xh,$IN,$Hh # H.hi·H.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh
 vxor $IN1,$Xl,$t1

 vsldoi $H2,$IN1,$IN1,8
 vsldoi $H2l,$zero,$H2,8
 vsldoi $H2h,$H2,$zero,8

 stvx_u $H2l,r8,r3 # save H^2
 li r8,0x70
 stvx_u $H2,r9,r3
 li r9,0x80
 stvx_u $H2h,r10,r3
 li r10,0x90
___
# Continuation of gcm_init_p8: compute H^3 = H·H^2 and H^4 = H^2·H^2 in one
# interleaved pass, store both in split (lo/full/hi) form at Htable offsets
# 0x70-0x90 and 0xa0-0xc0, restore VRSAVE and return.
{
# Reuse the (now-stored) key registers as extra temporaries for the second
# multiplication lane; scoped so the aliases don't leak further down.
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
 vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
 vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
 vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
 vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
 vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
 vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
 vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vsldoi $t4,$Xm1,$zero,8
 vsldoi $t5,$zero,$Xm1,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1
 vxor $Xl1,$Xl1,$t4
 vxor $Xh1,$Xh1,$t5

 vsldoi $Xl,$Xl,$Xl,8
 vsldoi $Xl1,$Xl1,$Xl1,8
 vxor $Xl,$Xl,$t2
 vxor $Xl1,$Xl1,$t6

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vpmsumd $Xl1,$Xl1,$xC2
 vxor $t1,$t1,$Xh
 vxor $t5,$t5,$Xh1
 vxor $Xl,$Xl,$t1
 vxor $Xl1,$Xl1,$t5

 vsldoi $H,$Xl,$Xl,8
 vsldoi $H2,$Xl1,$Xl1,8
 vsldoi $Hl,$zero,$H,8
 vsldoi $Hh,$H,$zero,8
 vsldoi $H2l,$zero,$H2,8
 vsldoi $H2h,$H2,$zero,8

 stvx_u $Hl,r8,r3 # save H^3
 li r8,0xa0
 stvx_u $H,r9,r3
 li r9,0xb0
 stvx_u $Hh,r10,r3
 li r10,0xc0
 stvx_u $H2l,r8,r3 # save H^4
 stvx_u $H2,r9,r3
 stvx_u $H2h,r10,r3

 mtspr 256,$vrsave
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,2,0
 .long 0
.size .gcm_init_p8,.-.gcm_init_p8
___
}
# Emit gcm_gmult_p8(Xi, Htable) — one GHASH multiplication of Xi by H — and
# gcm_ghash_p8(Xi, Htable, inp, len), which folds len bytes of input into Xi.
# gcm_ghash_p8 branches to the 4x-aggregated path (Lgcm_ghash_p8_4x, emitted
# by the next block) when len >= 64; otherwise it runs the 2x-aggregated
# Loop_2x with a single-block tail (Lshort/Leven).  "le?"-prefixed
# instructions survive only in little-endian builds; "be?" ones only in
# big-endian builds (resolved by the output loop at the bottom of the file).
$code.=<<___;
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
 lis r0,0xfff8
 li r8,0x10
 mfspr $vrsave,256
 li r9,0x20
 mtspr 256,r0
 li r10,0x30
 lvx_u $IN,0,$Xip # load Xi

 lvx_u $Hl,r8,$Htbl # load pre-computed table
 le?lvsl $lemask,r0,r0
 lvx_u $H, r9,$Htbl
 le?vspltisb $t0,0x07
 lvx_u $Hh,r10,$Htbl
 le?vxor $lemask,$lemask,$t0
 lvx_u $xC2,0,$Htbl
 le?vperm $IN,$IN,$IN,$lemask
 vxor $zero,$zero,$zero

 vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
 vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
 vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh
 vxor $Xl,$Xl,$t1

 le?vperm $Xl,$Xl,$Xl,$lemask
 stvx_u $Xl,0,$Xip # write out Xi

 mtspr 256,$vrsave
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,2,0
 .long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8

.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
 li r0,-4096
 li r8,0x10
 mfspr $vrsave,256
 li r9,0x20
 mtspr 256,r0
 li r10,0x30
 lvx_u $Xl,0,$Xip # load Xi

 lvx_u $Hl,r8,$Htbl # load pre-computed table
 li r8,0x40
 le?lvsl $lemask,r0,r0
 lvx_u $H, r9,$Htbl
 li r9,0x50
 le?vspltisb $t0,0x07
 lvx_u $Hh,r10,$Htbl
 li r10,0x60
 le?vxor $lemask,$lemask,$t0
 lvx_u $xC2,0,$Htbl
 le?vperm $Xl,$Xl,$Xl,$lemask
 vxor $zero,$zero,$zero

 ${UCMP}i $len,64
 bge Lgcm_ghash_p8_4x

 lvx_u $IN,0,$inp
 addi $inp,$inp,16
 subic. $len,$len,16
 le?vperm $IN,$IN,$IN,$lemask
 vxor $IN,$IN,$Xl
 beq Lshort

 lvx_u $H2l,r8,$Htbl # load H^2
 li r8,16
 lvx_u $H2, r9,$Htbl
 add r9,$inp,$len # end of input
 lvx_u $H2h,r10,$Htbl
 be?b Loop_2x

.align 5
Loop_2x:
 lvx_u $IN1,0,$inp
 le?vperm $IN1,$IN1,$IN1,$lemask

 subic $len,$len,32
 vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
 vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
 subfe r0,r0,r0 # borrow?-1:0
 vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
 vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
 and r0,r0,$len
 vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
 vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
 add $inp,$inp,r0

 vxor $Xl,$Xl,$Xl1
 vxor $Xm,$Xm,$Xm1

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xh,$Xh,$Xh1
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2
 lvx_u $IN,r8,$inp
 addi $inp,$inp,32

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 le?vperm $IN,$IN,$IN,$lemask
 vxor $t1,$t1,$Xh
 vxor $IN,$IN,$t1
 vxor $IN,$IN,$Xl
 $UCMP r9,$inp
 bgt Loop_2x # done yet?

 cmplwi $len,0
 bne Leven

Lshort:
 vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
 vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
 vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh

Leven:
 vxor $Xl,$Xl,$t1
 le?vperm $Xl,$Xl,$Xl,$lemask
 stvx_u $Xl,0,$Xip # write out Xi

 mtspr 256,$vrsave
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,4,0
 .long 0
___
# 4x-aggregated inner loop of gcm_ghash_p8 (entered when len >= 64).
# It needs v20-v31, which are call-saved, so it allocates a stack frame and
# offloads them first (restored at Ldone_4x).  H^2..H^4 come from the table
# prepared by gcm_init_p8; Lthree/Ltwo/Lone handle 1-3 leftover blocks
# before the final reduction in Ltail_4x.
{
# Extra scope-local vector registers for the 3rd/4th input lanes and the
# H^3/H^4 key powers.
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
 $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN; # alias: first input block reuses $IN's register
# $H21l/$H21h pack {H^2,H} halves so one vpmsumd covers two lanes;
# $loperm/$hiperm are the vperm patterns that build them (registers
# reused from the 1x/2x path, whose values are reloaded below).
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);

$code.=<<___;
.align 5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
 $STU $sp,-$FRAME($sp)
 li r10,`15+6*$SIZE_T`
 li r11,`31+6*$SIZE_T`
 stvx v20,r10,$sp
 addi r10,r10,32
 stvx v21,r11,$sp
 addi r11,r11,32
 stvx v22,r10,$sp
 addi r10,r10,32
 stvx v23,r11,$sp
 addi r11,r11,32
 stvx v24,r10,$sp
 addi r10,r10,32
 stvx v25,r11,$sp
 addi r11,r11,32
 stvx v26,r10,$sp
 addi r10,r10,32
 stvx v27,r11,$sp
 addi r11,r11,32
 stvx v28,r10,$sp
 addi r10,r10,32
 stvx v29,r11,$sp
 addi r11,r11,32
 stvx v30,r10,$sp
 li r10,0x60
 stvx v31,r11,$sp
 li r0,-1
 stw $vrsave,`$FRAME-4`($sp) # save vrsave
 mtspr 256,r0 # preserve all AltiVec registers

 lvsl $t0,0,r8 # 0x0001..0e0f
 #lvx_u $H2l,r8,$Htbl # load H^2
 li r8,0x70
 lvx_u $H2, r9,$Htbl
 li r9,0x80
 vspltisb $t1,8 # 0x0808..0808
 #lvx_u $H2h,r10,$Htbl
 li r10,0x90
 lvx_u $H3l,r8,$Htbl # load H^3
 li r8,0xa0
 lvx_u $H3, r9,$Htbl
 li r9,0xb0
 lvx_u $H3h,r10,$Htbl
 li r10,0xc0
 lvx_u $H4l,r8,$Htbl # load H^4
 li r8,0x10
 lvx_u $H4, r9,$Htbl
 li r9,0x20
 lvx_u $H4h,r10,$Htbl
 li r10,0x30

 vsldoi $t2,$zero,$t1,8 # 0x0000..0808
 vaddubm $hiperm,$t0,$t2 # 0x0001..1617
 vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f

 $SHRI $len,$len,4 # this allows to use sign bit
 # as carry
 lvx_u $IN0,0,$inp # load input
 lvx_u $IN1,r8,$inp
 subic. $len,$len,8
 lvx_u $IN2,r9,$inp
 lvx_u $IN3,r10,$inp
 addi $inp,$inp,0x40
 le?vperm $IN0,$IN0,$IN0,$lemask
 le?vperm $IN1,$IN1,$IN1,$lemask
 le?vperm $IN2,$IN2,$IN2,$lemask
 le?vperm $IN3,$IN3,$IN3,$lemask

 vxor $Xh,$IN0,$Xl

 vpmsumd $Xl1,$IN1,$H3l
 vpmsumd $Xm1,$IN1,$H3
 vpmsumd $Xh1,$IN1,$H3h

 vperm $H21l,$H2,$H,$hiperm
 vperm $t0,$IN2,$IN3,$loperm
 vperm $H21h,$H2,$H,$loperm
 vperm $t1,$IN2,$IN3,$hiperm
 vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
 vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
 vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
 vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

 vxor $Xm2,$Xm2,$Xm1
 vxor $Xl3,$Xl3,$Xl1
 vxor $Xm3,$Xm3,$Xm2
 vxor $Xh3,$Xh3,$Xh1

 blt Ltail_4x

Loop_4x:
 lvx_u $IN0,0,$inp
 lvx_u $IN1,r8,$inp
 subic. $len,$len,4
 lvx_u $IN2,r9,$inp
 lvx_u $IN3,r10,$inp
 addi $inp,$inp,0x40
 le?vperm $IN1,$IN1,$IN1,$lemask
 le?vperm $IN2,$IN2,$IN2,$lemask
 le?vperm $IN3,$IN3,$IN3,$lemask
 le?vperm $IN0,$IN0,$IN0,$lemask

 vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
 vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
 vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
 vpmsumd $Xl1,$IN1,$H3l
 vpmsumd $Xm1,$IN1,$H3
 vpmsumd $Xh1,$IN1,$H3h

 vxor $Xl,$Xl,$Xl3
 vxor $Xm,$Xm,$Xm3
 vxor $Xh,$Xh,$Xh3
 vperm $t0,$IN2,$IN3,$loperm
 vperm $t1,$IN2,$IN3,$hiperm

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
 vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
 vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
 vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
 vpmsumd $Xl,$Xl,$xC2

 vxor $Xl3,$Xl3,$Xl1
 vxor $Xh3,$Xh3,$Xh1
 vxor $Xh,$Xh,$IN0
 vxor $Xm2,$Xm2,$Xm1
 vxor $Xh,$Xh,$t1
 vxor $Xm3,$Xm3,$Xm2
 vxor $Xh,$Xh,$Xl
 bge Loop_4x

Ltail_4x:
 vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
 vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
 vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi

 vxor $Xl,$Xl,$Xl3
 vxor $Xm,$Xm,$Xm3

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xh,$Xh,$Xh3
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh
 vxor $Xl,$Xl,$t1

 addic. $len,$len,4
 beq Ldone_4x

 lvx_u $IN0,0,$inp
 ${UCMP}i $len,2
 li $len,-4
 blt Lone
 lvx_u $IN1,r8,$inp
 beq Ltwo

Lthree:
 lvx_u $IN2,r9,$inp
 le?vperm $IN0,$IN0,$IN0,$lemask
 le?vperm $IN1,$IN1,$IN1,$lemask
 le?vperm $IN2,$IN2,$IN2,$lemask

 vxor $Xh,$IN0,$Xl
 vmr $H4l,$H3l
 vmr $H4, $H3
 vmr $H4h,$H3h

 vperm $t0,$IN1,$IN2,$loperm
 vperm $t1,$IN1,$IN2,$hiperm
 vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
 vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
 vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
 vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

 vxor $Xm3,$Xm3,$Xm2
 b Ltail_4x

.align 4
Ltwo:
 le?vperm $IN0,$IN0,$IN0,$lemask
 le?vperm $IN1,$IN1,$IN1,$lemask

 vxor $Xh,$IN0,$Xl
 vperm $t0,$zero,$IN1,$loperm
 vperm $t1,$zero,$IN1,$hiperm

 vsldoi $H4l,$zero,$H2,8
 vmr $H4, $H2
 vsldoi $H4h,$H2,$zero,8

 vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
 vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
 vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi

 b Ltail_4x

.align 4
Lone:
 le?vperm $IN0,$IN0,$IN0,$lemask

 vsldoi $H4l,$zero,$H,8
 vmr $H4, $H
 vsldoi $H4h,$H,$zero,8

 vxor $Xh,$IN0,$Xl
 vxor $Xl3,$Xl3,$Xl3
 vxor $Xm3,$Xm3,$Xm3
 vxor $Xh3,$Xh3,$Xh3

 b Ltail_4x

Ldone_4x:
 le?vperm $Xl,$Xl,$Xl,$lemask
 stvx_u $Xl,0,$Xip # write out Xi

 li r10,`15+6*$SIZE_T`
 li r11,`31+6*$SIZE_T`
 mtspr 256,$vrsave
 lvx v20,r10,$sp
 addi r10,r10,32
 lvx v21,r11,$sp
 addi r11,r11,32
 lvx v22,r10,$sp
 addi r10,r10,32
 lvx v23,r11,$sp
 addi r11,r11,32
 lvx v24,r10,$sp
 addi r10,r10,32
 lvx v25,r11,$sp
 addi r11,r11,32
 lvx v26,r10,$sp
 addi r10,r10,32
 lvx v27,r11,$sp
 addi r11,r11,32
 lvx v28,r10,$sp
 addi r10,r10,32
 lvx v29,r11,$sp
 addi r11,r11,32
 lvx v30,r10,$sp
 lvx v31,r11,$sp
 addi $sp,$sp,$FRAME
 blr
 .long 0
 .byte 0,12,0x04,0,0x80,0,4,0
 .long 0
___
}
# Trailing directives: close out gcm_ghash_p8's .size and embed the
# identification string.
$code.=<<___;
.size .gcm_ghash_p8,.-.gcm_ghash_p8

.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
657
# Post-process the accumulated assembly line by line: evaluate `...`
# constant arithmetic, then resolve the endian-conditional "le?"/"be?"
# instruction prefixes for the selected flavour before printing.
for my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	if ($flavour =~ /le$/) {		# little-endian build
		$line =~ s/le\?//		or	# keep le? instructions
		$line =~ s/be\?/#be#/;			# comment out be? ones
	} else {				# big-endian build
		$line =~ s/le\?/#le#/		or
		$line =~ s/be\?//;
	}
	print $line,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette