VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1l/crypto/modes/asm/ghashp8-ppc.pl@ 91772

Last change on this file since 91772 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

  • Property svn:executable set to *
File size: 14.5 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro\@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# GHASH for PowerISA v2.07.
18#
19# July 2014
20#
21# Accurate performance measurements are problematic, because it's
22# always virtualized setup with possibly throttled processor.
23# Relative comparison is therefore more informative. This initial
24# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
25# faster than "4-bit" integer-only compiler-generated 64-bit code.
26# "Initial version" means that there is room for further improvement.
27
28# May 2016
29#
30# 2x aggregated reduction improves performance by 50% (resulting
31# performance on POWER8 is 1 cycle per processed byte), and 4x
32# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
33# POWER9 delivers 0.51 cpb.
34
# Command line: <flavour> [<output>].  $flavour selects the 32-/64-bit ABI
# (and, via a trailing "le", endianness — see the output loop at the bottom);
# everything is piped through the ppc-xlate.pl perlasm translator.
$flavour=shift;
$output =shift;

# ABI-dependent mnemonics/constants: doubleword forms for 64-bit flavours,
# word forms for 32-bit ones.
if ($flavour =~ /64/) {
 $SIZE_T=8;
 $LRSAVE=2*$SIZE_T;	# link-register save slot offset in caller's frame
 $STU="stdu";
 $POP="ld";
 $PUSH="std";
 $UCMP="cmpld";
 $SHRI="srdi";
} elsif ($flavour =~ /32/) {
 $SIZE_T=4;
 $LRSAVE=$SIZE_T;
 $STU="stwu";
 $POP="lwz";
 $PUSH="stw";
 $UCMP="cmplw";
 $SHRI="srwi";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload

# Locate the translator next to this script, or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# BUGFIX: use low-precedence "or" here.  With "||" the test bound to the
# command string (always true), so a failed piped open went undetected.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
65
# Symbolic register names used throughout the generated assembly.
# GPRs r3..r6 carry the C-level arguments (Xi, Htable, inp, len).
my ($Xip,$Htbl,$inp,$len) = ("r3","r4","r5","r6");	# argument block

# Vector registers: accumulators/input, scratch + key halves + LE mask,
# and the second lane used by the 2x-aggregated path.
my ($Xl,$Xm,$Xh,$IN) = ("v0","v1","v2","v3");
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask) =
	("v4","v5","v6","v7","v8","v9","v10","v11","v12");
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l) =
	("v13","v14","v15","v16","v17","v18","v19");
my $vrsave = "r12";	# GPR used to save/restore the VRSAVE SPR
72
# Emit gcm_init_p8(Htable, H): build the key-dependent table at r3 from the
# raw hash key H loaded from r4.  The emitted code stores, at 16-byte
# offsets into Htable: the 0xc2... reduction constant (0x00), the "twisted"
# key split for vpmsumd as Hl/H/Hh (0x10-0x30), then squares the key and
# stores H^2 in the same split form (0x40-0x60); r8/r9/r10 are left
# pointing at 0x70/0x80/0x90 for the H^3/H^4 block that follows.
# NOTE: the heredoc body is the emitted assembly (a string literal) and is
# left untouched here.
$code=<<___;
.machine "any"

.text

.globl .gcm_init_p8
.align 5
.gcm_init_p8:
 li r0,-4096
 li r8,0x10
 mfspr $vrsave,256
 li r9,0x20
 mtspr 256,r0
 li r10,0x30
 lvx_u $H,0,r4 # load H

 vspltisb $xC2,-16 # 0xf0
 vspltisb $t0,1 # one
 vaddubm $xC2,$xC2,$xC2 # 0xe0
 vxor $zero,$zero,$zero
 vor $xC2,$xC2,$t0 # 0xe1
 vsldoi $xC2,$xC2,$zero,15 # 0xe1...
 vsldoi $t1,$zero,$t0,1 # ...1
 vaddubm $xC2,$xC2,$xC2 # 0xc2...
 vspltisb $t2,7
 vor $xC2,$xC2,$t1 # 0xc2....01
 vspltb $t1,$H,0 # most significant byte
 vsl $H,$H,$t0 # H<<=1
 vsrab $t1,$t1,$t2 # broadcast carry bit
 vand $t1,$t1,$xC2
 vxor $IN,$H,$t1 # twisted H

 vsldoi $H,$IN,$IN,8 # twist even more ...
 vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
 vsldoi $Hl,$zero,$H,8 # ... and split
 vsldoi $Hh,$H,$zero,8

 stvx_u $xC2,0,r3 # save pre-computed table
 stvx_u $Hl,r8,r3
 li r8,0x40
 stvx_u $H, r9,r3
 li r9,0x50
 stvx_u $Hh,r10,r3
 li r10,0x60

 vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
 vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
 vpmsumd $Xh,$IN,$Hh # H.hi·H.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh
 vxor $IN1,$Xl,$t1

 vsldoi $H2,$IN1,$IN1,8
 vsldoi $H2l,$zero,$H2,8
 vsldoi $H2h,$H2,$zero,8

 stvx_u $H2l,r8,r3 # save H^2
 li r8,0x70
 stvx_u $H2,r9,r3
 li r9,0x80
 stvx_u $H2h,r10,r3
 li r10,0x90
___
# Continuation of gcm_init_p8: compute H^3 = H·H^2 and H^4 = H^2·H^2 in one
# interleaved pass, store both in split (lo/full/hi) form at Htable offsets
# 0x70-0x90 and 0xa0-0xc0, restore VRSAVE and return.
{
# Reuse the (now-stored) key registers as extra temporaries for the second
# multiplication lane; scoped so the aliases don't leak further down.
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
 vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
 vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
 vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
 vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
 vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
 vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
 vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vsldoi $t4,$Xm1,$zero,8
 vsldoi $t5,$zero,$Xm1,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1
 vxor $Xl1,$Xl1,$t4
 vxor $Xh1,$Xh1,$t5

 vsldoi $Xl,$Xl,$Xl,8
 vsldoi $Xl1,$Xl1,$Xl1,8
 vxor $Xl,$Xl,$t2
 vxor $Xl1,$Xl1,$t6

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vpmsumd $Xl1,$Xl1,$xC2
 vxor $t1,$t1,$Xh
 vxor $t5,$t5,$Xh1
 vxor $Xl,$Xl,$t1
 vxor $Xl1,$Xl1,$t5

 vsldoi $H,$Xl,$Xl,8
 vsldoi $H2,$Xl1,$Xl1,8
 vsldoi $Hl,$zero,$H,8
 vsldoi $Hh,$H,$zero,8
 vsldoi $H2l,$zero,$H2,8
 vsldoi $H2h,$H2,$zero,8

 stvx_u $Hl,r8,r3 # save H^3
 li r8,0xa0
 stvx_u $H,r9,r3
 li r9,0xb0
 stvx_u $Hh,r10,r3
 li r10,0xc0
 stvx_u $H2l,r8,r3 # save H^4
 stvx_u $H2,r9,r3
 stvx_u $H2h,r10,r3

 mtspr 256,$vrsave
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,2,0
 .long 0
.size .gcm_init_p8,.-.gcm_init_p8
___
}
# Emit gcm_gmult_p8(Xi, Htable) — one GHASH multiplication of Xi by H — and
# gcm_ghash_p8(Xi, Htable, inp, len), which folds len bytes of input into Xi.
# gcm_ghash_p8 branches to the 4x-aggregated path (Lgcm_ghash_p8_4x, emitted
# by the next block) when len >= 64; otherwise it runs the 2x-aggregated
# Loop_2x with a single-block tail (Lshort/Leven).  "le?"-prefixed
# instructions survive only in little-endian builds; "be?" ones only in
# big-endian builds (resolved by the output loop at the bottom of the file).
$code.=<<___;
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
 lis r0,0xfff8
 li r8,0x10
 mfspr $vrsave,256
 li r9,0x20
 mtspr 256,r0
 li r10,0x30
 lvx_u $IN,0,$Xip # load Xi

 lvx_u $Hl,r8,$Htbl # load pre-computed table
 le?lvsl $lemask,r0,r0
 lvx_u $H, r9,$Htbl
 le?vspltisb $t0,0x07
 lvx_u $Hh,r10,$Htbl
 le?vxor $lemask,$lemask,$t0
 lvx_u $xC2,0,$Htbl
 le?vperm $IN,$IN,$IN,$lemask
 vxor $zero,$zero,$zero

 vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
 vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
 vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh
 vxor $Xl,$Xl,$t1

 le?vperm $Xl,$Xl,$Xl,$lemask
 stvx_u $Xl,0,$Xip # write out Xi

 mtspr 256,$vrsave
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,2,0
 .long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8

.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
 li r0,-4096
 li r8,0x10
 mfspr $vrsave,256
 li r9,0x20
 mtspr 256,r0
 li r10,0x30
 lvx_u $Xl,0,$Xip # load Xi

 lvx_u $Hl,r8,$Htbl # load pre-computed table
 li r8,0x40
 le?lvsl $lemask,r0,r0
 lvx_u $H, r9,$Htbl
 li r9,0x50
 le?vspltisb $t0,0x07
 lvx_u $Hh,r10,$Htbl
 li r10,0x60
 le?vxor $lemask,$lemask,$t0
 lvx_u $xC2,0,$Htbl
 le?vperm $Xl,$Xl,$Xl,$lemask
 vxor $zero,$zero,$zero

 ${UCMP}i $len,64
 bge Lgcm_ghash_p8_4x

 lvx_u $IN,0,$inp
 addi $inp,$inp,16
 subic. $len,$len,16
 le?vperm $IN,$IN,$IN,$lemask
 vxor $IN,$IN,$Xl
 beq Lshort

 lvx_u $H2l,r8,$Htbl # load H^2
 li r8,16
 lvx_u $H2, r9,$Htbl
 add r9,$inp,$len # end of input
 lvx_u $H2h,r10,$Htbl
 be?b Loop_2x

.align 5
Loop_2x:
 lvx_u $IN1,0,$inp
 le?vperm $IN1,$IN1,$IN1,$lemask

 subic $len,$len,32
 vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
 vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
 subfe r0,r0,r0 # borrow?-1:0
 vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
 vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
 and r0,r0,$len
 vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
 vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
 add $inp,$inp,r0

 vxor $Xl,$Xl,$Xl1
 vxor $Xm,$Xm,$Xm1

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xh,$Xh,$Xh1
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2
 lvx_u $IN,r8,$inp
 addi $inp,$inp,32

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 le?vperm $IN,$IN,$IN,$lemask
 vxor $t1,$t1,$Xh
 vxor $IN,$IN,$t1
 vxor $IN,$IN,$Xl
 $UCMP r9,$inp
 bgt Loop_2x # done yet?

 cmplwi $len,0
 bne Leven

Lshort:
 vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
 vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
 vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh

Leven:
 vxor $Xl,$Xl,$t1
 le?vperm $Xl,$Xl,$Xl,$lemask
 stvx_u $Xl,0,$Xip # write out Xi

 mtspr 256,$vrsave
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,4,0
 .long 0
___
# 4x-aggregated inner loop of gcm_ghash_p8 (entered when len >= 64).
# It needs v20-v31, which are call-saved, so it allocates a stack frame and
# offloads them first (restored at Ldone_4x).  H^2..H^4 come from the table
# prepared by gcm_init_p8; Lthree/Ltwo/Lone handle 1-3 leftover blocks
# before the final reduction in Ltail_4x.
{
# Extra scope-local vector registers for the 3rd/4th input lanes and the
# H^3/H^4 key powers.
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
 $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN; # alias: first input block reuses $IN's register
# $H21l/$H21h pack {H^2,H} halves so one vpmsumd covers two lanes;
# $loperm/$hiperm are the vperm patterns that build them (registers
# reused from the 1x/2x path, whose values are reloaded below).
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);

$code.=<<___;
.align 5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
 $STU $sp,-$FRAME($sp)
 li r10,`15+6*$SIZE_T`
 li r11,`31+6*$SIZE_T`
 stvx v20,r10,$sp
 addi r10,r10,32
 stvx v21,r11,$sp
 addi r11,r11,32
 stvx v22,r10,$sp
 addi r10,r10,32
 stvx v23,r11,$sp
 addi r11,r11,32
 stvx v24,r10,$sp
 addi r10,r10,32
 stvx v25,r11,$sp
 addi r11,r11,32
 stvx v26,r10,$sp
 addi r10,r10,32
 stvx v27,r11,$sp
 addi r11,r11,32
 stvx v28,r10,$sp
 addi r10,r10,32
 stvx v29,r11,$sp
 addi r11,r11,32
 stvx v30,r10,$sp
 li r10,0x60
 stvx v31,r11,$sp
 li r0,-1
 stw $vrsave,`$FRAME-4`($sp) # save vrsave
 mtspr 256,r0 # preserve all AltiVec registers

 lvsl $t0,0,r8 # 0x0001..0e0f
 #lvx_u $H2l,r8,$Htbl # load H^2
 li r8,0x70
 lvx_u $H2, r9,$Htbl
 li r9,0x80
 vspltisb $t1,8 # 0x0808..0808
 #lvx_u $H2h,r10,$Htbl
 li r10,0x90
 lvx_u $H3l,r8,$Htbl # load H^3
 li r8,0xa0
 lvx_u $H3, r9,$Htbl
 li r9,0xb0
 lvx_u $H3h,r10,$Htbl
 li r10,0xc0
 lvx_u $H4l,r8,$Htbl # load H^4
 li r8,0x10
 lvx_u $H4, r9,$Htbl
 li r9,0x20
 lvx_u $H4h,r10,$Htbl
 li r10,0x30

 vsldoi $t2,$zero,$t1,8 # 0x0000..0808
 vaddubm $hiperm,$t0,$t2 # 0x0001..1617
 vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f

 $SHRI $len,$len,4 # this allows to use sign bit
 # as carry
 lvx_u $IN0,0,$inp # load input
 lvx_u $IN1,r8,$inp
 subic. $len,$len,8
 lvx_u $IN2,r9,$inp
 lvx_u $IN3,r10,$inp
 addi $inp,$inp,0x40
 le?vperm $IN0,$IN0,$IN0,$lemask
 le?vperm $IN1,$IN1,$IN1,$lemask
 le?vperm $IN2,$IN2,$IN2,$lemask
 le?vperm $IN3,$IN3,$IN3,$lemask

 vxor $Xh,$IN0,$Xl

 vpmsumd $Xl1,$IN1,$H3l
 vpmsumd $Xm1,$IN1,$H3
 vpmsumd $Xh1,$IN1,$H3h

 vperm $H21l,$H2,$H,$hiperm
 vperm $t0,$IN2,$IN3,$loperm
 vperm $H21h,$H2,$H,$loperm
 vperm $t1,$IN2,$IN3,$hiperm
 vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
 vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
 vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
 vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

 vxor $Xm2,$Xm2,$Xm1
 vxor $Xl3,$Xl3,$Xl1
 vxor $Xm3,$Xm3,$Xm2
 vxor $Xh3,$Xh3,$Xh1

 blt Ltail_4x

Loop_4x:
 lvx_u $IN0,0,$inp
 lvx_u $IN1,r8,$inp
 subic. $len,$len,4
 lvx_u $IN2,r9,$inp
 lvx_u $IN3,r10,$inp
 addi $inp,$inp,0x40
 le?vperm $IN1,$IN1,$IN1,$lemask
 le?vperm $IN2,$IN2,$IN2,$lemask
 le?vperm $IN3,$IN3,$IN3,$lemask
 le?vperm $IN0,$IN0,$IN0,$lemask

 vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
 vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
 vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
 vpmsumd $Xl1,$IN1,$H3l
 vpmsumd $Xm1,$IN1,$H3
 vpmsumd $Xh1,$IN1,$H3h

 vxor $Xl,$Xl,$Xl3
 vxor $Xm,$Xm,$Xm3
 vxor $Xh,$Xh,$Xh3
 vperm $t0,$IN2,$IN3,$loperm
 vperm $t1,$IN2,$IN3,$hiperm

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
 vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
 vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
 vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
 vpmsumd $Xl,$Xl,$xC2

 vxor $Xl3,$Xl3,$Xl1
 vxor $Xh3,$Xh3,$Xh1
 vxor $Xh,$Xh,$IN0
 vxor $Xm2,$Xm2,$Xm1
 vxor $Xh,$Xh,$t1
 vxor $Xm3,$Xm3,$Xm2
 vxor $Xh,$Xh,$Xl
 bge Loop_4x

Ltail_4x:
 vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
 vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
 vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi

 vxor $Xl,$Xl,$Xl3
 vxor $Xm,$Xm,$Xm3

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xh,$Xh,$Xh3
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh
 vxor $Xl,$Xl,$t1

 addic. $len,$len,4
 beq Ldone_4x

 lvx_u $IN0,0,$inp
 ${UCMP}i $len,2
 li $len,-4
 blt Lone
 lvx_u $IN1,r8,$inp
 beq Ltwo

Lthree:
 lvx_u $IN2,r9,$inp
 le?vperm $IN0,$IN0,$IN0,$lemask
 le?vperm $IN1,$IN1,$IN1,$lemask
 le?vperm $IN2,$IN2,$IN2,$lemask

 vxor $Xh,$IN0,$Xl
 vmr $H4l,$H3l
 vmr $H4, $H3
 vmr $H4h,$H3h

 vperm $t0,$IN1,$IN2,$loperm
 vperm $t1,$IN1,$IN2,$hiperm
 vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
 vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
 vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
 vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

 vxor $Xm3,$Xm3,$Xm2
 b Ltail_4x

.align 4
Ltwo:
 le?vperm $IN0,$IN0,$IN0,$lemask
 le?vperm $IN1,$IN1,$IN1,$lemask

 vxor $Xh,$IN0,$Xl
 vperm $t0,$zero,$IN1,$loperm
 vperm $t1,$zero,$IN1,$hiperm

 vsldoi $H4l,$zero,$H2,8
 vmr $H4, $H2
 vsldoi $H4h,$H2,$zero,8

 vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
 vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
 vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi

 b Ltail_4x

.align 4
Lone:
 le?vperm $IN0,$IN0,$IN0,$lemask

 vsldoi $H4l,$zero,$H,8
 vmr $H4, $H
 vsldoi $H4h,$H,$zero,8

 vxor $Xh,$IN0,$Xl
 vxor $Xl3,$Xl3,$Xl3
 vxor $Xm3,$Xm3,$Xm3
 vxor $Xh3,$Xh3,$Xh3

 b Ltail_4x

Ldone_4x:
 le?vperm $Xl,$Xl,$Xl,$lemask
 stvx_u $Xl,0,$Xip # write out Xi

 li r10,`15+6*$SIZE_T`
 li r11,`31+6*$SIZE_T`
 mtspr 256,$vrsave
 lvx v20,r10,$sp
 addi r10,r10,32
 lvx v21,r11,$sp
 addi r11,r11,32
 lvx v22,r10,$sp
 addi r10,r10,32
 lvx v23,r11,$sp
 addi r11,r11,32
 lvx v24,r10,$sp
 addi r10,r10,32
 lvx v25,r11,$sp
 addi r11,r11,32
 lvx v26,r10,$sp
 addi r10,r10,32
 lvx v27,r11,$sp
 addi r11,r11,32
 lvx v28,r10,$sp
 addi r10,r10,32
 lvx v29,r11,$sp
 addi r11,r11,32
 lvx v30,r10,$sp
 lvx v31,r11,$sp
 addi $sp,$sp,$FRAME
 blr
 .long 0
 .byte 0,12,0x04,0,0x80,0,4,0
 .long 0
___
}
# Trailing directives: close out gcm_ghash_p8's .size and embed the
# identification string.
$code.=<<___;
.size .gcm_ghash_p8,.-.gcm_ghash_p8

.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
657
# Post-process the accumulated assembly line by line: evaluate `...`
# constant arithmetic, then resolve the endian-conditional "le?"/"be?"
# instruction prefixes for the selected flavour before printing.
for my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	if ($flavour =~ /le$/) {		# little-endian build
		$line =~ s/le\?//		or	# keep le? instructions
		$line =~ s/be\?/#be#/;			# comment out be? ones
	} else {				# big-endian build
		$line =~ s/le\?/#le#/		or
		$line =~ s/be\?//;
	}
	print $line,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette