VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/sha/asm/sha512p8-ppc.pl@ 69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago

Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified.
bugref:8070: src/libs maintenance

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
File size: 11.7 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# SHA256/512 for PowerISA v2.07.
18#
19# Accurate performance measurements are problematic, because it's
20# always virtualized setup with possibly throttled processor.
21# Relative comparison is therefore more informative. This module is
22# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
23# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
24# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
25# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
26# result is degree of computational resources' utilization. POWER8 is
27# "massively multi-threaded chip" and difference between single- and
28# maximum multi-process benchmark results tells that utilization is
29# a whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
30# for sha1-ppc.pl - 73%. 100% means that multi-process result equals
31# to single-process one, given that all threads end up on the same
32# physical core.
33
# The two leading command-line arguments: the target "flavour" string and
# the output file name. Both are passed on to ppc-xlate.pl further down.
34$flavour=shift;
35$output =shift;
36
# Select pointer-size dependent parameters from the flavour string:
#   $SIZE_T  - size of a GPR save slot in bytes (8 or 4)
#   $LRSAVE  - offset used for the link-register save slot
#   $STU     - "store with update" mnemonic used to allocate the stack frame
#   $POP/$PUSH - GPR load/store mnemonics
# A flavour containing neither "64" nor "32" aborts the script.
37if ($flavour =~ /64/) {
38 $SIZE_T=8;
39 $LRSAVE=2*$SIZE_T;
40 $STU="stdu";
41 $POP="ld";
42 $PUSH="std";
43} elsif ($flavour =~ /32/) {
44 $SIZE_T=4;
45 $LRSAVE=$SIZE_T;
46 $STU="stwu";
47 $POP="lwz";
48 $PUSH="stw";
49} else { die "nonsense $flavour"; }
50
# True when the flavour string contains "le"; enables the byte-swap mask
# setup and the swapped permutation tables emitted later.
51$LENDIAN=($flavour=~/le/);
52
# Take the directory part of this script's own path ($0) and look for the
# ppc-xlate.pl translator next to it, falling back to ../../perlasm/.
# Abort if neither candidate is a plain file.
53$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
55( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
56die "can't locate ppc-xlate.pl";
57
# Redirect STDOUT into the translator, run with the same perl binary ($^X).
# The failure check must use low-precedence "or": with "||" the die was bound
# to the (always true) command string, so a failed open was silently ignored.
58open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
59
# Choose the hash variant from the output file name: a name containing "512"
# selects SHA-512 (8-byte words, "d" instruction suffix, 80 rounds);
# anything else selects SHA-256 (4-byte words, "w" suffix, 64 rounds).
60if ($output =~ /512/) {
61 $bits=512;
62 $SZ=8;
63 $sz="d";
64 $rounds=80;
65} else {
66 $bits=256;
67 $SZ=4;
68 $sz="w";
69 $rounds=64;
70}
71
# Name of the emitted function and size of the fixed part of its stack frame.
72$func="sha${bits}_block_p8";
73$FRAME=8*$SIZE_T;
74
# General-purpose register assignment used throughout the generated code.
# $ctx/$inp/$num are r3/r4/r5 (state pointer, input pointer, block count as
# used by the loop below); $Tbl points at the constant table, $idx indexes it.
75$sp ="r1";
76$toc="r2";
77$ctx="r3";
78$inp="r4";
79$num="r5";
80$Tbl="r6";
81$idx="r7";
82$lrsave="r8";
83$offload="r11";
84$vrsave="r12";
# Index registers preloaded with the constants 0x10..0x70 in the prologue;
# they map to r0, r10 and r26..r31.
85($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
# On "osx" flavours a literal 0 is emitted instead of the name r0.
# NOTE(review): presumably because r0 in the index slot of lvx/stvx reads as
# zero and that assembler wants it spelled numerically - confirm.
86 $x00=0 if ($flavour =~ /osx/);
87
# Vector register assignment: v0..v7 hold the working variables a..h,
# v8..v23 the 16-entry message schedule window, v24.. the temporaries.
# Only seven names are bound from the eight-element list, so v31 gets no name.
88@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
89@X=map("v$_",(8..23));
90($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
91
# ROUND($i, $a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends to $code the instructions for round number $i, using the given
# vector register names as the working variables a..h. The caller rotates
# the register list between calls instead of moving data between registers.
#
# Backtick-quoted fragments in the text are Perl expressions that are
# evaluated by the final s/`...`/eval/ pass over $code; the ones guarded by
# "if ($i>=15)" evaluate to an empty string for earlier rounds.
92sub ROUND {
93my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# Schedule slot that is updated during this round for use by the next one.
94my $j=($i+1)%16;
95
# A 16-byte vector holds 16/$SZ message words. On the last word of each
# vector (first 15 rounds only) load the next 16 input bytes ahead of time.
96$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
97 lvx_u @X[$i+1],0,$inp ; load X[i] in advance
98 addi $inp,$inp,16
99___
# For words that are not the first in their vector, derive X[i] from X[i-1]
# with a vsldoi by $SZ bytes.
100$code.=<<___ if ($i<16 && ($i%(16/$SZ)));
101 vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ
102___
# Little-endian flavours permute each freshly loaded vector through $lemask.
103$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
104 vperm @X[$i],@X[$i],@X[$i],$lemask
105___
# Round body. $Ki already holds the next round's constant, so it is added to
# $g, which becomes h after the caller rotates the register list. From round
# 15 on, the schedule update X[j] += X[j+9] + $s0 + $s1 is interleaved, with
# $s0/$s1 computed by vshasigma from X[j+1] and X[j+14].
106$code.=<<___;
107 `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
108 vsel $Func,$g,$f,$e ; Ch(e,f,g)
109 vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
110 vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
111 vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
112 `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
113 vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
114 vxor $Func,$a,$b
115 `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
116 vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
117 vsel $Func,$b,$c,$Func ; Maj(a,b,c)
118 vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
119 vaddu${sz}m $d,$d,$h ; d+=h
120 vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
121 `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
122 lvx $Ki,$idx,$Tbl ; load next K[i]
123 addi $idx,$idx,16
124 vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
125 `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
126___
127}
128
# Function prologue. Allocates a frame of $FRAME+21*16+6*$SIZE_T bytes and
# saves the registers the body uses:
#   - v20..v31 with stvx, starting at offset $FRAME+8*16+15 and alternating
#     between the r10 and r11 pointers (each stepped by 32);
#   - the old VRSAVE value (SPR 256) at $FRAME+21*16-4;
#   - r26..r31 at $FRAME+21*16+k*$SIZE_T;
#   - the link register at frame size + $LRSAVE.
# The loads of 0x10..0x70 into $x10..$x70 are interleaved with the GPR saves.
# SPR 256 is then set to -1, LPICmeup is called to obtain the constant-table
# address in $Tbl, and $offload is pointed at $sp+$FRAME+15, the area where
# the loop parks the eight state vectors. r11 is used as a scratch pointer
# here before it takes on its $offload role.
129$code=<<___;
130.machine "any"
131.text
132
133.globl $func
134.align 6
135$func:
136 $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
137 mflr $lrsave
138 li r10,`$FRAME+8*16+15`
139 li r11,`$FRAME+8*16+31`
140 stvx v20,r10,$sp # ABI says so
141 addi r10,r10,32
142 mfspr $vrsave,256
143 stvx v21,r11,$sp
144 addi r11,r11,32
145 stvx v22,r10,$sp
146 addi r10,r10,32
147 stvx v23,r11,$sp
148 addi r11,r11,32
149 stvx v24,r10,$sp
150 addi r10,r10,32
151 stvx v25,r11,$sp
152 addi r11,r11,32
153 stvx v26,r10,$sp
154 addi r10,r10,32
155 stvx v27,r11,$sp
156 addi r11,r11,32
157 stvx v28,r10,$sp
158 addi r10,r10,32
159 stvx v29,r11,$sp
160 addi r11,r11,32
161 stvx v30,r10,$sp
162 stvx v31,r11,$sp
163 li r11,-1
164 stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
165 li $x10,0x10
166 $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
167 li $x20,0x20
168 $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
169 li $x30,0x30
170 $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
171 li $x40,0x40
172 $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
173 li $x50,0x50
174 $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
175 li $x60,0x60
176 $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
177 li $x70,0x70
178 $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
179 mtspr 256,r11
180
181 bl LPICmeup
182 addi $offload,$sp,$FRAME+15
183___
# Little-endian only: build the byte-permutation mask $lemask that ROUND
# applies to freshly loaded input vectors (lvsl pattern XORed with 0x0f).
184$code.=<<___ if ($LENDIAN);
185 li $idx,8
186 lvsl $lemask,0,$idx
187 vspltisb $Ki,0x0f
188 vxor $lemask,$lemask,$Ki
189___
# SHA-256: the eight 4-byte state words arrive in two vectors; spread them
# over $A..$H, one word per register, with vsldoi by 4/8/12 bytes.
190$code.=<<___ if ($SZ==4);
191 lvx_4w $A,$x00,$ctx
192 lvx_4w $E,$x10,$ctx
193 vsldoi $B,$A,$A,4 # unpack
194 vsldoi $C,$A,$A,8
195 vsldoi $D,$A,$A,12
196 vsldoi $F,$E,$E,4
197 vsldoi $G,$E,$E,8
198 vsldoi $H,$E,$E,12
199___
# SHA-512: the eight 8-byte state words arrive in four vectors; the odd
# registers ($B,$D,$F,$H) are derived from their neighbours with vsldoi by 8.
200$code.=<<___ if ($SZ==8);
201 lvx_u $A,$x00,$ctx
202 lvx_u $C,$x10,$ctx
203 lvx_u $E,$x20,$ctx
204 vsldoi $B,$A,$A,8 # unpack
205 lvx_u $G,$x30,$ctx
206 vsldoi $D,$C,$C,8
207 vsldoi $F,$E,$E,8
208 vsldoi $H,$G,$G,8
209___
# Per-block loop header. r0 receives the repeat count for the 16-round inner
# loop, ($rounds-16)/16. At Loop: the first round constant and the first 16
# input bytes are loaded, the current state $A..$H is parked in the $offload
# area, the first constant is pre-added to $H and the second one is fetched.
# NOTE(review): r0 is also $x00, used as an index register here; this
# presumably relies on r0 reading as zero in that position - confirm.
210$code.=<<___;
211 li r0,`($rounds-16)/16` # inner loop counter
212 b Loop
213.align 5
214Loop:
215 lvx $Ki,$x00,$Tbl
216 li $idx,16
217 lvx_u @X[0],0,$inp
218 addi $inp,$inp,16
219 stvx $A,$x00,$offload # offload $A-$H
220 stvx $B,$x10,$offload
221 stvx $C,$x20,$offload
222 stvx $D,$x30,$offload
223 stvx $E,$x40,$offload
224 stvx $F,$x50,$offload
225 stvx $G,$x60,$offload
226 stvx $H,$x70,$offload
227 vaddu${sz}m $H,$H,$Ki # h+K[i]
228 lvx $Ki,$idx,$Tbl
229 addi $idx,$idx,16
230___
# Rounds 0..15 are emitted once, straight-line. After each round the register
# list is rotated right by one so the roles a..h shift without data moves.
231for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
232$code.=<<___;
233 mtctr r0
234 b L16_xx
235.align 5
236L16_xx:
237___
# Rounds 16..31 form the body of the counted L16_xx loop, which the generated
# code repeats via bdnz; $i carries over from the previous for loop.
238for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
# After the last round: add the parked state back into $A..$H (using
# @X[2..9] as scratch), decrement the block count in $num and loop again
# while it is non-zero.
239$code.=<<___;
240 bdnz L16_xx
241
242 lvx @X[2],$x00,$offload
243 subic. $num,$num,1
244 lvx @X[3],$x10,$offload
245 vaddu${sz}m $A,$A,@X[2]
246 lvx @X[4],$x20,$offload
247 vaddu${sz}m $B,$B,@X[3]
248 lvx @X[5],$x30,$offload
249 vaddu${sz}m $C,$C,@X[4]
250 lvx @X[6],$x40,$offload
251 vaddu${sz}m $D,$D,@X[5]
252 lvx @X[7],$x50,$offload
253 vaddu${sz}m $E,$E,@X[6]
254 lvx @X[8],$x60,$offload
255 vaddu${sz}m $F,$F,@X[7]
256 lvx @X[9],$x70,$offload
257 vaddu${sz}m $G,$G,@X[8]
258 vaddu${sz}m $H,$H,@X[9]
259 bne Loop
260___
# SHA-256: merge the eight per-register words back into two vectors and store
# them to $ctx. The first permutation mask is whatever the final round left
# in $Ki; two more masks are loaded from the table into @X[0] and @X[1].
261$code.=<<___ if ($SZ==4);
262 lvx @X[0],$idx,$Tbl
263 addi $idx,$idx,16
264 vperm $A,$A,$B,$Ki # pack the answer
265 lvx @X[1],$idx,$Tbl
266 vperm $E,$E,$F,$Ki
267 vperm $A,$A,$C,@X[0]
268 vperm $E,$E,$G,@X[0]
269 vperm $A,$A,$D,@X[1]
270 vperm $E,$E,$H,@X[1]
271 stvx_4w $A,$x00,$ctx
272 stvx_4w $E,$x10,$ctx
273___
# SHA-512: merge register pairs into four vectors using the single mask left
# in $Ki by the final round, then store them to $ctx.
274$code.=<<___ if ($SZ==8);
275 vperm $A,$A,$B,$Ki # pack the answer
276 vperm $C,$C,$D,$Ki
277 vperm $E,$E,$F,$Ki
278 vperm $G,$G,$H,$Ki
279 stvx_u $A,$x00,$ctx
280 stvx_u $C,$x10,$ctx
281 stvx_u $E,$x20,$ctx
282 stvx_u $G,$x30,$ctx
283___
# Function epilogue, mirroring the prologue: the link register and SPR 256
# are restored from the $lrsave/$vrsave registers (not reloaded from the
# stack), v20..v31 and r26..r31 are reloaded from their save slots, the
# frame is released and the function returns.
# NOTE(review): the trailing .long/.byte words look like a traceback-table
# style trailer - confirm against the target ABI.
284$code.=<<___;
285 li r10,`$FRAME+8*16+15`
286 mtlr $lrsave
287 li r11,`$FRAME+8*16+31`
288 mtspr 256,$vrsave
289 lvx v20,r10,$sp # ABI says so
290 addi r10,r10,32
291 lvx v21,r11,$sp
292 addi r11,r11,32
293 lvx v22,r10,$sp
294 addi r10,r10,32
295 lvx v23,r11,$sp
296 addi r11,r11,32
297 lvx v24,r10,$sp
298 addi r10,r10,32
299 lvx v25,r11,$sp
300 addi r11,r11,32
301 lvx v26,r10,$sp
302 addi r10,r10,32
303 lvx v27,r11,$sp
304 addi r11,r11,32
305 lvx v28,r10,$sp
306 addi r10,r10,32
307 lvx v29,r11,$sp
308 addi r11,r11,32
309 lvx v30,r10,$sp
310 lvx v31,r11,$sp
311 $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
312 $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
313 $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
314 $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
315 $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
316 $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
317 addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
318 blr
319 .long 0
320 .byte 0,12,4,1,0x80,6,3,0
321 .long 0
322.size $func,.-$func
323___
324
325# Ugly hack here, because PPC assembler syntax seems to vary too
326# much from platform to platform...
# LPICmeup: position-independent helper that returns the address of the
# constant table in $Tbl. It saves LR in r0, uses bcl to the next instruction
# to capture the current address in LR, copies that to $Tbl and adds 64-8,
# then restores LR. The captured address is 8 bytes past the label, so the
# result is LPICmeup+64. The routine and its trailer occupy 9*4 bytes and the
# .space directive pads the rest, so the data emitted next starts exactly there.
327$code.=<<___;
328.align 6
329LPICmeup:
330 mflr r0
331 bcl 20,31,\$+4
332 mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
333 addi $Tbl,$Tbl,`64-8`
334 mtlr r0
335 blr
336 .long 0
337 .byte 0,12,0x14,0,0,0,0,0
338 .space `64-9*4`
339___
340
# Constant table, emitted directly after LPICmeup. Every round constant is
# replicated to fill a whole 16-byte vector (twice as .quad for SHA-512,
# four times as .long for SHA-256) so it can be added to a register with one
# vector add. The list ends with an all-zero entry: ROUND always adds the
# constant for the following round, so the final round adds this zero. The
# entry after it is a vperm mask, which the final round's "load next K[i]"
# leaves in $Ki for the "pack the answer" code. SHA-256 has three mask rows,
# SHA-512 one; little-endian flavours get element-swapped variants.
341if ($SZ==8) {
342 local *table = sub {
343 foreach(@_) { $code.=".quad $_,$_\n"; }
344 };
345 table(
346 "0x428a2f98d728ae22","0x7137449123ef65cd",
347 "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
348 "0x3956c25bf348b538","0x59f111f1b605d019",
349 "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
350 "0xd807aa98a3030242","0x12835b0145706fbe",
351 "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
352 "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
353 "0x9bdc06a725c71235","0xc19bf174cf692694",
354 "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
355 "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
356 "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
357 "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
358 "0x983e5152ee66dfab","0xa831c66d2db43210",
359 "0xb00327c898fb213f","0xbf597fc7beef0ee4",
360 "0xc6e00bf33da88fc2","0xd5a79147930aa725",
361 "0x06ca6351e003826f","0x142929670a0e6e70",
362 "0x27b70a8546d22ffc","0x2e1b21385c26c926",
363 "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
364 "0x650a73548baf63de","0x766a0abb3c77b2a8",
365 "0x81c2c92e47edaee6","0x92722c851482353b",
366 "0xa2bfe8a14cf10364","0xa81a664bbc423001",
367 "0xc24b8b70d0f89791","0xc76c51a30654be30",
368 "0xd192e819d6ef5218","0xd69906245565a910",
369 "0xf40e35855771202a","0x106aa07032bbd1b8",
370 "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
371 "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
372 "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
373 "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
374 "0x748f82ee5defb2fc","0x78a5636f43172f60",
375 "0x84c87814a1f0ab72","0x8cc702081a6439ec",
376 "0x90befffa23631e28","0xa4506cebde82bde9",
377 "0xbef9a3f7b2c67915","0xc67178f2e372532b",
378 "0xca273eceea26619c","0xd186b8c721c0c207",
379 "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
380 "0x06f067aa72176fba","0x0a637dc5a2c898a6",
381 "0x113f9804bef90dae","0x1b710b35131c471b",
382 "0x28db77f523047d84","0x32caab7b40c72493",
383 "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
384 "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
385 "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
386$code.=<<___ if (!$LENDIAN);
387.quad 0x0001020304050607,0x1011121314151617
388___
389$code.=<<___ if ($LENDIAN); # quad-swapped
390.quad 0x1011121314151617,0x0001020304050607
391___
392} else {
393 local *table = sub {
394 foreach(@_) { $code.=".long $_,$_,$_,$_\n"; }
395 };
396 table(
397 "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
398 "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
399 "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
400 "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
401 "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
402 "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
403 "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
404 "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
405 "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
406 "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
407 "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
408 "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
409 "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
410 "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
411 "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
412 "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
413$code.=<<___ if (!$LENDIAN);
414.long 0x00010203,0x10111213,0x10111213,0x10111213
415.long 0x00010203,0x04050607,0x10111213,0x10111213
416.long 0x00010203,0x04050607,0x08090a0b,0x10111213
417___
418$code.=<<___ if ($LENDIAN); # word-swapped
419.long 0x10111213,0x10111213,0x10111213,0x00010203
420.long 0x10111213,0x10111213,0x04050607,0x00010203
421.long 0x10111213,0x08090a0b,0x04050607,0x00010203
422___
423}
# Identification string appended after the constant table.
424$code.=<<___;
425.asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
426.align 2
427___
428
# Replace every `...` span in the accumulated text with the result of
# evaluating it as Perl (frame offsets, conditional instructions), then send
# the text to the translator pipe that STDOUT was reopened onto.
429$code =~ s/\`([^\`]*)\`/eval $1/gem;
430print $code;
# STDOUT is a pipe: buffered write errors and a failing translator only show
# up when it is closed, so the result of close must be checked.
431close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette