#! /usr/bin/env perl
# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
8 |
|
---|
9 |
|
---|
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
16 |
|
---|
# RC4 for PA-RISC.

# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.
27 |
|
---|
# Directory part of this script's own path, including the trailing
# separator ('/' or '\').  It is used further down to locate a header
# relative to the script.  Left empty when $0 has no directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> <output file>.
$flavour = shift;
$output  = shift;

# Send everything printed at the end of the script to the output file.
# A failed open is reported right away instead of leaving STDOUT closed
# and only being noticed at the final close; with no output file given,
# the generated assembly simply goes to the inherited STDOUT.
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
33 |
|
---|
# ABI-dependent parameters, chosen by whether the flavour string contains
# "64": the assembler .LEVEL, the size of a saved register slot, the two
# frame-layout constants, and the store/load mnemonics (plain and
# stack-pointer-modifying forms) used by the generated prologue/epilogue.
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP, $PUSH, $PUSHMA, $POP, $POPMB) =
    ($flavour =~ /64/)
    ? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb")
    : ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm");

# Total stack frame: frame marker + 4 saved regs
#                    [+ argument transfer]
$FRAME = $FRAME_MARKER + 4*$SIZE_T;
# $SZ is the size in bytes of one element of the RC4 key schedule.
$SZ = 1;				# defaults to RC4_CHAR

# Best-effort probe: if opensslconf.h can be found two directories above
# this script, take the element size from its RC4_INT definition.  A
# missing or unreadable header is not an error; the default stays.
if (open my $conf, '<', "${dir}../../opensslconf.h") {
    while (my $line = <$conf>) {
	if ($line =~ m/#\s*define\s+RC4_INT\s+(.*)/) {
	    # Tolerate trailing blanks or a CR after the type name, so
	    # that "unsigned char" is still recognised as RC4_CHAR.
	    $SZ = ($1 =~ /char\s*$/) ? 1 : 4;
	    last;
	}
    }
    close $conf;
}

# Mnemonics matching the element size: plain load, indexed load, the
# instruction that forms an element address from index and base, and store.
if ($SZ == 1) {	# RC4_CHAR
    ($LD, $LDX, $MKX, $ST) = ("ldb", "ldbx", "addl", "stb");
} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
    ($LD, $LDX, $MKX, $ST) = ("ldw", "ldwx,s", "sh2addl", "stw");
}
78 |
|
---|
# Registers holding the four operands of the generated RC4 routine
# (exported below with ARGW0..ARGW3=GR).
# NOTE(review): presumably %r26..%r23 are the first four argument
# registers of the PA-RISC calling convention -- confirm against the ABI.
($key, $len, $inp, $out) = ("%r26", "%r25", "%r24", "%r23");

# Working registers for the cipher state.  @XX and @TX are two-element
# sets that unrolledloopbody() rotates after every round.
@XX = ("%r19", "%r20");
@TX = ("%r21", "%r22");
($YY, $TY) = ("%r28", "%r29");

# Scratch registers %r1..%r6.  %r2..%r6 are the ones the generated RC4
# prologue saves and its epilogue restores.
($acc, $ix, $iy, $dat0, $dat1, $rem) = map { "%r$_" } 1 .. 6;
$mask = "%r31";
96 |
|
---|
# Append four consecutive RC4 rounds to $code.
#
# Takes no arguments; it reads the register-name globals (@XX, @TX, $YY,
# $TY, $key, $ix, $iy, $dat1, $acc, $mask) and the mnemonic globals
# ($LDX, $MKX, $ST).  After each round @XX and @TX are rotated so that the
# next round swaps the roles of the two registers in each set; as both
# sets have two elements, they are back in their original order on return.
#
# In rounds 1..3 two extra instructions are emitted through the
# backtick-eval mechanism resolved in the final output pass: an indexed
# load of the previous round's output element into $dat1, and a dep
# (zdep in round 1) that deposits it into $acc at bit position
# 8*($i-1)+7.  The element produced by the fourth round is left for the
# caller to fetch and merge.
#
# The loop counter is lexical, so calling this sub no longer overwrites
# the package global $i.
sub unrolledloopbody {
    for my $i (0 .. 3) {
	# NOTE(review): the comclr,<>/copy pair below is a conditional
	# move (see the inline comments); presumably it covers the case
	# where the next X index equals Y -- confirm.
	$code .= <<___;
	ldo	1($XX[0]),$XX[1]
	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($i>0)`
	and	$mask,$XX[1],$XX[1]
	$LDX	$YY($key),$TY
	$MKX	$YY,$key,$ix
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[0],$key,$iy
	$ST	$TX[0],0($ix)
	comclr,<> $XX[1],$YY,%r0	; conditional
	copy	$TX[0],$TX[1]		; move
	`sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
	$ST	$TY,0($iy)
	addl	$TX[0],$TY,$TY
	addl	$TX[1],$YY,$YY
	and	$mask,$TY,$TY
	and	$mask,$YY,$YY
___
	# "rotate" registers
	push(@TX, shift(@TX));
	push(@XX, shift(@XX));
    }
}
119 |
|
---|
# Append a one-byte-per-iteration RC4 loop to $code.
#
#   $label - assembler label placed at the top of the loop; also the
#            target of the closing branch
#   $count - name of the register used as the iteration counter; it is
#            decremented by the closing addib, which branches back to
#            $label while the result is non-zero
#
# Each iteration reads one input byte through "ldbx $inp($out)", XORs it
# with the element fetched from the key schedule and stores the result
# with stb just behind the already-advanced $out.  The sub reads the
# register-name and mnemonic globals rather than taking them as arguments.
sub foldedloop {
    my $label = shift;
    my $count = shift;

    $code .= <<___;
$label
	$MKX	$YY,$key,$iy
	$LDX	$YY($key),$TY
	$MKX	$XX[0],$key,$ix
	$ST	$TX[0],0($iy)
	ldo	1($XX[0]),$XX[0]
	$ST	$TY,0($ix)
	addl	$TX[0],$TY,$TY
	ldbx	$inp($out),$dat1
	and	$mask,$TY,$TY
	and	$mask,$XX[0],$XX[0]
	$LDX	$TY($key),$acc
	$LDX	$XX[0]($key),$TX[0]
	ldo	1($out),$out
	xor	$dat1,$acc,$acc
	addl	$TX[0],$YY,$YY
	stb	$acc,-1($out)
	addib,<> -1,$count,$label	; $count is always small
	and	$mask,$YY,$YY
___
}
144 |
|
---|
# ---------------------------------------------------------------------
# Assemble the text of the RC4 routine into $code.  Backtick-quoted
# expressions are left in place here and evaluated by the final output
# pass at the bottom of the script.
# ---------------------------------------------------------------------

# Section header, prologue and set-up.  The two index words are loaded
# from the start of the key structure and $key is advanced past them.
# The "warm up" steps pre-increment the first index and pre-load the
# element it selects, which is the state every loop body expects on
# entry; the "chill out" steps at L$done undo this before the indices
# are stored back.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
	.PROC
	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)

	cmpib,*=	0,$len,L\$abort
	sub	$inp,$out,$inp		; distance between $inp and $out

	$LD	`0*$SZ`($key),$XX[0]
	$LD	`1*$SZ`($key),$YY
	ldo	`2*$SZ`($key),$key

	ldi	0xff,$mask
	ldi	3,$dat0

	ldo	1($XX[0]),$XX[0]	; warm up loop
	and	$mask,$XX[0],$XX[0]
	$LDX	$XX[0]($key),$TX[0]
	addl	$TX[0],$YY,$YY
	cmpib,*>>=	6,$len,L\$oop1	; is $len large enough to bother?
	and	$mask,$YY,$YY

	and,<>	$out,$dat0,$rem		; is $out aligned?
	b	L\$alignedout
	subi	4,$rem,$rem
	sub	$len,$rem,$len
___

# Byte-wise loop that runs $rem times.
foldedloop("L\$alignout",$rem);		# process till $out is aligned

# Output is now word-aligned.  When the input is aligned too, go straight
# to L$oop4; otherwise set up the shift amount for vshd and pre-load the
# first aligned input word.
$code.=<<___;
L\$alignedout				; $len is at least 4 here
	and,<>	$inp,$dat0,$acc		; is $inp aligned?
	b	L\$oop4
	sub	$inp,$acc,$rem		; align $inp

	sh3addl	$acc,%r0,$acc
	subi	32,$acc,$acc
	mtctl	$acc,%cr11		; load %sar with vshd align factor
	ldwx	$rem($out),$dat0
	ldo	4($rem),$rem
L\$oop4misalignedinp
___

# Four rounds per iteration, misaligned-input variant.
unrolledloopbody();

# Tail of the misaligned loop: fetch the fourth output element, merge it
# into $acc, build the matching input word from two aligned loads with
# vshd, XOR and store one word.
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$rem($out),$dat1
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	vshd	$dat0,$dat1,$iy		; align data
	copy	$dat1,$dat0
	xor	$iy,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4misalignedinp
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
	b	L\$oop1
	nop

	.ALIGN	8
L\$oop4
___

# Four rounds per iteration, aligned-input variant.
unrolledloopbody();

# Tail of the aligned loop: same as above, but the input word is loaded
# directly.
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$inp($out),$dat0
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	xor	$dat0,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
___

# Byte-wise loop for whatever is left; it is also the branch target for
# short inputs.
foldedloop("L\$oop1",$len);

# Undo the warm-up, store both indices back in front of the table and
# restore the saved registers.  L$abort is the exit taken when $len is 0,
# before any of the saved registers has been modified.
$code.=<<___;
L\$done
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
	ldo	-1($XX[0]),$XX[0]	; chill out loop
	sub	$YY,$TX[0],$YY
	and	$mask,$XX[0],$XX[0]
	and	$mask,$YY,$YY
	$ST	$XX[0],`-2*$SZ`($key)
	$ST	$YY,`-1*$SZ`($key)
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
___
250 |
|
---|
# ---------------------------------------------------------------------
# Append RC4_set_key and RC4_options to $code.
#
# RC4_set_key: zeroes the two index words, fills the 256-entry table with
# 0..255 (loop L$1st), then runs the key-mixing pass (loop L$2nd).  The
# user key is addressed from its end through a negative index kept in
# %r23; the nullifying "addi,nuv" re-initialises that index once it has
# counted up to zero, so the key bytes are reused cyclically.
#
# RC4_options: returns in %r28 the address of the string at L$opts,
# computed position-independently from the address obtained with blr.
#
# Register operands are written as scalar elements ($XX[0], $TX[1], ...)
# rather than one-element slices; the interpolated text is the same.
# ---------------------------------------------------------------------
$code.=<<___;

	.EXPORT	RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	8
RC4_set_key
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	$ST	%r0,`0*$SZ`($key)
	$ST	%r0,`1*$SZ`($key)
	ldo	`2*$SZ`($key),$key
	copy	%r0,$XX[0]
L\$1st
	$ST	$XX[0],0($key)
	ldo	1($XX[0]),$XX[0]
	bb,>=	$XX[0],`31-8`,L\$1st	; $XX[0]<256
	ldo	$SZ($key),$key

	ldo	`-256*$SZ`($key),$key	; rewind $key
	addl	$len,$inp,$inp		; $inp to point at the end
	sub	%r0,$len,%r23		; inverse index
	copy	%r0,$XX[0]
	copy	%r0,$XX[1]
	ldi	0xff,$mask

L\$2nd
	$LDX	$XX[0]($key),$TX[0]
	ldbx	%r23($inp),$TX[1]
	addi,nuv	1,%r23,%r23	; increment and conditional
	sub	%r0,$len,%r23		; inverse index
	addl	$TX[0],$XX[1],$XX[1]
	addl	$TX[1],$XX[1],$XX[1]
	and	$mask,$XX[1],$XX[1]
	$MKX	$XX[0],$key,$TY
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[1],$key,$YY
	ldo	1($XX[0]),$XX[0]
	$ST	$TX[0],0($YY)
	bb,>=	$XX[0],`31-8`,L\$2nd	; $XX[0]<256
	$ST	$TX[1],0($TY)

	bv,n	(%r2)
	.EXIT
	nop
	.PROCEND

	.EXPORT	RC4_options,ENTRY
	.ALIGN	8
RC4_options
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	blr	%r0,%r28
	ldi	3,%r1
L\$pic
	andcm	%r28,%r1,%r28
	bv	(%r2)
	.EXIT
	ldo	L\$opts-L\$pic(%r28),%r28
	.PROCEND
	.ALIGN	8
L\$opts
	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
316 |
|
---|
# Ask the compiler driver named by the CC environment variable to run its
# assembler verbosely on an empty input, and note whether the banner says
# it is GNU as.
my $as_banner = `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`;
$gnuas = 1 if ($as_banner =~ /GNU assembler/);

# Final pass over the generated text, one line at a time:
#   1. evaluate every backtick-quoted Perl expression in place;
#   2. 64-bit build: for GNU as, rewrite the level/space directives it
#      spells differently and drop .SUBSPA; for any assembler, turn bv
#      into bve;
#   3. 32-bit build: turn the "cmpib,*" compares into "comib,";
#   4. print the line.
for my $line (split("\n", $code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    if ($SIZE_T == 8) {
	if ($gnuas) {
	    $line =~ s/(\.LEVEL\s+2\.0)W/${1}w/;
	    $line =~ s/\.SPACE\s+\$TEXT\$/.text/;
	    $line =~ s/\.SUBSPA.*//;
	}
	$line =~ s/\bbv\b/bve/;
    }
    $line =~ s/cmpib,\*/comib,/ if ($SIZE_T == 4);

    print $line, "\n";
}

# Buffered write errors only show up when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";