1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # December 2015
|
---|
18 | #
|
---|
19 | # ChaCha20 for s390x.
|
---|
20 | #
|
---|
21 | # 3 times faster than compiler-generated code.
|
---|
22 |
|
---|
23 | $flavour = shift;
|
---|
24 |
|
---|
25 | if ($flavour =~ /3[12]/) {
|
---|
26 | $SIZE_T=4;
|
---|
27 | $g="";
|
---|
28 | } else {
|
---|
29 | $SIZE_T=8;
|
---|
30 | $g="g";
|
---|
31 | }
|
---|
32 |
|
---|
33 | while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
---|
34 | open STDOUT,">$output";
|
---|
35 |
|
---|
36 | sub AUTOLOAD() # thunk [simplified] x86-style perlasm
|
---|
37 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
|
---|
38 | $code .= "\t$opcode\t".join(',',@_)."\n";
|
---|
39 | }
|
---|
40 |
|
---|
41 | my $sp="%r15";
|
---|
42 |
|
---|
43 | my $stdframe=16*$SIZE_T+4*8;
|
---|
44 | my $frame=$stdframe+4*20;
|
---|
45 |
|
---|
46 | my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
|
---|
47 |
|
---|
48 | my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
|
---|
49 | my @t=map("%r$_",(8,9));
|
---|
50 |
|
---|
51 | sub ROUND {
|
---|
52 | my ($a0,$b0,$c0,$d0)=@_;
|
---|
53 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
|
---|
54 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
|
---|
55 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
|
---|
56 | my ($xc,$xc_)=map("\"$_\"",@t);
|
---|
57 | my @x=map("\"$_\"",@x);
|
---|
58 |
|
---|
59 | # Consider order in which variables are addressed by their
|
---|
60 | # index:
|
---|
61 | #
|
---|
62 | # a b c d
|
---|
63 | #
|
---|
64 | # 0 4 8 12 < even round
|
---|
65 | # 1 5 9 13
|
---|
66 | # 2 6 10 14
|
---|
67 | # 3 7 11 15
|
---|
68 | # 0 5 10 15 < odd round
|
---|
69 | # 1 6 11 12
|
---|
70 | # 2 7 8 13
|
---|
71 | # 3 4 9 14
|
---|
72 | #
|
---|
73 | # 'a', 'b' and 'd's are permanently allocated in registers,
|
---|
74 | # @x[0..7,12..15], while 'c's are maintained in memory. If
|
---|
75 | # you observe 'c' column, you'll notice that pair of 'c's is
|
---|
76 | # invariant between rounds. This means that we have to reload
|
---|
77 | # them once per round, in the middle. This is why you'll see
|
---|
78 | # 'c' stores and loads in the middle, but none in the beginning
|
---|
79 | # or end.
|
---|
80 |
|
---|
81 | (
|
---|
82 | "&alr (@x[$a0],@x[$b0])", # Q1
|
---|
83 | "&alr (@x[$a1],@x[$b1])", # Q2
|
---|
84 | "&xr (@x[$d0],@x[$a0])",
|
---|
85 | "&xr (@x[$d1],@x[$a1])",
|
---|
86 | "&rll (@x[$d0],@x[$d0],16)",
|
---|
87 | "&rll (@x[$d1],@x[$d1],16)",
|
---|
88 |
|
---|
89 | "&alr ($xc,@x[$d0])",
|
---|
90 | "&alr ($xc_,@x[$d1])",
|
---|
91 | "&xr (@x[$b0],$xc)",
|
---|
92 | "&xr (@x[$b1],$xc_)",
|
---|
93 | "&rll (@x[$b0],@x[$b0],12)",
|
---|
94 | "&rll (@x[$b1],@x[$b1],12)",
|
---|
95 |
|
---|
96 | "&alr (@x[$a0],@x[$b0])",
|
---|
97 | "&alr (@x[$a1],@x[$b1])",
|
---|
98 | "&xr (@x[$d0],@x[$a0])",
|
---|
99 | "&xr (@x[$d1],@x[$a1])",
|
---|
100 | "&rll (@x[$d0],@x[$d0],8)",
|
---|
101 | "&rll (@x[$d1],@x[$d1],8)",
|
---|
102 |
|
---|
103 | "&alr ($xc,@x[$d0])",
|
---|
104 | "&alr ($xc_,@x[$d1])",
|
---|
105 | "&xr (@x[$b0],$xc)",
|
---|
106 | "&xr (@x[$b1],$xc_)",
|
---|
107 | "&rll (@x[$b0],@x[$b0],7)",
|
---|
108 | "&rll (@x[$b1],@x[$b1],7)",
|
---|
109 |
|
---|
110 | "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
|
---|
111 | "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
|
---|
112 |
|
---|
113 | "&alr (@x[$a2],@x[$b2])", # Q3
|
---|
114 | "&alr (@x[$a3],@x[$b3])", # Q4
|
---|
115 | "&xr (@x[$d2],@x[$a2])",
|
---|
116 | "&xr (@x[$d3],@x[$a3])",
|
---|
117 | "&rll (@x[$d2],@x[$d2],16)",
|
---|
118 | "&rll (@x[$d3],@x[$d3],16)",
|
---|
119 |
|
---|
120 | "&alr ($xc,@x[$d2])",
|
---|
121 | "&alr ($xc_,@x[$d3])",
|
---|
122 | "&xr (@x[$b2],$xc)",
|
---|
123 | "&xr (@x[$b3],$xc_)",
|
---|
124 | "&rll (@x[$b2],@x[$b2],12)",
|
---|
125 | "&rll (@x[$b3],@x[$b3],12)",
|
---|
126 |
|
---|
127 | "&alr (@x[$a2],@x[$b2])",
|
---|
128 | "&alr (@x[$a3],@x[$b3])",
|
---|
129 | "&xr (@x[$d2],@x[$a2])",
|
---|
130 | "&xr (@x[$d3],@x[$a3])",
|
---|
131 | "&rll (@x[$d2],@x[$d2],8)",
|
---|
132 | "&rll (@x[$d3],@x[$d3],8)",
|
---|
133 |
|
---|
134 | "&alr ($xc,@x[$d2])",
|
---|
135 | "&alr ($xc_,@x[$d3])",
|
---|
136 | "&xr (@x[$b2],$xc)",
|
---|
137 | "&xr (@x[$b3],$xc_)",
|
---|
138 | "&rll (@x[$b2],@x[$b2],7)",
|
---|
139 | "&rll (@x[$b3],@x[$b3],7)"
|
---|
140 | );
|
---|
141 | }
|
---|
142 |
|
---|
143 | $code.=<<___;
|
---|
144 | .text
|
---|
145 |
|
---|
146 | .globl ChaCha20_ctr32
|
---|
147 | .type ChaCha20_ctr32,\@function
|
---|
148 | .align 32
|
---|
149 | ChaCha20_ctr32:
|
---|
150 | lt${g}r $len,$len # $len==0?
|
---|
151 | bzr %r14
|
---|
152 | a${g}hi $len,-64
|
---|
153 | l${g}hi %r1,-$frame
|
---|
154 | stm${g} %r6,%r15,`6*$SIZE_T`($sp)
|
---|
155 | sl${g}r $out,$inp # difference
|
---|
156 | la $len,0($inp,$len) # end of input minus 64
|
---|
157 | larl %r7,.Lsigma
|
---|
158 | lgr %r0,$sp
|
---|
159 | la $sp,0(%r1,$sp)
|
---|
160 | st${g} %r0,0($sp)
|
---|
161 |
|
---|
162 | lmg %r8,%r11,0($key) # load key
|
---|
163 | lmg %r12,%r13,0($counter) # load counter
|
---|
164 | lmg %r6,%r7,0(%r7) # load sigma constant
|
---|
165 |
|
---|
166 | la %r14,0($inp)
|
---|
167 | st${g} $out,$frame+3*$SIZE_T($sp)
|
---|
168 | st${g} $len,$frame+4*$SIZE_T($sp)
|
---|
169 | stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
|
---|
170 | srlg @x[12],%r12,32 # 32-bit counter value
|
---|
171 | j .Loop_outer
|
---|
172 |
|
---|
173 | .align 16
|
---|
174 | .Loop_outer:
|
---|
175 | lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
|
---|
176 | lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
|
---|
177 | lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
|
---|
178 | stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
|
---|
179 | lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
|
---|
180 | st @x[12],$stdframe+4*12($sp) # save counter
|
---|
181 | st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
|
---|
182 | lhi %r14,10
|
---|
183 | j .Loop
|
---|
184 |
|
---|
185 | .align 4
|
---|
186 | .Loop:
|
---|
187 | ___
|
---|
188 | foreach (&ROUND(0, 4, 8,12)) { eval; }
|
---|
189 | foreach (&ROUND(0, 5,10,15)) { eval; }
|
---|
190 | $code.=<<___;
|
---|
191 | brct %r14,.Loop
|
---|
192 |
|
---|
193 | l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
|
---|
194 | stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
|
---|
195 | lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)
|
---|
196 |
|
---|
197 | al @x[0],$stdframe+4*0($sp) # accumulate key schedule
|
---|
198 | al @x[1],$stdframe+4*1($sp)
|
---|
199 | al @x[2],$stdframe+4*2($sp)
|
---|
200 | al @x[3],$stdframe+4*3($sp)
|
---|
201 | al @x[4],$stdframe+4*4($sp)
|
---|
202 | al @x[5],$stdframe+4*5($sp)
|
---|
203 | al @x[6],$stdframe+4*6($sp)
|
---|
204 | al @x[7],$stdframe+4*7($sp)
|
---|
205 | lrvr @x[0],@x[0]
|
---|
206 | lrvr @x[1],@x[1]
|
---|
207 | lrvr @x[2],@x[2]
|
---|
208 | lrvr @x[3],@x[3]
|
---|
209 | lrvr @x[4],@x[4]
|
---|
210 | lrvr @x[5],@x[5]
|
---|
211 | lrvr @x[6],@x[6]
|
---|
212 | lrvr @x[7],@x[7]
|
---|
213 | al @x[12],$stdframe+4*12($sp)
|
---|
214 | al @x[13],$stdframe+4*13($sp)
|
---|
215 | al @x[14],$stdframe+4*14($sp)
|
---|
216 | al @x[15],$stdframe+4*15($sp)
|
---|
217 | lrvr @x[12],@x[12]
|
---|
218 | lrvr @x[13],@x[13]
|
---|
219 | lrvr @x[14],@x[14]
|
---|
220 | lrvr @x[15],@x[15]
|
---|
221 |
|
---|
222 | la @t[0],0(@t[0],%r14) # reconstruct output pointer
|
---|
223 | cl${g}r %r14,@t[1]
|
---|
224 | jh .Ltail
|
---|
225 |
|
---|
226 | x @x[0],4*0(%r14) # xor with input
|
---|
227 | x @x[1],4*1(%r14)
|
---|
228 | st @x[0],4*0(@t[0]) # store output
|
---|
229 | x @x[2],4*2(%r14)
|
---|
230 | st @x[1],4*1(@t[0])
|
---|
231 | x @x[3],4*3(%r14)
|
---|
232 | st @x[2],4*2(@t[0])
|
---|
233 | x @x[4],4*4(%r14)
|
---|
234 | st @x[3],4*3(@t[0])
|
---|
235 | lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
|
---|
236 | x @x[5],4*5(%r14)
|
---|
237 | st @x[4],4*4(@t[0])
|
---|
238 | x @x[6],4*6(%r14)
|
---|
239 | al @x[0],$stdframe+4*8($sp)
|
---|
240 | st @x[5],4*5(@t[0])
|
---|
241 | x @x[7],4*7(%r14)
|
---|
242 | al @x[1],$stdframe+4*9($sp)
|
---|
243 | st @x[6],4*6(@t[0])
|
---|
244 | x @x[12],4*12(%r14)
|
---|
245 | al @x[2],$stdframe+4*10($sp)
|
---|
246 | st @x[7],4*7(@t[0])
|
---|
247 | x @x[13],4*13(%r14)
|
---|
248 | al @x[3],$stdframe+4*11($sp)
|
---|
249 | st @x[12],4*12(@t[0])
|
---|
250 | x @x[14],4*14(%r14)
|
---|
251 | st @x[13],4*13(@t[0])
|
---|
252 | x @x[15],4*15(%r14)
|
---|
253 | st @x[14],4*14(@t[0])
|
---|
254 | lrvr @x[0],@x[0]
|
---|
255 | st @x[15],4*15(@t[0])
|
---|
256 | lrvr @x[1],@x[1]
|
---|
257 | lrvr @x[2],@x[2]
|
---|
258 | lrvr @x[3],@x[3]
|
---|
259 | lhi @x[12],1
|
---|
260 | x @x[0],4*8(%r14)
|
---|
261 | al @x[12],$stdframe+4*12($sp) # increment counter
|
---|
262 | x @x[1],4*9(%r14)
|
---|
263 | st @x[0],4*8(@t[0])
|
---|
264 | x @x[2],4*10(%r14)
|
---|
265 | st @x[1],4*9(@t[0])
|
---|
266 | x @x[3],4*11(%r14)
|
---|
267 | st @x[2],4*10(@t[0])
|
---|
268 | st @x[3],4*11(@t[0])
|
---|
269 |
|
---|
270 | cl${g}r %r14,@t[1] # done yet?
|
---|
271 | la %r14,64(%r14)
|
---|
272 | jl .Loop_outer
|
---|
273 |
|
---|
274 | .Ldone:
|
---|
275 | xgr %r0,%r0
|
---|
276 | xgr %r1,%r1
|
---|
277 | xgr %r2,%r2
|
---|
278 | xgr %r3,%r3
|
---|
279 | stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
|
---|
280 | stmg %r0,%r3,$stdframe+4*12($sp)
|
---|
281 |
|
---|
282 | lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
|
---|
283 | br %r14
|
---|
284 |
|
---|
285 | .align 16
|
---|
286 | .Ltail:
|
---|
287 | la @t[1],64($t[1])
|
---|
288 | stm @x[0],@x[7],$stdframe+4*0($sp)
|
---|
289 | sl${g}r @t[1],%r14
|
---|
290 | lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
|
---|
291 | l${g}hi @x[6],0
|
---|
292 | stm @x[12],@x[15],$stdframe+4*12($sp)
|
---|
293 | al @x[0],$stdframe+4*8($sp)
|
---|
294 | al @x[1],$stdframe+4*9($sp)
|
---|
295 | al @x[2],$stdframe+4*10($sp)
|
---|
296 | al @x[3],$stdframe+4*11($sp)
|
---|
297 | lrvr @x[0],@x[0]
|
---|
298 | lrvr @x[1],@x[1]
|
---|
299 | lrvr @x[2],@x[2]
|
---|
300 | lrvr @x[3],@x[3]
|
---|
301 | stm @x[0],@x[3],$stdframe+4*8($sp)
|
---|
302 |
|
---|
303 | .Loop_tail:
|
---|
304 | llgc @x[4],0(@x[6],%r14)
|
---|
305 | llgc @x[5],$stdframe(@x[6],$sp)
|
---|
306 | xr @x[5],@x[4]
|
---|
307 | stc @x[5],0(@x[6],@t[0])
|
---|
308 | la @x[6],1(@x[6])
|
---|
309 | brct @t[1],.Loop_tail
|
---|
310 |
|
---|
311 | j .Ldone
|
---|
312 | .size ChaCha20_ctr32,.-ChaCha20_ctr32
|
---|
313 |
|
---|
314 | .align 32
|
---|
315 | .Lsigma:
|
---|
316 | .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
|
---|
317 | .asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
318 | .align 4
|
---|
319 | ___
|
---|
320 |
|
---|
321 | foreach (split("\n",$code)) {
|
---|
322 | s/\`([^\`]*)\`/eval $1/ge;
|
---|
323 |
|
---|
324 | print $_,"\n";
|
---|
325 | }
|
---|
326 | close STDOUT;
|
---|