#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# If compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Fully unrolled assembler would be ~5x larger and is likely to be
# ~15% faster. It would be free from references to intermediate ring
# buffer, but put more pressure on L1P [both because the code would be
# larger and won't be using SPLOOP buffer]. There are no plans to
# realize fully unrolled variant though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and for own well-being
# zero it upon entry.

35 | while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
---|
36 | open STDOUT,">$output";
|
---|
37 |
|
---|
38 | ($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
|
---|
39 |
|
---|
40 | ($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
|
---|
41 | ($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
|
---|
42 | ($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
|
---|
43 | ($XPA,$XPB) = ("A5","B5"); # X circular buffer
|
---|
44 | ($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
|
---|
45 |
|
---|
46 | $code=<<___;
|
---|
47 | .text
|
---|
48 |
|
---|
49 | .if .ASSEMBLER_VERSION<7000000
|
---|
50 | .asg 0,__TI_EABI__
|
---|
51 | .endif
|
---|
52 | .if __TI_EABI__
|
---|
53 | .asg sha1_block_data_order,_sha1_block_data_order
|
---|
54 | .endif
|
---|
55 |
|
---|
56 | .asg B3,RA
|
---|
57 | .asg A15,FP
|
---|
58 | .asg B15,SP
|
---|
59 |
|
---|
60 | .if .BIG_ENDIAN
|
---|
61 | .asg MV,SWAP2
|
---|
62 | .asg MV,SWAP4
|
---|
63 | .endif
|
---|
64 |
|
---|
65 | .global _sha1_block_data_order
|
---|
66 | _sha1_block_data_order:
|
---|
67 | .asmfunc stack_usage(64)
|
---|
68 | MV $NUM,A0 ; reassign $NUM
|
---|
69 | || MVK -64,B0
|
---|
70 | [!A0] BNOP RA ; if ($NUM==0) return;
|
---|
71 | || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
|
---|
72 | || [A0] MV SP,FP
|
---|
73 | [A0] LDW *${CTX}[0],$A ; load A-E...
|
---|
74 | || [A0] AND B0,SP,SP ; align stack at 64 bytes
|
---|
75 | [A0] LDW *${CTX}[1],$B
|
---|
76 | || [A0] SUBAW SP,2,SP ; reserve two words above buffer
|
---|
77 | [A0] LDW *${CTX}[2],$C
|
---|
78 | || [A0] MVK 0x00404,B0
|
---|
79 | [A0] LDW *${CTX}[3],$D
|
---|
80 | || [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB]
|
---|
81 | [A0] LDW *${CTX}[4],$E
|
---|
82 | || [A0] MVC B0,AMR ; setup circular addressing
|
---|
83 | LDNW *${INP}++,$TX1 ; pre-fetch input
|
---|
84 | NOP 1
|
---|
85 |
|
---|
86 | loop?:
|
---|
87 | MVK 0x00007999,$K
|
---|
88 | || ADDAW SP,2,$XPA
|
---|
89 | || SUB A0,1,A0
|
---|
90 | || MVK 13,B0
|
---|
91 | MVKH 0x5a820000,$K ; K_00_19
|
---|
92 | || ADDAW SP,2,$XPB
|
---|
93 | || MV $A,$Actx
|
---|
94 | || MV $B,$Bctx
|
---|
95 | ;;==================================================
|
---|
96 | SPLOOPD 5 ; BODY_00_13
|
---|
97 | || MV $C,$Cctx
|
---|
98 | || MV $D,$Dctx
|
---|
99 | || MV $E,$Ectx
|
---|
100 | || MVC B0,ILC
|
---|
101 |
|
---|
102 | ROTL $A,5,$Arot
|
---|
103 | || AND $C,$B,$F
|
---|
104 | || ANDN $D,$B,$F0
|
---|
105 | || ADD $K,$E,$T ; T=E+K
|
---|
106 |
|
---|
107 | XOR $F0,$F,$F ; F_00_19(B,C,D)
|
---|
108 | || MV $D,$E ; E=D
|
---|
109 | || MV $C,$D ; D=C
|
---|
110 | || SWAP2 $TX1,$TX2
|
---|
111 | || LDNW *${INP}++,$TX1
|
---|
112 |
|
---|
113 | ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|
---|
114 | || ROTL $B,30,$C ; C=ROL(B,30)
|
---|
115 | || SWAP4 $TX2,$TX3 ; byte swap
|
---|
116 |
|
---|
117 | ADD $Arot,$T,$T ; T+=ROL(A,5)
|
---|
118 | || MV $A,$B ; B=A
|
---|
119 |
|
---|
120 | ADD $TX3,$T,$A ; A=T+Xi
|
---|
121 | || STW $TX3,*${XPB}++
|
---|
122 | SPKERNEL
|
---|
123 | ;;==================================================
|
---|
124 | ROTL $A,5,$Arot ; BODY_14
|
---|
125 | || AND $C,$B,$F
|
---|
126 | || ANDN $D,$B,$F0
|
---|
127 | || ADD $K,$E,$T ; T=E+K
|
---|
128 |
|
---|
129 | XOR $F0,$F,$F ; F_00_19(B,C,D)
|
---|
130 | || MV $D,$E ; E=D
|
---|
131 | || MV $C,$D ; D=C
|
---|
132 | || SWAP2 $TX1,$TX2
|
---|
133 | || LDNW *${INP}++,$TX1
|
---|
134 |
|
---|
135 | ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|
---|
136 | || ROTL $B,30,$C ; C=ROL(B,30)
|
---|
137 | || SWAP4 $TX2,$TX2 ; byte swap
|
---|
138 | || LDW *${XPA}++,$X0 ; fetches from X ring buffer are
|
---|
139 | || LDW *${XPB}[4],$X2 ; 2 iterations ahead
|
---|
140 |
|
---|
141 | ADD $Arot,$T,$T ; T+=ROL(A,5)
|
---|
142 | || MV $A,$B ; B=A
|
---|
143 | || LDW *${XPA}[7],$X8
|
---|
144 | || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|
---|
145 | || MV $TX2,$TX3
|
---|
146 |
|
---|
147 | ADD $TX2,$T,$A ; A=T+Xi
|
---|
148 | || STW $TX2,*${XPB}++
|
---|
149 | ;;==================================================
|
---|
150 | ROTL $A,5,$Arot ; BODY_15
|
---|
151 | || AND $C,$B,$F
|
---|
152 | || ANDN $D,$B,$F0
|
---|
153 | || ADD $K,$E,$T ; T=E+K
|
---|
154 |
|
---|
155 | XOR $F0,$F,$F ; F_00_19(B,C,D)
|
---|
156 | || MV $D,$E ; E=D
|
---|
157 | || MV $C,$D ; D=C
|
---|
158 | || SWAP2 $TX1,$TX2
|
---|
159 |
|
---|
160 | ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|
---|
161 | || ROTL $B,30,$C ; C=ROL(B,30)
|
---|
162 | || SWAP4 $TX2,$TX2 ; byte swap
|
---|
163 | || XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead
|
---|
164 | || LDW *${XPA}++,$X0
|
---|
165 | || LDW *${XPB}[4],$X2
|
---|
166 |
|
---|
167 | ADD $Arot,$T,$T ; T+=ROL(A,5)
|
---|
168 | || MV $A,$B ; B=A
|
---|
169 | || XOR $X8,$X13,$TX1
|
---|
170 | || LDW *${XPA}[7],$X8
|
---|
171 | || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|
---|
172 | || MV $TX2,$TX3
|
---|
173 |
|
---|
174 | ADD $TX2,$T,$A ; A=T+Xi
|
---|
175 | || STW $TX2,*${XPB}++
|
---|
176 | || XOR $TX0,$TX1,$TX1
|
---|
177 | || MVK 3,B0
|
---|
178 | ;;==================================================
|
---|
179 | SPLOOPD 5 ; BODY_16_19
|
---|
180 | || MVC B0,ILC
|
---|
181 |
|
---|
182 | ROTL $A,5,$Arot
|
---|
183 | || AND $C,$B,$F
|
---|
184 | || ANDN $D,$B,$F0
|
---|
185 | || ADD $K,$E,$T ; T=E+K
|
---|
186 | || ROTL $TX1,1,$TX2 ; Xupdate output
|
---|
187 |
|
---|
188 | XOR $F0,$F,$F ; F_00_19(B,C,D)
|
---|
189 | || MV $D,$E ; E=D
|
---|
190 | || MV $C,$D ; D=C
|
---|
191 |
|
---|
192 | ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|
---|
193 | || ROTL $B,30,$C ; C=ROL(B,30)
|
---|
194 | || XOR $X0,$X2,$TX0
|
---|
195 | || LDW *${XPA}++,$X0
|
---|
196 | || LDW *${XPB}[4],$X2
|
---|
197 |
|
---|
198 | ADD $Arot,$T,$T ; T+=ROL(A,5)
|
---|
199 | || MV $A,$B ; B=A
|
---|
200 | || XOR $X8,$X13,$TX1
|
---|
201 | || LDW *${XPA}[7],$X8
|
---|
202 | || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|
---|
203 | || MV $TX2,$TX3
|
---|
204 |
|
---|
205 | ADD $TX2,$T,$A ; A=T+Xi
|
---|
206 | || STW $TX2,*${XPB}++
|
---|
207 | || XOR $TX0,$TX1,$TX1
|
---|
208 | SPKERNEL
|
---|
209 |
|
---|
210 | MVK 0xffffeba1,$K
|
---|
211 | || MVK 19,B0
|
---|
212 | MVKH 0x6ed90000,$K ; K_20_39
|
---|
213 | ___
|
---|
214 | sub BODY_20_39 {
|
---|
215 | $code.=<<___;
|
---|
216 | ;;==================================================
|
---|
217 | SPLOOPD 5 ; BODY_20_39
|
---|
218 | || MVC B0,ILC
|
---|
219 |
|
---|
220 | ROTL $A,5,$Arot
|
---|
221 | || XOR $B,$C,$F
|
---|
222 | || ADD $K,$E,$T ; T=E+K
|
---|
223 | || ROTL $TX1,1,$TX2 ; Xupdate output
|
---|
224 |
|
---|
225 | XOR $D,$F,$F ; F_20_39(B,C,D)
|
---|
226 | || MV $D,$E ; E=D
|
---|
227 | || MV $C,$D ; D=C
|
---|
228 |
|
---|
229 | ADD $F,$T,$T ; T+=F_20_39(B,C,D)
|
---|
230 | || ROTL $B,30,$C ; C=ROL(B,30)
|
---|
231 | || XOR $X0,$X2,$TX0
|
---|
232 | || LDW *${XPA}++,$X0
|
---|
233 | || LDW *${XPB}[4],$X2
|
---|
234 |
|
---|
235 | ADD $Arot,$T,$T ; T+=ROL(A,5)
|
---|
236 | || MV $A,$B ; B=A
|
---|
237 | || XOR $X8,$X13,$TX1
|
---|
238 | || LDW *${XPA}[7],$X8
|
---|
239 | || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|
---|
240 | || MV $TX2,$TX3
|
---|
241 |
|
---|
242 | ADD $TX2,$T,$A ; A=T+Xi
|
---|
243 | || STW $TX2,*${XPB}++ ; last one is redundant
|
---|
244 | || XOR $TX0,$TX1,$TX1
|
---|
245 | SPKERNEL
|
---|
246 | ___
|
---|
247 | $code.=<<___ if (!shift);
|
---|
248 | MVK 0xffffbcdc,$K
|
---|
249 | MVKH 0x8f1b0000,$K ; K_40_59
|
---|
250 | ___
|
---|
251 | } &BODY_20_39();
|
---|
252 | $code.=<<___;
|
---|
253 | ;;==================================================
|
---|
254 | SPLOOPD 5 ; BODY_40_59
|
---|
255 | || MVC B0,ILC
|
---|
256 | || AND $B,$C,$F
|
---|
257 | || AND $B,$D,$F0
|
---|
258 |
|
---|
259 | ROTL $A,5,$Arot
|
---|
260 | || XOR $F0,$F,$F
|
---|
261 | || AND $C,$D,$F0
|
---|
262 | || ADD $K,$E,$T ; T=E+K
|
---|
263 | || ROTL $TX1,1,$TX2 ; Xupdate output
|
---|
264 |
|
---|
265 | XOR $F0,$F,$F ; F_40_59(B,C,D)
|
---|
266 | || MV $D,$E ; E=D
|
---|
267 | || MV $C,$D ; D=C
|
---|
268 |
|
---|
269 | ADD $F,$T,$T ; T+=F_40_59(B,C,D)
|
---|
270 | || ROTL $B,30,$C ; C=ROL(B,30)
|
---|
271 | || XOR $X0,$X2,$TX0
|
---|
272 | || LDW *${XPA}++,$X0
|
---|
273 | || LDW *${XPB}[4],$X2
|
---|
274 |
|
---|
275 | ADD $Arot,$T,$T ; T+=ROL(A,5)
|
---|
276 | || MV $A,$B ; B=A
|
---|
277 | || XOR $X8,$X13,$TX1
|
---|
278 | || LDW *${XPA}[7],$X8
|
---|
279 | || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|
---|
280 | || MV $TX2,$TX3
|
---|
281 |
|
---|
282 | ADD $TX2,$T,$A ; A=T+Xi
|
---|
283 | || STW $TX2,*${XPB}++
|
---|
284 | || XOR $TX0,$TX1,$TX1
|
---|
285 | || AND $B,$C,$F
|
---|
286 | || AND $B,$D,$F0
|
---|
287 | SPKERNEL
|
---|
288 |
|
---|
289 | MVK 0xffffc1d6,$K
|
---|
290 | || MVK 18,B0
|
---|
291 | MVKH 0xca620000,$K ; K_60_79
|
---|
292 | ___
|
---|
293 | &BODY_20_39(-1); # BODY_60_78
|
---|
294 | $code.=<<___;
|
---|
295 | ;;==================================================
|
---|
296 | [A0] B loop?
|
---|
297 | || ROTL $A,5,$Arot ; BODY_79
|
---|
298 | || XOR $B,$C,$F
|
---|
299 | || ROTL $TX1,1,$TX2 ; Xupdate output
|
---|
300 |
|
---|
301 | [A0] LDNW *${INP}++,$TX1 ; pre-fetch input
|
---|
302 | || ADD $K,$E,$T ; T=E+K
|
---|
303 | || XOR $D,$F,$F ; F_20_39(B,C,D)
|
---|
304 |
|
---|
305 | ADD $F,$T,$T ; T+=F_20_39(B,C,D)
|
---|
306 | || ADD $Ectx,$D,$E ; E=D,E+=Ectx
|
---|
307 | || ADD $Dctx,$C,$D ; D=C,D+=Dctx
|
---|
308 | || ROTL $B,30,$C ; C=ROL(B,30)
|
---|
309 |
|
---|
310 | ADD $Arot,$T,$T ; T+=ROL(A,5)
|
---|
311 | || ADD $Bctx,$A,$B ; B=A,B+=Bctx
|
---|
312 |
|
---|
313 | ADD $TX2,$T,$A ; A=T+Xi
|
---|
314 |
|
---|
315 | ADD $Actx,$A,$A ; A+=Actx
|
---|
316 | || ADD $Cctx,$C,$C ; C+=Cctx
|
---|
317 | ;; end of loop?
|
---|
318 |
|
---|
319 | BNOP RA ; return
|
---|
320 | || MV FP,SP ; restore stack pointer
|
---|
321 | || LDW *FP[0],FP ; restore frame pointer
|
---|
322 | STW $A,*${CTX}[0] ; emit A-E...
|
---|
323 | || MVK 0,B0
|
---|
324 | STW $B,*${CTX}[1]
|
---|
325 | || MVC B0,AMR ; clear AMR
|
---|
326 | STW $C,*${CTX}[2]
|
---|
327 | STW $D,*${CTX}[3]
|
---|
328 | STW $E,*${CTX}[4]
|
---|
329 | .endasmfunc
|
---|
330 |
|
---|
331 | .sect .const
|
---|
332 | .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
333 | .align 4
|
---|
334 | ___
|
---|
335 |
|
---|
336 | print $code;
|
---|
337 | close STDOUT or die "error closing STDOUT: $!";
|
---|