
source: vbox/trunk/src/libs/openssl-1.1.1k/crypto/sha/asm/sha1-c64xplus.pl@90293

Last change on this file was 90293, checked in by vboxsync, 4 years ago

openssl-1.1.1k: Applied and adjusted our OpenSSL changes to 1.1.1k. bugref:10072

#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# Compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Fully unrolled assembler would be ~5x larger and is likely to be
# ~15% faster. It would be free from references to the intermediate ring
# buffer, but would put more pressure on L1P [both because the code
# would be larger and because it wouldn't be using the SPLOOP buffer].
# There are no plans to realize a fully unrolled variant though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
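#
# As a minimal sketch (illustrative only, assuming B0 is free as a
# scratch register), an ISR can satisfy this with the same two
# instructions the epilogue below uses to clear AMR:
#
#	MVK	0,B0
#	MVC	B0,AMR		; force linear addressing in the ISR body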
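# Pick the output file: skip argv entries until one looks like a file name.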
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments

($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5"); # X circular buffer
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
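# The expanded message schedule lives in a 16-word (64-byte) ring buffer
# at the top of the stack frame; $XPA and $XPB walk it using the circular
# addressing configured via AMR below.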

$code=<<___;
 .text

 .if .ASSEMBLER_VERSION<7000000
 .asg 0,__TI_EABI__
 .endif
 .if __TI_EABI__
 .asg sha1_block_data_order,_sha1_block_data_order
 .endif

 .asg B3,RA
 .asg A15,FP
 .asg B15,SP

 .if .BIG_ENDIAN
 .asg MV,SWAP2
 .asg MV,SWAP4
 .endif

 .global _sha1_block_data_order
_sha1_block_data_order:
 .asmfunc stack_usage(64)
 MV $NUM,A0 ; reassign $NUM
|| MVK -64,B0
 [!A0] BNOP RA ; if ($NUM==0) return;
|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
|| [A0] MV SP,FP
 [A0] LDW *${CTX}[0],$A ; load A-E...
|| [A0] AND B0,SP,SP ; align stack at 64 bytes
 [A0] LDW *${CTX}[1],$B
|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
 [A0] LDW *${CTX}[2],$C
|| [A0] MVK 0x00404,B0
 [A0] LDW *${CTX}[3],$D
|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB]
 [A0] LDW *${CTX}[4],$E
|| [A0] MVC B0,AMR ; setup circular addressing
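; AMR=0x00050404 selects, per the C64x+ AMR layout, circular mode 01
; (BK0) for A5/B5, the X ring-buffer pointers, with a BK0 field of 5
; giving a block size of 2^(5+1)=64 bytes.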
 LDNW *${INP}++,$TX1 ; pre-fetch input
 NOP 1

loop?:
 MVK 0x00007999,$K
|| ADDAW SP,2,$XPA
|| SUB A0,1,A0
|| MVK 13,B0
 MVKH 0x5a820000,$K ; K_00_19
|| ADDAW SP,2,$XPB
|| MV $A,$Actx
|| MV $B,$Bctx
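; MVK sign-extends its 16-bit constant and MVKH supplies the high half,
; so 0x00007999+0x5a820000 yields K_00_19=0x5a827999; later constants
; whose low half has bit 15 set are written as 0xffffxxxx to compensate
; for the sign extension.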
;;==================================================
 SPLOOPD 5 ; BODY_00_13
|| MV $C,$Cctx
|| MV $D,$Dctx
|| MV $E,$Ectx
|| MVC B0,ILC
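; SPLOOPD 5 opens a software-pipelined loop with a 5-cycle initiation
; interval; the trip count comes from ILC (B0=13 here, covering the 14
; rounds 0..13).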
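; F_00_19(B,C,D)=(B&C)|(~B&D); the two terms are never set at the same
; bit position, so it is computed as (C&B)^(D&~B) with AND/ANDN/XOR below.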
 ROTL $A,5,$Arot
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
|| ADD $K,$E,$T ; T=E+K

 XOR $F0,$F,$F ; F_00_19(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C
|| SWAP2 $TX1,$TX2
|| LDNW *${INP}++,$TX1

 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| SWAP4 $TX2,$TX3 ; byte swap

 ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A

 ADD $TX3,$T,$A ; A=T+Xi
|| STW $TX3,*${XPB}++
 SPKERNEL
;;==================================================
 ROTL $A,5,$Arot ; BODY_14
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
|| ADD $K,$E,$T ; T=E+K

 XOR $F0,$F,$F ; F_00_19(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C
|| SWAP2 $TX1,$TX2
|| LDNW *${INP}++,$TX1

 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| SWAP4 $TX2,$TX2 ; byte swap
|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are
|| LDW *${XPB}[4],$X2 ; 2 iterations ahead

 ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3

 ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
;;==================================================
 ROTL $A,5,$Arot ; BODY_15
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
|| ADD $K,$E,$T ; T=E+K

 XOR $F0,$F,$F ; F_00_19(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C
|| SWAP2 $TX1,$TX2

 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| SWAP4 $TX2,$TX2 ; byte swap
|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead
|| LDW *${XPA}++,$X0
|| LDW *${XPB}[4],$X2

 ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| XOR $X8,$X13,$TX1
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3

 ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
|| XOR $TX0,$TX1,$TX1
|| MVK 3,B0
;;==================================================
 SPLOOPD 5 ; BODY_16_19
|| MVC B0,ILC

 ROTL $A,5,$Arot
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
|| ADD $K,$E,$T ; T=E+K
|| ROTL $TX1,1,$TX2 ; Xupdate output

 XOR $F0,$F,$F ; F_00_19(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C

 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| XOR $X0,$X2,$TX0
|| LDW *${XPA}++,$X0
|| LDW *${XPB}[4],$X2

 ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| XOR $X8,$X13,$TX1
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3

 ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
|| XOR $TX0,$TX1,$TX1
 SPKERNEL

 MVK 0xffffeba1,$K
|| MVK 19,B0
 MVKH 0x6ed90000,$K ; K_20_39
___
sub BODY_20_39 {
$code.=<<___;
;;==================================================
 SPLOOPD 5 ; BODY_20_39
|| MVC B0,ILC

 ROTL $A,5,$Arot
|| XOR $B,$C,$F
|| ADD $K,$E,$T ; T=E+K
|| ROTL $TX1,1,$TX2 ; Xupdate output

 XOR $D,$F,$F ; F_20_39(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C

 ADD $F,$T,$T ; T+=F_20_39(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| XOR $X0,$X2,$TX0
|| LDW *${XPA}++,$X0
|| LDW *${XPB}[4],$X2

 ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| XOR $X8,$X13,$TX1
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3

 ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++ ; last one is redundant
|| XOR $TX0,$TX1,$TX1
 SPKERNEL
___
$code.=<<___ if (!shift);
 MVK 0xffffbcdc,$K
 MVKH 0x8f1b0000,$K ; K_40_59
___
} &BODY_20_39();
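# BODY_20_39() is emitted twice: the call above produces rounds 20-39
# and, since shift() returns no argument, also appends the K_40_59
# constant setup; the &BODY_20_39(-1) call below reuses the same kernel
# for rounds 60-78 and suppresses that trailing constant block.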
$code.=<<___;
;;==================================================
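; F_40_59(B,C,D) is the majority function (B&C)|(B&D)|(C&D), computed
; here as (B&C)^(B&D)^(C&D); the forms agree because either exactly one
; term or all three are set, never two.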
 SPLOOPD 5 ; BODY_40_59
|| MVC B0,ILC
|| AND $B,$C,$F
|| AND $B,$D,$F0

 ROTL $A,5,$Arot
|| XOR $F0,$F,$F
|| AND $C,$D,$F0
|| ADD $K,$E,$T ; T=E+K
|| ROTL $TX1,1,$TX2 ; Xupdate output

 XOR $F0,$F,$F ; F_40_59(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C

 ADD $F,$T,$T ; T+=F_40_59(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| XOR $X0,$X2,$TX0
|| LDW *${XPA}++,$X0
|| LDW *${XPB}[4],$X2

 ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| XOR $X8,$X13,$TX1
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3

 ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
|| XOR $TX0,$TX1,$TX1
|| AND $B,$C,$F
|| AND $B,$D,$F0
 SPKERNEL

 MVK 0xffffc1d6,$K
|| MVK 18,B0
 MVKH 0xca620000,$K ; K_60_79
___
 &BODY_20_39(-1); # BODY_60_78
$code.=<<___;
;;==================================================
 [A0] B loop?
|| ROTL $A,5,$Arot ; BODY_79
|| XOR $B,$C,$F
|| ROTL $TX1,1,$TX2 ; Xupdate output

 [A0] LDNW *${INP}++,$TX1 ; pre-fetch input
|| ADD $K,$E,$T ; T=E+K
|| XOR $D,$F,$F ; F_20_39(B,C,D)

 ADD $F,$T,$T ; T+=F_20_39(B,C,D)
|| ADD $Ectx,$D,$E ; E=D,E+=Ectx
|| ADD $Dctx,$C,$D ; D=C,D+=Dctx
|| ROTL $B,30,$C ; C=ROL(B,30)

 ADD $Arot,$T,$T ; T+=ROL(A,5)
|| ADD $Bctx,$A,$B ; B=A,B+=Bctx

 ADD $TX2,$T,$A ; A=T+Xi

 ADD $Actx,$A,$A ; A+=Actx
|| ADD $Cctx,$C,$C ; C+=Cctx
;; end of loop?

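; The return branch has 5 delay slots; the five execute packets below
; (storing A-E and clearing AMR) complete in those slots before control
; reaches the caller.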
 BNOP RA ; return
|| MV FP,SP ; restore stack pointer
|| LDW *FP[0],FP ; restore frame pointer
 STW $A,*${CTX}[0] ; emit A-E...
|| MVK 0,B0
 STW $B,*${CTX}[1]
|| MVC B0,AMR ; clear AMR
 STW $C,*${CTX}[2]
 STW $D,*${CTX}[3]
 STW $E,*${CTX}[4]
 .endasmfunc

 .sect .const
 .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
 .align 4
___

print $code;
close STDOUT or die "error closing STDOUT: $!";