VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/sha/asm/sha1-c64xplus.pl@ 101211

Last change on this file since 101211 was 101211, checked in by vboxsync, 17 months ago

openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527

File size: 8.0 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# SHA1 for C64x+.
18#
19# November 2011
20#
21# If compared to compiler-generated code with similar characteristics,
22# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
23# this implementation is 25% smaller and >2x faster. In absolute terms
24# performance is (quite impressive) ~6.5 cycles per processed byte.
25# Fully unrolled assembler would be ~5x larger and is likely to be
26# ~15% faster. It would be free from references to intermediate ring
27# buffer, but put more pressure on L1P [both because the code would be
28# larger and won't be using SPLOOP buffer]. There are no plans to
29# realize fully unrolled variant though...
30#
31# !!! Note that this module uses AMR, which means that all interrupt
32# service routines are expected to preserve it and, for their own
33# well-being, to zero it upon entry.
34
# The last command-line argument, when present, names the output file and
# STDOUT is redirected to it, so that the final "print $code" lands there.
# Without an argument the generated assembly goes to the inherited STDOUT.
# A failed open is fatal: otherwise the module would silently be written
# to the wrong stream and the requested file would never be created.
# The check is parenthesised so that it only applies when an argument was
# actually supplied.
35$output = pop and (open(STDOUT,">$output") or die "can't open $output: $!");
36
# Register allocation.  Each Perl variable holds the name of a C64x+
# register and is interpolated into the assembly templates below.
#
# Incoming arguments: context pointer, input pointer, counter.
37($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
38
# Working state A..E and per-round temporaries, A16..A25: $Arot receives
# ROTL(A,5), $F/$F0 hold the round function and a partial term of it, $T is
# the running sum for the round and $K the current round constant.
39($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
# Operands of the message-schedule update.  $X0/$X2/$X8 are loaded from the
# ring buffer through $XPA/$XPB, $X13 is forwarded from $TX3; they are XORed
# pairwise and the result rotated left by one.
40($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
# B-side scratch (B28..B31) for input words, byte swapping and the
# intermediate XOR results of the schedule update.
41($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
42($XPA,$XPB) = ("A5","B5"); # X circular buffer
# Copies of A..E taken at the top of loop? and added back after round 79.
# They live in A3 and A6..A9; A6 is also $NUM, which is why the template
# moves $NUM to A0 before anything else.
43($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
44
# First assembly template: file preamble, function prologue and rounds 0..19.
#
# Preamble: selects .text, aliases the exported symbol when __TI_EABI__ is
# set, names RA/FP/SP (B3/A15/B15) and, for big-endian builds, turns the
# SWAP2/SWAP4 mnemonics into plain MV.
#
# Prologue: $NUM is copied to A0 and the function returns at once when it
# is zero.  Otherwise FP is stored and SP post-decremented by 16 words, SP
# is ANDed with -64 and lowered by two more words, the five state words are
# loaded from $CTX, AMR is written with 0x050404 to set up circular
# addressing for $XPA/$XPB, and the first input word is pre-fetched.
#
# loop?: loads K_00_19 = 0x5a827999, points $XPA and $XPB at SP+2 words,
# decrements A0 and copies A..E into $Actx..$Ectx.  BODY_00_13 then runs as
# an SPLOOPD with ILC loaded from B0 = 13, BODY_14 and BODY_15 are
# straight-line code that starts the ring-buffer loads and schedule XORs
# ahead of their use, and BODY_16_19 is a second SPLOOPD with B0 = 3.  The
# template ends by loading K_20_39 = 0x6ed9eba1 and B0 = 19 for the section
# appended next.
45$code=<<___;
46 .text
47
48 .if .ASSEMBLER_VERSION<7000000
49 .asg 0,__TI_EABI__
50 .endif
51 .if __TI_EABI__
52 .asg sha1_block_data_order,_sha1_block_data_order
53 .endif
54
55 .asg B3,RA
56 .asg A15,FP
57 .asg B15,SP
58
59 .if .BIG_ENDIAN
60 .asg MV,SWAP2
61 .asg MV,SWAP4
62 .endif
63
64 .global _sha1_block_data_order
65_sha1_block_data_order:
66 .asmfunc stack_usage(64)
67 MV $NUM,A0 ; reassign $NUM
68|| MVK -64,B0
69 [!A0] BNOP RA ; if ($NUM==0) return;
70|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
71|| [A0] MV SP,FP
72 [A0] LDW *${CTX}[0],$A ; load A-E...
73|| [A0] AND B0,SP,SP ; align stack at 64 bytes
74 [A0] LDW *${CTX}[1],$B
75|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
76 [A0] LDW *${CTX}[2],$C
77|| [A0] MVK 0x00404,B0
78 [A0] LDW *${CTX}[3],$D
79|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB]
80 [A0] LDW *${CTX}[4],$E
81|| [A0] MVC B0,AMR ; setup circular addressing
82 LDNW *${INP}++,$TX1 ; pre-fetch input
83 NOP 1
84
85loop?:
86 MVK 0x00007999,$K
87|| ADDAW SP,2,$XPA
88|| SUB A0,1,A0
89|| MVK 13,B0
90 MVKH 0x5a820000,$K ; K_00_19
91|| ADDAW SP,2,$XPB
92|| MV $A,$Actx
93|| MV $B,$Bctx
94;;==================================================
95 SPLOOPD 5 ; BODY_00_13
96|| MV $C,$Cctx
97|| MV $D,$Dctx
98|| MV $E,$Ectx
99|| MVC B0,ILC
100
101 ROTL $A,5,$Arot
102|| AND $C,$B,$F
103|| ANDN $D,$B,$F0
104|| ADD $K,$E,$T ; T=E+K
105
106 XOR $F0,$F,$F ; F_00_19(B,C,D)
107|| MV $D,$E ; E=D
108|| MV $C,$D ; D=C
109|| SWAP2 $TX1,$TX2
110|| LDNW *${INP}++,$TX1
111
112 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
113|| ROTL $B,30,$C ; C=ROL(B,30)
114|| SWAP4 $TX2,$TX3 ; byte swap
115
116 ADD $Arot,$T,$T ; T+=ROL(A,5)
117|| MV $A,$B ; B=A
118
119 ADD $TX3,$T,$A ; A=T+Xi
120|| STW $TX3,*${XPB}++
121 SPKERNEL
122;;==================================================
123 ROTL $A,5,$Arot ; BODY_14
124|| AND $C,$B,$F
125|| ANDN $D,$B,$F0
126|| ADD $K,$E,$T ; T=E+K
127
128 XOR $F0,$F,$F ; F_00_19(B,C,D)
129|| MV $D,$E ; E=D
130|| MV $C,$D ; D=C
131|| SWAP2 $TX1,$TX2
132|| LDNW *${INP}++,$TX1
133
134 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
135|| ROTL $B,30,$C ; C=ROL(B,30)
136|| SWAP4 $TX2,$TX2 ; byte swap
137|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are
138|| LDW *${XPB}[4],$X2 ; 2 iterations ahead
139
140 ADD $Arot,$T,$T ; T+=ROL(A,5)
141|| MV $A,$B ; B=A
142|| LDW *${XPA}[7],$X8
143|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
144|| MV $TX2,$TX3
145
146 ADD $TX2,$T,$A ; A=T+Xi
147|| STW $TX2,*${XPB}++
148;;==================================================
149 ROTL $A,5,$Arot ; BODY_15
150|| AND $C,$B,$F
151|| ANDN $D,$B,$F0
152|| ADD $K,$E,$T ; T=E+K
153
154 XOR $F0,$F,$F ; F_00_19(B,C,D)
155|| MV $D,$E ; E=D
156|| MV $C,$D ; D=C
157|| SWAP2 $TX1,$TX2
158
159 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
160|| ROTL $B,30,$C ; C=ROL(B,30)
161|| SWAP4 $TX2,$TX2 ; byte swap
162|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead
163|| LDW *${XPA}++,$X0
164|| LDW *${XPB}[4],$X2
165
166 ADD $Arot,$T,$T ; T+=ROL(A,5)
167|| MV $A,$B ; B=A
168|| XOR $X8,$X13,$TX1
169|| LDW *${XPA}[7],$X8
170|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
171|| MV $TX2,$TX3
172
173 ADD $TX2,$T,$A ; A=T+Xi
174|| STW $TX2,*${XPB}++
175|| XOR $TX0,$TX1,$TX1
176|| MVK 3,B0
177;;==================================================
178 SPLOOPD 5 ; BODY_16_19
179|| MVC B0,ILC
180
181 ROTL $A,5,$Arot
182|| AND $C,$B,$F
183|| ANDN $D,$B,$F0
184|| ADD $K,$E,$T ; T=E+K
185|| ROTL $TX1,1,$TX2 ; Xupdate output
186
187 XOR $F0,$F,$F ; F_00_19(B,C,D)
188|| MV $D,$E ; E=D
189|| MV $C,$D ; D=C
190
191 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
192|| ROTL $B,30,$C ; C=ROL(B,30)
193|| XOR $X0,$X2,$TX0
194|| LDW *${XPA}++,$X0
195|| LDW *${XPB}[4],$X2
196
197 ADD $Arot,$T,$T ; T+=ROL(A,5)
198|| MV $A,$B ; B=A
199|| XOR $X8,$X13,$TX1
200|| LDW *${XPA}[7],$X8
201|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
202|| MV $TX2,$TX3
203
204 ADD $TX2,$T,$A ; A=T+Xi
205|| STW $TX2,*${XPB}++
206|| XOR $TX0,$TX1,$TX1
207 SPKERNEL
208
209 MVK 0xffffeba1,$K
210|| MVK 19,B0
211 MVKH 0x6ed90000,$K ; K_20_39
212___
# BODY_20_39([$skip_k_40_59])
#
# Appends to $code one SPLOOPD section whose round function is B^C^D.  The
# section takes its iteration count from whatever B0 holds and its round
# constant from whatever $K holds when the emitted code is reached; both
# are set by the template text that precedes each use of this sub.
#
# Unless the first argument is true, the sub also appends the load of
# K_40_59 = 0x8f1bbcdc for the section that follows.  It is invoked right
# after its definition with no argument, and once more further down with
# -1 to emit the section labelled BODY_60_78 there.
213sub BODY_20_39 {
214$code.=<<___;
215;;==================================================
216 SPLOOPD 5 ; BODY_20_39
217|| MVC B0,ILC
218
219 ROTL $A,5,$Arot
220|| XOR $B,$C,$F
221|| ADD $K,$E,$T ; T=E+K
222|| ROTL $TX1,1,$TX2 ; Xupdate output
223
224 XOR $D,$F,$F ; F_20_39(B,C,D)
225|| MV $D,$E ; E=D
226|| MV $C,$D ; D=C
227
228 ADD $F,$T,$T ; T+=F_20_39(B,C,D)
229|| ROTL $B,30,$C ; C=ROL(B,30)
230|| XOR $X0,$X2,$TX0
231|| LDW *${XPA}++,$X0
232|| LDW *${XPB}[4],$X2
233
234 ADD $Arot,$T,$T ; T+=ROL(A,5)
235|| MV $A,$B ; B=A
236|| XOR $X8,$X13,$TX1
237|| LDW *${XPA}[7],$X8
238|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
239|| MV $TX2,$TX3
240
241 ADD $TX2,$T,$A ; A=T+Xi
242|| STW $TX2,*${XPB}++ ; last one is redundant
243|| XOR $TX0,$TX1,$TX1
244 SPKERNEL
245___
# Only the first use is followed by the rounds 40..59 section, so the
# constant for it is appended only when no true argument was passed.
246$code.=<<___ if (!shift);
247 MVK 0xffffbcdc,$K
248 MVKH 0x8f1b0000,$K ; K_40_59
249___
250} &BODY_20_39();
# Rounds 40..59 template.  One SPLOOPD whose round function is
# (B&C)^(B&D)^(C&D): the two terms involving B are computed in the packet
# that opens the loop and again in the last packet of the loop body, while
# the C&D term and the two XORs are computed inside the body.  Ring-buffer
# loads, the schedule XOR/ROTL and the A..E rotation follow the same
# pattern as the other looped sections.  After the loop the template loads
# K_60_79 = 0xca62c1d6 and B0 = 18 for the section appended next.
251$code.=<<___;
252;;==================================================
253 SPLOOPD 5 ; BODY_40_59
254|| MVC B0,ILC
255|| AND $B,$C,$F
256|| AND $B,$D,$F0
257
258 ROTL $A,5,$Arot
259|| XOR $F0,$F,$F
260|| AND $C,$D,$F0
261|| ADD $K,$E,$T ; T=E+K
262|| ROTL $TX1,1,$TX2 ; Xupdate output
263
264 XOR $F0,$F,$F ; F_40_59(B,C,D)
265|| MV $D,$E ; E=D
266|| MV $C,$D ; D=C
267
268 ADD $F,$T,$T ; T+=F_40_59(B,C,D)
269|| ROTL $B,30,$C ; C=ROL(B,30)
270|| XOR $X0,$X2,$TX0
271|| LDW *${XPA}++,$X0
272|| LDW *${XPB}[4],$X2
273
274 ADD $Arot,$T,$T ; T+=ROL(A,5)
275|| MV $A,$B ; B=A
276|| XOR $X8,$X13,$TX1
277|| LDW *${XPA}[7],$X8
278|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
279|| MV $TX2,$TX3
280
281 ADD $TX2,$T,$A ; A=T+Xi
282|| STW $TX2,*${XPB}++
283|| XOR $TX0,$TX1,$TX1
284|| AND $B,$C,$F
285|| AND $B,$D,$F0
286 SPKERNEL
287
288 MVK 0xffffc1d6,$K
289|| MVK 18,B0
290 MVKH 0xca620000,$K ; K_60_79
291___
# Emit the last looped section by reusing BODY_20_39; the true argument
# makes the sub skip the K_40_59 load it would otherwise append.
292 &BODY_20_39(-1); # BODY_60_78
# Final template: round 79, loop-back and epilogue.
#
# BODY_79 is straight-line code that also adds the saved $Actx..$Ectx
# values back into A..E.  "[A0] B loop?" is issued in its first packet and
# branches back while A0 is non-zero; the first word of the next block is
# pre-fetched under the same predicate.
#
# After the loop the code returns through RA, copies FP back to SP and
# reloads FP from the frame, stores A..E to $CTX and writes zero to AMR.
# The trailing .const section holds an identification string.
293$code.=<<___;
294;;==================================================
295 [A0] B loop?
296|| ROTL $A,5,$Arot ; BODY_79
297|| XOR $B,$C,$F
298|| ROTL $TX1,1,$TX2 ; Xupdate output
299
300 [A0] LDNW *${INP}++,$TX1 ; pre-fetch input
301|| ADD $K,$E,$T ; T=E+K
302|| XOR $D,$F,$F ; F_20_39(B,C,D)
303
304 ADD $F,$T,$T ; T+=F_20_39(B,C,D)
305|| ADD $Ectx,$D,$E ; E=D,E+=Ectx
306|| ADD $Dctx,$C,$D ; D=C,D+=Dctx
307|| ROTL $B,30,$C ; C=ROL(B,30)
308
309 ADD $Arot,$T,$T ; T+=ROL(A,5)
310|| ADD $Bctx,$A,$B ; B=A,B+=Bctx
311
312 ADD $TX2,$T,$A ; A=T+Xi
313
314 ADD $Actx,$A,$A ; A+=Actx
315|| ADD $Cctx,$C,$C ; C+=Cctx
316;; end of loop?
317
318 BNOP RA ; return
319|| MV FP,SP ; restore stack pointer
320|| LDW *FP[0],FP ; restore frame pointer
321 STW $A,*${CTX}[0] ; emit A-E...
322|| MVK 0,B0
323 STW $B,*${CTX}[1]
324|| MVC B0,AMR ; clear AMR
325 STW $C,*${CTX}[2]
326 STW $D,*${CTX}[3]
327 STW $E,*${CTX}[4]
328 .endasmfunc
329
330 .sect .const
331 .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
332 .align 4
333___
334
# Write the accumulated assembly text to STDOUT, which is the output file
# when one was named on the command line.  Closing explicitly lets a
# failure reported by close abort the script instead of going unnoticed.
335print $code;
336close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette