#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.

# Size/performance trade-off
# ====================================================================
# impl           size in bytes   comp cycles[*]  measured performance
# ====================================================================
# thumb          304             3212            4420
# armv4-small    392/+29%        1958/+64%       2250/+96%
# armv4-compact  740/+89%        1552/+26%       1840/+22%
# armv4-large    1420/+92%       1307/+19%       1370/+34%[***]
# full unroll    ~5100/+260%     ~1260/+4%       ~1300/+5%
# ====================================================================
# thumb       = same as 'small' but in Thumb instructions[**] and
#               with recurring code in two private functions;
# small       = detached Xload/update, loops are folded;
# compact     = detached Xload/update, 5x unroll;
# large       = interleaved Xload/update, 5x unroll;
# full unroll = interleaved Xload/update, full unroll, estimated[!];
#
# [*]   Manually counted instructions in "grand" loop body. Measured
#       performance is affected by prologue and epilogue overhead,
#       i-cache availability, branch penalties, etc.
# [**]  While each Thumb instruction is half the size, they are not as
#       diverse as ARM ones: e.g., there are only two arithmetic
#       instructions with 3 arguments, no [fixed] rotate, addressing
#       modes are limited. As a result it takes more instructions to do
#       the same job in Thumb, therefore the code is never twice as
#       small and always slower.
# [***] which is also ~35% better than compiler generated code. Dual-
#       issue Cortex A8 core was measured to process input block in
#       ~990 cycles.

# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.

# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
# faster than integer-only code. Because [fully unrolled] NEON code
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
# byte, which is also >80% faster than integer-only code. Cortex-A15
# is even faster spending 5.6 cycles per byte outperforming integer-
# only code by factor of 2.

# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.

# Command line handling.
#
# The first argument is either the output file name (recognised by having
# a ".ext" suffix) or the perlasm "flavour"; in the latter case the
# remaining arguments are scanned for the first one that looks like a
# file name, and that one becomes the output file.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl in this script's directory, then in
    # ../../perlasm relative to it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed below is piped through the translator.  Fail
    # right away if the pipe cannot be set up instead of silently
    # printing into a closed handle.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    # No translation requested: write the generated text to the file.
    open STDOUT,">$output"
        or die "can't open $output: $!";
}

# Register assignment for the integer-only code path.  These are
# package globals because the heredoc templates below interpolate them.
($ctx,$inp,$len) = ("r0","r1","r2");    # hash state, input, block count
($a,$b,$c,$d,$e) = ("r3","r4","r5","r6","r7");  # working variables
$K  = "r8";                             # current round constant
($t0,$t1,$t2,$t3) = ("r9","r10","r11","r12");   # scratch
$Xi = "r14";                            # pointer into X[] kept on the stack
@V  = ($a,$b,$c,$d,$e);                 # rotated after every round

# Xupdate(a,b,c,d,e[,opt1[,opt2]])
#
# Appends to $code the part of a round shared by BODY_16_19, BODY_20_39
# and BODY_40_59.  The emitted instructions:
#   - load four earlier X[] words at offsets 15, 13, 7 and 2 words from
#     $Xi, XOR them together, rotate the result (ror#31) and push it
#     with a pre-decrementing store through $Xi;
#   - add $K, ROR(a,27) and the new X[] word into e;
#   - start the round function with "eor $t1,c,d".
# $opt1/$opt2 are complete instructions supplied by the caller and are
# spliced in verbatim after the store; an omitted one leaves only the
# trailing assembler comment on that line.
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
ldr $t0,[$Xi,#15*4]
ldr $t1,[$Xi,#13*4]
ldr $t2,[$Xi,#7*4]
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
___
}

# BODY_00_15(a,b,c,d,e)
#
# Appends one of rounds 0..15 to $code.  The input word is fetched from
# $inp (post-incremented by 4): byte by byte when __ARM_ARCH__<7,
# otherwise with a single ldr followed by a byte swap on __ARMEL__.
# The word is pushed to the X[] area through $Xi and added into e
# together with $K, ROR(a,27) and the round function of b, c and d.
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
ldrb $t1,[$inp,#2]
ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp],#4
orr $t0,$t0,$t1,lsl#8
eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
str $t0,[$Xi,#-4]!
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}

# BODY_16_19(a,b,c,d,e)
#
# Appends one of rounds 16..19 to $code: the common Xupdate sequence
# with the "and" step of the round function spliced in, followed by the
# final "eor" with d and the accumulation into e.
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
    &Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}

# BODY_20_39(a,b,c,d,e)
#
# Appends one round to $code using the three-way XOR round function:
# Xupdate already emits "eor $t1,c,d", and the spliced-in instruction
# XORs b into it.  The generated loop is labelled .L_20_39_or_60_79 by
# the caller, i.e. the same body serves both round ranges.
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
    &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
add $e,$e,$t1 @ E+=F_20_39(B,C,D)
___
}

# BODY_40_59(a,b,c,d,e)
#
# Appends one of rounds 40..59 to $code.  The round function is added
# into e as two separate terms: "b AND (c XOR d)" held in $t1 and
# "c AND d" held in $t2, the latter with the pending ror#2 applied at
# the point of addition.
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
    &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
add $e,$e,$t1 @ E+=F_40_59(B,C,D)
add $e,$e,$t2,ror#2
___
}

# Integer-only code path.
#
# File prologue and the sha1_block_data_order entry point.  When built
# with __ARM_MAX_ARCH__>=7 the entry point first tests OPENSSL_armcap_P
# and branches to .LARMv8 or .LNEON when the matching capability bit is
# set; otherwise execution falls through to the integer loop below.
$code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

.global sha1_block_data_order
.type sha1_block_data_order,%function

.align 5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
.Lsha1_block:
adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
.Lloop:
ldr $K,.LK_00_19
mov $Xi,sp
sub sp,sp,#15*4
mov $c,$c,ror#30
mov $d,$d,ror#30
mov $e,$e,ror#30 @ [6]
.L_00_15:
___
# Rounds 0..14: five unrolled rounds per pass of the .L_00_15 loop; the
# loop ends when $Xi has come down to sp.  @V is rotated after every
# round so that each round sees the variables in their new roles.
for($i=0;$i<5;$i++) {
    &BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp
#endif
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
# Round 15 followed by rounds 16..19, emitted straight-line.
&BODY_00_15(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
$code.=<<___;

ldr $K,.LK_20_39 @ [+15+16*4]
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
# One loop body shared by rounds 20..39 and 60..79; the carry flag
# tells the two passes apart (see the cmn/cmp comments in the template).
for($i=0;$i<5;$i++) {
    &BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp @ preserve carry
#endif
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes

ldr $K,.LK_40_59
sub sp,sp,#20*4 @ [+2]
.L_40_59:
___
# Rounds 40..59.
for($i=0;$i<5;$i++) {
    &BODY_40_59(@V); unshift(@V,pop(@V));
}
# Loop tail, jump back into the shared body for rounds 60..79, context
# update, function epilogue and the constant pool.
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp
#endif
bne .L_40_59 @ [+((12+5)*5+2)*4]

ldr $K,.LK_60_79
sub sp,sp,#20*4
cmp sp,#0 @ set carry to denote 60_79
b .L_20_39_or_60_79 @ [+4], spare 300 bytes
.L_done:
add sp,sp,#80*4 @ "deallocate" stack frame
ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
add $a,$K,$a
add $b,$t0,$b
add $c,$t1,$c,ror#2
add $d,$t2,$d,ror#2
add $e,$t3,$e,ror#2
stmia $ctx,{$a,$b,$c,$d,$e}
teq $inp,$len
bne .Lloop @ [+18], total 1307

#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha1_block_data_order,.-sha1_block_data_order

.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha1_block
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
#####################################################################
# NEON stuff
#
# Everything in this block is lexically scoped to it.  The scalar round
# bodies are kept as lists of Perl source strings (body_XX_YY) and are
# eval'ed one at a time in between the NEON message-schedule
# instructions, which is what interleaves the two instruction streams.
#
# NOTE: the strings refer to $a..$e, @V, $j, $t0, $t1 and $Ki by name,
# so every eval has to stay textually inside a sub that declares
# "my ($a,$b,$c,$d,$e)" within this block.  The order of statements in
# the Xupdate_*/Xuplast_80/Xloop subs is the instruction schedule; do
# not reorder them.
{{{
my @V=($a,$b,$c,$d,$e);
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
my $Xi=4;                       # counts calls to the X[] update subs
my @X=map("q$_",(8..11,0..3));  # rotating window of message vectors
my @Tx=("q12","q13");           # NEON temporaries
my ($K,$zero)=("q14","q15");    # broadcast round constant, all-zero
my $j=0;                        # scalar round counter, bumped by bodies

# Any call to an undefined sub is turned into one line of assembly: the
# sub name is the mnemonic (first "_" becomes "."), the arguments are
# the operands, and a purely numeric last operand gets a "#" prefix.
sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Each body_* sub returns the Perl source for ONE scalar round as a list
# of strings, one emitted instruction per string.  The first string also
# reloads ($a..$e) from @V; the last one bumps $j and rotates @V.
sub body_00_19 () {
    (
    '($a,$b,$c,$d,$e)=@V;'.             # '$code.="@ $j\n";'.
    '&bic ($t0,$d,$b)',
    '&add ($e,$e,$Ki)',                 # e+=X[i]+K
    '&and ($t1,$c,$b)',
    '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
    '&add ($e,$e,$a,"ror#27")',         # e+=ROR(A,27)
    '&eor ($t1,$t1,$t0)',               # F_00_19
    '&mov ($b,$b,"ror#2")',             # b=ROR(b,2)
    '&add ($e,$e,$t1);'.                # e+=F_00_19
    '$j++; unshift(@V,pop(@V));'
    )
}
sub body_20_39 () {
    (
    '($a,$b,$c,$d,$e)=@V;'.             # '$code.="@ $j\n";'.
    '&eor ($t0,$b,$d)',
    '&add ($e,$e,$Ki)',                 # e+=X[i]+K
    '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
    '&eor ($t1,$t0,$c)',                # F_20_39
    '&add ($e,$e,$a,"ror#27")',         # e+=ROR(A,27)
    '&mov ($b,$b,"ror#2")',             # b=ROR(b,2)
    '&add ($e,$e,$t1);'.                # e+=F_20_39
    '$j++; unshift(@V,pop(@V));'
    )
}
sub body_40_59 () {
    (
    '($a,$b,$c,$d,$e)=@V;'.             # '$code.="@ $j\n";'.
    '&add ($e,$e,$Ki)',                 # e+=X[i]+K
    '&and ($t0,$c,$d)',
    '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
    '&add ($e,$e,$a,"ror#27")',         # e+=ROR(A,27)
    '&eor ($t1,$c,$d)',
    '&add ($e,$e,$t0)',
    '&and ($t1,$t1,$b)',
    '&mov ($b,$b,"ror#2")',             # b=ROR(b,2)
    '&add ($e,$e,$t1);'.                # e+=F_40_59
    '$j++; unshift(@V,pop(@V));'
    )
}

# Xupdate_16_31(\&body): emits four scalar rounds taken from $body,
# interleaved with the NEON computation of the next message vector and
# the store of X[]+K to the transfer area at $Xfer.  A new K is loaded
# on every 5th call and $Xfer is rewound by 64 on every 4th call.
sub Xupdate_16_31 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

    &vext_8 (@X[0],@X[-4&7],@X[-3&7],8);        # compose "X[-14]" in "X[0]"
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32 (@Tx[1],@X[-1&7],$K);
     eval(shift(@insns));
      &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
     eval(shift(@insns));
    &vext_8 (@Tx[0],@X[-1&7],$zero,4);          # "X[-3]", 3 words
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &veor (@X[0],@X[0],@X[-4&7]);               # "X[0]"^="X[-16]"
     eval(shift(@insns));
     eval(shift(@insns));
    &veor (@Tx[0],@Tx[0],@X[-2&7]);             # "X[-3]"^"X[-8]"
     eval(shift(@insns));
     eval(shift(@insns));
    &veor (@Tx[0],@Tx[0],@X[0]);                # "X[0]"^="X[-3]"^"X[-8]
     eval(shift(@insns));
     eval(shift(@insns));
    &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");      # X[]+K xfer
      &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
     eval(shift(@insns));
     eval(shift(@insns));
    &vext_8 (@Tx[1],$zero,@Tx[0],4);            # "X[0]"<<96, extract one dword
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32 (@X[0],@Tx[0],@Tx[0]);
     eval(shift(@insns));
     eval(shift(@insns));
    &vsri_32 (@X[0],@Tx[0],31);                 # "X[0]"<<<=1
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &vshr_u32 (@Tx[0],@Tx[1],30);
     eval(shift(@insns));
     eval(shift(@insns));
    &vshl_u32 (@Tx[1],@Tx[1],2);
     eval(shift(@insns));
     eval(shift(@insns));
    &veor (@X[0],@X[0],@Tx[0]);
     eval(shift(@insns));
     eval(shift(@insns));
    &veor (@X[0],@X[0],@Tx[1]);                 # "X[0]"^=("X[0]">>96)<<<2

    foreach (@insns) { eval; }  # remaining instructions [if any]

  $Xi++; push(@X,shift(@X));    # "rotate" X[]
}

# Xupdate_32_79(\&body): same contract as Xupdate_16_31, using the
# shorter NEON sequence below (xor of three vectors, then a rotate by 2
# built from vshr/vsli).
sub Xupdate_32_79 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

    &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8);       # compose "X[-6]"
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &veor (@X[0],@X[0],@X[-4&7]);               # "X[0]"="X[-32]"^"X[-16]"
     eval(shift(@insns));
     eval(shift(@insns));
    &veor (@X[0],@X[0],@X[-7&7]);               # "X[0]"^="X[-28]"
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32 (@Tx[1],@X[-1&7],$K);
     eval(shift(@insns));
      &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
     eval(shift(@insns));
    &veor (@Tx[0],@Tx[0],@X[0]);                # "X[-6]"^="X[0]"
     eval(shift(@insns));
     eval(shift(@insns));
    &vshr_u32 (@X[0],@Tx[0],30);
     eval(shift(@insns));
     eval(shift(@insns));
    &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");      # X[]+K xfer
      &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
     eval(shift(@insns));
     eval(shift(@insns));
    &vsli_32 (@X[0],@Tx[0],2);                  # "X[0]"="X[-6]"<<<2

    foreach (@insns) { eval; }  # remaining instructions [if any]

  $Xi++; push(@X,shift(@X));    # "rotate" X[]
}

# Xuplast_80(\&body): emits four scalar rounds while storing the last
# X[]+K vector, rewinding $K_XX_XX, and loading the next 64-byte block.
# When $inp already equals $len the pointer is stepped back by 64 first,
# so the last block is loaded again rather than reading past the input.
# Resets $Xi to 0 for the Xloop calls that follow.
sub Xuplast_80 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

    &vadd_i32 (@Tx[1],@X[-1&7],$K);
     eval(shift(@insns));
     eval(shift(@insns));
    &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
    &sub ($Xfer,$Xfer,64);

    &teq ($inp,$len);
    &sub ($K_XX_XX,$K_XX_XX,16);                # rewind $K_XX_XX
    &it ("eq");
    &subeq ($inp,$inp,64);                      # reload last block to avoid SEGV
    &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
     eval(shift(@insns));
     eval(shift(@insns));
    &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
     eval(shift(@insns));
     eval(shift(@insns));
    &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!");     # load K_00_19
     eval(shift(@insns));
     eval(shift(@insns));
    &vrev32_8 (@X[-4&7],@X[-4&7]);

    foreach (@insns) { eval; }  # remaining instructions

  $Xi=0;
}

# Xloop(\&body): emits four scalar rounds while byte-swapping one of the
# freshly loaded vectors and storing X[]+K for another.  Unlike the
# Xupdate subs it selects vectors by $Xi and does not rotate @X.
sub Xloop()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

    &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
     eval(shift(@insns));
     eval(shift(@insns));
    &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
     eval(shift(@insns));
     eval(shift(@insns));
    &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU

    foreach (@insns) { eval; }

  $Xi++;
}

# Function prologue: save registers, carve a 16-byte aligned transfer
# area below sp, load the context and the first block, and store the
# first three X[]+K vectors.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
sub $Xfer,sp,#64
adr $K_XX_XX,.LK_00_19
bic $Xfer,$Xfer,#15 @ align for 128-bit stores

ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
mov sp,$Xfer @ alloca

vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
vrev32.8 @X[-2&7],@X[-2&7]
vadd.i32 @X[0],@X[-4&7],$K
vrev32.8 @X[-1&7],@X[-1&7]
vadd.i32 @X[1],@X[-3&7],$K
vst1.32 {@X[0]},[$Xfer,:128]!
vadd.i32 @X[2],@X[-2&7],$K
vst1.32 {@X[1]},[$Xfer,:128]!
vst1.32 {@X[2]},[$Xfer,:128]!
ldr $Ki,[sp] @ big RAW stall

.Loop_neon:
___
# Twenty calls of four rounds each make up the 80 rounds of one block.
    &Xupdate_16_31(\&body_00_19);
    &Xupdate_16_31(\&body_00_19);
    &Xupdate_16_31(\&body_00_19);
    &Xupdate_16_31(\&body_00_19);
    &Xupdate_32_79(\&body_00_19);
    &Xupdate_32_79(\&body_20_39);
    &Xupdate_32_79(\&body_20_39);
    &Xupdate_32_79(\&body_20_39);
    &Xupdate_32_79(\&body_20_39);
    &Xupdate_32_79(\&body_20_39);
    &Xupdate_32_79(\&body_40_59);
    &Xupdate_32_79(\&body_40_59);
    &Xupdate_32_79(\&body_40_59);
    &Xupdate_32_79(\&body_40_59);
    &Xupdate_32_79(\&body_40_59);
    &Xupdate_32_79(\&body_20_39);
    &Xuplast_80(\&body_20_39);
    &Xloop(\&body_20_39);
    &Xloop(\&body_20_39);
    &Xloop(\&body_20_39);
# Add the working variables back into the context.  The eq/ne
# conditions still refer to the "teq $inp,$len" emitted by Xuplast_80:
# on eq the saved sp is restored and the function returns, on ne the
# loop is re-entered.
$code.=<<___;
ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
add $a,$a,$Ki
ldr $Ki,[$ctx,#16]
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
it eq
moveq sp,$saved_sp
add $e,$e,$Ki
it ne
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
itt ne
addne $Xfer,sp,#3*16
bne .Loop_neon

@ vldmia sp!,{d8-d15}
ldmia sp!,{r4-r12,pc}
.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
# Code path built on the sha1c/sha1p/sha1m/sha1h/sha1su0/sha1su1
# instructions.  The mnemonics are written out symbolically here and
# are converted to INST(...) byte sequences by unsha1() when the text
# is post-processed at the end of this script.
{{{
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7));
my @Kxx=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));

# Prologue, loading of the four round constants, and the first round
# group of the per-block loop.
# NOTE(review): "@Kxx[$j]" in this first template is expanded before the
# loop below assigns $j; it appears to rely on $j being undefined here,
# which selects @Kxx[0] -- confirm before touching $j.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7

# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xf,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif

.type sha1_block_data_order_armv8,%function
.align 5
sha1_block_data_order_armv8:
.LARMv8:
vstmdb sp!,{d8-d15} @ ABI specification says so

veor $E,$E,$E
adr r3,.LK_00_19
vld1.32 {$ABCD},[$ctx]!
vld1.32 {$E\[0]},[$ctx]
sub $ctx,$ctx,#16
vld1.32 {@Kxx[0]\[]},[r3,:32]!
vld1.32 {@Kxx[1]\[]},[r3,:32]!
vld1.32 {@Kxx[2]\[]},[r3,:32]!
vld1.32 {@Kxx[3]\[]},[r3,:32]

.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]

vadd.i32 $W0,@Kxx[0],@MSG[0]
vrev32.8 @MSG[2],@MSG[2]
vmov $ABCD_SAVE,$ABCD @ offload
subs $len,$len,#1

vadd.i32 $W1,@Kxx[0],@MSG[1]
vrev32.8 @MSG[3],@MSG[3]
sha1h $E1,$ABCD @ 0
sha1c $ABCD,$E,$W0
vadd.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
# Round groups 1..16.  $f picks the sha1c/p/m/p variant from $i/5, the
# E and W register pairs swap roles each iteration, @MSG rotates, and
# $j (index of the K vector) advances whenever ($i+3)%5==0.
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1$f $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
    ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
    push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
# Final three round groups (no further schedule updates), accumulation
# into the saved state, loop branch on the flags set by "subs" above,
# store of the result and return.
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1p $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]

sha1h $E1,$ABCD @ 18
sha1p $ABCD,$E0,$W0

sha1h $E0,$ABCD @ 19
sha1p $ABCD,$E1,$W1

vadd.i32 $E,$E,$E0
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
bne .Loop_v8

vst1.32 {$ABCD},[$ctx]!
vst1.32 {$E\[0]},[$ctx]

vldmia sp!,{d8-d15}
ret @ bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
# Declare OPENSSL_armcap_P, the word tested at the sha1_block_data_order
# entry point, as a common symbol (only when __ARM_MAX_ARCH__>=7).
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___

{   # Base encodings of the SHA1 instructions, register fields zeroed.
    my %opcode = (
        "sha1c" => 0xf2000c40,  "sha1p"   => 0xf2100c40,
        "sha1m" => 0xf2200c40,  "sha1su0" => 0xf2300c40,
        "sha1h" => 0xf3b902c0,  "sha1su1" => 0xf3ba0380 );

    # unsha1(mnemonic, operands)
    #
    # Converts a symbolic SHA1 instruction with two or three q-register
    # operands into an "INST(b0,b1,b2,b3)" macro invocation carrying the
    # four encoding bytes (least significant first), followed by the
    # original instruction as an assembler comment.  Returns an empty
    # string if the operands cannot be parsed.
    sub unsha1 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/) {
            # The middle operand is absent in the two-operand forms; it
            # then contributes no bits to the encoding.
            my ($d,$n,$m) = ($1, defined($2) ? $2 : 0, $3);

            # Each register number is split into its low three bits and
            # bit 3, which land in separate fields of the word.
            my $word = $opcode{$mnemonic}|(($d&7)<<13)|(($d&8)<<19)
                                         |(($n&7)<<17)|(($n&8)<<4)
                                         |(($m&7)<<1) |(($m&8)<<2);
            # since ARMv7 instructions are always encoded little-endian.
            # correct solution is to use .inst directive, but older
            # assemblers don't implement it:-(

            # this fix-up provides Thumb encoding in conjunction with INST
            $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
            return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
        return "";
    }
}

# Post-process the generated text line by line and print it.
foreach (split($/,$code)) {
    # Rewrite "{qN[]}" as the pair of d registers "{d2N[],d2N+1[]}" and
    # "{qN[0]}" as "{d2N[0]}".  Braces are escaped so that they are
    # always taken literally by the regex engine.
    s/\{q([0-9]+)\[\]\}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/e or
    s/\{q([0-9]+)\[0\]\}/sprintf "{d%d[0]}",2*$1/e;

    # Replace symbolic SHA1 instructions with INST(...) byte sequences.
    s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/ge;

    # "ret" becomes "bx lr"; a "bx lr" already present in the text is
    # emitted as a raw instruction word instead.
    s/\bret\b/bx lr/ or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/;   # make it possible to compile with -march=armv4

    print $_,$/;
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush