sha256-armv4.pl@ 101021

Last change on this file since 101021 was 101021, checked in by vboxsync, 20 months ago
openssl-3.1.2: Applied and adjusted our OpenSSL changes to 3.1.0. bugref:10519
File size: 18.6 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the Apache License 2.0 (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	#
16	# Permission to use under GPL terms is granted.
17	# ====================================================================
18
19	# SHA256 block procedure for ARMv4. May 2007.
20
21	# Performance is ~2x better than gcc 3.4 generated code and in "abso-
22	# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
23	# byte [on single-issue Xscale PXA250 core].
24
25	# July 2010.
26	#
27	# Rescheduling for dual-issue pipeline resulted in 22% improvement on
28	# Cortex A8 core and ~20 cycles per processed byte.
29
30	# February 2011.
31	#
32	# Profiler-assisted and platform-specific optimization resulted in 16%
33	# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
34
35	# September 2013.
36	#
37	# Add NEON implementation. On Cortex A8 it was measured to process one
38	# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
39	# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
40	# code (meaning that latter performs sub-optimally, nothing was done
41	# about it).
42
43	# May 2014.
44	#
45	# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
46
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
#
# Both stay package globals on purpose: $flavour is consulted again further
# down when the ARMv8 section picks its byte directive.
$output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
$flavour = $#ARGV >= 0 && $ARGV[0]      !~ m|\.|     ? shift : undef;

if ($flavour && $flavour ne "void") {
    # Look for the arm-xlate.pl translator next to this script first, then
    # in ../../perlasm relative to it, and pipe everything we print to it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    # No translator requested: write straight to $output when one was
    # given.  Fail loudly instead of silently printing to the old STDOUT.
    if ($output) {
        open STDOUT, '>', $output
            or die "can't open $output: $!";
    }
}
63
# Register names used by the integer-only code path.  Several names share a
# register: $ctx/$t0 are both r0, $inp/$t4 both r1, $len/$t1 both r2 and
# $T1/$t3 both r3.  @V lists the eight working-variable registers in a..h
# order; the round generators are called with it and it is rotated after
# every round.
64	$ctx="r0"; $t0="r0";
65	$inp="r1"; $t4="r1";
66	$len="r2"; $t1="r2";
67	$T1="r3"; $t3="r3";
68	$A="r4";
69	$B="r5";
70	$C="r6";
71	$D="r7";
72	$E="r8";
73	$F="r9";
74	$G="r10";
75	$H="r11";
76	@V=($A,$B,$C,$D,$E,$F,$G,$H);
77	$t2="r12";
78	$Ktbl="r14";
79
# Rotate/shift amounts substituted into the ror#/lsr# operands (and the NEON
# vshr/vsli immediates) of the generated code.
80	@Sigma0=( 2,13,22);
81	@Sigma1=( 6,11,25);
82	@sigma0=( 7,18, 3);
83	@sigma1=(17,19,10);
84
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly text for one round to the global $code.
#   $i      - round index; it selects the pre-processor branches emitted
#             below and the stack slot (i%16) that receives X[i].
#   $a..$h  - register names currently holding the eight working variables.
# Nothing useful is returned; the effect is the text added to $code and the
# swap of the globals $t2/$t3 at the end.
85	sub BODY_00_15 {
86	my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
87
# Input-word handling, emitted only for the first 16 rounds.  On ARMv7+ the
# word was already fetched by the previous round's "prefetch" load and is
# only byte-swapped here (unless __ARMEB__); older cores assemble it from
# four byte loads.
88	$code.=<<___ if ($i<16);
89	#if __ARM_ARCH__>=7
90	@ ldr $t1,[$inp],#4 @ $i
91	# if $i==15
92	str $inp,[sp,#17*4] @ make room for $t4
93	# endif
94	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
95	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
96	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
97	# ifndef __ARMEB__
98	rev $t1,$t1
99	# endif
100	#else
101	@ ldrb $t1,[$inp,#3] @ $i
102	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
103	ldrb $t2,[$inp,#2]
104	ldrb $t0,[$inp,#1]
105	orr $t1,$t1,$t2,lsl#8
106	ldrb $t2,[$inp],#4
107	orr $t1,$t1,$t0,lsl#16
108	# if $i==15
109	str $inp,[sp,#17*4] @ make room for $t4
110	# endif
111	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
112	orr $t1,$t1,$t2,lsl#24
113	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
114	#endif
115	___
# Round body proper, emitted for every round.  The backtick expressions are
# evaluated by the post-processing loop at the end of the script, and the
# "#if $i..." lines are left for the C pre-processor with $i substituted.
# The final Maj addition is only shown as an assembler comment; it is
# performed at the start of the following round ("from the past").
116	$code.=<<___;
117	ldr $t2,[$Ktbl],#4 @ *K256++
118	add $h,$h,$t1 @ h+=X[i]
119	str $t1,[sp,#`$i%16`*4]
120	eor $t1,$f,$g
121	add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
122	and $t1,$t1,$e
123	add $h,$h,$t2 @ h+=K256[i]
124	eor $t1,$t1,$g @ Ch(e,f,g)
125	eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
126	add $h,$h,$t1 @ h+=Ch(e,f,g)
127	#if $i==31
128	and $t2,$t2,#0xff
129	cmp $t2,#0xf2 @ done?
130	#endif
131	#if $i<15
132	# if __ARM_ARCH__>=7
133	ldr $t1,[$inp],#4 @ prefetch
134	# else
135	ldrb $t1,[$inp,#3]
136	# endif
137	eor $t2,$a,$b @ a^b, b^c in next round
138	#else
139	ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
140	eor $t2,$a,$b @ a^b, b^c in next round
141	ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
142	#endif
143	eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
144	and $t3,$t3,$t2 @ (b^c)&=(a^b)
145	add $d,$d,$h @ d+=h
146	eor $t3,$t3,$b @ Maj(a,b,c)
147	add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
148	@ add $h,$h,$t3 @ h+=Maj(a,b,c)
149	___
# Swap the two scratch names so the next round finds this round's a^b under
# the name it uses for b^c, and this round's Maj value under $t2.
150	($t2,$t3)=($t3,$t2);
151	}
152
# BODY_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends to the global $code the message-schedule step for round $i
# (sigma0 of X[i+1] plus sigma1 of X[i+14] plus X[i] plus X[i+9], with the
# X[] slots indexed modulo 16 on the stack), then hands the same arguments
# to BODY_00_15 for the round body.  It is called with $i>=16, so
# BODY_00_15 skips its input-loading here-doc.
153	sub BODY_16_XX {
154	my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
155
156	$code.=<<___;
157	@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
158	@ ldr $t4,[sp,#`($i+14)%16`*4]
159	mov $t0,$t1,ror#$sigma0[0]
160	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
161	mov $t2,$t4,ror#$sigma1[0]
162	eor $t0,$t0,$t1,ror#$sigma0[1]
163	eor $t2,$t2,$t4,ror#$sigma1[1]
164	eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
165	ldr $t1,[sp,#`($i+0)%16`*4]
166	eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
167	ldr $t4,[sp,#`($i+9)%16`*4]
168
169	add $t2,$t2,$t0
170	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
171	add $t1,$t1,$t2
172	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
173	add $t1,$t1,$t4 @ X[i]
174	___
# @_ is passed through untouched, so BODY_00_15 sees the same round index
# and register assignment.
175	&BODY_00_15(@_);
176	}
177
178	$code=<<___;
179	#ifndef __KERNEL__
180	# include "arm_arch.h"
181	#else
182	# define __ARM_ARCH__ __LINUX_ARM_ARCH__
183	# define __ARM_MAX_ARCH__ 7
184	#endif
185
186	#if defined(__thumb2__)
187	.syntax unified
188	.thumb
189	#else
190	.code 32
191	#endif
192
193	.text
194
195	.type K256,%object
196	.align 5
197	K256:
198	.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
199	.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
200	.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
201	.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
202	.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
203	.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
204	.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
205	.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
206	.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
207	.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
208	.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
209	.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
210	.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
211	.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
212	.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
213	.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
214	.size K256,.-K256
215	.word 0 @ terminator
216	#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
217	.LOPENSSL_armcap:
218	# ifdef _WIN32
219	.word OPENSSL_armcap_P
220	# else
221	.word OPENSSL_armcap_P-.Lsha256_block_data_order
222	# endif
223	#endif
224	.align 5
225
226	.global sha256_block_data_order
227	.type sha256_block_data_order,%function
228	sha256_block_data_order:
229	.Lsha256_block_data_order:
230	#if __ARM_ARCH__<7 && !defined(__thumb2__)
231	sub r3,pc,#8 @ sha256_block_data_order
232	#else
233	adr r3,.Lsha256_block_data_order
234	#endif
235	#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
236	ldr r12,.LOPENSSL_armcap
237	# if !defined(_WIN32)
238	ldr r12,[r3,r12] @ OPENSSL_armcap_P
239	# endif
240	# if defined(__APPLE__) \|\| defined(_WIN32)
241	ldr r12,[r12]
242	# endif
243	tst r12,#ARMV8_SHA256
244	bne .LARMv8
245	tst r12,#ARMV7_NEON
246	bne .LNEON
247	#endif
248	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
249	stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
250	ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
251	sub $Ktbl,r3,#256+32 @ K256
252	sub sp,sp,#16*4 @ alloca(X[16])
253	.Loop:
254	# if __ARM_ARCH__>=7
255	ldr $t1,[$inp],#4
256	# else
257	ldrb $t1,[$inp,#3]
258	# endif
259	eor $t3,$B,$C @ magic
260	eor $t2,$t2,$t2
261	___
# Unroll rounds 0..15 with the input-loading body, place the
# .Lrounds_16_xx label, then unroll 16 more rounds with the message-schedule
# body.  @V is rotated right by one after every round so each call receives
# the registers in their new a..h roles.
262	for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
263	$code.=".Lrounds_16_xx:\n";
264	for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
265	$code.=<<___;
266	#ifdef __thumb2__
267	ite eq @ Thumb2 thing, sanity check in ARM
268	#endif
269	ldreq $t3,[sp,#16*4] @ pull ctx
270	bne .Lrounds_16_xx
271
272	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
273	ldr $t0,[$t3,#0]
274	ldr $t1,[$t3,#4]
275	ldr $t2,[$t3,#8]
276	add $A,$A,$t0
277	ldr $t0,[$t3,#12]
278	add $B,$B,$t1
279	ldr $t1,[$t3,#16]
280	add $C,$C,$t2
281	ldr $t2,[$t3,#20]
282	add $D,$D,$t0
283	ldr $t0,[$t3,#24]
284	add $E,$E,$t1
285	ldr $t1,[$t3,#28]
286	add $F,$F,$t2
287	ldr $inp,[sp,#17*4] @ pull inp
288	ldr $t2,[sp,#18*4] @ pull inp+len
289	add $G,$G,$t0
290	add $H,$H,$t1
291	stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
292	cmp $inp,$t2
293	sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
294	bne .Loop
295
296	add sp,sp,#`16+3`*4 @ destroy frame
297	#if __ARM_ARCH__>=5
298	ldmia sp!,{r4-r11,pc}
299	#else
300	ldmia sp!,{r4-r11,lr}
301	tst lr,#1
302	moveq pc,lr @ be binary compatible with V4, yet
303	bx lr @ interoperable with Thumb ISA:-)
304	#endif
305	.size sha256_block_data_order,.-sha256_block_data_order
306	___
307	######################################################################
308	# NEON stuff
309	#
310	{{{
311	my @X=map("q$_",(0..3));
312	my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
313	my $Xfer=$t4;
314	my $j=0;
315
# Dlo($qreg): name of the low 64-bit half of a NEON quad register, i.e.
# "qN" -> "d(2N)".  The first "q" followed by one or two digits (an optional
# leading "1", then any digit) anywhere in the argument is used; an argument
# without such a register name yields "".
# The empty prototype was dropped: every caller uses the &Dlo(...) form,
# which ignores prototypes, and the sub does take an argument.
sub Dlo {
    my ($reg) = @_;
    return $reg =~ m|q([1]?[0-9])| ? "d".($1*2) : "";
}
# Dhi($qreg): name of the high 64-bit half of a NEON quad register, i.e.
# "qN" -> "d(2N+1)".  The first "q" followed by one or two digits (an
# optional leading "1", then any digit) anywhere in the argument is used; an
# argument without such a register name yields "".
# The empty prototype was dropped: every caller uses the &Dhi(...) form,
# which ignores prototypes, and the sub does take an argument.
sub Dhi {
    my ($reg) = @_;
    return $reg =~ m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
318
# Catch-all for calls to subs that are not defined, e.g. &vext_8(...) or
# &add(...).  The called name, stripped of its package prefix and with its
# first "_" turned into ".", becomes the instruction mnemonic.  The
# arguments are joined with commas; a last argument that compares equal to
# its own numeric value gets a "#" prefix (immediate operand).  The
# resulting line is appended to the global $code.
319	sub AUTOLOAD() # thunk [simplified] x86-style perlasm
320	{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
321	my $arg = pop;
322	$arg = "#$arg" if ($arg*1 eq $arg);
323	$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
324	}
325
# Xupdate(\&body)
#
# Emits one NEON message-schedule update for the vector currently in
# $X[0], interleaved with scalar round code.  $body is a code reference
# returning a list of Perl snippets (strings); it is called four times and
# the snippets are eval'ed one or two at a time between the NEON
# instructions below, which go out through the AUTOLOAD thunk.  The updated
# vector is added to the next four K256 words (loaded through $Ktbl) and
# stored through $Xfer; finally @X is rotated so the next call works on the
# following vector.  The order of the statements is the instruction
# schedule, so it must not be rearranged.
326	sub Xupdate()
327	{ use integer;
328	my $body = shift;
329	my @insns = (&$body,&$body,&$body,&$body);
# Lexicals that the eval'ed snippets assign to and read.
330	my ($a,$b,$c,$d,$e,$f,$g,$h);
331
332	&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
333	eval(shift(@insns));
334	eval(shift(@insns));
335	eval(shift(@insns));
336	&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
337	eval(shift(@insns));
338	eval(shift(@insns));
339	eval(shift(@insns));
340	&vshr_u32 ($T2,$T0,$sigma0[0]);
341	eval(shift(@insns));
342	eval(shift(@insns));
343	&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
344	eval(shift(@insns));
345	eval(shift(@insns));
346	&vshr_u32 ($T1,$T0,$sigma0[2]);
347	eval(shift(@insns));
348	eval(shift(@insns));
349	&vsli_32 ($T2,$T0,32-$sigma0[0]);
350	eval(shift(@insns));
351	eval(shift(@insns));
352	&vshr_u32 ($T3,$T0,$sigma0[1]);
353	eval(shift(@insns));
354	eval(shift(@insns));
355	&veor ($T1,$T1,$T2);
356	eval(shift(@insns));
357	eval(shift(@insns));
358	&vsli_32 ($T3,$T0,32-$sigma0[1]);
359	eval(shift(@insns));
360	eval(shift(@insns));
361	&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
362	eval(shift(@insns));
363	eval(shift(@insns));
364	&veor ($T1,$T1,$T3); # sigma0(X[1..4])
365	eval(shift(@insns));
366	eval(shift(@insns));
367	&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
368	eval(shift(@insns));
369	eval(shift(@insns));
370	&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
371	eval(shift(@insns));
372	eval(shift(@insns));
373	&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
374	eval(shift(@insns));
375	eval(shift(@insns));
376	&veor ($T5,$T5,$T4);
377	eval(shift(@insns));
378	eval(shift(@insns));
379	&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
380	eval(shift(@insns));
381	eval(shift(@insns));
382	&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
383	eval(shift(@insns));
384	eval(shift(@insns));
385	&veor ($T5,$T5,$T4); # sigma1(X[14..15])
386	eval(shift(@insns));
387	eval(shift(@insns));
388	&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
389	eval(shift(@insns));
390	eval(shift(@insns));
391	&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
392	eval(shift(@insns));
393	eval(shift(@insns));
394	&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
395	eval(shift(@insns));
396	eval(shift(@insns));
397	&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
398	eval(shift(@insns));
399	eval(shift(@insns));
400	&veor ($T5,$T5,$T4);
401	eval(shift(@insns));
402	eval(shift(@insns));
403	&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
404	eval(shift(@insns));
405	eval(shift(@insns));
406	&vld1_32 ("{$T0}","[$Ktbl,:128]!");
407	eval(shift(@insns));
408	eval(shift(@insns));
409	&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
410	eval(shift(@insns));
411	eval(shift(@insns));
412	&veor ($T5,$T5,$T4); # sigma1(X[16..17])
413	eval(shift(@insns));
414	eval(shift(@insns));
415	&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
416	eval(shift(@insns));
417	eval(shift(@insns));
418	&vadd_i32 ($T0,$T0,@X[0]);
# Flush the remaining scalar snippets except the last two, which are issued
# after the store.
419	while($#insns>=2) { eval(shift(@insns)); }
420	&vst1_32 ("{$T0}","[$Xfer,:128]!");
421	eval(shift(@insns));
422	eval(shift(@insns));
423
424	push(@X,shift(@X)); # "rotate" X[]
425	}
426
# Xpreload(\&body)
#
# Counterpart of Xupdate used for a freshly loaded input vector: no schedule
# arithmetic is emitted, only a load of the next four K256 words through
# $Ktbl, a byte reversal of $X[0], their sum, and a store of that sum
# through $Xfer.  The snippets returned by four calls of $body are eval'ed
# in between (four at a time, then all that remain before the store), and
# @X is rotated at the end.
427	sub Xpreload()
428	{ use integer;
429	my $body = shift;
430	my @insns = (&$body,&$body,&$body,&$body);
# Lexicals that the eval'ed snippets assign to and read.
431	my ($a,$b,$c,$d,$e,$f,$g,$h);
432
433	eval(shift(@insns));
434	eval(shift(@insns));
435	eval(shift(@insns));
436	eval(shift(@insns));
437	&vld1_32 ("{$T0}","[$Ktbl,:128]!");
438	eval(shift(@insns));
439	eval(shift(@insns));
440	eval(shift(@insns));
441	eval(shift(@insns));
442	&vrev32_8 (@X[0],@X[0]);
443	eval(shift(@insns));
444	eval(shift(@insns));
445	eval(shift(@insns));
446	eval(shift(@insns));
447	&vadd_i32 ($T0,$T0,@X[0]);
448	foreach (@insns) { eval; } # remaining instructions
449	&vst1_32 ("{$T0}","[$Xfer,:128]!");
450
451	push(@X,shift(@X)); # "rotate" X[]
452	}
453
# body_00_15()
#
# Returns one scalar round as a list of Perl snippets (single-quoted
# strings) meant to be eval'ed by Xupdate/Xpreload.  Each snippet emits
# instructions through the AUTOLOAD thunk.  The first snippet copies @V into
# $a..$h; the last one advances the round counter $j, rotates @V and swaps
# $t2/$t3.  The load snippet fetches the next pre-added X[i]+K[i] word from
# the stack, except that at $j==15 it reads the word at $Ktbl and at $j==31
# it reads the word at [sp,#64].
454	sub body_00_15 () {
455	(
456	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
457	'&add ($h,$h,$t1)', # h+=X[i]+K[i]
458	'&eor ($t1,$f,$g)',
459	'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
460	'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
461	'&and ($t1,$t1,$e)',
462	'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
463	'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
464	'&eor ($t1,$t1,$g)', # Ch(e,f,g)
465	'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
466	'&eor ($t2,$a,$b)', # a^b, b^c in next round
467	'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
468	'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
469	'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
470	'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
471	'&ldr ($t1,"[sp,#64]") if ($j==31)',
472	'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
473	'&add ($d,$d,$h)', # d+=h
474	'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
475	'&eor ($t3,$t3,$b)', # Maj(a,b,c)
476	'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
477	)
478	}
479
480	$code.=<<___;
481	#if __ARM_MAX_ARCH__>=7
482	.arch armv7-a
483	.fpu neon
484
485	.global sha256_block_data_order_neon
486	.type sha256_block_data_order_neon,%function
487	.align 5
488	.skip 16
489	sha256_block_data_order_neon:
490	.LNEON:
491	stmdb sp!,{r4-r12,lr}
492
493	sub $H,sp,#16*4+16
494	adr $Ktbl,K256
495	bic $H,$H,#15 @ align for 128-bit stores
496	mov $t2,sp
497	mov sp,$H @ alloca
498	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
499
500	vld1.8 {@X[0]},[$inp]!
501	vld1.8 {@X[1]},[$inp]!
502	vld1.8 {@X[2]},[$inp]!
503	vld1.8 {@X[3]},[$inp]!
504	vld1.32 {$T0},[$Ktbl,:128]!
505	vld1.32 {$T1},[$Ktbl,:128]!
506	vld1.32 {$T2},[$Ktbl,:128]!
507	vld1.32 {$T3},[$Ktbl,:128]!
508	vrev32.8 @X[0],@X[0] @ yes, even on
509	str $ctx,[sp,#64]
510	vrev32.8 @X[1],@X[1] @ big-endian
511	str $inp,[sp,#68]
512	mov $Xfer,sp
513	vrev32.8 @X[2],@X[2]
514	str $len,[sp,#72]
515	vrev32.8 @X[3],@X[3]
516	str $t2,[sp,#76] @ save original sp
517	vadd.i32 $T0,$T0,@X[0]
518	vadd.i32 $T1,$T1,@X[1]
519	vst1.32 {$T0},[$Xfer,:128]!
520	vadd.i32 $T2,$T2,@X[2]
521	vst1.32 {$T1},[$Xfer,:128]!
522	vadd.i32 $T3,$T3,@X[3]
523	vst1.32 {$T2},[$Xfer,:128]!
524	vst1.32 {$T3},[$Xfer,:128]!
525
526	ldmia $ctx,{$A-$H}
527	sub $Xfer,$Xfer,#64
528	ldr $t1,[sp,#0]
529	eor $t2,$t2,$t2
530	eor $t3,$B,$C
531	b .L_00_48
532
533	.align 4
534	.L_00_48:
535	___
# Body of the .L_00_48 loop: four schedule updates, one per vector in @X,
# each interleaved with four scalar rounds taken from body_00_15.
536	&Xupdate(\&body_00_15);
537	&Xupdate(\&body_00_15);
538	&Xupdate(\&body_00_15);
539	&Xupdate(\&body_00_15);
540	$code.=<<___;
541	teq $t1,#0 @ check for K256 terminator
542	ldr $t1,[sp,#0]
543	sub $Xfer,$Xfer,#64
544	bne .L_00_48
545
546	ldr $inp,[sp,#68]
547	ldr $t0,[sp,#72]
548	sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
549	teq $inp,$t0
550	it eq
551	subeq $inp,$inp,#64 @ avoid SEGV
552	vld1.8 {@X[0]},[$inp]! @ load next input block
553	vld1.8 {@X[1]},[$inp]!
554	vld1.8 {@X[2]},[$inp]!
555	vld1.8 {@X[3]},[$inp]!
556	it ne
557	strne $inp,[sp,#68]
558	mov $Xfer,sp
559	___
# Four more groups of scalar rounds, interleaved with preparing the vectors
# just loaded from the next input block (byte reversal, K256 addition and
# store through $Xfer).
560	&Xpreload(\&body_00_15);
561	&Xpreload(\&body_00_15);
562	&Xpreload(\&body_00_15);
563	&Xpreload(\&body_00_15);
564	$code.=<<___;
565	ldr $t0,[$t1,#0]
566	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
567	ldr $t2,[$t1,#4]
568	ldr $t3,[$t1,#8]
569	ldr $t4,[$t1,#12]
570	add $A,$A,$t0 @ accumulate
571	ldr $t0,[$t1,#16]
572	add $B,$B,$t2
573	ldr $t2,[$t1,#20]
574	add $C,$C,$t3
575	ldr $t3,[$t1,#24]
576	add $D,$D,$t4
577	ldr $t4,[$t1,#28]
578	add $E,$E,$t0
579	str $A,[$t1],#4
580	add $F,$F,$t2
581	str $B,[$t1],#4
582	add $G,$G,$t3
583	str $C,[$t1],#4
584	add $H,$H,$t4
585	str $D,[$t1],#4
586	stmia $t1,{$E-$H}
587
588	ittte ne
589	movne $Xfer,sp
590	ldrne $t1,[sp,#0]
591	eorne $t2,$t2,$t2
592	ldreq sp,[sp,#76] @ restore original sp
593	itt ne
594	eorne $t3,$B,$C
595	bne .L_00_48
596
597	ldmia sp!,{r4-r12,pc}
598	.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
599	#endif
600	___
601	}}}
602	######################################################################
603	# ARMv8 stuff
604	#
605	{{{
# Register names for the ARMv8 code path, private to this {{{ }}} scope:
# q0-q2 for the two state halves and a scratch copy, q8-q11 for the message
# vectors, q12-q15 for the two round-constant vectors and the saved state.
# $Ktbl shadows the global of the same name and is r3 here.
606	my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
607	my @MSG=map("q$_",(8..11));
608	my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
609	my $Ktbl="r3";
# Directive used by the INST() macro to emit raw bytes: "DCB" when $flavour
# contains "win", ".byte" otherwise.
610	my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
611
612	$code.=<<___;
613	#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
614
615	# if defined(__thumb2__)
616	# define INST(a,b,c,d) $_byte c,d\|0xc,a,b
617	# else
618	# define INST(a,b,c,d) $_byte a,b,c,d
619	# endif
620
621	.type sha256_block_data_order_armv8,%function
622	.align 5
623	sha256_block_data_order_armv8:
624	.LARMv8:
625	vld1.32 {$ABCD,$EFGH},[$ctx]
626	sub $Ktbl,$Ktbl,#256+32
627	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
628	b .Loop_v8
629
630	.align 4
631	.Loop_v8:
632	vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
633	vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
634	vld1.32 {$W0},[$Ktbl]!
635	vrev32.8 @MSG[0],@MSG[0]
636	vrev32.8 @MSG[1],@MSG[1]
637	vrev32.8 @MSG[2],@MSG[2]
638	vrev32.8 @MSG[3],@MSG[3]
639	vmov $ABCD_SAVE,$ABCD @ offload
640	vmov $EFGH_SAVE,$EFGH
641	teq $inp,$len
642	___
# Emit twelve copies of the sequence "load next constants, add message
# vector, sha256su0 / sha256h / sha256h2 / sha256su1".
643	for($i=0;$i<12;$i++) {
644	$code.=<<___;
645	vld1.32 {$W1},[$Ktbl]!
646	vadd.i32 $W0,$W0,@MSG[0]
647	sha256su0 @MSG[0],@MSG[1]
648	vmov $abcd,$ABCD
649	sha256h $ABCD,$EFGH,$W0
650	sha256h2 $EFGH,$abcd,$W0
651	sha256su1 @MSG[0],@MSG[2],@MSG[3]
652	___
# Alternate the two constant registers and rotate the message vectors so
# the next copy operates on the following vector.
653	($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
654	}
655	$code.=<<___;
656	vld1.32 {$W1},[$Ktbl]!
657	vadd.i32 $W0,$W0,@MSG[0]
658	vmov $abcd,$ABCD
659	sha256h $ABCD,$EFGH,$W0
660	sha256h2 $EFGH,$abcd,$W0
661
662	vld1.32 {$W0},[$Ktbl]!
663	vadd.i32 $W1,$W1,@MSG[1]
664	vmov $abcd,$ABCD
665	sha256h $ABCD,$EFGH,$W1
666	sha256h2 $EFGH,$abcd,$W1
667
668	vld1.32 {$W1},[$Ktbl]
669	vadd.i32 $W0,$W0,@MSG[2]
670	sub $Ktbl,$Ktbl,#256-16 @ rewind
671	vmov $abcd,$ABCD
672	sha256h $ABCD,$EFGH,$W0
673	sha256h2 $EFGH,$abcd,$W0
674
675	vadd.i32 $W1,$W1,@MSG[3]
676	vmov $abcd,$ABCD
677	sha256h $ABCD,$EFGH,$W1
678	sha256h2 $EFGH,$abcd,$W1
679
680	vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
681	vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
682	it ne
683	bne .Loop_v8
684
685	vst1.32 {$ABCD,$EFGH},[$ctx]
686
687	ret @ bx lr
688	.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
689	#endif
690	___
691	}}}
692	$code.=<<___;
693	.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
694	.align 2
695	#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
696	.comm OPENSSL_armcap_P,4,4
697	#endif
698	___
699
# Copy this script's leading comment block into the generated output,
# turning Perl "#" comment markers into assembler "@" ones.  The "#!" line
# is skipped and copying stops at the first line that is neither a comment
# nor empty.  This stays best-effort: if the script cannot be re-opened the
# header is simply omitted.  A lexical handle and 3-arg open are used so the
# path in $0 is never parsed for mode characters.
if (open my $self, '<', $0) {
    while (<$self>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
    }
    close $self;
}
707
{   # Base encodings of the SHA256 crypto-extension instructions; the
    # register fields are OR-ed in by unsha256().
    my %opcode = (
        "sha256h"   => 0xf3000c40,  "sha256h2"  => 0xf3100c40,
        "sha256su0" => 0xf3ba03c0,  "sha256su1" => 0xf3200c40  );

    # unsha256($mnemonic,$arg)
    #
    # Turns a SHA256 instruction ($mnemonic plus an operand string of the
    # form "qD,qN,qM" or "qD,qM", optional whitespace after the commas)
    # into an INST(b0,b1,b2,b3) macro call carrying the hand-assembled
    # opcode bytes, least-significant byte first, followed by the original
    # text as an assembler comment.  If the mnemonic is unknown or the
    # operands do not parse, the instruction text is returned unchanged
    # instead of being replaced by nothing.
    sub unsha256 {
        my ($mnemonic,$arg)=@_;

        if (exists $opcode{$mnemonic} &&
            $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
            # The middle register is absent in the two-operand form.
            my ($qd,$qn,$qm) = ($1, defined($2) ? $2 : 0, $3);
            my $word = $opcode{$mnemonic}|(($qd&7)<<13)|(($qd&8)<<19)
                                         |(($qn&7)<<17)|(($qn&8)<<4)
                                         |(($qm&7)<<1) |(($qm&8)<<2);
            # since ARMv7 instructions are always encoded little-endian.
            # correct solution is to use .inst directive, but older
            # assemblers don't implement it:-(
            return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }

        return "$mnemonic\t$arg";
    }
}
729
# Post-process the accumulated $code one line at a time and print it.
730	foreach (split($/,$code)) {
731
# Replace every `...` span with the result of evaluating it as Perl.
732	s/\`([^\`]*)\`/eval $1/geo;
733
# Hand-encode the sha256* instructions that take q-register operands.
734	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
735
# "ret" becomes "bx lr"; on lines without "ret", a literal "bx lr" becomes
# the raw opcode word instead.
736	s/\bret\b/bx lr/go or
737	s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
738
739	print $_,"\n";
740	}
741
742	close STDOUT or die "error closing STDOUT: $!"; # enforce flush

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-3.1.2/crypto/sha/asm/sha256-armv4.pl@ 101021

Download in other formats: