#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+32 bytes of shared
# table]. There is no experimental performance data available yet.
# The only approximation that can be made at this point is based on
# code size. The inner loop is 32 instructions long and should execute
# in <40 cycles on a single-issue core. Having verified that gcc 3.4
# did not unroll the corresponding loop, this assembler loop body was
# found to be ~3x smaller than the compiler-generated one...
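#
# For reference, the per-nibble recurrence the code below implements
# can be sketched in C-like pseudocode as follows (an illustration
# only; gcm128.c carries the actual reference implementation):
#
#	rem = Z & 0xf;				/* nibble shifted out     */
#	Z   = (Z >> 4) ^ Htable[nibble];	/* 128-bit shift-and-xor  */
#	Z  ^= (u128)rem_4bit[rem] << 112;	/* fold rem back in       */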
#
# July 2010
#
# Rescheduling for the dual-issue pipeline resulted in an 8.5%
# improvement on the Cortex A8 core and ~25 cycles per processed byte
# (which was observed to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in a 7%
# improvement on the Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add a NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than the integer-only code.
#
# April 2014
#
# Switch to the multiplication algorithm suggested in the paper referred
# to below and combine it with the reduction algorithm from the x86
# module. The performance improvement over the previous version varies
# from 65% on Snapdragon S4 to 110% on Cortex A9. In absolute terms
# Cortex A8 processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in
# 7.63, Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

# ====================================================================
# Note about the "528B" variant. In the ARM case it makes less sense
# to implement it, for the following reasons:
#
# - the performance improvement won't be anywhere near 50%, because the
#   128-bit shift operation is neatly fused with the 128-bit xor here,
#   and the "528B" variant would eliminate only 4-5 instructions out of
#   32 in the inner loop (meaning that the estimated improvement is
#   ~15%);
# - ARM-based systems are often embedded ones and the extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain a specific *dword* order in Htable,
# namely with the *least* significant dword of the 128-bit value at the
# *lower* address. This differs completely from the C code and has
# everything to do with the ldm instruction and the order in which
# dwords are "consumed" by the algorithm. *Byte* order within these
# dwords is in turn whatever the *native* byte order on the current
# platform is. See gcm128.c for a working example...
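#
# For a little-endian caller this amounts to swapping the two dwords of
# each Htable entry before handing it over, along the lines of what
# gcm_init_4bit does in gcm128.c (a sketch, not verbatim):
#
#	for (j = 0; j < 16; ++j) {
#		V = Htable[j];
#		Htable[j].hi = V.lo;	/* least significant dword ...  */
#		Htable[j].lo = V.hi;	/* ... now at the lower address */
#	}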
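# Typical invocation (an illustration; the flavour is any one understood
# by arm-xlate.pl, or "void" for raw, untranslated output):
#
#	perl ghash-armv4.pl linux32 ghash-armv4.S
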
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$Xi="r0";	# argument block
$Htbl="r1";
$inp="r2";
$len="r3";

$Zll="r4";	# variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter

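# gcm_gmult_4bit takes only two arguments, so the remaining two
# argument registers are recycled there: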
$rem_4bit=$inp;	# used in gcm_gmult_4bit
$cnt=$len;
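# Zsmash emits code that stores the 128-bit accumulator held in
# $Zll-$Zhh back to Xi in big-endian byte order: rev+str on
# little-endian ARMv7, plain str on big-endian, and byte-wise strb
# otherwise. Instructions passed in @args are interleaved between the
# four stores to fill otherwise idle issue slots.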
sub Zsmash() {
  my $i=12;
  my @args=@_;
  for ($Zll,$Zlh,$Zhl,$Zhh) {
    $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$_,$_
	str	$_,[$Xi,#$i]
#elif defined(__ARMEB__)
	str	$_,[$Xi,#$i]
#else
	mov	$Tlh,$_,lsr#8
	strb	$_,[$Xi,#$i+3]
	mov	$Thl,$_,lsr#16
	strb	$Tlh,[$Xi,#$i+2]
	mov	$Thh,$_,lsr#24
	strb	$Thl,[$Xi,#$i+1]
	strb	$Thh,[$Xi,#$i]
#endif
___
    $code.="\t".shift(@args)."\n";
    $i-=4;
  }
}

$code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit
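@ rem_4bit[i] is the carry-less product i*0x1C20: the constant that
@ folds the four bits shifted out of Z back in modulo the GHASH
@ polynomial once placed in the top 16 bits of Z (see gcm128.c).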

.type	rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr	$rem_4bit,rem_4bit
#else
	sub	$rem_4bit,pc,#8+32	@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop
	nop
.size	rem_4bit_get,.-rem_4bit_get
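@ void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len);
@ (C prototype as declared in gcm128.c; len is assumed to be a
@ multiple of the 16-byte block size)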
.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	$len,$inp,$len		@ $len to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save $len/end too

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	$nlo,[$inp,#15]
	ldrb	$nhi,[$Xi,#15]
.Louter:
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	add	$Thh,$Htbl,$nhi
	ldrb	$nlo,[$inp,#14]

	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	ldrb	$nhi,[$Xi,#14]
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16

.Linner:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	$nlo,[$inp,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	$Tll,[$Xi,$cnt]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tlh,[sp,$nhi]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	$nlo,$nlo,$Tll
	eor	$Zhh,$Thh,$Zhh,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	$len,[sp,#32]		@ re-load $len/end
	add	$inp,$inp,#16
	mov	$nhi,$Zll
___
&Zsmash("cmp\t$inp,$len","\n".
	"#ifdef __thumb2__\n".
	"	it	ne\n".
	"#endif\n".
	"	ldrneb	$nlo,[$inp,#15]");
$code.=<<___;
	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
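@ void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
@ (C prototype as declared in gcm128.c)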
.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	$nlo,[$Xi,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	ldrb	$nlo,[$Xi,#14]

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	and	$nhi,$nlo,#0xf0
	eor	$Zhh,$Zhh,$Tll,lsl#16
	and	$nlo,$nlo,#0x0f

.Loop:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	$nlo,[$Xi,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
___
&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));

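# clmul64x64 emits a 64x64->128-bit carry-less multiplication composed
# from the 8x8->16-bit vmull.p8 instruction, following the Câmara et
# al. paper cited above: products of byte-rotated copies (A1..A4 and
# B1..B4) are summed pairwise into L, M, N and K, masked, shifted into
# position and xored into the plain product D = A*B.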
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
	vext.8		$t0#lo, $a, $a, #1	@ A1
	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
	vext.8		$r#lo, $b, $b, #1	@ B1
	vmull.p8	$r, $a, $r#lo		@ E = A*B1
	vext.8		$t1#lo, $a, $a, #2	@ A2
	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
	vext.8		$t3#lo, $b, $b, #2	@ B2
	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
	vext.8		$t2#lo, $a, $a, #3	@ A3
	veor		$t0, $t0, $r		@ L = E + F
	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
	vext.8		$r#lo, $b, $b, #3	@ B3
	veor		$t1, $t1, $t3		@ M = G + H
	vmull.p8	$r, $a, $r#lo		@ I = A*B3
	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
	vand		$t0#hi, $t0#hi, $k48
	vext.8		$t3#lo, $b, $b, #4	@ B4
	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
	vand		$t1#hi, $t1#hi, $k32
	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
	veor		$t2, $t2, $r		@ N = I + J
	veor		$t0#lo, $t0#lo, $t0#hi
	veor		$t1#lo, $t1#lo, $t1#hi
	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
	vand		$t2#hi, $t2#hi, $k16
	vext.8		$t0, $t0, $t0, #15
	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	$t3#hi, #0
	vext.8		$t1, $t1, $t1, #14
	veor		$t2#lo, $t2#lo, $t2#hi
	vmull.p8	$r, $a, $b		@ D = A*B
	vext.8		$t3, $t3, $t3, #12
	vext.8		$t2, $t2, $t2, #13
	veor		$t0, $t0, $t1
	veor		$t2, $t2, $t3
	veor		$r, $r, $t0
	veor		$r, $r, $t2
___
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

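@ void gcm_init_neon(u128 Htable[16], const u64 H[2]);
@ (C prototype along the lines of gcm128.c; only Htable[0] is filled
@ in, with the "twisted" H<<<1 the NEON code expects)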
.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64		$IN#hi,[r1]!		@ load H
	vmov.i8		$t0,#0xe1
	vld1.64		$IN#lo,[r1]
	vshl.i64	$t0#hi,#57
	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
	vdup.8		$t1,$IN#hi[7]
	vshr.u64	$Hlo,$IN#lo,#63
	vshr.s8		$t1,#7			@ broadcast carry bit
	vshl.i64	$IN,$IN,#1
	vand		$t0,$t0,$t1
	vorr		$IN#hi,$Hlo		@ H<<<=1
	veor		$IN,$IN,$t0		@ twisted H
	vstmia		r0,{$IN}

	ret					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

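@ void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
@ (C prototype along the lines of gcm128.c)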
.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64		$IN#hi,[$Xi]!		@ load Xi
	vld1.64		$IN#lo,[$Xi]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
	mov		$len,#16
	b		.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

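@ void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len);
@ (C prototype along the lines of gcm128.c; len is assumed to be a
@ multiple of 16)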
.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		$Xl#hi,[$Xi]!		@ load Xi
	vld1.64		$Xl#lo,[$Xi]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64		$IN#hi,[$inp]!		@ load inp
	vld1.64		$IN#lo,[$inp]!
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$IN,$Xl			@ inp^=Xi
.Lgmult_neon:
___
&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo·Xi.lo
$code.=<<___;
	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
___
&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi·Xi.hi
$code.=<<___;
	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
	veor		$Xm,$Xm,$Xh
	veor		$Xl#hi,$Xl#hi,$Xm#lo
	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
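	@ the 1st phase below shifts the low half left by 57, 62 and 63
	@ and folds it in; the 2nd phase shifts right by 1, 2 and 7;
	@ together they reduce the 256-bit product modulo the
	@ bit-reflected GHASH polynomial x^128+x^127+x^126+x^121+1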
	vshl.i64	$t1,$Xl,#57		@ 1st phase
	vshl.i64	$t2,$Xl,#62
	veor		$t2,$t2,$t1		@
	vshl.i64	$t1,$Xl,#63
	veor		$t2, $t2, $t1		@
	veor		$Xl#hi,$Xl#hi,$t2#lo	@
	veor		$Xh#lo,$Xh#lo,$t2#hi

	vshr.u64	$t2,$Xl,#1		@ 2nd phase
	veor		$Xh,$Xh,$Xl
	veor		$Xl,$Xl,$t2		@
	vshr.u64	$t2,$t2,#6
	vshr.u64	$Xl,$Xl,#1		@
	veor		$Xl,$Xl,$Xh		@
	veor		$Xl,$Xl,$t2		@

	subs		$len,#16
	bne		.Loop_neon

#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	sub	$Xi,#16
	vst1.64		$Xl#hi,[$Xi]!		@ write out Xi
	vst1.64		$Xl#lo,[$Xi]

	ret					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

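# Post-process the accumulated code: evaluate `...` expressions,
# rewrite qN#lo/qN#hi into the corresponding d registers, lower "ret"
# to "bx lr", and encode literal "bx lr" as a raw word so that the
# result still assembles with -march=armv4.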
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush