VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.1k/crypto/chacha/asm/chacha-armv8.pl@ 90293

Last change on this file since 90293 was 90293, checked in by vboxsync, 4 years ago

openssl-1.1.1k: Applied and adjusted our OpenSSL changes to 1.1.1k. bugref:10072

  • Property svn:executable set to *
File size: 26.5 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# June 2015
18#
19# ChaCha20 for ARMv8.
20#
21# Performance in cycles per byte out of large buffer.
22#
23# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU
24#
25# Apple A7 5.50/+49% 3.33 1.70
26# Cortex-A53 8.40/+80% 4.72 4.72(*)
27# Cortex-A57 8.06/+43% 4.90 4.43(**)
28# Denver 4.50/+82% 2.63 2.67(*)
29# X-Gene 9.50/+46% 8.82 8.89(*)
30# Mongoose 8.00/+44% 3.64 3.25
31# Kryo 8.17/+50% 4.83 4.65
32#
33# (*) it's expected that doubling interleave factor doesn't help
34# all processors, only those with higher NEON latency and
35# higher instruction issue rate;
36# (**) expected improvement was actually higher;
37
# Command-line arguments: the target "flavour" (e.g. linux64, ios64) and an
# optional output file name, both forwarded to the arm-xlate.pl translator.
$flavour=shift;
$output=shift;

# Locate the arm-xlate.pl perlasm translator either next to this script or
# in the sibling ../../perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through the translator.  Check the open: an
# unchecked failure here would silently discard all generated assembly.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
48
# Thunk for [simplified] x86-style perlasm: any call to an otherwise
# undefined sub (e.g. &add_32(...)) lands here and is appended to $code
# as a single line of assembly.
sub AUTOLOAD()
{
    my $mnemonic = $AUTOLOAD;
    $mnemonic =~ s/.*:://;		# strip the package qualifier
    $mnemonic =~ s/_/\./;		# first '_' is a size suffix: add_32 -> add.32
    my $last = pop;
    # A purely numeric final operand becomes an immediate, "#n".
    $last = "#$last" if ($last*1 eq $last);
    $code .= "\t$mnemonic\t" . join(',', @_, $last) . "\n";
}
55
# Argument registers: ChaCha20_ctr32(out, inp, len, key, counter).
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

# @x - the sixteen 32-bit ChaCha state words, one per general-purpose
# register (x18 is skipped - reserved platform register on AArch64).
my @x=map("x$_",(5..17,19..21));
# @d - eight 64-bit registers holding the packed input block (sigma, key,
# counter), two 32-bit state words per register (x29 is the frame pointer).
my @d=map("x$_",(22..28,30));
60
# Emit four parallel ChaCha quarter-rounds over the scalar state in @x.
# Called as ROUND(0,4,8,12) for the "column" round and ROUND(0,5,10,15)
# for the "diagonal" round; the other three quarter-round index sets are
# derived by rotating each index within its group of four via
# (i&~3)+((i+1)&3).  Returns a list of "&op_32(...)" strings for the
# caller to eval (each is emitted through AUTOLOAD); the four
# quarter-rounds are interleaved instruction-by-instruction so that
# multi-issue pipelines stay busy.  Rotate-left by r is expressed as
# ror #(32-r).
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	# a += b; d ^= a; d <<<= 16 (as ror #16)
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	"&ror_32	(@x[$d1],@x[$d1],16)",
	"&ror_32	(@x[$d2],@x[$d2],16)",
	"&ror_32	(@x[$d3],@x[$d3],16)",

	# c += d; b ^= c; b <<<= 12 (as ror #20)
	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	"&ror_32	(@x[$b1],@x[$b1],20)",
	"&ror_32	(@x[$b2],@x[$b2],20)",
	"&ror_32	(@x[$b3],@x[$b3],20)",

	# a += b; d ^= a; d <<<= 8 (as ror #24)
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	"&ror_32	(@x[$d1],@x[$d1],24)",
	"&ror_32	(@x[$d2],@x[$d2],24)",
	"&ror_32	(@x[$d3],@x[$d3],24)",

	# c += d; b ^= c; b <<<= 7 (as ror #25)
	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	"&ror_32	(@x[$b1],@x[$b1],25)",
	"&ror_32	(@x[$b2],@x[$b2],25)",
	"&ror_32	(@x[$b3],@x[$b3],25)"
    );
}
121
# Constant pool (.Lsigma / .Lone / armcap pointer) plus the scalar,
# integer-only ChaCha20_ctr32 entry point.  Inputs of >= 192 bytes divert
# to ChaCha20_neon when OPENSSL_armcap_P advertises NEON.
$code.=<<___;
#include "arm_arch.h"

.text

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
#ifdef	__ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
# Ten iterations of .Loop, each performing one column round and one
# diagonal round (20 rounds total).
	foreach (&ROUND(0, 4, 8,12)) { eval; }	# column round
	foreach (&ROUND(0, 5,10,15)) { eval; }	# diagonal round
# Epilogue: feed-forward of the input block, XOR with plaintext, counter
# increment, and byte-at-a-time tail handling via a stack-staged block.
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___
342
{{{
# NEON register map for the 3-block path: three parallel (A,B,C,D) state
# quadruples in v0-v7 and v16-v23, with $T0-$T3 as scratch.
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
# @K - key block in v24-v30: sigma, key low, key high, and counter rows.
my (@K)=map("v$_.4s",(24..30));
# Increment vector (1,0,0,0), later shifted to (4,0,0,0).
my $ONE="v31.4s";
348
# Emit one NEON ChaCha quarter-round operating on whole 4x32-bit vectors,
# followed by the lane rotations (ext) that convert between column and
# diagonal arrangements.  $odd selects the diagonal (1) vs column (0)
# permutation direction; $t is a scratch vector.  Rotate-left by r is
# built from ushr #(32-r) + sli #r, except the 16-bit rotate which is a
# cheaper rev32 on .8h elements.  Returns a list of code strings to eval.
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$t','$d','$a')",
	"&ushr		('$d','$t',24)",
	"&sli		('$d','$t',8)",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}
378
# 3xNEON+1xIALU path (ChaCha20_neon): three vector blocks interleaved with
# one scalar block, i.e. 256 bytes of keystream per outer iteration.
# Inputs of >= 512 bytes branch to the 6xNEON ChaCha20_512_neon variant.
$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
# Column rounds: three NEON quarter-round streams interleaved 1:1 with the
# scalar round so the integer and vector pipes run concurrently.
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	# Diagonal rounds, same interleave.
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
# Epilogue: feed-forward all four blocks, XOR 256 bytes of input, bump the
# counter rows by 4, and fall into the tail logic for partial blocks.
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
# In the 512-byte path all of v0-v23 hold live state, so the scratch
# vectors $T0-$T5 reuse the @K register names (v24-v29); the key block is
# off-loaded to the stack instead.
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
# Six parallel (A,B,C,D) state quadruples in v0-v23.
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
701
# 6xNEON+2xIALU path (ChaCha20_512_neon): six vector blocks plus two
# scalar blocks per outer iteration (8 x 64 = 512 bytes of keystream).
# The key block lives on the stack ([sp,#0..#95]) because v24-v29 double
# as scratch here; d8-d15 are saved/restored to meet the AAPCS64 ABI.
$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	mov	$A0,@K[0]
	mov	$A1,@K[0]
	mov	$A2,@K[0]
	mov	$A3,@K[0]
	mov	$A4,@K[0]
	mov	$A5,@K[0]
	mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	mov	$C4,@K[2]
	stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	mov	$C5,@K[2]
	str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
# Upper half (5 of 10 double-rounds): six NEON quarter-round streams
# interleaved with the two scalar rounds held in @thread67.
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	# $diff and $i are vestigial bookkeeping, not used below.
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	# Diagonal rounds of the upper half, same interleave.
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
# Between the halves the finished scalar block is fed forward, XOR-ed and
# stored, then the scalar registers are re-loaded with a fresh key block
# (counter+1) while the vector threads continue into the lower half.
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
# Lower half (remaining 5 double-rounds), same six-way interleave.
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
# Epilogue: feed-forward all seven remaining blocks (1 scalar + 6 vector),
# XOR 448 bytes of input, advance counters by 8 total, and either loop,
# fall back to the 256-byte path, or finish.
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	add	$B5,$B5,@K[1]

#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
1129}
1130}}}
1131
# Post-process the accumulated perlasm: resolve `...` escapes, then map the
# pseudo mnemonics onto real AArch64 syntax, line by line, before piping
# the result to arm-xlate.pl (OUT is dup'ed onto STDOUT above).
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;		# resolve `expr` escapes

	# At most one of the following rewrites applies per line:
	# - "op.32"  -> "op", demoting x registers to their w halves
	#   (32-bit scalar operations);
	# - eor/ext/mov on vectors are bitwise, so .4s -> .16b;
	# - "ld1.8"/"st1.8" -> ld1/st1 with .4s -> .16b (byte-granular,
	#   endian-neutral vector loads/stores);
	# - ldr/str/ldp/stp of "vN.4s" use the qN register form;
	# - "rev32.16" -> rev32 on .8h lanes (the 16-bit rotate).
	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # flush
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette