VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/poly1305/asm/poly1305-armv8.pl@ 69881

Last change on this file since 69881 was 69881, checked in by vboxsync, 7 years ago

Update OpenSSL to 1.1.0g.
bugref:8070: src/libs maintenance

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
File size: 21.0 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements Poly1305 hash for ARMv8.
18#
19# June 2015
20#
21# Numbers are cycles per processed byte with poly1305_blocks alone.
22#
23# IALU/gcc-4.9 NEON
24#
25# Apple A7 1.86/+5% 0.72
26# Cortex-A53 2.69/+58% 1.47
27# Cortex-A57 2.70/+7% 1.14
28# Denver 1.64/+50% 1.18(*)
29# X-Gene 2.13/+68% 2.27
30# Mongoose 1.77/+75% 1.12
31#
32# (*) estimate based on resources availability is less than 1.0,
33# i.e. measured result is worse than expected, presumably binary
34# translator is not almighty;
35
36$flavour=shift;
37$output=shift;
38
39$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
41( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
42die "can't locate arm-xlate.pl";
43
44open OUT,"| \"$^X\" $xlate $flavour $output";
45*STDOUT=*OUT;
46
47my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
48my ($mac,$nonce)=($inp,$len);
49
50my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
51
52$code.=<<___;
53#include "arm_arch.h"
54
55.text
56
57// forward "declarations" are required for Apple
58.extern OPENSSL_armcap_P
59.globl poly1305_blocks
60.globl poly1305_emit
61
62.globl poly1305_init
63.type poly1305_init,%function
64.align 5
65poly1305_init:
66 cmp $inp,xzr
67 stp xzr,xzr,[$ctx] // zero hash value
68 stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
69
70 csel x0,xzr,x0,eq
71 b.eq .Lno_key
72
73#ifdef __ILP32__
74 ldrsw $t1,.LOPENSSL_armcap_P
75#else
76 ldr $t1,.LOPENSSL_armcap_P
77#endif
78 adr $t0,.LOPENSSL_armcap_P
79
80 ldp $r0,$r1,[$inp] // load key
81 mov $s1,#0xfffffffc0fffffff
82 movk $s1,#0x0fff,lsl#48
83 ldr w17,[$t0,$t1]
84#ifdef __ARMEB__
85 rev $r0,$r0 // flip bytes
86 rev $r1,$r1
87#endif
88 and $r0,$r0,$s1 // &=0ffffffc0fffffff
89 and $s1,$s1,#-4
90 and $r1,$r1,$s1 // &=0ffffffc0ffffffc
91 stp $r0,$r1,[$ctx,#32] // save key value
92
93 tst w17,#ARMV7_NEON
94
95 adr $d0,poly1305_blocks
96 adr $r0,poly1305_blocks_neon
97 adr $d1,poly1305_emit
98 adr $r1,poly1305_emit_neon
99
100 csel $d0,$d0,$r0,eq
101 csel $d1,$d1,$r1,eq
102
103#ifdef __ILP32__
104 stp w12,w13,[$len]
105#else
106 stp $d0,$d1,[$len]
107#endif
108
109 mov x0,#1
110.Lno_key:
111 ret
112.size poly1305_init,.-poly1305_init
113
114.type poly1305_blocks,%function
115.align 5
116poly1305_blocks:
117 ands $len,$len,#-16
118 b.eq .Lno_data
119
120 ldp $h0,$h1,[$ctx] // load hash value
121 ldp $r0,$r1,[$ctx,#32] // load key value
122 ldr $h2,[$ctx,#16]
123 add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
124 b .Loop
125
126.align 5
127.Loop:
128 ldp $t0,$t1,[$inp],#16 // load input
129 sub $len,$len,#16
130#ifdef __ARMEB__
131 rev $t0,$t0
132 rev $t1,$t1
133#endif
134 adds $h0,$h0,$t0 // accumulate input
135 adcs $h1,$h1,$t1
136
137 mul $d0,$h0,$r0 // h0*r0
138 adc $h2,$h2,$padbit
139 umulh $d1,$h0,$r0
140
141 mul $t0,$h1,$s1 // h1*5*r1
142 umulh $t1,$h1,$s1
143
144 adds $d0,$d0,$t0
145 mul $t0,$h0,$r1 // h0*r1
146 adc $d1,$d1,$t1
147 umulh $d2,$h0,$r1
148
149 adds $d1,$d1,$t0
150 mul $t0,$h1,$r0 // h1*r0
151 adc $d2,$d2,xzr
152 umulh $t1,$h1,$r0
153
154 adds $d1,$d1,$t0
155 mul $t0,$h2,$s1 // h2*5*r1
156 adc $d2,$d2,$t1
157 mul $t1,$h2,$r0 // h2*r0
158
159 adds $d1,$d1,$t0
160 adc $d2,$d2,$t1
161
162 and $t0,$d2,#-4 // final reduction
163 and $h2,$d2,#3
164 add $t0,$t0,$d2,lsr#2
165 adds $h0,$d0,$t0
166 adcs $h1,$d1,xzr
167 adc $h2,$h2,xzr
168
169 cbnz $len,.Loop
170
171 stp $h0,$h1,[$ctx] // store hash value
172 str $h2,[$ctx,#16]
173
174.Lno_data:
175 ret
176.size poly1305_blocks,.-poly1305_blocks
177
178.type poly1305_emit,%function
179.align 5
180poly1305_emit:
181 ldp $h0,$h1,[$ctx] // load hash base 2^64
182 ldr $h2,[$ctx,#16]
183 ldp $t0,$t1,[$nonce] // load nonce
184
185 adds $d0,$h0,#5 // compare to modulus
186 adcs $d1,$h1,xzr
187 adc $d2,$h2,xzr
188
189 tst $d2,#-4 // see if it's carried/borrowed
190
191 csel $h0,$h0,$d0,eq
192 csel $h1,$h1,$d1,eq
193
194#ifdef __ARMEB__
195 ror $t0,$t0,#32 // flip nonce words
196 ror $t1,$t1,#32
197#endif
198 adds $h0,$h0,$t0 // accumulate nonce
199 adc $h1,$h1,$t1
200#ifdef __ARMEB__
201 rev $h0,$h0 // flip output bytes
202 rev $h1,$h1
203#endif
204 stp $h0,$h1,[$mac] // write result
205
206 ret
207.size poly1305_emit,.-poly1305_emit
208___
209my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
210my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
211my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
212my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
213my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
214my ($T0,$T1,$MASK) = map("v$_",(29..31));
215
216my ($in2,$zeros)=("x16","x17");
217my $is_base2_26 = $zeros; # borrow
218
219$code.=<<___;
220.type poly1305_mult,%function
221.align 5
222poly1305_mult:
223 mul $d0,$h0,$r0 // h0*r0
224 umulh $d1,$h0,$r0
225
226 mul $t0,$h1,$s1 // h1*5*r1
227 umulh $t1,$h1,$s1
228
229 adds $d0,$d0,$t0
230 mul $t0,$h0,$r1 // h0*r1
231 adc $d1,$d1,$t1
232 umulh $d2,$h0,$r1
233
234 adds $d1,$d1,$t0
235 mul $t0,$h1,$r0 // h1*r0
236 adc $d2,$d2,xzr
237 umulh $t1,$h1,$r0
238
239 adds $d1,$d1,$t0
240 mul $t0,$h2,$s1 // h2*5*r1
241 adc $d2,$d2,$t1
242 mul $t1,$h2,$r0 // h2*r0
243
244 adds $d1,$d1,$t0
245 adc $d2,$d2,$t1
246
247 and $t0,$d2,#-4 // final reduction
248 and $h2,$d2,#3
249 add $t0,$t0,$d2,lsr#2
250 adds $h0,$d0,$t0
251 adcs $h1,$d1,xzr
252 adc $h2,$h2,xzr
253
254 ret
255.size poly1305_mult,.-poly1305_mult
256
257.type poly1305_splat,%function
258.align 5
259poly1305_splat:
260 and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
261 ubfx x13,$h0,#26,#26
262 extr x14,$h1,$h0,#52
263 and x14,x14,#0x03ffffff
264 ubfx x15,$h1,#14,#26
265 extr x16,$h2,$h1,#40
266
267 str w12,[$ctx,#16*0] // r0
268 add w12,w13,w13,lsl#2 // r1*5
269 str w13,[$ctx,#16*1] // r1
270 add w13,w14,w14,lsl#2 // r2*5
271 str w12,[$ctx,#16*2] // s1
272 str w14,[$ctx,#16*3] // r2
273 add w14,w15,w15,lsl#2 // r3*5
274 str w13,[$ctx,#16*4] // s2
275 str w15,[$ctx,#16*5] // r3
276 add w15,w16,w16,lsl#2 // r4*5
277 str w14,[$ctx,#16*6] // s3
278 str w16,[$ctx,#16*7] // r4
279 str w15,[$ctx,#16*8] // s4
280
281 ret
282.size poly1305_splat,.-poly1305_splat
283
284.type poly1305_blocks_neon,%function
285.align 5
286poly1305_blocks_neon:
287 ldr $is_base2_26,[$ctx,#24]
288 cmp $len,#128
289 b.hs .Lblocks_neon
290 cbz $is_base2_26,poly1305_blocks
291
292.Lblocks_neon:
293 stp x29,x30,[sp,#-80]!
294 add x29,sp,#0
295
296 ands $len,$len,#-16
297 b.eq .Lno_data_neon
298
299 cbz $is_base2_26,.Lbase2_64_neon
300
301 ldp w10,w11,[$ctx] // load hash value base 2^26
302 ldp w12,w13,[$ctx,#8]
303 ldr w14,[$ctx,#16]
304
305 tst $len,#31
306 b.eq .Leven_neon
307
308 ldp $r0,$r1,[$ctx,#32] // load key value
309
310 add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
311 lsr $h1,x12,#12
312 adds $h0,$h0,x12,lsl#52
313 add $h1,$h1,x13,lsl#14
314 adc $h1,$h1,xzr
315 lsr $h2,x14,#24
316 adds $h1,$h1,x14,lsl#40
317 adc $d2,$h2,xzr // can be partially reduced...
318
319 ldp $d0,$d1,[$inp],#16 // load input
320 sub $len,$len,#16
321 add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
322
323 and $t0,$d2,#-4 // ... so reduce
324 and $h2,$d2,#3
325 add $t0,$t0,$d2,lsr#2
326 adds $h0,$h0,$t0
327 adcs $h1,$h1,xzr
328 adc $h2,$h2,xzr
329
330#ifdef __ARMEB__
331 rev $d0,$d0
332 rev $d1,$d1
333#endif
334 adds $h0,$h0,$d0 // accumulate input
335 adcs $h1,$h1,$d1
336 adc $h2,$h2,$padbit
337
338 bl poly1305_mult
339 ldr x30,[sp,#8]
340
341 cbz $padbit,.Lstore_base2_64_neon
342
343 and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
344 ubfx x11,$h0,#26,#26
345 extr x12,$h1,$h0,#52
346 and x12,x12,#0x03ffffff
347 ubfx x13,$h1,#14,#26
348 extr x14,$h2,$h1,#40
349
350 cbnz $len,.Leven_neon
351
352 stp w10,w11,[$ctx] // store hash value base 2^26
353 stp w12,w13,[$ctx,#8]
354 str w14,[$ctx,#16]
355 b .Lno_data_neon
356
357.align 4
358.Lstore_base2_64_neon:
359 stp $h0,$h1,[$ctx] // store hash value base 2^64
360 stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
361 b .Lno_data_neon
362
363.align 4
364.Lbase2_64_neon:
365 ldp $r0,$r1,[$ctx,#32] // load key value
366
367 ldp $h0,$h1,[$ctx] // load hash value base 2^64
368 ldr $h2,[$ctx,#16]
369
370 tst $len,#31
371 b.eq .Linit_neon
372
373 ldp $d0,$d1,[$inp],#16 // load input
374 sub $len,$len,#16
375 add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
376#ifdef __ARMEB__
377 rev $d0,$d0
378 rev $d1,$d1
379#endif
380 adds $h0,$h0,$d0 // accumulate input
381 adcs $h1,$h1,$d1
382 adc $h2,$h2,$padbit
383
384 bl poly1305_mult
385
386.Linit_neon:
387 and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
388 ubfx x11,$h0,#26,#26
389 extr x12,$h1,$h0,#52
390 and x12,x12,#0x03ffffff
391 ubfx x13,$h1,#14,#26
392 extr x14,$h2,$h1,#40
393
394 stp d8,d9,[sp,#16] // meet ABI requirements
395 stp d10,d11,[sp,#32]
396 stp d12,d13,[sp,#48]
397 stp d14,d15,[sp,#64]
398
399 fmov ${H0},x10
400 fmov ${H1},x11
401 fmov ${H2},x12
402 fmov ${H3},x13
403 fmov ${H4},x14
404
405 ////////////////////////////////// initialize r^n table
406 mov $h0,$r0 // r^1
407 add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
408 mov $h1,$r1
409 mov $h2,xzr
410 add $ctx,$ctx,#48+12
411 bl poly1305_splat
412
413 bl poly1305_mult // r^2
414 sub $ctx,$ctx,#4
415 bl poly1305_splat
416
417 bl poly1305_mult // r^3
418 sub $ctx,$ctx,#4
419 bl poly1305_splat
420
421 bl poly1305_mult // r^4
422 sub $ctx,$ctx,#4
423 bl poly1305_splat
424 ldr x30,[sp,#8]
425
426 add $in2,$inp,#32
427 adr $zeros,.Lzeros
428 subs $len,$len,#64
429 csel $in2,$zeros,$in2,lo
430
431 mov x4,#1
432 str x4,[$ctx,#-24] // set is_base2_26
433 sub $ctx,$ctx,#48 // restore original $ctx
434 b .Ldo_neon
435
436.align 4
437.Leven_neon:
438 add $in2,$inp,#32
439 adr $zeros,.Lzeros
440 subs $len,$len,#64
441 csel $in2,$zeros,$in2,lo
442
443 stp d8,d9,[sp,#16] // meet ABI requirements
444 stp d10,d11,[sp,#32]
445 stp d12,d13,[sp,#48]
446 stp d14,d15,[sp,#64]
447
448 fmov ${H0},x10
449 fmov ${H1},x11
450 fmov ${H2},x12
451 fmov ${H3},x13
452 fmov ${H4},x14
453
454.Ldo_neon:
455 ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
456 ldp x9,x13,[$in2],#48
457
458 lsl $padbit,$padbit,#24
459 add x15,$ctx,#48
460
461#ifdef __ARMEB__
462 rev x8,x8
463 rev x12,x12
464 rev x9,x9
465 rev x13,x13
466#endif
467 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
468 and x5,x9,#0x03ffffff
469 ubfx x6,x8,#26,#26
470 ubfx x7,x9,#26,#26
471 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
472 extr x8,x12,x8,#52
473 extr x9,x13,x9,#52
474 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
475 fmov $IN23_0,x4
476 and x8,x8,#0x03ffffff
477 and x9,x9,#0x03ffffff
478 ubfx x10,x12,#14,#26
479 ubfx x11,x13,#14,#26
480 add x12,$padbit,x12,lsr#40
481 add x13,$padbit,x13,lsr#40
482 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
483 fmov $IN23_1,x6
484 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
485 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
486 fmov $IN23_2,x8
487 fmov $IN23_3,x10
488 fmov $IN23_4,x12
489
490 ldp x8,x12,[$inp],#16 // inp[0:1]
491 ldp x9,x13,[$inp],#48
492
493 ld1 {$R0,$R1,$S1,$R2},[x15],#64
494 ld1 {$S2,$R3,$S3,$R4},[x15],#64
495 ld1 {$S4},[x15]
496
497#ifdef __ARMEB__
498 rev x8,x8
499 rev x12,x12
500 rev x9,x9
501 rev x13,x13
502#endif
503 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
504 and x5,x9,#0x03ffffff
505 ubfx x6,x8,#26,#26
506 ubfx x7,x9,#26,#26
507 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
508 extr x8,x12,x8,#52
509 extr x9,x13,x9,#52
510 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
511 fmov $IN01_0,x4
512 and x8,x8,#0x03ffffff
513 and x9,x9,#0x03ffffff
514 ubfx x10,x12,#14,#26
515 ubfx x11,x13,#14,#26
516 add x12,$padbit,x12,lsr#40
517 add x13,$padbit,x13,lsr#40
518 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
519 fmov $IN01_1,x6
520 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
521 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
522 movi $MASK.2d,#-1
523 fmov $IN01_2,x8
524 fmov $IN01_3,x10
525 fmov $IN01_4,x12
526 ushr $MASK.2d,$MASK.2d,#38
527
528 b.ls .Lskip_loop
529
530.align 4
531.Loop_neon:
532 ////////////////////////////////////////////////////////////////
533 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
534 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
535 // \___________________/
536 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
537 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
538 // \___________________/ \____________________/
539 //
540 // Note that we start with inp[2:3]*r^2. This is because it
541 // doesn't depend on reduction in previous iteration.
542 ////////////////////////////////////////////////////////////////
543 // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
544 // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
545 // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
546 // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
547 // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
548
549 subs $len,$len,#64
550 umull $ACC4,$IN23_0,${R4}[2]
551 csel $in2,$zeros,$in2,lo
552 umull $ACC3,$IN23_0,${R3}[2]
553 umull $ACC2,$IN23_0,${R2}[2]
554 ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
555 umull $ACC1,$IN23_0,${R1}[2]
556 ldp x9,x13,[$in2],#48
557 umull $ACC0,$IN23_0,${R0}[2]
558#ifdef __ARMEB__
559 rev x8,x8
560 rev x12,x12
561 rev x9,x9
562 rev x13,x13
563#endif
564
565 umlal $ACC4,$IN23_1,${R3}[2]
566 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
567 umlal $ACC3,$IN23_1,${R2}[2]
568 and x5,x9,#0x03ffffff
569 umlal $ACC2,$IN23_1,${R1}[2]
570 ubfx x6,x8,#26,#26
571 umlal $ACC1,$IN23_1,${R0}[2]
572 ubfx x7,x9,#26,#26
573 umlal $ACC0,$IN23_1,${S4}[2]
574 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
575
576 umlal $ACC4,$IN23_2,${R2}[2]
577 extr x8,x12,x8,#52
578 umlal $ACC3,$IN23_2,${R1}[2]
579 extr x9,x13,x9,#52
580 umlal $ACC2,$IN23_2,${R0}[2]
581 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
582 umlal $ACC1,$IN23_2,${S4}[2]
583 fmov $IN23_0,x4
584 umlal $ACC0,$IN23_2,${S3}[2]
585 and x8,x8,#0x03ffffff
586
587 umlal $ACC4,$IN23_3,${R1}[2]
588 and x9,x9,#0x03ffffff
589 umlal $ACC3,$IN23_3,${R0}[2]
590 ubfx x10,x12,#14,#26
591 umlal $ACC2,$IN23_3,${S4}[2]
592 ubfx x11,x13,#14,#26
593 umlal $ACC1,$IN23_3,${S3}[2]
594 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
595 umlal $ACC0,$IN23_3,${S2}[2]
596 fmov $IN23_1,x6
597
598 add $IN01_2,$IN01_2,$H2
599 add x12,$padbit,x12,lsr#40
600 umlal $ACC4,$IN23_4,${R0}[2]
601 add x13,$padbit,x13,lsr#40
602 umlal $ACC3,$IN23_4,${S4}[2]
603 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
604 umlal $ACC2,$IN23_4,${S3}[2]
605 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
606 umlal $ACC1,$IN23_4,${S2}[2]
607 fmov $IN23_2,x8
608 umlal $ACC0,$IN23_4,${S1}[2]
609 fmov $IN23_3,x10
610
611 ////////////////////////////////////////////////////////////////
612 // (hash+inp[0:1])*r^4 and accumulate
613
614 add $IN01_0,$IN01_0,$H0
615 fmov $IN23_4,x12
616 umlal $ACC3,$IN01_2,${R1}[0]
617 ldp x8,x12,[$inp],#16 // inp[0:1]
618 umlal $ACC0,$IN01_2,${S3}[0]
619 ldp x9,x13,[$inp],#48
620 umlal $ACC4,$IN01_2,${R2}[0]
621 umlal $ACC1,$IN01_2,${S4}[0]
622 umlal $ACC2,$IN01_2,${R0}[0]
623#ifdef __ARMEB__
624 rev x8,x8
625 rev x12,x12
626 rev x9,x9
627 rev x13,x13
628#endif
629
630 add $IN01_1,$IN01_1,$H1
631 umlal $ACC3,$IN01_0,${R3}[0]
632 umlal $ACC4,$IN01_0,${R4}[0]
633 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
634 umlal $ACC2,$IN01_0,${R2}[0]
635 and x5,x9,#0x03ffffff
636 umlal $ACC0,$IN01_0,${R0}[0]
637 ubfx x6,x8,#26,#26
638 umlal $ACC1,$IN01_0,${R1}[0]
639 ubfx x7,x9,#26,#26
640
641 add $IN01_3,$IN01_3,$H3
642 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
643 umlal $ACC3,$IN01_1,${R2}[0]
644 extr x8,x12,x8,#52
645 umlal $ACC4,$IN01_1,${R3}[0]
646 extr x9,x13,x9,#52
647 umlal $ACC0,$IN01_1,${S4}[0]
648 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
649 umlal $ACC2,$IN01_1,${R1}[0]
650 fmov $IN01_0,x4
651 umlal $ACC1,$IN01_1,${R0}[0]
652 and x8,x8,#0x03ffffff
653
654 add $IN01_4,$IN01_4,$H4
655 and x9,x9,#0x03ffffff
656 umlal $ACC3,$IN01_3,${R0}[0]
657 ubfx x10,x12,#14,#26
658 umlal $ACC0,$IN01_3,${S2}[0]
659 ubfx x11,x13,#14,#26
660 umlal $ACC4,$IN01_3,${R1}[0]
661 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
662 umlal $ACC1,$IN01_3,${S3}[0]
663 fmov $IN01_1,x6
664 umlal $ACC2,$IN01_3,${S4}[0]
665 add x12,$padbit,x12,lsr#40
666
667 umlal $ACC3,$IN01_4,${S4}[0]
668 add x13,$padbit,x13,lsr#40
669 umlal $ACC0,$IN01_4,${S1}[0]
670 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
671 umlal $ACC4,$IN01_4,${R0}[0]
672 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
673 umlal $ACC1,$IN01_4,${S2}[0]
674 fmov $IN01_2,x8
675 umlal $ACC2,$IN01_4,${S3}[0]
676 fmov $IN01_3,x10
677 fmov $IN01_4,x12
678
679 /////////////////////////////////////////////////////////////////
680 // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
681 // and P. Schwabe
682 //
683 // [see discussion in poly1305-armv4 module]
684
685 ushr $T0.2d,$ACC3,#26
686 xtn $H3,$ACC3
687 ushr $T1.2d,$ACC0,#26
688 and $ACC0,$ACC0,$MASK.2d
689 add $ACC4,$ACC4,$T0.2d // h3 -> h4
690 bic $H3,#0xfc,lsl#24 // &=0x03ffffff
691 add $ACC1,$ACC1,$T1.2d // h0 -> h1
692
693 ushr $T0.2d,$ACC4,#26
694 xtn $H4,$ACC4
695 ushr $T1.2d,$ACC1,#26
696 xtn $H1,$ACC1
697 bic $H4,#0xfc,lsl#24
698 add $ACC2,$ACC2,$T1.2d // h1 -> h2
699
700 add $ACC0,$ACC0,$T0.2d
701 shl $T0.2d,$T0.2d,#2
702 shrn $T1.2s,$ACC2,#26
703 xtn $H2,$ACC2
704 add $ACC0,$ACC0,$T0.2d // h4 -> h0
705 bic $H1,#0xfc,lsl#24
706 add $H3,$H3,$T1.2s // h2 -> h3
707 bic $H2,#0xfc,lsl#24
708
709 shrn $T0.2s,$ACC0,#26
710 xtn $H0,$ACC0
711 ushr $T1.2s,$H3,#26
712 bic $H3,#0xfc,lsl#24
713 bic $H0,#0xfc,lsl#24
714 add $H1,$H1,$T0.2s // h0 -> h1
715 add $H4,$H4,$T1.2s // h3 -> h4
716
717 b.hi .Loop_neon
718
719.Lskip_loop:
720 dup $IN23_2,${IN23_2}[0]
721 add $IN01_2,$IN01_2,$H2
722
723 ////////////////////////////////////////////////////////////////
724 // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
725
726 adds $len,$len,#32
727 b.ne .Long_tail
728
729 dup $IN23_2,${IN01_2}[0]
730 add $IN23_0,$IN01_0,$H0
731 add $IN23_3,$IN01_3,$H3
732 add $IN23_1,$IN01_1,$H1
733 add $IN23_4,$IN01_4,$H4
734
735.Long_tail:
736 dup $IN23_0,${IN23_0}[0]
737 umull2 $ACC0,$IN23_2,${S3}
738 umull2 $ACC3,$IN23_2,${R1}
739 umull2 $ACC4,$IN23_2,${R2}
740 umull2 $ACC2,$IN23_2,${R0}
741 umull2 $ACC1,$IN23_2,${S4}
742
743 dup $IN23_1,${IN23_1}[0]
744 umlal2 $ACC0,$IN23_0,${R0}
745 umlal2 $ACC2,$IN23_0,${R2}
746 umlal2 $ACC3,$IN23_0,${R3}
747 umlal2 $ACC4,$IN23_0,${R4}
748 umlal2 $ACC1,$IN23_0,${R1}
749
750 dup $IN23_3,${IN23_3}[0]
751 umlal2 $ACC0,$IN23_1,${S4}
752 umlal2 $ACC3,$IN23_1,${R2}
753 umlal2 $ACC2,$IN23_1,${R1}
754 umlal2 $ACC4,$IN23_1,${R3}
755 umlal2 $ACC1,$IN23_1,${R0}
756
757 dup $IN23_4,${IN23_4}[0]
758 umlal2 $ACC3,$IN23_3,${R0}
759 umlal2 $ACC4,$IN23_3,${R1}
760 umlal2 $ACC0,$IN23_3,${S2}
761 umlal2 $ACC1,$IN23_3,${S3}
762 umlal2 $ACC2,$IN23_3,${S4}
763
764 umlal2 $ACC3,$IN23_4,${S4}
765 umlal2 $ACC0,$IN23_4,${S1}
766 umlal2 $ACC4,$IN23_4,${R0}
767 umlal2 $ACC1,$IN23_4,${S2}
768 umlal2 $ACC2,$IN23_4,${S3}
769
770 b.eq .Lshort_tail
771
772 ////////////////////////////////////////////////////////////////
773 // (hash+inp[0:1])*r^4:r^3 and accumulate
774
775 add $IN01_0,$IN01_0,$H0
776 umlal $ACC3,$IN01_2,${R1}
777 umlal $ACC0,$IN01_2,${S3}
778 umlal $ACC4,$IN01_2,${R2}
779 umlal $ACC1,$IN01_2,${S4}
780 umlal $ACC2,$IN01_2,${R0}
781
782 add $IN01_1,$IN01_1,$H1
783 umlal $ACC3,$IN01_0,${R3}
784 umlal $ACC0,$IN01_0,${R0}
785 umlal $ACC4,$IN01_0,${R4}
786 umlal $ACC1,$IN01_0,${R1}
787 umlal $ACC2,$IN01_0,${R2}
788
789 add $IN01_3,$IN01_3,$H3
790 umlal $ACC3,$IN01_1,${R2}
791 umlal $ACC0,$IN01_1,${S4}
792 umlal $ACC4,$IN01_1,${R3}
793 umlal $ACC1,$IN01_1,${R0}
794 umlal $ACC2,$IN01_1,${R1}
795
796 add $IN01_4,$IN01_4,$H4
797 umlal $ACC3,$IN01_3,${R0}
798 umlal $ACC0,$IN01_3,${S2}
799 umlal $ACC4,$IN01_3,${R1}
800 umlal $ACC1,$IN01_3,${S3}
801 umlal $ACC2,$IN01_3,${S4}
802
803 umlal $ACC3,$IN01_4,${S4}
804 umlal $ACC0,$IN01_4,${S1}
805 umlal $ACC4,$IN01_4,${R0}
806 umlal $ACC1,$IN01_4,${S2}
807 umlal $ACC2,$IN01_4,${S3}
808
809.Lshort_tail:
810 ////////////////////////////////////////////////////////////////
811 // horizontal add
812
813 addp $ACC3,$ACC3,$ACC3
814 ldp d8,d9,[sp,#16] // meet ABI requirements
815 addp $ACC0,$ACC0,$ACC0
816 ldp d10,d11,[sp,#32]
817 addp $ACC4,$ACC4,$ACC4
818 ldp d12,d13,[sp,#48]
819 addp $ACC1,$ACC1,$ACC1
820 ldp d14,d15,[sp,#64]
821 addp $ACC2,$ACC2,$ACC2
822
823 ////////////////////////////////////////////////////////////////
824 // lazy reduction, but without narrowing
825
826 ushr $T0.2d,$ACC3,#26
827 and $ACC3,$ACC3,$MASK.2d
828 ushr $T1.2d,$ACC0,#26
829 and $ACC0,$ACC0,$MASK.2d
830
831 add $ACC4,$ACC4,$T0.2d // h3 -> h4
832 add $ACC1,$ACC1,$T1.2d // h0 -> h1
833
834 ushr $T0.2d,$ACC4,#26
835 and $ACC4,$ACC4,$MASK.2d
836 ushr $T1.2d,$ACC1,#26
837 and $ACC1,$ACC1,$MASK.2d
838 add $ACC2,$ACC2,$T1.2d // h1 -> h2
839
840 add $ACC0,$ACC0,$T0.2d
841 shl $T0.2d,$T0.2d,#2
842 ushr $T1.2d,$ACC2,#26
843 and $ACC2,$ACC2,$MASK.2d
844 add $ACC0,$ACC0,$T0.2d // h4 -> h0
845 add $ACC3,$ACC3,$T1.2d // h2 -> h3
846
847 ushr $T0.2d,$ACC0,#26
848 and $ACC0,$ACC0,$MASK.2d
849 ushr $T1.2d,$ACC3,#26
850 and $ACC3,$ACC3,$MASK.2d
851 add $ACC1,$ACC1,$T0.2d // h0 -> h1
852 add $ACC4,$ACC4,$T1.2d // h3 -> h4
853
854 ////////////////////////////////////////////////////////////////
855 // write the result, can be partially reduced
856
857 st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
858 st1 {$ACC4}[0],[$ctx]
859
860.Lno_data_neon:
861 ldr x29,[sp],#80
862 ret
863.size poly1305_blocks_neon,.-poly1305_blocks_neon
864
865.type poly1305_emit_neon,%function
866.align 5
867poly1305_emit_neon:
868 ldr $is_base2_26,[$ctx,#24]
869 cbz $is_base2_26,poly1305_emit
870
871 ldp w10,w11,[$ctx] // load hash value base 2^26
872 ldp w12,w13,[$ctx,#8]
873 ldr w14,[$ctx,#16]
874
875 add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
876 lsr $h1,x12,#12
877 adds $h0,$h0,x12,lsl#52
878 add $h1,$h1,x13,lsl#14
879 adc $h1,$h1,xzr
880 lsr $h2,x14,#24
881 adds $h1,$h1,x14,lsl#40
882 adc $h2,$h2,xzr // can be partially reduced...
883
884 ldp $t0,$t1,[$nonce] // load nonce
885
886 and $d0,$h2,#-4 // ... so reduce
887 add $d0,$d0,$h2,lsr#2
888 and $h2,$h2,#3
889 adds $h0,$h0,$d0
890 adcs $h1,$h1,xzr
891 adc $h2,$h2,xzr
892
893 adds $d0,$h0,#5 // compare to modulus
894 adcs $d1,$h1,xzr
895 adc $d2,$h2,xzr
896
897 tst $d2,#-4 // see if it's carried/borrowed
898
899 csel $h0,$h0,$d0,eq
900 csel $h1,$h1,$d1,eq
901
902#ifdef __ARMEB__
903 ror $t0,$t0,#32 // flip nonce words
904 ror $t1,$t1,#32
905#endif
906 adds $h0,$h0,$t0 // accumulate nonce
907 adc $h1,$h1,$t1
908#ifdef __ARMEB__
909 rev $h0,$h0 // flip output bytes
910 rev $h1,$h1
911#endif
912 stp $h0,$h1,[$mac] // write result
913
914 ret
915.size poly1305_emit_neon,.-poly1305_emit_neon
916
917.align 5
918.Lzeros:
919.long 0,0,0,0,0,0,0,0
920.LOPENSSL_armcap_P:
921#ifdef __ILP32__
922.long OPENSSL_armcap_P-.
923#else
924.quad OPENSSL_armcap_P-.
925#endif
926.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
927.align 2
928___
929
930foreach (split("\n",$code)) {
931 s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
932 s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
933 (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
934 (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
935 (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
936 (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
937 (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
938
939 s/\.[124]([sd])\[/.$1\[/;
940
941 print $_,"\n";
942}
943close STDOUT;
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette