#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.35/+130%		3.00
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with a bunch of ldrb loading data;
# (**)	these are trade-off results, they can be improved by ~8% but at
#	the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
#	to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;

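# Standard perlasm preamble: pick up the target "flavour" and the output
# file name from the command line, then pipe the generated code through
# arm-xlate.pl unless the flavour is "void".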
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
        die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

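# The first four AAPCS argument registers; named after the poly1305_blocks
# arguments (the other entry points reuse them for their own parameters).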
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4-r11}

	eor	r3,r3,r3
	cmp	$inp,#0
	str	r3,[$ctx,#0]		@ zero hash value
	str	r3,[$ctx,#4]
	str	r3,[$ctx,#8]
	str	r3,[$ctx,#12]
	str	r3,[$ctx,#16]
	str	r3,[$ctx,#36]		@ is_base2_26
	add	$ctx,$ctx,#20

#ifdef __thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if __ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[$inp,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[$inp,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[$inp,#2]
	ldrb	r7,[$inp,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[$inp,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[$inp,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[$inp,#6]
	and	r4,r4,r10

#if __ARM_MAX_ARCH__>=7
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# ifdef __APPLE__
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[$inp,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[$inp,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[$inp,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[$inp,#10]
	and	r5,r5,r3

#if __ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef __APPLE__
	adr	r9,poly1305_blocks_neon
	adr	r11,poly1305_blocks
# ifdef __thumb2__
	it	ne
# endif
	movne	r11,r9
	adr	r12,poly1305_emit
	adr	r10,poly1305_emit_neon
# ifdef __thumb2__
	it	ne
# endif
	movne	r12,r10
# else
# ifdef __thumb2__
	itete	eq
# endif
	addeq	r12,r11,#(poly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(poly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef __thumb2__
	orr	r12,r12,#1		@ thumb-ify address
	orr	r11,r11,#1
# endif
#endif
	ldrb	r9,[$inp,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[$inp,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[$inp,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[$inp,#14]
	and	r6,r6,r3

	ldrb	r10,[$inp,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[$ctx,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[$ctx,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[$ctx,#8]
	and	r7,r7,r3
	str	r7,[$ctx,#12]
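	@ the key is now clamped as Poly1305 requires:
	@ r[0] &= 0x0fffffff, r[1..3] &= 0x0ffffffc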
#if __ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);

$code.=<<___;
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3-r11,lr}

	ands	$len,$len,#-16
	beq	.Lno_data

	cmp	$padbit,#0
	add	$len,$len,$inp		@ end pointer
	sub	sp,sp,#32

	ldmia	$ctx,{$h0-$r3}		@ load context

	str	$ctx,[sp,#12]		@ offload stuff
	mov	lr,$inp
	str	$len,[sp,#16]
	str	$r1,[sp,#20]
	str	$r2,[sp,#24]
	str	$r3,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef __thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	$h0,$h0,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	$h1,$h1,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	$h2,$h2,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	$s1,$r1,$r1,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef __thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef __ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	$h0,$h0,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	$h1,$h1,r1
	add	$s1,$r1,$r1,lsr#2
	adcs	$h2,$h2,r2
#endif
	add	$s2,$r2,$r2,lsr#2
	adcs	$h3,$h3,r3
	add	$s3,$r3,$r3,lsr#2
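	@ s1,s2,s3 = r1+r1>>2, r2+r2>>2, r3+r3>>2, i.e. 5/4*r1..5/4*r3;
	@ exact because clamping makes r1-r3 multiples of 4, and used to
	@ fold limbs above 2^130 back in, since 2^130 = 5 mod 2^130-5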

	umull	r2,r3,$h1,$r0
	adc	$h4,$h4,#0
	umull	r0,r1,$h0,$r0
	umlal	r2,r3,$h4,$s1
	umlal	r0,r1,$h3,$s1
	ldr	$r1,[sp,#20]		@ reload $r1
	umlal	r2,r3,$h2,$s3
	umlal	r0,r1,$h1,$s3
	umlal	r2,r3,$h3,$s2
	umlal	r0,r1,$h2,$s2
	umlal	r2,r3,$h0,$r1
	str	r0,[sp,#0]		@ future $h0
	mul	r0,$s2,$h4
	ldr	$r2,[sp,#24]		@ reload $r2
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future $h2
	str	r2,[sp,#4]		@ future $h1

	mul	r2,$s3,$h4
	eor	r3,r3,r3
	umlal	r0,r1,$h3,$s3
	ldr	$r3,[sp,#28]		@ reload $r3
	umlal	r2,r3,$h3,$r0
	umlal	r0,r1,$h2,$r0
	umlal	r2,r3,$h2,$r1
	umlal	r0,r1,$h1,$r1
	umlal	r2,r3,$h1,$r2
	umlal	r0,r1,$h0,$r2
	umlal	r2,r3,$h0,$r3
	ldr	$h0,[sp,#0]
	mul	$h4,$r0,$h4
	ldr	$h1,[sp,#4]

	adds	$h2,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	$h3,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	$h4,$h4,r3		@ h4+=d3>>32

	and	r1,$h4,#-4
	and	$h4,$h4,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	$h0,$h0,r1
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	$ctx,[sp,#12]
	add	sp,sp,#32
	stmia	$ctx,{$h0-$h4}		@ store the result

.Lno_data:
#if __ARM_ARCH__>=5
	ldmia	sp!,{r3-r11,pc}
#else
	ldmia	sp!,{r3-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$h4;

$code.=<<___;
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	stmdb	sp!,{r4-r11}
.Lpoly1305_emit_enter:

	ldmia	$ctx,{$h0-$h4}
	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

#ifdef __thumb2__
	it	ne
#endif
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]
#else
	strb	$h0,[$mac,#0]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#4]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#8]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#12]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#1]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#5]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#9]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#13]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#2]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#6]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#10]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#14]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#3]
	strb	$h1,[$mac,#7]
	strb	$h2,[$mac,#11]
	strb	$h3,[$mac,#15]
#endif
	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[$ctx,#20]		@ load key base 2^32
	ldr	r5,[$ctx,#24]
	ldr	r6,[$ctx,#28]
	ldr	r7,[$ctx,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	$R0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	$R1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	$S1,r2
	vdup.32	$R2,r4
	add	r4,r5,r5,lsl#2
	vdup.32	$S2,r3
	vdup.32	$R3,r5
	add	r5,r6,r6,lsl#2
	vdup.32	$S3,r4
	vdup.32	$R4,r6
	vdup.32	$S4,r5

	mov	$zeros,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4

	vmull.u32	$D0,$R0,${R0}[1]
	vmull.u32	$D1,$R1,${R0}[1]
	vmull.u32	$D2,$R2,${R0}[1]
	vmull.u32	$D3,$R3,${R0}[1]
	vmull.u32	$D4,$R4,${R0}[1]

	vmlal.u32	$D0,$R4,${S1}[1]
	vmlal.u32	$D1,$R0,${R1}[1]
	vmlal.u32	$D2,$R1,${R1}[1]
	vmlal.u32	$D3,$R2,${R1}[1]
	vmlal.u32	$D4,$R3,${R1}[1]

	vmlal.u32	$D0,$R3,${S2}[1]
	vmlal.u32	$D1,$R4,${S2}[1]
	vmlal.u32	$D3,$R1,${R2}[1]
	vmlal.u32	$D2,$R0,${R2}[1]
	vmlal.u32	$D4,$R2,${R2}[1]

	vmlal.u32	$D0,$R2,${S3}[1]
	vmlal.u32	$D3,$R0,${R3}[1]
	vmlal.u32	$D1,$R3,${S3}[1]
	vmlal.u32	$D2,$R4,${S3}[1]
	vmlal.u32	$D4,$R1,${R3}[1]

	vmlal.u32	$D3,$R4,${S4}[1]
	vmlal.u32	$D0,$R1,${S4}[1]
	vmlal.u32	$D1,$R2,${S4}[1]
	vmlal.u32	$D2,$R3,${S4}[1]
	vmlal.u32	$D4,$R0,${R4}[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ Result of multiplication of an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
	@ an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
	@ one is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that *sum*
	@ of the products with these values can still be viewed as sum
	@ of 52-bit numbers as long as the amount of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at triple as many addends...
	@
	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
	@ This means that the result of reduction has to be compressed upon
	@ loop wrap-around. This can be done in the process of reduction
	@ to minimize amount of instructions [as well as amount of
	@ 128-bit instructions, which benefits low-end processors], but
	@ one has to watch for H2 (which is narrower than H0) and 5*H4
	@ not being wider than 58 bits, so that result of right shift
	@ by 26 bits fits in 32 bits. This is also useful on x86,
	@ because it allows using paddd in place of paddq, which
	@ benefits Atom, where paddq is ridiculously slow.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	vshr.u64	$T1,$D0,#26
	vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	vshr.u64	$T1,$D1,#26
	vmovn.i64	$D1#lo,$D1
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vbic.i32	$D4#lo,#0xfc000000
	vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	vshrn.u64	$T1#lo,$D2,#26
	vmovn.i64	$D2#lo,$D2
	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
	vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	vbic.i32	$D2#lo,#0xfc000000

	vshr.u32	$T0#lo,$D0#lo,#26
	vbic.i32	$D0#lo,#0xfc000000
	vshr.u32	$T1#lo,$D3#lo,#26
	vbic.i32	$D3#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4

	subs	$zeros,$zeros,#1
	beq	.Lsquare_break_neon

	add	$tbl0,$ctx,#(48+0*9*4)
	add	$tbl1,$ctx,#(48+1*9*4)

	vtrn.32	$R0,$D0#lo		@ r^2:r^1
	vtrn.32	$R2,$D2#lo
	vtrn.32	$R3,$D3#lo
	vtrn.32	$R1,$D1#lo
	vtrn.32	$R4,$D4#lo

	vshl.u32	$S2,$R2,#2		@ *5
	vshl.u32	$S3,$R3,#2
	vshl.u32	$S1,$R1,#2
	vshl.u32	$S4,$R4,#2
	vadd.i32	$S2,$S2,$R2
	vadd.i32	$S1,$S1,$R1
	vadd.i32	$S3,$S3,$R3
	vadd.i32	$S4,$S4,$R4

	vst4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32	{${S4}[0]},[$tbl0,:32]
	vst1.32	{${S4}[1]},[$tbl1,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	$tbl0,$ctx,#(48+2*4*9)
	add	$tbl1,$ctx,#(48+3*4*9)

	vmov	$R0,$D0#lo		@ r^4:r^3
	vshl.u32	$S1,$D1#lo,#2		@ *5
	vmov	$R1,$D1#lo
	vshl.u32	$S2,$D2#lo,#2
	vmov	$R2,$D2#lo
	vshl.u32	$S3,$D3#lo,#2
	vmov	$R3,$D3#lo
	vshl.u32	$S4,$D4#lo,#2
	vmov	$R4,$D4#lo
	vadd.i32	$S1,$S1,$D1#lo
	vadd.i32	$S2,$S2,$D2#lo
	vadd.i32	$S3,$S3,$D3#lo
	vadd.i32	$S4,$S4,$D4#lo

	vst4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32	{${S4}[0]},[$tbl0]
	vst1.32	{${S4}[1]},[$tbl1]
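	@ the context now caches r^1..r^4 at offset #48: four groups of
	@ nine 32-bit words (r0,r1,5*r1,r2,5*r2,r3,5*r3,r4,5*r4), one per power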

	ret				@ bx lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26
	ands	$len,$len,#-16
	beq	.Lno_data_neon

	cmp	$len,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks
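	@ short inputs (<64 bytes) with the hash still in base 2^32 are
	@ simply handed to the integer-only poly1305_blocks above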

.Lenter_neon:
	stmdb	sp!,{r4-r7}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1-r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
	ldr	r5,[$ctx,#4]
	ldr	r6,[$ctx,#8]
	ldr	r7,[$ctx,#12]
	ldr	ip,[$ctx,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	$D0#lo,$D0#lo,$D0#lo
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	$D1#lo,$D1#lo,$D1#lo
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	$D2#lo,$D2#lo,$D2#lo
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	$D3#lo,$D3#lo,$D3#lo
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	$D4#lo,$D4#lo,$D4#lo
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[$ctx,#36]		@ is_base2_26

	vmov.32	$D0#lo[0],r2
	vmov.32	$D1#lo[0],r3
	vmov.32	$D2#lo[0],r4
	vmov.32	$D3#lo[0],r5
	vmov.32	$D4#lo[0],r6
	adr	$zeros,.Lzeros

	ldmia	sp!,{r1-r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	$D0#lo,$D0#lo,$D0#lo
	veor	$D1#lo,$D1#lo,$D1#lo
	veor	$D2#lo,$D2#lo,$D2#lo
	veor	$D3#lo,$D3#lo,$D3#lo
	veor	$D4#lo,$D4#lo,$D4#lo
	vld4.32	{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	adr	$zeros,.Lzeros
	vld1.32	{$D4#lo[0]},[$ctx]
	sub	$ctx,$ctx,#16		@ rewind

.Lbase2_32_neon:
	add	$in2,$inp,#32
	mov	$padbit,$padbit,lsl#24
	tst	$len,#31
	beq	.Leven

	vld4.32	{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
	vmov.32	$H4#lo[0],$padbit
	sub	$len,$len,#16
	add	$in2,$inp,#32

# ifdef __ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
	vshl.u32	$H3#lo,$H3#lo,#18

	vsri.u32	$H3#lo,$H2#lo,#14
	vshl.u32	$H2#lo,$H2#lo,#12
	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi

	vbic.i32	$H3#lo,#0xfc000000
	vsri.u32	$H2#lo,$H1#lo,#20
	vshl.u32	$H1#lo,$H1#lo,#6

	vbic.i32	$H2#lo,#0xfc000000
	vsri.u32	$H1#lo,$H0#lo,#26
	vadd.i32	$H3#hi,$H3#lo,$D3#lo

	vbic.i32	$H0#lo,#0xfc000000
	vbic.i32	$H1#lo,#0xfc000000
	vadd.i32	$H2#hi,$H2#lo,$D2#lo

	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo

	mov	$tbl1,$zeros
	add	$tbl0,$ctx,#48

	cmp	$len,$len
	b	.Long_tail

.align	4
.Leven:
	subs	$len,$len,#64
	it	lo
	movlo	$in2,$zeros

	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vld4.32	{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add	$inp,$inp,#64
	vld4.32	{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add	$in2,$in2,#64
	itt	hi
	addhi	$tbl1,$ctx,#(48+1*9*4)
	addhi	$tbl0,$ctx,#(48+3*9*4)

# ifdef __ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vshl.u32	$H3,$H3,#18

	vsri.u32	$H3,$H2,#14
	vshl.u32	$H2,$H2,#12

	vbic.i32	$H3,#0xfc000000
	vsri.u32	$H2,$H1,#20
	vshl.u32	$H1,$H1,#6

	vbic.i32	$H2,#0xfc000000
	vsri.u32	$H1,$H0,#26

	vbic.i32	$H0,#0xfc000000
	vbic.i32	$H1,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@               \___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@               \___________________/ \____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on reduction in previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
	vmull.u32	$D2,$H2#hi,${R0}[1]
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,${R0}[1]
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,${R0}[1]
	vmlal.u32	$D2,$H1#hi,${R1}[1]
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,${R0}[1]

	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,${R0}[1]
	subs	$len,$len,#64
	vmlal.u32	$D0,$H4#hi,${S1}[1]
	it	lo
	movlo	$in2,$zeros
	vmlal.u32	$D3,$H2#hi,${R1}[1]
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D1,$H0#hi,${R1}[1]
	vmlal.u32	$D4,$H3#hi,${R1}[1]

	vmlal.u32	$D0,$H3#hi,${S2}[1]
	vmlal.u32	$D3,$H1#hi,${R2}[1]
	vmlal.u32	$D4,$H2#hi,${R2}[1]
	vmlal.u32	$D1,$H4#hi,${S2}[1]
	vmlal.u32	$D2,$H0#hi,${R2}[1]

	vmlal.u32	$D3,$H0#hi,${R3}[1]
	vmlal.u32	$D0,$H2#hi,${S3}[1]
	vmlal.u32	$D4,$H1#hi,${R3}[1]
	vmlal.u32	$D1,$H3#hi,${S3}[1]
	vmlal.u32	$D2,$H4#hi,${S3}[1]

	vmlal.u32	$D3,$H4#hi,${S4}[1]
	vmlal.u32	$D0,$H1#hi,${S4}[1]
	vmlal.u32	$D4,$H0#hi,${R4}[1]
	vmlal.u32	$D1,$H2#hi,${S4}[1]
	vmlal.u32	$D2,$H3#hi,${S4}[1]

	vld4.32	{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add	$in2,$in2,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	$D3,$H3#lo,${R0}[0]
	vmlal.u32	$D0,$H0#lo,${R0}[0]
	vmlal.u32	$D4,$H4#lo,${R0}[0]
	vmlal.u32	$D1,$H1#lo,${R0}[0]
	vmlal.u32	$D2,$H2#lo,${R0}[0]
	vld1.32	${S4}[0],[$tbl0,:32]

	vmlal.u32	$D3,$H2#lo,${R1}[0]
	vmlal.u32	$D0,$H4#lo,${S1}[0]
	vmlal.u32	$D4,$H3#lo,${R1}[0]
	vmlal.u32	$D1,$H0#lo,${R1}[0]
	vmlal.u32	$D2,$H1#lo,${R1}[0]

	vmlal.u32	$D3,$H1#lo,${R2}[0]
	vmlal.u32	$D0,$H3#lo,${S2}[0]
	vmlal.u32	$D4,$H2#lo,${R2}[0]
	vmlal.u32	$D1,$H4#lo,${S2}[0]
	vmlal.u32	$D2,$H0#lo,${R2}[0]

	vmlal.u32	$D3,$H0#lo,${R3}[0]
	vmlal.u32	$D0,$H2#lo,${S3}[0]
	vmlal.u32	$D4,$H1#lo,${R3}[0]
	vmlal.u32	$D1,$H3#lo,${S3}[0]
	vmlal.u32	$D3,$H4#lo,${S4}[0]

	vmlal.u32	$D2,$H4#lo,${S3}[0]
	vmlal.u32	$D0,$H1#lo,${S4}[0]
	vmlal.u32	$D4,$H0#lo,${R4}[0]
	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vmlal.u32	$D1,$H2#lo,${S4}[0]
	vmlal.u32	$D2,$H3#lo,${S4}[0]

	vld4.32	{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add	$inp,$inp,#64
# ifdef __ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
	vrev32.8	$H3,$H3
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	vshr.u64	$T1,$D0,#26
	vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	vshl.u32	$H3,$H3,#18
	vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	vshr.u64	$T1,$D1,#26
	vmovn.i64	$D1#lo,$D1
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vsri.u32	$H3,$H2,#14
	vbic.i32	$D4#lo,#0xfc000000
	vshl.u32	$H2,$H2,#12
	vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	vbic.i32	$H3,#0xfc000000
	vshrn.u64	$T1#lo,$D2,#26
	vmovn.i64	$D2#lo,$D2
	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
	vsri.u32	$H2,$H1,#20
	vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	vshl.u32	$H1,$H1,#6
	vbic.i32	$D2#lo,#0xfc000000
	vbic.i32	$H2,#0xfc000000

	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
	vmovn.i64	$D0#lo,$D0
	vsri.u32	$H1,$H0,#26
	vbic.i32	$H0,#0xfc000000
	vshr.u32	$T1#lo,$D3#lo,#26
	vbic.i32	$D3#lo,#0xfc000000
	vbic.i32	$D0#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
	vbic.i32	$H1,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	$tbl1,$ctx,#(48+0*9*4)
	add	$tbl0,$ctx,#(48+1*9*4)
	adds	$len,$len,#32
	it	ne
	movne	$len,#0
	bne	.Long_tail

	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H3#hi,$H3#lo,$D3#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo
	vadd.i32	$H4#hi,$H4#lo,$D4#lo

.Long_tail:
	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
	vmull.u32	$D2,$H2#hi,$R0
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,$R0
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,$R0
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,$R0
	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,$R0

	vmlal.u32	$D0,$H4#hi,$S1
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#hi,$R1
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#hi,$R1
	vmlal.u32	$D4,$H3#hi,$R1
	vmlal.u32	$D2,$H1#hi,$R1

	vmlal.u32	$D3,$H1#hi,$R2
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#hi,$S2
	vld1.32	${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#hi,$R2
	vmlal.u32	$D1,$H4#hi,$S2
	vmlal.u32	$D2,$H0#hi,$R2

	vmlal.u32	$D3,$H0#hi,$R3
	it	ne
	addne	$tbl1,$ctx,#(48+2*9*4)
	vmlal.u32	$D0,$H2#hi,$S3
	it	ne
	addne	$tbl0,$ctx,#(48+3*9*4)
	vmlal.u32	$D4,$H1#hi,$R3
	vmlal.u32	$D1,$H3#hi,$S3
	vmlal.u32	$D2,$H4#hi,$S3

	vmlal.u32	$D3,$H4#hi,$S4
	vorn	$MASK,$MASK,$MASK	@ all-ones, can be redundant
	vmlal.u32	$D0,$H1#hi,$S4
	vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#hi,$R4
	vmlal.u32	$D1,$H2#hi,$S4
	vmlal.u32	$D2,$H3#hi,$S4

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4

	vmlal.u32	$D2,$H2#lo,$R0
	vmlal.u32	$D0,$H0#lo,$R0
	vmlal.u32	$D3,$H3#lo,$R0
	vmlal.u32	$D1,$H1#lo,$R0
	vmlal.u32	$D4,$H4#lo,$R0

	vmlal.u32	$D0,$H4#lo,$S1
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#lo,$R1
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#lo,$R1
	vmlal.u32	$D4,$H3#lo,$R1
	vmlal.u32	$D2,$H1#lo,$R1

	vmlal.u32	$D3,$H1#lo,$R2
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#lo,$S2
	vld1.32	${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#lo,$R2
	vmlal.u32	$D1,$H4#lo,$S2
	vmlal.u32	$D2,$H0#lo,$R2

	vmlal.u32	$D3,$H0#lo,$R3
	vmlal.u32	$D0,$H2#lo,$S3
	vmlal.u32	$D4,$H1#lo,$R3
	vmlal.u32	$D1,$H3#lo,$S3
	vmlal.u32	$D2,$H4#lo,$S3

	vmlal.u32	$D3,$H4#lo,$S4
	vorn	$MASK,$MASK,$MASK	@ all-ones
	vmlal.u32	$D0,$H1#lo,$S4
	vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#lo,$R4
	vmlal.u32	$D1,$H2#lo,$S4
	vmlal.u32	$D2,$H3#lo,$S4

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	$D3#lo,$D3#lo,$D3#hi
	vadd.i64	$D0#lo,$D0#lo,$D0#hi
	vadd.i64	$D4#lo,$D4#lo,$D4#hi
	vadd.i64	$D1#lo,$D1#lo,$D1#hi
	vadd.i64	$D2#lo,$D2#lo,$D2#hi

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	$T0,$D3,#26
	vand.i64	$D3,$D3,$MASK
	vshr.u64	$T1,$D0,#26
	vand.i64	$D0,$D0,$MASK
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1

	vshr.u64	$T0,$D4,#26
	vand.i64	$D4,$D4,$MASK
	vshr.u64	$T1,$D1,#26
	vand.i64	$D1,$D1,$MASK
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2

	vadd.i64	$D0,$D0,$T0
	vshl.u64	$T0,$T0,#2
	vshr.u64	$T1,$D2,#26
	vand.i64	$D2,$D2,$MASK
	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
	vadd.i64	$D3,$D3,$T1		@ h2 -> h3

	vshr.u64	$T0,$D0,#26
	vand.i64	$D0,$D0,$MASK
	vshr.u64	$T1,$D3,#26
	vand.i64	$D3,$D3,$MASK
	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
	vadd.i64	$D4,$D4,$T1		@ h3 -> h4

	cmp	$len,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	vst1.32	{$D4#lo[0]},[$ctx]

	vldmia	sp!,{d8-d15}		@ epilogue
	ldmia	sp!,{r4-r7}
.Lno_data_neon:
	ret				@ bx lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26

	stmdb	sp!,{r4-r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	$ctx,{$h0-$h4}
	eor	$g0,$g0,$g0

	adds	$h0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$h1,$h1,lsr#6
	adcs	$h1,$h1,$h2,lsl#20
	mov	$h2,$h2,lsr#12
	adcs	$h2,$h2,$h3,lsl#14
	mov	$h3,$h3,lsr#18
	adcs	$h3,$h3,$h4,lsl#8
	adc	$h4,$g0,$h4,lsr#24	@ can be partially reduced ...

	and	$g0,$h4,#-4		@ ... so reduce
	and	$h4,$h4,#3
	add	$g0,$g0,$g0,lsr#2	@ *= 5
	adds	$h0,$h0,$g0
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

	it	ne
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
	it	ne
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
	it	ne
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
	it	ne
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0		@ accumulate nonce
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]		@ store the result
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]

	ldmia	sp!,{r4-r11}
	ret				@ bx lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lpoly1305_init
#endif
___
} }
$code.=<<___;
.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx lr/go						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush