#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.35/+130%		3.00
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with a bunch of ldrb instructions
#	loading the data;
# (**)	these are trade-off results; they can be improved by ~8%, but at
#	the cost of a 15/12% regression on Cortex-A5/A7; it is even possible
#	to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
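# per the AAPCS the first four arguments arrive in r0-r3:
# context pointer, input pointer, length in bytes, padbit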

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4-r11}

	eor	r3,r3,r3
	cmp	$inp,#0
	str	r3,[$ctx,#0]		@ zero hash value
	str	r3,[$ctx,#4]
	str	r3,[$ctx,#8]
	str	r3,[$ctx,#12]
	str	r3,[$ctx,#16]
	str	r3,[$ctx,#36]		@ is_base2_26
	add	$ctx,$ctx,#20

#ifdef __thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if __ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
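	@ load the 16-byte key little-endian and clamp it as the Poly1305
	@ spec requires: r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, i.e. clear
	@ the top 4 bits of each word and the bottom 2 bits of the upper three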
	ldrb	r4,[$inp,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[$inp,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[$inp,#2]
	ldrb	r7,[$inp,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[$inp,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[$inp,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[$inp,#6]
	and	r4,r4,r10

#if __ARM_MAX_ARCH__>=7
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[$inp,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[$inp,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[$inp,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[$inp,#10]
	and	r5,r5,r3

#if __ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef __thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	adr	r12,.Lpoly1305_emit
	adr	r10,.Lpoly1305_emit_neon
	itt	ne
	movne	r11,r9
	movne	r12,r10
	orr	r11,r11,#1		@ thumb-ify address
	orr	r12,r12,#1
# else
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
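	@ with __ARM_MAX_ARCH__>=7, r11/r12 now hold the addresses of either
	@ the scalar or the NEON poly1305_blocks/poly1305_emit, selected on
	@ the ARMV7_NEON capability bit; they are written to the caller-supplied
	@ function table below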
	ldrb	r9,[$inp,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[$inp,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[$inp,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[$inp,#14]
	and	r6,r6,r3

	ldrb	r10,[$inp,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[$ctx,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[$ctx,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[$ctx,#8]
	and	r7,r7,r3
	str	r7,[$ctx,#12]
#if __ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);

$code.=<<___;
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3-r11,lr}

	ands	$len,$len,#-16
	beq	.Lno_data

	cmp	$padbit,#0
	add	$len,$len,$inp		@ end pointer
	sub	sp,sp,#32

	ldmia	$ctx,{$h0-$r3}		@ load context

	str	$ctx,[sp,#12]		@ offload stuff
	mov	lr,$inp
	str	$len,[sp,#16]
	str	$r1,[sp,#20]
	str	$r2,[sp,#24]
	str	$r3,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef __thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	$h0,$h0,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	$h1,$h1,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	$h2,$h2,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	$s1,$r1,$r1,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef __thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef __ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	$h0,$h0,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	$h1,$h1,r1
	add	$s1,$r1,$r1,lsr#2
	adcs	$h2,$h2,r2
#endif
	add	$s2,$r2,$r2,lsr#2
	adcs	$h3,$h3,r3
	add	$s3,$r3,$r3,lsr#2
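	@ $s1-$s3 are 5*r1/4, 5*r2/4, 5*r3/4 (exact, because key clamping
	@ makes r1-r3 multiples of 4); they fold the 2^130 = 5 mod (2^130-5)
	@ wrap-around into the 32x32 multiply-accumulate chain below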

	umull	r2,r3,$h1,$r0
	adc	$h4,$h4,#0
	umull	r0,r1,$h0,$r0
	umlal	r2,r3,$h4,$s1
	umlal	r0,r1,$h3,$s1
	ldr	$r1,[sp,#20]		@ reload $r1
	umlal	r2,r3,$h2,$s3
	umlal	r0,r1,$h1,$s3
	umlal	r2,r3,$h3,$s2
	umlal	r0,r1,$h2,$s2
	umlal	r2,r3,$h0,$r1
	str	r0,[sp,#0]		@ future $h0
	mul	r0,$s2,$h4
	ldr	$r2,[sp,#24]		@ reload $r2
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future $h2
	str	r2,[sp,#4]		@ future $h1

	mul	r2,$s3,$h4
	eor	r3,r3,r3
	umlal	r0,r1,$h3,$s3
	ldr	$r3,[sp,#28]		@ reload $r3
	umlal	r2,r3,$h3,$r0
	umlal	r0,r1,$h2,$r0
	umlal	r2,r3,$h2,$r1
	umlal	r0,r1,$h1,$r1
	umlal	r2,r3,$h1,$r2
	umlal	r0,r1,$h0,$r2
	umlal	r2,r3,$h0,$r3
	ldr	$h0,[sp,#0]
	mul	$h4,$r0,$h4
	ldr	$h1,[sp,#4]

	adds	$h2,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	$h3,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	$h4,$h4,r3		@ h4+=d3>>32

	and	r1,$h4,#-4
	and	$h4,$h4,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	$h0,$h0,r1
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	$ctx,[sp,#12]
	add	sp,sp,#32
	stmia	$ctx,{$h0-$h4}		@ store the result

.Lno_data:
#if __ARM_ARCH__>=5
	ldmia	sp!,{r3-r11,pc}
#else
	ldmia	sp!,{r3-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$h4;

$code.=<<___;
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4-r11}
.Lpoly1305_emit_enter:

	ldmia	$ctx,{$h0-$h4}
	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?
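	@ if h >= 2^130-5, adding 5 carries into bit 130 (bit 2 of $g4) and
	@ the reduced value in $g0-$g3 is selected by the conditional moves below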

#ifdef __thumb2__
	it	ne
#endif
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]
#else
	strb	$h0,[$mac,#0]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#4]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#8]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#12]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#1]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#5]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#9]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#13]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#2]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#6]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#10]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#14]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#3]
	strb	$h1,[$mac,#7]
	strb	$h2,[$mac,#11]
	strb	$h3,[$mac,#15]
#endif
	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
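	@ convert the key to base 2^26 and precompute its powers up to r^4,
	@ along with the corresponding 5*r_i values, storing them at offset
	@ #48 of the context for the two-way NEON block loop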
	ldr	r4,[$ctx,#20]		@ load key base 2^32
	ldr	r5,[$ctx,#24]
	ldr	r6,[$ctx,#28]
	ldr	r7,[$ctx,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff
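	@ r2-r6 now hold the five 26-bit limbs of r; the top limb in r6 needs
	@ no mask because the clamped top key word is below 2^28, so r6 < 2^20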

	vdup.32	$R0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	$R1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	$S1,r2
	vdup.32	$R2,r4
	add	r4,r5,r5,lsl#2
	vdup.32	$S2,r3
	vdup.32	$R3,r5
	add	r5,r6,r6,lsl#2
	vdup.32	$S3,r4
	vdup.32	$R4,r6
	vdup.32	$S4,r5

	mov	$zeros,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4

	vmull.u32	$D0,$R0,${R0}[1]
	vmull.u32	$D1,$R1,${R0}[1]
	vmull.u32	$D2,$R2,${R0}[1]
	vmull.u32	$D3,$R3,${R0}[1]
	vmull.u32	$D4,$R4,${R0}[1]

	vmlal.u32	$D0,$R4,${S1}[1]
	vmlal.u32	$D1,$R0,${R1}[1]
	vmlal.u32	$D2,$R1,${R1}[1]
	vmlal.u32	$D3,$R2,${R1}[1]
	vmlal.u32	$D4,$R3,${R1}[1]

	vmlal.u32	$D0,$R3,${S2}[1]
	vmlal.u32	$D1,$R4,${S2}[1]
	vmlal.u32	$D3,$R1,${R2}[1]
	vmlal.u32	$D2,$R0,${R2}[1]
	vmlal.u32	$D4,$R2,${R2}[1]

	vmlal.u32	$D0,$R2,${S3}[1]
	vmlal.u32	$D3,$R0,${R3}[1]
	vmlal.u32	$D1,$R3,${S3}[1]
	vmlal.u32	$D2,$R4,${S3}[1]
	vmlal.u32	$D4,$R1,${R3}[1]

	vmlal.u32	$D3,$R4,${S4}[1]
	vmlal.u32	$D0,$R1,${S4}[1]
	vmlal.u32	$D1,$R2,${S4}[1]
	vmlal.u32	$D2,$R3,${S4}[1]
	vmlal.u32	$D4,$R0,${R4}[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
	@ an m-bit number multiplied by 2^n is still only n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, the sum of three
	@ or four is n+2 bits wide. The sum of 2^m (n-m)-bit numbers and
	@ one n-bit number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited by
	@ 5*4+1 52-bit addends, and 5*H4 by 5*5, or 57 bits. But when
	@ hashing input, H0 is limited by (5*4+1)*3 addends, or 58 bits,
	@ while 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The
	@ vmlal.u32 instruction accepts 2x32-bit inputs and writes a
	@ 2x64-bit result. This means that the result of the reduction
	@ has to be compressed upon loop wrap-around. This can be done in
	@ the process of reduction to minimize the number of instructions
	@ [as well as the number of 128-bit instructions, which benefits
	@ low-end processors], but one has to watch for H2 (which is
	@ narrower than H0) and 5*H4 not being wider than 58 bits, so that
	@ the result of the right shift by 26 bits fits in 32 bits. This
	@ is also useful on x86, because it allows one to use paddd in
	@ place of paddq, which benefits Atom, where paddq is ridiculously slow.
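	@
	@ The reduction below therefore propagates carries in the interleaved
	@ order h3->h4, h0->h1, h1->h2, h4->h0 (the wrapped-around carry is
	@ multiplied by 5, since 2^130 = 5 mod 2^130-5), h2->h3, and finally
	@ h0->h1 and h3->h4 once more, matching the >>+ scheme sketched above.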

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	vshr.u64	$T1,$D0,#26
	vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	vshr.u64	$T1,$D1,#26
	vmovn.i64	$D1#lo,$D1
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vbic.i32	$D4#lo,#0xfc000000
	vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	vshrn.u64	$T1#lo,$D2,#26
	vmovn.i64	$D2#lo,$D2
	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
	vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	vbic.i32	$D2#lo,#0xfc000000

	vshr.u32	$T0#lo,$D0#lo,#26
	vbic.i32	$D0#lo,#0xfc000000
	vshr.u32	$T1#lo,$D3#lo,#26
	vbic.i32	$D3#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4

	subs	$zeros,$zeros,#1
	beq	.Lsquare_break_neon

	add	$tbl0,$ctx,#(48+0*9*4)
	add	$tbl1,$ctx,#(48+1*9*4)

	vtrn.32	$R0,$D0#lo		@ r^2:r^1
	vtrn.32	$R2,$D2#lo
	vtrn.32	$R3,$D3#lo
	vtrn.32	$R1,$D1#lo
	vtrn.32	$R4,$D4#lo

	vshl.u32	$S2,$R2,#2		@ *5
	vshl.u32	$S3,$R3,#2
	vshl.u32	$S1,$R1,#2
	vshl.u32	$S4,$R4,#2
	vadd.i32	$S2,$S2,$R2
	vadd.i32	$S1,$S1,$R1
	vadd.i32	$S3,$S3,$R3
	vadd.i32	$S4,$S4,$R4

	vst4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32	{${S4}[0]},[$tbl0,:32]
	vst1.32	{${S4}[1]},[$tbl1,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	$tbl0,$ctx,#(48+2*4*9)
	add	$tbl1,$ctx,#(48+3*4*9)

	vmov	$R0,$D0#lo		@ r^4:r^3
	vshl.u32	$S1,$D1#lo,#2		@ *5
	vmov	$R1,$D1#lo
	vshl.u32	$S2,$D2#lo,#2
	vmov	$R2,$D2#lo
	vshl.u32	$S3,$D3#lo,#2
	vmov	$R3,$D3#lo
	vshl.u32	$S4,$D4#lo,#2
	vmov	$R4,$D4#lo
	vadd.i32	$S1,$S1,$D1#lo
	vadd.i32	$S2,$S2,$D2#lo
	vadd.i32	$S3,$S3,$D3#lo
	vadd.i32	$S4,$S4,$D4#lo

	vst4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32	{${S4}[0]},[$tbl0]
	vst1.32	{${S4}[1]},[$tbl1]

	ret				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26
	ands	$len,$len,#-16
	beq	.Lno_data_neon

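	@ if there is less than one 64-byte NEON stride, and the hash is still
	@ in base 2^32, just use the scalar poly1305_blocks above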
	cmp	$len,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4-r7}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1-r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
	ldr	r5,[$ctx,#4]
	ldr	r6,[$ctx,#8]
	ldr	r7,[$ctx,#12]
	ldr	ip,[$ctx,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	$D0#lo,$D0#lo,$D0#lo
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	$D1#lo,$D1#lo,$D1#lo
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	$D2#lo,$D2#lo,$D2#lo
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	$D3#lo,$D3#lo,$D3#lo
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	$D4#lo,$D4#lo,$D4#lo
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[$ctx,#36]		@ is_base2_26

	vmov.32	$D0#lo[0],r2
	vmov.32	$D1#lo[0],r3
	vmov.32	$D2#lo[0],r4
	vmov.32	$D3#lo[0],r5
	vmov.32	$D4#lo[0],r6
	adr	$zeros,.Lzeros

	ldmia	sp!,{r1-r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	$D0#lo,$D0#lo,$D0#lo
	veor	$D1#lo,$D1#lo,$D1#lo
	veor	$D2#lo,$D2#lo,$D2#lo
	veor	$D3#lo,$D3#lo,$D3#lo
	veor	$D4#lo,$D4#lo,$D4#lo
	vld4.32	{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	adr	$zeros,.Lzeros
	vld1.32	{$D4#lo[0]},[$ctx]
	sub	$ctx,$ctx,#16		@ rewind

.Lbase2_32_neon:
	add	$in2,$inp,#32
	mov	$padbit,$padbit,lsl#24
	tst	$len,#31
	beq	.Leven

	vld4.32	{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
	vmov.32	$H4#lo[0],$padbit
	sub	$len,$len,#16
	add	$in2,$inp,#32

# ifdef __ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
	vshl.u32	$H3#lo,$H3#lo,#18

	vsri.u32	$H3#lo,$H2#lo,#14
	vshl.u32	$H2#lo,$H2#lo,#12
	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi

	vbic.i32	$H3#lo,#0xfc000000
	vsri.u32	$H2#lo,$H1#lo,#20
	vshl.u32	$H1#lo,$H1#lo,#6

	vbic.i32	$H2#lo,#0xfc000000
	vsri.u32	$H1#lo,$H0#lo,#26
	vadd.i32	$H3#hi,$H3#lo,$D3#lo

	vbic.i32	$H0#lo,#0xfc000000
	vbic.i32	$H1#lo,#0xfc000000
	vadd.i32	$H2#hi,$H2#lo,$D2#lo

	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo

	mov	$tbl1,$zeros
	add	$tbl0,$ctx,#48

	cmp	$len,$len
	b	.Long_tail

.align	4
.Leven:
	subs	$len,$len,#64
	it	lo
	movlo	$in2,$zeros

	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vld4.32	{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add	$inp,$inp,#64
	vld4.32	{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add	$in2,$in2,#64
	itt	hi
	addhi	$tbl1,$ctx,#(48+1*9*4)
	addhi	$tbl0,$ctx,#(48+3*9*4)

# ifdef __ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vshl.u32	$H3,$H3,#18

	vsri.u32	$H3,$H2,#14
	vshl.u32	$H2,$H2,#12

	vbic.i32	$H3,#0xfc000000
	vsri.u32	$H2,$H1,#20
	vshl.u32	$H1,$H1,#6

	vbic.i32	$H2,#0xfc000000
	vsri.u32	$H1,$H0,#26

	vbic.i32	$H0,#0xfc000000
	vbic.i32	$H1,#0xfc000000
---|
822 |
|
---|
823 | bls .Lskip_loop
|
---|
824 |
|
---|
825 | vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
|
---|
826 | vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
|
---|
827 | vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
|
---|
828 | vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
|
---|
829 | b .Loop_neon
|
---|
830 |
|
---|
831 | .align 5
|
---|
832 | .Loop_neon:
|
---|
833 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
---|
834 | @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
---|
835 | @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
---|
836 | @ \___________________/
|
---|
837 | @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
---|
838 | @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
---|
839 | @ \___________________/ \____________________/
|
---|
840 | @
|
---|
841 | @ Note that we start with inp[2:3]*r^2. This is because it
|
---|
842 | @ doesn't depend on reduction in previous iteration.
|
---|
843 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
---|
844 | @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
|
---|
845 | @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
|
---|
846 | @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
|
---|
847 | @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
|
---|
848 | @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
|
---|

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
	vmull.u32	$D2,$H2#hi,${R0}[1]
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,${R0}[1]
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,${R0}[1]
	vmlal.u32	$D2,$H1#hi,${R1}[1]
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,${R0}[1]

	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,${R0}[1]
	subs	$len,$len,#64
	vmlal.u32	$D0,$H4#hi,${S1}[1]
	it	lo
	movlo	$in2,$zeros
	vmlal.u32	$D3,$H2#hi,${R1}[1]
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D1,$H0#hi,${R1}[1]
	vmlal.u32	$D4,$H3#hi,${R1}[1]

	vmlal.u32	$D0,$H3#hi,${S2}[1]
	vmlal.u32	$D3,$H1#hi,${R2}[1]
	vmlal.u32	$D4,$H2#hi,${R2}[1]
	vmlal.u32	$D1,$H4#hi,${S2}[1]
	vmlal.u32	$D2,$H0#hi,${R2}[1]

	vmlal.u32	$D3,$H0#hi,${R3}[1]
	vmlal.u32	$D0,$H2#hi,${S3}[1]
	vmlal.u32	$D4,$H1#hi,${R3}[1]
	vmlal.u32	$D1,$H3#hi,${S3}[1]
	vmlal.u32	$D2,$H4#hi,${S3}[1]

	vmlal.u32	$D3,$H4#hi,${S4}[1]
	vmlal.u32	$D0,$H1#hi,${S4}[1]
	vmlal.u32	$D4,$H0#hi,${R4}[1]
	vmlal.u32	$D1,$H2#hi,${S4}[1]
	vmlal.u32	$D2,$H3#hi,${S4}[1]

	vld4.32	{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add	$in2,$in2,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	$D3,$H3#lo,${R0}[0]
	vmlal.u32	$D0,$H0#lo,${R0}[0]
	vmlal.u32	$D4,$H4#lo,${R0}[0]
	vmlal.u32	$D1,$H1#lo,${R0}[0]
	vmlal.u32	$D2,$H2#lo,${R0}[0]
	vld1.32	${S4}[0],[$tbl0,:32]

	vmlal.u32	$D3,$H2#lo,${R1}[0]
	vmlal.u32	$D0,$H4#lo,${S1}[0]
	vmlal.u32	$D4,$H3#lo,${R1}[0]
	vmlal.u32	$D1,$H0#lo,${R1}[0]
	vmlal.u32	$D2,$H1#lo,${R1}[0]

	vmlal.u32	$D3,$H1#lo,${R2}[0]
	vmlal.u32	$D0,$H3#lo,${S2}[0]
	vmlal.u32	$D4,$H2#lo,${R2}[0]
	vmlal.u32	$D1,$H4#lo,${S2}[0]
	vmlal.u32	$D2,$H0#lo,${R2}[0]

	vmlal.u32	$D3,$H0#lo,${R3}[0]
	vmlal.u32	$D0,$H2#lo,${S3}[0]
	vmlal.u32	$D4,$H1#lo,${R3}[0]
	vmlal.u32	$D1,$H3#lo,${S3}[0]
	vmlal.u32	$D3,$H4#lo,${S4}[0]

	vmlal.u32	$D2,$H4#lo,${S3}[0]
	vmlal.u32	$D0,$H1#lo,${S4}[0]
	vmlal.u32	$D4,$H0#lo,${R4}[0]
	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vmlal.u32	$D1,$H2#lo,${S4}[0]
	vmlal.u32	$D2,$H3#lo,${S4}[0]

	vld4.32	{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add	$inp,$inp,#64
# ifdef __ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
	vrev32.8	$H3,$H3
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	vshr.u64	$T1,$D0,#26
	vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	vshl.u32	$H3,$H3,#18
	vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	vshr.u64	$T1,$D1,#26
	vmovn.i64	$D1#lo,$D1
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vsri.u32	$H3,$H2,#14
	vbic.i32	$D4#lo,#0xfc000000
	vshl.u32	$H2,$H2,#12
	vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	vbic.i32	$H3,#0xfc000000
	vshrn.u64	$T1#lo,$D2,#26
	vmovn.i64	$D2#lo,$D2
	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
	vsri.u32	$H2,$H1,#20
	vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	vshl.u32	$H1,$H1,#6
	vbic.i32	$D2#lo,#0xfc000000
	vbic.i32	$H2,#0xfc000000

	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
	vmovn.i64	$D0#lo,$D0
	vsri.u32	$H1,$H0,#26
	vbic.i32	$H0,#0xfc000000
	vshr.u32	$T1#lo,$D3#lo,#26
	vbic.i32	$D3#lo,#0xfc000000
	vbic.i32	$D0#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
	vbic.i32	$H1,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	$tbl1,$ctx,#(48+0*9*4)
	add	$tbl0,$ctx,#(48+1*9*4)
	adds	$len,$len,#32
	it	ne
	movne	$len,#0
	bne	.Long_tail

	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H3#hi,$H3#lo,$D3#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo
	vadd.i32	$H4#hi,$H4#lo,$D4#lo

.Long_tail:
	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
	vmull.u32	$D2,$H2#hi,$R0
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,$R0
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,$R0
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,$R0
	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,$R0

	vmlal.u32	$D0,$H4#hi,$S1
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#hi,$R1
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#hi,$R1
	vmlal.u32	$D4,$H3#hi,$R1
	vmlal.u32	$D2,$H1#hi,$R1

	vmlal.u32	$D3,$H1#hi,$R2
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#hi,$S2
	vld1.32	${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#hi,$R2
	vmlal.u32	$D1,$H4#hi,$S2
	vmlal.u32	$D2,$H0#hi,$R2

	vmlal.u32	$D3,$H0#hi,$R3
	it	ne
	addne	$tbl1,$ctx,#(48+2*9*4)
	vmlal.u32	$D0,$H2#hi,$S3
	it	ne
	addne	$tbl0,$ctx,#(48+3*9*4)
	vmlal.u32	$D4,$H1#hi,$R3
	vmlal.u32	$D1,$H3#hi,$S3
	vmlal.u32	$D2,$H4#hi,$S3

	vmlal.u32	$D3,$H4#hi,$S4
	vorn	$MASK,$MASK,$MASK	@ all-ones, can be redundant
	vmlal.u32	$D0,$H1#hi,$S4
	vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#hi,$R4
	vmlal.u32	$D1,$H2#hi,$S4
	vmlal.u32	$D2,$H3#hi,$S4

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4

	vmlal.u32	$D2,$H2#lo,$R0
	vmlal.u32	$D0,$H0#lo,$R0
	vmlal.u32	$D3,$H3#lo,$R0
	vmlal.u32	$D1,$H1#lo,$R0
	vmlal.u32	$D4,$H4#lo,$R0

	vmlal.u32	$D0,$H4#lo,$S1
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#lo,$R1
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#lo,$R1
	vmlal.u32	$D4,$H3#lo,$R1
	vmlal.u32	$D2,$H1#lo,$R1

	vmlal.u32	$D3,$H1#lo,$R2
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#lo,$S2
	vld1.32	${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#lo,$R2
	vmlal.u32	$D1,$H4#lo,$S2
	vmlal.u32	$D2,$H0#lo,$R2

	vmlal.u32	$D3,$H0#lo,$R3
	vmlal.u32	$D0,$H2#lo,$S3
	vmlal.u32	$D4,$H1#lo,$R3
	vmlal.u32	$D1,$H3#lo,$S3
	vmlal.u32	$D2,$H4#lo,$S3

	vmlal.u32	$D3,$H4#lo,$S4
	vorn	$MASK,$MASK,$MASK	@ all-ones
	vmlal.u32	$D0,$H1#lo,$S4
	vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#lo,$R4
	vmlal.u32	$D1,$H2#lo,$S4
	vmlal.u32	$D2,$H3#lo,$S4

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	$D3#lo,$D3#lo,$D3#hi
	vadd.i64	$D0#lo,$D0#lo,$D0#hi
	vadd.i64	$D4#lo,$D4#lo,$D4#hi
	vadd.i64	$D1#lo,$D1#lo,$D1#hi
	vadd.i64	$D2#lo,$D2#lo,$D2#hi

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	$T0,$D3,#26
	vand.i64	$D3,$D3,$MASK
	vshr.u64	$T1,$D0,#26
	vand.i64	$D0,$D0,$MASK
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1

	vshr.u64	$T0,$D4,#26
	vand.i64	$D4,$D4,$MASK
	vshr.u64	$T1,$D1,#26
	vand.i64	$D1,$D1,$MASK
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2

	vadd.i64	$D0,$D0,$T0
	vshl.u64	$T0,$T0,#2
	vshr.u64	$T1,$D2,#26
	vand.i64	$D2,$D2,$MASK
	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
	vadd.i64	$D3,$D3,$T1		@ h2 -> h3

	vshr.u64	$T0,$D0,#26
	vand.i64	$D0,$D0,$MASK
	vshr.u64	$T1,$D3,#26
	vand.i64	$D3,$D3,$MASK
	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
	vadd.i64	$D4,$D4,$T1		@ h3 -> h4

	cmp	$len,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	vst1.32	{$D4#lo[0]},[$ctx]

	vldmia	sp!,{d8-d15}		@ epilogue
	ldmia	sp!,{r4-r7}
.Lno_data_neon:
	ret				@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26

	stmdb	sp!,{r4-r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	$ctx,{$h0-$h4}
	eor	$g0,$g0,$g0

	adds	$h0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$h1,$h1,lsr#6
	adcs	$h1,$h1,$h2,lsl#20
	mov	$h2,$h2,lsr#12
	adcs	$h2,$h2,$h3,lsl#14
	mov	$h3,$h3,lsr#18
	adcs	$h3,$h3,$h4,lsl#8
	adc	$h4,$g0,$h4,lsr#24	@ can be partially reduced ...

	and	$g0,$h4,#-4		@ ... so reduce
	and	$h4,$h4,#3
	add	$g0,$g0,$g0,lsr#2	@ *= 5
	adds	$h0,$h0,$g0
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

	it	ne
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
	it	ne
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
	it	ne
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
	it	ne
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0		@ accumulate nonce
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]		@ store the result
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]

	ldmia	sp!,{r4-r11}
	ret				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef _WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
___
} }
$code.=<<___;
.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

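# post-process the generated code: evaluate `...` expressions, map the
# qN#lo/qN#hi notation to the underlying 64-bit dN registers, emit "ret"
# as "bx lr", and encode any literal "bx lr" as a .word so the result
# still assembles with -march=armv4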
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
	s/\bret\b/bx lr/go or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush