1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # March 2010
|
---|
18 | #
|
---|
19 | # The module implements "4-bit" GCM GHASH function and underlying
|
---|
20 | # single multiplication operation in GF(2^128). "4-bit" means that it
|
---|
21 | # uses 256 bytes per-key table [+128 bytes shared table]. Even though
|
---|
22 | # loops are aggressively modulo-scheduled in respect to references to
|
---|
23 | # Htbl and Z.hi updates for 8 cycles per byte, measured performance is
|
---|
24 | # ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
|
---|
25 | # scheduling "glitch," because uprofile(1) indicates uniform sample
|
---|
26 | # distribution, as if all instruction bundles execute in 1.5 cycles.
|
---|
27 | # Meaning that it could have been even faster, yet 12 cycles is ~60%
|
---|
# better than gcc-generated code and ~80% better than code generated by vendor
|
---|
29 | # compiler.
|
---|
30 |
|
---|
# Symbolic names for the Alpha integer registers used by the generated
# code.  The values are the assembler's register mnemonics; the trailing
# comments give the corresponding hardware register numbers.  t0-t12 are
# caller-saved scratch, a0-a5 are argument registers, AT is the
# assembler temporary (usable here because we emit ".set noat").
$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28
53 |
|
---|
{ my $N;
# Emit the body of one full "4-bit" GF(2^128) multiplication of
# $Xhi:$Xlo by H, accumulating the product into $Zhi:$Zlo.  The code
# walks the 16 input bytes from the least-significant end ($Xlo byte 7
# down to 0, then $Xhi byte 7 down to 0), doing two 4-bit table lookups
# per byte against $Htbl and folding the shifted-out nibble through the
# $rem_4bit reduction table.  The instruction stream is manually
# modulo-scheduled (see file header), so statement order is significant.
sub loop() {

	# $N makes the local labels (.Looplo/.Loophi) unique per expansion;
	# the sub is instantiated once for gcm_gmult_4bit and once for
	# gcm_ghash_4bit.
	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}
|
---|
253 |
|
---|
# Assembly preamble plus gcm_gmult_4bit(Xi, Htbl): multiply the 128-bit
# value at Xi by H (pre-computed 4-bit table at Htbl) in GF(2^128) and
# store the result back at Xi.  picmeup loads the address of the local
# rem_4bit reduction table into $rem_4bit in a PIC-safe way.
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

# Expand the shared multiplication loop (first instantiation).
&loop();

# Byte-swap both 64-bit halves of the accumulator Z back to the stored
# byte order (srl/sll/zapnot sequence) and write the result over Xi.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___
|
---|
327 |
|
---|
# Callee-saved registers (saved/restored in gcm_ghash_4bit's prologue/
# epilogue) that carry the current 16-byte input block across the
# multiplication loop.
$inhi="s0";
$inlo="s1";
|
---|
330 |
|
---|
# gcm_ghash_4bit(Xi, Htbl, inp, len): for each 16-byte block of inp,
# XOR it into Xi and multiply by H.  Input blocks are fetched with
# ldq_u/extql/extqh pairs so unaligned inp is handled; len is assumed
# to be a multiple of 16 (subq $len,16 with no remainder handling).
$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

# Expand the shared multiplication loop (second instantiation).
&loop();

# Byte-swap Z as in gcm_gmult_4bit; the next input block is prefetched
# in the shadow of the swap when len is not yet exhausted.  picmeup
# computes the address of rem_4bit relative to its own br instruction
# (PIC); the table itself holds the 16 pre-shifted reduction constants.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
|
---|
# Emit the generated assembly: write to the file named by the last
# command-line argument, or to the current STDOUT when none is given.
$output = pop;
if (defined $output) {
    # Three-arg open avoids mode injection through the filename; the
    # original two-arg form also silently ignored open failures and
    # would have written to the inherited STDOUT instead.
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";	# buffered write errors surface here
|
---|
467 |
|
---|