#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+ 128-byte shared
# table]. Even though loops are aggressively modulo-scheduled with
# respect to Htbl references and Z.hi updates for 8 cycles per byte,
# measured performance is ~12 cycles per processed byte on a 21264
# CPU. It seems to be a dynamic scheduling "glitch," because
# uprofile(1) indicates uniform sample distribution, as if all
# instruction bundles execute in 1.5 cycles. Meaning that it could
# have been even faster, yet 12 cycles is ~60% better than
# gcc-generated code and ~80% better than code generated by the
# vendor compiler.

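# The 256-byte per-key table mentioned above is a 16-entry array of
# (hi,lo) quadword pairs: Htbl[i] = i*H in GF(2^128), with the nibble
# index bit-reflected so that Htbl[8] = H itself. A minimal Perl sketch
# of its construction, modeled on the portable gcm_init_4bit() from
# crypto/modes/gcm128.c (illustrative only, never called here; assumes
# a 64-bit perl):
sub htable_4bit_sketch {
	my ($Hhi, $Hlo) = @_;		# H as two 64-bit halves
	my @T = map { [0, 0] } 0 .. 15;
	$T[8] = [$Hhi, $Hlo];
	for my $i (4, 2, 1) {		# Htbl[i] = Htbl[2i]*x, one-bit reduce
		my ($hi, $lo) = @{$T[2 * $i]};
		my $carry = ($lo & 1) ? 0xE100000000000000 : 0;
		$T[$i] = [($hi >> 1) ^ $carry, (($hi & 1) << 63) | ($lo >> 1)];
	}
	for my $i (2, 4, 8) {		# remaining entries are XOR combinations
		for my $j (1 .. $i - 1) {
			$T[$i + $j] = [$T[$i][0] ^ $T[$j][0],
				       $T[$i][1] ^ $T[$j][1]];
		}
	}
	return @T;			# 16 entries x 16 bytes = 256 bytes
}
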
$cnt="v0"; # $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3"; # $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7"; # $8
#################
$Xi="a0"; # $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4"; # $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10"; # $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT"; # $28

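# The a0..a3 assignments above mirror the portable C prototypes these
# routines implement (cf. crypto/modes/gcm128.c); len is a multiple of
# the 16-byte block size:
#	void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
#	void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
#			    const u8 *inp, size_t len);
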
{ my $N;
	sub loop() {

	$N++;
$code.=<<___;
.align 4
	# bootstrap: Xi[15] -> lo/hi nibble table offsets; Z = Htbl[lo nibble]
	extbl $Xlo,7,$nlo
	and $nlo,0xf0,$nhi
	sll $nlo,4,$nlo
	and $nlo,0xf0,$nlo

	addq $nlo,$Htbl,$nlo
	ldq $Zlo,8($nlo)
	addq $nhi,$Htbl,$nhi
	ldq $Zhi,0($nlo)

	and $Zlo,0x0f,$remp
	sll $Zhi,60,$t0
	lda $cnt,6(zero)
	extbl $Xlo,6,$nlo

	ldq $Tlo1,8($nhi)
	s8addq $remp,$rem_4bit,$remp
	ldq $Thi1,0($nhi)
	srl $Zlo,4,$Zlo

	ldq $rem,0($remp)
	srl $Zhi,4,$Zhi
	xor $t0,$Zlo,$Zlo
	and $nlo,0xf0,$nhi

	xor $Tlo1,$Zlo,$Zlo
	sll $nlo,4,$nlo
	xor $Thi1,$Zhi,$Zhi
	and $nlo,0xf0,$nlo

	addq $nlo,$Htbl,$nlo
	ldq $Tlo0,8($nlo)
	addq $nhi,$Htbl,$nhi
	ldq $Thi0,0($nlo)

.Looplo$N:
	# software-pipelined: the next nibble's Htbl loads overlap the
	# shift/XOR of the current one
	and $Zlo,0x0f,$remp
	sll $Zhi,60,$t0
	subq $cnt,1,$cnt
	srl $Zlo,4,$Zlo

	ldq $Tlo1,8($nhi)
	xor $rem,$Zhi,$Zhi
	ldq $Thi1,0($nhi)
	s8addq $remp,$rem_4bit,$remp

	ldq $rem,0($remp)
	srl $Zhi,4,$Zhi
	xor $t0,$Zlo,$Zlo
	extbl $Xlo,$cnt,$nlo

	and $nlo,0xf0,$nhi
	xor $Thi0,$Zhi,$Zhi
	xor $Tlo0,$Zlo,$Zlo
	sll $nlo,4,$nlo


	and $Zlo,0x0f,$remp
	sll $Zhi,60,$t0
	and $nlo,0xf0,$nlo
	srl $Zlo,4,$Zlo

	s8addq $remp,$rem_4bit,$remp
	xor $rem,$Zhi,$Zhi
	addq $nlo,$Htbl,$nlo
	addq $nhi,$Htbl,$nhi

	ldq $rem,0($remp)
	srl $Zhi,4,$Zhi
	ldq $Tlo0,8($nlo)
	xor $t0,$Zlo,$Zlo

	xor $Tlo1,$Zlo,$Zlo
	xor $Thi1,$Zhi,$Zhi
	ldq $Thi0,0($nlo)
	bne $cnt,.Looplo$N


	and $Zlo,0x0f,$remp
	sll $Zhi,60,$t0
	lda $cnt,7(zero)
	srl $Zlo,4,$Zlo

	ldq $Tlo1,8($nhi)
	xor $rem,$Zhi,$Zhi
	ldq $Thi1,0($nhi)
	s8addq $remp,$rem_4bit,$remp

	ldq $rem,0($remp)
	srl $Zhi,4,$Zhi
	xor $t0,$Zlo,$Zlo
	extbl $Xhi,$cnt,$nlo

	and $nlo,0xf0,$nhi
	xor $Thi0,$Zhi,$Zhi
	xor $Tlo0,$Zlo,$Zlo
	sll $nlo,4,$nlo

	and $Zlo,0x0f,$remp
	sll $Zhi,60,$t0
	and $nlo,0xf0,$nlo
	srl $Zlo,4,$Zlo

	s8addq $remp,$rem_4bit,$remp
	xor $rem,$Zhi,$Zhi
	addq $nlo,$Htbl,$nlo
	addq $nhi,$Htbl,$nhi

	ldq $rem,0($remp)
	srl $Zhi,4,$Zhi
	ldq $Tlo0,8($nlo)
	xor $t0,$Zlo,$Zlo

	xor $Tlo1,$Zlo,$Zlo
	xor $Thi1,$Zhi,$Zhi
	ldq $Thi0,0($nlo)
	unop


.Loophi$N:
	# same schedule for the upper half of Xi
	and $Zlo,0x0f,$remp
	sll $Zhi,60,$t0
	subq $cnt,1,$cnt
	srl $Zlo,4,$Zlo

	ldq $Tlo1,8($nhi)
	xor $rem,$Zhi,$Zhi
	ldq $Thi1,0($nhi)
	s8addq $remp,$rem_4bit,$remp

	ldq $rem,0($remp)
	srl $Zhi,4,$Zhi
	xor $t0,$Zlo,$Zlo
	extbl $Xhi,$cnt,$nlo

	and $nlo,0xf0,$nhi
	xor $Thi0,$Zhi,$Zhi
	xor $Tlo0,$Zlo,$Zlo
	sll $nlo,4,$nlo


	and $Zlo,0x0f,$remp
	sll $Zhi,60,$t0
	and $nlo,0xf0,$nlo
	srl $Zlo,4,$Zlo

	s8addq $remp,$rem_4bit,$remp
	xor $rem,$Zhi,$Zhi
	addq $nlo,$Htbl,$nlo
	addq $nhi,$Htbl,$nhi

	ldq $rem,0($remp)
	srl $Zhi,4,$Zhi
	ldq $Tlo0,8($nlo)
	xor $t0,$Zlo,$Zlo

	xor $Tlo1,$Zlo,$Zlo
	xor $Thi1,$Zhi,$Zhi
	ldq $Thi0,0($nlo)
	bne $cnt,.Loophi$N


	# tail: fold in the last Htbl entries and do the final reduction
	and $Zlo,0x0f,$remp
	sll $Zhi,60,$t0
	srl $Zlo,4,$Zlo

	ldq $Tlo1,8($nhi)
	xor $rem,$Zhi,$Zhi
	ldq $Thi1,0($nhi)
	s8addq $remp,$rem_4bit,$remp

	ldq $rem,0($remp)
	srl $Zhi,4,$Zhi
	xor $t0,$Zlo,$Zlo

	xor $Tlo0,$Zlo,$Zlo
	xor $Thi0,$Zhi,$Zhi

	and $Zlo,0x0f,$remp
	sll $Zhi,60,$t0
	srl $Zlo,4,$Zlo

	s8addq $remp,$rem_4bit,$remp
	xor $rem,$Zhi,$Zhi

	ldq $rem,0($remp)
	srl $Zhi,4,$Zhi
	xor $Tlo1,$Zlo,$Zlo
	xor $Thi1,$Zhi,$Zhi
	xor $t0,$Zlo,$Zlo
	xor $rem,$Zhi,$Zhi
___
}}

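# For reference, the recurrence that loop() unrolls and software-
# pipelines, written out as plain Perl (a hedged sketch; Htable as
# produced by htable_4bit_sketch() above, rem_4bit as emitted at the
# bottom of this file; assumes a 64-bit perl, never called here):
my @rem_4bit_sketch = map { $_ << 48 }
	0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
	0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0;

sub gmult_4bit_sketch {
	my ($X, $T) = @_;		# $X: 16-byte Xi string, $T: \@Htable
	my ($zhi, $zlo);
	my $first = 1;
	for my $pos (reverse 0 .. 15) {		# last byte of Xi first
		my $byte = ord(substr($X, $pos, 1));
		for my $nib ($byte & 0xf, $byte >> 4) {
			if ($first) {		# bootstrap: Z = Htbl[nib]
				($zhi, $zlo) = @{$T->[$nib]};
				$first = 0;
				next;
			}
			my $rem = $zlo & 0xf;	# the 4 bits shifted out below
			$zlo = (($zhi << 60) | ($zlo >> 4)) & 0xFFFFFFFFFFFFFFFF;
			$zhi = ($zhi >> 4) ^ $rem_4bit_sketch[$rem] ^ $T->[$nib][0];
			$zlo ^= $T->[$nib][1];
		}
	}
	return ($zhi, $zlo);	# the asm byte-swaps these before storing to Xi
}
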
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set noat
.set noreorder
.globl gcm_gmult_4bit
.align 4
.ent gcm_gmult_4bit
gcm_gmult_4bit:
	.frame sp,0,ra
	.prologue 0

	ldq $Xlo,8($Xi)
	ldq $Xhi,0($Xi)

	bsr $t0,picmeup
	nop
___

	&loop();

$code.=<<___;
	srl $Zlo,24,$t0 # byte swap
	srl $Zlo,8,$t1

	sll $Zlo,8,$t2
	sll $Zlo,24,$Zlo
	zapnot $t0,0x11,$t0
	zapnot $t1,0x22,$t1

	zapnot $Zlo,0x88,$Zlo
	or $t0,$t1,$t0
	zapnot $t2,0x44,$t2

	or $Zlo,$t0,$Zlo
	srl $Zhi,24,$t0
	srl $Zhi,8,$t1

	or $Zlo,$t2,$Zlo
	sll $Zhi,8,$t2
	sll $Zhi,24,$Zhi

	srl $Zlo,32,$Xlo
	sll $Zlo,32,$Zlo

	zapnot $t0,0x11,$t0
	zapnot $t1,0x22,$t1
	or $Zlo,$Xlo,$Xlo

	zapnot $Zhi,0x88,$Zhi
	or $t0,$t1,$t0
	zapnot $t2,0x44,$t2

	or $Zhi,$t0,$Zhi
	or $Zhi,$t2,$Zhi

	srl $Zhi,32,$Xhi
	sll $Zhi,32,$Zhi

	or $Zhi,$Xhi,$Xhi
	stq $Xlo,8($Xi)
	stq $Xhi,0($Xi)

	ret (ra)
.end gcm_gmult_4bit
___
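
# The srl/sll/zapnot cascade above is a branch-free 64-bit byte swap
# (Alpha is little-endian, Xi is stored big-endian): bytes are first
# reversed within each 32-bit half, then the halves are exchanged.
# A hedged Perl model of the same trick (masks = the zapnot byte masks;
# assumes a 64-bit perl, never called here):
sub bswap64_sketch {
	my $z = shift;
	my $r = (($z >> 24) & 0x000000FF000000FF)	# zapnot 0x11
	      | (($z >>  8) & 0x0000FF000000FF00)	# zapnot 0x22
	      | (($z <<  8) & 0x00FF000000FF0000)	# zapnot 0x44
	      | (($z << 24) & 0xFF000000FF000000);	# zapnot 0x88
	return (($r >> 32) | ($r << 32)) & 0xFFFFFFFFFFFFFFFF;
}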

$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl gcm_ghash_4bit
.align 4
.ent gcm_ghash_4bit
gcm_ghash_4bit:
	lda sp,-32(sp)
	stq ra,0(sp)
	stq s0,8(sp)
	stq s1,16(sp)
	.mask 0x04000600,-32
	.frame sp,32,ra
	.prologue 0

	ldq_u $inhi,0($inp)
	ldq_u $Thi0,7($inp)
	ldq_u $inlo,8($inp)
	ldq_u $Tlo0,15($inp)
	ldq $Xhi,0($Xi)
	ldq $Xlo,8($Xi)

	bsr $t0,picmeup
	nop

.Louter:
	extql $inhi,$inp,$inhi
	extqh $Thi0,$inp,$Thi0
	or $inhi,$Thi0,$inhi
	lda $inp,16($inp)

	extql $inlo,$inp,$inlo
	extqh $Tlo0,$inp,$Tlo0
	or $inlo,$Tlo0,$inlo
	subq $len,16,$len

	xor $Xlo,$inlo,$Xlo
	xor $Xhi,$inhi,$Xhi
___
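
# gcm_ghash_4bit may be handed an unaligned $inp, so each block is
# fetched with the classic Alpha ldq_u/extql/extqh splice seen above:
# two aligned quadword loads are shifted and OR-ed into one unaligned
# 64-bit value. A hedged Perl model ($mem is a byte string standing in
# for memory; assumes in-range offsets and a 64-bit perl, never called):
sub ldq_u_sketch {
	my ($mem, $addr) = @_;
	my $lo = unpack("Q<", substr($mem, $addr & ~7, 8));	  # ldq_u 0(a)
	my $hi = unpack("Q<", substr($mem, ($addr + 7) & ~7, 8)); # ldq_u 7(a)
	my $sh = ($addr & 7) * 8;
	return $lo unless $sh;		# aligned: both loads coincide
	# extql shifts the low quadword down, extqh the high one up
	return (($lo >> $sh) | ($hi << (64 - $sh))) & 0xFFFFFFFFFFFFFFFF;
}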

	&loop();

$code.=<<___;
	srl $Zlo,24,$t0 # byte swap
	srl $Zlo,8,$t1

	sll $Zlo,8,$t2
	sll $Zlo,24,$Zlo
	zapnot $t0,0x11,$t0
	zapnot $t1,0x22,$t1

	zapnot $Zlo,0x88,$Zlo
	or $t0,$t1,$t0
	zapnot $t2,0x44,$t2

	or $Zlo,$t0,$Zlo
	srl $Zhi,24,$t0
	srl $Zhi,8,$t1

	or $Zlo,$t2,$Zlo
	sll $Zhi,8,$t2
	sll $Zhi,24,$Zhi

	srl $Zlo,32,$Xlo
	sll $Zlo,32,$Zlo
	beq $len,.Ldone

	zapnot $t0,0x11,$t0
	zapnot $t1,0x22,$t1
	or $Zlo,$Xlo,$Xlo
	ldq_u $inhi,0($inp)

	zapnot $Zhi,0x88,$Zhi
	or $t0,$t1,$t0
	zapnot $t2,0x44,$t2
	ldq_u $Thi0,7($inp)

	or $Zhi,$t0,$Zhi
	or $Zhi,$t2,$Zhi
	ldq_u $inlo,8($inp)
	ldq_u $Tlo0,15($inp)

	srl $Zhi,32,$Xhi
	sll $Zhi,32,$Zhi

	or $Zhi,$Xhi,$Xhi
	br zero,.Louter

.Ldone:
	zapnot $t0,0x11,$t0
	zapnot $t1,0x22,$t1
	or $Zlo,$Xlo,$Xlo

	zapnot $Zhi,0x88,$Zhi
	or $t0,$t1,$t0
	zapnot $t2,0x44,$t2

	or $Zhi,$t0,$Zhi
	or $Zhi,$t2,$Zhi

	srl $Zhi,32,$Xhi
	sll $Zhi,32,$Zhi

	or $Zhi,$Xhi,$Xhi

	stq $Xlo,8($Xi)
	stq $Xhi,0($Xi)

	.set noreorder
	/*ldq ra,0(sp)*/
	ldq s0,8(sp)
	ldq s1,16(sp)
	lda sp,32(sp)
	ret (ra)
.end gcm_ghash_4bit

.align 4
.ent picmeup
picmeup:
	.frame sp,0,$t0
	.prologue 0
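	# PIC: "br" deposits the address of the next instruction (.Lpic)
	# into $rem_4bit; the lda then adds 12 to step over the lda, ret
	# and nop (3 x 4 bytes) and land on the rem_4bit table below.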
	br $rem_4bit,.Lpic
.Lpic:	lda $rem_4bit,12($rem_4bit)
	ret ($t0)
.end picmeup
	nop
rem_4bit:
	.long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 4

___
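# The 128-byte shared table above (rem_4bit) folds the four bits that
# fall off Z on each 4-bit shift back in, modulo the GCM polynomial
# (0xE1 << 120). A sketch that regenerates its sixteen constants
# (illustrative only; @rem_check is not used by this generator):
my @rem_check = (0) x 16;
$rem_check[8] = 0xE100;			# a single reduced top bit
$rem_check[$_] = $rem_check[2 * $_] >> 1 for (4, 2, 1);
for my $i (3, 5, 6, 7, 9 .. 15) {	# the table is GF(2)-linear
	$rem_check[$i] ^= $rem_check[$_] for grep { $i & $_ } (1, 2, 4, 8);
}
# each 16-bit constant occupies bits 48..63 of a little-endian quadword,
# hence the "0,0xNNNN<<16" .long pairs in the table above
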
$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";