1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 |
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # March 2010
|
---|
18 | #
|
---|
19 | # The module implements "4-bit" GCM GHASH function and underlying
|
---|
20 | # single multiplication operation in GF(2^128). "4-bit" means that it
|
---|
21 | # uses 256 bytes per-key table [+128 bytes shared table]. Streamed
|
---|
22 | # GHASH performance was measured to be 6.67 cycles per processed byte
|
---|
23 | # on Itanium 2, which is >90% better than Microsoft compiler generated
|
---|
24 | # code. To anchor to something else sha1-ia64.pl module processes one
|
---|
25 | # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
|
---|
26 | # byte.
|
---|
27 |
|
---|
# September 2010
#
# It was originally thought that it made little sense to implement the
# "528B" variant on Itanium 2, for the following reason. Because the
# number of functional units is naturally limited, it appeared
# impossible to implement the "528B" loop in 4 cycles, only in 5. This
# would mean that theoretically the performance improvement couldn't be
# more than 20%. But occasionally you prove yourself wrong:-) I figured
# out a way to fold a couple of instructions, and freed yet another
# instruction slot by unrolling the loop... The resulting performance is
# 4.45 cycles per processed byte, which is 50% better than the "256B"
# version. On the original Itanium, performance should remain the same
# as the "256B" version, i.e. ~8.5 cycles.
41 |
|
---|
# The last command-line argument (if any) names the output file; STDOUT is
# re-opened onto it so that the final "print $code" writes there.
$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");

# Select the instruction used for pointer arithmetic on incoming arguments.
# On HP-UX the default is "addp4"; plain "add" is used when one of the
# remaining arguments contains +DD64 or -mlp64 (presumably the compiler's
# 64-bit data-model switches -- confirm against the HP-UX toolchains).
# The pattern is an alternation on purpose: a bracketed character class
# here would match any single one of "+D|-mlp" followed by "64" and could
# misfire on unrelated flags such as -DFOO_D64.
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else { $ADDP="add"; }

# Target endianness: an explicit -DB_ENDIAN / -DL_ENDIAN among the arguments
# wins (the last one seen, with -DL_ENDIAN tested second); otherwise fall
# back to the byte order of the machine running this script.
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
              $big_endian=0 if (/\-DL_ENDIAN/); }
if (!defined($big_endian))
             { $big_endian=(unpack('L',pack('N',1))==1); }
52 |
|
---|
# loop(LABEL, MASK_INP)
#
# Appends the software-pipelined "4-bit" multiplication loop to $code.
#
#   LABEL     - emitted as the loop's entry label and used as the target of
#               the closing br.ctop.
#   MASK_INP  - when true, the two instructions that touch the input stream
#               (the ld1 from [inp] and the xor with in[1]) are predicated
#               on p63 instead of p16/p17, i.e. references to inp are
#               masked; when false they use the regular stage predicates.
#
# Declared without a prototype: the sub takes two arguments, and an empty
# "()" prototype would only be usable through the &-sigil call form.
sub loop {
    my ($label,$mask_inp)=@_;
    my ($p16,$p17)=$mask_inp?("p63","p63"):("p16","p17");	# mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
(p19) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p19) xor Zhi=Zhi,Hhi
($p17) xor xi[1]=xi[1],in[1] };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
(p19) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p19) ld8 rem=[rem]
(p18) and Hi[1]=mask0xf0,xi[2] };;
{ .mmi; ($p16) ld1 in[0]=[inp],-1
(p18) xor Zlo=Zlo,Hlo
(p19) shr.u Zhi=Zhi,4 }
{ .mib; (p19) xor Hhi=Hhi,rem
(p18) add Hi[1]=Htbl,Hi[1] };;

{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
(p18) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
(p18) xor Zhi=Zhi,Hhi };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
(p18) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p18) ld8 rem=[rem]
(p17) and Hi[0]=mask0xf0,Hi[0] };;
{ .mmi; (p16) ld1 xi[0]=[Xi],-1
(p18) xor Zlo=Zlo,Hlo
(p18) shr.u Zhi=Zhi,4 }
{ .mib; (p18) xor Hhi=Hhi,rem
(p17) add Hi[0]=Htbl,Hi[0]
br.ctop.sptk $label };;
___
}
95 |
|
---|
# Start of the generated assembly: symbolic register aliases followed by
# gcm_gmult_4bit. This heredoc *assigns* $code (it is the first fragment);
# everything after it appends.
#
# What the template below sets up, as far as the text itself shows:
#   - Xi is derived from in0 (+15, i.e. &Xi[15]) and Htbl from in1 (+8),
#     both through $ADDP chosen at the top of the script;
#   - ar.pfs, ar.lc and pr are saved in prevfs/prevlc/prevpr;
#   - rem_4bitp is computed ip-relative: "mov rem_4bitp=ip" in the first
#     bundle, then the rem_4bit#-gcm_gmult_4bit# offset is added;
#   - the rotating loop is primed with pr.rot=0x7<<16, ar.lc=13, ar.ec=3.
# The register alias "inp" is declared here but never initialised by this
# function, which is why the loop body is generated with its inp
# references masked (second argument to loop() is 1).
$code=<<___;
.explicit
.text

prevfs=r2; prevlc=r3; prevpr=r8;
mask0xf0=r21;
rem=r22; rem_4bitp=r23;
Xi=r24; Htbl=r25;
inp=r26; end=r27;
Hhi=r28; Hlo=r29;
Zhi=r30; Zlo=r31;

.align 128
.skip 16 // aligns loop body
.global gcm_gmult_4bit#
.proc gcm_gmult_4bit#
gcm_gmult_4bit:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,2,6,0,8
$ADDP Xi=15,in0 // &Xi[15]
mov rem_4bitp=ip }
{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
.save ar.lc,prevlc
mov prevlc=ar.lc
.save pr,prevpr
mov prevpr=pr };;

.body
.rotr in[3],xi[3],Hi[2]

{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
mov mask0xf0=0xf0
brp.loop.imp .Loop1,.Lend1-16};;
{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
};;
{ .mii; shladd Hi[1]=xi[2],4,r0
mov pr.rot=0x7<<16
mov ar.lc=13 };;
{ .mii; and Hi[1]=mask0xf0,Hi[1]
mov ar.ec=3
xor Zlo=Zlo,Zlo };;
{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
xor Zhi=Zhi,Zhi };;
___
# Emit the pipelined loop body under the label .Loop1 (the brp hint above
# refers to .Loop1 and .Lend1, so both labels must keep these names).
&loop (".Loop1",1);
# Epilogue: fold in the last Hhi, byte-reverse Zlo/Zhi with mux1 \@rev
# (these two instructions are rewritten to nop.i for big-endian targets by
# the substitution at the end of the script), store both halves through
# pointers derived from Xi, restore pr and ar.lc, and return.
$code.=<<___;
.Lend1:
{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
{ .mib; mux1 Zlo=Zlo,\@rev };;
{ .mib; mux1 Zhi=Zhi,\@rev };;
{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
add Hhi=1,Xi };; // pipeline flush on Itanium
{ .mib; st8 [Hlo]=Zlo
mov pr=prevpr,0x1ffff };;
{ .mib; st8 [Hhi]=Zhi
mov ar.lc=prevlc
br.ret.sptk.many b0 };;
.endp gcm_gmult_4bit#
___
157 |
|
---|
######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
# Register names interpolated into the gcm_ghash_4bit template below:
# the four incoming arguments, two locals, and the pair of
# user-mask instructions used to switch byte order around the loop
# (plain nop.m placeholders when generating for a big-endian target).
($Xip,$Htbl,$inp,$len) = map { "in$_" } 0..3;
($rem_8bit,$mask0xff) = ("loc0","loc1");
if ($big_endian) {
    ($sum,$rum) = ("nop.m") x 2;
} else {
    ($sum,$rum) = ("sum","rum");
}
168 |
|
---|
# load_htable(INSN, ...)
#
# Appends eight groups of two bundles to $code. Group $i loads
# Htable[$i] into general registers r(16+2*$i) / r(16+2*$i+1) through
# r9/r8 and Htable[8+$i] into floating-point registers f(32+2*$i) /
# f(32+2*$i+1) through r11/r10, post-incrementing each pointer by 16.
# The `...` arithmetic is left in the text and evaluated by the final
# substitution pass at the end of the script.
#
# Each argument is one extra instruction (a string without trailing
# newline) placed right before the closing "};;" of a group's second
# bundle. Arguments are consumed so that the last one lands in the last
# group: with eight arguments every group receives one, with k < 8 only
# the final k groups do. More than eight arguments never satisfy the
# test below, so none would be emitted.
#
# Declared without a prototype: the sub takes arguments, and an empty
# "()" prototype would only be usable through the &-sigil call form.
sub load_htable {
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
___
	# $#_ shrinks by one with every shift while $i grows by one, so
	# once the sum reaches 7 it stays there until @_ is exhausted.
	$code.=shift if (($i+$#_)==7);
	$code.="\t};;\n"
    }
}
181 |
|
---|
# gcm_ghash_4bit prologue. $Xip/$Htbl/$inp/$len/$rem_8bit/$mask0xff hold
# register names assigned earlier in the script. The template saves
# ar.pfs and sp (prevsp aliases r3), captures ip in $rem_8bit for later
# ip-relative addressing of the rem_8bit table, and points r8..r11 at
# offsets 0, 8, 128 and 136 from the caller's Htbl.
$code.=<<___;
prevsp=r3;

.align 32
.skip 16 // aligns loop body
.global gcm_ghash_4bit#
.proc gcm_ghash_4bit#
gcm_ghash_4bit:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,4,2,0,0
.vframe prevsp
mov prevsp=sp
mov $rem_8bit=ip };;
.body
{ .mfi; $ADDP r8=0+0,$Htbl
$ADDP r9=0+8,$Htbl }
{ .mfi; $ADDP r10=128+0,$Htbl
$ADDP r11=128+8,$Htbl };;
___
# Load the caller's Htable into registers. The eight strings are extra
# instructions interleaved into the load bundles, one per group: pointer
# setup for Xi/inp/len, the 0xff mask, and carving out a 512-byte stack
# frame (sp is masked with the complement of $mask0xff, per the original
# "align stack frame" note) with r14/r15 pointing at its first 16 bytes.
&load_htable(
    " $ADDP $Xip=15,$Xip", # &Xi[15]
    " $ADDP $len=$len,$inp", # &inp[len]
    " $ADDP $inp=15,$inp", # &inp[15]
    " mov $mask0xff=0xff",
    " add sp=-512,sp",
    " andcm sp=sp,$mask0xff", # align stack frame
    " add r14=0,sp",
    " add r15=8,sp");
# $sum is "sum" for little-endian targets and "nop.m" for big-endian ones
# (see where it is assigned). r8..r11 are re-pointed into the upper half of
# the new stack frame (sp+256 onwards) and $len is pulled back by 17.
$code.=<<___;
{ .mmi; $sum 1<<1 // go big-endian
add r8=256+0,sp
add r9=256+8,sp }
{ .mmi; add r10=256+128+0,sp
add r11=256+128+8,sp
add $len=-17,$len };;
___
# Build the on-stack tables. Per iteration: store Htable[$i] (general
# registers) via r8/r9 and Htable[8+$i] (floating-point registers) via
# r10/r11 into the upper part of the frame, and store the same Htable[$i]
# entry shifted right by 4 bits (shrp/shr.u) via r14/r15 into the lower
# part -- the "Hshr4" table referred to in the comments.
# $rlo/$rhi are computed in Perl; the `...` expressions are left for the
# final substitution pass.
for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
    my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
st8 [r9]=$rhi,16 // Htable[$i].hi
shrp $rlo=$rhi,$rlo,4 }//;;
{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
shr.u $rhi=$rhi,4 };;
{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
___
}
# Entries 8..15 were stored from floating-point registers above; they are
# now read back into general registers so they can be shifted. Note that
# "rum 1<<5" is emitted literally here, not through $rum, so it is present
# for both endiannesses.
$code.=<<___;
{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
ld8 r17=[r9],16 };; // Htable[8].hi
{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
ld8 r19=[r9],16 } // Htable[9].hi
{ .mmi; rum 1<<5 // clear um.mfh
shrp r16=r17,r16,4 };;
___
# $i is deliberately NOT lexically scoped to this loop: the heredoc that
# follows the loop keeps using it with the value it has on exit (6).
for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
$code.=<<___;
{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
___
}
# Tail of the second half, generated with $i==6: finishes the last two
# shifted entries without issuing further loads, repoints $Htbl at the
# on-stack copy (sp+256) and turns $rem_8bit from the captured ip into the
# address of the rem_8bit table.
$code.=<<___;
{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
{ .mmi; add $Htbl=256,sp // &Htable[0]
add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
___
261 |
|
---|
# Register names for the unrolled main loop. @xi and @rem are two-element
# lists that get "rotated" (first element moved to the end) after every
# emitted fragment, so $xi[0]/$xi[1] and $rem[0]/$rem[1] alternate between
# the two physical registers from one fragment to the next. This is done
# in Perl at generation time; the emitted code uses fixed register names.
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

# The (pNN) tags in the Perl and assembly comments name the pipeline stage
# an instruction belongs to in the full fragment (see the for-loop further
# down); they are annotations only, not predicates in the emitted code.
# First fragment: only stage p16 work (initial byte loads) plus clearing p6.
$code.=<<___; # (p16)
{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
cmp.eq p0,p6=r0,r0 };; // clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

# Second fragment, which also carries the .LOOP label that the branch at
# the end of the function jumps back to. The p6-predicated store at .LOOP
# is skipped on first entry because p6 was cleared above.
$code.=<<___; # (p16),(p17)
{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
.align 32
.LOOP:
{ .mmi;
(p6) st8 [$Xip]=$Zhi,13
xor $Zlo=$Zlo,$Zlo
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

# Third fragment: stage p18 work appears for the first time. Z.hi is
# initialised with a plain mov here, where the steady-state fragment
# uses an xor.
$code.=<<___; # (p16),(p17),(p18)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
ld1 $in=[$inp],-1 } //(p16) *inp--
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
310 |
|
---|
# Steady state: the full four-stage fragment is emitted 13 times
# ($i = 1..13). $i only counts repetitions -- it is not interpolated into
# the template -- while @xi/@rem are rotated after each copy so that
# consecutive copies alternate register assignments.
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___; # (p16),(p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
ld1 $in=[$inp],-1 //(p16) *inp--
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
    push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
}
342 |
|
---|
# Pipeline drain, first step: same as the steady-state fragment but with
# the stage p16 loads (ld1 from inp / Xi) removed.
$code.=<<___; # (p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

# Second drain step. It differs from the fragments above in more than
# the removed stages: [$Btbl] is read directly (no -256 rebasing onto the
# shifted table), the remainder index is taken from Z.lo<<4, and Z is
# shifted by 4 bits rather than 8.
$code.=<<___; # (p18),(p19)
{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

# Final step and loop control. p6 is set when $inp is (unsigned) below
# $len; $inp is then advanced by 32 and, under p6, the [p16]/[p17] tagged
# instructions restart the pipeline for the next round, taking the two
# low bytes of the new Xi directly from Zlo, before branching back to
# .LOOP. Z.lo is stored unconditionally here; Z.hi is stored either by the
# p6-predicated st8 at .LOOP or by the st8 after the branch on exit.
# The exit path restores the byte order through $rum (a nop.m when
# generating for big-endian), restores sp from prevsp and returns.
$code.=<<___; # (p19)
{ .mmi; cmp.ltu p6,p0=$inp,$len
add $inp=32,$inp
shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
add $Xip=9,$Xip };; // &Xi.lo
{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
{ .mmi; st8 [$Xip]=$Zlo,-8
(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
{ .mmi;
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
{ .mib;
(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
(p6) br.cond.dptk.many .LOOP };;

{ .mib; st8 [$Xip]=$Zhi };;
{ .mib; $rum 1<<1 // return to little-endian
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp gcm_ghash_4bit#
___
# Constant tables appended after the code.
#   rem_4bit: 16 eight-byte entries (128 bytes), each a 16-bit constant
#             pre-shifted left by 48. The table is aligned to 128 bytes,
#             which the 'dep'-based indexing in loop() depends on.
#   rem_8bit: 256 two-byte entries (512 bytes) written as byte pairs; the
#             ghash code indexes it with shladd ...,1,... and reads
#             entries with ld2.
# The trailing stringz is an identification string embedded in the
# output; \@ keeps Perl from interpolating an array inside the heredoc.
$code.=<<___;
.align 128
.type rem_4bit#,\@object
rem_4bit:
data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size rem_4bit#,128
.type rem_8bit#,\@object
rem_8bit:
data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size rem_8bit#,512
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
465 |
|
---|
# Post-process the assembled template and write it out.

# For big-endian targets the byte-reversing "mux1 ...,@rev" instructions
# are replaced by no-ops that occupy the same slot.
if ($big_endian) {
    $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm;
}

# Replace every `expr` left in the template by the value Perl computes
# for it (register numbers and table indices).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
# Buffered write errors only surface on close, hence the check.
close STDOUT or die "error closing STDOUT: $!";