1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # December 2011
|
---|
18 | #
|
---|
19 | # The module implements GCM GHASH function and underlying single
|
---|
20 | # multiplication operation in GF(2^128). Even though subroutines
|
---|
21 | # have _4bit suffix, they are not using any tables, but rely on
|
---|
22 | # hardware Galois Field Multiply support. Streamed GHASH processes
|
---|
23 | # byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
|
---|
24 | # code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
|
---|
25 | # comparing apples vs. oranges, but compiler surely could have done
|
---|
26 | # better, because theoretical [though not necessarily achievable]
|
---|
27 | # estimate for "4-bit" table-driven implementation is ~12 cycles.
|
---|
28 |
|
---|
# Pick the output file from the command line: arguments are consumed from
# the front until one ends in something that looks like "name.ext" (or the
# argument list runs out, in which case $output is left false).
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to that file.  The three-argument form keeps characters in
# the name from being parsed as an open mode, and a failure is now fatal
# instead of being silently ignored.  When no file name was supplied the
# redirect is skipped and the generated code goes to the inherited STDOUT,
# which is what the unchecked open effectively did before.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
|
---|
31 |
|
---|
# Symbolic names for the machine registers used by the generated code.
# Each variable holds a register name that is interpolated into the
# assembly here-documents further down.

# Incoming arguments.
($Xip, $Htable, $inp, $len) = qw(A4 B4 A6 B6);

# Accumulator Z, hash key H and first set of partial products: A16..A27.
($Z0,  $Z1,  $Z2,  $Z3,
 $H0,  $H1,  $H2,  $H3,
 $H0x, $H1x, $H2x, $H3x) = map { "A$_" } 16 .. 27;

# Upper-byte operands and remaining partial products: B16..B27.
($H01u, $H01y, $H2u, $H3u,
 $H0y,  $H1y,  $H2y, $H3y,
 $H0z,  $H1z,  $H2z, $H3z) = map { "B$_" } 16 .. 27;

# Constant holders (upper-byte mask and reduction polynomial).
($FF000000, $E10000) = qw(B30 B31);

# Working registers.  Note the deliberate overlaps with the arguments:
# $xip is B6 and therefore zaps $len, $rem is B4 and therefore zaps $Htable.
($xip, $x0, $x1, $xib) = map { "B$_" } 6 .. 9;
$xia = 'A9';
($rem, $res) = qw(B4 B5);
|
---|
42 |
|
---|
# First part of the generated assembly, appended to $code: section and
# symbol set-up, the two entry points, and the SPLOOPD packet that opens
# ghash_loop?.  The register-name variables assigned above are interpolated
# into the text.  Everything up to the closing marker is emitted output, not
# Perl, so Perl comments cannot be placed inside it.
#
#  - For assembler versions below 7000000, __TI_EABI__ is forced to 0; when
#    __TI_EABI__ is non-zero the underscore-prefixed symbols used for the
#    labels are mapped to the unprefixed names with .asg.
#  - RA is defined as an alias for B3.
#  - _gcm_gmult_1bit is wrapped in ".if 0", i.e. it is not assembled.
#  - _gcm_gmult_4bit loads H from $Htable[-1] and [-2], pre-loads Xi[15] and
#    Xi[14], zeroes Z, sets B0 to 1 ("take a single spin") and branches to
#    ghash_loop?.
#  - _gcm_ghash_4bit sets B0 to $len>>4, loads Xi into Z, XORs in the first
#    16 input bytes when B0 is non-zero, stores Z back through $xip and then
#    falls through into ghash_loop?.
43 | $code.=<<___;
|
---|
44 | .text
|
---|
45 |
|
---|
46 | .if .ASSEMBLER_VERSION<7000000
|
---|
47 | .asg 0,__TI_EABI__
|
---|
48 | .endif
|
---|
49 | .if __TI_EABI__
|
---|
50 | .asg gcm_gmult_1bit,_gcm_gmult_1bit
|
---|
51 | .asg gcm_gmult_4bit,_gcm_gmult_4bit
|
---|
52 | .asg gcm_ghash_4bit,_gcm_ghash_4bit
|
---|
53 | .endif
|
---|
54 |
|
---|
55 | .asg B3,RA
|
---|
56 |
|
---|
57 | .if 0
|
---|
58 | .global _gcm_gmult_1bit
|
---|
59 | _gcm_gmult_1bit:
|
---|
60 | ADDAD $Htable,2,$Htable
|
---|
61 | .endif
|
---|
62 | .global _gcm_gmult_4bit
|
---|
63 | _gcm_gmult_4bit:
|
---|
64 | .asmfunc
|
---|
65 | LDDW *${Htable}[-1],$H1:$H0 ; H.lo
|
---|
66 | LDDW *${Htable}[-2],$H3:$H2 ; H.hi
|
---|
67 | || MV $Xip,${xip} ; reassign Xi
|
---|
68 | || MVK 15,B1 ; SPLOOPD constant
|
---|
69 |
|
---|
70 | MVK 0xE1,$E10000
|
---|
71 | || LDBU *++${xip}[15],$x1 ; Xi[15]
|
---|
72 | MVK 0xFF,$FF000000
|
---|
73 | || LDBU *--${xip},$x0 ; Xi[14]
|
---|
74 | SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
|
---|
75 | SHL $FF000000,24,$FF000000 ; upper byte mask
|
---|
76 | || BNOP ghash_loop?
|
---|
77 | || MVK 1,B0 ; take a single spin
|
---|
78 |
|
---|
79 | PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
|
---|
80 | AND $H2,$FF000000,$H2u ; H2's upper byte
|
---|
81 | AND $H3,$FF000000,$H3u ; H3's upper byte
|
---|
82 | || SHRU $H2u,8,$H2u
|
---|
83 | SHRU $H3u,8,$H3u
|
---|
84 | || ZERO $Z1:$Z0
|
---|
85 | SHRU2 $xia,8,$H01u
|
---|
86 | || ZERO $Z3:$Z2
|
---|
87 | .endasmfunc
|
---|
88 |
|
---|
89 | .global _gcm_ghash_4bit
|
---|
90 | _gcm_ghash_4bit:
|
---|
91 | .asmfunc
|
---|
92 | LDDW *${Htable}[-1],$H1:$H0 ; H.lo
|
---|
93 | || SHRU $len,4,B0 ; reassign len
|
---|
94 | LDDW *${Htable}[-2],$H3:$H2 ; H.hi
|
---|
95 | || MV $Xip,${xip} ; reassign Xi
|
---|
96 | || MVK 15,B1 ; SPLOOPD constant
|
---|
97 |
|
---|
98 | MVK 0xE1,$E10000
|
---|
99 | || [B0] LDNDW *${inp}[1],$H1x:$H0x
|
---|
100 | MVK 0xFF,$FF000000
|
---|
101 | || [B0] LDNDW *${inp}++[2],$H3x:$H2x
|
---|
102 | SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
|
---|
103 | || LDDW *${xip}[1],$Z1:$Z0
|
---|
104 | SHL $FF000000,24,$FF000000 ; upper byte mask
|
---|
105 | || LDDW *${xip}[0],$Z3:$Z2
|
---|
106 |
|
---|
107 | PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
|
---|
108 | AND $H2,$FF000000,$H2u ; H2's upper byte
|
---|
109 | AND $H3,$FF000000,$H3u ; H3's upper byte
|
---|
110 | || SHRU $H2u,8,$H2u
|
---|
111 | SHRU $H3u,8,$H3u
|
---|
112 | SHRU2 $xia,8,$H01u
|
---|
113 |
|
---|
114 | || [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|
---|
115 | || [B0] XOR $H1x,$Z1,$Z1
|
---|
116 | .if .LITTLE_ENDIAN
|
---|
117 | [B0] XOR $H2x,$Z2,$Z2
|
---|
118 | || [B0] XOR $H3x,$Z3,$Z3
|
---|
119 | || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
|
---|
120 | STDW $Z1:$Z0,*${xip}[1]
|
---|
121 | || [B0] SHRU $Z1,16,$x0 ; Xi[14]
|
---|
122 | || [B0] ZERO $Z1:$Z0
|
---|
123 | .else
|
---|
124 | [B0] XOR $H2x,$Z2,$Z2
|
---|
125 | || [B0] XOR $H3x,$Z3,$Z3
|
---|
126 | || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
|
---|
127 | STDW $Z1:$Z0,*${xip}[1]
|
---|
128 | || [B0] SHRU $Z0,8,$x0 ; Xi[14]
|
---|
129 | || [B0] ZERO $Z1:$Z0
|
---|
130 | .endif
|
---|
131 | STDW $Z3:$Z2,*${xip}[0]
|
---|
132 | || [B0] ZERO $Z3:$Z2
|
---|
133 | || [B0] MV $xia,$x1
|
---|
134 | [B0] ADDK 14,${xip}
|
---|
135 |
|
---|
136 | ghash_loop?:
|
---|
137 | SPLOOPD 6 ; 6*16+7
|
---|
138 | || MVC B1,ILC
|
---|
139 | || [B0] SUB B0,1,B0
|
---|
140 | || ZERO A0
|
---|
141 | || ADD $x1,$x1,$xib ; SHL $x1,1,$xib
|
---|
142 | || SHL $x1,1,$xia
|
---|
143 | ___
|
---|
144 | |
---|
145 |
|
---|
146 | ########____________________________
|
---|
147 | # 0 D2. M1 M2 |
|
---|
148 | # 1 M1 |
|
---|
149 | # 2 M1 M2 |
|
---|
150 | # 3 D1. M1 M2 |
|
---|
151 | # 4 S1. L1 |
|
---|
152 | # 5 S2 S1x L1 D2 L2 |____________________________
|
---|
153 | # 6/0 L1 S1 L2 S2x |D2. M1 M2 |
|
---|
154 | # 7/1 L1 S1 D1x S2 M2 | M1 |
|
---|
155 | # 8/2 S1 L1x S2 | M1 M2 |
|
---|
156 | # 9/3 S1 L1x | D1. M1 M2 |
|
---|
157 | # 10/4 D1x | S1. L1 |
|
---|
158 | # 11/5 |S2 S1x L1 D2 L2 |____________
|
---|
159 | # 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
|
---|
160 | # 7/1 L1 S1 D1x S2 M2 | ....
|
---|
161 | # 8/2 S1 L1x S2 | ....
|
---|
162 | #####... ................|............
|
---|
# Second part of the generated assembly, appended to $code: the body of the
# loop opened by the SPLOOPD packet emitted earlier, the per-block epilogue
# and the .const trailer.  As with the first part, the text up to the closing
# marker is emitted output with register-name variables interpolated; Perl
# comments cannot be placed inside it.
#
#  - The numeric tags in the assembly comments ("; 0", "; 6/0", "; 12/6/0",
#    "; 8/-" ...) appear to be the cycle labels used in the schedule diagram
#    in the comment block preceding this statement -- confirm before relying
#    on them when re-scheduling.
#  - Loop body (up to SPKERNEL): XORMPY products of the H words with the
#    current Xi byte shifted left by one are XORed into Z, Z is shifted right
#    by 8 bits (SHRMB/SHRU), and the XORMPY of $E10000 with $rem is XORed
#    into $Z3 as $res.
#  - After SPKERNEL: when B0 is non-zero the next 16 input bytes are loaded,
#    XORed into Z and control branches back to ghash_loop?; when B0 is zero
#    control branches to RA.  Z is stored back through $xip in both cases.
#    The .LITTLE_ENDIAN variant additionally reorders Z with SWAP2/SWAP4
#    before the store.
#  - The trailer places an identification string in the .const section.
163 | $code.=<<___;
|
---|
164 | XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
|
---|
165 | || XORMPY $H01u,$xib,$H01y
|
---|
166 | || [A0] LDBU *--${xip},$x0
|
---|
167 | XORMPY $H1,$xia,$H1x ; 1
|
---|
168 | XORMPY $H2,$xia,$H2x ; 2
|
---|
169 | || XORMPY $H2u,$xib,$H2y
|
---|
170 | XORMPY $H3,$xia,$H3x ; 3
|
---|
171 | || XORMPY $H3u,$xib,$H3y
|
---|
172 | ||[!A0] MVK.D 15,A0 ; *--${xip} counter
|
---|
173 | XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
|
---|
174 | || [A0] SUB.S A0,1,A0
|
---|
175 | XOR.L $H1x,$Z1,$Z1 ; 5
|
---|
176 | || AND.D $H01y,$FF000000,$H0z
|
---|
177 | || SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
|
---|
178 | || SHL $x0,1,$xib
|
---|
179 | || SHL $x0,1,$xia
|
---|
180 |
|
---|
181 | XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
|
---|
182 | || SHL $Z0,1,$rem ; ; rem=Z<<1
|
---|
183 | || SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
|
---|
184 | || AND.L $H1y,$FF000000,$H1z
|
---|
185 | XOR.L $H3x,$Z3,$Z3 ; 7/1
|
---|
186 | || SHRMB.S $Z2,$Z1,$Z1
|
---|
187 | || XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
|
---|
188 | || AND.S $H2y,$FF000000,$H2z
|
---|
189 | || XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
|
---|
190 | XOR.L $H1z,$Z1,$Z1 ; 8/2
|
---|
191 | || SHRMB.S $Z3,$Z2,$Z2
|
---|
192 | || AND.S $H3y,$FF000000,$H3z
|
---|
193 | XOR.L $H2z,$Z2,$Z2 ; 9/3
|
---|
194 | || SHRU $Z3,8,$Z3
|
---|
195 | XOR.D $H3z,$Z3,$Z3 ; 10/4
|
---|
196 | NOP ; 11/5
|
---|
197 |
|
---|
198 | SPKERNEL 0,2
|
---|
199 | || XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
|
---|
200 |
|
---|
201 | ; input pre-fetch is possible where D1 slot is available...
|
---|
202 | [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
|
---|
203 | [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
|
---|
204 | NOP ; 10/-
|
---|
205 | .if .LITTLE_ENDIAN
|
---|
206 | SWAP2 $Z0,$Z1 ; 11/-
|
---|
207 | || SWAP4 $Z1,$Z0
|
---|
208 | SWAP4 $Z1,$Z1 ; 12/-
|
---|
209 | || SWAP2 $Z0,$Z0
|
---|
210 | SWAP2 $Z2,$Z3
|
---|
211 | || SWAP4 $Z3,$Z2
|
---|
212 | ||[!B0] BNOP RA
|
---|
213 | SWAP4 $Z3,$Z3
|
---|
214 | || SWAP2 $Z2,$Z2
|
---|
215 | || [B0] BNOP ghash_loop?
|
---|
216 | [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|
---|
217 | || [B0] XOR $H1x,$Z1,$Z1
|
---|
218 | [B0] XOR $H2x,$Z2,$Z2
|
---|
219 | || [B0] XOR $H3x,$Z3,$Z3
|
---|
220 | || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
|
---|
221 | STDW $Z1:$Z0,*${xip}[1]
|
---|
222 | || [B0] SHRU $Z1,16,$x0 ; Xi[14]
|
---|
223 | || [B0] ZERO $Z1:$Z0
|
---|
224 | .else
|
---|
225 | [!B0] BNOP RA ; 11/-
|
---|
226 | [B0] BNOP ghash_loop? ; 12/-
|
---|
227 | [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|
---|
228 | || [B0] XOR $H1x,$Z1,$Z1
|
---|
229 | [B0] XOR $H2x,$Z2,$Z2
|
---|
230 | || [B0] XOR $H3x,$Z3,$Z3
|
---|
231 | || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
|
---|
232 | STDW $Z1:$Z0,*${xip}[1]
|
---|
233 | || [B0] SHRU $Z0,8,$x0 ; Xi[14]
|
---|
234 | || [B0] ZERO $Z1:$Z0
|
---|
235 | .endif
|
---|
236 | STDW $Z3:$Z2,*${xip}[0]
|
---|
237 | || [B0] ZERO $Z3:$Z2
|
---|
238 | || [B0] MV $xia,$x1
|
---|
239 | [B0] ADDK 14,${xip}
|
---|
240 | .endasmfunc
|
---|
241 |
|
---|
242 | .sect .const
|
---|
243 | .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
244 | .align 4
|
---|
245 | ___
|
---|
246 |
|
---|
# Write out the accumulated assembly text.
print $code;

# STDOUT is buffered, so write failures (disk full, I/O error) may only
# surface when the handle is flushed at close.  Make that fatal so a
# truncated output file is never mistaken for a successful run.
close STDOUT or die "error closing STDOUT: $!";
|
---|