#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2010.
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# was measured to be ~18 cycles per processed byte on z10, which is
# almost 40% better than gcc-generated code. It should be noted that
# 18 cycles is a worse result than expected: loop is scheduled for 12
# and the result should be close to 12. In the absence of instruction-
# level profiling data it's impossible to tell why...

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.

# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce correct
# result and therefore is engaged. On z196 it was measured to process
# 8KB buffer ~7x faster than software implementation. It's not as
# impressive for smaller buffer sizes and for the smallest 16-byte
# buffer it's actually almost 2 times slower, which is the reason why
# KIMD-GHASH is not used in gcm_gmult_4bit.
# The first command-line argument names the target flavour.
$flavour = shift;

# Flavours containing "31" or "32" use a 4-byte size_t and the plain
# stm/lm mnemonics; every other flavour uses an 8-byte size_t and the
# "g"-suffixed forms (the suffix is spliced in as stm${g}/lm${g}).
($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");
# Consume the remaining arguments until one looks like an output file
# name (a word followed by ".ext" at the very end).  If none does,
# $output ends up false and the generated code goes to the inherited
# STDOUT.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named.  The
# three-argument form keeps the name from being parsed for mode
# characters, and a failed open is reported instead of being silently
# ignored.
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
# When true, the hardware (KIMD) fragments guarded by !$softonly are
# left out of the generated code.
$softonly = 0;

# Register pair the code loads Xi into and stores it back from.
($Zhi, $Zlo) = ("%r0", "%r1");

# argument block
($Xi, $Htbl, $inp, $len) = ("%r2", "%r3", "%r4", "%r5");

# variables: %r6 through %r14, in this order
($rem0, $rem1, $nlo, $nhi, $xi, $cnt, $tmp, $x78, $rem_4bit) =
    map { "%r$_" } 6 .. 14;

# Base register for the save area used by stm/lm.
$sp = "%r15";
# Start the generated file: pull in the architecture header, switch to
# the text section and open the gcm_gmult_4bit entry point.
$code.=<<___;
#include "s390x_arch.h"

.text

.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
___
# Hardware (KIMD) variant of the single multiplication.  The constant
# "&& 0" in the condition means this fragment is never appended; it is
# kept for reference only (see the trailing comment and the March 2011
# note in the file header).
$code.=<<___ if(!$softonly && 0);	# hardware is slow for single block...
	larl	%r1,OPENSSL_s390xcap_P
	lghi	%r0,0
	lg	%r1,S390X_KIMD+8(%r1)	# load second word of kimd capabilities
					# vector
	tmhh	%r1,0x4000	# check for function 65
	jz	.Lsoft_gmult
	stg	%r0,16($sp)	# arrange 16 bytes of zero input
	stg	%r0,24($sp)
	lghi	%r0,S390X_GHASH	# function 65
	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
	la	$inp,16($sp)
	lghi	$len,16
	.long	0xb93e0004	# kimd %r0,$inp
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	32
.Lsoft_gmult:
___
# Software gmult: save %r6-%r14, set $len to 1, bias $Xi by -1 (the
# code addresses Xi as 0+1($Xi) / 8+1($Xi) from here on), load the low
# half of Xi and jump to .Lgmult_shortcut, which is emitted later inside
# gcm_ghash_4bit.  The same heredoc also closes gcm_gmult_4bit and opens
# the gcm_ghash_4bit entry point.
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	lghi	$len,1
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	j	.Lgmult_shortcut
.type	gcm_gmult_4bit,\@function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
___
# Hardware path, emitted unless $softonly is set: test the KIMD
# capability word for the GHASH function and, when it is available, run
# KIMD on the caller's arguments and return; otherwise branch to the
# software implementation at .Lsoft_ghash.
$code.=<<___ if(!$softonly);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,S390X_KIMD+8(%r1)	# load second word of kimd capabilities
					# vector
	tmhh	%r0,0x4000	# check for function 65
	jz	.Lsoft_ghash
	lghi	%r0,S390X_GHASH	# function 65
	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
	.long	0xb93e0004	# kimd %r0,$inp
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	32
.Lsoft_ghash:
___
# 31/32-bit flavours only: zero-extend the 32-bit $len to 64 bits before
# the 64-bit shift and loop instructions below use it.
$code.=<<___ if ($flavour =~ /3[12]/);
	llgfr	$len,$len
___
# Software GHASH.  $len is shifted right by 4 and then counted down by
# brctg, one .Louter iteration per 16 bytes of input; each iteration
# XORs the block at $inp into Xi and stores it back.  .Lgmult_shortcut
# is the entry point gcm_gmult_4bit jumps to.  .Lghash_inner is driven
# by $cnt (initialised to 14) and fetches one byte of Xi per pass.  The
# rem_4bit table (16 entries of two .long words each) follows the
# function; its backquoted expressions, like the one in lghi, are
# evaluated by the s///e pass at the end of the script.
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	srlg	$len,$len,4
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	lg	$Zhi,0+1($Xi)
	lghi	$tmp,0
.Louter:
	xg	$Zhi,0($inp)		# Xi ^= inp
	xg	$Zlo,8($inp)
	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)

.Lgmult_shortcut:
	lghi	$tmp,0xf0
	sllg	$nlo,$Zlo,4
	srlg	$xi,$Zlo,8		# extract second byte
	ngr	$nlo,$tmp
	lgr	$nhi,$Zlo
	lghi	$cnt,14
	ngr	$nhi,$tmp

	lg	$Zlo,8($nlo,$Htbl)
	lg	$Zhi,0($nlo,$Htbl)

	sllg	$nlo,$xi,4
	sllg	$rem0,$Zlo,3
	ngr	$nlo,$tmp
	ngr	$rem0,$x78
	ngr	$xi,$tmp

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	j	.Lghash_inner
.align	16
.Lghash_inner:
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	llgc	$xi,0($cnt,$Xi)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$nlo,$xi,4
	xg	$Zhi,0($rem0,$rem_4bit)
	nill	$nlo,0xf0
	sllg	$rem0,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem0,$x78
	nill	$xi,0xf0

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	xg	$Zhi,0($rem1,$rem_4bit)
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	brct	$cnt,.Lghash_inner

	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$xi,$Zlo,3
	xg	$Zhi,0($rem0,$rem_4bit)
	xgr	$Zlo,$tmp
	ngr	$xi,$x78

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	xgr	$Zlo,$tmp
	xg	$Zhi,0($rem1,$rem_4bit)

	lg	$tmp,0($xi,$rem_4bit)
	la	$inp,16($inp)
	sllg	$tmp,$tmp,4		# correct last rem_4bit[rem]
	brctg	$len,.Louter

	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)
	lm${g}	%r6,%r14,6*$SIZE_T($sp)
	br	%r14
.type	gcm_ghash_4bit,\@function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)

.align	64
rem_4bit:
	.long	`0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
	.long	`0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
	.long	`0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
	.long	`0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type	rem_4bit,\@object
.size	rem_4bit,(.-rem_4bit)
.string	"GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
# Replace every backquoted expression in the generated text with the
# result of evaluating it as Perl.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Write the finished assembly and make sure it really reached its
# destination: buffered write errors only surface at close time.
print STDOUT $code;
close(STDOUT) || die "error closing STDOUT: $!";