VirtualBox

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/modes/asm/ghash-c64xplus.pl@ 69881

Last change on this file since 69881 was 69881, checked in by vboxsync, 7 years ago

Update OpenSSL to 1.1.0g.
bugref:8070: src/libs maintenance

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
File size: 7.3 KB
Line 
1#! /usr/bin/env perl
2# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# December 2011
18#
19# The module implements GCM GHASH function and underlying single
20# multiplication operation in GF(2^128). Even though subroutines
21# have _4bit suffix, they are not using any tables, but rely on
22# hardware Galois Field Multiply support. Streamed GHASH processes
23# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
24# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
25# comparing apples vs. oranges, but compiler surely could have done
26# better, because theoretical [though not necessarily achievable]
27# estimate for "4-bit" table-driven implementation is ~12 cycles.
28
# Pick the output file name off the command line: arguments are discarded
# until one looks like "name.ext" (this skips e.g. a leading perlasm
# flavour argument).  $output ends up undefined/empty when no argument
# is left.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the requested file.  Without an output argument STDOUT
# is left untouched, so the generated assembly still goes to the inherited
# stream.  A named file that cannot be created is now a hard error instead
# of a silently ignored one; the three-argument open also keeps unusual
# characters in the file name from being interpreted as an open mode.
if (defined($output) && $output ne "") {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}
31
# Symbolic names for the C64x+ registers used by the generated code.

# Incoming arguments.
($Xip, $Htable, $inp, $len) = qw(A4 B4 A6 B6);

# A-side registers A16..A27: accumulator Z, hash key H and the H*x products.
($Z0,  $Z1,  $Z2,  $Z3,
 $H0,  $H1,  $H2,  $H3,
 $H0x, $H1x, $H2x, $H3x) = map { "A$_" } 16 .. 27;

# B-side registers B16..B27: upper-byte copies of H and their products.
($H01u, $H01y, $H2u, $H3u,
 $H0y,  $H1y,  $H2y, $H3y,
 $H0z,  $H1z,  $H2z, $H3z) = map { "B$_" } 16 .. 27;

# Constants built at run time: upper-byte mask and reduction polynomial.
($FF000000, $E10000) = qw(B30 B31);

# Xi pointer and byte temporaries.  $xip is B6, the register $len arrives
# in, so $len is gone once $xip is written.
($xip, $x0, $x1, $xib) = map { "B$_" } 6 .. 9;
$xia = "A9";

# Reduction temporaries.  $rem is B4, the register $Htable arrives in, so
# $Htable is gone once $rem is written.
($rem, $res) = qw(B4 B5);
42
# ----------------------------------------------------------------------
# First chunk of generated assembly.  The heredoc interpolates the register
# names assigned above and contains, in order:
#
# Directives
#   * __TI_EABI__ is defined as 0 when .ASSEMBLER_VERSION<7000000.  When it
#     is non-zero, the underscore-prefixed symbols used below are
#     substituted (.asg) by their unprefixed names.
#   * RA is a substitution symbol for B3; the second heredoc uses it as the
#     target of the final "BNOP RA" (presumably the return address --
#     confirm against the C6000 calling convention).
#   * The _gcm_gmult_1bit entry sits inside ".if 0" and is not assembled.
#
# _gcm_gmult_4bit
#   Loads H1:H0 and H3:H2 from the two doublewords just below $Htable,
#   copies $Xip to $xip, fetches Xi[15] into $x1 and Xi[14] into $x0,
#   builds the constants 0xE1<<16 ($E10000) and 0xFF<<24 ($FF000000),
#   derives the upper-byte copies $H01u/$H2u/$H3u, clears Z3..Z0, sets
#   B1=15 and B0=1 ("take a single spin") and branches to ghash_loop?.
#   NOTE(review): the execute packets placed after BNOP are presumably
#   running in the branch delay slots -- confirm against the CPU guide.
#
# _gcm_ghash_4bit (prologue only; its .endasmfunc is in the second heredoc)
#   B0 = $len>>4 and predicates every input-dependent instruction.  H and
#   the constants are set up as in _gcm_gmult_4bit.  Xi is loaded into
#   Z3..Z0 and, under [B0], XORed with the 16 input bytes fetched by the two
#   LDNDW (the second one post-increments $inp by two doublewords).  The
#   .LITTLE_ENDIAN and .else arms differ only in how Xi[15]/Xi[14] are
#   extracted ($Z1>>24 and $Z1>>16 versus $Z0 and $Z0>>8).  Xi is then
#   stored back unconditionally, Z is zeroed and $xip advanced by 14 under
#   [B0], and control falls through to ghash_loop?.
#
# ghash_loop?
#   Issues SPLOOPD 6 in parallel with ILC=B1, the conditional decrement of
#   B0, A0=0, and $xib = $xia = $x1<<1 (one copy per register file).
# ----------------------------------------------------------------------
43$code.=<<___;
44 .text
45
46 .if .ASSEMBLER_VERSION<7000000
47 .asg 0,__TI_EABI__
48 .endif
49 .if __TI_EABI__
50 .asg gcm_gmult_1bit,_gcm_gmult_1bit
51 .asg gcm_gmult_4bit,_gcm_gmult_4bit
52 .asg gcm_ghash_4bit,_gcm_ghash_4bit
53 .endif
54
55 .asg B3,RA
56
57 .if 0
58 .global _gcm_gmult_1bit
59_gcm_gmult_1bit:
60 ADDAD $Htable,2,$Htable
61 .endif
62 .global _gcm_gmult_4bit
63_gcm_gmult_4bit:
64 .asmfunc
65 LDDW *${Htable}[-1],$H1:$H0 ; H.lo
66 LDDW *${Htable}[-2],$H3:$H2 ; H.hi
67|| MV $Xip,${xip} ; reassign Xi
68|| MVK 15,B1 ; SPLOOPD constant
69
70 MVK 0xE1,$E10000
71|| LDBU *++${xip}[15],$x1 ; Xi[15]
72 MVK 0xFF,$FF000000
73|| LDBU *--${xip},$x0 ; Xi[14]
74 SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
75 SHL $FF000000,24,$FF000000 ; upper byte mask
76|| BNOP ghash_loop?
77|| MVK 1,B0 ; take a single spin
78
79 PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
80 AND $H2,$FF000000,$H2u ; H2's upper byte
81 AND $H3,$FF000000,$H3u ; H3's upper byte
82|| SHRU $H2u,8,$H2u
83 SHRU $H3u,8,$H3u
84|| ZERO $Z1:$Z0
85 SHRU2 $xia,8,$H01u
86|| ZERO $Z3:$Z2
87 .endasmfunc
88
89 .global _gcm_ghash_4bit
90_gcm_ghash_4bit:
91 .asmfunc
92 LDDW *${Htable}[-1],$H1:$H0 ; H.lo
93|| SHRU $len,4,B0 ; reassign len
94 LDDW *${Htable}[-2],$H3:$H2 ; H.hi
95|| MV $Xip,${xip} ; reassign Xi
96|| MVK 15,B1 ; SPLOOPD constant
97
98 MVK 0xE1,$E10000
99|| [B0] LDNDW *${inp}[1],$H1x:$H0x
100 MVK 0xFF,$FF000000
101|| [B0] LDNDW *${inp}++[2],$H3x:$H2x
102 SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
103|| LDDW *${xip}[1],$Z1:$Z0
104 SHL $FF000000,24,$FF000000 ; upper byte mask
105|| LDDW *${xip}[0],$Z3:$Z2
106
107 PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
108 AND $H2,$FF000000,$H2u ; H2's upper byte
109 AND $H3,$FF000000,$H3u ; H3's upper byte
110|| SHRU $H2u,8,$H2u
111 SHRU $H3u,8,$H3u
112 SHRU2 $xia,8,$H01u
113
114|| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
115|| [B0] XOR $H1x,$Z1,$Z1
116 .if .LITTLE_ENDIAN
117 [B0] XOR $H2x,$Z2,$Z2
118|| [B0] XOR $H3x,$Z3,$Z3
119|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
120 STDW $Z1:$Z0,*${xip}[1]
121|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
122|| [B0] ZERO $Z1:$Z0
123 .else
124 [B0] XOR $H2x,$Z2,$Z2
125|| [B0] XOR $H3x,$Z3,$Z3
126|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
127 STDW $Z1:$Z0,*${xip}[1]
128|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
129|| [B0] ZERO $Z1:$Z0
130 .endif
131 STDW $Z3:$Z2,*${xip}[0]
132|| [B0] ZERO $Z3:$Z2
133|| [B0] MV $xia,$x1
134 [B0] ADDK 14,${xip}
135
136ghash_loop?:
137 SPLOOPD 6 ; 6*16+7
138|| MVC B1,ILC
139|| [B0] SUB B0,1,B0
140|| ZERO A0
141|| ADD $x1,$x1,$xib ; SHL $x1,1,$xib
142|| SHL $x1,1,$xia
143___
144
145
146########____________________________
147# 0 D2. M1 M2 |
148# 1 M1 |
149# 2 M1 M2 |
150# 3 D1. M1 M2 |
151# 4 S1. L1 |
152# 5 S2 S1x L1 D2 L2 |____________________________
153# 6/0 L1 S1 L2 S2x |D2. M1 M2 |
154# 7/1 L1 S1 D1x S2 M2 | M1 |
155# 8/2 S1 L1x S2 | M1 M2 |
156# 9/3 S1 L1x | D1. M1 M2 |
157# 10/4 D1x | S1. L1 |
158# 11/5 |S2 S1x L1 D2 L2 |____________
159# 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
160# 7/1 L1 S1 D1x S2 M2 | ....
161# 8/2 S1 L1x S2 | ....
162#####... ................|............
# ----------------------------------------------------------------------
# Second chunk of generated assembly: the body of the SPLOOPD loop opened
# at ghash_loop?, the per-block epilogue of _gcm_ghash_4bit (including its
# .endasmfunc) and the .const identification string.
#
# Loop body (the numbers in the trailing asm comments are the cycle slots
# drawn in the diagram above):
#   * XORMPY multiplies H0..H3 by $xia, and the upper-byte copies
#     $H01u/$H2u/$H3u by $xib; both hold the current Xi byte shifted left
#     by one.
#   * [A0] LDBU *--$xip fetches the next lower Xi byte into $x0.  A0 is a
#     down-counter: loaded with 15 when it is zero, decremented otherwise.
#   * The products are XORed into Z3..Z0, which is shifted right by one
#     byte (SHRMB across words, SHRU on $Z3).  The upper-byte products are
#     masked with $FF000000 and merged in after the shift.
#   * $rem = $Z0<<1 is multiplied by $E10000 and the result ($res) is XORed
#     into $Z3 in parallel with SPKERNEL.
#
# After SPKERNEL:
#   * [B0] LDNDW prefetches the next 16 input bytes, post-incrementing $inp.
#   * .LITTLE_ENDIAN arm only: the SWAP2/SWAP4 pairs exchange $Z0 with $Z1
#     and $Z2 with $Z3 while applying both swaps to each word
#     (NOTE(review): presumably a full byte reversal per word -- confirm
#     against the instruction set reference).
#   * [!B0] BNOP RA leaves the function when no block remains; otherwise
#     [B0] BNOP ghash_loop? starts the next block.
#   * The instructions after the branches repeat the tail of the prologue
#     in the first heredoc: XOR the prefetched input into Z, extract
#     Xi[15]/Xi[14], store Xi unconditionally, then under [B0] zero Z and
#     advance $xip by 14.
#
# The "\@" in the .cstring line keeps Perl from interpolating an array
# inside the heredoc.
# ----------------------------------------------------------------------
163$code.=<<___;
164 XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
165|| XORMPY $H01u,$xib,$H01y
166|| [A0] LDBU *--${xip},$x0
167 XORMPY $H1,$xia,$H1x ; 1
168 XORMPY $H2,$xia,$H2x ; 2
169|| XORMPY $H2u,$xib,$H2y
170 XORMPY $H3,$xia,$H3x ; 3
171|| XORMPY $H3u,$xib,$H3y
172||[!A0] MVK.D 15,A0 ; *--${xip} counter
173 XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
174|| [A0] SUB.S A0,1,A0
175 XOR.L $H1x,$Z1,$Z1 ; 5
176|| AND.D $H01y,$FF000000,$H0z
177|| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
178|| SHL $x0,1,$xib
179|| SHL $x0,1,$xia
180
181 XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
182|| SHL $Z0,1,$rem ; ; rem=Z<<1
183|| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
184|| AND.L $H1y,$FF000000,$H1z
185 XOR.L $H3x,$Z3,$Z3 ; 7/1
186|| SHRMB.S $Z2,$Z1,$Z1
187|| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
188|| AND.S $H2y,$FF000000,$H2z
189|| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
190 XOR.L $H1z,$Z1,$Z1 ; 8/2
191|| SHRMB.S $Z3,$Z2,$Z2
192|| AND.S $H3y,$FF000000,$H3z
193 XOR.L $H2z,$Z2,$Z2 ; 9/3
194|| SHRU $Z3,8,$Z3
195 XOR.D $H3z,$Z3,$Z3 ; 10/4
196 NOP ; 11/5
197
198 SPKERNEL 0,2
199|| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
200
201 ; input pre-fetch is possible where D1 slot is available...
202 [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
203 [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
204 NOP ; 10/-
205 .if .LITTLE_ENDIAN
206 SWAP2 $Z0,$Z1 ; 11/-
207|| SWAP4 $Z1,$Z0
208 SWAP4 $Z1,$Z1 ; 12/-
209|| SWAP2 $Z0,$Z0
210 SWAP2 $Z2,$Z3
211|| SWAP4 $Z3,$Z2
212||[!B0] BNOP RA
213 SWAP4 $Z3,$Z3
214|| SWAP2 $Z2,$Z2
215|| [B0] BNOP ghash_loop?
216 [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
217|| [B0] XOR $H1x,$Z1,$Z1
218 [B0] XOR $H2x,$Z2,$Z2
219|| [B0] XOR $H3x,$Z3,$Z3
220|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
221 STDW $Z1:$Z0,*${xip}[1]
222|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
223|| [B0] ZERO $Z1:$Z0
224 .else
225 [!B0] BNOP RA ; 11/-
226 [B0] BNOP ghash_loop? ; 12/-
227 [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
228|| [B0] XOR $H1x,$Z1,$Z1
229 [B0] XOR $H2x,$Z2,$Z2
230|| [B0] XOR $H3x,$Z3,$Z3
231|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
232 STDW $Z1:$Z0,*${xip}[1]
233|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
234|| [B0] ZERO $Z1:$Z0
235 .endif
236 STDW $Z3:$Z2,*${xip}[0]
237|| [B0] ZERO $Z3:$Z2
238|| [B0] MV $xia,$x1
239 [B0] ADDK 14,${xip}
240 .endasmfunc
241
242 .sect .const
243 .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
244 .align 4
245___
246
# Emit the accumulated assembly text.  STDOUT is buffered, so a failed
# write (full disk, broken pipe, ...) may only be reported when the handle
# is closed; check that result so a truncated output file makes the
# generator exit with an error instead of reporting success.
print $code;
close STDOUT or die "error closing STDOUT: $!";
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette