rc4-586.pl@ 69890

Last change on this file since 69890 was 69890, checked in by vboxsync, 7 years ago
Added OpenSSL 1.1.0g with unneeded files removed, otherwise unmodified. bugref:8070: src/libs maintenance
Property svn:eol-style set to `LF` Property svn:executable set to ``*
File size: 12.2 KB

Line
1	#! /usr/bin/env perl
2	# Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the OpenSSL license (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9
10	# ====================================================================
11	# [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	# ====================================================================
16
17	# At some point it became apparent that the original SSLeay RC4
18	# assembler implementation performs suboptimally on latest IA-32
19	# microarchitectures. After re-tuning performance has changed as
20	# following:
21	#
22	# Pentium -10%
23	# Pentium III +12%
24	# AMD +50%(*)
25	# P4 +250%(**)
26	#
27	# (*) This number is actually a trade-off:-) It's possible to
28	# achieve +72%, but at the cost of -48% off PIII performance.
29	# In other words code performing further 13% faster on AMD
30	# would perform almost 2 times slower on Intel PIII...
31	# For reference! This code delivers ~80% of rc4-amd64.pl
32	# performance on the same Opteron machine.
33	# (**) This number requires compressed key schedule set up by
34	# RC4_set_key [see commentary below for further details].
35	#
36	# <[email protected]>
37
38	# May 2011
39	#
40	# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
41	# performance in cycles per processed byte (less is better) and
42	# improvement relative to previous version of this module is:
43	#
44	# Pentium 10.2 # original numbers
45	# Pentium III 7.8(*)
46	# Intel P4 7.5
47	#
48	# Opteron 6.1/+20% # new MMX numbers
49	# Core2 5.3/+67%(**)
50	# Westmere 5.1/+94%(**)
51	# Sandy Bridge 5.0/+8%
52	# Atom 12.6/+6%
53	# VIA Nano 6.4/+9%
54	# Ivy Bridge 4.9/±0%
55	# Bulldozer 4.9/+15%
56	#
57	# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
58	# but this specific code performs poorly on Core2. And vice
59	# versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs
60	# poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU
61	# [anymore], I chose to discard PIII-specific code path and opt
62	# for original IALU-only code, which is why MMX/SSE code path
63	# is guarded by SSE2 bit (see below), not MMX/SSE.
64	# (**) Performance vs. block size on Core2 and Westmere had a maximum
65	# at ... 64 bytes block size. And it was quite a maximum, 40-60%
66	# in comparison to largest 8KB block size. Above improvement
67	# coefficients are for the largest block size.
68
# Locate the directory this script lives in, so that x86asm.pl can be
# found either next to the script or in ../../perlasm relative to it.
# When $0 carries no directory component the pattern does not match;
# fall back to an empty prefix explicitly instead of relying on $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Use the
# three-argument form of open (the file name is never parsed for mode
# characters) and fail loudly: a failed re-open of STDOUT would otherwise
# silently discard all generated assembly.  The defined() guard matters
# because a three-argument open on undef opens an anonymous temporary
# file rather than failing.
$output=pop;
if (defined $output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# $x86only is set when the last remaining argument is "386"; it is also
# handed to asm_init and later disables the MMX code path.
&asm_init($ARGV[0],"rc4-586.pl",$x86only = $ARGV[$#ARGV] eq "386");

# Register assignment used by the RC4 code generator below.
$xx="eax";      # key->x index
$yy="ebx";      # key->y index
$tx="ecx";      # scratch: value loaded from the schedule at index x
$ty="edx";      # scratch: value loaded from the schedule at index y
$inp="esi";     # input pointer
$out="ebp";     # output pointer (re-biased relative to $inp)
$dat="edi";     # key schedule pointer
85
# Emit one step of the integer-only RC4 round; the "loop4" body calls
# this with $i = 0..3 to produce four keystream bytes per iteration.
#
# The keystream bytes are accumulated in $out: step 0 loads the first
# byte with mov, later steps rotate $out right by 8 and merge the next
# byte with or.  On the last step ($i == 3) $tx is not refilled from the
# schedule; it is loaded with the re-biased output pointer that was
# saved in wparam(3), because $out itself is busy as the accumulator.
sub RC4_loop {
    my $i = shift;

    # These must be typeglobs (*mov / *or), not barewords: "or" is a Perl
    # operator, so a bare `mov:or` does not even parse.  Calling through
    # the glob dispatches to the instruction emitter of that name.
    my $func = ($i == 0) ? *mov : *or;

    &add    (&LB($yy),&LB($tx));
    &mov    ($ty,&DWP(0,$dat,$yy,4));
    &mov    (&DWP(0,$dat,$yy,4),$tx);
    &mov    (&DWP(0,$dat,$xx,4),$ty);
    &add    ($ty,$tx);
    &inc    (&LB($xx));
    &and    ($ty,0xff);
    &ror    ($out,8)    if ($i != 0);
    if ($i < 3) {
        &mov    ($tx,&DWP(0,$dat,$xx,4));
    } else {
        &mov    ($tx,&wparam(3));   # reload [re-biased] out
    }
    &$func  ($out,&DWP(0,$dat,$ty,4));
}
105
# Choose the generator for one step of the 8x unrolled MMX loop and store
# it in $RC4_loop_mmx.  The RC4 generator further down invokes it with
# -1 (loop prologue), 0 (first step of a repeated iteration) and 1..7.
#
# NB: "$alt=0" is an assignment, not a comparison.  It sets $alt to 0
# (the RC4 generator tests $alt again) and, being false, always selects
# the else branch; the first branch is a disabled alternative.
106	if ($alt=0) {
107	# >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron,
108	# but ~40% slower on Core2 and Westmere... Attempt to add movz
109	# brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet
110	# on Core2 with movz it's almost 20% slower than below alternative
111	# code... Yes, it's a total mess...
112	my @XX=($xx,$out);
113	$RC4_loop_mmx = sub { # SSE actually...
114	my $i=shift;
115	my $j=$i<=0?0:$i>>1;
116	my $mm=$i<=0?"mm0":"mm".($i&1);
117
118	&add (&LB($yy),&LB($tx));
119	&lea (@XX[1],&DWP(1,@XX[0]));
120	&pxor ("mm2","mm0") if ($i==0);
121	&psllq ("mm1",8) if ($i==0);
122	&and (@XX[1],0xff);
123	&pxor ("mm0","mm0") if ($i<=0);
124	&mov ($ty,&DWP(0,$dat,$yy,4));
125	&mov (&DWP(0,$dat,$yy,4),$tx);
126	&pxor ("mm1","mm2") if ($i==0);
127	&mov (&DWP(0,$dat,$XX[0],4),$ty);
128	&add (&LB($ty),&LB($tx));
129	&movd (@XX[0],"mm7") if ($i==0);
130	&mov ($tx,&DWP(0,$dat,@XX[1],4));
131	&pxor ("mm1","mm1") if ($i==1);
132	&movq ("mm2",&QWP(0,$inp)) if ($i==1);
133	&movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0);
134	&pinsrw ($mm,&DWP(0,$dat,$ty,4),$j);
135
# Swap the two entries of @XX so the next step uses the other register
# as the current index.
136	push (@XX,shift(@XX)) if ($i>=0);
137	}
138	} else {
139	# Using pinsrw here improves performance on Intel CPUs by 2-3%, but
140	# brings down AMD by 7%...
141	$RC4_loop_mmx = sub {
142	my $i=shift;
143
144	&add (&LB($yy),&LB($tx));
145	&psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1);
146	&mov ($ty,&DWP(0,$dat,$yy,4));
147	&mov (&DWP(0,$dat,$yy,4),$tx);
148	&mov (&DWP(0,$dat,$xx,4),$ty);
149	&inc ($xx);
150	&add ($ty,$tx);
151	&movz ($xx,&LB($xx)); # (*)
152	&movz ($ty,&LB($ty)); # (*)
153	&pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0);
154	&movq ("mm0",&QWP(0,$inp)) if ($i<=0);
155	&movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0);
156	&mov ($tx,&DWP(0,$dat,$xx,4));
157	&movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
158
159	# (*) This is the key to Core2 and Westmere performance.
160	# Without movz out-of-order execution logic confuses
161	# itself and fails to reorder loads and stores. Problem
162	# appears to be fixed in Sandy Bridge...
163	}
164	}
165
# RC4(): generator for the cipher routine itself.  The emitted code picks
# one of these paths at run time:
#   RC4_CHAR - byte-wide schedule loop, taken when the dword at
#              key->data+256 equals -1 (compressed schedule marker);
#   loop_mmx - 8 bytes per iteration; only generated when $x86only is
#              false, and taken when len>=8 and bit 26 of
#              OPENSSL_ia32cap_P is set;
#   loop4    - 4 bytes per iteration, integer-only;
#   loop1    - 1 byte per iteration, used when len<4 and for whatever
#              tail loop_mmx/loop4 leave behind.
166	&external_label("OPENSSL_ia32cap_P");
167
168	# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
169	&function_begin("RC4");
170	&mov ($dat,&wparam(0)); # load key schedule pointer
171	&mov ($ty, &wparam(1)); # load len
172	&mov ($inp,&wparam(2)); # load inp
173	&mov ($out,&wparam(3)); # load out
174
175	&xor ($xx,$xx); # avoid partial register stalls
176	&xor ($yy,$yy);
177
178	&cmp ($ty,0); # safety net
179	&je (&label("abort"));
180
181	&mov (&LB($xx),&BP(0,$dat)); # load key->x
182	&mov (&LB($yy),&BP(4,$dat)); # load key->y
183	&add ($dat,8);
184
185	&lea ($tx,&DWP(0,$inp,$ty));
186	&sub ($out,$inp); # re-bias out
187	&mov (&wparam(1),$tx); # save input+len
188
# x is pre-incremented here; the "done" label decrements it again before
# it is stored back into the key.
189	&inc (&LB($xx));
190
191	# detect compressed key schedule...
192	&cmp (&DWP(256,$dat),-1);
193	&je (&label("RC4_CHAR"));
194
195	&mov ($tx,&DWP(0,$dat,$xx,4));
196
197	&and ($ty,-4); # how many 4-byte chunks?
198	&jz (&label("loop1"));
199
200	&mov (&wparam(3),$out); # $out as accumulator in these loops
201	if ($x86only) {
202	&jmp (&label("go4loop4"));
203	} else {
204	&test ($ty,-8);
205	&jz (&label("go4loop4"));
206
207	&picmeup($out,"OPENSSL_ia32cap_P");
208	&bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX]
209	&jnc (&label("go4loop4"));
210
211	&mov ($out,&wparam(3)) if (!$alt);
212	&movd ("mm7",&wparam(3)) if ($alt);
213	&and ($ty,-8);
214	&lea ($ty,&DWP(-8,$inp,$ty));
215	&mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8
216
# Step -1 is the loop prologue; the jump skips step 0 on the first pass.
217	&$RC4_loop_mmx(-1);
218	&jmp(&label("loop_mmx_enter"));
219
220	&set_label("loop_mmx",16);
221	&$RC4_loop_mmx(0);
222	&set_label("loop_mmx_enter");
223	for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
224	&mov ($ty,$yy);
225	&xor ($yy,$yy); # this is second key to Core2
226	&mov (&LB($yy),&LB($ty)); # and Westmere performance...
227	&cmp ($inp,&DWP(-4,$dat));
228	&lea ($inp,&DWP(8,$inp));
229	&jb (&label("loop_mmx"));
230
# Store the final 8-byte block; the loop itself only stores the block of
# the previous iteration (see step 0 of $RC4_loop_mmx).
231	if ($alt) {
232	&movd ($out,"mm7");
233	&pxor ("mm2","mm0");
234	&psllq ("mm1",8);
235	&pxor ("mm1","mm2");
236	&movq (&QWP(-8,$out,$inp),"mm1");
237	} else {
238	&psllq ("mm1",56);
239	&pxor ("mm2","mm1");
240	&movq (&QWP(-8,$out,$inp),"mm2");
241	}
242	&emms ();
243
244	&cmp ($inp,&wparam(1)); # compare to input+len
245	&je (&label("done"));
246	&jmp (&label("loop1"));
247	}
248
249	&set_label("go4loop4",16);
250	&lea ($ty,&DWP(-4,$inp,$ty));
251	&mov (&wparam(2),$ty); # save input+(len/4)*4-4
252
253	&set_label("loop4");
254	for ($i=0;$i<4;$i++) { RC4_loop($i); }
255	&ror ($out,8);
256	&xor ($out,&DWP(0,$inp));
257	&cmp ($inp,&wparam(2)); # compare to input+(len/4)*4-4
258	&mov (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here
259	&lea ($inp,&DWP(4,$inp));
260	&mov ($tx,&DWP(0,$dat,$xx,4));
261	&jb (&label("loop4"));
262
263	&cmp ($inp,&wparam(1)); # compare to input+len
264	&je (&label("done"));
265	&mov ($out,&wparam(3)); # restore $out
266
267	&set_label("loop1",16);
268	&add (&LB($yy),&LB($tx));
269	&mov ($ty,&DWP(0,$dat,$yy,4));
270	&mov (&DWP(0,$dat,$yy,4),$tx);
271	&mov (&DWP(0,$dat,$xx,4),$ty);
272	&add ($ty,$tx);
273	&inc (&LB($xx));
274	&and ($ty,0xff);
275	&mov ($ty,&DWP(0,$dat,$ty,4));
276	&xor (&LB($ty),&BP(0,$inp));
277	&lea ($inp,&DWP(1,$inp));
278	&mov ($tx,&DWP(0,$dat,$xx,4));
279	&cmp ($inp,&wparam(1)); # compare to input+len
280	&mov (&BP(-1,$out,$inp),&LB($ty));
281	&jb (&label("loop1"));
282
283	&jmp (&label("done"));
284
285	# this is essentially Intel P4 specific codepath...
286	&set_label("RC4_CHAR",16);
287	&movz ($tx,&BP(0,$dat,$xx));
288	# strangely enough unrolled loop performs over 20% slower...
289	&set_label("cloop1");
290	&add (&LB($yy),&LB($tx));
291	&movz ($ty,&BP(0,$dat,$yy));
292	&mov (&BP(0,$dat,$yy),&LB($tx));
293	&mov (&BP(0,$dat,$xx),&LB($ty));
294	&add (&LB($ty),&LB($tx));
295	&movz ($ty,&BP(0,$dat,$ty));
296	&add (&LB($xx),1);
297	&xor (&LB($ty),&BP(0,$inp));
298	&lea ($inp,&DWP(1,$inp));
299	&movz ($tx,&BP(0,$dat,$xx));
300	&cmp ($inp,&wparam(1));
301	&mov (&BP(-1,$out,$inp),&LB($ty));
302	&jb (&label("cloop1"));
303
# Common exit: undo the pre-increment of x and write x and y back.  $dat
# was advanced by 8 at entry, hence the negative offsets.
304	&set_label("done");
305	&dec (&LB($xx));
306	&mov (&DWP(-4,$dat),$yy); # save key->y
307	&mov (&BP(-8,$dat),&LB($xx)); # save key->x
308	&set_label("abort");
309	&function_end("RC4");
310
311	########################################################################
312
# RC4_set_key(): generator for the key-schedule setup routine.  Two
# schedule layouts are emitted: the 32-bit-per-entry one
# (w1stloop/w2ndloop) and, when bit 20 of OPENSSL_ia32cap_P is set, the
# compressed 8-bit one (c1stloop/c2ndloop), which is tagged by storing -1
# at key->data+256.  In both layouts the first loop fills data[i]=i and
# the second loop performs the key-dependent swaps; key->x and key->y are
# zeroed on exit.
#
# Register names are re-assigned here for this routine.
313	$inp="esi";
314	$out="edi";
315	$idi="ebp";
316	$ido="ecx";
317	$idx="edx";
318
319	# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
320	&function_begin("RC4_set_key");
321	&mov ($out,&wparam(0)); # load key
322	&mov ($idi,&wparam(1)); # load len
323	&mov ($inp,&wparam(2)); # load data
324	&picmeup($idx,"OPENSSL_ia32cap_P");
325
# $inp is moved to the end of the key data and $idi becomes -len, so the
# key byte is addressed as ($inp,$idi) with $idi counting up towards 0.
# -len is stashed in the key->y slot so $idi can be reloaded when it
# reaches 0, i.e. when the key bytes wrap around.
326	&lea ($out,&DWP(2*4,$out)); # &key->data
327	&lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end
328	&neg ($idi);
329	&xor ("eax","eax");
330	&mov (&DWP(-4,$out),$idi); # borrow key->y
331
332	&bt (&DWP(0,$idx),20); # check for bit#20
333	&jc (&label("c1stloop"));
334
# The byte-wide add sets carry when al wraps from 255 to 0, which ends
# the loop after 256 entries.
335	&set_label("w1stloop",16);
336	&mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i;
337	&add (&LB("eax"),1); # i++;
338	&jnc (&label("w1stloop"));
339
340	&xor ($ido,$ido);
341	&xor ($idx,$idx);
342
343	&set_label("w2ndloop",16);
344	&mov ("eax",&DWP(0,$out,$ido,4));
345	&add (&LB($idx),&BP(0,$inp,$idi));
346	&add (&LB($idx),&LB("eax"));
347	&add ($idi,1);
348	&mov ("ebx",&DWP(0,$out,$idx,4));
349	&jnz (&label("wnowrap"));
350	&mov ($idi,&DWP(-4,$out));
351	&set_label("wnowrap");
352	&mov (&DWP(0,$out,$idx,4),"eax");
353	&mov (&DWP(0,$out,$ido,4),"ebx");
354	&add (&LB($ido),1);
355	&jnc (&label("w2ndloop"));
356	&jmp (&label("exit"));
357
358	# Unlike all other x86 [and x86_64] implementations, Intel P4 core
359	# [including EM64T] was found to perform poorly with above "32-bit" key
360	# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded
361	# assembler turned out to be 3.5x if re-coded for compressed 8-bit one,
362	# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit
363	# schedule for x86[_64], because non-P4 implementations suffer from
364	# significant performance losses then, e.g. PIII exhibits >2x
365	# deterioration, and so does Opteron. In order to assure optimal
366	# all-round performance, we detect P4 at run-time and set up compressed
367	# key schedule, which is recognized by RC4 procedure.
368
369	&set_label("c1stloop",16);
370	&mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i;
371	&add (&LB("eax"),1); # i++;
372	&jnc (&label("c1stloop"));
373
374	&xor ($ido,$ido);
375	&xor ($idx,$idx);
376	&xor ("ebx","ebx");
377
378	&set_label("c2ndloop",16);
379	&mov (&LB("eax"),&BP(0,$out,$ido));
380	&add (&LB($idx),&BP(0,$inp,$idi));
381	&add (&LB($idx),&LB("eax"));
382	&add ($idi,1);
383	&mov (&LB("ebx"),&BP(0,$out,$idx));
384	&jnz (&label("cnowrap"));
385	&mov ($idi,&DWP(-4,$out));
386	&set_label("cnowrap");
387	&mov (&BP(0,$out,$idx),&LB("eax"));
388	&mov (&BP(0,$out,$ido),&LB("ebx"));
389	&add (&LB($ido),1);
390	&jnc (&label("c2ndloop"));
391
392	&mov (&DWP(256,$out),-1); # mark schedule as compressed
393
394	&set_label("exit");
395	&xor ("eax","eax");
396	&mov (&DWP(-8,$out),"eax"); # key->x=0;
397	&mov (&DWP(-4,$out),"eax"); # key->y=0;
398	&function_end("RC4_set_key");
399
# RC4_options(): generator for a routine that returns, in eax, a pointer
# to one of the NUL-terminated description strings stored at "opts".
# The string is chosen from OPENSSL_ia32cap_P: bit 20 set selects
# "rc4(1x,char)", otherwise bit 26 set selects "rc4(8x,mmx)", otherwise
# "rc4(4x,int)".
400	# const char *RC4_options(void);
401	&function_begin_B("RC4_options");
# call/pop pair obtains the current address so "opts" can be addressed
# position-independently.
402	&call (&label("pic_point"));
403	&set_label("pic_point");
404	&blindpop("eax");
405	&lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
406	&picmeup("edx","OPENSSL_ia32cap_P");
407	&mov ("edx",&DWP(0,"edx"));
408	&bt ("edx",20);
409	&jc (&label("1xchar"));
410	&bt ("edx",26);
411	&jnc (&label("ret"));
# 25 = 12 + 13: skips "rc4(4x,int)\0" and "rc4(1x,char)\0".
412	&add ("eax",25);
413	&ret ();
414	&set_label("1xchar");
# 12 skips "rc4(4x,int)\0" (11 characters plus the terminator).
415	&add ("eax",12);
416	&set_label("ret");
417	&ret ();
# The offsets above depend on the order and length of these strings.
418	&set_label("opts",64);
419	&asciz ("rc4(4x,int)");
420	&asciz ("rc4(1x,char)");
421	&asciz ("rc4(8x,mmx)");
422	&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
423	&align (64);
424	&function_end_B("RC4_options");
425
# Flush the generated assembly to STDOUT.
&asm_finish();

# Buffered write errors (e.g. a full disk) only surface when the handle
# is closed, so check the result instead of silently leaving a truncated
# assembly file behind.
close STDOUT or die "error closing STDOUT: $!";

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-1.1.0g/crypto/rc4/asm/rc4-586.pl@ 69890

Download in other formats: