#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

# Derive the directory this script was invoked from and make both it and
# ../../perlasm (relative to it) searchable, so that x86asm.pl can be
# loaded from either place.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present, names the output file and
# STDOUT is redirected into it.  The three-argument open keeps the name
# from being parsed for mode characters, and a failure to create the file
# is reported right away instead of surfacing only at the final close.
$output = pop;
if ($output) {
    open STDOUT,">",$output or die "can't open $output: $!";
}

&asm_init($ARGV[0]);

# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
# among the remaining command-line arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Open the generated function and name the registers and stack slots the
# rest of the generator refers to.
&function_begin("bn_mul_mont");

# Symbolic register names.  Note that $ap/$tp share esi and $rp/$bp share
# edi, so only one of each pair is live at any time.
$i="edx";
$j="ecx";
$ap="esi";  $tp="esi";      # overlapping variables!!!
$rp="edi";  $bp="edi";      # overlapping variables!!!
$np="ebp";
$num="ebx";

# Fixed part of the stack frame, addressed relative to esp.  $_n0q is the
# same slot as $_n0 viewed as a quad word.
$_num=&DWP(4*0,"esp");      # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32;              # size of above frame rounded up to 16n

# Prologue, part 1: bail out for short operands, then compute in ebp the
# address the stack pointer will be moved to (fixed frame plus num+2 words
# of temporary vector).
    &xor    ("eax","eax");
    &mov    ("edi",&wparam(5));     # int num
    &cmp    ("edi",4);
    &jl     (&label("just_leave")); # num<4: leave, eax is still 0

    &lea    ("esi",&wparam(0));     # put aside pointer to argument block
    # NOTE(review): lea yields the address of the ap argument slot rather
    # than the ap pointer value itself -- confirm this is the intent of the
    # cache-window arithmetic below.
    &lea    ("edx",&wparam(1));     # load ap
    &add    ("edi",2);              # extra two words on top of tp
    &neg    ("edi");
    &lea    ("ebp",&DWP(-$frame,"esp","edi",4));    # future alloca($frame+4*(num+2))
    &neg    ("edi");                # edi is num+2 again

    # minimize cache contention by arranging 2K window between stack
    # pointer and ap argument [np is also position sensitive vector,
    # but it's assumed to be near ap, as it's allocated at ~same
    # time].
    &mov    ("eax","ebp");
    &sub    ("eax","edx");
    &and    ("eax",2047);
    &sub    ("ebp","eax");          # this aligns sp and ap modulo 2048

    &xor    ("edx","ebp");
    &and    ("edx",2048);
    &xor    ("edx",2048);
    &sub    ("ebp","edx");          # this splits them apart modulo 4096

    &and    ("ebp",-64);            # align to cache line

96 | # An OS-agnostic version of __chkstk.
|
---|
97 | #
|
---|
98 | # Some OSes (Windows) insist on stack being "wired" to
|
---|
99 | # physical memory in strictly sequential manner, i.e. if stack
|
---|
100 | # allocation spans two pages, then reference to farmost one can
|
---|
101 | # be punishable by SEGV. But page walking can do good even on
|
---|
102 | # other OSes, because it guarantees that villain thread hits
|
---|
103 | # the guard page before it can make damage to innocent one...
|
---|
104 | &mov ("eax","esp");
|
---|
105 | &sub ("eax","ebp");
|
---|
106 | &and ("eax",-4096);
|
---|
107 | &mov ("edx","esp"); # saved stack pointer!
|
---|
108 | &lea ("esp",&DWP(0,"ebp","eax"));
|
---|
109 | &mov ("eax",&DWP(0,"esp"));
|
---|
110 | &cmp ("esp","ebp");
|
---|
111 | &ja (&label("page_walk"));
|
---|
112 | &jmp (&label("page_walk_done"));
|
---|
113 |
|
---|
114 | &set_label("page_walk",16);
|
---|
115 | &lea ("esp",&DWP(-4096,"esp"));
|
---|
116 | &mov ("eax",&DWP(0,"esp"));
|
---|
117 | &cmp ("esp","ebp");
|
---|
118 | &ja (&label("page_walk"));
|
---|
119 | &set_label("page_walk_done");
|
---|
120 |
|
---|
121 | ################################# load argument block...
|
---|
122 | &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
|
---|
123 | &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
|
---|
124 | &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
|
---|
125 | &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
|
---|
126 | &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
|
---|
127 | #&mov ("edi",&DWP(5*4,"esi"));# int num
|
---|
128 |
|
---|
129 | &mov ("esi",&DWP(0,"esi")); # pull n0[0]
|
---|
130 | &mov ($_rp,"eax"); # ... save a copy of argument block
|
---|
131 | &mov ($_ap,"ebx");
|
---|
132 | &mov ($_bp,"ecx");
|
---|
133 | &mov ($_np,"ebp");
|
---|
134 | &mov ($_n0,"esi");
|
---|
135 | &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
|
---|
136 | #&mov ($_num,$num); # redundant as $num is not reused
|
---|
137 | &mov ($_sp,"edx"); # saved stack pointer!
|
---|
138 | |
---|
139 |
|
---|
# SSE2/MMX code path, generated only when $sse2 is set.  At run time the
# code tests bit 26 of the first word of OPENSSL_ia32cap_P and branches to
# the "non_sse2" label when it is clear.  The first pass ("1st") handles
# bp[0]; the "outer"/"inner" loops handle the remaining words of bp and
# additionally accumulate the previous contents of tp.
if($sse2) {
$acc0="mm0";    # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

    &picmeup("eax","OPENSSL_ia32cap_P");
    &bt     (&DWP(0,"eax"),26);
    &jnc    (&label("non_sse2"));

    &mov    ("eax",-1);
    &movd   ($mask,"eax");          # mask 32 lower bits

    &mov    ($ap,$_ap);             # load input pointers
    &mov    ($bp,$_bp);
    &mov    ($np,$_np);

    &xor    ($i,$i);                # i=0
    &xor    ($j,$j);                # j=0

    &movd   ($mul0,&DWP(0,$bp));    # bp[0]
    &movd   ($mul1,&DWP(0,$ap));    # ap[0]
    &movd   ($car1,&DWP(0,$np));    # np[0]

    &pmuludq($mul1,$mul0);          # ap[0]*bp[0]
    &movq   ($car0,$mul1);
    &movq   ($acc0,$mul1);          # I wish movd worked for
    &pand   ($acc0,$mask);          # inter-register transfers

    &pmuludq($mul1,$_n0q);          # *=n0

    &pmuludq($car1,$mul1);          # "t[0]"*np[0]*n0
    &paddq  ($car1,$acc0);

    &movd   ($acc1,&DWP(4,$np));    # np[1]
    &movd   ($acc0,&DWP(4,$ap));    # ap[1]

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &inc    ($j);                   # j++
&set_label("1st",16);
    &pmuludq($acc0,$mul0);          # ap[j]*bp[0]
    &pmuludq($acc1,$mul1);          # np[j]*m1
    &paddq  ($car0,$acc0);          # +=c0
    &paddq  ($car1,$acc1);          # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &movd   ($acc1,&DWP(4,$np,$j,4));   # np[j+1]
    &paddq  ($car1,$acc0);          # +=ap[j]*bp[0];
    &movd   ($acc0,&DWP(4,$ap,$j,4));   # ap[j+1]
    &psrlq  ($car0,32);
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[j-1]=
    &psrlq  ($car1,32);

    &lea    ($j,&DWP(1,$j));
    &cmp    ($j,$num);
    &jl     (&label("1st"));

    &pmuludq($acc0,$mul0);          # ap[num-1]*bp[0]
    &pmuludq($acc1,$mul1);          # np[num-1]*m1
    &paddq  ($car0,$acc0);          # +=c0
    &paddq  ($car1,$acc1);          # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &paddq  ($car1,$acc0);          # +=ap[num-1]*bp[0];
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &paddq  ($car1,$car0);
    &movq   (&QWP($frame,"esp",$num,4),$car1);  # tp[num].tp[num-1]

    &inc    ($i);                   # i++
&set_label("outer");
    &xor    ($j,$j);                # j=0

    &movd   ($mul0,&DWP(0,$bp,$i,4));   # bp[i]
    &movd   ($mul1,&DWP(0,$ap));    # ap[0]
    &movd   ($temp,&DWP($frame,"esp")); # tp[0]
    &movd   ($car1,&DWP(0,$np));    # np[0]
    &pmuludq($mul1,$mul0);          # ap[0]*bp[i]

    &paddq  ($mul1,$temp);          # +=tp[0]
    &movq   ($acc0,$mul1);
    &movq   ($car0,$mul1);
    &pand   ($acc0,$mask);

    &pmuludq($mul1,$_n0q);          # *=n0

    &pmuludq($car1,$mul1);
    &paddq  ($car1,$acc0);

    &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
    &movd   ($acc1,&DWP(4,$np));    # np[1]
    &movd   ($acc0,&DWP(4,$ap));    # ap[1]

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);
    &paddq  ($car0,$temp);          # +=tp[1]

    &inc    ($j);                   # j++
    &dec    ($num);                 # $num counts down inside "inner"
&set_label("inner");
    &pmuludq($acc0,$mul0);          # ap[j]*bp[i]
    &pmuludq($acc1,$mul1);          # np[j]*m1
    &paddq  ($car0,$acc0);          # +=c0
    &paddq  ($car1,$acc1);          # +=c1

    &movq   ($acc0,$car0);
    &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
    &pand   ($acc0,$mask);
    &movd   ($acc1,&DWP(4,$np,$j,4));   # np[j+1]
    &paddq  ($car1,$acc0);          # +=ap[j]*bp[i]+tp[j]
    &movd   ($acc0,&DWP(4,$ap,$j,4));   # ap[j+1]
    &psrlq  ($car0,32);
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
    &psrlq  ($car1,32);
    &paddq  ($car0,$temp);          # +=tp[j+1]

    &dec    ($num);
    &lea    ($j,&DWP(1,$j));        # j++
    &jnz    (&label("inner"));      # tests the flags set by dec above

    &mov    ($num,$j);              # restore $num from the final j
    &pmuludq($acc0,$mul0);          # ap[num-1]*bp[i]
    &pmuludq($acc1,$mul1);          # np[num-1]*m1
    &paddq  ($car0,$acc0);          # +=c0
    &paddq  ($car1,$acc1);          # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &paddq  ($car1,$acc0);          # +=ap[num-1]*bp[i]+tp[num-1]
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=
    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
    &paddq  ($car1,$car0);
    &paddq  ($car1,$temp);
    &movq   (&QWP($frame,"esp",$num,4),$car1);  # tp[num].tp[num-1]

    &lea    ($i,&DWP(1,$i));        # i++
    &cmp    ($i,$num);
    &jle    (&label("outer"));

    &emms   ();                     # done with mmx bank
    &jmp    (&label("common_tail"));

&set_label("non_sse2",16);
}


# Integer-only code path.  The "if (0)" arm is never executed by the
# generator, so only the else arm is emitted.  The emitted code first checks
# whether num is even and ap==bp; if both hold it takes the dedicated
# squaring procedure at "bn_sqr_mont", otherwise the generic multiply loops
# ("mull", "1stmadd", "2ndmadd").  Both variants finish at "common_tail".
if (0) {
    &mov    ("esp",$_sp);
    &xor    ("eax","eax");  # signal "not fast enough [yet]"
    &jmp    (&label("just_leave"));
    # While the below code provides competitive performance for
    # all key lengths on modern Intel cores, it's still more
    # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
    # means compared to the original integer-only assembler.
    # 512-bit RSA sign is better by ~40%, but that's about all
    # one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";

    &mov    ($inp,$_ap);
    &lea    ($carry,&DWP(1,$num));
    &mov    ($word,$_bp);
    &xor    ($j,$j);                            # j=0
    &mov    ("edx",$inp);
    &and    ($carry,1);                         # see if num is even
    &sub    ("edx",$word);                      # see if ap==bp
    &lea    ("eax",&DWP(4,$word,$num,4));       # &bp[num]
    &or     ($carry,"edx");                     # zero only if both tests hold
    &mov    ($word,&DWP(0,$word));              # bp[0]
    &jz     (&label("bn_sqr_mont"));
    &mov    ($_bpend,"eax");
    &mov    ("eax",&DWP(0,$inp));
    &xor    ("edx","edx");

&set_label("mull",16);
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[j]*bp[0]
    &add    ($carry,"eax");
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[j+1]
    &cmp    ($j,$num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("mull"));

    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[num-1]*bp[0]
    &mov    ($word,$_n0);
    &add    ("eax",$carry);
    &mov    ($inp,$_np);
    &adc    ("edx",0);
    &imul   ($word,&DWP($frame,"esp"));         # n0*tp[0]

    &mov    (&DWP($frame,"esp",$num,4),"eax");  # tp[num-1]=
    &xor    ($j,$j);
    &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
    &mov    (&DWP($frame+8,"esp",$num,4),$j);   # tp[num+1]=

    &mov    ("eax",&DWP(0,$inp));               # np[0]
    &mul    ($word);                            # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));         # +=tp[0]
    &mov    ("eax",&DWP(4,$inp));               # np[1]
    &adc    ("edx",0);
    &inc    ($j);

    &jmp    (&label("2ndmadd"));



&set_label("1stmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[j]*bp[i]
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[j+1]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("1stmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[num-1]*bp[i]
    &add    ("eax",&DWP($frame,"esp",$num,4));  # +=tp[num-1]
    &mov    ($word,$_n0);
    &adc    ("edx",0);
    &mov    ($inp,$_np);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &imul   ($word,&DWP($frame,"esp"));         # n0*tp[0]

    &xor    ($j,$j);
    &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
    &mov    (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
    &adc    ($j,0);
    &mov    ("eax",&DWP(0,$inp));               # np[0]
    &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
    &mov    (&DWP($frame+8,"esp",$num,4),$j);   # tp[num+1]=

    &mul    ($word);                            # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));         # +=tp[0]
    &mov    ("eax",&DWP(4,$inp));               # np[1]
    &adc    ("edx",0);
    &mov    ($j,1);


&set_label("2ndmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));          # np[j+1]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
    &jl     (&label("2ndmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=

    &xor    ("eax","eax");
    &mov    ($j,$_bp);                          # &bp[i]
    &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
    &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
    &lea    ($j,&DWP(4,$j));
    &mov    (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
    &cmp    ($j,$_bpend);
    &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
    &je     (&label("common_tail"));

    &mov    ($word,&DWP(0,$j));                 # bp[i+1]
    &mov    ($inp,$_ap);
    &mov    ($_bp,$j);                          # &bp[++i]
    &xor    ($j,$j);
    &xor    ("edx","edx");
    &mov    ("eax",&DWP(0,$inp));
    &jmp    (&label("1stmadd"));


# Dedicated squaring procedure.  $sbit reuses the $num register, so the
# loop bound is kept in the $_num stack slot, and the $_bp slot is reused
# to hold the outer index i.
&set_label("bn_sqr_mont",16);
$sbit=$num;
    &mov    ($_num,$num);
    &mov    ($_bp,$j);                          # i=0

    &mov    ("eax",$word);                      # ap[0]
    &mul    ($word);                            # ap[0]*ap[0]
    &mov    (&DWP($frame,"esp"),"eax");         # tp[0]=
    &mov    ($sbit,"edx");
    &shr    ("edx",1);
    &and    ($sbit,1);
    &inc    ($j);
&set_label("sqr",16);
    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[j]
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[j]*ap[0]
    &add    ("eax",$carry);
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &lea    ($carry,&DWP(0,$sbit,"eax",2));
    &shr    ("eax",31);
    &cmp    ($j,$_num);
    &mov    ($sbit,"eax");
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("sqr"));

    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[num-1]
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[num-1]*ap[0]
    &add    ("eax",$carry);
    &mov    ($word,$_n0);
    &adc    ("edx",0);
    &mov    ($inp,$_np);
    &lea    ($carry,&DWP(0,$sbit,"eax",2));
    &imul   ($word,&DWP($frame,"esp"));         # n0*tp[0]
    &shr    ("eax",31);
    &mov    (&DWP($frame,"esp",$j,4),$carry);   # tp[num-1]=

    &lea    ($carry,&DWP(0,"eax","edx",2));
    &mov    ("eax",&DWP(0,$inp));               # np[0]
    &shr    ("edx",31);
    &mov    (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
    &mov    (&DWP($frame+8,"esp",$j,4),"edx");  # tp[num+1]=

    &mul    ($word);                            # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));         # +=tp[0]
    &mov    ($num,$j);
    &adc    ("edx",0);
    &mov    ("eax",&DWP(4,$inp));               # np[1]
    &mov    ($j,1);



# Reduction loop of the squaring path, processing two words per iteration.
&set_label("3rdmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(4,$inp,$j,4));          # np[j+1]
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=

    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j+1]*m
    &add    ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
    &lea    ($j,&DWP(2,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));          # np[j+2]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
    &jl     (&label("3rdmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);                            # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=

    &mov    ($j,$_bp);                          # i
    &xor    ("eax","eax");
    &mov    ($inp,$_ap);
    &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
    &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
    &mov    (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
    &cmp    ($j,$num);
    &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
    &je     (&label("common_tail"));


    &mov    ($word,&DWP(4,$inp,$j,4));          # ap[i]
    &lea    ($j,&DWP(1,$j));
    &mov    ("eax",$word);
    &mov    ($_bp,$j);                          # ++i
    &mul    ($word);                            # ap[i]*ap[i]
    &add    ("eax",&DWP($frame,"esp",$j,4));    # +=tp[i]
    &adc    ("edx",0);
    &mov    (&DWP($frame,"esp",$j,4),"eax");    # tp[i]=
    &xor    ($carry,$carry);
    &cmp    ($j,$num);
    &lea    ($j,&DWP(1,$j));
    &je     (&label("sqrlast"));

    &mov    ($sbit,"edx");                      # zaps $num
    &shr    ("edx",1);
    &and    ($sbit,1);
&set_label("sqradd",16);
    &mov    ("eax",&DWP(0,$inp,$j,4));          # ap[j]
    &mov    ($carry,"edx");
    &mul    ($word);                            # ap[j]*ap[i]
    &add    ("eax",$carry);
    &lea    ($carry,&DWP(0,"eax","eax"));
    &adc    ("edx",0);
    &shr    ("eax",31);
    &add    ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("eax",0);
    &add    ($carry,$sbit);
    &adc    ("eax",0);
    &cmp    ($j,$_num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
    &mov    ($sbit,"eax");
    &jle    (&label("sqradd"));

    &mov    ($carry,"edx");
    &add    ("edx","edx");
    &shr    ($carry,31);
    &add    ("edx",$sbit);
    &adc    ($carry,0);
&set_label("sqrlast");
    &mov    ($word,$_n0);
    &mov    ($inp,$_np);
    &imul   ($word,&DWP($frame,"esp"));         # n0*tp[0]

    &add    ("edx",&DWP($frame,"esp",$j,4));    # +=tp[num]
    &mov    ("eax",&DWP(0,$inp));               # np[0]
    &adc    ($carry,0);
    &mov    (&DWP($frame,"esp",$j,4),"edx");    # tp[num]=
    &mov    (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=

    &mul    ($word);                            # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));         # +=tp[0]
    &lea    ($num,&DWP(-1,$j));                 # rebuild $num, zapped by $sbit
    &adc    ("edx",0);
    &mov    ($j,1);
    &mov    ("eax",&DWP(4,$inp));               # np[1]

    &jmp    (&label("3rdmadd"));
}


# Shared epilogue for every code path.  The "sub" loop stores tp-np into rp
# while propagating the borrow; the "copy" loop then picks, word by word and
# without branching on the data, either the original tp word or the rp word
# already stored, overwriting tp as it goes.  Finally the saved stack
# pointer is restored and eax is set to 1.
&set_label("common_tail",16);
    &mov    ($np,$_np);                 # load modulus pointer
    &mov    ($rp,$_rp);                 # load result pointer
    &lea    ($tp,&DWP($frame,"esp"));   # [$ap and $bp are zapped]

    &mov    ("eax",&DWP(0,$tp));        # tp[0]
    &mov    ($j,$num);                  # j=num-1
    &xor    ($i,$i);                    # i=0 and clear CF!

&set_label("sub",16);
    &sbb    ("eax",&DWP(0,$np,$i,4));
    &mov    (&DWP(0,$rp,$i,4),"eax");   # rp[i]=tp[i]-np[i]
    &dec    ($j);                       # doesn't affect CF!
    &mov    ("eax",&DWP(4,$tp,$i,4));   # tp[i+1]
    &lea    ($i,&DWP(1,$i));            # i++
    &jge    (&label("sub"));

    &sbb    ("eax",0);                  # handle upmost overflow bit
    &mov    ("edx",-1);
    &xor    ("edx","eax");              # edx = ~eax, the complementary mask
    &jmp    (&label("copy"));

&set_label("copy",16);                  # conditional copy
    &mov    ($tp,&DWP($frame,"esp",$num,4));
    &mov    ($np,&DWP(0,$rp,$num,4));
    &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
    &and    ($tp,"eax");
    &and    ($np,"edx");
    &or     ($np,$tp);
    &mov    (&DWP(0,$rp,$num,4),$np);
    &dec    ($num);
    &jge    (&label("copy"));

    &mov    ("esp",$_sp);               # pull saved stack pointer
    &mov    ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");

# Emit the identification string, let the perlasm back end finish, and make
# sure buffered output actually reached its destination.
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";