bn_asm.c@ 96662

Last change on this file since 96662 was 94082, checked in by vboxsync, 3 years ago
libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128
File size: 27.1 KB

Line
1	/*
2	* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
3	*
4	* Licensed under the Apache License 2.0 (the "License"). You may not use
5	* this file except in compliance with the License. You can obtain a copy
6	* in the file LICENSE in the source distribution or at
7	* https://www.openssl.org/source/license.html
8	*/
9
10	#include <assert.h>
11	#include <openssl/crypto.h>
12	#include "internal/cryptlib.h"
13	#include "bn_local.h"
14
#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
16
17	BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
18	BN_ULONG w)
19	{
20	BN_ULONG c1 = 0;
21
22	assert(num >= 0);
23	if (num <= 0)
24	return c1;
25
26	# ifndef OPENSSL_SMALL_FOOTPRINT
27	while (num & ~3) {
28	mul_add(rp[0], ap[0], w, c1);
29	mul_add(rp[1], ap[1], w, c1);
30	mul_add(rp[2], ap[2], w, c1);
31	mul_add(rp[3], ap[3], w, c1);
32	ap += 4;
33	rp += 4;
34	num -= 4;
35	}
36	# endif
37	while (num) {
38	mul_add(rp[0], ap[0], w, c1);
39	ap++;
40	rp++;
41	num--;
42	}
43
44	return c1;
45	}
46
47	BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
48	{
49	BN_ULONG c1 = 0;
50
51	assert(num >= 0);
52	if (num <= 0)
53	return c1;
54
55	# ifndef OPENSSL_SMALL_FOOTPRINT
56	while (num & ~3) {
57	mul(rp[0], ap[0], w, c1);
58	mul(rp[1], ap[1], w, c1);
59	mul(rp[2], ap[2], w, c1);
60	mul(rp[3], ap[3], w, c1);
61	ap += 4;
62	rp += 4;
63	num -= 4;
64	}
65	# endif
66	while (num) {
67	mul(rp[0], ap[0], w, c1);
68	ap++;
69	rp++;
70	num--;
71	}
72	return c1;
73	}
74
75	void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
76	{
77	assert(n >= 0);
78	if (n <= 0)
79	return;
80
81	# ifndef OPENSSL_SMALL_FOOTPRINT
82	while (n & ~3) {
83	sqr(r[0], r[1], a[0]);
84	sqr(r[2], r[3], a[1]);
85	sqr(r[4], r[5], a[2]);
86	sqr(r[6], r[7], a[3]);
87	a += 4;
88	r += 8;
89	n -= 4;
90	}
91	# endif
92	while (n) {
93	sqr(r[0], r[1], a[0]);
94	a++;
95	r += 2;
96	n--;
97	}
98	}
99
#else                           /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */
102
103	BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
104	BN_ULONG w)
105	{
106	BN_ULONG c = 0;
107	BN_ULONG bl, bh;
108
109	assert(num >= 0);
110	if (num <= 0)
111	return (BN_ULONG)0;
112
113	bl = LBITS(w);
114	bh = HBITS(w);
115
116	# ifndef OPENSSL_SMALL_FOOTPRINT
117	while (num & ~3) {
118	mul_add(rp[0], ap[0], bl, bh, c);
119	mul_add(rp[1], ap[1], bl, bh, c);
120	mul_add(rp[2], ap[2], bl, bh, c);
121	mul_add(rp[3], ap[3], bl, bh, c);
122	ap += 4;
123	rp += 4;
124	num -= 4;
125	}
126	# endif
127	while (num) {
128	mul_add(rp[0], ap[0], bl, bh, c);
129	ap++;
130	rp++;
131	num--;
132	}
133	return c;
134	}
135
136	BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
137	{
138	BN_ULONG carry = 0;
139	BN_ULONG bl, bh;
140
141	assert(num >= 0);
142	if (num <= 0)
143	return (BN_ULONG)0;
144
145	bl = LBITS(w);
146	bh = HBITS(w);
147
148	# ifndef OPENSSL_SMALL_FOOTPRINT
149	while (num & ~3) {
150	mul(rp[0], ap[0], bl, bh, carry);
151	mul(rp[1], ap[1], bl, bh, carry);
152	mul(rp[2], ap[2], bl, bh, carry);
153	mul(rp[3], ap[3], bl, bh, carry);
154	ap += 4;
155	rp += 4;
156	num -= 4;
157	}
158	# endif
159	while (num) {
160	mul(rp[0], ap[0], bl, bh, carry);
161	ap++;
162	rp++;
163	num--;
164	}
165	return carry;
166	}
167
168	void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
169	{
170	assert(n >= 0);
171	if (n <= 0)
172	return;
173
174	# ifndef OPENSSL_SMALL_FOOTPRINT
175	while (n & ~3) {
176	sqr64(r[0], r[1], a[0]);
177	sqr64(r[2], r[3], a[1]);
178	sqr64(r[4], r[5], a[2]);
179	sqr64(r[6], r[7], a[3]);
180	a += 4;
181	r += 8;
182	n -= 4;
183	}
184	# endif
185	while (n) {
186	sqr64(r[0], r[1], a[0]);
187	a++;
188	r += 2;
189	n--;
190	}
191	}
192
#endif                          /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */
195
196	#if defined(BN_LLONG) && defined(BN_DIV2W)
197
198	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
199	{
200	return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) \| l) / (BN_ULLONG) d));
201	}
202
203	#else
204
205	/* Divide h,l by d and return the result. */
206	/* I need to test this some more :-( */
207	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
208	{
209	BN_ULONG dh, dl, q, ret = 0, th, tl, t;
210	int i, count = 2;
211
212	if (d == 0)
213	return BN_MASK2;
214
215	i = BN_num_bits_word(d);
216	assert((i == BN_BITS2) \|\| (h <= (BN_ULONG)1 << i));
217
218	i = BN_BITS2 - i;
219	if (h >= d)
220	h -= d;
221
222	if (i) {
223	d <<= i;
224	h = (h << i) \| (l >> (BN_BITS2 - i));
225	l <<= i;
226	}
227	dh = (d & BN_MASK2h) >> BN_BITS4;
228	dl = (d & BN_MASK2l);
229	for (;;) {
230	if ((h >> BN_BITS4) == dh)
231	q = BN_MASK2l;
232	else
233	q = h / dh;
234
235	th = q * dh;
236	tl = dl * q;
237	for (;;) {
238	t = h - th;
239	if ((t & BN_MASK2h) \|\|
240	((tl) <= ((t << BN_BITS4) \| ((l & BN_MASK2h) >> BN_BITS4))))
241	break;
242	q--;
243	th -= dh;
244	tl -= dl;
245	}
246	t = (tl >> BN_BITS4);
247	tl = (tl << BN_BITS4) & BN_MASK2h;
248	th += t;
249
250	if (l < tl)
251	th++;
252	l -= tl;
253	if (h < th) {
254	h += d;
255	q--;
256	}
257	h -= th;
258
259	if (--count == 0)
260	break;
261
262	ret = q << BN_BITS4;
263	h = ((h << BN_BITS4) \| (l >> BN_BITS4)) & BN_MASK2;
264	l = (l & BN_MASK2l) << BN_BITS4;
265	}
266	ret \|= q;
267	return ret;
268	}
269	#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
270
271	#ifdef BN_LLONG
272	BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
273	int n)
274	{
275	BN_ULLONG ll = 0;
276
277	assert(n >= 0);
278	if (n <= 0)
279	return (BN_ULONG)0;
280
281	# ifndef OPENSSL_SMALL_FOOTPRINT
282	while (n & ~3) {
283	ll += (BN_ULLONG) a[0] + b[0];
284	r[0] = (BN_ULONG)ll & BN_MASK2;
285	ll >>= BN_BITS2;
286	ll += (BN_ULLONG) a[1] + b[1];
287	r[1] = (BN_ULONG)ll & BN_MASK2;
288	ll >>= BN_BITS2;
289	ll += (BN_ULLONG) a[2] + b[2];
290	r[2] = (BN_ULONG)ll & BN_MASK2;
291	ll >>= BN_BITS2;
292	ll += (BN_ULLONG) a[3] + b[3];
293	r[3] = (BN_ULONG)ll & BN_MASK2;
294	ll >>= BN_BITS2;
295	a += 4;
296	b += 4;
297	r += 4;
298	n -= 4;
299	}
300	# endif
301	while (n) {
302	ll += (BN_ULLONG) a[0] + b[0];
303	r[0] = (BN_ULONG)ll & BN_MASK2;
304	ll >>= BN_BITS2;
305	a++;
306	b++;
307	r++;
308	n--;
309	}
310	return (BN_ULONG)ll;
311	}
312	#else /* !BN_LLONG */
313	BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
314	int n)
315	{
316	BN_ULONG c, l, t;
317
318	assert(n >= 0);
319	if (n <= 0)
320	return (BN_ULONG)0;
321
322	c = 0;
323	# ifndef OPENSSL_SMALL_FOOTPRINT
324	while (n & ~3) {
325	t = a[0];
326	t = (t + c) & BN_MASK2;
327	c = (t < c);
328	l = (t + b[0]) & BN_MASK2;
329	c += (l < t);
330	r[0] = l;
331	t = a[1];
332	t = (t + c) & BN_MASK2;
333	c = (t < c);
334	l = (t + b[1]) & BN_MASK2;
335	c += (l < t);
336	r[1] = l;
337	t = a[2];
338	t = (t + c) & BN_MASK2;
339	c = (t < c);
340	l = (t + b[2]) & BN_MASK2;
341	c += (l < t);
342	r[2] = l;
343	t = a[3];
344	t = (t + c) & BN_MASK2;
345	c = (t < c);
346	l = (t + b[3]) & BN_MASK2;
347	c += (l < t);
348	r[3] = l;
349	a += 4;
350	b += 4;
351	r += 4;
352	n -= 4;
353	}
354	# endif
355	while (n) {
356	t = a[0];
357	t = (t + c) & BN_MASK2;
358	c = (t < c);
359	l = (t + b[0]) & BN_MASK2;
360	c += (l < t);
361	r[0] = l;
362	a++;
363	b++;
364	r++;
365	n--;
366	}
367	return (BN_ULONG)c;
368	}
369	#endif /* !BN_LLONG */
370
371	BN_ULONG bn_sub_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
372	int n)
373	{
374	BN_ULONG t1, t2;
375	int c = 0;
376
377	assert(n >= 0);
378	if (n <= 0)
379	return (BN_ULONG)0;
380
381	#ifndef OPENSSL_SMALL_FOOTPRINT
382	while (n & ~3) {
383	t1 = a[0];
384	t2 = b[0];
385	r[0] = (t1 - t2 - c) & BN_MASK2;
386	if (t1 != t2)
387	c = (t1 < t2);
388	t1 = a[1];
389	t2 = b[1];
390	r[1] = (t1 - t2 - c) & BN_MASK2;
391	if (t1 != t2)
392	c = (t1 < t2);
393	t1 = a[2];
394	t2 = b[2];
395	r[2] = (t1 - t2 - c) & BN_MASK2;
396	if (t1 != t2)
397	c = (t1 < t2);
398	t1 = a[3];
399	t2 = b[3];
400	r[3] = (t1 - t2 - c) & BN_MASK2;
401	if (t1 != t2)
402	c = (t1 < t2);
403	a += 4;
404	b += 4;
405	r += 4;
406	n -= 4;
407	}
408	#endif
409	while (n) {
410	t1 = a[0];
411	t2 = b[0];
412	r[0] = (t1 - t2 - c) & BN_MASK2;
413	if (t1 != t2)
414	c = (t1 < t2);
415	a++;
416	b++;
417	r++;
418	n--;
419	}
420	return c;
421	}
422
423	#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
424
/*
 * Unless the ___openssl_mangling_h___ guard macro is defined, drop any macro
 * definitions of the four comba entry points so that the functions below are
 * defined under their plain names.  (Presumably the header behind that guard
 * renames these symbols on purpose - confirm against the mangling header.)
 */
#ifndef ___openssl_mangling_h___ /* bird */
# undef bn_mul_comba8
# undef bn_mul_comba4
# undef bn_sqr_comba8
# undef bn_sqr_comba4
#endif /* !___openssl_mangling_h___ */ /* bird */
431
/*
 * Column-accumulator helpers for the comba routines below.  Each macro adds
 * a product into the three-word accumulator c = (c2,c1,c0), c0 being the
 * least significant word:
 *
 * mul_add_c(a,b,c0,c1,c2)      -- c += a*b
 * mul_add_c2(a,b,c0,c1,c2)     -- c += 2*a*b
 * sqr_add_c(a,i,c0,c1,c2)      -- c += a[i]^2
 * sqr_add_c2(a,i,j,c0,c1,c2)   -- c += 2*a[i]*a[j]
 *
 * Four implementations are selected at compile time: BN_LLONG (double-width
 * integer type), BN_UMULT_LOHI, BN_UMULT_HIGH, and a fallback built on the
 * mul64()/sqr64() macros.  c0, c1 and c2 must be modifiable lvalues; they
 * are read and written more than once per expansion.
 */

# ifdef BN_LLONG
/*
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)(a)*(b); \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* The product is added twice; tt holds the first sum so t stays intact. */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)(a)*(b); \
        BN_ULLONG tt = t+c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(tt); \
        hi = (BN_ULONG)Hw(tt); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* The array argument is parenthesized, as in the other variants. */
#  define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG hi; \
        BN_ULLONG t = (BN_ULLONG)(a)[i]*(a)[i]; \
        t += c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(t); \
        hi = (BN_ULONG)Hw(t); \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_LOHI)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo, hi; \
        BN_UMULT_LOHI(lo,hi,ta,tb); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

/* The product is added twice; tt carries the first pass so hi stays intact. */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo, hi, tt; \
        BN_UMULT_LOHI(lo,hi,ta,tb); \
        c0 += lo; tt = hi+((c0<lo)?1:0); \
        c1 += tt; c2 += (c1<tt)?1:0; \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG ta = (a)[i]; \
        BN_ULONG lo, hi; \
        BN_UMULT_LOHI(lo,hi,ta,ta); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_HIGH)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b); \
        BN_ULONG lo = ta * tb; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

/* The product is added twice; tt carries the first pass so hi stays intact. */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG ta = (a), tb = (b), tt; \
        BN_ULONG lo = ta * tb; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
        c0 += lo; tt = hi + ((c0<lo)?1:0); \
        c1 += tt; c2 += (c1<tt)?1:0; \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG ta = (a)[i]; \
        BN_ULONG lo = ta * ta; \
        BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
        c0 += lo; hi += (c0<lo)?1:0; \
        c1 += hi; c2 += (c1<hi)?1:0; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# else /* !BN_LLONG */
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2) do { \
        BN_ULONG lo = LBITS(a), hi = HBITS(a); \
        BN_ULONG bl = LBITS(b), bh = HBITS(b); \
        mul64(lo,hi,bl,bh); \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* The product is added twice; tt carries the first pass so hi stays intact. */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG tt; \
        BN_ULONG lo = LBITS(a), hi = HBITS(a); \
        BN_ULONG bl = LBITS(b), bh = HBITS(b); \
        mul64(lo,hi,bl,bh); \
        tt = hi; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
        c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2) do { \
        BN_ULONG lo, hi; \
        sqr64(lo,hi,(a)[i]); \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif /* !BN_LLONG */
582
583	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
584	{
585	BN_ULONG c1, c2, c3;
586
587	c1 = 0;
588	c2 = 0;
589	c3 = 0;
590	mul_add_c(a[0], b[0], c1, c2, c3);
591	r[0] = c1;
592	c1 = 0;
593	mul_add_c(a[0], b[1], c2, c3, c1);
594	mul_add_c(a[1], b[0], c2, c3, c1);
595	r[1] = c2;
596	c2 = 0;
597	mul_add_c(a[2], b[0], c3, c1, c2);
598	mul_add_c(a[1], b[1], c3, c1, c2);
599	mul_add_c(a[0], b[2], c3, c1, c2);
600	r[2] = c3;
601	c3 = 0;
602	mul_add_c(a[0], b[3], c1, c2, c3);
603	mul_add_c(a[1], b[2], c1, c2, c3);
604	mul_add_c(a[2], b[1], c1, c2, c3);
605	mul_add_c(a[3], b[0], c1, c2, c3);
606	r[3] = c1;
607	c1 = 0;
608	mul_add_c(a[4], b[0], c2, c3, c1);
609	mul_add_c(a[3], b[1], c2, c3, c1);
610	mul_add_c(a[2], b[2], c2, c3, c1);
611	mul_add_c(a[1], b[3], c2, c3, c1);
612	mul_add_c(a[0], b[4], c2, c3, c1);
613	r[4] = c2;
614	c2 = 0;
615	mul_add_c(a[0], b[5], c3, c1, c2);
616	mul_add_c(a[1], b[4], c3, c1, c2);
617	mul_add_c(a[2], b[3], c3, c1, c2);
618	mul_add_c(a[3], b[2], c3, c1, c2);
619	mul_add_c(a[4], b[1], c3, c1, c2);
620	mul_add_c(a[5], b[0], c3, c1, c2);
621	r[5] = c3;
622	c3 = 0;
623	mul_add_c(a[6], b[0], c1, c2, c3);
624	mul_add_c(a[5], b[1], c1, c2, c3);
625	mul_add_c(a[4], b[2], c1, c2, c3);
626	mul_add_c(a[3], b[3], c1, c2, c3);
627	mul_add_c(a[2], b[4], c1, c2, c3);
628	mul_add_c(a[1], b[5], c1, c2, c3);
629	mul_add_c(a[0], b[6], c1, c2, c3);
630	r[6] = c1;
631	c1 = 0;
632	mul_add_c(a[0], b[7], c2, c3, c1);
633	mul_add_c(a[1], b[6], c2, c3, c1);
634	mul_add_c(a[2], b[5], c2, c3, c1);
635	mul_add_c(a[3], b[4], c2, c3, c1);
636	mul_add_c(a[4], b[3], c2, c3, c1);
637	mul_add_c(a[5], b[2], c2, c3, c1);
638	mul_add_c(a[6], b[1], c2, c3, c1);
639	mul_add_c(a[7], b[0], c2, c3, c1);
640	r[7] = c2;
641	c2 = 0;
642	mul_add_c(a[7], b[1], c3, c1, c2);
643	mul_add_c(a[6], b[2], c3, c1, c2);
644	mul_add_c(a[5], b[3], c3, c1, c2);
645	mul_add_c(a[4], b[4], c3, c1, c2);
646	mul_add_c(a[3], b[5], c3, c1, c2);
647	mul_add_c(a[2], b[6], c3, c1, c2);
648	mul_add_c(a[1], b[7], c3, c1, c2);
649	r[8] = c3;
650	c3 = 0;
651	mul_add_c(a[2], b[7], c1, c2, c3);
652	mul_add_c(a[3], b[6], c1, c2, c3);
653	mul_add_c(a[4], b[5], c1, c2, c3);
654	mul_add_c(a[5], b[4], c1, c2, c3);
655	mul_add_c(a[6], b[3], c1, c2, c3);
656	mul_add_c(a[7], b[2], c1, c2, c3);
657	r[9] = c1;
658	c1 = 0;
659	mul_add_c(a[7], b[3], c2, c3, c1);
660	mul_add_c(a[6], b[4], c2, c3, c1);
661	mul_add_c(a[5], b[5], c2, c3, c1);
662	mul_add_c(a[4], b[6], c2, c3, c1);
663	mul_add_c(a[3], b[7], c2, c3, c1);
664	r[10] = c2;
665	c2 = 0;
666	mul_add_c(a[4], b[7], c3, c1, c2);
667	mul_add_c(a[5], b[6], c3, c1, c2);
668	mul_add_c(a[6], b[5], c3, c1, c2);
669	mul_add_c(a[7], b[4], c3, c1, c2);
670	r[11] = c3;
671	c3 = 0;
672	mul_add_c(a[7], b[5], c1, c2, c3);
673	mul_add_c(a[6], b[6], c1, c2, c3);
674	mul_add_c(a[5], b[7], c1, c2, c3);
675	r[12] = c1;
676	c1 = 0;
677	mul_add_c(a[6], b[7], c2, c3, c1);
678	mul_add_c(a[7], b[6], c2, c3, c1);
679	r[13] = c2;
680	c2 = 0;
681	mul_add_c(a[7], b[7], c3, c1, c2);
682	r[14] = c3;
683	r[15] = c1;
684	}
685
686	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
687	{
688	BN_ULONG c1, c2, c3;
689
690	c1 = 0;
691	c2 = 0;
692	c3 = 0;
693	mul_add_c(a[0], b[0], c1, c2, c3);
694	r[0] = c1;
695	c1 = 0;
696	mul_add_c(a[0], b[1], c2, c3, c1);
697	mul_add_c(a[1], b[0], c2, c3, c1);
698	r[1] = c2;
699	c2 = 0;
700	mul_add_c(a[2], b[0], c3, c1, c2);
701	mul_add_c(a[1], b[1], c3, c1, c2);
702	mul_add_c(a[0], b[2], c3, c1, c2);
703	r[2] = c3;
704	c3 = 0;
705	mul_add_c(a[0], b[3], c1, c2, c3);
706	mul_add_c(a[1], b[2], c1, c2, c3);
707	mul_add_c(a[2], b[1], c1, c2, c3);
708	mul_add_c(a[3], b[0], c1, c2, c3);
709	r[3] = c1;
710	c1 = 0;
711	mul_add_c(a[3], b[1], c2, c3, c1);
712	mul_add_c(a[2], b[2], c2, c3, c1);
713	mul_add_c(a[1], b[3], c2, c3, c1);
714	r[4] = c2;
715	c2 = 0;
716	mul_add_c(a[2], b[3], c3, c1, c2);
717	mul_add_c(a[3], b[2], c3, c1, c2);
718	r[5] = c3;
719	c3 = 0;
720	mul_add_c(a[3], b[3], c1, c2, c3);
721	r[6] = c1;
722	r[7] = c2;
723	}
724
725	void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
726	{
727	BN_ULONG c1, c2, c3;
728
729	c1 = 0;
730	c2 = 0;
731	c3 = 0;
732	sqr_add_c(a, 0, c1, c2, c3);
733	r[0] = c1;
734	c1 = 0;
735	sqr_add_c2(a, 1, 0, c2, c3, c1);
736	r[1] = c2;
737	c2 = 0;
738	sqr_add_c(a, 1, c3, c1, c2);
739	sqr_add_c2(a, 2, 0, c3, c1, c2);
740	r[2] = c3;
741	c3 = 0;
742	sqr_add_c2(a, 3, 0, c1, c2, c3);
743	sqr_add_c2(a, 2, 1, c1, c2, c3);
744	r[3] = c1;
745	c1 = 0;
746	sqr_add_c(a, 2, c2, c3, c1);
747	sqr_add_c2(a, 3, 1, c2, c3, c1);
748	sqr_add_c2(a, 4, 0, c2, c3, c1);
749	r[4] = c2;
750	c2 = 0;
751	sqr_add_c2(a, 5, 0, c3, c1, c2);
752	sqr_add_c2(a, 4, 1, c3, c1, c2);
753	sqr_add_c2(a, 3, 2, c3, c1, c2);
754	r[5] = c3;
755	c3 = 0;
756	sqr_add_c(a, 3, c1, c2, c3);
757	sqr_add_c2(a, 4, 2, c1, c2, c3);
758	sqr_add_c2(a, 5, 1, c1, c2, c3);
759	sqr_add_c2(a, 6, 0, c1, c2, c3);
760	r[6] = c1;
761	c1 = 0;
762	sqr_add_c2(a, 7, 0, c2, c3, c1);
763	sqr_add_c2(a, 6, 1, c2, c3, c1);
764	sqr_add_c2(a, 5, 2, c2, c3, c1);
765	sqr_add_c2(a, 4, 3, c2, c3, c1);
766	r[7] = c2;
767	c2 = 0;
768	sqr_add_c(a, 4, c3, c1, c2);
769	sqr_add_c2(a, 5, 3, c3, c1, c2);
770	sqr_add_c2(a, 6, 2, c3, c1, c2);
771	sqr_add_c2(a, 7, 1, c3, c1, c2);
772	r[8] = c3;
773	c3 = 0;
774	sqr_add_c2(a, 7, 2, c1, c2, c3);
775	sqr_add_c2(a, 6, 3, c1, c2, c3);
776	sqr_add_c2(a, 5, 4, c1, c2, c3);
777	r[9] = c1;
778	c1 = 0;
779	sqr_add_c(a, 5, c2, c3, c1);
780	sqr_add_c2(a, 6, 4, c2, c3, c1);
781	sqr_add_c2(a, 7, 3, c2, c3, c1);
782	r[10] = c2;
783	c2 = 0;
784	sqr_add_c2(a, 7, 4, c3, c1, c2);
785	sqr_add_c2(a, 6, 5, c3, c1, c2);
786	r[11] = c3;
787	c3 = 0;
788	sqr_add_c(a, 6, c1, c2, c3);
789	sqr_add_c2(a, 7, 5, c1, c2, c3);
790	r[12] = c1;
791	c1 = 0;
792	sqr_add_c2(a, 7, 6, c2, c3, c1);
793	r[13] = c2;
794	c2 = 0;
795	sqr_add_c(a, 7, c3, c1, c2);
796	r[14] = c3;
797	r[15] = c1;
798	}
799
800	void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
801	{
802	BN_ULONG c1, c2, c3;
803
804	c1 = 0;
805	c2 = 0;
806	c3 = 0;
807	sqr_add_c(a, 0, c1, c2, c3);
808	r[0] = c1;
809	c1 = 0;
810	sqr_add_c2(a, 1, 0, c2, c3, c1);
811	r[1] = c2;
812	c2 = 0;
813	sqr_add_c(a, 1, c3, c1, c2);
814	sqr_add_c2(a, 2, 0, c3, c1, c2);
815	r[2] = c3;
816	c3 = 0;
817	sqr_add_c2(a, 3, 0, c1, c2, c3);
818	sqr_add_c2(a, 2, 1, c1, c2, c3);
819	r[3] = c1;
820	c1 = 0;
821	sqr_add_c(a, 2, c2, c3, c1);
822	sqr_add_c2(a, 3, 1, c2, c3, c1);
823	r[4] = c2;
824	c2 = 0;
825	sqr_add_c2(a, 3, 2, c3, c1, c2);
826	r[5] = c3;
827	c3 = 0;
828	sqr_add_c(a, 3, c1, c2, c3);
829	r[6] = c1;
830	r[7] = c2;
831	}
832
833	# ifdef OPENSSL_NO_ASM
834	# ifdef OPENSSL_BN_ASM_MONT
835	# include <alloca.h>
836	/*
837	* This is essentially reference implementation, which may or may not
838	* result in performance improvement. E.g. on IA-32 this routine was
839	* observed to give 40% faster rsa1024 private key operations and 10%
840	* faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
841	* by 10% and worsens rsa4096 sign by 15%. Once again, it's a
842	* reference implementation, one to be used as starting point for
843	* platform-specific assembler. Mentioned numbers apply to compiler
844	* generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
845	* can vary not only from platform to platform, but even for compiler
846	* versions. Assembler vs. assembler improvement coefficients can
847	* [and are known to] differ and are to be documented elsewhere.
848	*/
849	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
850	const BN_ULONG np, const BN_ULONG n0p, int num)
851	{
852	BN_ULONG c0, c1, ml, *tp, n0;
853	# ifdef mul64
854	BN_ULONG mh;
855	# endif
856	volatile BN_ULONG *vp;
857	int i = 0, j;
858
859	# if 0 /* template for platform-specific
860	* implementation */
861	if (ap == bp)
862	return bn_sqr_mont(rp, ap, np, n0p, num);
863	# endif
864	vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
865
866	n0 = *n0p;
867
868	c0 = 0;
869	ml = bp[0];
870	# ifdef mul64
871	mh = HBITS(ml);
872	ml = LBITS(ml);
873	for (j = 0; j < num; ++j)
874	mul(tp[j], ap[j], ml, mh, c0);
875	# else
876	for (j = 0; j < num; ++j)
877	mul(tp[j], ap[j], ml, c0);
878	# endif
879
880	tp[num] = c0;
881	tp[num + 1] = 0;
882	goto enter;
883
884	for (i = 0; i < num; i++) {
885	c0 = 0;
886	ml = bp[i];
887	# ifdef mul64
888	mh = HBITS(ml);
889	ml = LBITS(ml);
890	for (j = 0; j < num; ++j)
891	mul_add(tp[j], ap[j], ml, mh, c0);
892	# else
893	for (j = 0; j < num; ++j)
894	mul_add(tp[j], ap[j], ml, c0);
895	# endif
896	c1 = (tp[num] + c0) & BN_MASK2;
897	tp[num] = c1;
898	tp[num + 1] = (c1 < c0 ? 1 : 0);
899	enter:
900	c1 = tp[0];
901	ml = (c1 * n0) & BN_MASK2;
902	c0 = 0;
903	# ifdef mul64
904	mh = HBITS(ml);
905	ml = LBITS(ml);
906	mul_add(c1, np[0], ml, mh, c0);
907	# else
908	mul_add(c1, ml, np[0], c0);
909	# endif
910	for (j = 1; j < num; j++) {
911	c1 = tp[j];
912	# ifdef mul64
913	mul_add(c1, np[j], ml, mh, c0);
914	# else
915	mul_add(c1, ml, np[j], c0);
916	# endif
917	tp[j - 1] = c1 & BN_MASK2;
918	}
919	c1 = (tp[num] + c0) & BN_MASK2;
920	tp[num - 1] = c1;
921	tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
922	}
923
924	if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
925	c0 = bn_sub_words(rp, tp, np, num);
926	if (tp[num] != 0 \|\| c0 == 0) {
927	for (i = 0; i < num + 2; i++)
928	vp[i] = 0;
929	return 1;
930	}
931	}
932	for (i = 0; i < num; i++)
933	rp[i] = tp[i], vp[i] = 0;
934	vp[num] = 0;
935	vp[num + 1] = 0;
936	return 1;
937	}
938	# else
939	/*
940	* Return value of 0 indicates that multiplication/convolution was not
941	* performed to signal the caller to fall down to alternative/original
942	* code-path.
943	*/
944	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
945	const BN_ULONG np, const BN_ULONG n0, int num)
946	{
947	return 0;
948	}
949	# endif /* OPENSSL_BN_ASM_MONT */
950	# endif
951
952	#else /* !BN_MUL_COMBA */
953
954	/* hmm... is it faster just to do a multiply? */
955	#ifndef ___openssl_mangling_h___ /* bird */
956	# undef bn_sqr_comba4
957	# undef bn_sqr_comba8
958	#endif
959	void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
960	{
961	BN_ULONG t[8];
962	bn_sqr_normal(r, a, 4, t);
963	}
964
965	void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
966	{
967	BN_ULONG t[16];
968	bn_sqr_normal(r, a, 8, t);
969	}
970
971	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
972	{
973	r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
974	r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
975	r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
976	r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
977	}
978
979	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
980	{
981	r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
982	r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
983	r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
984	r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
985	r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
986	r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
987	r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
988	r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
989	}
990
991	# ifdef OPENSSL_NO_ASM
992	# ifdef OPENSSL_BN_ASM_MONT
993	# include <alloca.h>
994	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
995	const BN_ULONG np, const BN_ULONG n0p, int num)
996	{
997	BN_ULONG c0, c1, tp, n0 = n0p;
998	volatile BN_ULONG *vp;
999	int i = 0, j;
1000
1001	vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1002
1003	for (i = 0; i <= num; i++)
1004	tp[i] = 0;
1005
1006	for (i = 0; i < num; i++) {
1007	c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1008	c1 = (tp[num] + c0) & BN_MASK2;
1009	tp[num] = c1;
1010	tp[num + 1] = (c1 < c0 ? 1 : 0);
1011
1012	c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1013	c1 = (tp[num] + c0) & BN_MASK2;
1014	tp[num] = c1;
1015	tp[num + 1] += (c1 < c0 ? 1 : 0);
1016	for (j = 0; j <= num; j++)
1017	tp[j] = tp[j + 1];
1018	}
1019
1020	if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
1021	c0 = bn_sub_words(rp, tp, np, num);
1022	if (tp[num] != 0 \|\| c0 == 0) {
1023	for (i = 0; i < num + 2; i++)
1024	vp[i] = 0;
1025	return 1;
1026	}
1027	}
1028	for (i = 0; i < num; i++)
1029	rp[i] = tp[i], vp[i] = 0;
1030	vp[num] = 0;
1031	vp[num + 1] = 0;
1032	return 1;
1033	}
1034	# else
1035	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
1036	const BN_ULONG np, const BN_ULONG n0, int num)
1037	{
1038	return 0;
1039	}
1040	# endif /* OPENSSL_BN_ASM_MONT */
1041	# endif
1042
1043	#endif /* !BN_MUL_COMBA */

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-3.0.3/crypto/bn/bn_asm.c@ 96662

Download in other formats: