ecp_nistp521.c@ 83531

Last change on this file since 83531 was 83531, checked in by vboxsync, 5 years ago
setting svn:sync-process=export for openssl-1.1.1f, all files except tests
File size: 71.9 KB

Line
1	/*
2	* Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
3	*
4	* Licensed under the OpenSSL license (the "License"). You may not use
5	* this file except in compliance with the License. You can obtain a copy
6	* in the file LICENSE in the source distribution or at
7	* https://www.openssl.org/source/license.html
8	*/
9
10	/* Copyright 2011 Google Inc.
11	*
12	* Licensed under the Apache License, Version 2.0 (the "License");
13	*
14	* you may not use this file except in compliance with the License.
15	* You may obtain a copy of the License at
16	*
17	* http://www.apache.org/licenses/LICENSE-2.0
18	*
19	* Unless required by applicable law or agreed to in writing, software
20	* distributed under the License is distributed on an "AS IS" BASIS,
21	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22	* See the License for the specific language governing permissions and
23	* limitations under the License.
24	*/
25
26	/*
27	* A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
28	*
29	* OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
30	* Otherwise based on Emilia's P224 work, which was inspired by my curve25519
31	* work which got its smarts from Daniel J. Bernstein's work on the same.
32	*/
33
34	#include <openssl/e_os2.h>
35	#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
36	NON_EMPTY_TRANSLATION_UNIT
37	#else
38
39	# include <string.h>
40	# include <openssl/err.h>
41	# include "ec_local.h"
42
43	# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
44	/* even with gcc, the typedef won't work for 32-bit platforms */
45	typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
46	* platforms */
47	# else
48	# error "Your compiler doesn't appear to support 128-bit integer types"
49	# endif
50
51	typedef uint8_t u8;
52	typedef uint64_t u64;
53
54	/*
55	* The underlying field. P521 operates over GF(2^521-1). We can serialise an
56	* element of this field into 66 bytes where the most significant byte
57	* contains only a single bit. We call this an felem_bytearray.
58	*/
59
60	typedef u8 felem_bytearray[66];
61
62	/*
63	* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64	* These values are big-endian.
65	*/
66	static const felem_bytearray nistp521_curve_params[5] = {
67	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
68	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75	0xff, 0xff},
76	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
77	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84	0xff, 0xfc},
85	{0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
86	0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
87	0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
88	0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
89	0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
90	0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
91	0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
92	0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93	0x3f, 0x00},
94	{0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
95	0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
96	0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
97	0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
98	0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
99	0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
100	0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
101	0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102	0xbd, 0x66},
103	{0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
104	0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
105	0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
106	0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
107	0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
108	0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
109	0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
110	0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
111	0x66, 0x50}
112	};
113
114	/*-
115	* The representation of field elements.
116	* ------------------------------------
117	*
118	* We represent field elements with nine values. These values are either 64 or
119	* 128 bits and the field element represented is:
120	* v[0]2^0 + v[1]2^58 + v[2]2^116 + ... + v[8]2^464 (mod p)
121	* Each of the nine values is called a 'limb'. Since the limbs are spaced only
122	* 58 bits apart, but are greater than 58 bits in length, the most significant
123	* bits of each limb overlap with the least significant bits of the next.
124	*
125	* A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126	* 'largefelem' */
127
128	# define NLIMBS 9
129
130	typedef uint64_t limb;
131	typedef limb felem[NLIMBS];
132	typedef uint128_t largefelem[NLIMBS];
133
134	static const limb bottom57bits = 0x1ffffffffffffff;
135	static const limb bottom58bits = 0x3ffffffffffffff;
136
137	/*
138	* bin66_to_felem takes a little-endian byte array and converts it into felem
139	* form. This assumes that the CPU is little-endian.
140	*/
141	static void bin66_to_felem(felem out, const u8 in[66])
142	{
143	out[0] = (((limb ) & in[0])) & bottom58bits;
144	out[1] = (((limb ) & in[7]) >> 2) & bottom58bits;
145	out[2] = (((limb ) & in[14]) >> 4) & bottom58bits;
146	out[3] = (((limb ) & in[21]) >> 6) & bottom58bits;
147	out[4] = (((limb ) & in[29])) & bottom58bits;
148	out[5] = (((limb ) & in[36]) >> 2) & bottom58bits;
149	out[6] = (((limb ) & in[43]) >> 4) & bottom58bits;
150	out[7] = (((limb ) & in[50]) >> 6) & bottom58bits;
151	out[8] = (((limb ) & in[58])) & bottom57bits;
152	}
153
154	/*
155	* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156	* array. This assumes that the CPU is little-endian.
157	*/
158	static void felem_to_bin66(u8 out[66], const felem in)
159	{
160	memset(out, 0, 66);
161	(((limb ) & out[0])) = in[0];
162	(((limb ) & out[7])) \|= in[1] << 2;
163	(((limb ) & out[14])) \|= in[2] << 4;
164	(((limb ) & out[21])) \|= in[3] << 6;
165	(((limb ) & out[29])) = in[4];
166	(((limb ) & out[36])) \|= in[5] << 2;
167	(((limb ) & out[43])) \|= in[6] << 4;
168	(((limb ) & out[50])) \|= in[7] << 6;
169	(((limb ) & out[58])) = in[8];
170	}
171
172	/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
173	static int BN_to_felem(felem out, const BIGNUM *bn)
174	{
175	felem_bytearray b_out;
176	int num_bytes;
177
178	if (BN_is_negative(bn)) {
179	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
180	return 0;
181	}
182	num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
183	if (num_bytes < 0) {
184	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
185	return 0;
186	}
187	bin66_to_felem(out, b_out);
188	return 1;
189	}
190
191	/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
192	static BIGNUM felem_to_BN(BIGNUM out, const felem in)
193	{
194	felem_bytearray b_out;
195	felem_to_bin66(b_out, in);
196	return BN_lebin2bn(b_out, sizeof(b_out), out);
197	}
198
199	/*-
200	* Field operations
201	* ----------------
202	*/
203
204	static void felem_one(felem out)
205	{
206	out[0] = 1;
207	out[1] = 0;
208	out[2] = 0;
209	out[3] = 0;
210	out[4] = 0;
211	out[5] = 0;
212	out[6] = 0;
213	out[7] = 0;
214	out[8] = 0;
215	}
216
217	static void felem_assign(felem out, const felem in)
218	{
219	out[0] = in[0];
220	out[1] = in[1];
221	out[2] = in[2];
222	out[3] = in[3];
223	out[4] = in[4];
224	out[5] = in[5];
225	out[6] = in[6];
226	out[7] = in[7];
227	out[8] = in[8];
228	}
229
230	/* felem_sum64 sets out = out + in. */
231	static void felem_sum64(felem out, const felem in)
232	{
233	out[0] += in[0];
234	out[1] += in[1];
235	out[2] += in[2];
236	out[3] += in[3];
237	out[4] += in[4];
238	out[5] += in[5];
239	out[6] += in[6];
240	out[7] += in[7];
241	out[8] += in[8];
242	}
243
244	/* felem_scalar sets out = in * scalar */
245	static void felem_scalar(felem out, const felem in, limb scalar)
246	{
247	out[0] = in[0] * scalar;
248	out[1] = in[1] * scalar;
249	out[2] = in[2] * scalar;
250	out[3] = in[3] * scalar;
251	out[4] = in[4] * scalar;
252	out[5] = in[5] * scalar;
253	out[6] = in[6] * scalar;
254	out[7] = in[7] * scalar;
255	out[8] = in[8] * scalar;
256	}
257
258	/* felem_scalar64 sets out = out * scalar */
259	static void felem_scalar64(felem out, limb scalar)
260	{
261	out[0] *= scalar;
262	out[1] *= scalar;
263	out[2] *= scalar;
264	out[3] *= scalar;
265	out[4] *= scalar;
266	out[5] *= scalar;
267	out[6] *= scalar;
268	out[7] *= scalar;
269	out[8] *= scalar;
270	}
271
272	/* felem_scalar128 sets out = out * scalar */
273	static void felem_scalar128(largefelem out, limb scalar)
274	{
275	out[0] *= scalar;
276	out[1] *= scalar;
277	out[2] *= scalar;
278	out[3] *= scalar;
279	out[4] *= scalar;
280	out[5] *= scalar;
281	out[6] *= scalar;
282	out[7] *= scalar;
283	out[8] *= scalar;
284	}
285
286	/*-
287	* felem_neg sets \|out\| to \|-in\|
288	* On entry:
289	* in[i] < 2^59 + 2^14
290	* On exit:
291	* out[i] < 2^62
292	*/
293	static void felem_neg(felem out, const felem in)
294	{
295	/* In order to prevent underflow, we subtract from 0 mod p. */
296	static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
297	static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
298
299	out[0] = two62m3 - in[0];
300	out[1] = two62m2 - in[1];
301	out[2] = two62m2 - in[2];
302	out[3] = two62m2 - in[3];
303	out[4] = two62m2 - in[4];
304	out[5] = two62m2 - in[5];
305	out[6] = two62m2 - in[6];
306	out[7] = two62m2 - in[7];
307	out[8] = two62m2 - in[8];
308	}
309
310	/*-
311	* felem_diff64 subtracts \|in\| from \|out\|
312	* On entry:
313	* in[i] < 2^59 + 2^14
314	* On exit:
315	* out[i] < out[i] + 2^62
316	*/
317	static void felem_diff64(felem out, const felem in)
318	{
319	/*
320	* In order to prevent underflow, we add 0 mod p before subtracting.
321	*/
322	static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
323	static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
324
325	out[0] += two62m3 - in[0];
326	out[1] += two62m2 - in[1];
327	out[2] += two62m2 - in[2];
328	out[3] += two62m2 - in[3];
329	out[4] += two62m2 - in[4];
330	out[5] += two62m2 - in[5];
331	out[6] += two62m2 - in[6];
332	out[7] += two62m2 - in[7];
333	out[8] += two62m2 - in[8];
334	}
335
336	/*-
337	* felem_diff_128_64 subtracts \|in\| from \|out\|
338	* On entry:
339	* in[i] < 2^62 + 2^17
340	* On exit:
341	* out[i] < out[i] + 2^63
342	*/
343	static void felem_diff_128_64(largefelem out, const felem in)
344	{
345	/*
346	* In order to prevent underflow, we add 64p mod p (which is equivalent
347	* to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
348	* digit number with all bits set to 1. See "The representation of field
349	* elements" comment above for a description of how limbs are used to
350	* represent a number. 64p is represented with 8 limbs containing a number
351	* with 58 bits set and one limb with a number with 57 bits set.
352	*/
353	static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
354	static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
355
356	out[0] += two63m6 - in[0];
357	out[1] += two63m5 - in[1];
358	out[2] += two63m5 - in[2];
359	out[3] += two63m5 - in[3];
360	out[4] += two63m5 - in[4];
361	out[5] += two63m5 - in[5];
362	out[6] += two63m5 - in[6];
363	out[7] += two63m5 - in[7];
364	out[8] += two63m5 - in[8];
365	}
366
367	/*-
368	* felem_diff_128_64 subtracts \|in\| from \|out\|
369	* On entry:
370	* in[i] < 2^126
371	* On exit:
372	* out[i] < out[i] + 2^127 - 2^69
373	*/
374	static void felem_diff128(largefelem out, const largefelem in)
375	{
376	/*
377	* In order to prevent underflow, we add 0 mod p before subtracting.
378	*/
379	static const uint128_t two127m70 =
380	(((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
381	static const uint128_t two127m69 =
382	(((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
383
384	out[0] += (two127m70 - in[0]);
385	out[1] += (two127m69 - in[1]);
386	out[2] += (two127m69 - in[2]);
387	out[3] += (two127m69 - in[3]);
388	out[4] += (two127m69 - in[4]);
389	out[5] += (two127m69 - in[5]);
390	out[6] += (two127m69 - in[6]);
391	out[7] += (two127m69 - in[7]);
392	out[8] += (two127m69 - in[8]);
393	}
394
395	/*-
396	* felem_square sets \|out\| = \|in\|^2
397	* On entry:
398	* in[i] < 2^62
399	* On exit:
400	* out[i] < 17 * max(in[i]) * max(in[i])
401	*/
402	static void felem_square(largefelem out, const felem in)
403	{
404	felem inx2, inx4;
405	felem_scalar(inx2, in, 2);
406	felem_scalar(inx4, in, 4);
407
408	/*-
409	* We have many cases were we want to do
410	* in[x] * in[y] +
411	* in[y] * in[x]
412	* This is obviously just
413	* 2 * in[x] * in[y]
414	* However, rather than do the doubling on the 128 bit result, we
415	* double one of the inputs to the multiplication by reading from
416	* \|inx2\|
417	*/
418
419	out[0] = ((uint128_t) in[0]) * in[0];
420	out[1] = ((uint128_t) in[0]) * inx2[1];
421	out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
422	out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
423	out[4] = ((uint128_t) in[0]) * inx2[4] +
424	((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
425	out[5] = ((uint128_t) in[0]) * inx2[5] +
426	((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
427	out[6] = ((uint128_t) in[0]) * inx2[6] +
428	((uint128_t) in[1]) * inx2[5] +
429	((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
430	out[7] = ((uint128_t) in[0]) * inx2[7] +
431	((uint128_t) in[1]) * inx2[6] +
432	((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
433	out[8] = ((uint128_t) in[0]) * inx2[8] +
434	((uint128_t) in[1]) * inx2[7] +
435	((uint128_t) in[2]) * inx2[6] +
436	((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
437
438	/*
439	* The remaining limbs fall above 2^521, with the first falling at 2^522.
440	* They correspond to locations one bit up from the limbs produced above
441	* so we would have to multiply by two to align them. Again, rather than
442	* operate on the 128-bit result, we double one of the inputs to the
443	* multiplication. If we want to double for both this reason, and the
444	* reason above, then we end up multiplying by four.
445	*/
446
447	/* 9 */
448	out[0] += ((uint128_t) in[1]) * inx4[8] +
449	((uint128_t) in[2]) * inx4[7] +
450	((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
451
452	/* 10 */
453	out[1] += ((uint128_t) in[2]) * inx4[8] +
454	((uint128_t) in[3]) * inx4[7] +
455	((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
456
457	/* 11 */
458	out[2] += ((uint128_t) in[3]) * inx4[8] +
459	((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
460
461	/* 12 */
462	out[3] += ((uint128_t) in[4]) * inx4[8] +
463	((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
464
465	/* 13 */
466	out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
467
468	/* 14 */
469	out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
470
471	/* 15 */
472	out[6] += ((uint128_t) in[7]) * inx4[8];
473
474	/* 16 */
475	out[7] += ((uint128_t) in[8]) * inx2[8];
476	}
477
478	/*-
479	* felem_mul sets \|out\| = \|in1\| * \|in2\|
480	* On entry:
481	* in1[i] < 2^64
482	* in2[i] < 2^63
483	* On exit:
484	* out[i] < 17 * max(in1[i]) * max(in2[i])
485	*/
486	static void felem_mul(largefelem out, const felem in1, const felem in2)
487	{
488	felem in2x2;
489	felem_scalar(in2x2, in2, 2);
490
491	out[0] = ((uint128_t) in1[0]) * in2[0];
492
493	out[1] = ((uint128_t) in1[0]) * in2[1] +
494	((uint128_t) in1[1]) * in2[0];
495
496	out[2] = ((uint128_t) in1[0]) * in2[2] +
497	((uint128_t) in1[1]) * in2[1] +
498	((uint128_t) in1[2]) * in2[0];
499
500	out[3] = ((uint128_t) in1[0]) * in2[3] +
501	((uint128_t) in1[1]) * in2[2] +
502	((uint128_t) in1[2]) * in2[1] +
503	((uint128_t) in1[3]) * in2[0];
504
505	out[4] = ((uint128_t) in1[0]) * in2[4] +
506	((uint128_t) in1[1]) * in2[3] +
507	((uint128_t) in1[2]) * in2[2] +
508	((uint128_t) in1[3]) * in2[1] +
509	((uint128_t) in1[4]) * in2[0];
510
511	out[5] = ((uint128_t) in1[0]) * in2[5] +
512	((uint128_t) in1[1]) * in2[4] +
513	((uint128_t) in1[2]) * in2[3] +
514	((uint128_t) in1[3]) * in2[2] +
515	((uint128_t) in1[4]) * in2[1] +
516	((uint128_t) in1[5]) * in2[0];
517
518	out[6] = ((uint128_t) in1[0]) * in2[6] +
519	((uint128_t) in1[1]) * in2[5] +
520	((uint128_t) in1[2]) * in2[4] +
521	((uint128_t) in1[3]) * in2[3] +
522	((uint128_t) in1[4]) * in2[2] +
523	((uint128_t) in1[5]) * in2[1] +
524	((uint128_t) in1[6]) * in2[0];
525
526	out[7] = ((uint128_t) in1[0]) * in2[7] +
527	((uint128_t) in1[1]) * in2[6] +
528	((uint128_t) in1[2]) * in2[5] +
529	((uint128_t) in1[3]) * in2[4] +
530	((uint128_t) in1[4]) * in2[3] +
531	((uint128_t) in1[5]) * in2[2] +
532	((uint128_t) in1[6]) * in2[1] +
533	((uint128_t) in1[7]) * in2[0];
534
535	out[8] = ((uint128_t) in1[0]) * in2[8] +
536	((uint128_t) in1[1]) * in2[7] +
537	((uint128_t) in1[2]) * in2[6] +
538	((uint128_t) in1[3]) * in2[5] +
539	((uint128_t) in1[4]) * in2[4] +
540	((uint128_t) in1[5]) * in2[3] +
541	((uint128_t) in1[6]) * in2[2] +
542	((uint128_t) in1[7]) * in2[1] +
543	((uint128_t) in1[8]) * in2[0];
544
545	/* See comment in felem_square about the use of in2x2 here */
546
547	out[0] += ((uint128_t) in1[1]) * in2x2[8] +
548	((uint128_t) in1[2]) * in2x2[7] +
549	((uint128_t) in1[3]) * in2x2[6] +
550	((uint128_t) in1[4]) * in2x2[5] +
551	((uint128_t) in1[5]) * in2x2[4] +
552	((uint128_t) in1[6]) * in2x2[3] +
553	((uint128_t) in1[7]) * in2x2[2] +
554	((uint128_t) in1[8]) * in2x2[1];
555
556	out[1] += ((uint128_t) in1[2]) * in2x2[8] +
557	((uint128_t) in1[3]) * in2x2[7] +
558	((uint128_t) in1[4]) * in2x2[6] +
559	((uint128_t) in1[5]) * in2x2[5] +
560	((uint128_t) in1[6]) * in2x2[4] +
561	((uint128_t) in1[7]) * in2x2[3] +
562	((uint128_t) in1[8]) * in2x2[2];
563
564	out[2] += ((uint128_t) in1[3]) * in2x2[8] +
565	((uint128_t) in1[4]) * in2x2[7] +
566	((uint128_t) in1[5]) * in2x2[6] +
567	((uint128_t) in1[6]) * in2x2[5] +
568	((uint128_t) in1[7]) * in2x2[4] +
569	((uint128_t) in1[8]) * in2x2[3];
570
571	out[3] += ((uint128_t) in1[4]) * in2x2[8] +
572	((uint128_t) in1[5]) * in2x2[7] +
573	((uint128_t) in1[6]) * in2x2[6] +
574	((uint128_t) in1[7]) * in2x2[5] +
575	((uint128_t) in1[8]) * in2x2[4];
576
577	out[4] += ((uint128_t) in1[5]) * in2x2[8] +
578	((uint128_t) in1[6]) * in2x2[7] +
579	((uint128_t) in1[7]) * in2x2[6] +
580	((uint128_t) in1[8]) * in2x2[5];
581
582	out[5] += ((uint128_t) in1[6]) * in2x2[8] +
583	((uint128_t) in1[7]) * in2x2[7] +
584	((uint128_t) in1[8]) * in2x2[6];
585
586	out[6] += ((uint128_t) in1[7]) * in2x2[8] +
587	((uint128_t) in1[8]) * in2x2[7];
588
589	out[7] += ((uint128_t) in1[8]) * in2x2[8];
590	}
591
592	static const limb bottom52bits = 0xfffffffffffff;
593
594	/*-
595	* felem_reduce converts a largefelem to an felem.
596	* On entry:
597	* in[i] < 2^128
598	* On exit:
599	* out[i] < 2^59 + 2^14
600	*/
601	static void felem_reduce(felem out, const largefelem in)
602	{
603	u64 overflow1, overflow2;
604
605	out[0] = ((limb) in[0]) & bottom58bits;
606	out[1] = ((limb) in[1]) & bottom58bits;
607	out[2] = ((limb) in[2]) & bottom58bits;
608	out[3] = ((limb) in[3]) & bottom58bits;
609	out[4] = ((limb) in[4]) & bottom58bits;
610	out[5] = ((limb) in[5]) & bottom58bits;
611	out[6] = ((limb) in[6]) & bottom58bits;
612	out[7] = ((limb) in[7]) & bottom58bits;
613	out[8] = ((limb) in[8]) & bottom58bits;
614
615	/* out[i] < 2^58 */
616
617	out[1] += ((limb) in[0]) >> 58;
618	out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
619	/*-
620	* out[1] < 2^58 + 2^6 + 2^58
621	* = 2^59 + 2^6
622	*/
623	out[2] += ((limb) (in[0] >> 64)) >> 52;
624
625	out[2] += ((limb) in[1]) >> 58;
626	out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
627	out[3] += ((limb) (in[1] >> 64)) >> 52;
628
629	out[3] += ((limb) in[2]) >> 58;
630	out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
631	out[4] += ((limb) (in[2] >> 64)) >> 52;
632
633	out[4] += ((limb) in[3]) >> 58;
634	out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
635	out[5] += ((limb) (in[3] >> 64)) >> 52;
636
637	out[5] += ((limb) in[4]) >> 58;
638	out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
639	out[6] += ((limb) (in[4] >> 64)) >> 52;
640
641	out[6] += ((limb) in[5]) >> 58;
642	out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
643	out[7] += ((limb) (in[5] >> 64)) >> 52;
644
645	out[7] += ((limb) in[6]) >> 58;
646	out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
647	out[8] += ((limb) (in[6] >> 64)) >> 52;
648
649	out[8] += ((limb) in[7]) >> 58;
650	out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
651	/*-
652	* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
653	* < 2^59 + 2^13
654	*/
655	overflow1 = ((limb) (in[7] >> 64)) >> 52;
656
657	overflow1 += ((limb) in[8]) >> 58;
658	overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
659	overflow2 = ((limb) (in[8] >> 64)) >> 52;
660
661	overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
662	overflow2 <<= 1; /* overflow2 < 2^13 */
663
664	out[0] += overflow1; /* out[0] < 2^60 */
665	out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */
666
667	out[1] += out[0] >> 58;
668	out[0] &= bottom58bits;
669	/*-
670	* out[0] < 2^58
671	* out[1] < 2^59 + 2^6 + 2^13 + 2^2
672	* < 2^59 + 2^14
673	*/
674	}
675
676	static void felem_square_reduce(felem out, const felem in)
677	{
678	largefelem tmp;
679	felem_square(tmp, in);
680	felem_reduce(out, tmp);
681	}
682
683	static void felem_mul_reduce(felem out, const felem in1, const felem in2)
684	{
685	largefelem tmp;
686	felem_mul(tmp, in1, in2);
687	felem_reduce(out, tmp);
688	}
689
690	/*-
691	* felem_inv calculates \|out\| = \|in\|^{-1}
692	*
693	* Based on Fermat's Little Theorem:
694	* a^p = a (mod p)
695	* a^{p-1} = 1 (mod p)
696	* a^{p-2} = a^{-1} (mod p)
697	*/
698	static void felem_inv(felem out, const felem in)
699	{
700	felem ftmp, ftmp2, ftmp3, ftmp4;
701	largefelem tmp;
702	unsigned i;
703
704	felem_square(tmp, in);
705	felem_reduce(ftmp, tmp); /* 2^1 */
706	felem_mul(tmp, in, ftmp);
707	felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
708	felem_assign(ftmp2, ftmp);
709	felem_square(tmp, ftmp);
710	felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
711	felem_mul(tmp, in, ftmp);
712	felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */
713	felem_square(tmp, ftmp);
714	felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */
715
716	felem_square(tmp, ftmp2);
717	felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
718	felem_square(tmp, ftmp3);
719	felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
720	felem_mul(tmp, ftmp3, ftmp2);
721	felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
722
723	felem_assign(ftmp2, ftmp3);
724	felem_square(tmp, ftmp3);
725	felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
726	felem_square(tmp, ftmp3);
727	felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
728	felem_square(tmp, ftmp3);
729	felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
730	felem_square(tmp, ftmp3);
731	felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
732	felem_assign(ftmp4, ftmp3);
733	felem_mul(tmp, ftmp3, ftmp);
734	felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
735	felem_square(tmp, ftmp4);
736	felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
737	felem_mul(tmp, ftmp3, ftmp2);
738	felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
739	felem_assign(ftmp2, ftmp3);
740
741	for (i = 0; i < 8; i++) {
742	felem_square(tmp, ftmp3);
743	felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
744	}
745	felem_mul(tmp, ftmp3, ftmp2);
746	felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
747	felem_assign(ftmp2, ftmp3);
748
749	for (i = 0; i < 16; i++) {
750	felem_square(tmp, ftmp3);
751	felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
752	}
753	felem_mul(tmp, ftmp3, ftmp2);
754	felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
755	felem_assign(ftmp2, ftmp3);
756
757	for (i = 0; i < 32; i++) {
758	felem_square(tmp, ftmp3);
759	felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
760	}
761	felem_mul(tmp, ftmp3, ftmp2);
762	felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
763	felem_assign(ftmp2, ftmp3);
764
765	for (i = 0; i < 64; i++) {
766	felem_square(tmp, ftmp3);
767	felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
768	}
769	felem_mul(tmp, ftmp3, ftmp2);
770	felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
771	felem_assign(ftmp2, ftmp3);
772
773	for (i = 0; i < 128; i++) {
774	felem_square(tmp, ftmp3);
775	felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
776	}
777	felem_mul(tmp, ftmp3, ftmp2);
778	felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
779	felem_assign(ftmp2, ftmp3);
780
781	for (i = 0; i < 256; i++) {
782	felem_square(tmp, ftmp3);
783	felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
784	}
785	felem_mul(tmp, ftmp3, ftmp2);
786	felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
787
788	for (i = 0; i < 9; i++) {
789	felem_square(tmp, ftmp3);
790	felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
791	}
792	felem_mul(tmp, ftmp3, ftmp4);
793	felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */
794	felem_mul(tmp, ftmp3, in);
795	felem_reduce(out, tmp); /* 2^512 - 3 */
796	}
797
798	/* This is 2^521-1, expressed as an felem */
799	static const felem kPrime = {
800	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
801	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
802	0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
803	};
804
805	/*-
806	* felem_is_zero returns a limb with all bits set if \|in\| == 0 (mod p) and 0
807	* otherwise.
808	* On entry:
809	* in[i] < 2^59 + 2^14
810	*/
811	static limb felem_is_zero(const felem in)
812	{
813	felem ftmp;
814	limb is_zero, is_p;
815	felem_assign(ftmp, in);
816
817	ftmp[0] += ftmp[8] >> 57;
818	ftmp[8] &= bottom57bits;
819	/* ftmp[8] < 2^57 */
820	ftmp[1] += ftmp[0] >> 58;
821	ftmp[0] &= bottom58bits;
822	ftmp[2] += ftmp[1] >> 58;
823	ftmp[1] &= bottom58bits;
824	ftmp[3] += ftmp[2] >> 58;
825	ftmp[2] &= bottom58bits;
826	ftmp[4] += ftmp[3] >> 58;
827	ftmp[3] &= bottom58bits;
828	ftmp[5] += ftmp[4] >> 58;
829	ftmp[4] &= bottom58bits;
830	ftmp[6] += ftmp[5] >> 58;
831	ftmp[5] &= bottom58bits;
832	ftmp[7] += ftmp[6] >> 58;
833	ftmp[6] &= bottom58bits;
834	ftmp[8] += ftmp[7] >> 58;
835	ftmp[7] &= bottom58bits;
836	/* ftmp[8] < 2^57 + 4 */
837
838	/*
839	* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
840	* than our bound for ftmp[8]. Therefore we only have to check if the
841	* zero is zero or 2^521-1.
842	*/
843
844	is_zero = 0;
845	is_zero \|= ftmp[0];
846	is_zero \|= ftmp[1];
847	is_zero \|= ftmp[2];
848	is_zero \|= ftmp[3];
849	is_zero \|= ftmp[4];
850	is_zero \|= ftmp[5];
851	is_zero \|= ftmp[6];
852	is_zero \|= ftmp[7];
853	is_zero \|= ftmp[8];
854
855	is_zero--;
856	/*
857	* We know that ftmp[i] < 2^63, therefore the only way that the top bit
858	* can be set is if is_zero was 0 before the decrement.
859	*/
860	is_zero = 0 - (is_zero >> 63);
861
862	is_p = ftmp[0] ^ kPrime[0];
863	is_p \|= ftmp[1] ^ kPrime[1];
864	is_p \|= ftmp[2] ^ kPrime[2];
865	is_p \|= ftmp[3] ^ kPrime[3];
866	is_p \|= ftmp[4] ^ kPrime[4];
867	is_p \|= ftmp[5] ^ kPrime[5];
868	is_p \|= ftmp[6] ^ kPrime[6];
869	is_p \|= ftmp[7] ^ kPrime[7];
870	is_p \|= ftmp[8] ^ kPrime[8];
871
872	is_p--;
873	is_p = 0 - (is_p >> 63);
874
875	is_zero \|= is_p;
876	return is_zero;
877	}
878
879	static int felem_is_zero_int(const void *in)
880	{
881	return (int)(felem_is_zero(in) & ((limb) 1));
882	}
883
884	/*-
885	* felem_contract converts \|in\| to its unique, minimal representation.
886	* On entry:
887	* in[i] < 2^59 + 2^14
888	*/
889	static void felem_contract(felem out, const felem in)
890	{
891	limb is_p, is_greater, sign;
892	static const limb two58 = ((limb) 1) << 58;
893
894	felem_assign(out, in);
895
896	out[0] += out[8] >> 57;
897	out[8] &= bottom57bits;
898	/* out[8] < 2^57 */
899	out[1] += out[0] >> 58;
900	out[0] &= bottom58bits;
901	out[2] += out[1] >> 58;
902	out[1] &= bottom58bits;
903	out[3] += out[2] >> 58;
904	out[2] &= bottom58bits;
905	out[4] += out[3] >> 58;
906	out[3] &= bottom58bits;
907	out[5] += out[4] >> 58;
908	out[4] &= bottom58bits;
909	out[6] += out[5] >> 58;
910	out[5] &= bottom58bits;
911	out[7] += out[6] >> 58;
912	out[6] &= bottom58bits;
913	out[8] += out[7] >> 58;
914	out[7] &= bottom58bits;
915	/* out[8] < 2^57 + 4 */
916
917	/*
918	* If the value is greater than 2^521-1 then we have to subtract 2^521-1
919	* out. See the comments in felem_is_zero regarding why we don't test for
920	* other multiples of the prime.
921	*/
922
923	/*
924	* First, if \|out\| is equal to 2^521-1, we subtract it out to get zero.
925	*/
926
927	is_p = out[0] ^ kPrime[0];
928	is_p \|= out[1] ^ kPrime[1];
929	is_p \|= out[2] ^ kPrime[2];
930	is_p \|= out[3] ^ kPrime[3];
931	is_p \|= out[4] ^ kPrime[4];
932	is_p \|= out[5] ^ kPrime[5];
933	is_p \|= out[6] ^ kPrime[6];
934	is_p \|= out[7] ^ kPrime[7];
935	is_p \|= out[8] ^ kPrime[8];
936
937	is_p--;
938	is_p &= is_p << 32;
939	is_p &= is_p << 16;
940	is_p &= is_p << 8;
941	is_p &= is_p << 4;
942	is_p &= is_p << 2;
943	is_p &= is_p << 1;
944	is_p = 0 - (is_p >> 63);
945	is_p = ~is_p;
946
947	/* is_p is 0 iff \|out\| == 2^521-1 and all ones otherwise */
948
949	out[0] &= is_p;
950	out[1] &= is_p;
951	out[2] &= is_p;
952	out[3] &= is_p;
953	out[4] &= is_p;
954	out[5] &= is_p;
955	out[6] &= is_p;
956	out[7] &= is_p;
957	out[8] &= is_p;
958
959	/*
960	* In order to test that \|out\| >= 2^521-1 we need only test if out[8] >>
961	* 57 is greater than zero as (2^521-1) + x >= 2^522
962	*/
963	is_greater = out[8] >> 57;
964	is_greater \|= is_greater << 32;
965	is_greater \|= is_greater << 16;
966	is_greater \|= is_greater << 8;
967	is_greater \|= is_greater << 4;
968	is_greater \|= is_greater << 2;
969	is_greater \|= is_greater << 1;
970	is_greater = 0 - (is_greater >> 63);
971
972	out[0] -= kPrime[0] & is_greater;
973	out[1] -= kPrime[1] & is_greater;
974	out[2] -= kPrime[2] & is_greater;
975	out[3] -= kPrime[3] & is_greater;
976	out[4] -= kPrime[4] & is_greater;
977	out[5] -= kPrime[5] & is_greater;
978	out[6] -= kPrime[6] & is_greater;
979	out[7] -= kPrime[7] & is_greater;
980	out[8] -= kPrime[8] & is_greater;
981
982	/* Eliminate negative coefficients */
983	sign = -(out[0] >> 63);
984	out[0] += (two58 & sign);
985	out[1] -= (1 & sign);
986	sign = -(out[1] >> 63);
987	out[1] += (two58 & sign);
988	out[2] -= (1 & sign);
989	sign = -(out[2] >> 63);
990	out[2] += (two58 & sign);
991	out[3] -= (1 & sign);
992	sign = -(out[3] >> 63);
993	out[3] += (two58 & sign);
994	out[4] -= (1 & sign);
995	sign = -(out[4] >> 63);
996	out[4] += (two58 & sign);
997	out[5] -= (1 & sign);
998	sign = -(out[0] >> 63);
999	out[5] += (two58 & sign);
1000	out[6] -= (1 & sign);
1001	sign = -(out[6] >> 63);
1002	out[6] += (two58 & sign);
1003	out[7] -= (1 & sign);
1004	sign = -(out[7] >> 63);
1005	out[7] += (two58 & sign);
1006	out[8] -= (1 & sign);
1007	sign = -(out[5] >> 63);
1008	out[5] += (two58 & sign);
1009	out[6] -= (1 & sign);
1010	sign = -(out[6] >> 63);
1011	out[6] += (two58 & sign);
1012	out[7] -= (1 & sign);
1013	sign = -(out[7] >> 63);
1014	out[7] += (two58 & sign);
1015	out[8] -= (1 & sign);
1016	}
1017
1018	/*-
1019	* Group operations
1020	* ----------------
1021	*
1022	* Building on top of the field operations we have the operations on the
1023	* elliptic curve group itself. Points on the curve are represented in Jacobian
1024	* coordinates */
1025
1026	/*-
1027	* point_double calculates 2*(x_in, y_in, z_in)
1028	*
1029	* The method is taken from:
1030	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1031	*
1032	* Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1033	* while x_out == y_in is not (maybe this works, but it's not tested). */
1034	static void
1035	point_double(felem x_out, felem y_out, felem z_out,
1036	const felem x_in, const felem y_in, const felem z_in)
1037	{
1038	largefelem tmp, tmp2;
1039	felem delta, gamma, beta, alpha, ftmp, ftmp2;
1040
1041	felem_assign(ftmp, x_in);
1042	felem_assign(ftmp2, x_in);
1043
1044	/* delta = z^2 */
1045	felem_square(tmp, z_in);
1046	felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
1047
1048	/* gamma = y^2 */
1049	felem_square(tmp, y_in);
1050	felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
1051
1052	/* beta = xgamma /
1053	felem_mul(tmp, x_in, gamma);
1054	felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */
1055
1056	/* alpha = 3(x-delta)(x+delta) */
1057	felem_diff64(ftmp, delta);
1058	/* ftmp[i] < 2^61 */
1059	felem_sum64(ftmp2, delta);
1060	/* ftmp2[i] < 2^60 + 2^15 */
1061	felem_scalar64(ftmp2, 3);
1062	/* ftmp2[i] < 32^60 + 32^15 */
1063	felem_mul(tmp, ftmp, ftmp2);
1064	/*-
1065	* tmp[i] < 17(32^121 + 32^76)
1066	* = 612^121 + 612^76
1067	* < 642^121 + 642^76
1068	* = 2^127 + 2^82
1069	* < 2^128
1070	*/
1071	felem_reduce(alpha, tmp);
1072
1073	/* x' = alpha^2 - 8beta /
1074	felem_square(tmp, alpha);
1075	/*
1076	* tmp[i] < 17*2^120 < 2^125
1077	*/
1078	felem_assign(ftmp, beta);
1079	felem_scalar64(ftmp, 8);
1080	/* ftmp[i] < 2^62 + 2^17 */
1081	felem_diff_128_64(tmp, ftmp);
1082	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1083	felem_reduce(x_out, tmp);
1084
1085	/* z' = (y + z)^2 - gamma - delta */
1086	felem_sum64(delta, gamma);
1087	/* delta[i] < 2^60 + 2^15 */
1088	felem_assign(ftmp, y_in);
1089	felem_sum64(ftmp, z_in);
1090	/* ftmp[i] < 2^60 + 2^15 */
1091	felem_square(tmp, ftmp);
1092	/*
1093	* tmp[i] < 17(2^122) < 2^127
1094	*/
1095	felem_diff_128_64(tmp, delta);
1096	/* tmp[i] < 2^127 + 2^63 */
1097	felem_reduce(z_out, tmp);
1098
1099	/* y' = alpha(4beta - x') - 8gamma^2 /
1100	felem_scalar64(beta, 4);
1101	/* beta[i] < 2^61 + 2^16 */
1102	felem_diff64(beta, x_out);
1103	/* beta[i] < 2^61 + 2^60 + 2^16 */
1104	felem_mul(tmp, alpha, beta);
1105	/*-
1106	* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1107	* = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1108	* = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1109	* < 2^128
1110	*/
1111	felem_square(tmp2, gamma);
1112	/*-
1113	* tmp2[i] < 17*(2^59 + 2^14)^2
1114	* = 17*(2^118 + 2^74 + 2^28)
1115	*/
1116	felem_scalar128(tmp2, 8);
1117	/*-
1118	* tmp2[i] < 817(2^118 + 2^74 + 2^28)
1119	* = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1120	* < 2^126
1121	*/
1122	felem_diff128(tmp, tmp2);
1123	/*-
1124	* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1125	* = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1126	* 2^74 + 2^69 + 2^34 + 2^30
1127	* < 2^128
1128	*/
1129	felem_reduce(y_out, tmp);
1130	}
1131
1132	/* copy_conditional copies in to out iff mask is all ones. */
1133	static void copy_conditional(felem out, const felem in, limb mask)
1134	{
1135	unsigned i;
1136	for (i = 0; i < NLIMBS; ++i) {
1137	const limb tmp = mask & (in[i] ^ out[i]);
1138	out[i] ^= tmp;
1139	}
1140	}
1141
1142	/*-
1143	* point_add calculates (x1, y1, z1) + (x2, y2, z2)
1144	*
1145	* The method is taken from
1146	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1147	* adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1148	*
1149	* This function includes a branch for checking whether the two input points
1150	* are equal (while not equal to the point at infinity). See comment below
1151	* on constant-time.
1152	*/
1153	static void point_add(felem x3, felem y3, felem z3,
1154	const felem x1, const felem y1, const felem z1,
1155	const int mixed, const felem x2, const felem y2,
1156	const felem z2)
1157	{
1158	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1159	largefelem tmp, tmp2;
1160	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1161	limb points_equal;
1162
1163	z1_is_zero = felem_is_zero(z1);
1164	z2_is_zero = felem_is_zero(z2);
1165
1166	/* ftmp = z1z1 = z1*2 /
1167	felem_square(tmp, z1);
1168	felem_reduce(ftmp, tmp);
1169
1170	if (!mixed) {
1171	/* ftmp2 = z2z2 = z2*2 /
1172	felem_square(tmp, z2);
1173	felem_reduce(ftmp2, tmp);
1174
1175	/* u1 = ftmp3 = x1z2z2 /
1176	felem_mul(tmp, x1, ftmp2);
1177	felem_reduce(ftmp3, tmp);
1178
1179	/* ftmp5 = z1 + z2 */
1180	felem_assign(ftmp5, z1);
1181	felem_sum64(ftmp5, z2);
1182	/* ftmp5[i] < 2^61 */
1183
1184	/* ftmp5 = (z1 + z2)*2 - z1z1 - z2z2 = 2z1z2 */
1185	felem_square(tmp, ftmp5);
1186	/* tmp[i] < 172^122 /
1187	felem_diff_128_64(tmp, ftmp);
1188	/* tmp[i] < 172^122 + 2^63 /
1189	felem_diff_128_64(tmp, ftmp2);
1190	/* tmp[i] < 172^122 + 2^64 /
1191	felem_reduce(ftmp5, tmp);
1192
1193	/* ftmp2 = z2 * z2z2 */
1194	felem_mul(tmp, ftmp2, z2);
1195	felem_reduce(ftmp2, tmp);
1196
1197	/* s1 = ftmp6 = y1 * z2*3 /
1198	felem_mul(tmp, y1, ftmp2);
1199	felem_reduce(ftmp6, tmp);
1200	} else {
1201	/*
1202	* We'll assume z2 = 1 (special case z2 = 0 is handled later)
1203	*/
1204
1205	/* u1 = ftmp3 = x1z2z2 /
1206	felem_assign(ftmp3, x1);
1207
1208	/* ftmp5 = 2z1z2 /
1209	felem_scalar(ftmp5, z1, 2);
1210
1211	/* s1 = ftmp6 = y1 * z2*3 /
1212	felem_assign(ftmp6, y1);
1213	}
1214
1215	/* u2 = x2z1z1 /
1216	felem_mul(tmp, x2, ftmp);
1217	/* tmp[i] < 172^120 /
1218
1219	/* h = ftmp4 = u2 - u1 */
1220	felem_diff_128_64(tmp, ftmp3);
1221	/* tmp[i] < 172^120 + 2^63 /
1222	felem_reduce(ftmp4, tmp);
1223
1224	x_equal = felem_is_zero(ftmp4);
1225
1226	/* z_out = ftmp5 * h */
1227	felem_mul(tmp, ftmp5, ftmp4);
1228	felem_reduce(z_out, tmp);
1229
1230	/* ftmp = z1 * z1z1 */
1231	felem_mul(tmp, ftmp, z1);
1232	felem_reduce(ftmp, tmp);
1233
1234	/* s2 = tmp = y2 * z1*3 /
1235	felem_mul(tmp, y2, ftmp);
1236	/* tmp[i] < 172^120 /
1237
1238	/* r = ftmp5 = (s2 - s1)2 /
1239	felem_diff_128_64(tmp, ftmp6);
1240	/* tmp[i] < 172^120 + 2^63 /
1241	felem_reduce(ftmp5, tmp);
1242	y_equal = felem_is_zero(ftmp5);
1243	felem_scalar64(ftmp5, 2);
1244	/* ftmp5[i] < 2^61 */
1245
1246	/*
1247	* The formulae are incorrect if the points are equal, in affine coordinates
1248	* (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1249	* happens.
1250	*
1251	* We use bitwise operations to avoid potential side-channels introduced by
1252	* the short-circuiting behaviour of boolean operators.
1253	*
1254	* The special case of either point being the point at infinity (z1 and/or
1255	* z2 are zero), is handled separately later on in this function, so we
1256	* avoid jumping to point_double here in those special cases.
1257	*
1258	* Notice the comment below on the implications of this branching for timing
1259	* leaks and why it is considered practically irrelevant.
1260	*/
1261	points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1262
1263	if (points_equal) {
1264	/*
1265	* This is obviously not constant-time but it will almost-never happen
1266	* for ECDH / ECDSA. The case where it can happen is during scalar-mult
1267	* where the intermediate value gets very close to the group order.
1268	* Since \|ec_GFp_nistp_recode_scalar_bits\| produces signed digits for
1269	* the scalar, it's possible for the intermediate value to be a small
1270	* negative multiple of the base point, and for the final signed digit
1271	* to be the same value. We believe that this only occurs for the scalar
1272	* 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1273	* ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1274	* 71e913863f7, in that case the penultimate intermediate is -9G and
1275	* the final digit is also -9G. Since this only happens for a single
1276	* scalar, the timing leak is irrelevant. (Any attacker who wanted to
1277	* check whether a secret scalar was that exact value, can already do
1278	* so.)
1279	*/
1280	point_double(x3, y3, z3, x1, y1, z1);
1281	return;
1282	}
1283
1284	/* I = ftmp = (2h)*2 /
1285	felem_assign(ftmp, ftmp4);
1286	felem_scalar64(ftmp, 2);
1287	/* ftmp[i] < 2^61 */
1288	felem_square(tmp, ftmp);
1289	/* tmp[i] < 172^122 /
1290	felem_reduce(ftmp, tmp);
1291
1292	/* J = ftmp2 = h * I */
1293	felem_mul(tmp, ftmp4, ftmp);
1294	felem_reduce(ftmp2, tmp);
1295
1296	/* V = ftmp4 = U1 * I */
1297	felem_mul(tmp, ftmp3, ftmp);
1298	felem_reduce(ftmp4, tmp);
1299
1300	/* x_out = r*2 - J - 2V /
1301	felem_square(tmp, ftmp5);
1302	/* tmp[i] < 172^122 /
1303	felem_diff_128_64(tmp, ftmp2);
1304	/* tmp[i] < 172^122 + 2^63 /
1305	felem_assign(ftmp3, ftmp4);
1306	felem_scalar64(ftmp4, 2);
1307	/* ftmp4[i] < 2^61 */
1308	felem_diff_128_64(tmp, ftmp4);
1309	/* tmp[i] < 172^122 + 2^64 /
1310	felem_reduce(x_out, tmp);
1311
1312	/* y_out = r(V-x_out) - 2 * s1 * J */
1313	felem_diff64(ftmp3, x_out);
1314	/*
1315	* ftmp3[i] < 2^60 + 2^60 = 2^61
1316	*/
1317	felem_mul(tmp, ftmp5, ftmp3);
1318	/* tmp[i] < 172^122 /
1319	felem_mul(tmp2, ftmp6, ftmp2);
1320	/* tmp2[i] < 172^120 /
1321	felem_scalar128(tmp2, 2);
1322	/* tmp2[i] < 172^121 /
1323	felem_diff128(tmp, tmp2);
1324	/*-
1325	* tmp[i] < 2^127 - 2^69 + 17*2^122
1326	* = 2^126 - 2^122 - 2^6 - 2^2 - 1
1327	* < 2^127
1328	*/
1329	felem_reduce(y_out, tmp);
1330
1331	copy_conditional(x_out, x2, z1_is_zero);
1332	copy_conditional(x_out, x1, z2_is_zero);
1333	copy_conditional(y_out, y2, z1_is_zero);
1334	copy_conditional(y_out, y1, z2_is_zero);
1335	copy_conditional(z_out, z2, z1_is_zero);
1336	copy_conditional(z_out, z1, z2_is_zero);
1337	felem_assign(x3, x_out);
1338	felem_assign(y3, y_out);
1339	felem_assign(z3, z_out);
1340	}
1341
1342	/*-
1343	* Base point pre computation
1344	* --------------------------
1345	*
1346	* Two different sorts of precomputed tables are used in the following code.
1347	* Each contain various points on the curve, where each point is three field
1348	* elements (x, y, z).
1349	*
1350	* For the base point table, z is usually 1 (0 for the point at infinity).
1351	* This table has 16 elements:
1352	* index \| bits \| point
1353	* ------+---------+------------------------------
1354	* 0 \| 0 0 0 0 \| 0G
1355	* 1 \| 0 0 0 1 \| 1G
1356	* 2 \| 0 0 1 0 \| 2^130G
1357	* 3 \| 0 0 1 1 \| (2^130 + 1)G
1358	* 4 \| 0 1 0 0 \| 2^260G
1359	* 5 \| 0 1 0 1 \| (2^260 + 1)G
1360	* 6 \| 0 1 1 0 \| (2^260 + 2^130)G
1361	* 7 \| 0 1 1 1 \| (2^260 + 2^130 + 1)G
1362	* 8 \| 1 0 0 0 \| 2^390G
1363	* 9 \| 1 0 0 1 \| (2^390 + 1)G
1364	* 10 \| 1 0 1 0 \| (2^390 + 2^130)G
1365	* 11 \| 1 0 1 1 \| (2^390 + 2^130 + 1)G
1366	* 12 \| 1 1 0 0 \| (2^390 + 2^260)G
1367	* 13 \| 1 1 0 1 \| (2^390 + 2^260 + 1)G
1368	* 14 \| 1 1 1 0 \| (2^390 + 2^260 + 2^130)G
1369	* 15 \| 1 1 1 1 \| (2^390 + 2^260 + 2^130 + 1)G
1370	*
1371	* The reason for this is so that we can clock bits into four different
1372	* locations when doing simple scalar multiplies against the base point.
1373	*
1374	* Tables for other points have table[i] = iG for i in 0 .. 16. */
1375
1376	/* gmul is the table of precomputed base points */
1377	static const felem gmul[16][3] = {
1378	{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1379	{0, 0, 0, 0, 0, 0, 0, 0, 0},
1380	{0, 0, 0, 0, 0, 0, 0, 0, 0}},
1381	{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1382	0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1383	0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1384	{0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1385	0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1386	0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1387	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1388	{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1389	0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1390	0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1391	{0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1392	0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1393	0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1394	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1395	{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1396	0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1397	0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1398	{0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1399	0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1400	0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1401	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1402	{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1403	0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1404	0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1405	{0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1406	0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1407	0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1408	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1409	{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1410	0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1411	0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1412	{0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1413	0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1414	0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1415	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1416	{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1417	0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1418	0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1419	{0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1420	0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1421	0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1422	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1423	{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1424	0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1425	0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1426	{0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1427	0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1428	0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1429	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1430	{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1431	0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1432	0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1433	{0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1434	0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1435	0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1436	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1437	{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1438	0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1439	0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1440	{0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1441	0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1442	0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1443	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1444	{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1445	0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1446	0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1447	{0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1448	0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1449	0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1450	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1451	{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1452	0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1453	0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1454	{0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1455	0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1456	0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1457	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1458	{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1459	0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1460	0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1461	{0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1462	0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1463	0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1464	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1465	{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1466	0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1467	0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1468	{0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1469	0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1470	0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1471	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1472	{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1473	0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1474	0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1475	{0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1476	0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1477	0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1478	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
1479	{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1480	0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1481	0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1482	{0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1483	0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1484	0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1485	{1, 0, 0, 0, 0, 0, 0, 0, 0}}
1486	};
1487
1488	/*
1489	* select_point selects the \|idx\|th point from a precomputation table and
1490	* copies it to out.
1491	*/
1492	/* pre_comp below is of the size provided in \|size\| */
1493	static void select_point(const limb idx, unsigned int size,
1494	const felem pre_comp[][3], felem out[3])
1495	{
1496	unsigned i, j;
1497	limb *outlimbs = &out[0][0];
1498
1499	memset(out, 0, sizeof(out) 3);
1500
1501	for (i = 0; i < size; i++) {
1502	const limb *inlimbs = &pre_comp[i][0][0];
1503	limb mask = i ^ idx;
1504	mask \|= mask >> 4;
1505	mask \|= mask >> 2;
1506	mask \|= mask >> 1;
1507	mask &= 1;
1508	mask--;
1509	for (j = 0; j < NLIMBS * 3; j++)
1510	outlimbs[j] \|= inlimbs[j] & mask;
1511	}
1512	}
1513
1514	/* get_bit returns the \|i\|th bit in \|in\| */
1515	static char get_bit(const felem_bytearray in, int i)
1516	{
1517	if (i < 0)
1518	return 0;
1519	return (in[i >> 3] >> (i & 7)) & 1;
1520	}
1521
1522	/*
1523	* Interleaved point multiplication using precomputed point multiples: The
1524	* small point multiples 0P, 1P, ..., 16*P are in pre_comp[], the scalars
1525	* in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1526	* generator, using certain (large) precomputed multiples in g_pre_comp.
1527	* Output point (X, Y, Z) is stored in x_out, y_out, z_out
1528	*/
1529	static void batch_mul(felem x_out, felem y_out, felem z_out,
1530	const felem_bytearray scalars[],
1531	const unsigned num_points, const u8 *g_scalar,
1532	const int mixed, const felem pre_comp[][17][3],
1533	const felem g_pre_comp[16][3])
1534	{
1535	int i, skip;
1536	unsigned num, gen_mul = (g_scalar != NULL);
1537	felem nq[3], tmp[4];
1538	limb bits;
1539	u8 sign, digit;
1540
1541	/* set nq to the point at infinity */
1542	memset(nq, 0, sizeof(nq));
1543
1544	/*
1545	* Loop over all scalars msb-to-lsb, interleaving additions of multiples
1546	* of the generator (last quarter of rounds) and additions of other
1547	* points multiples (every 5th round).
1548	*/
1549	skip = 1; /* save two point operations in the first
1550	* round */
1551	for (i = (num_points ? 520 : 130); i >= 0; --i) {
1552	/* double */
1553	if (!skip)
1554	point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1555
1556	/* add multiples of the generator */
1557	if (gen_mul && (i <= 130)) {
1558	bits = get_bit(g_scalar, i + 390) << 3;
1559	if (i < 130) {
1560	bits \|= get_bit(g_scalar, i + 260) << 2;
1561	bits \|= get_bit(g_scalar, i + 130) << 1;
1562	bits \|= get_bit(g_scalar, i);
1563	}
1564	/* select the point to add, in constant time */
1565	select_point(bits, 16, g_pre_comp, tmp);
1566	if (!skip) {
1567	/* The 1 argument below is for "mixed" */
1568	point_add(nq[0], nq[1], nq[2],
1569	nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1570	} else {
1571	memcpy(nq, tmp, 3 * sizeof(felem));
1572	skip = 0;
1573	}
1574	}
1575
1576	/* do other additions every 5 doublings */
1577	if (num_points && (i % 5 == 0)) {
1578	/* loop over all scalars */
1579	for (num = 0; num < num_points; ++num) {
1580	bits = get_bit(scalars[num], i + 4) << 5;
1581	bits \|= get_bit(scalars[num], i + 3) << 4;
1582	bits \|= get_bit(scalars[num], i + 2) << 3;
1583	bits \|= get_bit(scalars[num], i + 1) << 2;
1584	bits \|= get_bit(scalars[num], i) << 1;
1585	bits \|= get_bit(scalars[num], i - 1);
1586	ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1587
1588	/*
1589	* select the point to add or subtract, in constant time
1590	*/
1591	select_point(digit, 17, pre_comp[num], tmp);
1592	felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1593	* point */
1594	copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1595
1596	if (!skip) {
1597	point_add(nq[0], nq[1], nq[2],
1598	nq[0], nq[1], nq[2],
1599	mixed, tmp[0], tmp[1], tmp[2]);
1600	} else {
1601	memcpy(nq, tmp, 3 * sizeof(felem));
1602	skip = 0;
1603	}
1604	}
1605	}
1606	}
1607	felem_assign(x_out, nq[0]);
1608	felem_assign(y_out, nq[1]);
1609	felem_assign(z_out, nq[2]);
1610	}
1611
1612	/* Precomputation for the group generator. */
1613	struct nistp521_pre_comp_st {
1614	felem g_pre_comp[16][3];
1615	CRYPTO_REF_COUNT references;
1616	CRYPTO_RWLOCK *lock;
1617	};
1618
1619	const EC_METHOD *EC_GFp_nistp521_method(void)
1620	{
1621	static const EC_METHOD ret = {
1622	EC_FLAGS_DEFAULT_OCT,
1623	NID_X9_62_prime_field,
1624	ec_GFp_nistp521_group_init,
1625	ec_GFp_simple_group_finish,
1626	ec_GFp_simple_group_clear_finish,
1627	ec_GFp_nist_group_copy,
1628	ec_GFp_nistp521_group_set_curve,
1629	ec_GFp_simple_group_get_curve,
1630	ec_GFp_simple_group_get_degree,
1631	ec_group_simple_order_bits,
1632	ec_GFp_simple_group_check_discriminant,
1633	ec_GFp_simple_point_init,
1634	ec_GFp_simple_point_finish,
1635	ec_GFp_simple_point_clear_finish,
1636	ec_GFp_simple_point_copy,
1637	ec_GFp_simple_point_set_to_infinity,
1638	ec_GFp_simple_set_Jprojective_coordinates_GFp,
1639	ec_GFp_simple_get_Jprojective_coordinates_GFp,
1640	ec_GFp_simple_point_set_affine_coordinates,
1641	ec_GFp_nistp521_point_get_affine_coordinates,
1642	0 /* point_set_compressed_coordinates */ ,
1643	0 /* point2oct */ ,
1644	0 /* oct2point */ ,
1645	ec_GFp_simple_add,
1646	ec_GFp_simple_dbl,
1647	ec_GFp_simple_invert,
1648	ec_GFp_simple_is_at_infinity,
1649	ec_GFp_simple_is_on_curve,
1650	ec_GFp_simple_cmp,
1651	ec_GFp_simple_make_affine,
1652	ec_GFp_simple_points_make_affine,
1653	ec_GFp_nistp521_points_mul,
1654	ec_GFp_nistp521_precompute_mult,
1655	ec_GFp_nistp521_have_precompute_mult,
1656	ec_GFp_nist_field_mul,
1657	ec_GFp_nist_field_sqr,
1658	0 /* field_div */ ,
1659	ec_GFp_simple_field_inv,
1660	0 /* field_encode */ ,
1661	0 /* field_decode */ ,
1662	0, /* field_set_to_one */
1663	ec_key_simple_priv2oct,
1664	ec_key_simple_oct2priv,
1665	0, /* set private */
1666	ec_key_simple_generate_key,
1667	ec_key_simple_check_key,
1668	ec_key_simple_generate_public_key,
1669	0, /* keycopy */
1670	0, /* keyfinish */
1671	ecdh_simple_compute_key,
1672	0, /* field_inverse_mod_ord */
1673	0, /* blind_coordinates */
1674	0, /* ladder_pre */
1675	0, /* ladder_step */
1676	0 /* ladder_post */
1677	};
1678
1679	return &ret;
1680	}
1681
1682	/******************************************************************************/
1683	/*
1684	* FUNCTIONS TO MANAGE PRECOMPUTATION
1685	*/
1686
1687	static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1688	{
1689	NISTP521_PRE_COMP ret = OPENSSL_zalloc(sizeof(ret));
1690
1691	if (ret == NULL) {
1692	ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1693	return ret;
1694	}
1695
1696	ret->references = 1;
1697
1698	ret->lock = CRYPTO_THREAD_lock_new();
1699	if (ret->lock == NULL) {
1700	ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1701	OPENSSL_free(ret);
1702	return NULL;
1703	}
1704	return ret;
1705	}
1706
1707	NISTP521_PRE_COMP EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP p)
1708	{
1709	int i;
1710	if (p != NULL)
1711	CRYPTO_UP_REF(&p->references, &i, p->lock);
1712	return p;
1713	}
1714
1715	void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1716	{
1717	int i;
1718
1719	if (p == NULL)
1720	return;
1721
1722	CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1723	REF_PRINT_COUNT("EC_nistp521", x);
1724	if (i > 0)
1725	return;
1726	REF_ASSERT_ISNT(i < 0);
1727
1728	CRYPTO_THREAD_lock_free(p->lock);
1729	OPENSSL_free(p);
1730	}
1731
1732	/******************************************************************************/
1733	/*
1734	* OPENSSL EC_METHOD FUNCTIONS
1735	*/
1736
1737	int ec_GFp_nistp521_group_init(EC_GROUP *group)
1738	{
1739	int ret;
1740	ret = ec_GFp_simple_group_init(group);
1741	group->a_is_minus3 = 1;
1742	return ret;
1743	}
1744
1745	int ec_GFp_nistp521_group_set_curve(EC_GROUP group, const BIGNUM p,
1746	const BIGNUM a, const BIGNUM b,
1747	BN_CTX *ctx)
1748	{
1749	int ret = 0;
1750	BN_CTX *new_ctx = NULL;
1751	BIGNUM curve_p, curve_a, *curve_b;
1752
1753	if (ctx == NULL)
1754	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1755	return 0;
1756	BN_CTX_start(ctx);
1757	curve_p = BN_CTX_get(ctx);
1758	curve_a = BN_CTX_get(ctx);
1759	curve_b = BN_CTX_get(ctx);
1760	if (curve_b == NULL)
1761	goto err;
1762	BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1763	BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1764	BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1765	if ((BN_cmp(curve_p, p)) \|\| (BN_cmp(curve_a, a)) \|\| (BN_cmp(curve_b, b))) {
1766	ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1767	EC_R_WRONG_CURVE_PARAMETERS);
1768	goto err;
1769	}
1770	group->field_mod_func = BN_nist_mod_521;
1771	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1772	err:
1773	BN_CTX_end(ctx);
1774	BN_CTX_free(new_ctx);
1775	return ret;
1776	}
1777
1778	/*
1779	* Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1780	* (X/Z^2, Y/Z^3)
1781	*/
1782	int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1783	const EC_POINT *point,
1784	BIGNUM x, BIGNUM y,
1785	BN_CTX *ctx)
1786	{
1787	felem z1, z2, x_in, y_in, x_out, y_out;
1788	largefelem tmp;
1789
1790	if (EC_POINT_is_at_infinity(group, point)) {
1791	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1792	EC_R_POINT_AT_INFINITY);
1793	return 0;
1794	}
1795	if ((!BN_to_felem(x_in, point->X)) \|\| (!BN_to_felem(y_in, point->Y)) \|\|
1796	(!BN_to_felem(z1, point->Z)))
1797	return 0;
1798	felem_inv(z2, z1);
1799	felem_square(tmp, z2);
1800	felem_reduce(z1, tmp);
1801	felem_mul(tmp, x_in, z1);
1802	felem_reduce(x_in, tmp);
1803	felem_contract(x_out, x_in);
1804	if (x != NULL) {
1805	if (!felem_to_BN(x, x_out)) {
1806	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1807	ERR_R_BN_LIB);
1808	return 0;
1809	}
1810	}
1811	felem_mul(tmp, z1, z2);
1812	felem_reduce(z1, tmp);
1813	felem_mul(tmp, y_in, z1);
1814	felem_reduce(y_in, tmp);
1815	felem_contract(y_out, y_in);
1816	if (y != NULL) {
1817	if (!felem_to_BN(y, y_out)) {
1818	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1819	ERR_R_BN_LIB);
1820	return 0;
1821	}
1822	}
1823	return 1;
1824	}
1825
1826	/* points below is of size \|num\|, and tmp_felems is of size \|num+1/ */
1827	static void make_points_affine(size_t num, felem points[][3],
1828	felem tmp_felems[])
1829	{
1830	/*
1831	* Runs in constant time, unless an input is the point at infinity (which
1832	* normally shouldn't happen).
1833	*/
1834	ec_GFp_nistp_points_make_affine_internal(num,
1835	points,
1836	sizeof(felem),
1837	tmp_felems,
1838	(void ()(void ))felem_one,
1839	felem_is_zero_int,
1840	(void ()(void , const void *))
1841	felem_assign,
1842	(void ()(void , const void *))
1843	felem_square_reduce, (void (*)
1844	(void *,
1845	const void
1846	*,
1847	const void
1848	*))
1849	felem_mul_reduce,
1850	(void ()(void , const void *))
1851	felem_inv,
1852	(void ()(void , const void *))
1853	felem_contract);
1854	}
1855
1856	/*
1857	* Computes scalargenerator + \sum scalars[i]points[i], ignoring NULL
1858	* values Result is stored in r (r can equal one of the inputs).
1859	*/
1860	int ec_GFp_nistp521_points_mul(const EC_GROUP group, EC_POINT r,
1861	const BIGNUM *scalar, size_t num,
1862	const EC_POINT *points[],
1863	const BIGNUM scalars[], BN_CTX ctx)
1864	{
1865	int ret = 0;
1866	int j;
1867	int mixed = 0;
1868	BIGNUM x, y, z, tmp_scalar;
1869	felem_bytearray g_secret;
1870	felem_bytearray *secrets = NULL;
1871	felem (*pre_comp)[17][3] = NULL;
1872	felem *tmp_felems = NULL;
1873	unsigned i;
1874	int num_bytes;
1875	int have_pre_comp = 0;
1876	size_t num_points = num;
1877	felem x_in, y_in, z_in, x_out, y_out, z_out;
1878	NISTP521_PRE_COMP *pre = NULL;
1879	felem(*g_pre_comp)[3] = NULL;
1880	EC_POINT *generator = NULL;
1881	const EC_POINT *p = NULL;
1882	const BIGNUM *p_scalar = NULL;
1883
1884	BN_CTX_start(ctx);
1885	x = BN_CTX_get(ctx);
1886	y = BN_CTX_get(ctx);
1887	z = BN_CTX_get(ctx);
1888	tmp_scalar = BN_CTX_get(ctx);
1889	if (tmp_scalar == NULL)
1890	goto err;
1891
1892	if (scalar != NULL) {
1893	pre = group->pre_comp.nistp521;
1894	if (pre)
1895	/* we have precomputation, try to use it */
1896	g_pre_comp = &pre->g_pre_comp[0];
1897	else
1898	/* try to use the standard precomputation */
1899	g_pre_comp = (felem(*)[3]) gmul;
1900	generator = EC_POINT_new(group);
1901	if (generator == NULL)
1902	goto err;
1903	/* get the generator from precomputation */
1904	if (!felem_to_BN(x, g_pre_comp[1][0]) \|\|
1905	!felem_to_BN(y, g_pre_comp[1][1]) \|\|
1906	!felem_to_BN(z, g_pre_comp[1][2])) {
1907	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1908	goto err;
1909	}
1910	if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1911	generator, x, y, z,
1912	ctx))
1913	goto err;
1914	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1915	/* precomputation matches generator */
1916	have_pre_comp = 1;
1917	else
1918	/*
1919	* we don't have valid precomputation: treat the generator as a
1920	* random point
1921	*/
1922	num_points++;
1923	}
1924
1925	if (num_points > 0) {
1926	if (num_points >= 2) {
1927	/*
1928	* unless we precompute multiples for just one point, converting
1929	* those into affine form is time well spent
1930	*/
1931	mixed = 1;
1932	}
1933	secrets = OPENSSL_zalloc(sizeof(secrets) num_points);
1934	pre_comp = OPENSSL_zalloc(sizeof(pre_comp) num_points);
1935	if (mixed)
1936	tmp_felems =
1937	OPENSSL_malloc(sizeof(tmp_felems) (num_points * 17 + 1));
1938	if ((secrets == NULL) \|\| (pre_comp == NULL)
1939	\|\| (mixed && (tmp_felems == NULL))) {
1940	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1941	goto err;
1942	}
1943
1944	/*
1945	* we treat NULL scalars as 0, and NULL points as points at infinity,
1946	* i.e., they contribute nothing to the linear combination
1947	*/
1948	for (i = 0; i < num_points; ++i) {
1949	if (i == num) {
1950	/*
1951	* we didn't have a valid precomputation, so we pick the
1952	* generator
1953	*/
1954	p = EC_GROUP_get0_generator(group);
1955	p_scalar = scalar;
1956	} else {
1957	/* the i^th point */
1958	p = points[i];
1959	p_scalar = scalars[i];
1960	}
1961	if ((p_scalar != NULL) && (p != NULL)) {
1962	/* reduce scalar to 0 <= scalar < 2^521 */
1963	if ((BN_num_bits(p_scalar) > 521)
1964	\|\| (BN_is_negative(p_scalar))) {
1965	/*
1966	* this is an unusual input, and we don't guarantee
1967	* constant-timeness
1968	*/
1969	if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1970	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1971	goto err;
1972	}
1973	num_bytes = BN_bn2lebinpad(tmp_scalar,
1974	secrets[i], sizeof(secrets[i]));
1975	} else {
1976	num_bytes = BN_bn2lebinpad(p_scalar,
1977	secrets[i], sizeof(secrets[i]));
1978	}
1979	if (num_bytes < 0) {
1980	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1981	goto err;
1982	}
1983	/* precompute multiples */
1984	if ((!BN_to_felem(x_out, p->X)) \|\|
1985	(!BN_to_felem(y_out, p->Y)) \|\|
1986	(!BN_to_felem(z_out, p->Z)))
1987	goto err;
1988	memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1989	memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1990	memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1991	for (j = 2; j <= 16; ++j) {
1992	if (j & 1) {
1993	point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1994	pre_comp[i][j][2], pre_comp[i][1][0],
1995	pre_comp[i][1][1], pre_comp[i][1][2], 0,
1996	pre_comp[i][j - 1][0],
1997	pre_comp[i][j - 1][1],
1998	pre_comp[i][j - 1][2]);
1999	} else {
2000	point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2001	pre_comp[i][j][2], pre_comp[i][j / 2][0],
2002	pre_comp[i][j / 2][1],
2003	pre_comp[i][j / 2][2]);
2004	}
2005	}
2006	}
2007	}
2008	if (mixed)
2009	make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2010	}
2011
2012	/* the scalar for the generator */
2013	if ((scalar != NULL) && (have_pre_comp)) {
2014	memset(g_secret, 0, sizeof(g_secret));
2015	/* reduce scalar to 0 <= scalar < 2^521 */
2016	if ((BN_num_bits(scalar) > 521) \|\| (BN_is_negative(scalar))) {
2017	/*
2018	* this is an unusual input, and we don't guarantee
2019	* constant-timeness
2020	*/
2021	if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2022	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2023	goto err;
2024	}
2025	num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2026	} else {
2027	num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2028	}
2029	/* do the multiplication with generator precomputation */
2030	batch_mul(x_out, y_out, z_out,
2031	(const felem_bytearray(*))secrets, num_points,
2032	g_secret,
2033	mixed, (const felem(*)[17][3])pre_comp,
2034	(const felem(*)[3])g_pre_comp);
2035	} else {
2036	/* do the multiplication without generator precomputation */
2037	batch_mul(x_out, y_out, z_out,
2038	(const felem_bytearray(*))secrets, num_points,
2039	NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2040	}
2041	/* reduce the output to its unique minimal representation */
2042	felem_contract(x_in, x_out);
2043	felem_contract(y_in, y_out);
2044	felem_contract(z_in, z_out);
2045	if ((!felem_to_BN(x, x_in)) \|\| (!felem_to_BN(y, y_in)) \|\|
2046	(!felem_to_BN(z, z_in))) {
2047	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2048	goto err;
2049	}
2050	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2051
2052	err:
2053	BN_CTX_end(ctx);
2054	EC_POINT_free(generator);
2055	OPENSSL_free(secrets);
2056	OPENSSL_free(pre_comp);
2057	OPENSSL_free(tmp_felems);
2058	return ret;
2059	}
2060
2061	int ec_GFp_nistp521_precompute_mult(EC_GROUP group, BN_CTX ctx)
2062	{
2063	int ret = 0;
2064	NISTP521_PRE_COMP *pre = NULL;
2065	int i, j;
2066	BN_CTX *new_ctx = NULL;
2067	BIGNUM x, y;
2068	EC_POINT *generator = NULL;
2069	felem tmp_felems[16];
2070
2071	/* throw away old precomputation */
2072	EC_pre_comp_free(group);
2073	if (ctx == NULL)
2074	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2075	return 0;
2076	BN_CTX_start(ctx);
2077	x = BN_CTX_get(ctx);
2078	y = BN_CTX_get(ctx);
2079	if (y == NULL)
2080	goto err;
2081	/* get the generator */
2082	if (group->generator == NULL)
2083	goto err;
2084	generator = EC_POINT_new(group);
2085	if (generator == NULL)
2086	goto err;
2087	BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2088	BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2089	if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2090	goto err;
2091	if ((pre = nistp521_pre_comp_new()) == NULL)
2092	goto err;
2093	/*
2094	* if the generator is the standard one, use built-in precomputation
2095	*/
2096	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2097	memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2098	goto done;
2099	}
2100	if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) \|\|
2101	(!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) \|\|
2102	(!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2103	goto err;
2104	/* compute 2^130G, 2^260G, 2^390G /
2105	for (i = 1; i <= 4; i <<= 1) {
2106	point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2107	pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2108	pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2109	for (j = 0; j < 129; ++j) {
2110	point_double(pre->g_pre_comp[2 * i][0],
2111	pre->g_pre_comp[2 * i][1],
2112	pre->g_pre_comp[2 * i][2],
2113	pre->g_pre_comp[2 * i][0],
2114	pre->g_pre_comp[2 * i][1],
2115	pre->g_pre_comp[2 * i][2]);
2116	}
2117	}
2118	/* g_pre_comp[0] is the point at infinity */
2119	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2120	/* the remaining multiples */
2121	/* 2^130G + 2^260G */
2122	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2123	pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2124	pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2125	0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2126	pre->g_pre_comp[2][2]);
2127	/* 2^130G + 2^390G */
2128	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2129	pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2130	pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2131	0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2132	pre->g_pre_comp[2][2]);
2133	/* 2^260G + 2^390G */
2134	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2135	pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2136	pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2137	0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2138	pre->g_pre_comp[4][2]);
2139	/* 2^130G + 2^260G + 2^390G /
2140	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2141	pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2142	pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2143	0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2144	pre->g_pre_comp[2][2]);
2145	for (i = 1; i < 8; ++i) {
2146	/* odd multiples: add G */
2147	point_add(pre->g_pre_comp[2 * i + 1][0],
2148	pre->g_pre_comp[2 * i + 1][1],
2149	pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2150	pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2151	pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2152	pre->g_pre_comp[1][2]);
2153	}
2154	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2155
2156	done:
2157	SETPRECOMP(group, nistp521, pre);
2158	ret = 1;
2159	pre = NULL;
2160	err:
2161	BN_CTX_end(ctx);
2162	EC_POINT_free(generator);
2163	BN_CTX_free(new_ctx);
2164	EC_nistp521_pre_comp_free(pre);
2165	return ret;
2166	}
2167
2168	int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2169	{
2170	return HAVEPRECOMP(group, nistp521);
2171	}
2172
2173	#endif

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/openssl-1.1.1f/crypto/ec/ecp_nistp521.c@ 83531

Download in other formats: