/*
 * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */
9 |
|
---|
/*
 * This module is meant to be used as a template for base 2^44 assembly
 * implementation[s]. As a side note, the compiler-generated code is not
 * slower than compiler-generated base 2^64 code on [high-end] x86_64,
 * even though the number of multiplications is 50% higher. Go figure...
 */
16 | #include <stdlib.h>
|
---|
17 |
|
---|
/*
 * Integer aliases used by this module.
 *
 * u64 is spelled "unsigned long long" rather than "unsigned long": plain
 * "unsigned long" is only 32 bits wide on LLP64 targets (for example 64-bit
 * Windows toolchains that nevertheless provide __int128), where the 64-bit
 * masks and the shifts by 32 or more used below would be truncated or
 * undefined. "unsigned long long" is at least 64 bits on every conforming
 * implementation.
 *
 * u128 relies on the compiler's __int128 extension.
 */
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
typedef unsigned __int128 u128;

/*
 * Poly1305 state, kept in base 2^44 (three limbs per 130-bit quantity).
 */
typedef struct {
    u64 h[3];                   /* accumulator limbs (44, 44 and 42 bits,
                                 * h[0] may temporarily run slightly over) */
    u64 s[2];                   /* s[0] = 20 * r[1], s[1] = 20 * r[2] */
    u64 r[3];                   /* masked key r split into 44/44/40 bits */
} poly1305_internal;

/* Number of message bytes consumed per iteration of the block function. */
#define POLY1305_BLOCK_SIZE 16
30 |
|
---|
31 | /* pick 64-bit unsigned integer in little endian order */
|
---|
32 | static u64 U8TOU64(const unsigned char *p)
|
---|
33 | {
|
---|
34 | return (((u64)(p[0] & 0xff)) |
|
---|
35 | ((u64)(p[1] & 0xff) << 8) |
|
---|
36 | ((u64)(p[2] & 0xff) << 16) |
|
---|
37 | ((u64)(p[3] & 0xff) << 24) |
|
---|
38 | ((u64)(p[4] & 0xff) << 32) |
|
---|
39 | ((u64)(p[5] & 0xff) << 40) |
|
---|
40 | ((u64)(p[6] & 0xff) << 48) |
|
---|
41 | ((u64)(p[7] & 0xff) << 56));
|
---|
42 | }
|
---|
43 |
|
---|
44 | /* store a 64-bit unsigned integer in little endian */
|
---|
45 | static void U64TO8(unsigned char *p, u64 v)
|
---|
46 | {
|
---|
47 | p[0] = (unsigned char)((v) & 0xff);
|
---|
48 | p[1] = (unsigned char)((v >> 8) & 0xff);
|
---|
49 | p[2] = (unsigned char)((v >> 16) & 0xff);
|
---|
50 | p[3] = (unsigned char)((v >> 24) & 0xff);
|
---|
51 | p[4] = (unsigned char)((v >> 32) & 0xff);
|
---|
52 | p[5] = (unsigned char)((v >> 40) & 0xff);
|
---|
53 | p[6] = (unsigned char)((v >> 48) & 0xff);
|
---|
54 | p[7] = (unsigned char)((v >> 56) & 0xff);
|
---|
55 | }
|
---|
56 |
|
---|
57 | int poly1305_init(void *ctx, const unsigned char key[16])
|
---|
58 | {
|
---|
59 | poly1305_internal *st = (poly1305_internal *)ctx;
|
---|
60 | u64 r0, r1;
|
---|
61 |
|
---|
62 | /* h = 0 */
|
---|
63 | st->h[0] = 0;
|
---|
64 | st->h[1] = 0;
|
---|
65 | st->h[2] = 0;
|
---|
66 |
|
---|
67 | r0 = U8TOU64(&key[0]) & 0x0ffffffc0fffffff;
|
---|
68 | r1 = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc;
|
---|
69 |
|
---|
70 | /* break r1:r0 to three 44-bit digits, masks are 1<<44-1 */
|
---|
71 | st->r[0] = r0 & 0x0fffffffffff;
|
---|
72 | st->r[1] = ((r0 >> 44) | (r1 << 20)) & 0x0fffffffffff;
|
---|
73 | st->r[2] = (r1 >> 24);
|
---|
74 |
|
---|
75 | st->s[0] = (st->r[1] + (st->r[1] << 2)) << 2;
|
---|
76 | st->s[1] = (st->r[2] + (st->r[2] << 2)) << 2;
|
---|
77 |
|
---|
78 | return 0;
|
---|
79 | }
|
---|
80 |
|
---|
81 | void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
|
---|
82 | u32 padbit)
|
---|
83 | {
|
---|
84 | poly1305_internal *st = (poly1305_internal *)ctx;
|
---|
85 | u64 r0, r1, r2;
|
---|
86 | u64 s1, s2;
|
---|
87 | u64 h0, h1, h2, c;
|
---|
88 | u128 d0, d1, d2;
|
---|
89 | u64 pad = (u64)padbit << 40;
|
---|
90 |
|
---|
91 | r0 = st->r[0];
|
---|
92 | r1 = st->r[1];
|
---|
93 | r2 = st->r[2];
|
---|
94 |
|
---|
95 | s1 = st->s[0];
|
---|
96 | s2 = st->s[1];
|
---|
97 |
|
---|
98 | h0 = st->h[0];
|
---|
99 | h1 = st->h[1];
|
---|
100 | h2 = st->h[2];
|
---|
101 |
|
---|
102 | while (len >= POLY1305_BLOCK_SIZE) {
|
---|
103 | u64 m0, m1;
|
---|
104 |
|
---|
105 | m0 = U8TOU64(inp + 0);
|
---|
106 | m1 = U8TOU64(inp + 8);
|
---|
107 |
|
---|
108 | /* h += m[i], m[i] is broken to 44-bit digits */
|
---|
109 | h0 += m0 & 0x0fffffffffff;
|
---|
110 | h1 += ((m0 >> 44) | (m1 << 20)) & 0x0fffffffffff;
|
---|
111 | h2 += (m1 >> 24) + pad;
|
---|
112 |
|
---|
113 | /* h *= r "%" p, where "%" stands for "partial remainder" */
|
---|
114 | d0 = ((u128)h0 * r0) + ((u128)h1 * s2) + ((u128)h2 * s1);
|
---|
115 | d1 = ((u128)h0 * r1) + ((u128)h1 * r0) + ((u128)h2 * s2);
|
---|
116 | d2 = ((u128)h0 * r2) + ((u128)h1 * r1) + ((u128)h2 * r0);
|
---|
117 |
|
---|
118 | /* "lazy" reduction step */
|
---|
119 | h0 = (u64)d0 & 0x0fffffffffff;
|
---|
120 | h1 = (u64)(d1 += (u64)(d0 >> 44)) & 0x0fffffffffff;
|
---|
121 | h2 = (u64)(d2 += (u64)(d1 >> 44)) & 0x03ffffffffff; /* last 42 bits */
|
---|
122 |
|
---|
123 | c = (d2 >> 42);
|
---|
124 | h0 += c + (c << 2);
|
---|
125 |
|
---|
126 | inp += POLY1305_BLOCK_SIZE;
|
---|
127 | len -= POLY1305_BLOCK_SIZE;
|
---|
128 | }
|
---|
129 |
|
---|
130 | st->h[0] = h0;
|
---|
131 | st->h[1] = h1;
|
---|
132 | st->h[2] = h2;
|
---|
133 | }
|
---|
134 |
|
---|
135 | void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4])
|
---|
136 | {
|
---|
137 | poly1305_internal *st = (poly1305_internal *) ctx;
|
---|
138 | u64 h0, h1, h2;
|
---|
139 | u64 g0, g1, g2;
|
---|
140 | u128 t;
|
---|
141 | u64 mask;
|
---|
142 |
|
---|
143 | h0 = st->h[0];
|
---|
144 | h1 = st->h[1];
|
---|
145 | h2 = st->h[2];
|
---|
146 |
|
---|
147 | /* after "lazy" reduction, convert 44+bit digits to 64-bit ones */
|
---|
148 | h0 = (u64)(t = (u128)h0 + (h1 << 44)); h1 >>= 20;
|
---|
149 | h1 = (u64)(t = (u128)h1 + (h2 << 24) + (t >> 64)); h2 >>= 40;
|
---|
150 | h2 += (u64)(t >> 64);
|
---|
151 |
|
---|
152 | /* compare to modulus by computing h + -p */
|
---|
153 | g0 = (u64)(t = (u128)h0 + 5);
|
---|
154 | g1 = (u64)(t = (u128)h1 + (t >> 64));
|
---|
155 | g2 = h2 + (u64)(t >> 64);
|
---|
156 |
|
---|
157 | /* if there was carry into 131st bit, h1:h0 = g1:g0 */
|
---|
158 | mask = 0 - (g2 >> 2);
|
---|
159 | g0 &= mask;
|
---|
160 | g1 &= mask;
|
---|
161 | mask = ~mask;
|
---|
162 | h0 = (h0 & mask) | g0;
|
---|
163 | h1 = (h1 & mask) | g1;
|
---|
164 |
|
---|
165 | /* mac = (h + nonce) % (2^128) */
|
---|
166 | h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32));
|
---|
167 | h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64));
|
---|
168 |
|
---|
169 | U64TO8(mac + 0, h0);
|
---|
170 | U64TO8(mac + 8, h1);
|
---|
171 | }
|
---|