1 | // SPDX-License-Identifier: 0BSD
|
---|
2 |
|
---|
3 | ///////////////////////////////////////////////////////////////////////////////
|
---|
4 | //
|
---|
5 | /// \file crc_clmul_consts_gen.c
|
---|
6 | /// \brief Generate constants for CLMUL CRC code
|
---|
7 | ///
|
---|
8 | /// Compiling: gcc -std=c99 -o crc_clmul_consts_gen crc_clmul_consts_gen.c
|
---|
9 | ///
|
---|
10 | /// This is for CRCs that use reversed bit order (bit reflection).
|
---|
11 | /// The same CLMUL CRC code can be used with CRC64 and smaller ones like
|
---|
12 | /// CRC32 apart from one special case: CRC64 needs an extra step in the
|
---|
13 | /// Barrett reduction to handle the 65th bit; the smaller ones don't.
|
---|
14 | /// Otherwise it's enough to just change the polynomial and the derived
|
---|
15 | /// constants and use the same code.
|
---|
16 | ///
|
---|
17 | /// See the Intel white paper "Fast CRC Computation for Generic Polynomials
|
---|
18 | /// Using PCLMULQDQ Instruction" from 2009.
|
---|
19 | //
|
---|
20 | // Author: Lasse Collin
|
---|
21 | //
|
---|
22 | ///////////////////////////////////////////////////////////////////////////////
|
---|
23 |
|
---|
24 | #include <inttypes.h>
|
---|
25 | #include <stdio.h>
|
---|
26 |
|
---|
27 |
|
---|
28 | /// CRC32 (Ethernet) polynomial in reversed representation
|
---|
29 | static const uint64_t p32 = 0xedb88320;
|
---|
30 |
|
---|
31 | // CRC64 (ECMA-182) polynomial in reversed representation
|
---|
32 | static const uint64_t p64 = 0xc96c5795d7870f42;
|
---|
33 |
|
---|
34 |
|
---|
35 | /// Calculates floor(x^128 / p) where p is a CRC64 polynomial in
|
---|
36 | /// reversed representation. The result is in reversed representation too.
|
---|
37 | static uint64_t
|
---|
38 | calc_cldiv(uint64_t p)
|
---|
39 | {
|
---|
40 | // Quotient
|
---|
41 | uint64_t q = 0;
|
---|
42 |
|
---|
43 | // Align the x^64 term with the x^128 (the implied high bits of the
|
---|
44 | // divisor and the dividend) and do the first step of polynomial long
|
---|
45 | // division, calculating the first remainder. The variable q remains
|
---|
46 | // zero because the highest bit of the quotient is an implied bit 1
|
---|
47 | // (we kind of set q = 1 << -1).
|
---|
48 | uint64_t r = p;
|
---|
49 |
|
---|
50 | // Then process the remaining 64 terms. Note that r has no implied
|
---|
51 | // high bit, only q and p do. (And remember that a high bit in the
|
---|
52 | // polynomial is stored at a low bit in the variable due to the
|
---|
53 | // reversed bit order.)
|
---|
54 | for (unsigned i = 0; i < 64; ++i) {
|
---|
55 | q |= (r & 1) << i;
|
---|
56 | r = (r >> 1) ^ (r & 1 ? p : 0);
|
---|
57 | }
|
---|
58 |
|
---|
59 | return q;
|
---|
60 | }
|
---|
61 |
|
---|
62 |
|
---|
63 | /// Calculate the remainder of carryless division:
|
---|
64 | ///
|
---|
65 | /// x^(bits + n - 1) % p, where n=64 (for CRC64)
|
---|
66 | ///
|
---|
67 | /// p must be in reversed representation which omits the bit of
|
---|
68 | /// the highest term of the polynomial. Instead, it is an implied bit
|
---|
69 | /// at kind of like "1 << -1" position, as if it had just been shifted out.
|
---|
70 | ///
|
---|
71 | /// The return value is in the reversed bit order. (There are no implied bits.)
|
---|
72 | static uint64_t
|
---|
73 | calc_clrem(uint64_t p, unsigned bits)
|
---|
74 | {
|
---|
75 | // Do the first step of polynomial long division.
|
---|
76 | uint64_t r = p;
|
---|
77 |
|
---|
78 | // Then process the remaining terms. Start with i = 1 instead of i = 0
|
---|
79 | // to account for the -1 in x^(bits + n - 1). This -1 is convenient
|
---|
80 | // with the reversed bit order. See the "Bit-Reflection" section in
|
---|
81 | // the Intel white paper.
|
---|
82 | for (unsigned i = 1; i < bits; ++i)
|
---|
83 | r = (r >> 1) ^ (r & 1 ? p : 0);
|
---|
84 |
|
---|
85 | return r;
|
---|
86 | }
|
---|
87 |
|
---|
88 |
|
---|
89 | extern int
|
---|
90 | main(void)
|
---|
91 | {
|
---|
92 | puts("// CRC64");
|
---|
93 |
|
---|
94 | // The order of the two 64-bit constants in a vector don't matter.
|
---|
95 | // It feels logical to put them in this order as it matches the
|
---|
96 | // order in which the input bytes are read.
|
---|
97 | printf("const __m128i fold512 = _mm_set_epi64x("
|
---|
98 | "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
|
---|
99 | calc_clrem(p64, 4 * 128 - 64),
|
---|
100 | calc_clrem(p64, 4 * 128));
|
---|
101 |
|
---|
102 | printf("const __m128i fold128 = _mm_set_epi64x("
|
---|
103 | "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
|
---|
104 | calc_clrem(p64, 128 - 64),
|
---|
105 | calc_clrem(p64, 128));
|
---|
106 |
|
---|
107 | // When we multiply by mu, we care about the high bits of the result
|
---|
108 | // (in reversed bit order!). It doesn't matter that the low bit gets
|
---|
109 | // shifted out because the affected output bits will be ignored.
|
---|
110 | // Below we add the implied high bit with "| 1" after the shifting
|
---|
111 | // so that the high bits of the multiplication will be correct.
|
---|
112 | //
|
---|
113 | // p64 is shifted left by one so that the final multiplication
|
---|
114 | // in Barrett reduction won't be misaligned by one bit. We could
|
---|
115 | // use "(p64 << 1) | 1" instead of "p64 << 1" too but it makes
|
---|
116 | // no difference as that bit won't affect the relevant output bits
|
---|
117 | // (we only care about the lowest 64 bits of the result, that is,
|
---|
118 | // lowest in the reversed bit order).
|
---|
119 | //
|
---|
120 | // NOTE: The 65rd bit of p64 gets shifted out. It needs to be
|
---|
121 | // compensated with 64-bit shift and xor in the CRC64 code.
|
---|
122 | printf("const __m128i mu_p = _mm_set_epi64x("
|
---|
123 | "0x%016" PRIx64 ", 0x%016" PRIx64 ");\n",
|
---|
124 | (calc_cldiv(p64) << 1) | 1,
|
---|
125 | p64 << 1);
|
---|
126 |
|
---|
127 | puts("");
|
---|
128 |
|
---|
129 | puts("// CRC32");
|
---|
130 |
|
---|
131 | printf("const __m128i fold512 = _mm_set_epi64x("
|
---|
132 | "0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
|
---|
133 | calc_clrem(p32, 4 * 128 - 64),
|
---|
134 | calc_clrem(p32, 4 * 128));
|
---|
135 |
|
---|
136 | printf("const __m128i fold128 = _mm_set_epi64x("
|
---|
137 | "0x%08" PRIx64 ", 0x%08" PRIx64 ");\n",
|
---|
138 | calc_clrem(p32, 128 - 64),
|
---|
139 | calc_clrem(p32, 128));
|
---|
140 |
|
---|
141 | // CRC32 calculation is done by modulus scaling it to a CRC64.
|
---|
142 | // Since the CRC is in reversed representation, only the mu
|
---|
143 | // constant changes with the modulus scaling. This method avoids
|
---|
144 | // one additional constant and one additional clmul in the final
|
---|
145 | // reduction steps, making the code both simpler and faster.
|
---|
146 | //
|
---|
147 | // p32 is shifted left by one so that the final multiplication
|
---|
148 | // in Barrett reduction won't be misaligned by one bit. We could
|
---|
149 | // use "(p32 << 1) | 1" instead of "p32 << 1" too but it makes
|
---|
150 | // no difference as that bit won't affect the relevant output bits.
|
---|
151 | //
|
---|
152 | // NOTE: The 33-bit value fits in 64 bits so, unlike with CRC64,
|
---|
153 | // there is no need to compensate for any missing bits in the code.
|
---|
154 | printf("const __m128i mu_p = _mm_set_epi64x("
|
---|
155 | "0x%016" PRIx64 ", 0x%" PRIx64 ");\n",
|
---|
156 | (calc_cldiv(p32) << 1) | 1,
|
---|
157 | p32 << 1);
|
---|
158 |
|
---|
159 | return 0;
|
---|
160 | }
|
---|