1 | /* SPDX-License-Identifier: 0BSD */
|
---|
2 |
|
---|
3 | /*
|
---|
4 | * Speed-optimized CRC64 using slicing-by-four algorithm
|
---|
5 | *
|
---|
6 | * This uses only i386 instructions, but it is optimized for i686 and later
|
---|
7 | * (including e.g. Pentium II/III/IV, Athlon XP, and Core 2).
|
---|
8 | *
|
---|
9 | * Authors: Igor Pavlov (original CRC32 assembly code)
|
---|
10 | * Lasse Collin (CRC64 adaptation of the modified CRC32 code)
|
---|
11 | *
|
---|
12 | * This code needs lzma_crc64_table, which can be created using the
|
---|
13 | * following C code:
|
---|
14 |
|
---|
15 | uint64_t lzma_crc64_table[4][256];
|
---|
16 |
|
---|
17 | void
|
---|
18 | init_table(void)
|
---|
19 | {
|
---|
20 | // ECMA-182
|
---|
21 | static const uint64_t poly64 = UINT64_C(0xC96C5795D7870F42);
|
---|
22 |
|
---|
23 | for (size_t s = 0; s < 4; ++s) {
|
---|
24 | for (size_t b = 0; b < 256; ++b) {
|
---|
25 | uint64_t r = s == 0 ? b : lzma_crc64_table[s - 1][b];
|
---|
26 |
|
---|
27 | for (size_t i = 0; i < 8; ++i) {
|
---|
28 | if (r & 1)
|
---|
29 | r = (r >> 1) ^ poly64;
|
---|
30 | else
|
---|
31 | r >>= 1;
|
---|
32 | }
|
---|
33 |
|
---|
34 | lzma_crc64_table[s][b] = r;
|
---|
35 | }
|
---|
36 | }
|
---|
37 | }
|
---|
38 |
|
---|
39 | * The prototype of the CRC64 function:
|
---|
40 | * extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
|
---|
41 | */
|
---|
42 |
|
---|
43 | /* When Intel CET is enabled, include <cet.h> in assembly code to mark
|
---|
44 | Intel CET support. */
|
---|
45 | #ifdef __CET__
|
---|
46 | # include <cet.h>
|
---|
47 | #else
|
---|
48 | # define _CET_ENDBR
|
---|
49 | #endif
|
---|
50 |
|
---|
51 | /*
|
---|
52 | * On some systems, the functions need to be prefixed. The prefix is
|
---|
53 | * usually an underscore.
|
---|
54 | */
|
---|
55 | #ifndef __USER_LABEL_PREFIX__
|
---|
56 | # define __USER_LABEL_PREFIX__
|
---|
57 | #endif
|
---|
58 | #define MAKE_SYM_CAT(prefix, sym) prefix ## sym
|
---|
59 | #define MAKE_SYM(prefix, sym) MAKE_SYM_CAT(prefix, sym)
|
---|
60 | #define LZMA_CRC64 MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64_generic)
|
---|
61 | #define LZMA_CRC64_TABLE MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64_table)
|
---|
62 |
|
---|
63 | /*
|
---|
64 | * Solaris assembler doesn't have .p2align, and Darwin uses .align
|
---|
65 | * differently than GNU/Linux and Solaris.
|
---|
66 | */
|
---|
67 | #if defined(__APPLE__) || defined(__MSDOS__)
|
---|
68 | # define ALIGN(pow2, abs) .align pow2
|
---|
69 | #else
|
---|
70 | # define ALIGN(pow2, abs) .align abs
|
---|
71 | #endif
|
---|
72 |
|
---|
73 | .text
|
---|
74 | .globl LZMA_CRC64
|
---|
75 | #ifdef __ELF__
|
---|
76 | .hidden LZMA_CRC64
|
---|
77 | #endif
|
---|
78 |
|
---|
79 | #if !defined(__APPLE__) && !defined(_WIN32) && !defined(__CYGWIN__) \
|
---|
80 | && !defined(__MSDOS__)
|
---|
81 | .type LZMA_CRC64, @function
|
---|
82 | #endif
|
---|
83 |
|
---|
84 | ALIGN(4, 16)
|
---|
85 | LZMA_CRC64:
|
---|
86 | _CET_ENDBR
|
---|
87 | /*
|
---|
88 | * Register usage:
|
---|
89 | * %eax crc LSB
|
---|
90 | * %edx crc MSB
|
---|
91 | * %esi buf
|
---|
92 | * %edi size or buf + size
|
---|
93 | * %ebx lzma_crc64_table
|
---|
94 | * %ebp Table index
|
---|
95 | * %ecx Temporary
|
---|
96 | */
|
---|
97 | pushl %ebx
|
---|
98 | pushl %esi
|
---|
99 | pushl %edi
|
---|
100 | pushl %ebp
|
---|
101 | movl 0x14(%esp), %esi /* buf */
|
---|
102 | movl 0x18(%esp), %edi /* size */
|
---|
103 | movl 0x1C(%esp), %eax /* crc LSB */
|
---|
104 | movl 0x20(%esp), %edx /* crc MSB */
|
---|
105 |
|
---|
106 | /*
|
---|
107 | * Store the address of lzma_crc64_table to %ebx. This is needed to
|
---|
108 | * get position-independent code (PIC).
|
---|
109 | *
|
---|
110 | * The PIC macro is defined by libtool, while __PIC__ is defined
|
---|
111 | * by GCC but only on some systems. Testing for both makes it simpler
|
---|
112 | * to test this code without libtool, and keeps the code working also
|
---|
113 | * when built with libtool but using something else than GCC.
|
---|
114 | *
|
---|
115 | * I understood that libtool may define PIC on Windows even though
|
---|
116 | * the code in Windows DLLs is not PIC in sense that it is in ELF
|
---|
117 | * binaries, so we need a separate check to always use the non-PIC
|
---|
118 | * code on Windows.
|
---|
119 | */
|
---|
120 | #if (!defined(PIC) && !defined(__PIC__)) \
|
---|
121 | || (defined(_WIN32) || defined(__CYGWIN__))
|
---|
122 | /* Not PIC */
|
---|
123 | movl $ LZMA_CRC64_TABLE, %ebx
|
---|
124 | #elif defined(__APPLE__)
|
---|
125 | /* Mach-O */
|
---|
126 | call .L_get_pc
|
---|
127 | .L_pic:
|
---|
128 | leal .L_lzma_crc64_table$non_lazy_ptr-.L_pic(%ebx), %ebx
|
---|
129 | movl (%ebx), %ebx
|
---|
130 | #else
|
---|
131 | /* ELF */
|
---|
132 | call .L_get_pc
|
---|
133 | addl $_GLOBAL_OFFSET_TABLE_, %ebx
|
---|
134 | movl LZMA_CRC64_TABLE@GOT(%ebx), %ebx
|
---|
135 | #endif
|
---|
136 |
|
---|
137 | /* Complement the initial value. */
|
---|
138 | notl %eax
|
---|
139 | notl %edx
|
---|
140 |
|
---|
141 | .L_align:
|
---|
142 | /*
|
---|
143 | * Check if there is enough input to use slicing-by-four.
|
---|
144 | * We need eight bytes, because the loop pre-reads four bytes.
|
---|
145 | */
|
---|
146 | cmpl $8, %edi
|
---|
147 | jb .L_rest
|
---|
148 |
|
---|
149 | /* Check if we have reached alignment of four bytes. */
|
---|
150 | testl $3, %esi
|
---|
151 | jz .L_slice
|
---|
152 |
|
---|
153 | /* Calculate CRC of the next input byte. */
|
---|
154 | movzbl (%esi), %ebp
|
---|
155 | incl %esi
|
---|
156 | movzbl %al, %ecx
|
---|
157 | xorl %ecx, %ebp
|
---|
158 | shrdl $8, %edx, %eax
|
---|
159 | xorl (%ebx, %ebp, 8), %eax
|
---|
160 | shrl $8, %edx
|
---|
161 | xorl 4(%ebx, %ebp, 8), %edx
|
---|
162 | decl %edi
|
---|
163 | jmp .L_align
|
---|
164 |
|
---|
165 | .L_slice:
|
---|
166 | /*
|
---|
167 | * If we get here, there's at least eight bytes of aligned input
|
---|
168 | * available. Make %edi multiple of four bytes. Store the possible
|
---|
169 | * remainder over the "size" variable in the argument stack.
|
---|
170 | */
|
---|
171 | movl %edi, 0x18(%esp)
|
---|
172 | andl $-4, %edi
|
---|
173 | subl %edi, 0x18(%esp)
|
---|
174 |
|
---|
175 | /*
|
---|
176 | * Let %edi be buf + size - 4 while running the main loop. This way
|
---|
177 | * we can compare for equality to determine when exit the loop.
|
---|
178 | */
|
---|
179 | addl %esi, %edi
|
---|
180 | subl $4, %edi
|
---|
181 |
|
---|
182 | /* Read in the first four aligned bytes. */
|
---|
183 | movl (%esi), %ecx
|
---|
184 |
|
---|
185 | .L_loop:
|
---|
186 | xorl %eax, %ecx
|
---|
187 | movzbl %cl, %ebp
|
---|
188 | movl 0x1800(%ebx, %ebp, 8), %eax
|
---|
189 | xorl %edx, %eax
|
---|
190 | movl 0x1804(%ebx, %ebp, 8), %edx
|
---|
191 | movzbl %ch, %ebp
|
---|
192 | xorl 0x1000(%ebx, %ebp, 8), %eax
|
---|
193 | xorl 0x1004(%ebx, %ebp, 8), %edx
|
---|
194 | shrl $16, %ecx
|
---|
195 | movzbl %cl, %ebp
|
---|
196 | xorl 0x0800(%ebx, %ebp, 8), %eax
|
---|
197 | xorl 0x0804(%ebx, %ebp, 8), %edx
|
---|
198 | movzbl %ch, %ebp
|
---|
199 | addl $4, %esi
|
---|
200 | xorl (%ebx, %ebp, 8), %eax
|
---|
201 | xorl 4(%ebx, %ebp, 8), %edx
|
---|
202 |
|
---|
203 | /* Check for end of aligned input. */
|
---|
204 | cmpl %edi, %esi
|
---|
205 |
|
---|
206 | /*
|
---|
207 | * Copy the next input byte to %ecx. It is slightly faster to
|
---|
208 | * read it here than at the top of the loop.
|
---|
209 | */
|
---|
210 | movl (%esi), %ecx
|
---|
211 | jb .L_loop
|
---|
212 |
|
---|
213 | /*
|
---|
214 | * Process the remaining four bytes, which we have already
|
---|
215 | * copied to %ecx.
|
---|
216 | */
|
---|
217 | xorl %eax, %ecx
|
---|
218 | movzbl %cl, %ebp
|
---|
219 | movl 0x1800(%ebx, %ebp, 8), %eax
|
---|
220 | xorl %edx, %eax
|
---|
221 | movl 0x1804(%ebx, %ebp, 8), %edx
|
---|
222 | movzbl %ch, %ebp
|
---|
223 | xorl 0x1000(%ebx, %ebp, 8), %eax
|
---|
224 | xorl 0x1004(%ebx, %ebp, 8), %edx
|
---|
225 | shrl $16, %ecx
|
---|
226 | movzbl %cl, %ebp
|
---|
227 | xorl 0x0800(%ebx, %ebp, 8), %eax
|
---|
228 | xorl 0x0804(%ebx, %ebp, 8), %edx
|
---|
229 | movzbl %ch, %ebp
|
---|
230 | addl $4, %esi
|
---|
231 | xorl (%ebx, %ebp, 8), %eax
|
---|
232 | xorl 4(%ebx, %ebp, 8), %edx
|
---|
233 |
|
---|
234 | /* Copy the number of remaining bytes to %edi. */
|
---|
235 | movl 0x18(%esp), %edi
|
---|
236 |
|
---|
237 | .L_rest:
|
---|
238 | /* Check for end of input. */
|
---|
239 | testl %edi, %edi
|
---|
240 | jz .L_return
|
---|
241 |
|
---|
242 | /* Calculate CRC of the next input byte. */
|
---|
243 | movzbl (%esi), %ebp
|
---|
244 | incl %esi
|
---|
245 | movzbl %al, %ecx
|
---|
246 | xorl %ecx, %ebp
|
---|
247 | shrdl $8, %edx, %eax
|
---|
248 | xorl (%ebx, %ebp, 8), %eax
|
---|
249 | shrl $8, %edx
|
---|
250 | xorl 4(%ebx, %ebp, 8), %edx
|
---|
251 | decl %edi
|
---|
252 | jmp .L_rest
|
---|
253 |
|
---|
254 | .L_return:
|
---|
255 | /* Complement the final value. */
|
---|
256 | notl %eax
|
---|
257 | notl %edx
|
---|
258 |
|
---|
259 | popl %ebp
|
---|
260 | popl %edi
|
---|
261 | popl %esi
|
---|
262 | popl %ebx
|
---|
263 | ret
|
---|
264 |
|
---|
265 | #if defined(PIC) || defined(__PIC__)
|
---|
266 | ALIGN(4, 16)
|
---|
267 | .L_get_pc:
|
---|
268 | movl (%esp), %ebx
|
---|
269 | ret
|
---|
270 | #endif
|
---|
271 |
|
---|
272 | #if defined(__APPLE__) && (defined(PIC) || defined(__PIC__))
|
---|
273 | /* Mach-O PIC */
|
---|
274 | .section __IMPORT,__pointers,non_lazy_symbol_pointers
|
---|
275 | .L_lzma_crc64_table$non_lazy_ptr:
|
---|
276 | .indirect_symbol LZMA_CRC64_TABLE
|
---|
277 | .long 0
|
---|
278 |
|
---|
279 | #elif !defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MSDOS__)
|
---|
280 | /* ELF */
|
---|
281 | .size LZMA_CRC64, .-LZMA_CRC64
|
---|
282 | #endif
|
---|
283 |
|
---|
284 | /*
|
---|
285 | * This is needed to support non-executable stack. It's ugly to
|
---|
286 | * use __FreeBSD__ and __linux__ here, but I don't know a way to detect when
|
---|
287 | * we are using GNU assembler.
|
---|
288 | */
|
---|
289 | #if defined(__ELF__) && (defined(__FreeBSD__) || defined(__linux__))
|
---|
290 | .section .note.GNU-stack,"",@progbits
|
---|
291 | #endif
|
---|