memcmplen.h@ 108911

Last change on this file since 108911 was 108911, checked in by vboxsync, 4 weeks ago
libs/liblzma: Applied and adjusted our liblzma changes to 5.8.1 and export to OSE. jiraref:VBP-1635
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision` Property sync-process set to `export`
File size: 5.5 KB

Line
1	// SPDX-License-Identifier: 0BSD
2
3	///////////////////////////////////////////////////////////////////////////////
4	//
5	/// \file memcmplen.h
6	/// \brief Optimized comparison of two buffers
7	//
8	// Author: Lasse Collin
9	//
10	///////////////////////////////////////////////////////////////////////////////
11
12	#ifndef LZMA_MEMCMPLEN_H
13	#define LZMA_MEMCMPLEN_H
14
15	#include "common.h"
16
17	#ifdef HAVE_IMMINTRIN_H
18	# include <immintrin.h>
19	#endif
20
21	// Only include <intrin.h> if it is needed. The header is only needed
22	// on Windows when using an MSVC compatible compiler. The Intel compiler
23	// can use the intrinsics without the header file.
24	#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
25	&& defined(_MSC_VER) \
26	&& (defined(_M_X64) \
27	\|\| defined(_M_ARM64) \|\| defined(_M_ARM64EC)) \
28	&& !defined(__INTEL_COMPILER)
29	# include <intrin.h>
30	#endif
31
32
33	/// Find out how many equal bytes the two buffers have.
34	///
35	/// \param buf1 First buffer
36	/// \param buf2 Second buffer
37	/// \param len How many bytes have already been compared and will
38	/// be assumed to match
39	/// \param limit How many bytes to compare at most, including the
40	/// already-compared bytes. This must be significantly
41	/// smaller than UINT32_MAX to avoid integer overflows.
42	/// Up to LZMA_MEMCMPLEN_EXTRA bytes may be read past
43	/// the specified limit from both buf1 and buf2.
44	///
45	/// \return Number of equal bytes in the buffers is returned.
46	/// This is always at least len and at most limit.
47	///
48	/// \note LZMA_MEMCMPLEN_EXTRA defines how many extra bytes may be read.
49	/// It's rounded up to 2^n. This extra amount needs to be
50	/// allocated in the buffers being used. It needs to be
51	/// initialized too to keep Valgrind quiet.
52	static lzma_always_inline uint32_t
53	lzma_memcmplen(const uint8_t buf1, const uint8_t buf2,
54	uint32_t len, uint32_t limit)
55	{
56	assert(len <= limit);
57	assert(limit <= UINT32_MAX / 2);
58
59	#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
60	&& (((TUKLIB_GNUC_REQ(3, 4) \|\| defined(__clang__)) \
61	&& SIZE_MAX == UINT64_MAX) \
62	\|\| (defined(__INTEL_COMPILER) && defined(__x86_64__)) \
63	\|\| (defined(__INTEL_COMPILER) && defined(_M_X64)) \
64	\|\| (defined(_MSC_VER) && (defined(_M_X64) \
65	\|\| defined(_M_ARM64) \|\| defined(_M_ARM64EC))))
66	// This is only for x86-64 and ARM64 for now. This might be fine on
67	// other 64-bit processors too.
68	//
69	// Reasons to use subtraction instead of xor:
70	//
71	// - On some x86-64 processors (Intel Sandy Bridge to Tiger Lake),
72	// sub+jz and sub+jnz can be fused but xor+jz or xor+jnz cannot.
73	// Thus using subtraction has potential to be a tiny amount faster
74	// since the code checks if the quotient is non-zero.
75	//
76	// - Some processors (Intel Pentium 4) used to have more ALU
77	// resources for add/sub instructions than and/or/xor.
78	//
79	// The processor info is based on Agner Fog's microarchitecture.pdf
80	// version 2023-05-26. https://www.agner.org/optimize/
81	#define LZMA_MEMCMPLEN_EXTRA 8
82	while (len < limit) {
83	# ifdef WORDS_BIGENDIAN
84	const uint64_t x = read64ne(buf1 + len) ^ read64ne(buf2 + len);
85	# else
86	const uint64_t x = read64ne(buf1 + len) - read64ne(buf2 + len);
87	# endif
88	if (x != 0) {
89	// MSVC or Intel C compiler on Windows
90	# if defined(_MSC_VER) \|\| defined(__INTEL_COMPILER)
91	unsigned long tmp;
92	_BitScanForward64(&tmp, x);
93	len += (uint32_t)tmp >> 3;
94	// GCC, Clang, or Intel C compiler
95	# elif defined(WORDS_BIGENDIAN)
96	len += (uint32_t)__builtin_clzll(x) >> 3;
97	# else
98	len += (uint32_t)__builtin_ctzll(x) >> 3;
99	# endif
100	return my_min(len, limit);
101	}
102
103	len += 8;
104	}
105
106	return limit;
107
108	#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
109	&& defined(HAVE__MM_MOVEMASK_EPI8) \
110	&& (defined(__SSE2__) \
111	\|\| (defined(_MSC_VER) && defined(_M_IX86_FP) \
112	&& _M_IX86_FP >= 2))
113	// NOTE: This will use 128-bit unaligned access which
114	// TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit,
115	// but it's convenient here since this is x86-only.
116	//
117	// SSE2 version for 32-bit and 64-bit x86. On x86-64 the above
118	// version is sometimes significantly faster and sometimes
119	// slightly slower than this SSE2 version, so this SSE2
120	// version isn't used on x86-64.
121	# define LZMA_MEMCMPLEN_EXTRA 16
122	while (len < limit) {
123	const uint32_t x = 0xFFFF ^ (uint32_t)_mm_movemask_epi8(
124	_mm_cmpeq_epi8(
125	_mm_loadu_si128((const __m128i *)(buf1 + len)),
126	_mm_loadu_si128((const __m128i *)(buf2 + len))));
127
128	if (x != 0) {
129	len += ctz32(x);
130	return my_min(len, limit);
131	}
132
133	len += 16;
134	}
135
136	return limit;
137
138	#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && !defined(WORDS_BIGENDIAN)
139	// Generic 32-bit little endian method
140	# define LZMA_MEMCMPLEN_EXTRA 4
141	while (len < limit) {
142	uint32_t x = read32ne(buf1 + len) - read32ne(buf2 + len);
143	if (x != 0) {
144	if ((x & 0xFFFF) == 0) {
145	len += 2;
146	x >>= 16;
147	}
148
149	if ((x & 0xFF) == 0)
150	++len;
151
152	return my_min(len, limit);
153	}
154
155	len += 4;
156	}
157
158	return limit;
159
160	#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && defined(WORDS_BIGENDIAN)
161	// Generic 32-bit big endian method
162	# define LZMA_MEMCMPLEN_EXTRA 4
163	while (len < limit) {
164	uint32_t x = read32ne(buf1 + len) ^ read32ne(buf2 + len);
165	if (x != 0) {
166	if ((x & 0xFFFF0000) == 0) {
167	len += 2;
168	x <<= 16;
169	}
170
171	if ((x & 0xFF000000) == 0)
172	++len;
173
174	return my_min(len, limit);
175	}
176
177	len += 4;
178	}
179
180	return limit;
181
182	#else
183	// Simple portable version that doesn't use unaligned access.
184	# define LZMA_MEMCMPLEN_EXTRA 0
185	while (len < limit && buf1[len] == buf2[len])
186	++len;
187
188	return len;
189	#endif
190	}
191
192	#endif

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/liblzma-5.8.1/common/memcmplen.h@ 108911

Download in other formats: