VirtualBox

source: vbox/trunk/src/libs/liblzma-5.4.1/common/memcmplen.h@ 98929

Last change on this file since 98929 was 98730, checked in by vboxsync, 2 years ago

libs/liblzma-5.4.1: Export to OSE, bugref:10254

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 4.5 KB
Line 
1///////////////////////////////////////////////////////////////////////////////
2//
3/// \file memcmplen.h
4/// \brief Optimized comparison of two buffers
5//
6// Author: Lasse Collin
7//
8// This file has been put into the public domain.
9// You can do whatever you want with this file.
10//
11///////////////////////////////////////////////////////////////////////////////
12
13#ifndef LZMA_MEMCMPLEN_H
14#define LZMA_MEMCMPLEN_H
15
16#include "common.h"
17
18#ifdef HAVE_IMMINTRIN_H
19# include <immintrin.h>
20#endif
21
22
23/// Find out how many equal bytes the two buffers have.
24///
25/// \param buf1 First buffer
26/// \param buf2 Second buffer
27/// \param len How many bytes have already been compared and will
28/// be assumed to match
29/// \param limit How many bytes to compare at most, including the
30/// already-compared bytes. This must be significantly
31/// smaller than UINT32_MAX to avoid integer overflows.
32/// Up to LZMA_MEMCMPLEN_EXTRA bytes may be read past
33/// the specified limit from both buf1 and buf2.
34///
35/// \return Number of equal bytes in the buffers is returned.
36/// This is always at least len and at most limit.
37///
38/// \note LZMA_MEMCMPLEN_EXTRA defines how many extra bytes may be read.
39/// It's rounded up to 2^n. This extra amount needs to be
40/// allocated in the buffers being used. It needs to be
41/// initialized too to keep Valgrind quiet.
42static inline uint32_t lzma_attribute((__always_inline__))
43lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2,
44 uint32_t len, uint32_t limit)
45{
46 assert(len <= limit);
47 assert(limit <= UINT32_MAX / 2);
48
49#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
50 && ((TUKLIB_GNUC_REQ(3, 4) && defined(__x86_64__)) \
51 || (defined(__INTEL_COMPILER) && defined(__x86_64__)) \
52 || (defined(__INTEL_COMPILER) && defined(_M_X64)) \
53 || (defined(_MSC_VER) && defined(_M_X64)))
54 // I keep this x86-64 only for now since that's where I know this
55 // to be a good method. This may be fine on other 64-bit CPUs too.
56 // On big endian one should use xor instead of subtraction and switch
57 // to __builtin_clzll().
58#define LZMA_MEMCMPLEN_EXTRA 8
59 while (len < limit) {
60 const uint64_t x = read64ne(buf1 + len) - read64ne(buf2 + len);
61 if (x != 0) {
62# if defined(_M_X64) // MSVC or Intel C compiler on Windows
63 unsigned long tmp;
64 _BitScanForward64(&tmp, x);
65 len += (uint32_t)tmp >> 3;
66# else // GCC, clang, or Intel C compiler
67 len += (uint32_t)__builtin_ctzll(x) >> 3;
68# endif
69 return my_min(len, limit);
70 }
71
72 len += 8;
73 }
74
75 return limit;
76
77#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
78 && defined(HAVE__MM_MOVEMASK_EPI8) \
79 && (defined(__SSE2__) \
80 || (defined(_MSC_VER) && defined(_M_IX86_FP) \
81 && _M_IX86_FP >= 2))
82 // NOTE: This will use 128-bit unaligned access which
83 // TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit,
84 // but it's convenient here since this is x86-only.
85 //
86 // SSE2 version for 32-bit and 64-bit x86. On x86-64 the above
87 // version is sometimes significantly faster and sometimes
88 // slightly slower than this SSE2 version, so this SSE2
89 // version isn't used on x86-64.
90# define LZMA_MEMCMPLEN_EXTRA 16
91 while (len < limit) {
92 const uint32_t x = 0xFFFF ^ _mm_movemask_epi8(_mm_cmpeq_epi8(
93 _mm_loadu_si128((const __m128i *)(buf1 + len)),
94 _mm_loadu_si128((const __m128i *)(buf2 + len))));
95
96 if (x != 0) {
97 len += ctz32(x);
98 return my_min(len, limit);
99 }
100
101 len += 16;
102 }
103
104 return limit;
105
106#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && !defined(WORDS_BIGENDIAN)
107 // Generic 32-bit little endian method
108# define LZMA_MEMCMPLEN_EXTRA 4
109 while (len < limit) {
110 uint32_t x = read32ne(buf1 + len) - read32ne(buf2 + len);
111 if (x != 0) {
112 if ((x & 0xFFFF) == 0) {
113 len += 2;
114 x >>= 16;
115 }
116
117 if ((x & 0xFF) == 0)
118 ++len;
119
120 return my_min(len, limit);
121 }
122
123 len += 4;
124 }
125
126 return limit;
127
128#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && defined(WORDS_BIGENDIAN)
129 // Generic 32-bit big endian method
130# define LZMA_MEMCMPLEN_EXTRA 4
131 while (len < limit) {
132 uint32_t x = read32ne(buf1 + len) ^ read32ne(buf2 + len);
133 if (x != 0) {
134 if ((x & 0xFFFF0000) == 0) {
135 len += 2;
136 x <<= 16;
137 }
138
139 if ((x & 0xFF000000) == 0)
140 ++len;
141
142 return my_min(len, limit);
143 }
144
145 len += 4;
146 }
147
148 return limit;
149
150#else
151 // Simple portable version that doesn't use unaligned access.
152# define LZMA_MEMCMPLEN_EXTRA 0
153 while (len < limit && buf1[len] == buf2[len])
154 ++len;
155
156 return len;
157#endif
158}
159
160#endif
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette