1 | // SPDX-License-Identifier: 0BSD
|
---|
2 |
|
---|
3 | ///////////////////////////////////////////////////////////////////////////////
|
---|
4 | //
|
---|
5 | /// \file crc32_arm64.h
|
---|
6 | /// \brief CRC32 calculation with ARM64 optimization
|
---|
7 | //
|
---|
8 | // Authors: Chenxi Mao
|
---|
9 | // Jia Tan
|
---|
10 | // Lasse Collin
|
---|
11 | //
|
---|
12 | ///////////////////////////////////////////////////////////////////////////////
|
---|
13 |
|
---|
14 | #ifndef LZMA_CRC32_ARM64_H
|
---|
15 | #define LZMA_CRC32_ARM64_H
|
---|
16 |
|
---|
17 | // MSVC always has the CRC intrinsics available when building for ARM64,
|
---|
18 | // so there is no need to include any header files.
|
---|
19 | #ifndef _MSC_VER
|
---|
20 | # include <arm_acle.h>
|
---|
21 | #endif
|
---|
22 |
|
---|
23 | // If both versions are going to be built, we need runtime detection
|
---|
24 | // to check if the instructions are supported.
|
---|
25 | #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
|
---|
26 | # if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
|
---|
27 | # include <sys/auxv.h>
|
---|
28 | # elif defined(_WIN32)
|
---|
29 | # include <processthreadsapi.h>
|
---|
30 | # elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
|
---|
31 | # include <sys/sysctl.h>
|
---|
32 | # endif
|
---|
33 | #endif
|
---|
34 |
|
---|
35 | // Some EDG-based compilers support ARM64 and define __GNUC__
|
---|
36 | // (such as Nvidia's nvcc), but do not support function attributes.
|
---|
37 | //
|
---|
38 | // NOTE: Build systems check for this too, keep them in sync with this.
|
---|
39 | #if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
|
---|
40 | # define crc_attr_target __attribute__((__target__("+crc")))
|
---|
41 | #else
|
---|
42 | # define crc_attr_target
|
---|
43 | #endif
|
---|
44 |
|
---|
45 |
|
---|
46 | crc_attr_target
|
---|
47 | static uint32_t
|
---|
48 | crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
|
---|
49 | {
|
---|
50 | crc = ~crc;
|
---|
51 |
|
---|
52 | if (size >= 8) {
|
---|
53 | // Align the input buffer because this was shown to be
|
---|
54 | // significantly faster than unaligned accesses.
|
---|
55 | const size_t align = (0 - (uintptr_t)buf) & 7;
|
---|
56 |
|
---|
57 | if (align & 1)
|
---|
58 | crc = __crc32b(crc, *buf++);
|
---|
59 |
|
---|
60 | if (align & 2) {
|
---|
61 | crc = __crc32h(crc, aligned_read16le(buf));
|
---|
62 | buf += 2;
|
---|
63 | }
|
---|
64 |
|
---|
65 | if (align & 4) {
|
---|
66 | crc = __crc32w(crc, aligned_read32le(buf));
|
---|
67 | buf += 4;
|
---|
68 | }
|
---|
69 |
|
---|
70 | size -= align;
|
---|
71 |
|
---|
72 | // Process 8 bytes at a time. The end point is determined by
|
---|
73 | // ignoring the least significant three bits of size to
|
---|
74 | // ensure we do not process past the bounds of the buffer.
|
---|
75 | // This guarantees that limit is a multiple of 8 and is
|
---|
76 | // strictly less than size.
|
---|
77 | for (const uint8_t *limit = buf + (size & ~(size_t)7);
|
---|
78 | buf < limit; buf += 8)
|
---|
79 | crc = __crc32d(crc, aligned_read64le(buf));
|
---|
80 |
|
---|
81 | size &= 7;
|
---|
82 | }
|
---|
83 |
|
---|
84 | // Process the remaining bytes that are not 8 byte aligned.
|
---|
85 | if (size & 4) {
|
---|
86 | crc = __crc32w(crc, aligned_read32le(buf));
|
---|
87 | buf += 4;
|
---|
88 | }
|
---|
89 |
|
---|
90 | if (size & 2) {
|
---|
91 | crc = __crc32h(crc, aligned_read16le(buf));
|
---|
92 | buf += 2;
|
---|
93 | }
|
---|
94 |
|
---|
95 | if (size & 1)
|
---|
96 | crc = __crc32b(crc, *buf);
|
---|
97 |
|
---|
98 | return ~crc;
|
---|
99 | }
|
---|
100 |
|
---|
101 |
|
---|
102 | #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
|
---|
103 | static inline bool
|
---|
104 | is_arch_extension_supported(void)
|
---|
105 | {
|
---|
106 | #if defined(HAVE_GETAUXVAL)
|
---|
107 | return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
|
---|
108 |
|
---|
109 | #elif defined(HAVE_ELF_AUX_INFO)
|
---|
110 | unsigned long feature_flags;
|
---|
111 |
|
---|
112 | if (elf_aux_info(AT_HWCAP, &feature_flags, sizeof(feature_flags)) != 0)
|
---|
113 | return false;
|
---|
114 |
|
---|
115 | return (feature_flags & HWCAP_CRC32) != 0;
|
---|
116 |
|
---|
117 | #elif defined(_WIN32)
|
---|
118 | return IsProcessorFeaturePresent(
|
---|
119 | PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
|
---|
120 |
|
---|
121 | #elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
|
---|
122 | int has_crc32 = 0;
|
---|
123 | size_t size = sizeof(has_crc32);
|
---|
124 |
|
---|
125 | // The sysctlbyname() function requires a string identifier for the
|
---|
126 | // CPU feature it tests. The Apple documentation lists the string
|
---|
127 | // "hw.optional.armv8_crc32", which can be found here:
|
---|
128 | // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619
|
---|
129 | if (sysctlbyname("hw.optional.armv8_crc32", &has_crc32,
|
---|
130 | &size, NULL, 0) != 0)
|
---|
131 | return false;
|
---|
132 |
|
---|
133 | return has_crc32;
|
---|
134 |
|
---|
135 | #else
|
---|
136 | // If a runtime detection method cannot be found, then this must
|
---|
137 | // be a compile time error. The checks in crc_common.h should ensure
|
---|
138 | // a runtime detection method is always found if this function is
|
---|
139 | // built. It would be possible to just return false here, but this
|
---|
140 | // is inefficient for binary size and runtime since only the generic
|
---|
141 | // method could ever be used.
|
---|
142 | # error Runtime detection method unavailable.
|
---|
143 | #endif
|
---|
144 | }
|
---|
145 | #endif
|
---|
146 |
|
---|
147 | #endif // LZMA_CRC32_ARM64_H
|
---|