1 | // SPDX-License-Identifier: 0BSD
|
---|
2 |
|
---|
3 | ///////////////////////////////////////////////////////////////////////////////
|
---|
4 | //
|
---|
5 | /// \file lzma_common.h
|
---|
6 | /// \brief Private definitions common to LZMA encoder and decoder
|
---|
7 | ///
|
---|
8 | // Authors: Igor Pavlov
|
---|
9 | // Lasse Collin
|
---|
10 | //
|
---|
11 | ///////////////////////////////////////////////////////////////////////////////
|
---|
12 |
|
---|
13 | #ifndef LZMA_LZMA_COMMON_H
|
---|
14 | #define LZMA_LZMA_COMMON_H
|
---|
15 |
|
---|
16 | #include "common.h"
|
---|
17 | #include "range_common.h"
|
---|
18 |
|
---|
19 |
|
---|
20 | ///////////////////
|
---|
21 | // Miscellaneous //
|
---|
22 | ///////////////////
|
---|
23 |
|
---|
24 | /// Maximum number of position states. A position state is the lowest pos bits
|
---|
25 | /// number of bits of the current uncompressed offset. In some places there
|
---|
26 | /// are different sets of probabilities for different pos states.
|
---|
27 | #define POS_STATES_MAX (1 << LZMA_PB_MAX)
|
---|
28 |
|
---|
29 |
|
---|
30 | /// Validates lc, lp, and pb.
|
---|
31 | static inline bool
|
---|
32 | is_lclppb_valid(const lzma_options_lzma *options)
|
---|
33 | {
|
---|
34 | return options->lc <= LZMA_LCLP_MAX && options->lp <= LZMA_LCLP_MAX
|
---|
35 | && options->lc + options->lp <= LZMA_LCLP_MAX
|
---|
36 | && options->pb <= LZMA_PB_MAX;
|
---|
37 | }
|
---|
38 |
|
---|
39 |
|
---|
40 | ///////////
|
---|
41 | // State //
|
---|
42 | ///////////
|
---|
43 |
|
---|
44 | /// This enum is used to track which events have occurred most recently and
|
---|
45 | /// in which order. This information is used to predict the next event.
|
---|
46 | ///
|
---|
47 | /// Events:
|
---|
48 | /// - Literal: One 8-bit byte
|
---|
49 | /// - Match: Repeat a chunk of data at some distance
|
---|
50 | /// - Long repeat: Multi-byte match at a recently seen distance
|
---|
51 | /// - Short repeat: One-byte repeat at a recently seen distance
|
---|
52 | ///
|
---|
53 | /// The event names are in from STATE_oldest_older_previous. REP means
|
---|
54 | /// either short or long repeated match, and NONLIT means any non-literal.
|
---|
55 | typedef enum {
|
---|
56 | STATE_LIT_LIT,
|
---|
57 | STATE_MATCH_LIT_LIT,
|
---|
58 | STATE_REP_LIT_LIT,
|
---|
59 | STATE_SHORTREP_LIT_LIT,
|
---|
60 | STATE_MATCH_LIT,
|
---|
61 | STATE_REP_LIT,
|
---|
62 | STATE_SHORTREP_LIT,
|
---|
63 | STATE_LIT_MATCH,
|
---|
64 | STATE_LIT_LONGREP,
|
---|
65 | STATE_LIT_SHORTREP,
|
---|
66 | STATE_NONLIT_MATCH,
|
---|
67 | STATE_NONLIT_REP,
|
---|
68 | } lzma_lzma_state;
|
---|
69 |
|
---|
70 |
|
---|
71 | /// Total number of states
|
---|
72 | #define STATES 12
|
---|
73 |
|
---|
74 | /// The lowest 7 states indicate that the previous state was a literal.
|
---|
75 | #define LIT_STATES 7
|
---|
76 |
|
---|
77 |
|
---|
78 | /// Indicate that the latest state was a literal.
|
---|
79 | #define update_literal(state) \
|
---|
80 | state = ((state) <= STATE_SHORTREP_LIT_LIT \
|
---|
81 | ? STATE_LIT_LIT \
|
---|
82 | : ((state) <= STATE_LIT_SHORTREP \
|
---|
83 | ? (state) - 3 \
|
---|
84 | : (state) - 6))
|
---|
85 |
|
---|
86 | /// Like update_literal(state) but when it is already known that
|
---|
87 | /// is_literal_state(state) is true.
|
---|
88 | #define update_literal_normal(state) \
|
---|
89 | state = ((state) <= STATE_SHORTREP_LIT_LIT \
|
---|
90 | ? STATE_LIT_LIT \
|
---|
91 | : (state) - 3);
|
---|
92 |
|
---|
93 | /// Like update_literal(state) but when it is already known that
|
---|
94 | /// is_literal_state(state) is false.
|
---|
95 | #define update_literal_matched(state) \
|
---|
96 | state = ((state) <= STATE_LIT_SHORTREP \
|
---|
97 | ? (state) - 3 \
|
---|
98 | : (state) - 6);
|
---|
99 |
|
---|
100 | /// Indicate that the latest state was a match.
|
---|
101 | #define update_match(state) \
|
---|
102 | state = ((state) < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH)
|
---|
103 |
|
---|
104 | /// Indicate that the latest state was a long repeated match.
|
---|
105 | #define update_long_rep(state) \
|
---|
106 | state = ((state) < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP)
|
---|
107 |
|
---|
108 | /// Indicate that the latest state was a short match.
|
---|
109 | #define update_short_rep(state) \
|
---|
110 | state = ((state) < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP)
|
---|
111 |
|
---|
112 | /// Test if the previous state was a literal.
|
---|
113 | #define is_literal_state(state) \
|
---|
114 | ((state) < LIT_STATES)
|
---|
115 |
|
---|
116 |
|
---|
117 | /////////////
|
---|
118 | // Literal //
|
---|
119 | /////////////
|
---|
120 |
|
---|
121 | /// Each literal coder is divided in three sections:
|
---|
122 | /// - 0x001-0x0FF: Without match byte
|
---|
123 | /// - 0x101-0x1FF: With match byte; match bit is 0
|
---|
124 | /// - 0x201-0x2FF: With match byte; match bit is 1
|
---|
125 | ///
|
---|
126 | /// Match byte is used when the previous LZMA symbol was something else than
|
---|
127 | /// a literal (that is, it was some kind of match).
|
---|
128 | #define LITERAL_CODER_SIZE UINT32_C(0x300)
|
---|
129 |
|
---|
130 | /// Maximum number of literal coders
|
---|
131 | #define LITERAL_CODERS_MAX (1 << LZMA_LCLP_MAX)
|
---|
132 |
|
---|
133 | /// Calculates the literal_mask that literal_subcoder() needs.
|
---|
134 | #define literal_mask_calc(lc, lp) \
|
---|
135 | ((UINT32_C(0x100) << (lp)) - (UINT32_C(0x100) >> (lc)))
|
---|
136 |
|
---|
137 | /// Locate the literal coder for the next literal byte. The choice depends on
|
---|
138 | /// - the lowest literal_pos_bits bits of the position of the current
|
---|
139 | /// byte; and
|
---|
140 | /// - the highest literal_context_bits bits of the previous byte.
|
---|
141 | #define literal_subcoder(probs, lc, literal_mask, pos, prev_byte) \
|
---|
142 | ((probs) + UINT32_C(3) * \
|
---|
143 | (((((pos) << 8) + (prev_byte)) & (literal_mask)) << (lc)))
|
---|
144 |
|
---|
145 |
|
---|
146 | static inline void
|
---|
147 | literal_init(probability *probs, uint32_t lc, uint32_t lp)
|
---|
148 | {
|
---|
149 | assert(lc + lp <= LZMA_LCLP_MAX);
|
---|
150 |
|
---|
151 | const size_t coders = LITERAL_CODER_SIZE << (lc + lp);
|
---|
152 |
|
---|
153 | for (size_t i = 0; i < coders; ++i)
|
---|
154 | bit_reset(probs[i]);
|
---|
155 |
|
---|
156 | return;
|
---|
157 | }
|
---|
158 |
|
---|
159 |
|
---|
160 | //////////////////
|
---|
161 | // Match length //
|
---|
162 | //////////////////
|
---|
163 |
|
---|
164 | // Minimum length of a match is two bytes.
|
---|
165 | #define MATCH_LEN_MIN 2
|
---|
166 |
|
---|
167 | // Match length is encoded with 4, 5, or 10 bits.
|
---|
168 | //
|
---|
169 | // Length Bits
|
---|
170 | // 2-9 4 = Choice=0 + 3 bits
|
---|
171 | // 10-17 5 = Choice=1 + Choice2=0 + 3 bits
|
---|
172 | // 18-273 10 = Choice=1 + Choice2=1 + 8 bits
|
---|
173 | #define LEN_LOW_BITS 3
|
---|
174 | #define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS)
|
---|
175 | #define LEN_MID_BITS 3
|
---|
176 | #define LEN_MID_SYMBOLS (1 << LEN_MID_BITS)
|
---|
177 | #define LEN_HIGH_BITS 8
|
---|
178 | #define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS)
|
---|
179 | #define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS)
|
---|
180 |
|
---|
181 | // Maximum length of a match is 273 which is a result of the encoding
|
---|
182 | // described above.
|
---|
183 | #define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1)
|
---|
184 |
|
---|
185 |
|
---|
186 | ////////////////////
|
---|
187 | // Match distance //
|
---|
188 | ////////////////////
|
---|
189 |
|
---|
190 | // Different sets of probabilities are used for match distances that have very
|
---|
191 | // short match length: Lengths of 2, 3, and 4 bytes have a separate set of
|
---|
192 | // probabilities for each length. The matches with longer length use a shared
|
---|
193 | // set of probabilities.
|
---|
194 | #define DIST_STATES 4
|
---|
195 |
|
---|
196 | // Macro to get the index of the appropriate probability array.
|
---|
197 | #define get_dist_state(len) \
|
---|
198 | ((len) < DIST_STATES + MATCH_LEN_MIN \
|
---|
199 | ? (len) - MATCH_LEN_MIN \
|
---|
200 | : DIST_STATES - 1)
|
---|
201 |
|
---|
202 | // The highest two bits of a match distance (distance slot) are encoded
|
---|
203 | // using six bits. See fastpos.h for more explanation.
|
---|
204 | #define DIST_SLOT_BITS 6
|
---|
205 | #define DIST_SLOTS (1 << DIST_SLOT_BITS)
|
---|
206 |
|
---|
207 | // Match distances up to 127 are fully encoded using probabilities. Since
|
---|
208 | // the highest two bits (distance slot) are always encoded using six bits,
|
---|
209 | // the distances 0-3 don't need any additional bits to encode, since the
|
---|
210 | // distance slot itself is the same as the actual distance. DIST_MODEL_START
|
---|
211 | // indicates the first distance slot where at least one additional bit is
|
---|
212 | // needed.
|
---|
213 | #define DIST_MODEL_START 4
|
---|
214 |
|
---|
215 | // Match distances greater than 127 are encoded in three pieces:
|
---|
216 | // - distance slot: the highest two bits
|
---|
217 | // - direct bits: 2-26 bits below the highest two bits
|
---|
218 | // - alignment bits: four lowest bits
|
---|
219 | //
|
---|
220 | // Direct bits don't use any probabilities.
|
---|
221 | //
|
---|
222 | // The distance slot value of 14 is for distances 128-191 (see the table in
|
---|
223 | // fastpos.h to understand why).
|
---|
224 | #define DIST_MODEL_END 14
|
---|
225 |
|
---|
226 | // Distance slots that indicate a distance <= 127.
|
---|
227 | #define FULL_DISTANCES_BITS (DIST_MODEL_END / 2)
|
---|
228 | #define FULL_DISTANCES (1 << FULL_DISTANCES_BITS)
|
---|
229 |
|
---|
230 | // For match distances greater than 127, only the highest two bits and the
|
---|
231 | // lowest four bits (alignment) is encoded using probabilities.
|
---|
232 | #define ALIGN_BITS 4
|
---|
233 | #define ALIGN_SIZE (1 << ALIGN_BITS)
|
---|
234 | #define ALIGN_MASK (ALIGN_SIZE - 1)
|
---|
235 |
|
---|
236 | // LZMA remembers the four most recent match distances. Reusing these distances
|
---|
237 | // tends to take less space than re-encoding the actual distance value.
|
---|
238 | #define REPS 4
|
---|
239 |
|
---|
240 | #endif
|
---|