1 | // SPDX-License-Identifier: 0BSD
|
---|
2 |
|
---|
3 | ///////////////////////////////////////////////////////////////////////////////
|
---|
4 | //
|
---|
5 | /// \file lz_encoder.h
|
---|
6 | /// \brief LZ in window and match finder API
|
---|
7 | ///
|
---|
8 | // Authors: Igor Pavlov
|
---|
9 | // Lasse Collin
|
---|
10 | //
|
---|
11 | ///////////////////////////////////////////////////////////////////////////////
|
---|
12 |
|
---|
13 | #ifndef LZMA_LZ_ENCODER_H
|
---|
14 | #define LZMA_LZ_ENCODER_H
|
---|
15 |
|
---|
16 | #include "common.h"
|
---|
17 |
|
---|
18 |
|
---|
19 | // For now, the dictionary size is limited to 1.5 GiB. This may grow
|
---|
20 | // in the future if needed, but it needs a little more work than just
|
---|
21 | // changing this check.
|
---|
22 | #define IS_ENC_DICT_SIZE_VALID(size) \
|
---|
23 | ((size) >= LZMA_DICT_SIZE_MIN \
|
---|
24 | && (size) <= (UINT32_C(1) << 30) + (UINT32_C(1) << 29))
|
---|
25 |
|
---|
26 |
|
---|
27 | /// A table of these is used by the LZ-based encoder to hold
|
---|
28 | /// the length-distance pairs found by the match finder.
|
---|
29 | typedef struct {
|
---|
30 | uint32_t len;
|
---|
31 | uint32_t dist;
|
---|
32 | } lzma_match;
|
---|
33 |
|
---|
34 |
|
---|
35 | typedef struct lzma_mf_s lzma_mf;
|
---|
36 | struct lzma_mf_s {
|
---|
37 | ///////////////
|
---|
38 | // In Window //
|
---|
39 | ///////////////
|
---|
40 |
|
---|
41 | /// Pointer to buffer with data to be compressed
|
---|
42 | uint8_t *buffer;
|
---|
43 |
|
---|
44 | /// Total size of the allocated buffer (that is, including all
|
---|
45 | /// the extra space)
|
---|
46 | uint32_t size;
|
---|
47 |
|
---|
48 | /// Number of bytes that must be kept available in our input history.
|
---|
49 | /// That is, once keep_size_before bytes have been processed,
|
---|
50 | /// buffer[read_pos - keep_size_before] is the oldest byte that
|
---|
51 | /// must be available for reading.
|
---|
52 | uint32_t keep_size_before;
|
---|
53 |
|
---|
54 | /// Number of bytes that must be kept in buffer after read_pos.
|
---|
55 | /// That is, read_pos <= write_pos - keep_size_after as long as
|
---|
56 | /// action is LZMA_RUN; when action != LZMA_RUN, read_pos is allowed
|
---|
57 | /// to reach write_pos so that the last bytes get encoded too.
|
---|
58 | uint32_t keep_size_after;
|
---|
59 |
|
---|
60 | /// Match finders store locations of matches using 32-bit integers.
|
---|
61 | /// To avoid adjusting several megabytes of integers every time the
|
---|
62 | /// input window is moved with move_window, we only adjust the
|
---|
63 | /// offset of the buffer. Thus, buffer[value_in_hash_table - offset]
|
---|
64 | /// is the byte pointed by value_in_hash_table.
|
---|
65 | uint32_t offset;
|
---|
66 |
|
---|
67 | /// buffer[read_pos] is the next byte to run through the match
|
---|
68 | /// finder. This is incremented in the match finder once the byte
|
---|
69 | /// has been processed.
|
---|
70 | uint32_t read_pos;
|
---|
71 |
|
---|
72 | /// Number of bytes that have been ran through the match finder, but
|
---|
73 | /// which haven't been encoded by the LZ-based encoder yet.
|
---|
74 | uint32_t read_ahead;
|
---|
75 |
|
---|
76 | /// As long as read_pos is less than read_limit, there is enough
|
---|
77 | /// input available in buffer for at least one encoding loop.
|
---|
78 | ///
|
---|
79 | /// Because of the stateful API, read_limit may and will get greater
|
---|
80 | /// than read_pos quite often. This is taken into account when
|
---|
81 | /// calculating the value for keep_size_after.
|
---|
82 | uint32_t read_limit;
|
---|
83 |
|
---|
84 | /// buffer[write_pos] is the first byte that doesn't contain valid
|
---|
85 | /// uncompressed data; that is, the next input byte will be copied
|
---|
86 | /// to buffer[write_pos].
|
---|
87 | uint32_t write_pos;
|
---|
88 |
|
---|
89 | /// Number of bytes not hashed before read_pos. This is needed to
|
---|
90 | /// restart the match finder after LZMA_SYNC_FLUSH.
|
---|
91 | uint32_t pending;
|
---|
92 |
|
---|
93 | //////////////////
|
---|
94 | // Match Finder //
|
---|
95 | //////////////////
|
---|
96 |
|
---|
97 | /// Find matches. Returns the number of distance-length pairs written
|
---|
98 | /// to the matches array. This is called only via lzma_mf_find().
|
---|
99 | uint32_t (*find)(lzma_mf *mf, lzma_match *matches);
|
---|
100 |
|
---|
101 | /// Skips num bytes. This is like find() but doesn't make the
|
---|
102 | /// distance-length pairs available, thus being a little faster.
|
---|
103 | /// This is called only via mf_skip().
|
---|
104 | void (*skip)(lzma_mf *mf, uint32_t num);
|
---|
105 |
|
---|
106 | uint32_t *hash;
|
---|
107 | uint32_t *son;
|
---|
108 | uint32_t cyclic_pos;
|
---|
109 | uint32_t cyclic_size; // Must be dictionary size + 1.
|
---|
110 | uint32_t hash_mask;
|
---|
111 |
|
---|
112 | /// Maximum number of loops in the match finder
|
---|
113 | uint32_t depth;
|
---|
114 |
|
---|
115 | /// Maximum length of a match that the match finder will try to find.
|
---|
116 | uint32_t nice_len;
|
---|
117 |
|
---|
118 | /// Maximum length of a match supported by the LZ-based encoder.
|
---|
119 | /// If the longest match found by the match finder is nice_len,
|
---|
120 | /// mf_find() tries to expand it up to match_len_max bytes.
|
---|
121 | uint32_t match_len_max;
|
---|
122 |
|
---|
123 | /// When running out of input, binary tree match finders need to know
|
---|
124 | /// if it is due to flushing or finishing. The action is used also
|
---|
125 | /// by the LZ-based encoders themselves.
|
---|
126 | lzma_action action;
|
---|
127 |
|
---|
128 | /// Number of elements in hash[]
|
---|
129 | uint32_t hash_count;
|
---|
130 |
|
---|
131 | /// Number of elements in son[]
|
---|
132 | uint32_t sons_count;
|
---|
133 | };
|
---|
134 |
|
---|
135 |
|
---|
136 | typedef struct {
|
---|
137 | /// Extra amount of data to keep available before the "actual"
|
---|
138 | /// dictionary.
|
---|
139 | size_t before_size;
|
---|
140 |
|
---|
141 | /// Size of the history buffer
|
---|
142 | size_t dict_size;
|
---|
143 |
|
---|
144 | /// Extra amount of data to keep available after the "actual"
|
---|
145 | /// dictionary.
|
---|
146 | size_t after_size;
|
---|
147 |
|
---|
148 | /// Maximum length of a match that the LZ-based encoder can accept.
|
---|
149 | /// This is used to extend matches of length nice_len to the
|
---|
150 | /// maximum possible length.
|
---|
151 | size_t match_len_max;
|
---|
152 |
|
---|
153 | /// Match finder will search matches up to this length.
|
---|
154 | /// This must be less than or equal to match_len_max.
|
---|
155 | size_t nice_len;
|
---|
156 |
|
---|
157 | /// Type of the match finder to use
|
---|
158 | lzma_match_finder match_finder;
|
---|
159 |
|
---|
160 | /// Maximum search depth
|
---|
161 | uint32_t depth;
|
---|
162 |
|
---|
163 | /// Initial dictionary for the match finder to search.
|
---|
164 | const uint8_t *preset_dict;
|
---|
165 |
|
---|
166 | /// If the preset dictionary is NULL, this value is ignored.
|
---|
167 | /// Otherwise this member must indicate the preset dictionary's
|
---|
168 | /// buffer size. If this size is larger than dict_size, then only
|
---|
169 | /// the dict_size sized tail of the preset_dict will be used.
|
---|
170 | uint32_t preset_dict_size;
|
---|
171 |
|
---|
172 | } lzma_lz_options;
|
---|
173 |
|
---|
174 |
|
---|
175 | // The total usable buffer space at any moment outside the match finder:
|
---|
176 | // before_size + dict_size + after_size + match_len_max
|
---|
177 | //
|
---|
178 | // In reality, there's some extra space allocated to prevent the number of
|
---|
179 | // memmove() calls reasonable. The bigger the dict_size is, the bigger
|
---|
180 | // this extra buffer will be since with bigger dictionaries memmove() would
|
---|
181 | // also take longer.
|
---|
182 | //
|
---|
183 | // A single encoder loop in the LZ-based encoder may call the match finder
|
---|
184 | // (mf_find() or mf_skip()) at most after_size times. In other words,
|
---|
185 | // a single encoder loop may increment lzma_mf.read_pos at most after_size
|
---|
186 | // times. Since matches are looked up to
|
---|
187 | // lzma_mf.buffer[lzma_mf.read_pos + match_len_max - 1], the total
|
---|
188 | // amount of extra buffer needed after dict_size becomes
|
---|
189 | // after_size + match_len_max.
|
---|
190 | //
|
---|
191 | // before_size has two uses. The first one is to keep literals available
|
---|
192 | // in cases when the LZ-based encoder has made some read ahead.
|
---|
193 | // TODO: Maybe this could be changed by making the LZ-based encoders to
|
---|
194 | // store the actual literals as they do with length-distance pairs.
|
---|
195 | //
|
---|
196 | // Algorithms such as LZMA2 first try to compress a chunk, and then check
|
---|
197 | // if the encoded result is smaller than the uncompressed one. If the chunk
|
---|
198 | // was incompressible, it is better to store it in uncompressed form in
|
---|
199 | // the output stream. To do this, the whole uncompressed chunk has to be
|
---|
200 | // still available in the history buffer. before_size achieves that.
|
---|
201 |
|
---|
202 |
|
---|
203 | typedef struct {
|
---|
204 | /// Data specific to the LZ-based encoder
|
---|
205 | void *coder;
|
---|
206 |
|
---|
207 | /// Function to encode from *dict to out[]
|
---|
208 | lzma_ret (*code)(void *coder,
|
---|
209 | lzma_mf *restrict mf, uint8_t *restrict out,
|
---|
210 | size_t *restrict out_pos, size_t out_size);
|
---|
211 |
|
---|
212 | /// Free allocated resources
|
---|
213 | void (*end)(void *coder, const lzma_allocator *allocator);
|
---|
214 |
|
---|
215 | /// Update the options in the middle of the encoding.
|
---|
216 | lzma_ret (*options_update)(void *coder, const lzma_filter *filter);
|
---|
217 |
|
---|
218 | /// Set maximum allowed output size
|
---|
219 | lzma_ret (*set_out_limit)(void *coder, uint64_t *uncomp_size,
|
---|
220 | uint64_t out_limit);
|
---|
221 |
|
---|
222 | } lzma_lz_encoder;
|
---|
223 |
|
---|
224 |
|
---|
225 | // Basic steps:
|
---|
226 | // 1. Input gets copied into the dictionary.
|
---|
227 | // 2. Data in dictionary gets run through the match finder byte by byte.
|
---|
228 | // 3. The literals and matches are encoded using e.g. LZMA.
|
---|
229 | //
|
---|
230 | // The bytes that have been ran through the match finder, but not encoded yet,
|
---|
231 | // are called 'read ahead'.
|
---|
232 |
|
---|
233 |
|
---|
234 | /// Get how many bytes the match finder hashes in its initial step.
|
---|
235 | /// This is also the minimum nice_len value with the match finder.
|
---|
236 | static inline uint32_t
|
---|
237 | mf_get_hash_bytes(lzma_match_finder match_finder)
|
---|
238 | {
|
---|
239 | return (uint32_t)match_finder & 0x0F;
|
---|
240 | }
|
---|
241 |
|
---|
242 |
|
---|
243 | /// Get pointer to the first byte not ran through the match finder
|
---|
244 | static inline const uint8_t *
|
---|
245 | mf_ptr(const lzma_mf *mf)
|
---|
246 | {
|
---|
247 | return mf->buffer + mf->read_pos;
|
---|
248 | }
|
---|
249 |
|
---|
250 |
|
---|
251 | /// Get the number of bytes that haven't been ran through the match finder yet.
|
---|
252 | static inline uint32_t
|
---|
253 | mf_avail(const lzma_mf *mf)
|
---|
254 | {
|
---|
255 | return mf->write_pos - mf->read_pos;
|
---|
256 | }
|
---|
257 |
|
---|
258 |
|
---|
259 | /// Get the number of bytes that haven't been encoded yet (some of these
|
---|
260 | /// bytes may have been ran through the match finder though).
|
---|
261 | static inline uint32_t
|
---|
262 | mf_unencoded(const lzma_mf *mf)
|
---|
263 | {
|
---|
264 | return mf->write_pos - mf->read_pos + mf->read_ahead;
|
---|
265 | }
|
---|
266 |
|
---|
267 |
|
---|
268 | /// Calculate the absolute offset from the beginning of the most recent
|
---|
269 | /// dictionary reset. Only the lowest four bits are important, so there's no
|
---|
270 | /// problem that we don't know the 64-bit size of the data encoded so far.
|
---|
271 | ///
|
---|
272 | /// NOTE: When moving the input window, we need to do it so that the lowest
|
---|
273 | /// bits of dict->read_pos are not modified to keep this macro working
|
---|
274 | /// as intended.
|
---|
275 | static inline uint32_t
|
---|
276 | mf_position(const lzma_mf *mf)
|
---|
277 | {
|
---|
278 | return mf->read_pos - mf->read_ahead;
|
---|
279 | }
|
---|
280 |
|
---|
281 |
|
---|
282 | /// Since everything else begins with mf_, use it also for lzma_mf_find().
|
---|
283 | #define mf_find lzma_mf_find
|
---|
284 |
|
---|
285 |
|
---|
286 | /// Skip the given number of bytes. This is used when a good match was found.
|
---|
287 | /// For example, if mf_find() finds a match of 200 bytes long, the first byte
|
---|
288 | /// of that match was already consumed by mf_find(), and the rest 199 bytes
|
---|
289 | /// have to be skipped with mf_skip(mf, 199).
|
---|
290 | static inline void
|
---|
291 | mf_skip(lzma_mf *mf, uint32_t amount)
|
---|
292 | {
|
---|
293 | if (amount != 0) {
|
---|
294 | mf->skip(mf, amount);
|
---|
295 | mf->read_ahead += amount;
|
---|
296 | }
|
---|
297 | }
|
---|
298 |
|
---|
299 |
|
---|
300 | /// Copies at most *left number of bytes from the history buffer
|
---|
301 | /// to out[]. This is needed by LZMA2 to encode uncompressed chunks.
|
---|
302 | static inline void
|
---|
303 | mf_read(lzma_mf *mf, uint8_t *out, size_t *out_pos, size_t out_size,
|
---|
304 | size_t *left)
|
---|
305 | {
|
---|
306 | const size_t out_avail = out_size - *out_pos;
|
---|
307 | const size_t copy_size = my_min(out_avail, *left);
|
---|
308 |
|
---|
309 | assert(mf->read_ahead == 0);
|
---|
310 | assert(mf->read_pos >= *left);
|
---|
311 |
|
---|
312 | memcpy(out + *out_pos, mf->buffer + mf->read_pos - *left,
|
---|
313 | copy_size);
|
---|
314 |
|
---|
315 | *out_pos += copy_size;
|
---|
316 | *left -= copy_size;
|
---|
317 | return;
|
---|
318 | }
|
---|
319 |
|
---|
320 |
|
---|
321 | extern lzma_ret lzma_lz_encoder_init(
|
---|
322 | lzma_next_coder *next, const lzma_allocator *allocator,
|
---|
323 | const lzma_filter_info *filters,
|
---|
324 | lzma_ret (*lz_init)(lzma_lz_encoder *lz,
|
---|
325 | const lzma_allocator *allocator,
|
---|
326 | lzma_vli id, const void *options,
|
---|
327 | lzma_lz_options *lz_options));
|
---|
328 |
|
---|
329 |
|
---|
330 | extern uint64_t lzma_lz_encoder_memusage(const lzma_lz_options *lz_options);
|
---|
331 |
|
---|
332 |
|
---|
333 | // These are only for LZ encoder's internal use.
|
---|
334 | extern uint32_t lzma_mf_find(
|
---|
335 | lzma_mf *mf, uint32_t *count, lzma_match *matches);
|
---|
336 |
|
---|
337 | extern uint32_t lzma_mf_hc3_find(lzma_mf *dict, lzma_match *matches);
|
---|
338 | extern void lzma_mf_hc3_skip(lzma_mf *dict, uint32_t amount);
|
---|
339 |
|
---|
340 | extern uint32_t lzma_mf_hc4_find(lzma_mf *dict, lzma_match *matches);
|
---|
341 | extern void lzma_mf_hc4_skip(lzma_mf *dict, uint32_t amount);
|
---|
342 |
|
---|
343 | extern uint32_t lzma_mf_bt2_find(lzma_mf *dict, lzma_match *matches);
|
---|
344 | extern void lzma_mf_bt2_skip(lzma_mf *dict, uint32_t amount);
|
---|
345 |
|
---|
346 | extern uint32_t lzma_mf_bt3_find(lzma_mf *dict, lzma_match *matches);
|
---|
347 | extern void lzma_mf_bt3_skip(lzma_mf *dict, uint32_t amount);
|
---|
348 |
|
---|
349 | extern uint32_t lzma_mf_bt4_find(lzma_mf *dict, lzma_match *matches);
|
---|
350 | extern void lzma_mf_bt4_skip(lzma_mf *dict, uint32_t amount);
|
---|
351 |
|
---|
352 | #endif
|
---|