///////////////////////////////////////////////////////////////////////////////
//
/// \file       lz_encoder.h
/// \brief      LZ in window and match finder API
///
//  Authors:    Igor Pavlov
//              Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

14 | #ifndef LZMA_LZ_ENCODER_H
|
---|
15 | #define LZMA_LZ_ENCODER_H
|
---|
16 |
|
---|
17 | #include "common.h"
|
---|
18 |
|
---|
19 |
|
---|
/// A table of these is used by the LZ-based encoder to hold
/// the length-distance pairs found by the match finder.
typedef struct {
	/// Length of the match
	uint32_t len;

	/// Distance of the match
	uint32_t dist;
} lzma_match;


28 | typedef struct lzma_mf_s lzma_mf;
|
---|
29 | struct lzma_mf_s {
|
---|
30 | ///////////////
|
---|
31 | // In Window //
|
---|
32 | ///////////////
|
---|
33 |
|
---|
34 | /// Pointer to buffer with data to be compressed
|
---|
35 | uint8_t *buffer;
|
---|
36 |
|
---|
37 | /// Total size of the allocated buffer (that is, including all
|
---|
38 | /// the extra space)
|
---|
39 | uint32_t size;
|
---|
40 |
|
---|
41 | /// Number of bytes that must be kept available in our input history.
|
---|
42 | /// That is, once keep_size_before bytes have been processed,
|
---|
43 | /// buffer[read_pos - keep_size_before] is the oldest byte that
|
---|
44 | /// must be available for reading.
|
---|
45 | uint32_t keep_size_before;
|
---|
46 |
|
---|
47 | /// Number of bytes that must be kept in buffer after read_pos.
|
---|
48 | /// That is, read_pos <= write_pos - keep_size_after as long as
|
---|
49 | /// action is LZMA_RUN; when action != LZMA_RUN, read_pos is allowed
|
---|
50 | /// to reach write_pos so that the last bytes get encoded too.
|
---|
51 | uint32_t keep_size_after;
|
---|
52 |
|
---|
53 | /// Match finders store locations of matches using 32-bit integers.
|
---|
54 | /// To avoid adjusting several megabytes of integers every time the
|
---|
55 | /// input window is moved with move_window, we only adjust the
|
---|
56 | /// offset of the buffer. Thus, buffer[value_in_hash_table - offset]
|
---|
57 | /// is the byte pointed by value_in_hash_table.
|
---|
58 | uint32_t offset;
|
---|
59 |
|
---|
60 | /// buffer[read_pos] is the next byte to run through the match
|
---|
61 | /// finder. This is incremented in the match finder once the byte
|
---|
62 | /// has been processed.
|
---|
63 | uint32_t read_pos;
|
---|
64 |
|
---|
65 | /// Number of bytes that have been ran through the match finder, but
|
---|
66 | /// which haven't been encoded by the LZ-based encoder yet.
|
---|
67 | uint32_t read_ahead;
|
---|
68 |
|
---|
69 | /// As long as read_pos is less than read_limit, there is enough
|
---|
70 | /// input available in buffer for at least one encoding loop.
|
---|
71 | ///
|
---|
72 | /// Because of the stateful API, read_limit may and will get greater
|
---|
73 | /// than read_pos quite often. This is taken into account when
|
---|
74 | /// calculating the value for keep_size_after.
|
---|
75 | uint32_t read_limit;
|
---|
76 |
|
---|
77 | /// buffer[write_pos] is the first byte that doesn't contain valid
|
---|
78 | /// uncompressed data; that is, the next input byte will be copied
|
---|
79 | /// to buffer[write_pos].
|
---|
80 | uint32_t write_pos;
|
---|
81 |
|
---|
82 | /// Number of bytes not hashed before read_pos. This is needed to
|
---|
83 | /// restart the match finder after LZMA_SYNC_FLUSH.
|
---|
84 | uint32_t pending;
|
---|
85 |
|
---|
86 | //////////////////
|
---|
87 | // Match Finder //
|
---|
88 | //////////////////
|
---|
89 |
|
---|
90 | /// Find matches. Returns the number of distance-length pairs written
|
---|
91 | /// to the matches array. This is called only via lzma_mf_find().
|
---|
92 | uint32_t (*find)(lzma_mf *mf, lzma_match *matches);
|
---|
93 |
|
---|
94 | /// Skips num bytes. This is like find() but doesn't make the
|
---|
95 | /// distance-length pairs available, thus being a little faster.
|
---|
96 | /// This is called only via mf_skip().
|
---|
97 | void (*skip)(lzma_mf *mf, uint32_t num);
|
---|
98 |
|
---|
99 | uint32_t *hash;
|
---|
100 | uint32_t *son;
|
---|
101 | uint32_t cyclic_pos;
|
---|
102 | uint32_t cyclic_size; // Must be dictionary size + 1.
|
---|
103 | uint32_t hash_mask;
|
---|
104 |
|
---|
105 | /// Maximum number of loops in the match finder
|
---|
106 | uint32_t depth;
|
---|
107 |
|
---|
108 | /// Maximum length of a match that the match finder will try to find.
|
---|
109 | uint32_t nice_len;
|
---|
110 |
|
---|
111 | /// Maximum length of a match supported by the LZ-based encoder.
|
---|
112 | /// If the longest match found by the match finder is nice_len,
|
---|
113 | /// mf_find() tries to expand it up to match_len_max bytes.
|
---|
114 | uint32_t match_len_max;
|
---|
115 |
|
---|
116 | /// When running out of input, binary tree match finders need to know
|
---|
117 | /// if it is due to flushing or finishing. The action is used also
|
---|
118 | /// by the LZ-based encoders themselves.
|
---|
119 | lzma_action action;
|
---|
120 |
|
---|
121 | /// Number of elements in hash[]
|
---|
122 | uint32_t hash_count;
|
---|
123 |
|
---|
124 | /// Number of elements in son[]
|
---|
125 | uint32_t sons_count;
|
---|
126 | };
|
---|
127 |
|
---|
128 |
|
---|
129 | typedef struct {
|
---|
130 | /// Extra amount of data to keep available before the "actual"
|
---|
131 | /// dictionary.
|
---|
132 | size_t before_size;
|
---|
133 |
|
---|
134 | /// Size of the history buffer
|
---|
135 | size_t dict_size;
|
---|
136 |
|
---|
137 | /// Extra amount of data to keep available after the "actual"
|
---|
138 | /// dictionary.
|
---|
139 | size_t after_size;
|
---|
140 |
|
---|
141 | /// Maximum length of a match that the LZ-based encoder can accept.
|
---|
142 | /// This is used to extend matches of length nice_len to the
|
---|
143 | /// maximum possible length.
|
---|
144 | size_t match_len_max;
|
---|
145 |
|
---|
146 | /// Match finder will search matches up to this length.
|
---|
147 | /// This must be less than or equal to match_len_max.
|
---|
148 | size_t nice_len;
|
---|
149 |
|
---|
150 | /// Type of the match finder to use
|
---|
151 | lzma_match_finder match_finder;
|
---|
152 |
|
---|
153 | /// Maximum search depth
|
---|
154 | uint32_t depth;
|
---|
155 |
|
---|
156 | /// TODO: Comment
|
---|
157 | const uint8_t *preset_dict;
|
---|
158 |
|
---|
159 | uint32_t preset_dict_size;
|
---|
160 |
|
---|
161 | } lzma_lz_options;
|
---|
162 |
|
---|
163 |
|
---|
// The total usable buffer space at any moment outside the match finder:
// before_size + dict_size + after_size + match_len_max
//
// In reality, there's some extra space allocated to keep the number of
// memmove() calls reasonable. The bigger the dict_size is, the bigger
// this extra buffer will be since with bigger dictionaries memmove() would
// also take longer.
//
// A single encoder loop in the LZ-based encoder may call the match finder
// (mf_find() or mf_skip()) at most after_size times. In other words,
// a single encoder loop may increment lzma_mf.read_pos at most after_size
// times. Since matches are looked up to
// lzma_mf.buffer[lzma_mf.read_pos + match_len_max - 1], the total
// amount of extra buffer needed after dict_size becomes
// after_size + match_len_max.
//
// before_size has two uses. The first one is to keep literals available
// in cases when the LZ-based encoder has made some read ahead.
// TODO: Maybe this could be changed by making the LZ-based encoders
// store the actual literals as they do with length-distance pairs.
//
// The second use is for algorithms such as LZMA2, which first try to
// compress a chunk, and then check if the encoded result is smaller than
// the uncompressed one. If the chunk was incompressible, it is better to
// store it in uncompressed form in the output stream. To do this, the whole
// uncompressed chunk has to be still available in the history buffer.
// before_size achieves that.


192 | typedef struct {
|
---|
193 | /// Data specific to the LZ-based encoder
|
---|
194 | void *coder;
|
---|
195 |
|
---|
196 | /// Function to encode from *dict to out[]
|
---|
197 | lzma_ret (*code)(void *coder,
|
---|
198 | lzma_mf *restrict mf, uint8_t *restrict out,
|
---|
199 | size_t *restrict out_pos, size_t out_size);
|
---|
200 |
|
---|
201 | /// Free allocated resources
|
---|
202 | void (*end)(void *coder, const lzma_allocator *allocator);
|
---|
203 |
|
---|
204 | /// Update the options in the middle of the encoding.
|
---|
205 | lzma_ret (*options_update)(void *coder, const lzma_filter *filter);
|
---|
206 |
|
---|
207 | /// Set maximum allowed output size
|
---|
208 | lzma_ret (*set_out_limit)(void *coder, uint64_t *uncomp_size,
|
---|
209 | uint64_t out_limit);
|
---|
210 |
|
---|
211 | } lzma_lz_encoder;
|
---|
212 |
|
---|
213 |
|
---|
// Basic steps:
//  1. Input gets copied into the dictionary.
//  2. Data in dictionary gets run through the match finder byte by byte.
//  3. The literals and matches are encoded using e.g. LZMA.
//
// The bytes that have been run through the match finder, but not encoded yet,
// are called "read ahead".


223 | /// Get how many bytes the match finder hashes in its initial step.
|
---|
224 | /// This is also the minimum nice_len value with the match finder.
|
---|
225 | static inline uint32_t
|
---|
226 | mf_get_hash_bytes(lzma_match_finder match_finder)
|
---|
227 | {
|
---|
228 | return (uint32_t)match_finder & 0x0F;
|
---|
229 | }
|
---|
230 |
|
---|
231 |
|
---|
232 | /// Get pointer to the first byte not ran through the match finder
|
---|
233 | static inline const uint8_t *
|
---|
234 | mf_ptr(const lzma_mf *mf)
|
---|
235 | {
|
---|
236 | return mf->buffer + mf->read_pos;
|
---|
237 | }
|
---|
238 |
|
---|
239 |
|
---|
240 | /// Get the number of bytes that haven't been ran through the match finder yet.
|
---|
241 | static inline uint32_t
|
---|
242 | mf_avail(const lzma_mf *mf)
|
---|
243 | {
|
---|
244 | return mf->write_pos - mf->read_pos;
|
---|
245 | }
|
---|
246 |
|
---|
247 |
|
---|
248 | /// Get the number of bytes that haven't been encoded yet (some of these
|
---|
249 | /// bytes may have been ran through the match finder though).
|
---|
250 | static inline uint32_t
|
---|
251 | mf_unencoded(const lzma_mf *mf)
|
---|
252 | {
|
---|
253 | return mf->write_pos - mf->read_pos + mf->read_ahead;
|
---|
254 | }
|
---|
255 |
|
---|
256 |
|
---|
257 | /// Calculate the absolute offset from the beginning of the most recent
|
---|
258 | /// dictionary reset. Only the lowest four bits are important, so there's no
|
---|
259 | /// problem that we don't know the 64-bit size of the data encoded so far.
|
---|
260 | ///
|
---|
261 | /// NOTE: When moving the input window, we need to do it so that the lowest
|
---|
262 | /// bits of dict->read_pos are not modified to keep this macro working
|
---|
263 | /// as intended.
|
---|
264 | static inline uint32_t
|
---|
265 | mf_position(const lzma_mf *mf)
|
---|
266 | {
|
---|
267 | return mf->read_pos - mf->read_ahead;
|
---|
268 | }
|
---|
269 |
|
---|
270 |
|
---|
/// Since everything else begins with mf_, use it also for lzma_mf_find().
#define mf_find lzma_mf_find


275 | /// Skip the given number of bytes. This is used when a good match was found.
|
---|
276 | /// For example, if mf_find() finds a match of 200 bytes long, the first byte
|
---|
277 | /// of that match was already consumed by mf_find(), and the rest 199 bytes
|
---|
278 | /// have to be skipped with mf_skip(mf, 199).
|
---|
279 | static inline void
|
---|
280 | mf_skip(lzma_mf *mf, uint32_t amount)
|
---|
281 | {
|
---|
282 | if (amount != 0) {
|
---|
283 | mf->skip(mf, amount);
|
---|
284 | mf->read_ahead += amount;
|
---|
285 | }
|
---|
286 | }
|
---|
287 |
|
---|
288 |
|
---|
289 | /// Copies at most *left number of bytes from the history buffer
|
---|
290 | /// to out[]. This is needed by LZMA2 to encode uncompressed chunks.
|
---|
291 | static inline void
|
---|
292 | mf_read(lzma_mf *mf, uint8_t *out, size_t *out_pos, size_t out_size,
|
---|
293 | size_t *left)
|
---|
294 | {
|
---|
295 | const size_t out_avail = out_size - *out_pos;
|
---|
296 | const size_t copy_size = my_min(out_avail, *left);
|
---|
297 |
|
---|
298 | assert(mf->read_ahead == 0);
|
---|
299 | assert(mf->read_pos >= *left);
|
---|
300 |
|
---|
301 | memcpy(out + *out_pos, mf->buffer + mf->read_pos - *left,
|
---|
302 | copy_size);
|
---|
303 |
|
---|
304 | *out_pos += copy_size;
|
---|
305 | *left -= copy_size;
|
---|
306 | return;
|
---|
307 | }
|
---|
308 |
|
---|
309 |
|
---|
310 | extern lzma_ret lzma_lz_encoder_init(
|
---|
311 | lzma_next_coder *next, const lzma_allocator *allocator,
|
---|
312 | const lzma_filter_info *filters,
|
---|
313 | lzma_ret (*lz_init)(lzma_lz_encoder *lz,
|
---|
314 | const lzma_allocator *allocator,
|
---|
315 | lzma_vli id, const void *options,
|
---|
316 | lzma_lz_options *lz_options));
|
---|
317 |
|
---|
318 |
|
---|
319 | extern uint64_t lzma_lz_encoder_memusage(const lzma_lz_options *lz_options);
|
---|
320 |
|
---|
321 |
|
---|
322 | // These are only for LZ encoder's internal use.
|
---|
323 | extern uint32_t lzma_mf_find(
|
---|
324 | lzma_mf *mf, uint32_t *count, lzma_match *matches);
|
---|
325 |
|
---|
326 | extern uint32_t lzma_mf_hc3_find(lzma_mf *dict, lzma_match *matches);
|
---|
327 | extern void lzma_mf_hc3_skip(lzma_mf *dict, uint32_t amount);
|
---|
328 |
|
---|
329 | extern uint32_t lzma_mf_hc4_find(lzma_mf *dict, lzma_match *matches);
|
---|
330 | extern void lzma_mf_hc4_skip(lzma_mf *dict, uint32_t amount);
|
---|
331 |
|
---|
332 | extern uint32_t lzma_mf_bt2_find(lzma_mf *dict, lzma_match *matches);
|
---|
333 | extern void lzma_mf_bt2_skip(lzma_mf *dict, uint32_t amount);
|
---|
334 |
|
---|
335 | extern uint32_t lzma_mf_bt3_find(lzma_mf *dict, lzma_match *matches);
|
---|
336 | extern void lzma_mf_bt3_skip(lzma_mf *dict, uint32_t amount);
|
---|
337 |
|
---|
338 | extern uint32_t lzma_mf_bt4_find(lzma_mf *dict, lzma_match *matches);
|
---|
339 | extern void lzma_mf_bt4_skip(lzma_mf *dict, uint32_t amount);
|
---|
340 |
|
---|
341 | #endif
|
---|