VirtualBox

source: vbox/trunk/src/libs/liblzma-5.6.4/lz/lz_encoder_mf.c@ 108905

Last change on this file since 108905 was 108905, checked in by vboxsync, 5 weeks ago

liblzma-5.6.4: Applied and adjusted our liblzma changes to 5.6.4. jiraref:VBP-1613

  • Property svn:eol-style set to LF
  • Property svn:keywords set to Author Date Id Revision
File size: 17.0 KB
Line 
1// SPDX-License-Identifier: 0BSD
2
3///////////////////////////////////////////////////////////////////////////////
4//
5/// \file lz_encoder_mf.c
6/// \brief Match finders
7///
8// Authors: Igor Pavlov
9// Lasse Collin
10//
11///////////////////////////////////////////////////////////////////////////////
12
13#include "lz_encoder.h"
14#include "lz_encoder_hash.h"
15#include "memcmplen.h"
16
17
18/// \brief Find matches starting from the current byte
19///
20/// \return The length of the longest match found
21extern uint32_t
22lzma_mf_find(lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches)
23{
24 // Call the match finder. It returns the number of length-distance
25 // pairs found.
26 // FIXME: Minimum count is zero, what _exactly_ is the maximum?
27 const uint32_t count = mf->find(mf, matches);
28
29 // Length of the longest match; assume that no matches were found
30 // and thus the maximum length is zero.
31 uint32_t len_best = 0;
32
33 if (count > 0) {
34#ifndef NDEBUG
35 // Validate the matches.
36 for (uint32_t i = 0; i < count; ++i) {
37 assert(matches[i].len <= mf->nice_len);
38 assert(matches[i].dist < mf->read_pos);
39 assert(memcmp(mf_ptr(mf) - 1,
40 mf_ptr(mf) - matches[i].dist - 2,
41 matches[i].len) == 0);
42 }
43#endif
44
45 // The last used element in the array contains
46 // the longest match.
47 len_best = matches[count - 1].len;
48
49 // If a match of maximum search length was found, try to
50 // extend the match to maximum possible length.
51 if (len_best == mf->nice_len) {
52 // The limit for the match length is either the
53 // maximum match length supported by the LZ-based
54 // encoder or the number of bytes left in the
55 // dictionary, whichever is smaller.
56 uint32_t limit = mf_avail(mf) + 1;
57 if (limit > mf->match_len_max)
58 limit = mf->match_len_max;
59
60 // Pointer to the byte we just ran through
61 // the match finder.
62 const uint8_t *p1 = mf_ptr(mf) - 1;
63
64 // Pointer to the beginning of the match. We need -1
65 // here because the match distances are zero based.
66 const uint8_t *p2 = p1 - matches[count - 1].dist - 1;
67
68 len_best = lzma_memcmplen(p1, p2, len_best, limit);
69 }
70 }
71
72 *count_ptr = count;
73
74 // Finally update the read position to indicate that match finder was
75 // run for this dictionary offset.
76 ++mf->read_ahead;
77
78 return len_best;
79}
80
81
82/// Hash value to indicate unused element in the hash. Since we start the
83/// positions from dict_size + 1, zero is always too far to qualify
84/// as usable match position.
85#define EMPTY_HASH_VALUE 0
86
87
88/// Normalization must be done when lzma_mf.offset + lzma_mf.read_pos
89/// reaches MUST_NORMALIZE_POS.
90#define MUST_NORMALIZE_POS UINT32_MAX
91
92
93/// \brief Normalizes hash values
94///
95/// The hash arrays store positions of match candidates. The positions are
96/// relative to an arbitrary offset that is not the same as the absolute
97/// offset in the input stream. The relative position of the current byte
98/// is lzma_mf.offset + lzma_mf.read_pos. The distances of the matches are
99/// the differences of the current read position and the position found from
100/// the hash.
101///
102/// To prevent integer overflows of the offsets stored in the hash arrays,
103/// we need to "normalize" the stored values now and then. During the
104/// normalization, we drop values that indicate distance greater than the
105/// dictionary size, thus making space for new values.
106static void
107normalize(lzma_mf *mf)
108{
109 assert(mf->read_pos + mf->offset == MUST_NORMALIZE_POS);
110
111 // In future we may not want to touch the lowest bits, because there
112 // may be match finders that use larger resolution than one byte.
113 const uint32_t subvalue
114 = (MUST_NORMALIZE_POS - mf->cyclic_size);
115 // & ~((UINT32_C(1) << 10) - 1);
116
117 for (uint32_t i = 0; i < mf->hash_count; ++i) {
118 // If the distance is greater than the dictionary size,
119 // we can simply mark the hash element as empty.
120 if (mf->hash[i] <= subvalue)
121 mf->hash[i] = EMPTY_HASH_VALUE;
122 else
123 mf->hash[i] -= subvalue;
124 }
125
126 for (uint32_t i = 0; i < mf->sons_count; ++i) {
127 // Do the same for mf->son.
128 //
129 // NOTE: There may be uninitialized elements in mf->son.
130 // Valgrind may complain that the "if" below depends on
131 // an uninitialized value. In this case it is safe to ignore
132 // the warning. See also the comments in lz_encoder_init()
133 // in lz_encoder.c.
134 if (mf->son[i] <= subvalue)
135 mf->son[i] = EMPTY_HASH_VALUE;
136 else
137 mf->son[i] -= subvalue;
138 }
139
140 // Update offset to match the new locations.
141 mf->offset -= subvalue;
142
143 return;
144}
145
146
147/// Mark the current byte as processed from point of view of the match finder.
148static void
149move_pos(lzma_mf *mf)
150{
151 if (++mf->cyclic_pos == mf->cyclic_size)
152 mf->cyclic_pos = 0;
153
154 ++mf->read_pos;
155 assert(mf->read_pos <= mf->write_pos);
156
157 if (unlikely(mf->read_pos + mf->offset == UINT32_MAX))
158 normalize(mf);
159}
160
161
162/// When flushing, we cannot run the match finder unless there is nice_len
163/// bytes available in the dictionary. Instead, we skip running the match
164/// finder (indicating that no match was found), and count how many bytes we
165/// have ignored this way.
166///
167/// When new data is given after the flushing was completed, the match finder
168/// is restarted by rewinding mf->read_pos backwards by mf->pending. Then
169/// the missed bytes are added to the hash using the match finder's skip
170/// function (with small amount of input, it may start using mf->pending
171/// again if flushing).
172///
173/// Due to this rewinding, we don't touch cyclic_pos or test for
174/// normalization. It will be done when the match finder's skip function
175/// catches up after a flush.
176static void
177move_pending(lzma_mf *mf)
178{
179 ++mf->read_pos;
180 assert(mf->read_pos <= mf->write_pos);
181 ++mf->pending;
182}
183
184
185/// Calculate len_limit and determine if there is enough input to run
186/// the actual match finder code. Sets up "cur" and "pos". This macro
187/// is used by all find functions and binary tree skip functions. Hash
188/// chain skip function doesn't need len_limit so a simpler code is used
189/// in them.
190#define header(is_bt, len_min, ret_op) \
191 uint32_t len_limit = mf_avail(mf); \
192 if (mf->nice_len <= len_limit) { \
193 len_limit = mf->nice_len; \
194 } else if (len_limit < (len_min) \
195 || (is_bt && mf->action == LZMA_SYNC_FLUSH)) { \
196 assert(mf->action != LZMA_RUN); \
197 move_pending(mf); \
198 ret_op; \
199 } \
200 const uint8_t *cur = mf_ptr(mf); \
201 const uint32_t pos = mf->read_pos + mf->offset
202
203
204/// Header for find functions. "return 0" indicates that zero matches
205/// were found.
206#define header_find(is_bt, len_min) \
207 header(is_bt, len_min, return 0); \
208 uint32_t matches_count = 0
209
210
211/// Header for a loop in a skip function. "continue" tells to skip the rest
212/// of the code in the loop.
213#define header_skip(is_bt, len_min) \
214 header(is_bt, len_min, continue)
215
216
217/// Calls hc_find_func() or bt_find_func() and calculates the total number
218/// of matches found. Updates the dictionary position and returns the number
219/// of matches found.
220#define call_find(func, len_best) \
221do { \
222 matches_count = (uint32_t)(func(len_limit, pos, cur, cur_match, \
223 mf->depth, mf->son, \
224 mf->cyclic_pos, mf->cyclic_size, \
225 matches + matches_count, len_best) \
226 - matches); \
227 move_pos(mf); \
228 return matches_count; \
229} while (0)
230
231
232////////////////
233// Hash Chain //
234////////////////
235
236#if defined(HAVE_MF_HC3) || defined(HAVE_MF_HC4)
237///
238///
239/// \param len_limit Don't look for matches longer than len_limit.
240/// \param pos lzma_mf.read_pos + lzma_mf.offset
241/// \param cur Pointer to current byte (mf_ptr(mf))
242/// \param cur_match Start position of the current match candidate
243/// \param depth Maximum length of the hash chain
244/// \param son lzma_mf.son (contains the hash chain)
245/// \param cyclic_pos lzma_mf.cyclic_pos
246/// \param cyclic_size lzma_mf_cyclic_size
247/// \param matches Array to hold the matches.
248/// \param len_best The length of the longest match found so far.
249static lzma_match *
250hc_find_func(
251 const uint32_t len_limit,
252 const uint32_t pos,
253 const uint8_t *const cur,
254 uint32_t cur_match,
255 uint32_t depth,
256 uint32_t *const son,
257 const uint32_t cyclic_pos,
258 const uint32_t cyclic_size,
259 lzma_match *matches,
260 uint32_t len_best)
261{
262 son[cyclic_pos] = cur_match;
263
264 while (true) {
265 const uint32_t delta = pos - cur_match;
266 if (depth-- == 0 || delta >= cyclic_size)
267 return matches;
268
269 const uint8_t *const pb = cur - delta;
270 cur_match = son[cyclic_pos - delta
271 + (delta > cyclic_pos ? cyclic_size : 0)];
272
273 if (pb[len_best] == cur[len_best] && pb[0] == cur[0]) {
274 uint32_t len = lzma_memcmplen(pb, cur, 1, len_limit);
275
276 if (len_best < len) {
277 len_best = len;
278 matches->len = len;
279 matches->dist = delta - 1;
280 ++matches;
281
282 if (len == len_limit)
283 return matches;
284 }
285 }
286 }
287}
288
289
290#define hc_find(len_best) \
291 call_find(hc_find_func, len_best)
292
293
294#define hc_skip() \
295do { \
296 mf->son[mf->cyclic_pos] = cur_match; \
297 move_pos(mf); \
298} while (0)
299
300#endif
301
302
303#ifdef HAVE_MF_HC3
304extern uint32_t
305lzma_mf_hc3_find(lzma_mf *mf, lzma_match *matches)
306{
307 header_find(false, 3);
308
309 hash_3_calc();
310
311 const uint32_t delta2 = pos - mf->hash[hash_2_value];
312 const uint32_t cur_match = mf->hash[FIX_3_HASH_SIZE + hash_value];
313
314 mf->hash[hash_2_value] = pos;
315 mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
316
317 uint32_t len_best = 2;
318
319 if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
320 len_best = lzma_memcmplen(cur - delta2, cur,
321 len_best, len_limit);
322
323 matches[0].len = len_best;
324 matches[0].dist = delta2 - 1;
325 matches_count = 1;
326
327 if (len_best == len_limit) {
328 hc_skip();
329 return 1; // matches_count
330 }
331 }
332
333 hc_find(len_best);
334}
335
336
337extern void
338lzma_mf_hc3_skip(lzma_mf *mf, uint32_t amount)
339{
340 do {
341 if (mf_avail(mf) < 3) {
342 move_pending(mf);
343 continue;
344 }
345
346 const uint8_t *cur = mf_ptr(mf);
347 const uint32_t pos = mf->read_pos + mf->offset;
348
349 hash_3_calc();
350
351 const uint32_t cur_match
352 = mf->hash[FIX_3_HASH_SIZE + hash_value];
353
354 mf->hash[hash_2_value] = pos;
355 mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
356
357 hc_skip();
358
359 } while (--amount != 0);
360}
361#endif
362
363
364#ifdef HAVE_MF_HC4
365extern uint32_t
366lzma_mf_hc4_find(lzma_mf *mf, lzma_match *matches)
367{
368 header_find(false, 4);
369
370 hash_4_calc();
371
372 uint32_t delta2 = pos - mf->hash[hash_2_value];
373 const uint32_t delta3
374 = pos - mf->hash[FIX_3_HASH_SIZE + hash_3_value];
375 const uint32_t cur_match = mf->hash[FIX_4_HASH_SIZE + hash_value];
376
377 mf->hash[hash_2_value ] = pos;
378 mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
379 mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
380
381 uint32_t len_best = 1;
382
383 if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
384 len_best = 2;
385 matches[0].len = 2;
386 matches[0].dist = delta2 - 1;
387 matches_count = 1;
388 }
389
390 if (delta2 != delta3 && delta3 < mf->cyclic_size
391 && *(cur - delta3) == *cur) {
392 len_best = 3;
393 matches[matches_count++].dist = delta3 - 1;
394 delta2 = delta3;
395 }
396
397 if (matches_count != 0) {
398 len_best = lzma_memcmplen(cur - delta2, cur,
399 len_best, len_limit);
400
401 matches[matches_count - 1].len = len_best;
402
403 if (len_best == len_limit) {
404 hc_skip();
405 return matches_count;
406 }
407 }
408
409 if (len_best < 3)
410 len_best = 3;
411
412 hc_find(len_best);
413}
414
415
416extern void
417lzma_mf_hc4_skip(lzma_mf *mf, uint32_t amount)
418{
419 do {
420 if (mf_avail(mf) < 4) {
421 move_pending(mf);
422 continue;
423 }
424
425 const uint8_t *cur = mf_ptr(mf);
426 const uint32_t pos = mf->read_pos + mf->offset;
427
428 hash_4_calc();
429
430 const uint32_t cur_match
431 = mf->hash[FIX_4_HASH_SIZE + hash_value];
432
433 mf->hash[hash_2_value] = pos;
434 mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
435 mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
436
437 hc_skip();
438
439 } while (--amount != 0);
440}
441#endif
442
443
444/////////////////
445// Binary Tree //
446/////////////////
447
448#if defined(HAVE_MF_BT2) || defined(HAVE_MF_BT3) || defined(HAVE_MF_BT4)
449static lzma_match *
450bt_find_func(
451 const uint32_t len_limit,
452 const uint32_t pos,
453 const uint8_t *const cur,
454 uint32_t cur_match,
455 uint32_t depth,
456 uint32_t *const son,
457 const uint32_t cyclic_pos,
458 const uint32_t cyclic_size,
459 lzma_match *matches,
460 uint32_t len_best)
461{
462 uint32_t *ptr0 = son + (cyclic_pos << 1) + 1;
463 uint32_t *ptr1 = son + (cyclic_pos << 1);
464
465 uint32_t len0 = 0;
466 uint32_t len1 = 0;
467
468 while (true) {
469 const uint32_t delta = pos - cur_match;
470 if (depth-- == 0 || delta >= cyclic_size) {
471 *ptr0 = EMPTY_HASH_VALUE;
472 *ptr1 = EMPTY_HASH_VALUE;
473 return matches;
474 }
475
476 uint32_t *const pair = son + ((cyclic_pos - delta
477 + (delta > cyclic_pos ? cyclic_size : 0))
478 << 1);
479
480 const uint8_t *const pb = cur - delta;
481 uint32_t len = my_min(len0, len1);
482
483 if (pb[len] == cur[len]) {
484 len = lzma_memcmplen(pb, cur, len + 1, len_limit);
485
486 if (len_best < len) {
487 len_best = len;
488 matches->len = len;
489 matches->dist = delta - 1;
490 ++matches;
491
492 if (len == len_limit) {
493 *ptr1 = pair[0];
494 *ptr0 = pair[1];
495 return matches;
496 }
497 }
498 }
499
500 if (pb[len] < cur[len]) {
501 *ptr1 = cur_match;
502 ptr1 = pair + 1;
503 cur_match = *ptr1;
504 len1 = len;
505 } else {
506 *ptr0 = cur_match;
507 ptr0 = pair;
508 cur_match = *ptr0;
509 len0 = len;
510 }
511 }
512}
513
514
515static void
516bt_skip_func(
517 const uint32_t len_limit,
518 const uint32_t pos,
519 const uint8_t *const cur,
520 uint32_t cur_match,
521 uint32_t depth,
522 uint32_t *const son,
523 const uint32_t cyclic_pos,
524 const uint32_t cyclic_size)
525{
526 uint32_t *ptr0 = son + (cyclic_pos << 1) + 1;
527 uint32_t *ptr1 = son + (cyclic_pos << 1);
528
529 uint32_t len0 = 0;
530 uint32_t len1 = 0;
531
532 while (true) {
533 const uint32_t delta = pos - cur_match;
534 if (depth-- == 0 || delta >= cyclic_size) {
535 *ptr0 = EMPTY_HASH_VALUE;
536 *ptr1 = EMPTY_HASH_VALUE;
537 return;
538 }
539
540 uint32_t *pair = son + ((cyclic_pos - delta
541 + (delta > cyclic_pos ? cyclic_size : 0))
542 << 1);
543 const uint8_t *pb = cur - delta;
544 uint32_t len = my_min(len0, len1);
545
546 if (pb[len] == cur[len]) {
547 len = lzma_memcmplen(pb, cur, len + 1, len_limit);
548
549 if (len == len_limit) {
550 *ptr1 = pair[0];
551 *ptr0 = pair[1];
552 return;
553 }
554 }
555
556 if (pb[len] < cur[len]) {
557 *ptr1 = cur_match;
558 ptr1 = pair + 1;
559 cur_match = *ptr1;
560 len1 = len;
561 } else {
562 *ptr0 = cur_match;
563 ptr0 = pair;
564 cur_match = *ptr0;
565 len0 = len;
566 }
567 }
568}
569
570
571#define bt_find(len_best) \
572 call_find(bt_find_func, len_best)
573
574#define bt_skip() \
575do { \
576 bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, \
577 mf->son, mf->cyclic_pos, \
578 mf->cyclic_size); \
579 move_pos(mf); \
580} while (0)
581
582#endif
583
584
585#ifdef HAVE_MF_BT2
586extern uint32_t
587lzma_mf_bt2_find(lzma_mf *mf, lzma_match *matches)
588{
589 header_find(true, 2);
590
591 hash_2_calc();
592
593 const uint32_t cur_match = mf->hash[hash_value];
594 mf->hash[hash_value] = pos;
595
596 bt_find(1);
597}
598
599
600extern void
601lzma_mf_bt2_skip(lzma_mf *mf, uint32_t amount)
602{
603 do {
604 header_skip(true, 2);
605
606 hash_2_calc();
607
608 const uint32_t cur_match = mf->hash[hash_value];
609 mf->hash[hash_value] = pos;
610
611 bt_skip();
612
613 } while (--amount != 0);
614}
615#endif
616
617
618#ifdef HAVE_MF_BT3
619extern uint32_t
620lzma_mf_bt3_find(lzma_mf *mf, lzma_match *matches)
621{
622 header_find(true, 3);
623
624 hash_3_calc();
625
626 const uint32_t delta2 = pos - mf->hash[hash_2_value];
627 const uint32_t cur_match = mf->hash[FIX_3_HASH_SIZE + hash_value];
628
629 mf->hash[hash_2_value] = pos;
630 mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
631
632 uint32_t len_best = 2;
633
634 if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
635 len_best = lzma_memcmplen(
636 cur, cur - delta2, len_best, len_limit);
637
638 matches[0].len = len_best;
639 matches[0].dist = delta2 - 1;
640 matches_count = 1;
641
642 if (len_best == len_limit) {
643 bt_skip();
644 return 1; // matches_count
645 }
646 }
647
648 bt_find(len_best);
649}
650
651
652extern void
653lzma_mf_bt3_skip(lzma_mf *mf, uint32_t amount)
654{
655 do {
656 header_skip(true, 3);
657
658 hash_3_calc();
659
660 const uint32_t cur_match
661 = mf->hash[FIX_3_HASH_SIZE + hash_value];
662
663 mf->hash[hash_2_value] = pos;
664 mf->hash[FIX_3_HASH_SIZE + hash_value] = pos;
665
666 bt_skip();
667
668 } while (--amount != 0);
669}
670#endif
671
672
673#ifdef HAVE_MF_BT4
674extern uint32_t
675lzma_mf_bt4_find(lzma_mf *mf, lzma_match *matches)
676{
677 header_find(true, 4);
678
679 hash_4_calc();
680
681 uint32_t delta2 = pos - mf->hash[hash_2_value];
682 const uint32_t delta3
683 = pos - mf->hash[FIX_3_HASH_SIZE + hash_3_value];
684 const uint32_t cur_match = mf->hash[FIX_4_HASH_SIZE + hash_value];
685
686 mf->hash[hash_2_value] = pos;
687 mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
688 mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
689
690 uint32_t len_best = 1;
691
692 if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) {
693 len_best = 2;
694 matches[0].len = 2;
695 matches[0].dist = delta2 - 1;
696 matches_count = 1;
697 }
698
699 if (delta2 != delta3 && delta3 < mf->cyclic_size
700 && *(cur - delta3) == *cur) {
701 len_best = 3;
702 matches[matches_count++].dist = delta3 - 1;
703 delta2 = delta3;
704 }
705
706 if (matches_count != 0) {
707 len_best = lzma_memcmplen(
708 cur, cur - delta2, len_best, len_limit);
709
710 matches[matches_count - 1].len = len_best;
711
712 if (len_best == len_limit) {
713 bt_skip();
714 return matches_count;
715 }
716 }
717
718 if (len_best < 3)
719 len_best = 3;
720
721 bt_find(len_best);
722}
723
724
725extern void
726lzma_mf_bt4_skip(lzma_mf *mf, uint32_t amount)
727{
728 do {
729 header_skip(true, 4);
730
731 hash_4_calc();
732
733 const uint32_t cur_match
734 = mf->hash[FIX_4_HASH_SIZE + hash_value];
735
736 mf->hash[hash_2_value] = pos;
737 mf->hash[FIX_3_HASH_SIZE + hash_3_value] = pos;
738 mf->hash[FIX_4_HASH_SIZE + hash_value] = pos;
739
740 bt_skip();
741
742 } while (--amount != 0);
743}
744#endif
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette