VirtualBox

source: vbox/trunk/include/iprt/bldprog-strtab-template.cpp.h@ 82193

Last change on this file since 82193 was 76553, checked in by vboxsync, 6 years ago

scm --update-copyright-year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 32.7 KB
Line 
1/* $Id: bldprog-strtab-template.cpp.h 76553 2019-01-01 01:45:53Z vboxsync $ */
2/** @file
3 * IPRT - Build Program - String Table Generator.
4 */
5
6/*
7 * Copyright (C) 2006-2019 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*
29 * (Avoid include sanity checks as this is ring-3 only, C++ code.)
30 */
31#if defined(__cplusplus) && defined(IN_RING3)
32
33
34/*********************************************************************************************************************************
35* Defined Constants And Macros *
36*********************************************************************************************************************************/
37/** @def BLDPROG_STRTAB_MAX_STRLEN
38 * The max length of strings in the table. */
39#if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING)
40# define BLDPROG_STRTAB_MAX_STRLEN 256
41#endif
42
43/** @def BLDPROG_STRTAB_WITH_COMPRESSION
44 * Enables very simple string compression.
45 */
46#if defined(DOXYGEN_RUNNING)
47# define BLDPROG_STRTAB_WITH_COMPRESSION
48#endif
49
50/** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS
51 * Modifies the string compression to look for camel case words.
52 */
53#if defined(DOXYGEN_RUNNING)
54# define BLDPROG_STRTAB_WITH_CAMEL_WORDS
55#endif
56
57/** @def BLDPROG_STRTAB_PURE_ASCII
58 * String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this
59 * is defined. Otherwise, the compression code will require IPRT to link.
60 */
61#if defined(DOXYGEN_RUNNING)
62# define BLDPROG_STRTAB_PURE_ASCII
63#endif
64
65
66
67/*********************************************************************************************************************************
68* Header Files *
69*********************************************************************************************************************************/
70#include <iprt/stdarg.h>
71#include <iprt/ctype.h>
72#include <stdlib.h>
73#include <stdio.h>
74#include <string.h>
75
76#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
77# include <algorithm>
78# include <map>
79# include <string>
80# include <vector>
81
82typedef std::map<std::string, size_t> BLDPROGWORDFREQMAP;
83typedef BLDPROGWORDFREQMAP::value_type BLDPROGWORDFREQPAIR;
84
85#endif
86
87#include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */
88
89
90/*********************************************************************************************************************************
91* Structures and Typedefs *
92*********************************************************************************************************************************/
93/**
94 * Build table string.
95 */
96typedef struct BLDPROGSTRING
97{
98 /** The string.
99 * @note This may be modified or replaced (allocated from heap) when
100 * compressing the string table. */
101 char *pszString;
102 /** The string hash value. */
103 uint32_t uHash;
104 /** The strint table offset. */
105 uint32_t offStrTab;
106 /** The string length. */
107 size_t cchString;
108 /** Pointer to the next string reference (same string table entry). */
109 struct BLDPROGSTRING *pNextRef;
110 /** Pointer to the next string with the same hash value (collision). */
111 struct BLDPROGSTRING *pNextCollision;
112
113} BLDPROGSTRING;
114/** Pointer to a string table string. */
115typedef BLDPROGSTRING *PBLDPROGSTRING;
116
117
118/** String table data. */
119typedef struct BLDPROGSTRTAB
120{
121 /** The size of g_papStrHash. */
122 size_t cStrHash;
123 /** String hash table. */
124 PBLDPROGSTRING *papStrHash;
125 /** Duplicate strings found by AddString. */
126 size_t cDuplicateStrings;
127 /** Total length of the unique strings (no terminators). */
128 size_t cchUniqueStrings;
129 /** Number of unique strings after AddString. */
130 size_t cUniqueStrings;
131 /** Number of collisions. */
132 size_t cCollisions;
133
134 /** Number of entries in apSortedStrings. */
135 size_t cSortedStrings;
136 /** The sorted string table. */
137 PBLDPROGSTRING *papSortedStrings;
138
139#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
140 /** The 127 words we've picked to be indexed by reference. */
141 BLDPROGSTRING aCompDict[127];
142 /** Incoming strings pending compression. */
143 PBLDPROGSTRING *papPendingStrings;
144 /** Current number of entries in papStrPending. */
145 size_t cPendingStrings;
146 /** The allocated size of papPendingStrings. */
147 size_t cMaxPendingStrings;
148 /** Work frequency map.
149 * @todo rewrite in plain C. */
150 BLDPROGWORDFREQMAP Frequencies;
151#endif
152
153 /** The string table. */
154 char *pachStrTab;
155 /** The actual string table size. */
156 size_t cchStrTab;
157} BLDPROGSTRTAB;
158typedef BLDPROGSTRTAB *PBLDPROGSTRTAB;
159
160#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
161# pragma GCC diagnostic push
162# pragma GCC diagnostic ignored "-Wunused-function"
163#endif
164
165
166/**
167 * Initializes the strint table compiler.
168 *
169 * @returns success indicator (out of memory if false).
170 * @param pThis The strint table compiler instance.
171 * @param cMaxStrings The max number of strings we'll be adding.
172 */
173static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings)
174{
175 pThis->cStrHash = 0;
176 pThis->papStrHash = NULL;
177 pThis->cDuplicateStrings = 0;
178 pThis->cchUniqueStrings = 0;
179 pThis->cUniqueStrings = 0;
180 pThis->cCollisions = 0;
181 pThis->cSortedStrings = 0;
182 pThis->papSortedStrings = NULL;
183 pThis->pachStrTab = NULL;
184 pThis->cchStrTab = 0;
185#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
186 memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict));
187 pThis->papPendingStrings = NULL;
188 pThis->cPendingStrings = 0;
189 pThis->cMaxPendingStrings = cMaxStrings;
190#endif
191
192 /*
193 * Allocate a hash table double the size of all strings (to avoid too
194 * many collisions). Add all strings to it, finding duplicates in the
195 * process.
196 */
197#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
198 cMaxStrings += RT_ELEMENTS(pThis->aCompDict);
199#endif
200 cMaxStrings *= 2;
201 pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings);
202 if (pThis->papStrHash)
203 {
204 pThis->cStrHash = cMaxStrings;
205#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
206 pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings);
207 if (pThis->papPendingStrings)
208#endif
209 return true;
210
211#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
212 free(pThis->papStrHash);
213 pThis->papStrHash = NULL;
214#endif
215 }
216 return false;
217}
218
219
220#if 0 /* unused */
221static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis)
222{
223 free(pThis->papStrHash);
224 free(pThis->papSortedStrings);
225 free(pThis->pachStrTab);
226# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
227 free(pThis->papPendingStrings);
228# endif
229 memset(pThis, 0, sizeof(*pThis));
230}
231#endif
232
233#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
234
235DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc)
236{
237 /*
238 * Skip leading word separators.
239 */
240# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
241 while ( ch == ' '
242 || ch == '-'
243 || ch == '+'
244 || ch == '_')
245# else
246 while (ch == ' ')
247# endif
248 ch = *++pszSrc;
249 if (ch)
250 {
251 /*
252 * Find end of word.
253 */
254 size_t cchWord = 1;
255# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
256 char chPrev = ch;
257 while ( (ch = pszSrc[cchWord]) != ' '
258 && ch != '\0'
259 && ch != '-'
260 && ch != '+'
261 && ch != '_'
262 && ( ch == chPrev
263 || !RT_C_IS_UPPER(ch)
264 || RT_C_IS_UPPER(chPrev)) )
265 {
266 chPrev = ch;
267 cchWord++;
268 }
269# else
270 while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0')
271 cchWord++;
272# endif
273 *ppszSrc = pszSrc;
274 return cchWord;
275 }
276
277 *ppszSrc = pszSrc;
278 return 0;
279}
280
281
282/**
283 * Analyzes a string.
284 *
285 * @param pThis The strint table compiler instance.
286 * @param pStr The string to analyze.
287 */
288static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
289{
290 const char *psz = pStr->pszString;
291
292 /*
293 * For now we just consider words.
294 */
295 char ch;
296 while ((ch = *psz) != '\0')
297 {
298 size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz);
299 if (cchWord > 1)
300 {
301 std::string strWord(psz, cchWord);
302 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
303 if (it != pThis->Frequencies.end())
304 it->second += cchWord - 1;
305 else
306 pThis->Frequencies[strWord] = 0;
307
308 /** @todo could gain hits by including the space after the word, but that
309 * has the same accounting problems as the two words scenario below. */
310
311# if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */
312 /* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */
313 if (ch == ' ')
314 {
315 ch = psz[++cchWord];
316 if (ch != ' ' && ch != '\0')
317 {
318 size_t const cchSaved = cchWord;
319
320 do
321 cchWord++;
322 while ((ch = psz[cchWord]) != ' ' && ch != '\0');
323
324 strWord = std::string(psz, cchWord);
325 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
326 if (it != pThis->Frequencies.end())
327 it->second += cchWord - 1;
328 else
329 pThis->Frequencies[strWord] = 0;
330
331 cchWord = cchSaved;
332 }
333 }
334# endif
335 }
336 else if (!cchWord)
337 break;
338
339 /* Advance. */
340 psz += cchWord;
341 }
342 pStr->cchString = psz - pStr->pszString;
343 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
344 {
345 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
346 abort();
347 }
348}
349
350#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
351
352/**
353 * Adds a string to the hash table.
354 * @param pThis The strint table compiler instance.
355 * @param pStr The string.
356 */
357static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
358{
359 pStr->pNextRef = NULL;
360 pStr->pNextCollision = NULL;
361 pStr->offStrTab = 0;
362 pStr->uHash = sdbm(pStr->pszString, &pStr->cchString);
363 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
364 {
365 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
366 exit(RTEXITCODE_FAILURE);
367 }
368
369 size_t idxHash = pStr->uHash % pThis->cStrHash;
370 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
371 if (!pCur)
372 pThis->papStrHash[idxHash] = pStr;
373 else
374 {
375 /* Look for matching string. */
376 do
377 {
378 if ( pCur->uHash == pStr->uHash
379 && pCur->cchString == pStr->cchString
380 && memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0)
381 {
382 pStr->pNextRef = pCur->pNextRef;
383 pCur->pNextRef = pStr;
384 pThis->cDuplicateStrings++;
385 return;
386 }
387 pCur = pCur->pNextCollision;
388 } while (pCur != NULL);
389
390 /* No matching string, insert. */
391 pThis->cCollisions++;
392 pStr->pNextCollision = pThis->papStrHash[idxHash];
393 pThis->papStrHash[idxHash] = pStr;
394 }
395
396 pThis->cUniqueStrings++;
397 pThis->cchUniqueStrings += pStr->cchString;
398}
399
400
401/**
402 * Adds a string to the string table.
403 *
404 * @param pThis The strint table compiler instance.
405 * @param pStr The string.
406 */
407static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
408{
409#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
410 bldProgStrTab_compressorAnalyzeString(pThis, pStr);
411 if (pThis->cPendingStrings < pThis->cMaxPendingStrings)
412 pThis->papPendingStrings[pThis->cPendingStrings++] = pStr;
413 else
414 abort();
415#else
416 bldProgStrTab_AddStringToHashTab(pThis, pStr);
417#endif
418}
419
420#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
421
422/**
423 * Copies @a cchSrc chars from @a pchSrc to @a pszDst, escaping special
424 * sequences.
425 *
426 * @returns New @a pszDst position, NULL if invalid source encoding.
427 * @param pszDst The destination buffer.
428 * @param pszSrc The source buffer.
429 * @param cchSrc How much to copy.
430 */
431static char *bldProgStrTab_compressorCopyAndEscape(char *pszDst, const char *pszSrc, size_t cchSrc)
432{
433 while (cchSrc-- > 0)
434 {
435 char ch = *pszSrc;
436 if (!((unsigned char)ch & 0x80))
437 {
438 *pszDst++ = ch;
439 pszSrc++;
440 }
441 else
442 {
443# ifdef BLDPROG_STRTAB_PURE_ASCII
444 fprintf(stderr, "error: unexpected char value %#x\n", ch);
445 return NULL;
446# else
447 RTUNICP uc;
448 int rc = RTStrGetCpEx(&pszSrc, &uc);
449 if (RT_SUCCESS(rc))
450 {
451 *pszDst++ = (unsigned char)0xff; /* escape single code point. */
452 pszDst = RTStrPutCp(pszDst, uc);
453 }
454 else
455 {
456 fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc);
457 return NULL;
458 }
459# endif
460 }
461 }
462 return pszDst;
463}
464
465
466/**
467 * Replaces the dictionary words and escapes non-ascii chars in a string.
468 *
469 * @param pThis The strint table compiler instance.
470 * @param pString The string to fixup.
471 */
472static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
473{
474 char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2];
475 char *pszDst = szNew;
476 const char *pszSrc = pStr->pszString;
477 const char *pszSrcEnd = pszSrc + pStr->cchString;
478
479 char ch;
480 while ((ch = *pszSrc) != '\0')
481 {
482 const char * const pszSrcUncompressed = pszSrc;
483 size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc);
484 size_t cchSrcUncompressed = pszSrc - pszSrcUncompressed;
485 if (cchSrcUncompressed > 0)
486 {
487 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrcUncompressed, cchSrcUncompressed);
488 if (!pszDst)
489 return false;
490 }
491 if (!cchWord)
492 break;
493
494 /* Check for g_aWord matches. */
495 size_t cchMax = pszSrcEnd - pszSrc;
496 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
497 {
498 size_t cchLen = pThis->aCompDict[i].cchString;
499 if ( cchLen >= cchWord
500 && cchLen <= cchMax
501 && memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0)
502 {
503 *pszDst++ = (unsigned char)(0x80 | i);
504 pszSrc += cchLen;
505 cchWord = 0;
506 break;
507 }
508 }
509
510 if (cchWord)
511 {
512 /* Copy the current word. */
513 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrc, cchWord);
514 if (!pszDst)
515 return false;
516 pszSrc += cchWord;
517 }
518 }
519
520 /* Just terminate it now. */
521 *pszDst = '\0';
522
523 /*
524 * Update the string.
525 */
526 size_t cchNew = pszDst - &szNew[0];
527 if (cchNew > pStr->cchString)
528 {
529 pStr->pszString = (char *)malloc(cchNew + 1);
530 if (!pStr->pszString)
531 {
532 fprintf(stderr, "Out of memory!\n");
533 return false;
534 }
535 }
536 memcpy(pStr->pszString, szNew, cchNew + 1);
537 pStr->cchString = cchNew;
538
539 return true;
540}
541
542
543/**
544 * For sorting the frequency fidning in descending order.
545 *
546 * Comparison operators are put outside to make older gcc versions (like 4.1.1
547 * on lnx64-rel) happy.
548 */
549class WordFreqSortEntry
550{
551public:
552 BLDPROGWORDFREQPAIR const *m_pPair;
553
554public:
555 WordFreqSortEntry(BLDPROGWORDFREQPAIR const *pPair) : m_pPair(pPair) {}
556};
557
558bool operator == (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
559{
560 return rLeft.m_pPair->second == rRight.m_pPair->second;
561}
562
563bool operator < (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
564{
565 return rLeft.m_pPair->second > rRight.m_pPair->second;
566}
567
568
569/**
570 * Compresses the vendor and product strings.
571 *
572 * This is very very simple (a lot less work than the string table for
573 * instance).
574 */
575static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose)
576{
577 /*
578 * Sort the frequency analyzis result and pick the top 127 ones.
579 */
580 std::vector<WordFreqSortEntry> SortMap;
581 for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it)
582 {
583 BLDPROGWORDFREQPAIR const &rPair = *it;
584 SortMap.push_back(WordFreqSortEntry(&rPair));
585 }
586
587 sort(SortMap.begin(), SortMap.end());
588
589 size_t cb = 0;
590 size_t i = 0;
591 for (std::vector<WordFreqSortEntry>::iterator it = SortMap.begin();
592 it != SortMap.end() && i < RT_ELEMENTS(pThis->aCompDict);
593 ++it, i++)
594 {
595 pThis->aCompDict[i].cchString = it->m_pPair->first.length();
596 pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1);
597 if (pThis->aCompDict[i].pszString)
598 memcpy(pThis->aCompDict[i].pszString, it->m_pPair->first.c_str(), pThis->aCompDict[i].cchString + 1);
599 else
600 return false;
601 cb += it->m_pPair->second;
602 }
603
604 if (fVerbose)
605 printf("debug: Estimated string compression saving: %u bytes\n", (unsigned)cb);
606
607 /*
608 * Rework the strings.
609 */
610 size_t cchOld = 0;
611 size_t cchOldMax = 0;
612 size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN;
613 size_t cchNew = 0;
614 size_t cchNewMax = 0;
615 size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN;
616 i = pThis->cPendingStrings;
617 while (i-- > 0)
618 {
619 PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i];
620 cchOld += pCurStr->cchString;
621 if (pCurStr->cchString > cchOldMax)
622 cchOldMax = pCurStr->cchString;
623 if (pCurStr->cchString < cchOldMin)
624 cchOldMin = pCurStr->cchString;
625
626 if (!bldProgStrTab_compressorFixupString(pThis, pCurStr))
627 return false;
628
629 cchNew += pCurStr->cchString;
630 if (pCurStr->cchString > cchNewMax)
631 cchNewMax = pCurStr->cchString;
632 if (pCurStr->cchString < cchNewMin)
633 cchNewMin = pCurStr->cchString;
634
635 bldProgStrTab_AddStringToHashTab(pThis, pCurStr);
636 }
637
638 /*
639 * Do debug stats.
640 */
641 if (fVerbose)
642 {
643 for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
644 cchNew += pThis->aCompDict[i].cchString + 1;
645
646 printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew);
647 if (cchNew < cchOld)
648 printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld));
649 else
650 printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew));
651 printf("debug: Original string lengths: average %u; min %u; max %u\n",
652 (unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax);
653 printf("debug: Compressed string lengths: average %u; min %u; max %u\n",
654 (unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax);
655 }
656
657 return true;
658}
659
660#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
661
662/**
663 * Inserts a string into g_apUniqueStrings.
664 * @param pThis The strint table compiler instance.
665 * @param pStr The string.
666 */
667static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
668{
669 size_t iIdx;
670 size_t iEnd = pThis->cSortedStrings;
671 if (iEnd)
672 {
673 size_t iStart = 0;
674 for (;;)
675 {
676 iIdx = iStart + (iEnd - iStart) / 2;
677 if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString)
678 {
679 if (iIdx <= iStart)
680 break;
681 iEnd = iIdx;
682 }
683 else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString)
684 {
685 if (++iIdx >= iEnd)
686 break;
687 iStart = iIdx;
688 }
689 else
690 break;
691 }
692
693 if (iIdx != pThis->cSortedStrings)
694 memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx],
695 (pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx]));
696 }
697 else
698 iIdx = 0;
699
700 pThis->papSortedStrings[iIdx] = pStr;
701 pThis->cSortedStrings++;
702}
703
704
705/**
706 * Compiles the string table after the string has been added.
707 *
708 * This will save space by dropping string terminators, eliminating duplicates
709 * and try find strings that are sub-strings of others.
710 *
711 * Will initialize the StrRef of all StrTabString instances.
712 *
713 * @returns success indicator (error flagged).
714 * @param pThis The strint table compiler instance.
715 */
716static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose)
717{
718#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
719 /*
720 * Do the compression and add all the compressed strings to the table.
721 */
722 if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose))
723 return false;
724
725 /*
726 * Add the dictionary strings.
727 */
728 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
729 bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]);
730#endif
731 if (fVerbose)
732 printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n",
733 (unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings,
734 (unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions);
735
736 /*
737 * Create papSortedStrings from the hash table. The table is sorted by
738 * string length, with the longer strings first.
739 */
740 pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings);
741 if (!pThis->papSortedStrings)
742 return false;
743 pThis->cSortedStrings = 0;
744 size_t idxHash = pThis->cStrHash;
745 while (idxHash-- > 0)
746 {
747 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
748 if (pCur)
749 {
750 do
751 {
752 bldProgStrTab_InsertUniqueString(pThis, pCur);
753 pCur = pCur->pNextCollision;
754 } while (pCur);
755 }
756 }
757
758 /*
759 * Create the actual string table.
760 */
761 pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1);
762 if (!pThis->pachStrTab)
763 return false;
764 pThis->cchStrTab = 0;
765 for (size_t i = 0; i < pThis->cSortedStrings; i++)
766 {
767 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
768 const char * const pszCur = pCur->pszString;
769 size_t const cchCur = pCur->cchString;
770 size_t offStrTab = pThis->cchStrTab;
771
772 /*
773 * See if the string is a substring already found in the string table.
774 * Excluding the zero terminator increases the chances for this.
775 */
776 size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0;
777 const char *pchLeft = pThis->pachStrTab;
778 char const chFirst = *pszCur;
779 while (cchLeft > 0)
780 {
781 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
782 if (!pchCandidate)
783 break;
784 if (memcmp(pchCandidate, pszCur, cchCur) == 0)
785 {
786 offStrTab = pchCandidate - pThis->pachStrTab;
787 break;
788 }
789
790 cchLeft -= pchCandidate + 1 - pchLeft;
791 pchLeft = pchCandidate + 1;
792 }
793
794 if (offStrTab == pThis->cchStrTab)
795 {
796 /*
797 * See if the start of the string overlaps the end of the string table.
798 */
799 if (pThis->cchStrTab && cchCur > 1)
800 {
801 cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1);
802 pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft];
803 while (cchLeft > 0)
804 {
805 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
806 if (!pchCandidate)
807 break;
808 cchLeft -= pchCandidate - pchLeft;
809 pchLeft = pchCandidate;
810 if (memcmp(pchLeft, pszCur, cchLeft) == 0)
811 {
812 size_t cchToCopy = cchCur - cchLeft;
813 memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy);
814 pThis->cchStrTab += cchToCopy;
815 offStrTab = pchCandidate - pThis->pachStrTab;
816 break;
817 }
818 cchLeft--;
819 pchLeft++;
820 }
821 }
822
823 /*
824 * If we didn't have any luck above, just append the string.
825 */
826 if (offStrTab == pThis->cchStrTab)
827 {
828 memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur);
829 pThis->cchStrTab += cchCur;
830 }
831 }
832
833 /*
834 * Set the string table offset for all the references to this string.
835 */
836 do
837 {
838 pCur->offStrTab = (uint32_t)offStrTab;
839 pCur = pCur->pNextRef;
840 } while (pCur != NULL);
841 }
842
843 if (fVerbose)
844 printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab);
845 return true;
846}
847
848
849#ifdef RT_STRICT
850/**
851 * Sanity checks a string table string.
852 * @param pThis The strint table compiler instance.
853 * @param pStr The string to check.
854 */
855static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
856{
857 if (pStr->cchString != strlen(pStr->pszString))
858 abort();
859 if (pStr->offStrTab >= pThis->cchStrTab)
860 abort();
861 if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab)
862 abort();
863 if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0)
864 abort();
865}
866#endif
867
868
869/**
870 * Output the string table string in C string litteral fashion.
871 *
872 * @param pThis The strint table instance.
873 * @param pString The string to print.
874 * @param pOut The output stream.
875 */
876static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut)
877{
878 const unsigned char *psz = (const unsigned char *)pString->pszString;
879 unsigned char uch;
880 while ((uch = *psz++) != '\0')
881 {
882 if (!(uch & 0x80))
883 {
884 if (uch != '\'' && uch != '\\')
885 fputc((char)uch, pOut);
886 else
887 {
888 fputc('\\', pOut);
889 fputc((char)uch, pOut);
890 }
891 }
892#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
893 else if (uch != 0xff)
894 fputs(pThis->aCompDict[uch & 0x7f].pszString, pOut);
895 else
896 {
897# ifdef BLDPROG_STRTAB_PURE_ASCII
898 abort();
899# else
900 RTUNICP uc = RTStrGetCp((const char *)psz);
901 psz += RTStrCpSize(uc);
902 fprintf(pOut, "\\u%04x", uc);
903# endif
904 }
905#else
906 else
907 fprintf(pOut, "\\x%02x", (unsigned)uch);
908 NOREF(pThis);
909#endif
910 }
911}
912
913
914/**
915 * Writes the string table code to the output stream.
916 *
917 * @param pThis The strint table compiler instance.
918 * @param pOut The output stream.
919 * @param pszScope The scoping ("static " or empty string),
920 * including trailing space.
921 * @param pszPrefix The variable prefix. Typically "g_" for C programs,
922 * whereas C++ classes normally use "class::s_g".
923 * @param pszBaseName The base name for the variables. The user
924 * accessible variable will end up as just the
925 * prefix and basename concatenated.
926 */
927static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut,
928 const char *pszScope, const char *pszPrefix, const char *pszBaseName)
929{
930#ifdef RT_STRICT
931 /*
932 * Do some quick sanity checks while we're here.
933 */
934# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
935 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
936 BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]);
937# endif
938#endif
939
940 /*
941 * Create a table for speeding up the character categorization.
942 */
943 uint8_t abCharCat[256];
944 memset(abCharCat, 0, sizeof(abCharCat));
945 abCharCat[(unsigned char)'\\'] = 1;
946 abCharCat[(unsigned char)'\''] = 1;
947 for (unsigned i = 0; i < 0x20; i++)
948 abCharCat[i] = 2;
949 for (unsigned i = 0x7f; i < 0x100; i++)
950 abCharCat[i] = 2;
951
952 /*
953 * We follow the sorted string table, one string per line.
954 */
955 fprintf(pOut,
956 "#include <iprt/bldprog-strtab.h>\n"
957 "\n"
958 "static const char g_achStrTab%s[] =\n"
959 "{\n",
960 pszBaseName);
961
962 uint32_t off = 0;
963 for (uint32_t i = 0; i < pThis->cSortedStrings; i++)
964 {
965 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
966 uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString;
967 if (offEnd > off)
968 {
969 /* Comment with a uncompressed and more readable version of the string. */
970 fprintf(pOut, off == pCur->offStrTab ? "/* 0x%05x = \"" : "/* 0X%05x = \"", off);
971 BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut);
972 fputs("\" */\n", pOut);
973
974 /* Must use char by char here or we may trigger the max string
975 length limit in the compiler, */
976 fputs(" ", pOut);
977 for (; off < offEnd; off++)
978 {
979 unsigned char uch = pThis->pachStrTab[off];
980 fputc('\'', pOut);
981 if (abCharCat[uch] == 0)
982 fputc(uch, pOut);
983 else if (abCharCat[uch] != 1)
984 fprintf(pOut, "\\x%02x", (unsigned)uch);
985 else
986 {
987 fputc('\\', pOut);
988 fputc(uch, pOut);
989 }
990 fputc('\'', pOut);
991 fputc(',', pOut);
992 }
993 fputc('\n', pOut);
994 }
995 }
996
997 fprintf(pOut,
998 "};\n"
999 "AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n",
1000 pszBaseName, (unsigned)pThis->cchStrTab);
1001
1002#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1003 /*
1004 * Write the compression dictionary.
1005 */
1006 fprintf(pOut,
1007 "static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n"
1008 "{\n",
1009 pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict));
1010 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
1011 fprintf(pOut, " { %#08x, %#04x }, // %s\n",
1012 pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString, pThis->aCompDict[i].pszString);
1013 fprintf(pOut, "};\n\n");
1014#endif
1015
1016
1017 /*
1018 * Write the string table data structure.
1019 */
1020 fprintf(pOut,
1021 "%sconst RTBLDPROGSTRTAB %s%s = \n"
1022 "{\n"
1023 " /*.pchStrTab = */ &g_achStrTab%s[0],\n"
1024 " /*.cchStrTab = */ sizeof(g_achStrTab%s),\n"
1025 ,
1026 pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName);
1027#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1028 fprintf(pOut,
1029 " /*.cCompDict = */ %u,\n"
1030 " /*.paCompDict = */ &g_aCompDict%s[0]\n"
1031 "};\n"
1032 , (unsigned)RT_ELEMENTS(pThis->aCompDict), pszBaseName);
1033#else
1034 fprintf(pOut,
1035 " /*.cCompDict = */ 0,\n"
1036 " /*.paCompDict = */ NULL\n"
1037 "};\n");
1038#endif
1039}
1040
1041#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
1042# pragma GCC diagnostic pop
1043#endif
1044
1045#endif /* __cplusplus && IN_RING3 */
1046
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette