VirtualBox

source: vbox/trunk/include/iprt/bldprog-strtab-template.cpp.h@ 84055

Last change on this file since 84055 was 84055, checked in by vboxsync, 5 years ago

IPRT,++: Apply bldprog-strtab.h and friends to the IPRT status message database (errmsg.cpp) to reduce size. The interface (RTErrMsg*) has been reworked as we no longer have C-strings in the database, but 'compressed' string w/o zero terminators. [build fixes] bugref:9726

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 33.3 KB
Line 
1/* $Id: bldprog-strtab-template.cpp.h 84055 2020-04-28 16:11:58Z vboxsync $ */
2/** @file
3 * IPRT - Build Program - String Table Generator.
4 */
5
6/*
7 * Copyright (C) 2006-2020 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*
29 * (Avoid include sanity checks as this is ring-3 only, C++ code.)
30 */
31#if defined(__cplusplus) && defined(IN_RING3)
32
33
34/*********************************************************************************************************************************
35* Defined Constants And Macros *
36*********************************************************************************************************************************/
37/** @def BLDPROG_STRTAB_MAX_STRLEN
38 * The max length of strings in the table. */
39#if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING)
40# define BLDPROG_STRTAB_MAX_STRLEN 256
41#endif
42
43/** @def BLDPROG_STRTAB_WITH_COMPRESSION
44 * Enables very simple string compression.
45 */
46#if defined(DOXYGEN_RUNNING)
47# define BLDPROG_STRTAB_WITH_COMPRESSION
48#endif
49
50/** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS
51 * Modifies the string compression to look for camel case words.
52 */
53#if defined(DOXYGEN_RUNNING)
54# define BLDPROG_STRTAB_WITH_CAMEL_WORDS
55#endif
56
57/** @def BLDPROG_STRTAB_PURE_ASCII
58 * String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this
59 * is defined. Otherwise, the compression code will require IPRT to link.
60 */
61#if defined(DOXYGEN_RUNNING)
62# define BLDPROG_STRTAB_PURE_ASCII
63#endif
64
65
66
67/*********************************************************************************************************************************
68* Header Files *
69*********************************************************************************************************************************/
70#include <iprt/stdarg.h>
71#include <iprt/ctype.h>
72#include <stdlib.h>
73#include <stdio.h>
74#include <string.h>
75
76#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
77# include <algorithm>
78# include <map>
79# include <iprt/sanitized/string>
80# include <vector>
81
82typedef std::map<std::string, size_t> BLDPROGWORDFREQMAP;
83typedef BLDPROGWORDFREQMAP::value_type BLDPROGWORDFREQPAIR;
84
85#endif
86
87#include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */
88
89
90/*********************************************************************************************************************************
91* Structures and Typedefs *
92*********************************************************************************************************************************/
93/**
94 * Build table string.
95 */
96typedef struct BLDPROGSTRING
97{
98 /** The string.
99 * @note This may be modified or replaced (allocated from heap) when
100 * compressing the string table. */
101 char *pszString;
102 /** The string hash value. */
103 uint32_t uHash;
104 /** The strint table offset. */
105 uint32_t offStrTab;
106 /** The string length. */
107 size_t cchString;
108 /** Pointer to the next string reference (same string table entry). */
109 struct BLDPROGSTRING *pNextRef;
110 /** Pointer to the next string with the same hash value (collision). */
111 struct BLDPROGSTRING *pNextCollision;
112
113} BLDPROGSTRING;
114/** Pointer to a string table string. */
115typedef BLDPROGSTRING *PBLDPROGSTRING;
116
117
118/** String table data. */
119typedef struct BLDPROGSTRTAB
120{
121 /** The size of g_papStrHash. */
122 size_t cStrHash;
123 /** String hash table. */
124 PBLDPROGSTRING *papStrHash;
125 /** Duplicate strings found by AddString. */
126 size_t cDuplicateStrings;
127 /** Total length of the unique strings (no terminators). */
128 size_t cchUniqueStrings;
129 /** Number of unique strings after AddString. */
130 size_t cUniqueStrings;
131 /** Number of collisions. */
132 size_t cCollisions;
133
134 /** Number of entries in apSortedStrings. */
135 size_t cSortedStrings;
136 /** The sorted string table. */
137 PBLDPROGSTRING *papSortedStrings;
138
139#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
140 /** The 127 words we've picked to be indexed by reference. */
141 BLDPROGSTRING aCompDict[127];
142 /** Incoming strings pending compression. */
143 PBLDPROGSTRING *papPendingStrings;
144 /** Current number of entries in papStrPending. */
145 size_t cPendingStrings;
146 /** The allocated size of papPendingStrings. */
147 size_t cMaxPendingStrings;
148 /** Work frequency map.
149 * @todo rewrite in plain C. */
150 BLDPROGWORDFREQMAP Frequencies;
151#endif
152
153 /** The string table. */
154 char *pachStrTab;
155 /** The actual string table size. */
156 size_t cchStrTab;
157} BLDPROGSTRTAB;
158typedef BLDPROGSTRTAB *PBLDPROGSTRTAB;
159
160#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
161# pragma GCC diagnostic push
162# pragma GCC diagnostic ignored "-Wunused-function"
163#endif
164
165
166/**
167 * Initializes the strint table compiler.
168 *
169 * @returns success indicator (out of memory if false).
170 * @param pThis The strint table compiler instance.
171 * @param cMaxStrings The max number of strings we'll be adding.
172 */
173static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings)
174{
175 pThis->cStrHash = 0;
176 pThis->papStrHash = NULL;
177 pThis->cDuplicateStrings = 0;
178 pThis->cchUniqueStrings = 0;
179 pThis->cUniqueStrings = 0;
180 pThis->cCollisions = 0;
181 pThis->cSortedStrings = 0;
182 pThis->papSortedStrings = NULL;
183 pThis->pachStrTab = NULL;
184 pThis->cchStrTab = 0;
185#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
186 memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict));
187 pThis->papPendingStrings = NULL;
188 pThis->cPendingStrings = 0;
189 pThis->cMaxPendingStrings = cMaxStrings;
190#endif
191
192 /*
193 * Allocate a hash table double the size of all strings (to avoid too
194 * many collisions). Add all strings to it, finding duplicates in the
195 * process.
196 */
197#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
198 cMaxStrings += RT_ELEMENTS(pThis->aCompDict);
199#endif
200 cMaxStrings *= 2;
201 pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings);
202 if (pThis->papStrHash)
203 {
204 pThis->cStrHash = cMaxStrings;
205#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
206 pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings);
207 if (pThis->papPendingStrings)
208#endif
209 return true;
210
211#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
212 free(pThis->papStrHash);
213 pThis->papStrHash = NULL;
214#endif
215 }
216 return false;
217}
218
219
220#if 0 /* unused */
221static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis)
222{
223 free(pThis->papStrHash);
224 free(pThis->papSortedStrings);
225 free(pThis->pachStrTab);
226# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
227 free(pThis->papPendingStrings);
228# endif
229 memset(pThis, 0, sizeof(*pThis));
230}
231#endif
232
233#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
234
235DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc)
236{
237 /*
238 * Skip leading word separators.
239 */
240# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
241 while ( ch == ' '
242 || ch == '-'
243 || ch == '+'
244 || ch == '_')
245# else
246 while (ch == ' ')
247# endif
248 ch = *++pszSrc;
249 if (ch)
250 {
251 /*
252 * Find end of word.
253 */
254 size_t cchWord = 1;
255# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
256 char chPrev = ch;
257 while ( (ch = pszSrc[cchWord]) != ' '
258 && ch != '\0'
259 && ch != '-'
260 && ch != '+'
261 && ch != '_'
262 && ( ch == chPrev
263 || !RT_C_IS_UPPER(ch)
264 || RT_C_IS_UPPER(chPrev)) )
265 {
266 chPrev = ch;
267 cchWord++;
268 }
269# else
270 while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0')
271 cchWord++;
272# endif
273 *ppszSrc = pszSrc;
274 return cchWord;
275 }
276
277 *ppszSrc = pszSrc;
278 return 0;
279}
280
281
282/**
283 * Analyzes a string.
284 *
285 * @param pThis The strint table compiler instance.
286 * @param pStr The string to analyze.
287 */
288static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
289{
290 const char *psz = pStr->pszString;
291
292 /*
293 * For now we just consider words.
294 */
295 char ch;
296 while ((ch = *psz) != '\0')
297 {
298 size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz);
299 if (cchWord > 1)
300 {
301 std::string strWord(psz, cchWord);
302 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
303 if (it != pThis->Frequencies.end())
304 it->second += cchWord - 1;
305 else
306 pThis->Frequencies[strWord] = 0;
307
308 /** @todo could gain hits by including the space after the word, but that
309 * has the same accounting problems as the two words scenario below. */
310
311# if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */
312 /* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */
313 if (ch == ' ')
314 {
315 ch = psz[++cchWord];
316 if (ch != ' ' && ch != '\0')
317 {
318 size_t const cchSaved = cchWord;
319
320 do
321 cchWord++;
322 while ((ch = psz[cchWord]) != ' ' && ch != '\0');
323
324 strWord = std::string(psz, cchWord);
325 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
326 if (it != pThis->Frequencies.end())
327 it->second += cchWord - 1;
328 else
329 pThis->Frequencies[strWord] = 0;
330
331 cchWord = cchSaved;
332 }
333 }
334# endif
335 }
336 else if (!cchWord)
337 break;
338
339 /* Advance. */
340 psz += cchWord;
341 }
342 pStr->cchString = psz - pStr->pszString;
343 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
344 {
345 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
346 abort();
347 }
348}
349
350#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
351
352/**
353 * Adds a string to the hash table.
354 * @param pThis The strint table compiler instance.
355 * @param pStr The string.
356 */
357static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
358{
359 pStr->pNextRef = NULL;
360 pStr->pNextCollision = NULL;
361 pStr->offStrTab = 0;
362 pStr->uHash = sdbm(pStr->pszString, &pStr->cchString);
363 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
364 {
365 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
366 exit(RTEXITCODE_FAILURE);
367 }
368
369 size_t idxHash = pStr->uHash % pThis->cStrHash;
370 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
371 if (!pCur)
372 pThis->papStrHash[idxHash] = pStr;
373 else
374 {
375 /* Look for matching string. */
376 do
377 {
378 if ( pCur->uHash == pStr->uHash
379 && pCur->cchString == pStr->cchString
380 && memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0)
381 {
382 pStr->pNextRef = pCur->pNextRef;
383 pCur->pNextRef = pStr;
384 pThis->cDuplicateStrings++;
385 return;
386 }
387 pCur = pCur->pNextCollision;
388 } while (pCur != NULL);
389
390 /* No matching string, insert. */
391 pThis->cCollisions++;
392 pStr->pNextCollision = pThis->papStrHash[idxHash];
393 pThis->papStrHash[idxHash] = pStr;
394 }
395
396 pThis->cUniqueStrings++;
397 pThis->cchUniqueStrings += pStr->cchString;
398}
399
400
401/**
402 * Adds a string to the string table.
403 *
404 * @param pThis The strint table compiler instance.
405 * @param pStr The string.
406 */
407static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
408{
409#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
410 bldProgStrTab_compressorAnalyzeString(pThis, pStr);
411 if (pThis->cPendingStrings < pThis->cMaxPendingStrings)
412 pThis->papPendingStrings[pThis->cPendingStrings++] = pStr;
413 else
414 abort();
415#else
416 bldProgStrTab_AddStringToHashTab(pThis, pStr);
417#endif
418}
419
420
421/**
422 * Adds a string to the string table.
423 *
424 * @param pThis The strint table compiler instance.
425 * @param pStr The string entry (uninitialized).
426 * @param psz The string, will be duplicated if compression is enabled.
427 */
428DECLINLINE(void) BldProgStrTab_AddStringDup(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr, const char *psz)
429{
430#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
431 pStr->pszString = strdup(psz);
432 if (!pStr->pszString)
433 abort();
434#else
435 pStr->pszString = (char *)psz;
436#endif
437 BldProgStrTab_AddString(pThis, pStr);
438}
439
440#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
441
442/**
443 * Copies @a cchSrc chars from @a pchSrc to @a pszDst, escaping special
444 * sequences.
445 *
446 * @returns New @a pszDst position, NULL if invalid source encoding.
447 * @param pszDst The destination buffer.
448 * @param pszSrc The source buffer.
449 * @param cchSrc How much to copy.
450 */
451static char *bldProgStrTab_compressorCopyAndEscape(char *pszDst, const char *pszSrc, size_t cchSrc)
452{
453 while (cchSrc-- > 0)
454 {
455 char ch = *pszSrc;
456 if (!((unsigned char)ch & 0x80))
457 {
458 *pszDst++ = ch;
459 pszSrc++;
460 }
461 else
462 {
463# ifdef BLDPROG_STRTAB_PURE_ASCII
464 fprintf(stderr, "error: unexpected char value %#x\n", ch);
465 return NULL;
466# else
467 RTUNICP uc;
468 int rc = RTStrGetCpEx(&pszSrc, &uc);
469 if (RT_SUCCESS(rc))
470 {
471 *pszDst++ = (unsigned char)0xff; /* escape single code point. */
472 pszDst = RTStrPutCp(pszDst, uc);
473 }
474 else
475 {
476 fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc);
477 return NULL;
478 }
479# endif
480 }
481 }
482 return pszDst;
483}
484
485
486/**
487 * Replaces the dictionary words and escapes non-ascii chars in a string.
488 *
489 * @param pThis The strint table compiler instance.
490 * @param pString The string to fixup.
491 */
492static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
493{
494 char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2];
495 char *pszDst = szNew;
496 const char *pszSrc = pStr->pszString;
497 const char *pszSrcEnd = pszSrc + pStr->cchString;
498
499 char ch;
500 while ((ch = *pszSrc) != '\0')
501 {
502 const char * const pszSrcUncompressed = pszSrc;
503 size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc);
504 size_t cchSrcUncompressed = pszSrc - pszSrcUncompressed;
505 if (cchSrcUncompressed > 0)
506 {
507 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrcUncompressed, cchSrcUncompressed);
508 if (!pszDst)
509 return false;
510 }
511 if (!cchWord)
512 break;
513
514 /* Check for g_aWord matches. */
515 size_t cchMax = pszSrcEnd - pszSrc;
516 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
517 {
518 size_t cchLen = pThis->aCompDict[i].cchString;
519 if ( cchLen >= cchWord
520 && cchLen <= cchMax
521 && memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0)
522 {
523 *pszDst++ = (unsigned char)(0x80 | i);
524 pszSrc += cchLen;
525 cchWord = 0;
526 break;
527 }
528 }
529
530 if (cchWord)
531 {
532 /* Copy the current word. */
533 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrc, cchWord);
534 if (!pszDst)
535 return false;
536 pszSrc += cchWord;
537 }
538 }
539
540 /* Just terminate it now. */
541 *pszDst = '\0';
542
543 /*
544 * Update the string.
545 */
546 size_t cchNew = pszDst - &szNew[0];
547 if (cchNew > pStr->cchString)
548 {
549 pStr->pszString = (char *)malloc(cchNew + 1);
550 if (!pStr->pszString)
551 {
552 fprintf(stderr, "Out of memory!\n");
553 return false;
554 }
555 }
556 memcpy(pStr->pszString, szNew, cchNew + 1);
557 pStr->cchString = cchNew;
558
559 return true;
560}
561
562
563/**
564 * For sorting the frequency fidning in descending order.
565 *
566 * Comparison operators are put outside to make older gcc versions (like 4.1.1
567 * on lnx64-rel) happy.
568 */
569class WordFreqSortEntry
570{
571public:
572 BLDPROGWORDFREQPAIR const *m_pPair;
573
574public:
575 WordFreqSortEntry(BLDPROGWORDFREQPAIR const *pPair) : m_pPair(pPair) {}
576};
577
578bool operator == (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
579{
580 return rLeft.m_pPair->second == rRight.m_pPair->second;
581}
582
583bool operator < (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
584{
585 return rLeft.m_pPair->second > rRight.m_pPair->second;
586}
587
588
589/**
590 * Compresses the vendor and product strings.
591 *
592 * This is very very simple (a lot less work than the string table for
593 * instance).
594 */
595static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose)
596{
597 /*
598 * Sort the frequency analyzis result and pick the top 127 ones.
599 */
600 std::vector<WordFreqSortEntry> SortMap;
601 for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it)
602 {
603 BLDPROGWORDFREQPAIR const &rPair = *it;
604 SortMap.push_back(WordFreqSortEntry(&rPair));
605 }
606
607 sort(SortMap.begin(), SortMap.end());
608
609 size_t cb = 0;
610 size_t i = 0;
611 for (std::vector<WordFreqSortEntry>::iterator it = SortMap.begin();
612 it != SortMap.end() && i < RT_ELEMENTS(pThis->aCompDict);
613 ++it, i++)
614 {
615 pThis->aCompDict[i].cchString = it->m_pPair->first.length();
616 pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1);
617 if (pThis->aCompDict[i].pszString)
618 memcpy(pThis->aCompDict[i].pszString, it->m_pPair->first.c_str(), pThis->aCompDict[i].cchString + 1);
619 else
620 return false;
621 cb += it->m_pPair->second;
622 }
623
624 if (fVerbose)
625 printf("debug: Estimated string compression saving: %u bytes\n", (unsigned)cb);
626
627 /*
628 * Rework the strings.
629 */
630 size_t cchOld = 0;
631 size_t cchOldMax = 0;
632 size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN;
633 size_t cchNew = 0;
634 size_t cchNewMax = 0;
635 size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN;
636 i = pThis->cPendingStrings;
637 while (i-- > 0)
638 {
639 PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i];
640 cchOld += pCurStr->cchString;
641 if (pCurStr->cchString > cchOldMax)
642 cchOldMax = pCurStr->cchString;
643 if (pCurStr->cchString < cchOldMin)
644 cchOldMin = pCurStr->cchString;
645
646 if (!bldProgStrTab_compressorFixupString(pThis, pCurStr))
647 return false;
648
649 cchNew += pCurStr->cchString;
650 if (pCurStr->cchString > cchNewMax)
651 cchNewMax = pCurStr->cchString;
652 if (pCurStr->cchString < cchNewMin)
653 cchNewMin = pCurStr->cchString;
654
655 bldProgStrTab_AddStringToHashTab(pThis, pCurStr);
656 }
657
658 /*
659 * Do debug stats.
660 */
661 if (fVerbose)
662 {
663 for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
664 cchNew += pThis->aCompDict[i].cchString + 1;
665
666 printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew);
667 if (cchNew < cchOld)
668 printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld));
669 else
670 printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew));
671 printf("debug: Original string lengths: average %u; min %u; max %u\n",
672 (unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax);
673 printf("debug: Compressed string lengths: average %u; min %u; max %u\n",
674 (unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax);
675 }
676
677 return true;
678}
679
680#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
681
682/**
683 * Inserts a string into g_apUniqueStrings.
684 * @param pThis The strint table compiler instance.
685 * @param pStr The string.
686 */
687static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
688{
689 size_t iIdx;
690 size_t iEnd = pThis->cSortedStrings;
691 if (iEnd)
692 {
693 size_t iStart = 0;
694 for (;;)
695 {
696 iIdx = iStart + (iEnd - iStart) / 2;
697 if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString)
698 {
699 if (iIdx <= iStart)
700 break;
701 iEnd = iIdx;
702 }
703 else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString)
704 {
705 if (++iIdx >= iEnd)
706 break;
707 iStart = iIdx;
708 }
709 else
710 break;
711 }
712
713 if (iIdx != pThis->cSortedStrings)
714 memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx],
715 (pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx]));
716 }
717 else
718 iIdx = 0;
719
720 pThis->papSortedStrings[iIdx] = pStr;
721 pThis->cSortedStrings++;
722}
723
724
725/**
726 * Compiles the string table after the string has been added.
727 *
728 * This will save space by dropping string terminators, eliminating duplicates
729 * and try find strings that are sub-strings of others.
730 *
731 * Will initialize the StrRef of all StrTabString instances.
732 *
733 * @returns success indicator (error flagged).
734 * @param pThis The strint table compiler instance.
735 */
736static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose)
737{
738#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
739 /*
740 * Do the compression and add all the compressed strings to the table.
741 */
742 if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose))
743 return false;
744
745 /*
746 * Add the dictionary strings.
747 */
748 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
749 bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]);
750#endif
751 if (fVerbose)
752 printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n",
753 (unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings,
754 (unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions);
755
756 /*
757 * Create papSortedStrings from the hash table. The table is sorted by
758 * string length, with the longer strings first.
759 */
760 pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings);
761 if (!pThis->papSortedStrings)
762 return false;
763 pThis->cSortedStrings = 0;
764 size_t idxHash = pThis->cStrHash;
765 while (idxHash-- > 0)
766 {
767 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
768 if (pCur)
769 {
770 do
771 {
772 bldProgStrTab_InsertUniqueString(pThis, pCur);
773 pCur = pCur->pNextCollision;
774 } while (pCur);
775 }
776 }
777
778 /*
779 * Create the actual string table.
780 */
781 pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1);
782 if (!pThis->pachStrTab)
783 return false;
784 pThis->cchStrTab = 0;
785 for (size_t i = 0; i < pThis->cSortedStrings; i++)
786 {
787 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
788 const char * const pszCur = pCur->pszString;
789 size_t const cchCur = pCur->cchString;
790 size_t offStrTab = pThis->cchStrTab;
791
792 /*
793 * See if the string is a substring already found in the string table.
794 * Excluding the zero terminator increases the chances for this.
795 */
796 size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0;
797 const char *pchLeft = pThis->pachStrTab;
798 char const chFirst = *pszCur;
799 while (cchLeft > 0)
800 {
801 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
802 if (!pchCandidate)
803 break;
804 if (memcmp(pchCandidate, pszCur, cchCur) == 0)
805 {
806 offStrTab = pchCandidate - pThis->pachStrTab;
807 break;
808 }
809
810 cchLeft -= pchCandidate + 1 - pchLeft;
811 pchLeft = pchCandidate + 1;
812 }
813
814 if (offStrTab == pThis->cchStrTab)
815 {
816 /*
817 * See if the start of the string overlaps the end of the string table.
818 */
819 if (pThis->cchStrTab && cchCur > 1)
820 {
821 cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1);
822 pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft];
823 while (cchLeft > 0)
824 {
825 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
826 if (!pchCandidate)
827 break;
828 cchLeft -= pchCandidate - pchLeft;
829 pchLeft = pchCandidate;
830 if (memcmp(pchLeft, pszCur, cchLeft) == 0)
831 {
832 size_t cchToCopy = cchCur - cchLeft;
833 memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy);
834 pThis->cchStrTab += cchToCopy;
835 offStrTab = pchCandidate - pThis->pachStrTab;
836 break;
837 }
838 cchLeft--;
839 pchLeft++;
840 }
841 }
842
843 /*
844 * If we didn't have any luck above, just append the string.
845 */
846 if (offStrTab == pThis->cchStrTab)
847 {
848 memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur);
849 pThis->cchStrTab += cchCur;
850 }
851 }
852
853 /*
854 * Set the string table offset for all the references to this string.
855 */
856 do
857 {
858 pCur->offStrTab = (uint32_t)offStrTab;
859 pCur = pCur->pNextRef;
860 } while (pCur != NULL);
861 }
862
863 if (fVerbose)
864 printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab);
865 return true;
866}
867
868
869#ifdef RT_STRICT
870/**
871 * Sanity checks a string table string.
872 * @param pThis The strint table compiler instance.
873 * @param pStr The string to check.
874 */
875static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
876{
877 if (pStr->cchString != strlen(pStr->pszString))
878 abort();
879 if (pStr->offStrTab >= pThis->cchStrTab)
880 abort();
881 if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab)
882 abort();
883 if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0)
884 abort();
885}
886#endif
887
888
889/**
890 * Output the string table string in C string litteral fashion.
891 *
892 * @param pThis The strint table instance.
893 * @param pString The string to print.
894 * @param pOut The output stream.
895 */
896static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut)
897{
898 const unsigned char *psz = (const unsigned char *)pString->pszString;
899 unsigned char uch;
900 while ((uch = *psz++) != '\0')
901 {
902 if (!(uch & 0x80))
903 {
904 if (uch != '\'' && uch != '\\')
905 fputc((char)uch, pOut);
906 else
907 {
908 fputc('\\', pOut);
909 fputc((char)uch, pOut);
910 }
911 }
912#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
913 else if (uch != 0xff)
914 fputs(pThis->aCompDict[uch & 0x7f].pszString, pOut);
915 else
916 {
917# ifdef BLDPROG_STRTAB_PURE_ASCII
918 abort();
919# else
920 RTUNICP uc = RTStrGetCp((const char *)psz);
921 psz += RTStrCpSize(uc);
922 fprintf(pOut, "\\u%04x", uc);
923# endif
924 }
925#else
926 else
927 fprintf(pOut, "\\x%02x", (unsigned)uch);
928 NOREF(pThis);
929#endif
930 }
931}
932
933
934/**
935 * Writes the string table code to the output stream.
936 *
937 * @param pThis The strint table compiler instance.
938 * @param pOut The output stream.
939 * @param pszScope The scoping ("static " or empty string),
940 * including trailing space.
941 * @param pszPrefix The variable prefix. Typically "g_" for C programs,
942 * whereas C++ classes normally use "class::s_g".
943 * @param pszBaseName The base name for the variables. The user
944 * accessible variable will end up as just the
945 * prefix and basename concatenated.
946 */
947static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut,
948 const char *pszScope, const char *pszPrefix, const char *pszBaseName)
949{
950#ifdef RT_STRICT
951 /*
952 * Do some quick sanity checks while we're here.
953 */
954# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
955 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
956 BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]);
957# endif
958#endif
959
960 /*
961 * Create a table for speeding up the character categorization.
962 */
963 uint8_t abCharCat[256];
964 memset(abCharCat, 0, sizeof(abCharCat));
965 abCharCat[(unsigned char)'\\'] = 1;
966 abCharCat[(unsigned char)'\''] = 1;
967 for (unsigned i = 0; i < 0x20; i++)
968 abCharCat[i] = 2;
969 for (unsigned i = 0x7f; i < 0x100; i++)
970 abCharCat[i] = 2;
971
972 /*
973 * We follow the sorted string table, one string per line.
974 */
975 fprintf(pOut,
976 "#include <iprt/bldprog-strtab.h>\n"
977 "\n"
978 "static const char g_achStrTab%s[] =\n"
979 "{\n",
980 pszBaseName);
981
982 uint32_t off = 0;
983 for (uint32_t i = 0; i < pThis->cSortedStrings; i++)
984 {
985 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
986 uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString;
987 if (offEnd > off)
988 {
989 /* Comment with a uncompressed and more readable version of the string. */
990 if (off == pCur->offStrTab)
991 fprintf(pOut, "/* 0x%05x = \"", off);
992 else
993 fprintf(pOut, "/* 0X%05x = \"", off);
994 BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut);
995 fputs("\" */\n", pOut);
996
997 /* Must use char by char here or we may trigger the max string
998 length limit in the compiler, */
999 fputs(" ", pOut);
1000 for (; off < offEnd; off++)
1001 {
1002 unsigned char uch = pThis->pachStrTab[off];
1003 fputc('\'', pOut);
1004 if (abCharCat[uch] == 0)
1005 fputc(uch, pOut);
1006 else if (abCharCat[uch] != 1)
1007 fprintf(pOut, "\\x%02x", (unsigned)uch);
1008 else
1009 {
1010 fputc('\\', pOut);
1011 fputc(uch, pOut);
1012 }
1013 fputc('\'', pOut);
1014 fputc(',', pOut);
1015 }
1016 fputc('\n', pOut);
1017 }
1018 }
1019
1020 fprintf(pOut,
1021 "};\n"
1022 "AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n",
1023 pszBaseName, (unsigned)pThis->cchStrTab);
1024
1025#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1026 /*
1027 * Write the compression dictionary.
1028 */
1029 fprintf(pOut,
1030 "static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n"
1031 "{\n",
1032 pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict));
1033 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
1034 fprintf(pOut, " { %#08x, %#04x }, // %s\n",
1035 pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString, pThis->aCompDict[i].pszString);
1036 fprintf(pOut, "};\n\n");
1037#endif
1038
1039
1040 /*
1041 * Write the string table data structure.
1042 */
1043 fprintf(pOut,
1044 "%sconst RTBLDPROGSTRTAB %s%s = \n"
1045 "{\n"
1046 " /*.pchStrTab = */ &g_achStrTab%s[0],\n"
1047 " /*.cchStrTab = */ sizeof(g_achStrTab%s),\n"
1048 ,
1049 pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName);
1050#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1051 fprintf(pOut,
1052 " /*.cCompDict = */ %u,\n"
1053 " /*.paCompDict = */ &g_aCompDict%s[0]\n"
1054 "};\n"
1055 , (unsigned)RT_ELEMENTS(pThis->aCompDict), pszBaseName);
1056#else
1057 fprintf(pOut,
1058 " /*.cCompDict = */ 0,\n"
1059 " /*.paCompDict = */ NULL\n"
1060 "};\n");
1061#endif
1062}
1063
1064#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
1065# pragma GCC diagnostic pop
1066#endif
1067
1068#endif /* __cplusplus && IN_RING3 */
1069
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette