VirtualBox

source: vbox/trunk/include/iprt/bldprog-strtab-template.cpp.h@ 86842

Last change on this file since 86842 was 84127, checked in by vboxsync, 5 years ago

iprt/bldprog-strtab-template.cpp.h: Comment. bugref:9726

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 33.6 KB
Line 
1/* $Id: bldprog-strtab-template.cpp.h 84127 2020-05-02 11:56:09Z vboxsync $ */
2/** @file
3 * IPRT - Build Program - String Table Generator.
4 */
5
6/*
7 * Copyright (C) 2006-2020 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*
29 * (Avoid include sanity checks as this is ring-3 only, C++ code.)
30 */
31#if defined(__cplusplus) && defined(IN_RING3)
32
33
34/*********************************************************************************************************************************
35* Defined Constants And Macros *
36*********************************************************************************************************************************/
37/** @def BLDPROG_STRTAB_MAX_STRLEN
38 * The max length of strings in the table. */
39#if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING)
40# define BLDPROG_STRTAB_MAX_STRLEN 256
41#endif
42
43/** @def BLDPROG_STRTAB_WITH_COMPRESSION
44 * Enables very simple string compression.
45 */
46#if defined(DOXYGEN_RUNNING)
47# define BLDPROG_STRTAB_WITH_COMPRESSION
48#endif
49
50/** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS
51 * Modifies the string compression to look for camel case words.
52 */
53#if defined(DOXYGEN_RUNNING)
54# define BLDPROG_STRTAB_WITH_CAMEL_WORDS
55#endif
56
57/** @def BLDPROG_STRTAB_PURE_ASCII
58 * String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this
59 * is defined. Otherwise, the compression code will require IPRT to link.
60 */
61#if defined(DOXYGEN_RUNNING)
62# define BLDPROG_STRTAB_PURE_ASCII
63#endif
64
65
66
67/*********************************************************************************************************************************
68* Header Files *
69*********************************************************************************************************************************/
70#include <iprt/stdarg.h>
71#include <iprt/ctype.h>
72#include <stdlib.h>
73#include <stdio.h>
74#include <string.h>
75
76#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
77# include <algorithm>
78# include <map>
79# include <iprt/sanitized/string>
80# include <vector>
81
82typedef std::map<std::string, size_t> BLDPROGWORDFREQMAP;
83typedef BLDPROGWORDFREQMAP::value_type BLDPROGWORDFREQPAIR;
84
85#endif
86
87#include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */
88
89
90/*********************************************************************************************************************************
91* Structures and Typedefs *
92*********************************************************************************************************************************/
93/**
94 * Build table string.
95 */
96typedef struct BLDPROGSTRING
97{
98 /** The string.
99 * @note This may be modified or replaced (allocated from heap) when
100 * compressing the string table. */
101 char *pszString;
102 /** The string hash value. */
103 uint32_t uHash;
104 /** The strint table offset. */
105 uint32_t offStrTab;
106 /** The string length. */
107 size_t cchString;
108 /** Pointer to the next string reference (same string table entry). */
109 struct BLDPROGSTRING *pNextRef;
110 /** Pointer to the next string with the same hash value (collision). */
111 struct BLDPROGSTRING *pNextCollision;
112
113} BLDPROGSTRING;
114/** Pointer to a string table string. */
115typedef BLDPROGSTRING *PBLDPROGSTRING;
116
117
118/** String table data. */
119typedef struct BLDPROGSTRTAB
120{
121 /** The size of g_papStrHash. */
122 size_t cStrHash;
123 /** String hash table. */
124 PBLDPROGSTRING *papStrHash;
125 /** Duplicate strings found by AddString. */
126 size_t cDuplicateStrings;
127 /** Total length of the unique strings (no terminators). */
128 size_t cchUniqueStrings;
129 /** Number of unique strings after AddString. */
130 size_t cUniqueStrings;
131 /** Number of collisions. */
132 size_t cCollisions;
133
134 /** Number of entries in apSortedStrings. */
135 size_t cSortedStrings;
136 /** The sorted string table. */
137 PBLDPROGSTRING *papSortedStrings;
138
139#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
140 /** The 127 words we've picked to be indexed by reference. */
141 BLDPROGSTRING aCompDict[127];
142 /** The frequency of the 127 dictionary entries. */
143 size_t auCompDictFreq[127];
144 /** Incoming strings pending compression. */
145 PBLDPROGSTRING *papPendingStrings;
146 /** Current number of entries in papStrPending. */
147 size_t cPendingStrings;
148 /** The allocated size of papPendingStrings. */
149 size_t cMaxPendingStrings;
150 /** Work frequency map.
151 * @todo rewrite in plain C. */
152 BLDPROGWORDFREQMAP Frequencies;
153#endif
154
155 /** The string table. */
156 char *pachStrTab;
157 /** The actual string table size. */
158 size_t cchStrTab;
159} BLDPROGSTRTAB;
160typedef BLDPROGSTRTAB *PBLDPROGSTRTAB;
161
162#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
163# pragma GCC diagnostic push
164# pragma GCC diagnostic ignored "-Wunused-function"
165#endif
166
167
168/**
169 * Initializes the strint table compiler.
170 *
171 * @returns success indicator (out of memory if false).
172 * @param pThis The strint table compiler instance.
173 * @param cMaxStrings The max number of strings we'll be adding.
174 */
175static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings)
176{
177 pThis->cStrHash = 0;
178 pThis->papStrHash = NULL;
179 pThis->cDuplicateStrings = 0;
180 pThis->cchUniqueStrings = 0;
181 pThis->cUniqueStrings = 0;
182 pThis->cCollisions = 0;
183 pThis->cSortedStrings = 0;
184 pThis->papSortedStrings = NULL;
185 pThis->pachStrTab = NULL;
186 pThis->cchStrTab = 0;
187#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
188 memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict));
189 pThis->papPendingStrings = NULL;
190 pThis->cPendingStrings = 0;
191 pThis->cMaxPendingStrings = cMaxStrings;
192#endif
193
194 /*
195 * Allocate a hash table double the size of all strings (to avoid too
196 * many collisions). Add all strings to it, finding duplicates in the
197 * process.
198 */
199#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
200 cMaxStrings += RT_ELEMENTS(pThis->aCompDict);
201#endif
202 cMaxStrings *= 2;
203 pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings);
204 if (pThis->papStrHash)
205 {
206 pThis->cStrHash = cMaxStrings;
207#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
208 pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings);
209 if (pThis->papPendingStrings)
210#endif
211 return true;
212
213#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
214 free(pThis->papStrHash);
215 pThis->papStrHash = NULL;
216#endif
217 }
218 return false;
219}
220
221
222#if 0 /* unused */
223static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis)
224{
225 free(pThis->papStrHash);
226 free(pThis->papSortedStrings);
227 free(pThis->pachStrTab);
228# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
229 free(pThis->papPendingStrings);
230# endif
231 memset(pThis, 0, sizeof(*pThis));
232}
233#endif
234
235#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
236
237DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc)
238{
239 /*
240 * Skip leading word separators.
241 */
242# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
243 while ( ch == ' '
244 || ch == '-'
245 || ch == '+'
246 || ch == '_')
247# else
248 while (ch == ' ')
249# endif
250 ch = *++pszSrc;
251 if (ch)
252 {
253 /*
254 * Find end of word.
255 */
256 size_t cchWord = 1;
257# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
258 char chPrev = ch;
259 while ( (ch = pszSrc[cchWord]) != ' '
260 && ch != '\0'
261 && ch != '-'
262 && ch != '+'
263 && ch != '_'
264 && ( ch == chPrev
265 || !RT_C_IS_UPPER(ch)
266 || RT_C_IS_UPPER(chPrev)) )
267 {
268 chPrev = ch;
269 cchWord++;
270 }
271# else
272 while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0')
273 cchWord++;
274# endif
275 *ppszSrc = pszSrc;
276 return cchWord;
277 }
278
279 *ppszSrc = pszSrc;
280 return 0;
281}
282
283
284/**
285 * Analyzes a string.
286 *
287 * @param pThis The strint table compiler instance.
288 * @param pStr The string to analyze.
289 */
290static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
291{
292 const char *psz = pStr->pszString;
293
294 /*
295 * For now we just consider words.
296 */
297 char ch;
298 while ((ch = *psz) != '\0')
299 {
300 size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz);
301 if (cchWord > 1)
302 {
303 std::string strWord(psz, cchWord);
304 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
305 if (it != pThis->Frequencies.end())
306 it->second += cchWord - 1;
307 else
308 pThis->Frequencies[strWord] = 0;
309
310 /** @todo could gain hits by including the space after the word, but that
311 * has the same accounting problems as the two words scenario below. */
312
313# if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */
314 /* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */
315 if (ch == ' ')
316 {
317 ch = psz[++cchWord];
318 if (ch != ' ' && ch != '\0')
319 {
320 size_t const cchSaved = cchWord;
321
322 do
323 cchWord++;
324 while ((ch = psz[cchWord]) != ' ' && ch != '\0');
325
326 strWord = std::string(psz, cchWord);
327 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
328 if (it != pThis->Frequencies.end())
329 it->second += cchWord - 1;
330 else
331 pThis->Frequencies[strWord] = 0;
332
333 cchWord = cchSaved;
334 }
335 }
336# endif
337 }
338 else if (!cchWord)
339 break;
340
341 /* Advance. */
342 psz += cchWord;
343 }
344 pStr->cchString = psz - pStr->pszString;
345 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
346 {
347 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
348 abort();
349 }
350}
351
352#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
353
354/**
355 * Adds a string to the hash table.
356 * @param pThis The strint table compiler instance.
357 * @param pStr The string.
358 */
359static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
360{
361 pStr->pNextRef = NULL;
362 pStr->pNextCollision = NULL;
363 pStr->offStrTab = 0;
364 pStr->uHash = sdbm(pStr->pszString, &pStr->cchString);
365 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
366 {
367 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
368 exit(RTEXITCODE_FAILURE);
369 }
370
371 size_t idxHash = pStr->uHash % pThis->cStrHash;
372 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
373 if (!pCur)
374 pThis->papStrHash[idxHash] = pStr;
375 else
376 {
377 /* Look for matching string. */
378 do
379 {
380 if ( pCur->uHash == pStr->uHash
381 && pCur->cchString == pStr->cchString
382 && memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0)
383 {
384 pStr->pNextRef = pCur->pNextRef;
385 pCur->pNextRef = pStr;
386 pThis->cDuplicateStrings++;
387 return;
388 }
389 pCur = pCur->pNextCollision;
390 } while (pCur != NULL);
391
392 /* No matching string, insert. */
393 pThis->cCollisions++;
394 pStr->pNextCollision = pThis->papStrHash[idxHash];
395 pThis->papStrHash[idxHash] = pStr;
396 }
397
398 pThis->cUniqueStrings++;
399 pThis->cchUniqueStrings += pStr->cchString;
400}
401
402
403/**
404 * Adds a string to the string table.
405 *
406 * @param pThis The strint table compiler instance.
407 * @param pStr The string.
408 */
409static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
410{
411#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
412 bldProgStrTab_compressorAnalyzeString(pThis, pStr);
413 if (pThis->cPendingStrings < pThis->cMaxPendingStrings)
414 pThis->papPendingStrings[pThis->cPendingStrings++] = pStr;
415 else
416 abort();
417#else
418 bldProgStrTab_AddStringToHashTab(pThis, pStr);
419#endif
420}
421
422
423/**
424 * Adds a string to the string table.
425 *
426 * @param pThis The strint table compiler instance.
427 * @param pStr The string entry (uninitialized).
428 * @param psz The string, will be duplicated if compression is enabled.
429 */
430DECLINLINE(void) BldProgStrTab_AddStringDup(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr, const char *psz)
431{
432#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
433 pStr->pszString = strdup(psz);
434 if (!pStr->pszString)
435 abort();
436#else
437 pStr->pszString = (char *)psz;
438#endif
439 BldProgStrTab_AddString(pThis, pStr);
440}
441
442#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
443
444/**
445 * Copies @a cchSrc chars from @a pchSrc to @a pszDst, escaping special
446 * sequences.
447 *
448 * @returns New @a pszDst position, NULL if invalid source encoding.
449 * @param pszDst The destination buffer.
450 * @param pszSrc The source buffer.
451 * @param cchSrc How much to copy.
452 */
453static char *bldProgStrTab_compressorCopyAndEscape(char *pszDst, const char *pszSrc, size_t cchSrc)
454{
455 while (cchSrc-- > 0)
456 {
457 char ch = *pszSrc;
458 if (!((unsigned char)ch & 0x80))
459 {
460 *pszDst++ = ch;
461 pszSrc++;
462 }
463 else
464 {
465# ifdef BLDPROG_STRTAB_PURE_ASCII
466 fprintf(stderr, "error: unexpected char value %#x\n", ch);
467 return NULL;
468# else
469 RTUNICP uc;
470 int rc = RTStrGetCpEx(&pszSrc, &uc);
471 if (RT_SUCCESS(rc))
472 {
473 *pszDst++ = (unsigned char)0xff; /* escape single code point. */
474 pszDst = RTStrPutCp(pszDst, uc);
475 }
476 else
477 {
478 fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc);
479 return NULL;
480 }
481# endif
482 }
483 }
484 return pszDst;
485}
486
487
488/**
489 * Replaces the dictionary words and escapes non-ascii chars in a string.
490 *
491 * @param pThis The strint table compiler instance.
492 * @param pString The string to fixup.
493 */
494static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
495{
496 char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2];
497 char *pszDst = szNew;
498 const char *pszSrc = pStr->pszString;
499 const char *pszSrcEnd = pszSrc + pStr->cchString;
500
501 char ch;
502 while ((ch = *pszSrc) != '\0')
503 {
504 const char * const pszSrcUncompressed = pszSrc;
505 size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc);
506 size_t cchSrcUncompressed = pszSrc - pszSrcUncompressed;
507 if (cchSrcUncompressed > 0)
508 {
509 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrcUncompressed, cchSrcUncompressed);
510 if (!pszDst)
511 return false;
512 }
513 if (!cchWord)
514 break;
515
516 /* Check for g_aWord matches. */
517 size_t cchMax = pszSrcEnd - pszSrc;
518 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
519 {
520 size_t cchLen = pThis->aCompDict[i].cchString;
521 if ( cchLen >= cchWord
522 && cchLen <= cchMax
523 && memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0)
524 {
525 *pszDst++ = (unsigned char)(0x80 | i);
526 pszSrc += cchLen;
527 cchWord = 0;
528 break;
529 }
530 }
531
532 if (cchWord)
533 {
534 /* Copy the current word. */
535 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrc, cchWord);
536 if (!pszDst)
537 return false;
538 pszSrc += cchWord;
539 }
540 }
541
542 /* Just terminate it now. */
543 *pszDst = '\0';
544
545 /*
546 * Update the string.
547 */
548 size_t cchNew = pszDst - &szNew[0];
549 if (cchNew > pStr->cchString)
550 {
551 pStr->pszString = (char *)malloc(cchNew + 1);
552 if (!pStr->pszString)
553 {
554 fprintf(stderr, "Out of memory!\n");
555 return false;
556 }
557 }
558 memcpy(pStr->pszString, szNew, cchNew + 1);
559 pStr->cchString = cchNew;
560
561 return true;
562}
563
564
565/**
566 * For sorting the frequency fidning in descending order.
567 *
568 * Comparison operators are put outside to make older gcc versions (like 4.1.1
569 * on lnx64-rel) happy.
570 */
571class WordFreqSortEntry
572{
573public:
574 BLDPROGWORDFREQPAIR const *m_pPair;
575
576public:
577 WordFreqSortEntry(BLDPROGWORDFREQPAIR const *pPair) : m_pPair(pPair) {}
578};
579
580bool operator == (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
581{
582 return rLeft.m_pPair->second == rRight.m_pPair->second;
583}
584
585bool operator < (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
586{
587 return rLeft.m_pPair->second > rRight.m_pPair->second;
588}
589
590
591/**
592 * Compresses the vendor and product strings.
593 *
594 * This is very very simple (a lot less work than the string table for
595 * instance).
596 */
597static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose)
598{
599 /*
600 * Sort the frequency analyzis result and pick the top 127 ones.
601 */
602 std::vector<WordFreqSortEntry> SortMap;
603 for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it)
604 {
605 BLDPROGWORDFREQPAIR const &rPair = *it;
606 SortMap.push_back(WordFreqSortEntry(&rPair));
607 }
608
609 sort(SortMap.begin(), SortMap.end());
610
611 size_t cb = 0;
612 size_t i = 0;
613 for (std::vector<WordFreqSortEntry>::iterator it = SortMap.begin();
614 it != SortMap.end() && i < RT_ELEMENTS(pThis->aCompDict);
615 ++it, i++)
616 {
617 pThis->auCompDictFreq[i] = it->m_pPair->second;
618 pThis->aCompDict[i].cchString = it->m_pPair->first.length();
619 pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1);
620 if (pThis->aCompDict[i].pszString)
621 memcpy(pThis->aCompDict[i].pszString, it->m_pPair->first.c_str(), pThis->aCompDict[i].cchString + 1);
622 else
623 return false;
624 cb += it->m_pPair->second;
625 }
626
627 if (fVerbose)
628 printf("debug: Estimated string compression saving: %u bytes\n", (unsigned)cb);
629
630 /*
631 * Rework the strings.
632 */
633 size_t cchOld = 0;
634 size_t cchOldMax = 0;
635 size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN;
636 size_t cchNew = 0;
637 size_t cchNewMax = 0;
638 size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN;
639 i = pThis->cPendingStrings;
640 while (i-- > 0)
641 {
642 PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i];
643 cchOld += pCurStr->cchString;
644 if (pCurStr->cchString > cchOldMax)
645 cchOldMax = pCurStr->cchString;
646 if (pCurStr->cchString < cchOldMin)
647 cchOldMin = pCurStr->cchString;
648
649 if (!bldProgStrTab_compressorFixupString(pThis, pCurStr))
650 return false;
651
652 cchNew += pCurStr->cchString;
653 if (pCurStr->cchString > cchNewMax)
654 cchNewMax = pCurStr->cchString;
655 if (pCurStr->cchString < cchNewMin)
656 cchNewMin = pCurStr->cchString;
657
658 bldProgStrTab_AddStringToHashTab(pThis, pCurStr);
659 }
660
661 /*
662 * Do debug stats.
663 */
664 if (fVerbose)
665 {
666 for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
667 cchNew += pThis->aCompDict[i].cchString + 1;
668
669 printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew);
670 if (cchNew < cchOld)
671 printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld));
672 else
673 printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew));
674 printf("debug: Original string lengths: average %u; min %u; max %u\n",
675 (unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax);
676 printf("debug: Compressed string lengths: average %u; min %u; max %u\n",
677 (unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax);
678 }
679
680 return true;
681}
682
683#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
684
685/**
686 * Inserts a string into g_apUniqueStrings.
687 * @param pThis The strint table compiler instance.
688 * @param pStr The string.
689 */
690static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
691{
692 size_t iIdx;
693 size_t iEnd = pThis->cSortedStrings;
694 if (iEnd)
695 {
696 size_t iStart = 0;
697 for (;;)
698 {
699 iIdx = iStart + (iEnd - iStart) / 2;
700 if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString)
701 {
702 if (iIdx <= iStart)
703 break;
704 iEnd = iIdx;
705 }
706 else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString)
707 {
708 if (++iIdx >= iEnd)
709 break;
710 iStart = iIdx;
711 }
712 else
713 break;
714 }
715
716 if (iIdx != pThis->cSortedStrings)
717 memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx],
718 (pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx]));
719 }
720 else
721 iIdx = 0;
722
723 pThis->papSortedStrings[iIdx] = pStr;
724 pThis->cSortedStrings++;
725}
726
727
728/**
729 * Compiles the string table after the string has been added.
730 *
731 * This will save space by dropping string terminators, eliminating duplicates
732 * and try find strings that are sub-strings of others.
733 *
734 * Will initialize the StrRef of all StrTabString instances.
735 *
736 * @returns success indicator (error flagged).
737 * @param pThis The strint table compiler instance.
738 */
739static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose)
740{
741#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
742 /*
743 * Do the compression and add all the compressed strings to the table.
744 */
745 if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose))
746 return false;
747
748 /*
749 * Add the dictionary strings.
750 */
751 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
752 bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]);
753#endif
754 if (fVerbose)
755 printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n",
756 (unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings,
757 (unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions);
758
759 /*
760 * Create papSortedStrings from the hash table. The table is sorted by
761 * string length, with the longer strings first, so that we increase our
762 * chances of locating duplicate substrings.
763 */
764 pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings);
765 if (!pThis->papSortedStrings)
766 return false;
767 pThis->cSortedStrings = 0;
768 size_t idxHash = pThis->cStrHash;
769 while (idxHash-- > 0)
770 {
771 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
772 if (pCur)
773 {
774 do
775 {
776 bldProgStrTab_InsertUniqueString(pThis, pCur);
777 pCur = pCur->pNextCollision;
778 } while (pCur);
779 }
780 }
781
782 /*
783 * Create the actual string table.
784 */
785 pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1);
786 if (!pThis->pachStrTab)
787 return false;
788 pThis->cchStrTab = 0;
789 for (size_t i = 0; i < pThis->cSortedStrings; i++)
790 {
791 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
792 const char * const pszCur = pCur->pszString;
793 size_t const cchCur = pCur->cchString;
794 size_t offStrTab = pThis->cchStrTab;
795
796 /*
797 * See if the string is a substring already found in the string table.
798 * Excluding the zero terminator increases the chances for this.
799 */
800 size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0;
801 const char *pchLeft = pThis->pachStrTab;
802 char const chFirst = *pszCur;
803 while (cchLeft > 0)
804 {
805 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
806 if (!pchCandidate)
807 break;
808 if (memcmp(pchCandidate, pszCur, cchCur) == 0)
809 {
810 offStrTab = pchCandidate - pThis->pachStrTab;
811 break;
812 }
813
814 cchLeft -= pchCandidate + 1 - pchLeft;
815 pchLeft = pchCandidate + 1;
816 }
817
818 if (offStrTab == pThis->cchStrTab)
819 {
820 /*
821 * See if the start of the string overlaps the end of the string table.
822 */
823 if (pThis->cchStrTab && cchCur > 1)
824 {
825 cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1);
826 pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft];
827 while (cchLeft > 0)
828 {
829 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
830 if (!pchCandidate)
831 break;
832 cchLeft -= pchCandidate - pchLeft;
833 pchLeft = pchCandidate;
834 if (memcmp(pchLeft, pszCur, cchLeft) == 0)
835 {
836 size_t cchToCopy = cchCur - cchLeft;
837 memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy);
838 pThis->cchStrTab += cchToCopy;
839 offStrTab = pchCandidate - pThis->pachStrTab;
840 break;
841 }
842 cchLeft--;
843 pchLeft++;
844 }
845 }
846
847 /*
848 * If we didn't have any luck above, just append the string.
849 */
850 if (offStrTab == pThis->cchStrTab)
851 {
852 memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur);
853 pThis->cchStrTab += cchCur;
854 }
855 }
856
857 /*
858 * Set the string table offset for all the references to this string.
859 */
860 do
861 {
862 pCur->offStrTab = (uint32_t)offStrTab;
863 pCur = pCur->pNextRef;
864 } while (pCur != NULL);
865 }
866
867 if (fVerbose)
868 printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab);
869 return true;
870}
871
872
873#ifdef RT_STRICT
874/**
875 * Sanity checks a string table string.
876 * @param pThis The strint table compiler instance.
877 * @param pStr The string to check.
878 */
879static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
880{
881 if (pStr->cchString != strlen(pStr->pszString))
882 abort();
883 if (pStr->offStrTab >= pThis->cchStrTab)
884 abort();
885 if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab)
886 abort();
887 if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0)
888 abort();
889}
890#endif
891
892
893/**
894 * Output the string table string in C string litteral fashion.
895 *
896 * @param pThis The strint table instance.
897 * @param pString The string to print.
898 * @param pOut The output stream.
899 */
900static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut)
901{
902 const unsigned char *psz = (const unsigned char *)pString->pszString;
903 unsigned char uch;
904 while ((uch = *psz++) != '\0')
905 {
906 if (!(uch & 0x80))
907 {
908 if (uch != '\'' && uch != '\\')
909 fputc((char)uch, pOut);
910 else
911 {
912 fputc('\\', pOut);
913 fputc((char)uch, pOut);
914 }
915 }
916#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
917 else if (uch != 0xff)
918 fputs(pThis->aCompDict[uch & 0x7f].pszString, pOut);
919 else
920 {
921# ifdef BLDPROG_STRTAB_PURE_ASCII
922 abort();
923# else
924 RTUNICP uc = RTStrGetCp((const char *)psz);
925 psz += RTStrCpSize(uc);
926 fprintf(pOut, "\\u%04x", uc);
927# endif
928 }
929#else
930 else
931 fprintf(pOut, "\\x%02x", (unsigned)uch);
932 NOREF(pThis);
933#endif
934 }
935}
936
937
938/**
939 * Writes the string table code to the output stream.
940 *
941 * @param pThis The strint table compiler instance.
942 * @param pOut The output stream.
943 * @param pszScope The scoping ("static " or empty string),
944 * including trailing space.
945 * @param pszPrefix The variable prefix. Typically "g_" for C programs,
946 * whereas C++ classes normally use "class::s_g".
947 * @param pszBaseName The base name for the variables. The user
948 * accessible variable will end up as just the
949 * prefix and basename concatenated.
950 */
951static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut,
952 const char *pszScope, const char *pszPrefix, const char *pszBaseName)
953{
954#ifdef RT_STRICT
955 /*
956 * Do some quick sanity checks while we're here.
957 */
958# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
959 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
960 BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]);
961# endif
962#endif
963
964 /*
965 * Create a table for speeding up the character categorization.
966 */
967 uint8_t abCharCat[256];
968 memset(abCharCat, 0, sizeof(abCharCat));
969 abCharCat[(unsigned char)'\\'] = 1;
970 abCharCat[(unsigned char)'\''] = 1;
971 for (unsigned i = 0; i < 0x20; i++)
972 abCharCat[i] = 2;
973 for (unsigned i = 0x7f; i < 0x100; i++)
974 abCharCat[i] = 2;
975
976 /*
977 * We follow the sorted string table, one string per line.
978 */
979 fprintf(pOut,
980 "#include <iprt/bldprog-strtab.h>\n"
981 "\n"
982 "static const char g_achStrTab%s[] =\n"
983 "{\n",
984 pszBaseName);
985
986 uint32_t off = 0;
987 for (uint32_t i = 0; i < pThis->cSortedStrings; i++)
988 {
989 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
990 uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString;
991 if (offEnd > off)
992 {
993 /* Comment with a uncompressed and more readable version of the string. */
994 if (off == pCur->offStrTab)
995 fprintf(pOut, "/* 0x%05x = \"", off);
996 else
997 fprintf(pOut, "/* 0X%05x = \"", off);
998 BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut);
999 fputs("\" */\n", pOut);
1000
1001 /* Must use char by char here or we may trigger the max string
1002 length limit in the compiler, */
1003 fputs(" ", pOut);
1004 for (; off < offEnd; off++)
1005 {
1006 unsigned char uch = pThis->pachStrTab[off];
1007 fputc('\'', pOut);
1008 if (abCharCat[uch] == 0)
1009 fputc(uch, pOut);
1010 else if (abCharCat[uch] != 1)
1011 fprintf(pOut, "\\x%02x", (unsigned)uch);
1012 else
1013 {
1014 fputc('\\', pOut);
1015 fputc(uch, pOut);
1016 }
1017 fputc('\'', pOut);
1018 fputc(',', pOut);
1019 }
1020 fputc('\n', pOut);
1021 }
1022 }
1023
1024 fprintf(pOut,
1025 "};\n"
1026 "AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n",
1027 pszBaseName, (unsigned)pThis->cchStrTab);
1028
1029#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1030 /*
1031 * Write the compression dictionary.
1032 */
1033 fprintf(pOut,
1034 "static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n"
1035 "{\n",
1036 pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict));
1037 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
1038 fprintf(pOut, " { %#08x, %#04x }, // %6lu - %s\n",
1039 pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString,
1040 (unsigned long)pThis->auCompDictFreq[i], pThis->aCompDict[i].pszString);
1041 fprintf(pOut, "};\n\n");
1042#endif
1043
1044
1045 /*
1046 * Write the string table data structure.
1047 */
1048 fprintf(pOut,
1049 "%sconst RTBLDPROGSTRTAB %s%s = \n"
1050 "{\n"
1051 " /*.pchStrTab = */ &g_achStrTab%s[0],\n"
1052 " /*.cchStrTab = */ sizeof(g_achStrTab%s),\n"
1053 ,
1054 pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName);
1055#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1056 fprintf(pOut,
1057 " /*.cCompDict = */ %u,\n"
1058 " /*.paCompDict = */ &g_aCompDict%s[0]\n"
1059 "};\n"
1060 , (unsigned)RT_ELEMENTS(pThis->aCompDict), pszBaseName);
1061#else
1062 fprintf(pOut,
1063 " /*.cCompDict = */ 0,\n"
1064 " /*.paCompDict = */ NULL\n"
1065 "};\n");
1066#endif
1067}
1068
1069#if RT_CLANG_PREREQ(4, 0) || RT_GNUC_PREREQ(4, 6)
1070# pragma GCC diagnostic pop
1071#endif
1072
1073#endif /* __cplusplus && IN_RING3 */
1074
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette