VirtualBox

source: vbox/trunk/include/iprt/bldprog-strtab-template.cpp.h@ 63848

Last change on this file since 63848 was 62450, checked in by vboxsync, 8 years ago

bldprogs: MSC level 4 warning fixes.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 31.8 KB
Line 
1/* $Id: bldprog-strtab-template.cpp.h 62450 2016-07-22 15:04:27Z vboxsync $ */
2/** @file
3 * IPRT - Build Program - String Table Generator.
4 */
5
6/*
7 * Copyright (C) 2006-2016 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*
29 * (Avoid include sanity checks as this is ring-3 only, C++ code.)
30 */
31#if defined(__cplusplus) && defined(IN_RING3)
32
33
34/*********************************************************************************************************************************
35* Defined Constants And Macros *
36*********************************************************************************************************************************/
37/** @def BLDPROG_STRTAB_MAX_STRLEN
38 * The max length of strings in the table. */
39#if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING)
40# define BLDPROG_STRTAB_MAX_STRLEN 256
41#endif
42
43/** @def BLDPROG_STRTAB_WITH_COMPRESSION
44 * Enables very simple string compression.
45 */
46#if defined(DOXYGEN_RUNNING)
47# define BLDPROG_STRTAB_WITH_COMPRESSION
48#endif
49
50/** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS
51 * Modifies the string compression to look for camel case words.
52 */
53#if defined(DOXYGEN_RUNNING)
54# define BLDPROG_STRTAB_WITH_CAMEL_WORDS
55#endif
56
57/** @def BLDPROG_STRTAB_PURE_ASCII
58 * String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this
59 * is defined. Otherwise, the compression code will require IPRT to link.
60 */
61#if defined(DOXYGEN_RUNNING)
62# define BLDPROG_STRTAB_PURE_ASCII
63#endif
64
65
66
67/*********************************************************************************************************************************
68* Header Files *
69*********************************************************************************************************************************/
70#include <iprt/stdarg.h>
71#include <iprt/ctype.h>
72#include <stdlib.h>
73#include <stdio.h>
74#include <string.h>
75
76#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
77# include <algorithm>
78# include <map>
79# include <string>
80# include <vector>
81
82typedef std::map<std::string, size_t> BLDPROGWORDFREQMAP;
83typedef BLDPROGWORDFREQMAP::value_type BLDPROGWORDFREQPAIR;
84
85#endif
86
87#include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */
88
89
90/*********************************************************************************************************************************
91* Structures and Typedefs *
92*********************************************************************************************************************************/
93/**
94 * Build table string.
95 */
96typedef struct BLDPROGSTRING
97{
98 /** The string.
99 * @note This may be modified or replaced (allocated from heap) when
100 * compressing the string table. */
101 char *pszString;
102 /** The string hash value. */
103 uint32_t uHash;
104 /** The strint table offset. */
105 uint32_t offStrTab;
106 /** The string length. */
107 size_t cchString;
108 /** Pointer to the next string reference (same string table entry). */
109 struct BLDPROGSTRING *pNextRef;
110 /** Pointer to the next string with the same hash value (collision). */
111 struct BLDPROGSTRING *pNextCollision;
112
113} BLDPROGSTRING;
114/** Pointer to a string table string. */
115typedef BLDPROGSTRING *PBLDPROGSTRING;
116
117
118
119/** String table data. */
120typedef struct BLDPROGSTRTAB
121{
122 /** The size of g_papStrHash. */
123 size_t cStrHash;
124 /** String hash table. */
125 PBLDPROGSTRING *papStrHash;
126 /** Duplicate strings found by AddString. */
127 size_t cDuplicateStrings;
128 /** Total length of the unique strings (no terminators). */
129 size_t cchUniqueStrings;
130 /** Number of unique strings after AddString. */
131 size_t cUniqueStrings;
132 /** Number of collisions. */
133 size_t cCollisions;
134
135 /** Number of entries in apSortedStrings. */
136 size_t cSortedStrings;
137 /** The sorted string table. */
138 PBLDPROGSTRING *papSortedStrings;
139
140#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
141 /** The 127 words we've picked to be indexed by reference. */
142 BLDPROGSTRING aCompDict[127];
143 /** Incoming strings pending compression. */
144 PBLDPROGSTRING *papPendingStrings;
145 /** Current number of entries in papStrPending. */
146 size_t cPendingStrings;
147 /** The allocated size of papPendingStrings. */
148 size_t cMaxPendingStrings;
149 /** Work frequency map.
150 * @todo rewrite in plain C. */
151 BLDPROGWORDFREQMAP Frequencies;
152#endif
153
154 /** The string table. */
155 char *pachStrTab;
156 /** The actual string table size. */
157 size_t cchStrTab;
158} BLDPROGSTRTAB;
159typedef BLDPROGSTRTAB *PBLDPROGSTRTAB;
160
161
162/**
163 * Initializes the strint table compiler.
164 *
165 * @returns success indicator (out of memory if false).
166 * @param pThis The strint table compiler instance.
167 * @param cMaxStrings The max number of strings we'll be adding.
168 */
169static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings)
170{
171 pThis->cStrHash = 0;
172 pThis->papStrHash = NULL;
173 pThis->cDuplicateStrings = 0;
174 pThis->cchUniqueStrings = 0;
175 pThis->cUniqueStrings = 0;
176 pThis->cCollisions = 0;
177 pThis->cSortedStrings = 0;
178 pThis->papSortedStrings = NULL;
179 pThis->pachStrTab = NULL;
180 pThis->cchStrTab = 0;
181#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
182 memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict));
183 pThis->papPendingStrings = NULL;
184 pThis->cPendingStrings = 0;
185 pThis->cMaxPendingStrings = cMaxStrings;
186#endif
187
188 /*
189 * Allocate a hash table double the size of all strings (to avoid too
190 * many collisions). Add all strings to it, finding duplicates in the
191 * process.
192 */
193#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
194 cMaxStrings += RT_ELEMENTS(pThis->aCompDict);
195#endif
196 cMaxStrings *= 2;
197 pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings);
198 if (pThis->papStrHash)
199 {
200 pThis->cStrHash = cMaxStrings;
201#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
202 pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings);
203 if (pThis->papPendingStrings)
204#endif
205 return true;
206
207#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
208 free(pThis->papStrHash);
209 pThis->papStrHash = NULL;
210#endif
211 }
212 return false;
213}
214
215
216#if 0 /* unused */
217static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis)
218{
219 free(pThis->papStrHash);
220 free(pThis->papSortedStrings);
221 free(pThis->pachStrTab);
222# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
223 free(pThis->papPendingStrings);
224# endif
225 memset(pThis, 0, sizeof(*pThis));
226}
227#endif
228
229
230#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
231DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc)
232{
233 /*
234 * Skip leading word separators.
235 */
236# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
237 while ( ch == ' '
238 || ch == '-'
239 || ch == '+'
240 || ch == '_')
241# else
242 while (ch == ' ')
243# endif
244 ch = *++pszSrc;
245 if (ch)
246 {
247 /*
248 * Find end of word.
249 */
250 size_t cchWord = 1;
251# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
252 char chPrev = ch;
253 while ( (ch = pszSrc[cchWord]) != ' '
254 && ch != '\0'
255 && ch != '-'
256 && ch != '+'
257 && ch != '_'
258 && ( ch == chPrev
259 || !RT_C_IS_UPPER(ch)
260 || RT_C_IS_UPPER(chPrev)) )
261 {
262 chPrev = ch;
263 cchWord++;
264 }
265# else
266 while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0')
267 cchWord++;
268# endif
269 *ppszSrc = pszSrc;
270 return cchWord;
271 }
272
273 *ppszSrc = pszSrc;
274 return 0;
275}
276
277/**
278 * Analyzes a string.
279 *
280 * @param pThis The strint table compiler instance.
281 * @param pStr The string to analyze.
282 */
283static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
284{
285 const char *psz = pStr->pszString;
286
287 /*
288 * For now we just consider words.
289 */
290 char ch;
291 while ((ch = *psz) != '\0')
292 {
293 size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz);
294 if (cchWord > 1)
295 {
296 std::string strWord(psz, cchWord);
297 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
298 if (it != pThis->Frequencies.end())
299 it->second += cchWord - 1;
300 else
301 pThis->Frequencies[strWord] = 0;
302
303 /** @todo could gain hits by including the space after the word, but that
304 * has the same accounting problems as the two words scenario below. */
305
306# if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */
307 /* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */
308 if (ch == ' ')
309 {
310 ch = psz[++cchWord];
311 if (ch != ' ' && ch != '\0')
312 {
313 size_t const cchSaved = cchWord;
314
315 do
316 cchWord++;
317 while ((ch = psz[cchWord]) != ' ' && ch != '\0');
318
319 strWord = std::string(psz, cchWord);
320 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
321 if (it != pThis->Frequencies.end())
322 it->second += cchWord - 1;
323 else
324 pThis->Frequencies[strWord] = 0;
325
326 cchWord = cchSaved;
327 }
328 }
329# endif
330 }
331 else if (!cchWord)
332 break;
333
334 /* Advance. */
335 psz += cchWord;
336 }
337 pStr->cchString = psz - pStr->pszString;
338 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
339 {
340 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
341 abort();
342 }
343}
344#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
345
346
347/**
348 * Adds a string to the hash table.
349 * @param pThis The strint table compiler instance.
350 * @param pStr The string.
351 */
352static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
353{
354 pStr->pNextRef = NULL;
355 pStr->pNextCollision = NULL;
356 pStr->offStrTab = 0;
357 pStr->uHash = sdbm(pStr->pszString, &pStr->cchString);
358 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
359 {
360 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
361 exit(RTEXITCODE_FAILURE);
362 }
363
364 size_t idxHash = pStr->uHash % pThis->cStrHash;
365 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
366 if (!pCur)
367 pThis->papStrHash[idxHash] = pStr;
368 else
369 {
370 /* Look for matching string. */
371 do
372 {
373 if ( pCur->uHash == pStr->uHash
374 && pCur->cchString == pStr->cchString
375 && memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0)
376 {
377 pStr->pNextRef = pCur->pNextRef;
378 pCur->pNextRef = pStr;
379 pThis->cDuplicateStrings++;
380 return;
381 }
382 pCur = pCur->pNextCollision;
383 } while (pCur != NULL);
384
385 /* No matching string, insert. */
386 pThis->cCollisions++;
387 pStr->pNextCollision = pThis->papStrHash[idxHash];
388 pThis->papStrHash[idxHash] = pStr;
389 }
390
391 pThis->cUniqueStrings++;
392 pThis->cchUniqueStrings += pStr->cchString;
393}
394
395
396
397/**
398 * Adds a string to the string table.
399 *
400 * @param pThis The strint table compiler instance.
401 * @param pStr The string.
402 */
403static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
404{
405#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
406 bldProgStrTab_compressorAnalyzeString(pThis, pStr);
407 if (pThis->cPendingStrings < pThis->cMaxPendingStrings)
408 pThis->papPendingStrings[pThis->cPendingStrings++] = pStr;
409 else
410 abort();
411#else
412 bldProgStrTab_AddStringToHashTab(pThis, pStr);
413#endif
414}
415
416
417
418#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
419
420/**
421 * Replaces the dictionary words and escapes non-ascii chars in a string.
422 *
423 * @param pThis The strint table compiler instance.
424 * @param pString The string to fixup.
425 */
426static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
427{
428 char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2];
429 char *pszDst = szNew;
430 const char *pszSrc = pStr->pszString;
431 const char *pszSrcEnd = pszSrc + pStr->cchString;
432
433 char ch;
434 while ((ch = *pszSrc) != '\0')
435 {
436 size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc);
437 if (!cchWord)
438 break;
439
440 /* Check for g_aWord matches. */
441 size_t cchMax = pszSrcEnd - pszSrc;
442 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
443 {
444 size_t cchLen = pThis->aCompDict[i].cchString;
445 if ( cchLen >= cchWord
446 && cchLen <= cchMax
447 && memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0)
448 {
449 *pszDst++ = (unsigned char)(0x80 | i);
450 pszSrc += cchLen;
451 cchWord = 0;
452 break;
453 }
454 }
455
456 if (cchWord)
457 {
458 /* Copy the current word. */
459 ch = *pszSrc;
460 do
461 {
462 if (!((unsigned char)ch & 0x80))
463 {
464 *pszDst++ = ch;
465 pszSrc++;
466 }
467 else
468 {
469#ifdef BLDPROG_STRTAB_PURE_ASCII
470 fprintf(stderr, "error: unexpected char value %#x\n", ch);
471 return false;
472#else
473 RTUNICP uc;
474 int rc = RTStrGetCpEx(&pszSrc, &uc);
475 if (RT_SUCCESS(rc))
476 {
477 *pszDst++ = (unsigned char)0xff; /* escape single code point. */
478 pszDst = RTStrPutCp(pszDst, uc);
479 }
480 else
481 {
482 fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc);
483 return false;
484 }
485#endif
486 }
487 } while ((ch = *pszSrc) != '\0' && ch != ' ');
488 }
489 }
490
491 /* Just terminate it now. */
492 *pszDst = '\0';
493
494 /*
495 * Update the string.
496 */
497 size_t cchNew = pszDst - &szNew[0];
498 if (cchNew > pStr->cchString)
499 {
500 pStr->pszString = (char *)malloc(cchNew + 1);
501 if (!pStr->pszString)
502 {
503 fprintf(stderr, "Out of memory!\n");
504 return false;
505 }
506 }
507 memcpy(pStr->pszString, szNew, cchNew + 1);
508 pStr->cchString = cchNew;
509
510 return true;
511}
512
513
514/**
515 * For sorting the frequency fidning in descending order.
516 *
517 * Comparison operators are put outside to make older gcc versions (like 4.1.1
518 * on lnx64-rel) happy.
519 */
520class WordFreqSortEntry
521{
522public:
523 BLDPROGWORDFREQPAIR const *m_pPair;
524
525public:
526 WordFreqSortEntry(BLDPROGWORDFREQPAIR const *pPair) : m_pPair(pPair) {}
527};
528
529bool operator == (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
530{
531 return rLeft.m_pPair->second == rRight.m_pPair->second;
532}
533
534bool operator < (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
535{
536 return rLeft.m_pPair->second > rRight.m_pPair->second;
537}
538
539
540/**
541 * Compresses the vendor and product strings.
542 *
543 * This is very very simple (a lot less work than the string table for
544 * instance).
545 */
546static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose)
547{
548 /*
549 * Sort the frequency analyzis result and pick the top 127 ones.
550 */
551 std::vector<WordFreqSortEntry> SortMap;
552 for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it)
553 {
554 BLDPROGWORDFREQPAIR const &rPair = *it;
555 SortMap.push_back(WordFreqSortEntry(&rPair));
556 }
557
558 sort(SortMap.begin(), SortMap.end());
559
560 size_t cb = 0;
561 size_t i = 0;
562 for (std::vector<WordFreqSortEntry>::iterator it = SortMap.begin();
563 it != SortMap.end() && i < RT_ELEMENTS(pThis->aCompDict);
564 ++it, i++)
565 {
566 pThis->aCompDict[i].cchString = it->m_pPair->first.length();
567 pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1);
568 if (pThis->aCompDict[i].pszString)
569 memcpy(pThis->aCompDict[i].pszString, it->m_pPair->first.c_str(), pThis->aCompDict[i].cchString + 1);
570 else
571 return false;
572 cb += it->m_pPair->second;
573 }
574
575 if (fVerbose)
576 printf("debug: Estimated string compression saving: %u bytes\n", (unsigned)cb);
577
578 /*
579 * Rework the strings.
580 */
581 size_t cchOld = 0;
582 size_t cchOldMax = 0;
583 size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN;
584 size_t cchNew = 0;
585 size_t cchNewMax = 0;
586 size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN;
587 i = pThis->cPendingStrings;
588 while (i-- > 0)
589 {
590 PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i];
591 cchOld += pCurStr->cchString;
592 if (pCurStr->cchString > cchOldMax)
593 cchOldMax = pCurStr->cchString;
594 if (pCurStr->cchString < cchOldMin)
595 cchOldMin = pCurStr->cchString;
596
597 if (!bldProgStrTab_compressorFixupString(pThis, pCurStr))
598 return false;
599
600 cchNew += pCurStr->cchString;
601 if (pCurStr->cchString > cchNewMax)
602 cchNewMax = pCurStr->cchString;
603 if (pCurStr->cchString < cchNewMin)
604 cchNewMin = pCurStr->cchString;
605
606 bldProgStrTab_AddStringToHashTab(pThis, pCurStr);
607 }
608
609 /*
610 * Do debug stats.
611 */
612 if (fVerbose)
613 {
614 for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
615 cchNew += pThis->aCompDict[i].cchString + 1;
616
617 printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew);
618 if (cchNew < cchOld)
619 printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld));
620 else
621 printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew));
622 printf("debug: Original string lengths: average %u; min %u; max %u\n",
623 (unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax);
624 printf("debug: Compressed string lengths: average %u; min %u; max %u\n",
625 (unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax);
626 }
627
628 return true;
629}
630
631#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
632
633
634/**
635 * Inserts a string into g_apUniqueStrings.
636 * @param pThis The strint table compiler instance.
637 * @param pStr The string.
638 */
639static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
640{
641 size_t iIdx;
642 size_t iEnd = pThis->cSortedStrings;
643 if (iEnd)
644 {
645 size_t iStart = 0;
646 for (;;)
647 {
648 iIdx = iStart + (iEnd - iStart) / 2;
649 if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString)
650 {
651 if (iIdx <= iStart)
652 break;
653 iEnd = iIdx;
654 }
655 else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString)
656 {
657 if (++iIdx >= iEnd)
658 break;
659 iStart = iIdx;
660 }
661 else
662 break;
663 }
664
665 if (iIdx != pThis->cSortedStrings)
666 memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx],
667 (pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx]));
668 }
669 else
670 iIdx = 0;
671
672 pThis->papSortedStrings[iIdx] = pStr;
673 pThis->cSortedStrings++;
674}
675
676
677/**
678 * Compiles the string table after the string has been added.
679 *
680 * This will save space by dropping string terminators, eliminating duplicates
681 * and try find strings that are sub-strings of others.
682 *
683 * Will initialize the StrRef of all StrTabString instances.
684 *
685 * @returns success indicator (error flagged).
686 * @param pThis The strint table compiler instance.
687 */
688static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose)
689{
690#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
691 /*
692 * Do the compression and add all the compressed strings to the table.
693 */
694 if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose))
695 return false;
696
697 /*
698 * Add the dictionary strings.
699 */
700 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
701 bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]);
702#endif
703 if (fVerbose)
704 printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n",
705 (unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings,
706 (unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions);
707
708 /*
709 * Create papSortedStrings from the hash table. The table is sorted by
710 * string length, with the longer strings first.
711 */
712 pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings);
713 if (!pThis->papSortedStrings)
714 return false;
715 pThis->cSortedStrings = 0;
716 size_t idxHash = pThis->cStrHash;
717 while (idxHash-- > 0)
718 {
719 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
720 if (pCur)
721 {
722 do
723 {
724 bldProgStrTab_InsertUniqueString(pThis, pCur);
725 pCur = pCur->pNextCollision;
726 } while (pCur);
727 }
728 }
729
730 /*
731 * Create the actual string table.
732 */
733 pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1);
734 if (!pThis->pachStrTab)
735 return false;
736 pThis->cchStrTab = 0;
737 for (size_t i = 0; i < pThis->cSortedStrings; i++)
738 {
739 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
740 const char * const pszCur = pCur->pszString;
741 size_t const cchCur = pCur->cchString;
742 size_t offStrTab = pThis->cchStrTab;
743
744 /*
745 * See if the string is a substring already found in the string table.
746 * Excluding the zero terminator increases the chances for this.
747 */
748 size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0;
749 const char *pchLeft = pThis->pachStrTab;
750 char const chFirst = *pszCur;
751 while (cchLeft > 0)
752 {
753 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
754 if (!pchCandidate)
755 break;
756 if (memcmp(pchCandidate, pszCur, cchCur) == 0)
757 {
758 offStrTab = pchCandidate - pThis->pachStrTab;
759 break;
760 }
761
762 cchLeft -= pchCandidate + 1 - pchLeft;
763 pchLeft = pchCandidate + 1;
764 }
765
766 if (offStrTab == pThis->cchStrTab)
767 {
768 /*
769 * See if the start of the string overlaps the end of the string table.
770 */
771 if (pThis->cchStrTab && cchCur > 1)
772 {
773 cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1);
774 pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft];
775 while (cchLeft > 0)
776 {
777 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
778 if (!pchCandidate)
779 break;
780 cchLeft -= pchCandidate - pchLeft;
781 pchLeft = pchCandidate;
782 if (memcmp(pchLeft, pszCur, cchLeft) == 0)
783 {
784 size_t cchToCopy = cchCur - cchLeft;
785 memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy);
786 pThis->cchStrTab += cchToCopy;
787 offStrTab = pchCandidate - pThis->pachStrTab;
788 break;
789 }
790 cchLeft--;
791 pchLeft++;
792 }
793 }
794
795 /*
796 * If we didn't have any luck above, just append the string.
797 */
798 if (offStrTab == pThis->cchStrTab)
799 {
800 memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur);
801 pThis->cchStrTab += cchCur;
802 }
803 }
804
805 /*
806 * Set the string table offset for all the references to this string.
807 */
808 do
809 {
810 pCur->offStrTab = (uint32_t)offStrTab;
811 pCur = pCur->pNextRef;
812 } while (pCur != NULL);
813 }
814
815 if (fVerbose)
816 printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab);
817 return true;
818}
819
820
821#ifdef RT_STRICT
822/**
823 * Sanity checks a string table string.
824 * @param pThis The strint table compiler instance.
825 * @param pStr The string to check.
826 */
827static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
828{
829 if (pStr->cchString != strlen(pStr->pszString))
830 abort();
831 if (pStr->offStrTab >= pThis->cchStrTab)
832 abort();
833 if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab)
834 abort();
835 if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0)
836 abort();
837}
838#endif
839
840
841/**
842 * Output the string table string in C string litteral fashion.
843 *
844 * @param pThis The strint table instance.
845 * @param pString The string to print.
846 * @param pOut The output stream.
847 */
848static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut)
849{
850 const unsigned char *psz = (const unsigned char *)pString->pszString;
851 unsigned char uch;
852 while ((uch = *psz++) != '\0')
853 {
854 if (!(uch & 0x80))
855 {
856 if (uch != '\'' && uch != '\\')
857 fputc((char)uch, pOut);
858 else
859 {
860 fputc('\\', pOut);
861 fputc((char)uch, pOut);
862 }
863 }
864#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
865 else if (uch != 0xff)
866 fputs(pThis->aCompDict[uch & 0x7f].pszString, pOut);
867 else
868 {
869# ifdef BLDPROG_STRTAB_PURE_ASCII
870 abort();
871 fprintf(pOut, "\\x%02x", (unsigned)uch);
872# else
873 RTUNICP uc = RTStrGetCp((const char *)psz);
874 psz += RTStrCpSize(uc);
875 fprintf(pOut, "\\u%04x", uc);
876# endif
877 }
878#else
879 else
880 fprintf(pOut, "\\x%02", (unsigned)uch);
881#endif
882 }
883}
884
885
886/**
887 * Writes the string table code to the output stream.
888 *
889 * @param pThis The strint table compiler instance.
890 * @param pOut The output stream.
891 * @param pszScope The scoping ("static " or empty string),
892 * including trailing space.
893 * @param pszPrefix The variable prefix. Typically "g_" for C programs,
894 * whereas C++ classes normally use "class::s_g".
895 * @param pszBaseName The base name for the variables. The user
896 * accessible variable will end up as just the
897 * prefix and basename concatenated.
898 */
899static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut,
900 const char *pszScope, const char *pszPrefix, const char *pszBaseName)
901{
902#ifdef RT_STRICT
903 /*
904 * Do some quick sanity checks while we're here.
905 */
906# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
907 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
908 BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]);
909# endif
910#endif
911
912 /*
913 * Create a table for speeding up the character categorization.
914 */
915 uint8_t abCharCat[256];
916 memset(abCharCat, 0, sizeof(abCharCat));
917 abCharCat[(unsigned char)'\\'] = 1;
918 abCharCat[(unsigned char)'\''] = 1;
919 for (unsigned i = 0; i < 0x20; i++)
920 abCharCat[i] = 2;
921 for (unsigned i = 0x7f; i < 0x100; i++)
922 abCharCat[i] = 2;
923
924 /*
925 * We follow the sorted string table, one string per line.
926 */
927 fprintf(pOut,
928 "#include <iprt/bldprog-strtab.h>\n"
929 "\n"
930 "static const char g_achStrTab%s[] =\n"
931 "{\n",
932 pszBaseName);
933
934 uint32_t off = 0;
935 for (uint32_t i = 0; i < pThis->cSortedStrings; i++)
936 {
937 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
938 uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString;
939 if (offEnd > off)
940 {
941 /* Comment with a uncompressed and more readable version of the string. */
942 fprintf(pOut, off == pCur->offStrTab ? "/* 0x%05x = \"" : "/* 0X%05x = \"", off);
943 BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut);
944 fputs("\" */\n", pOut);
945
946 /* Must use char by char here or we may trigger the max string
947 length limit in the compiler, */
948 fputs(" ", pOut);
949 for (; off < offEnd; off++)
950 {
951 unsigned char uch = pThis->pachStrTab[off];
952 fputc('\'', pOut);
953 if (abCharCat[uch] == 0)
954 fputc(uch, pOut);
955 else if (abCharCat[uch] != 1)
956 fprintf(pOut, "\\x%02x", (unsigned)uch);
957 else
958 {
959 fputc('\\', pOut);
960 fputc(uch, pOut);
961 }
962 fputc('\'', pOut);
963 fputc(',', pOut);
964 }
965 fputc('\n', pOut);
966 }
967 }
968
969 fprintf(pOut,
970 "};\n"
971 "AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n",
972 pszBaseName, (unsigned)pThis->cchStrTab);
973
974#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
975 /*
976 * Write the compression dictionary.
977 */
978 fprintf(pOut,
979 "static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n"
980 "{\n",
981 pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict));
982 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
983 fprintf(pOut, " { %#08x, %#04x }, // %s\n",
984 pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString, pThis->aCompDict[i].pszString);
985 fprintf(pOut, "};\n\n");
986#endif
987
988
989 /*
990 * Write the string table data structure.
991 */
992 fprintf(pOut,
993 "%sconst RTBLDPROGSTRTAB %s%s = \n"
994 "{\n"
995 " /*.pchStrTab = */ &g_achStrTab%s[0],\n"
996 " /*.cchStrTab = */ sizeof(g_achStrTab%s),\n"
997 ,
998 pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName);
999#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1000 fprintf(pOut,
1001 " /*.cCompDict = */ %u,\n"
1002 " /*.paCompDict = */ &g_aCompDict%s[0]\n"
1003 "};\n"
1004 , (unsigned)RT_ELEMENTS(pThis->aCompDict), pszBaseName);
1005#else
1006 fprintf(pOut,
1007 " /*.cCompDict = */ 0,\n"
1008 " /*.paCompDict = */ NULL\n"
1009 "};\n");
1010#endif
1011}
1012
1013#endif /* __cplusplus && IN_RING3 */
1014
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette