VirtualBox

source: vbox/trunk/include/iprt/bldprog-strtab-template.cpp.h@ 61649

Last change on this file since 61649 was 59762, checked in by vboxsync, 9 years ago

include/iprt: typo and gcc parameter warning

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 31.8 KB
Line 
1/* $Id: bldprog-strtab-template.cpp.h 59762 2016-02-20 16:51:28Z vboxsync $ */
2/** @file
3 * IPRT - Build Program - String Table Generator.
4 */
5
6/*
7 * Copyright (C) 2006-2016 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*
29 * (Avoid include sanity checks as this is ring-3 only, C++ code.)
30 */
31#if defined(__cplusplus) && defined(IN_RING3)
32
33
34/*********************************************************************************************************************************
35* Defined Constants And Macros *
36*********************************************************************************************************************************/
37/** @def BLDPROG_STRTAB_MAX_STRLEN
38 * The max length of strings in the table. */
39#if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING)
40# define BLDPROG_STRTAB_MAX_STRLEN 256
41#endif
42
43/** @def BLDPROG_STRTAB_WITH_COMPRESSION
44 * Enables very simple string compression.
45 */
46#if defined(DOXYGEN_RUNNING)
47# define BLDPROG_STRTAB_WITH_COMPRESSION
48#endif
49
50/** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS
51 * Modifies the string compression to look for camel case words.
52 */
53#if defined(DOXYGEN_RUNNING)
54# define BLDPROG_STRTAB_WITH_CAMEL_WORDS
55#endif
56
57/** @def BLDPROG_STRTAB_PURE_ASCII
58 * String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this
59 * is defined. Otherwise, the compression code will require IPRT to link.
60 */
61#if defined(DOXYGEN_RUNNING)
62# define BLDPROG_STRTAB_PURE_ASCII
63#endif
64
65
66
67/*********************************************************************************************************************************
68* Header Files *
69*********************************************************************************************************************************/
70#include <iprt/stdarg.h>
71#include <iprt/ctype.h>
72#include <stdlib.h>
73#include <stdio.h>
74#include <string.h>
75
76#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
77# include <algorithm>
78# include <map>
79# include <string>
80# include <vector>
81
82typedef std::map<std::string, size_t> BLDPROGWORDFREQMAP;
83typedef BLDPROGWORDFREQMAP::value_type BLDPROGWORDFREQPAIR;
84
85#endif
86
87#include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */
88
89
90/*********************************************************************************************************************************
91* Structures and Typedefs *
92*********************************************************************************************************************************/
93/**
94 * Build table string.
95 */
96typedef struct BLDPROGSTRING
97{
98 /** The string.
99 * @note This may be modified or replaced (allocated from heap) when
100 * compressing the string table. */
101 char *pszString;
102 /** The string hash value. */
103 uint32_t uHash;
104 /** The strint table offset. */
105 uint32_t offStrTab;
106 /** The string length. */
107 size_t cchString;
108 /** Pointer to the next string reference (same string table entry). */
109 struct BLDPROGSTRING *pNextRef;
110 /** Pointer to the next string with the same hash value (collision). */
111 struct BLDPROGSTRING *pNextCollision;
112
113} BLDPROGSTRING;
114/** Pointer to a string table string. */
115typedef BLDPROGSTRING *PBLDPROGSTRING;
116
117
118
119/** String table data. */
120typedef struct BLDPROGSTRTAB
121{
122 /** The size of g_papStrHash. */
123 size_t cStrHash;
124 /** String hash table. */
125 PBLDPROGSTRING *papStrHash;
126 /** Duplicate strings found by AddString. */
127 size_t cDuplicateStrings;
128 /** Total length of the unique strings (no terminators). */
129 size_t cchUniqueStrings;
130 /** Number of unique strings after AddString. */
131 size_t cUniqueStrings;
132 /** Number of collisions. */
133 size_t cCollisions;
134
135 /** Number of entries in apSortedStrings. */
136 size_t cSortedStrings;
137 /** The sorted string table. */
138 PBLDPROGSTRING *papSortedStrings;
139
140#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
141 /** The 127 words we've picked to be indexed by reference. */
142 BLDPROGSTRING aCompDict[127];
143 /** Incoming strings pending compression. */
144 PBLDPROGSTRING *papPendingStrings;
145 /** Current number of entries in papStrPending. */
146 size_t cPendingStrings;
147 /** The allocated size of papPendingStrings. */
148 size_t cMaxPendingStrings;
149 /** Work frequency map.
150 * @todo rewrite in plain C. */
151 BLDPROGWORDFREQMAP Frequencies;
152#endif
153
154 /** The string table. */
155 char *pachStrTab;
156 /** The actual string table size. */
157 size_t cchStrTab;
158} BLDPROGSTRTAB;
159typedef BLDPROGSTRTAB *PBLDPROGSTRTAB;
160
161
162/**
163 * Initializes the strint table compiler.
164 *
165 * @returns success indicator (out of memory if false).
166 * @param pThis The strint table compiler instance.
167 * @param cMaxStrings The max number of strings we'll be adding.
168 */
169static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings)
170{
171 pThis->cStrHash = 0;
172 pThis->papStrHash = NULL;
173 pThis->cDuplicateStrings = 0;
174 pThis->cchUniqueStrings = 0;
175 pThis->cUniqueStrings = 0;
176 pThis->cCollisions = 0;
177 pThis->cSortedStrings = 0;
178 pThis->papSortedStrings = NULL;
179 pThis->pachStrTab = NULL;
180 pThis->cchStrTab = 0;
181#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
182 memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict));
183 pThis->papPendingStrings = NULL;
184 pThis->cPendingStrings = 0;
185 pThis->cMaxPendingStrings = cMaxStrings;
186#endif
187
188 /*
189 * Allocate a hash table double the size of all strings (to avoid too
190 * many collisions). Add all strings to it, finding duplicates in the
191 * process.
192 */
193#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
194 cMaxStrings += RT_ELEMENTS(pThis->aCompDict);
195#endif
196 cMaxStrings *= 2;
197 pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings);
198 if (pThis->papStrHash)
199 {
200 pThis->cStrHash = cMaxStrings;
201#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
202 pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings);
203 if (pThis->papPendingStrings)
204#endif
205 return true;
206
207#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
208 free(pThis->papStrHash);
209 pThis->papStrHash = NULL;
210#endif
211 }
212 return false;
213}
214
215
216static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis)
217{
218 free(pThis->papStrHash);
219 free(pThis->papSortedStrings);
220 free(pThis->pachStrTab);
221#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
222 free(pThis->papPendingStrings);
223#endif
224 memset(pThis, 0, sizeof(*pThis));
225}
226
227
228#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
229DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc)
230{
231 /*
232 * Skip leading word separators.
233 */
234# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
235 while ( ch == ' '
236 || ch == '-'
237 || ch == '+'
238 || ch == '_')
239# else
240 while (ch == ' ')
241# endif
242 ch = *++pszSrc;
243 if (ch)
244 {
245 /*
246 * Find end of word.
247 */
248 size_t cchWord = 1;
249# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
250 char chPrev = ch;
251 while ( (ch = pszSrc[cchWord]) != ' '
252 && ch != '\0'
253 && ch != '-'
254 && ch != '+'
255 && ch != '_'
256 && ( ch == chPrev
257 || !RT_C_IS_UPPER(ch)
258 || RT_C_IS_UPPER(chPrev)) )
259 {
260 chPrev = ch;
261 cchWord++;
262 }
263# else
264 while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0')
265 cchWord++;
266# endif
267 *ppszSrc = pszSrc;
268 return cchWord;
269 }
270
271 *ppszSrc = pszSrc;
272 return 0;
273}
274
275/**
276 * Analyzes a string.
277 *
278 * @param pThis The strint table compiler instance.
279 * @param pStr The string to analyze.
280 */
281static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
282{
283 const char *psz = pStr->pszString;
284
285 /*
286 * For now we just consider words.
287 */
288 char ch;
289 while ((ch = *psz) != '\0')
290 {
291 size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz);
292 if (cchWord > 1)
293 {
294 std::string strWord(psz, cchWord);
295 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
296 if (it != pThis->Frequencies.end())
297 it->second += cchWord - 1;
298 else
299 pThis->Frequencies[strWord] = 0;
300
301 /** @todo could gain hits by including the space after the word, but that
302 * has the same accounting problems as the two words scenario below. */
303
304# if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */
305 /* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */
306 if (ch == ' ')
307 {
308 ch = psz[++cchWord];
309 if (ch != ' ' && ch != '\0')
310 {
311 size_t const cchSaved = cchWord;
312
313 do
314 cchWord++;
315 while ((ch = psz[cchWord]) != ' ' && ch != '\0');
316
317 strWord = std::string(psz, cchWord);
318 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
319 if (it != pThis->Frequencies.end())
320 it->second += cchWord - 1;
321 else
322 pThis->Frequencies[strWord] = 0;
323
324 cchWord = cchSaved;
325 }
326 }
327# endif
328 }
329 else if (!cchWord)
330 break;
331
332 /* Advance. */
333 psz += cchWord;
334 }
335 pStr->cchString = psz - pStr->pszString;
336 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
337 {
338 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
339 abort();
340 }
341}
342#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
343
344
345/**
346 * Adds a string to the hash table.
347 * @param pThis The strint table compiler instance.
348 * @param pStr The string.
349 */
350static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
351{
352 pStr->pNextRef = NULL;
353 pStr->pNextCollision = NULL;
354 pStr->offStrTab = 0;
355 pStr->uHash = sdbm(pStr->pszString, &pStr->cchString);
356 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
357 {
358 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
359 exit(RTEXITCODE_FAILURE);
360 }
361
362 size_t idxHash = pStr->uHash % pThis->cStrHash;
363 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
364 if (!pCur)
365 pThis->papStrHash[idxHash] = pStr;
366 else
367 {
368 /* Look for matching string. */
369 do
370 {
371 if ( pCur->uHash == pStr->uHash
372 && pCur->cchString == pStr->cchString
373 && memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0)
374 {
375 pStr->pNextRef = pCur->pNextRef;
376 pCur->pNextRef = pStr;
377 pThis->cDuplicateStrings++;
378 return;
379 }
380 pCur = pCur->pNextCollision;
381 } while (pCur != NULL);
382
383 /* No matching string, insert. */
384 pThis->cCollisions++;
385 pStr->pNextCollision = pThis->papStrHash[idxHash];
386 pThis->papStrHash[idxHash] = pStr;
387 }
388
389 pThis->cUniqueStrings++;
390 pThis->cchUniqueStrings += pStr->cchString;
391}
392
393
394
395/**
396 * Adds a string to the string table.
397 *
398 * @param pThis The strint table compiler instance.
399 * @param pStr The string.
400 */
401static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
402{
403#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
404 bldProgStrTab_compressorAnalyzeString(pThis, pStr);
405 if (pThis->cPendingStrings < pThis->cMaxPendingStrings)
406 pThis->papPendingStrings[pThis->cPendingStrings++] = pStr;
407 else
408 abort();
409#else
410 bldProgStrTab_AddStringToHashTab(pThis, pStr);
411#endif
412}
413
414
415
416#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
417
418/**
419 * Replaces the dictionary words and escapes non-ascii chars in a string.
420 *
421 * @param pThis The strint table compiler instance.
422 * @param pString The string to fixup.
423 */
424static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
425{
426 char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2];
427 char *pszDst = szNew;
428 const char *pszSrc = pStr->pszString;
429 const char *pszSrcEnd = pszSrc + pStr->cchString;
430
431 char ch;
432 while ((ch = *pszSrc) != '\0')
433 {
434 size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc);
435 if (!cchWord)
436 break;
437
438 /* Check for g_aWord matches. */
439 size_t cchMax = pszSrcEnd - pszSrc;
440 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
441 {
442 size_t cchLen = pThis->aCompDict[i].cchString;
443 if ( cchLen >= cchWord
444 && cchLen <= cchMax
445 && memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0)
446 {
447 *pszDst++ = (unsigned char)(0x80 | i);
448 pszSrc += cchLen;
449 cchWord = 0;
450 break;
451 }
452 }
453
454 if (cchWord)
455 {
456 /* Copy the current word. */
457 ch = *pszSrc;
458 do
459 {
460 if (!((unsigned char)ch & 0x80))
461 {
462 *pszDst++ = ch;
463 pszSrc++;
464 }
465 else
466 {
467#ifdef BLDPROG_STRTAB_PURE_ASCII
468 fprintf(stderr, "error: unexpected char value %#x\n", ch);
469 return false;
470#else
471 RTUNICP uc;
472 int rc = RTStrGetCpEx(&pszSrc, &uc);
473 if (RT_SUCCESS(rc))
474 {
475 *pszDst++ = (unsigned char)0xff; /* escape single code point. */
476 pszDst = RTStrPutCp(pszDst, uc);
477 }
478 else
479 {
480 fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc);
481 return false;
482 }
483#endif
484 }
485 } while ((ch = *pszSrc) != '\0' && ch != ' ');
486 }
487 }
488
489 /* Just terminate it now. */
490 *pszDst = '\0';
491
492 /*
493 * Update the string.
494 */
495 size_t cchNew = pszDst - &szNew[0];
496 if (cchNew > pStr->cchString)
497 {
498 pStr->pszString = (char *)malloc(cchNew + 1);
499 if (!pStr->pszString)
500 {
501 fprintf(stderr, "Out of memory!\n");
502 return false;
503 }
504 }
505 memcpy(pStr->pszString, szNew, cchNew + 1);
506 pStr->cchString = cchNew;
507
508 return true;
509}
510
511
512/**
513 * For sorting the frequency fidning in descending order.
514 *
515 * Comparison operators are put outside to make older gcc versions (like 4.1.1
516 * on lnx64-rel) happy.
517 */
518class WordFreqSortEntry
519{
520public:
521 BLDPROGWORDFREQPAIR const *m_pPair;
522
523public:
524 WordFreqSortEntry(BLDPROGWORDFREQPAIR const *pPair) : m_pPair(pPair) {}
525};
526
527bool operator == (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
528{
529 return rLeft.m_pPair->second == rRight.m_pPair->second;
530}
531
532bool operator < (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
533{
534 return rLeft.m_pPair->second > rRight.m_pPair->second;
535}
536
537
538/**
539 * Compresses the vendor and product strings.
540 *
541 * This is very very simple (a lot less work than the string table for
542 * instance).
543 */
544static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose)
545{
546 /*
547 * Sort the frequency analyzis result and pick the top 127 ones.
548 */
549 std::vector<WordFreqSortEntry> SortMap;
550 for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it)
551 {
552 BLDPROGWORDFREQPAIR const &rPair = *it;
553 SortMap.push_back(WordFreqSortEntry(&rPair));
554 }
555
556 sort(SortMap.begin(), SortMap.end());
557
558 size_t cb = 0;
559 size_t i = 0;
560 for (std::vector<WordFreqSortEntry>::iterator it = SortMap.begin();
561 it != SortMap.end() && i < RT_ELEMENTS(pThis->aCompDict);
562 ++it, i++)
563 {
564 pThis->aCompDict[i].cchString = it->m_pPair->first.length();
565 pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1);
566 if (pThis->aCompDict[i].pszString)
567 memcpy(pThis->aCompDict[i].pszString, it->m_pPair->first.c_str(), pThis->aCompDict[i].cchString + 1);
568 else
569 return false;
570 cb += it->m_pPair->second;
571 }
572
573 if (fVerbose)
574 printf("debug: Estimated string compression saving: %u bytes\n", (unsigned)cb);
575
576 /*
577 * Rework the strings.
578 */
579 size_t cchOld = 0;
580 size_t cchOldMax = 0;
581 size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN;
582 size_t cchNew = 0;
583 size_t cchNewMax = 0;
584 size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN;
585 i = pThis->cPendingStrings;
586 while (i-- > 0)
587 {
588 PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i];
589 cchOld += pCurStr->cchString;
590 if (pCurStr->cchString > cchOldMax)
591 cchOldMax = pCurStr->cchString;
592 if (pCurStr->cchString < cchOldMin)
593 cchOldMin = pCurStr->cchString;
594
595 if (!bldProgStrTab_compressorFixupString(pThis, pCurStr))
596 return false;
597
598 cchNew += pCurStr->cchString;
599 if (pCurStr->cchString > cchNewMax)
600 cchNewMax = pCurStr->cchString;
601 if (pCurStr->cchString < cchNewMin)
602 cchNewMin = pCurStr->cchString;
603
604 bldProgStrTab_AddStringToHashTab(pThis, pCurStr);
605 }
606
607 /*
608 * Do debug stats.
609 */
610 if (fVerbose)
611 {
612 for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
613 cchNew += pThis->aCompDict[i].cchString + 1;
614
615 printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew);
616 if (cchNew < cchOld)
617 printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld));
618 else
619 printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew));
620 printf("debug: Original string lengths: average %u; min %u; max %u\n",
621 (unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax);
622 printf("debug: Compressed string lengths: average %u; min %u; max %u\n",
623 (unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax);
624 }
625
626 return true;
627}
628
629#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
630
631
632/**
633 * Inserts a string into g_apUniqueStrings.
634 * @param pThis The strint table compiler instance.
635 * @param pStr The string.
636 */
637static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
638{
639 size_t iIdx;
640 size_t iEnd = pThis->cSortedStrings;
641 if (iEnd)
642 {
643 size_t iStart = 0;
644 for (;;)
645 {
646 iIdx = iStart + (iEnd - iStart) / 2;
647 if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString)
648 {
649 if (iIdx <= iStart)
650 break;
651 iEnd = iIdx;
652 }
653 else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString)
654 {
655 if (++iIdx >= iEnd)
656 break;
657 iStart = iIdx;
658 }
659 else
660 break;
661 }
662
663 if (iIdx != pThis->cSortedStrings)
664 memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx],
665 (pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx]));
666 }
667 else
668 iIdx = 0;
669
670 pThis->papSortedStrings[iIdx] = pStr;
671 pThis->cSortedStrings++;
672}
673
674
675/**
676 * Compiles the string table after the string has been added.
677 *
678 * This will save space by dropping string terminators, eliminating duplicates
679 * and try find strings that are sub-strings of others.
680 *
681 * Will initialize the StrRef of all StrTabString instances.
682 *
683 * @returns success indicator (error flagged).
684 * @param pThis The strint table compiler instance.
685 */
686static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose)
687{
688#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
689 /*
690 * Do the compression and add all the compressed strings to the table.
691 */
692 if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose))
693 return false;
694
695 /*
696 * Add the dictionary strings.
697 */
698 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
699 bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]);
700#endif
701 if (fVerbose)
702 printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n",
703 (unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings,
704 (unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions);
705
706 /*
707 * Create papSortedStrings from the hash table. The table is sorted by
708 * string length, with the longer strings first.
709 */
710 pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings);
711 if (!pThis->papSortedStrings)
712 return false;
713 pThis->cSortedStrings = 0;
714 size_t idxHash = pThis->cStrHash;
715 while (idxHash-- > 0)
716 {
717 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
718 if (pCur)
719 {
720 do
721 {
722 bldProgStrTab_InsertUniqueString(pThis, pCur);
723 pCur = pCur->pNextCollision;
724 } while (pCur);
725 }
726 }
727
728 /*
729 * Create the actual string table.
730 */
731 pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1);
732 if (!pThis->pachStrTab)
733 return false;
734 pThis->cchStrTab = 0;
735 for (size_t i = 0; i < pThis->cSortedStrings; i++)
736 {
737 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
738 const char * const pszCur = pCur->pszString;
739 size_t const cchCur = pCur->cchString;
740 size_t offStrTab = pThis->cchStrTab;
741
742 /*
743 * See if the string is a substring already found in the string table.
744 * Excluding the zero terminator increases the chances for this.
745 */
746 size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0;
747 const char *pchLeft = pThis->pachStrTab;
748 char const chFirst = *pszCur;
749 while (cchLeft > 0)
750 {
751 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
752 if (!pchCandidate)
753 break;
754 if (memcmp(pchCandidate, pszCur, cchCur) == 0)
755 {
756 offStrTab = pchCandidate - pThis->pachStrTab;
757 break;
758 }
759
760 cchLeft -= pchCandidate + 1 - pchLeft;
761 pchLeft = pchCandidate + 1;
762 }
763
764 if (offStrTab == pThis->cchStrTab)
765 {
766 /*
767 * See if the start of the string overlaps the end of the string table.
768 */
769 if (pThis->cchStrTab && cchCur > 1)
770 {
771 cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1);
772 pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft];
773 while (cchLeft > 0)
774 {
775 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
776 if (!pchCandidate)
777 break;
778 cchLeft -= pchCandidate - pchLeft;
779 pchLeft = pchCandidate;
780 if (memcmp(pchLeft, pszCur, cchLeft) == 0)
781 {
782 size_t cchToCopy = cchCur - cchLeft;
783 memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy);
784 pThis->cchStrTab += cchToCopy;
785 offStrTab = pchCandidate - pThis->pachStrTab;
786 break;
787 }
788 cchLeft--;
789 pchLeft++;
790 }
791 }
792
793 /*
794 * If we didn't have any luck above, just append the string.
795 */
796 if (offStrTab == pThis->cchStrTab)
797 {
798 memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur);
799 pThis->cchStrTab += cchCur;
800 }
801 }
802
803 /*
804 * Set the string table offset for all the references to this string.
805 */
806 do
807 {
808 pCur->offStrTab = (uint32_t)offStrTab;
809 pCur = pCur->pNextRef;
810 } while (pCur != NULL);
811 }
812
813 if (fVerbose)
814 printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab);
815 return true;
816}
817
818
819#ifdef RT_STRICT
820/**
821 * Sanity checks a string table string.
822 * @param pThis The strint table compiler instance.
823 * @param pStr The string to check.
824 */
825static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
826{
827 if (pStr->cchString != strlen(pStr->pszString))
828 abort();
829 if (pStr->offStrTab >= pThis->cchStrTab)
830 abort();
831 if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab)
832 abort();
833 if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0)
834 abort();
835}
836#endif
837
838
839/**
840 * Output the string table string in C string litteral fashion.
841 *
842 * @param pThis The strint table instance.
843 * @param pString The string to print.
844 * @param pOut The output stream.
845 */
846static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut)
847{
848 const unsigned char *psz = (const unsigned char *)pString->pszString;
849 unsigned char uch;
850 while ((uch = *psz++) != '\0')
851 {
852 if (!(uch & 0x80))
853 {
854 if (uch != '\'' && uch != '\\')
855 fputc((char)uch, pOut);
856 else
857 {
858 fputc('\\', pOut);
859 fputc((char)uch, pOut);
860 }
861 }
862#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
863 else if (uch != 0xff)
864 fputs(pThis->aCompDict[uch & 0x7f].pszString, pOut);
865 else
866 {
867# ifdef BLDPROG_STRTAB_PURE_ASCII
868 abort();
869 fprintf(pOut, "\\x%02x", (unsigned)uch);
870# else
871 RTUNICP uc = RTStrGetCp((const char *)psz);
872 psz += RTStrCpSize(uc);
873 fprintf(pOut, "\\u%04x", uc);
874# endif
875 }
876#else
877 else
878 fprintf(pOut, "\\x%02", (unsigned)uch);
879#endif
880 }
881}
882
883
884/**
885 * Writes the string table code to the output stream.
886 *
887 * @param pThis The strint table compiler instance.
888 * @param pOut The output stream.
889 * @param pszScope The scoping ("static " or empty string),
890 * including trailing space.
891 * @param pszPrefix The variable prefix. Typically "g_" for C programs,
892 * whereas C++ classes normally use "class::s_g".
893 * @param pszBaseName The base name for the variables. The user
894 * accessible variable will end up as just the
895 * prefix and basename concatenated.
896 */
897static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut,
898 const char *pszScope, const char *pszPrefix, const char *pszBaseName)
899{
900#ifdef RT_STRICT
901 /*
902 * Do some quick sanity checks while we're here.
903 */
904# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
905 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
906 BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]);
907# endif
908#endif
909
910 /*
911 * Create a table for speeding up the character categorization.
912 */
913 uint8_t abCharCat[256];
914 memset(abCharCat, 0, sizeof(abCharCat));
915 abCharCat[(unsigned char)'\\'] = 1;
916 abCharCat[(unsigned char)'\''] = 1;
917 for (unsigned i = 0; i < 0x20; i++)
918 abCharCat[i] = 2;
919 for (unsigned i = 0x7f; i < 0x100; i++)
920 abCharCat[i] = 2;
921
922 /*
923 * We follow the sorted string table, one string per line.
924 */
925 fprintf(pOut,
926 "#include <iprt/bldprog-strtab.h>\n"
927 "\n"
928 "static const char g_achStrTab%s[] =\n"
929 "{\n",
930 pszBaseName);
931
932 uint32_t off = 0;
933 for (uint32_t i = 0; i < pThis->cSortedStrings; i++)
934 {
935 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
936 uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString;
937 if (offEnd > off)
938 {
939 /* Comment with a uncompressed and more readable version of the string. */
940 fprintf(pOut, off == pCur->offStrTab ? "/* 0x%05x = \"" : "/* 0X%05x = \"", off);
941 BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut);
942 fputs("\" */\n", pOut);
943
944 /* Must use char by char here or we may trigger the max string
945 length limit in the compiler, */
946 fputs(" ", pOut);
947 for (; off < offEnd; off++)
948 {
949 unsigned char uch = pThis->pachStrTab[off];
950 fputc('\'', pOut);
951 if (abCharCat[uch] == 0)
952 fputc(uch, pOut);
953 else if (abCharCat[uch] != 1)
954 fprintf(pOut, "\\x%02x", (unsigned)uch);
955 else
956 {
957 fputc('\\', pOut);
958 fputc(uch, pOut);
959 }
960 fputc('\'', pOut);
961 fputc(',', pOut);
962 }
963 fputc('\n', pOut);
964 }
965 }
966
967 fprintf(pOut,
968 "};\n"
969 "AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n",
970 pszBaseName, (unsigned)pThis->cchStrTab);
971
972#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
973 /*
974 * Write the compression dictionary.
975 */
976 fprintf(pOut,
977 "static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n"
978 "{\n",
979 pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict));
980 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
981 fprintf(pOut, " { %#08x, %#04x }, // %s\n",
982 pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString, pThis->aCompDict[i].pszString);
983 fprintf(pOut, "};\n\n");
984#endif
985
986
987 /*
988 * Write the string table data structure.
989 */
990 fprintf(pOut,
991 "%sconst RTBLDPROGSTRTAB %s%s = \n"
992 "{\n"
993 " /*.pchStrTab = */ &g_achStrTab%s[0],\n"
994 " /*.cchStrTab = */ sizeof(g_achStrTab%s),\n"
995 ,
996 pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName);
997#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
998 fprintf(pOut,
999 " /*.cCompDict = */ %u,\n"
1000 " /*.paCompDict = */ &g_aCompDict%s[0]\n"
1001 "};\n"
1002 , (unsigned)RT_ELEMENTS(pThis->aCompDict), pszBaseName);
1003#else
1004 fprintf(pOut,
1005 " /*.cCompDict = */ 0,\n"
1006 " /*.paCompDict = */ NULL\n"
1007 "};\n");
1008#endif
1009}
1010
1011#endif /* __cplusplus && IN_RING3 */
1012
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette