VirtualBox

source: vbox/trunk/include/iprt/bldprog-strtab-template.cpp.h@ 66064

Last change on this file since 66064 was 64814, checked in by vboxsync, 8 years ago

bldprog-strtab-template.cpp.h: Fix bug in the compressor where it omits spaces and separators between words.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 32.5 KB
Line 
1/* $Id: bldprog-strtab-template.cpp.h 64814 2016-12-08 21:37:21Z vboxsync $ */
2/** @file
3 * IPRT - Build Program - String Table Generator.
4 */
5
6/*
7 * Copyright (C) 2006-2016 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*
29 * (Avoid include sanity checks as this is ring-3 only, C++ code.)
30 */
31#if defined(__cplusplus) && defined(IN_RING3)
32
33
34/*********************************************************************************************************************************
35* Defined Constants And Macros *
36*********************************************************************************************************************************/
37/** @def BLDPROG_STRTAB_MAX_STRLEN
38 * The max length of strings in the table. */
39#if !defined(BLDPROG_STRTAB_MAX_STRLEN) || defined(DOXYGEN_RUNNING)
40# define BLDPROG_STRTAB_MAX_STRLEN 256
41#endif
42
43/** @def BLDPROG_STRTAB_WITH_COMPRESSION
44 * Enables very simple string compression.
45 */
46#if defined(DOXYGEN_RUNNING)
47# define BLDPROG_STRTAB_WITH_COMPRESSION
48#endif
49
50/** @def BLDPROG_STRTAB_WITH_CAMEL_WORDS
51 * Modifies the string compression to look for camel case words.
52 */
53#if defined(DOXYGEN_RUNNING)
54# define BLDPROG_STRTAB_WITH_CAMEL_WORDS
55#endif
56
57/** @def BLDPROG_STRTAB_PURE_ASCII
58 * String compression assumes pure 7-bit ASCII and will fail on UTF-8 when this
59 * is defined. Otherwise, the compression code will require IPRT to link.
60 */
61#if defined(DOXYGEN_RUNNING)
62# define BLDPROG_STRTAB_PURE_ASCII
63#endif
64
65
66
67/*********************************************************************************************************************************
68* Header Files *
69*********************************************************************************************************************************/
70#include <iprt/stdarg.h>
71#include <iprt/ctype.h>
72#include <stdlib.h>
73#include <stdio.h>
74#include <string.h>
75
76#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
77# include <algorithm>
78# include <map>
79# include <string>
80# include <vector>
81
82typedef std::map<std::string, size_t> BLDPROGWORDFREQMAP;
83typedef BLDPROGWORDFREQMAP::value_type BLDPROGWORDFREQPAIR;
84
85#endif
86
87#include "../../src/VBox/Runtime/include/internal/strhash.h" /** @todo make this one public */
88
89
90/*********************************************************************************************************************************
91* Structures and Typedefs *
92*********************************************************************************************************************************/
93/**
94 * Build table string.
95 */
96typedef struct BLDPROGSTRING
97{
98 /** The string.
99 * @note This may be modified or replaced (allocated from heap) when
100 * compressing the string table. */
101 char *pszString;
102 /** The string hash value. */
103 uint32_t uHash;
104 /** The strint table offset. */
105 uint32_t offStrTab;
106 /** The string length. */
107 size_t cchString;
108 /** Pointer to the next string reference (same string table entry). */
109 struct BLDPROGSTRING *pNextRef;
110 /** Pointer to the next string with the same hash value (collision). */
111 struct BLDPROGSTRING *pNextCollision;
112
113} BLDPROGSTRING;
114/** Pointer to a string table string. */
115typedef BLDPROGSTRING *PBLDPROGSTRING;
116
117
118/** String table data. */
119typedef struct BLDPROGSTRTAB
120{
121 /** The size of g_papStrHash. */
122 size_t cStrHash;
123 /** String hash table. */
124 PBLDPROGSTRING *papStrHash;
125 /** Duplicate strings found by AddString. */
126 size_t cDuplicateStrings;
127 /** Total length of the unique strings (no terminators). */
128 size_t cchUniqueStrings;
129 /** Number of unique strings after AddString. */
130 size_t cUniqueStrings;
131 /** Number of collisions. */
132 size_t cCollisions;
133
134 /** Number of entries in apSortedStrings. */
135 size_t cSortedStrings;
136 /** The sorted string table. */
137 PBLDPROGSTRING *papSortedStrings;
138
139#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
140 /** The 127 words we've picked to be indexed by reference. */
141 BLDPROGSTRING aCompDict[127];
142 /** Incoming strings pending compression. */
143 PBLDPROGSTRING *papPendingStrings;
144 /** Current number of entries in papStrPending. */
145 size_t cPendingStrings;
146 /** The allocated size of papPendingStrings. */
147 size_t cMaxPendingStrings;
148 /** Work frequency map.
149 * @todo rewrite in plain C. */
150 BLDPROGWORDFREQMAP Frequencies;
151#endif
152
153 /** The string table. */
154 char *pachStrTab;
155 /** The actual string table size. */
156 size_t cchStrTab;
157} BLDPROGSTRTAB;
158typedef BLDPROGSTRTAB *PBLDPROGSTRTAB;
159
160
161/**
162 * Initializes the strint table compiler.
163 *
164 * @returns success indicator (out of memory if false).
165 * @param pThis The strint table compiler instance.
166 * @param cMaxStrings The max number of strings we'll be adding.
167 */
168static bool BldProgStrTab_Init(PBLDPROGSTRTAB pThis, size_t cMaxStrings)
169{
170 pThis->cStrHash = 0;
171 pThis->papStrHash = NULL;
172 pThis->cDuplicateStrings = 0;
173 pThis->cchUniqueStrings = 0;
174 pThis->cUniqueStrings = 0;
175 pThis->cCollisions = 0;
176 pThis->cSortedStrings = 0;
177 pThis->papSortedStrings = NULL;
178 pThis->pachStrTab = NULL;
179 pThis->cchStrTab = 0;
180#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
181 memset(pThis->aCompDict, 0, sizeof(pThis->aCompDict));
182 pThis->papPendingStrings = NULL;
183 pThis->cPendingStrings = 0;
184 pThis->cMaxPendingStrings = cMaxStrings;
185#endif
186
187 /*
188 * Allocate a hash table double the size of all strings (to avoid too
189 * many collisions). Add all strings to it, finding duplicates in the
190 * process.
191 */
192#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
193 cMaxStrings += RT_ELEMENTS(pThis->aCompDict);
194#endif
195 cMaxStrings *= 2;
196 pThis->papStrHash = (PBLDPROGSTRING *)calloc(sizeof(pThis->papStrHash[0]), cMaxStrings);
197 if (pThis->papStrHash)
198 {
199 pThis->cStrHash = cMaxStrings;
200#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
201 pThis->papPendingStrings = (PBLDPROGSTRING *)calloc(sizeof(pThis->papPendingStrings[0]), pThis->cMaxPendingStrings);
202 if (pThis->papPendingStrings)
203#endif
204 return true;
205
206#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
207 free(pThis->papStrHash);
208 pThis->papStrHash = NULL;
209#endif
210 }
211 return false;
212}
213
214
215#if 0 /* unused */
216static void BldProgStrTab_Delete(PBLDPROGSTRTAB pThis)
217{
218 free(pThis->papStrHash);
219 free(pThis->papSortedStrings);
220 free(pThis->pachStrTab);
221# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
222 free(pThis->papPendingStrings);
223# endif
224 memset(pThis, 0, sizeof(*pThis));
225}
226#endif
227
228#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
229
230DECLINLINE(size_t) bldProgStrTab_compressorFindNextWord(const char *pszSrc, char ch, const char **ppszSrc)
231{
232 /*
233 * Skip leading word separators.
234 */
235# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
236 while ( ch == ' '
237 || ch == '-'
238 || ch == '+'
239 || ch == '_')
240# else
241 while (ch == ' ')
242# endif
243 ch = *++pszSrc;
244 if (ch)
245 {
246 /*
247 * Find end of word.
248 */
249 size_t cchWord = 1;
250# ifdef BLDPROG_STRTAB_WITH_CAMEL_WORDS
251 char chPrev = ch;
252 while ( (ch = pszSrc[cchWord]) != ' '
253 && ch != '\0'
254 && ch != '-'
255 && ch != '+'
256 && ch != '_'
257 && ( ch == chPrev
258 || !RT_C_IS_UPPER(ch)
259 || RT_C_IS_UPPER(chPrev)) )
260 {
261 chPrev = ch;
262 cchWord++;
263 }
264# else
265 while ((ch = pszSrc[cchWord]) != ' ' && ch != '\0')
266 cchWord++;
267# endif
268 *ppszSrc = pszSrc;
269 return cchWord;
270 }
271
272 *ppszSrc = pszSrc;
273 return 0;
274}
275
276
277/**
278 * Analyzes a string.
279 *
280 * @param pThis The strint table compiler instance.
281 * @param pStr The string to analyze.
282 */
283static void bldProgStrTab_compressorAnalyzeString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
284{
285 const char *psz = pStr->pszString;
286
287 /*
288 * For now we just consider words.
289 */
290 char ch;
291 while ((ch = *psz) != '\0')
292 {
293 size_t cchWord = bldProgStrTab_compressorFindNextWord(psz, ch, &psz);
294 if (cchWord > 1)
295 {
296 std::string strWord(psz, cchWord);
297 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
298 if (it != pThis->Frequencies.end())
299 it->second += cchWord - 1;
300 else
301 pThis->Frequencies[strWord] = 0;
302
303 /** @todo could gain hits by including the space after the word, but that
304 * has the same accounting problems as the two words scenario below. */
305
306# if 0 /** @todo need better accounting for overlapping alternatives before this can be enabled. */
307 /* Two words - immediate yields calc may lie when this enabled and we may pick the wrong words. */
308 if (ch == ' ')
309 {
310 ch = psz[++cchWord];
311 if (ch != ' ' && ch != '\0')
312 {
313 size_t const cchSaved = cchWord;
314
315 do
316 cchWord++;
317 while ((ch = psz[cchWord]) != ' ' && ch != '\0');
318
319 strWord = std::string(psz, cchWord);
320 BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.find(strWord);
321 if (it != pThis->Frequencies.end())
322 it->second += cchWord - 1;
323 else
324 pThis->Frequencies[strWord] = 0;
325
326 cchWord = cchSaved;
327 }
328 }
329# endif
330 }
331 else if (!cchWord)
332 break;
333
334 /* Advance. */
335 psz += cchWord;
336 }
337 pStr->cchString = psz - pStr->pszString;
338 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
339 {
340 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
341 abort();
342 }
343}
344
345#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
346
347/**
348 * Adds a string to the hash table.
349 * @param pThis The strint table compiler instance.
350 * @param pStr The string.
351 */
352static void bldProgStrTab_AddStringToHashTab(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
353{
354 pStr->pNextRef = NULL;
355 pStr->pNextCollision = NULL;
356 pStr->offStrTab = 0;
357 pStr->uHash = sdbm(pStr->pszString, &pStr->cchString);
358 if (pStr->cchString > BLDPROG_STRTAB_MAX_STRLEN)
359 {
360 fprintf(stderr, "error: String to long (%u)\n", (unsigned)pStr->cchString);
361 exit(RTEXITCODE_FAILURE);
362 }
363
364 size_t idxHash = pStr->uHash % pThis->cStrHash;
365 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
366 if (!pCur)
367 pThis->papStrHash[idxHash] = pStr;
368 else
369 {
370 /* Look for matching string. */
371 do
372 {
373 if ( pCur->uHash == pStr->uHash
374 && pCur->cchString == pStr->cchString
375 && memcmp(pCur->pszString, pStr->pszString, pStr->cchString) == 0)
376 {
377 pStr->pNextRef = pCur->pNextRef;
378 pCur->pNextRef = pStr;
379 pThis->cDuplicateStrings++;
380 return;
381 }
382 pCur = pCur->pNextCollision;
383 } while (pCur != NULL);
384
385 /* No matching string, insert. */
386 pThis->cCollisions++;
387 pStr->pNextCollision = pThis->papStrHash[idxHash];
388 pThis->papStrHash[idxHash] = pStr;
389 }
390
391 pThis->cUniqueStrings++;
392 pThis->cchUniqueStrings += pStr->cchString;
393}
394
395
396/**
397 * Adds a string to the string table.
398 *
399 * @param pThis The strint table compiler instance.
400 * @param pStr The string.
401 */
402static void BldProgStrTab_AddString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
403{
404#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
405 bldProgStrTab_compressorAnalyzeString(pThis, pStr);
406 if (pThis->cPendingStrings < pThis->cMaxPendingStrings)
407 pThis->papPendingStrings[pThis->cPendingStrings++] = pStr;
408 else
409 abort();
410#else
411 bldProgStrTab_AddStringToHashTab(pThis, pStr);
412#endif
413}
414
415#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
416
417/**
418 * Copies @a cchSrc chars from @a pchSrc to @a pszDst, escaping special
419 * sequences.
420 *
421 * @returns New @a pszDst position, NULL if invalid source encoding.
422 * @param pszDst The destination buffer.
423 * @param pszSrc The source buffer.
424 * @param cchSrc How much to copy.
425 */
426static char *bldProgStrTab_compressorCopyAndEscape(char *pszDst, const char *pszSrc, size_t cchSrc)
427{
428 while (cchSrc-- > 0)
429 {
430 char ch = *pszSrc;
431 if (!((unsigned char)ch & 0x80))
432 {
433 *pszDst++ = ch;
434 pszSrc++;
435 }
436 else
437 {
438# ifdef BLDPROG_STRTAB_PURE_ASCII
439 fprintf(stderr, "error: unexpected char value %#x\n", ch);
440 return NULL;
441# else
442 RTUNICP uc;
443 int rc = RTStrGetCpEx(&pszSrc, &uc);
444 if (RT_SUCCESS(rc))
445 {
446 *pszDst++ = (unsigned char)0xff; /* escape single code point. */
447 pszDst = RTStrPutCp(pszDst, uc);
448 }
449 else
450 {
451 fprintf(stderr, "Error: RTStrGetCpEx failed with rc=%d\n", rc);
452 return NULL;
453 }
454# endif
455 }
456 }
457 return pszDst;
458}
459
460
461/**
462 * Replaces the dictionary words and escapes non-ascii chars in a string.
463 *
464 * @param pThis The strint table compiler instance.
465 * @param pString The string to fixup.
466 */
467static bool bldProgStrTab_compressorFixupString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
468{
469 char szNew[BLDPROG_STRTAB_MAX_STRLEN * 2];
470 char *pszDst = szNew;
471 const char *pszSrc = pStr->pszString;
472 const char *pszSrcEnd = pszSrc + pStr->cchString;
473
474 char ch;
475 while ((ch = *pszSrc) != '\0')
476 {
477 const char * const pszSrcUncompressed = pszSrc;
478 size_t cchWord = bldProgStrTab_compressorFindNextWord(pszSrc, ch, &pszSrc);
479 size_t cchSrcUncompressed = pszSrc - pszSrcUncompressed;
480 if (cchSrcUncompressed > 0)
481 {
482 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrcUncompressed, cchSrcUncompressed);
483 if (!pszDst)
484 return false;
485 }
486 if (!cchWord)
487 break;
488
489 /* Check for g_aWord matches. */
490 size_t cchMax = pszSrcEnd - pszSrc;
491 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
492 {
493 size_t cchLen = pThis->aCompDict[i].cchString;
494 if ( cchLen >= cchWord
495 && cchLen <= cchMax
496 && memcmp(pThis->aCompDict[i].pszString, pszSrc, cchLen) == 0)
497 {
498 *pszDst++ = (unsigned char)(0x80 | i);
499 pszSrc += cchLen;
500 cchWord = 0;
501 break;
502 }
503 }
504
505 if (cchWord)
506 {
507 /* Copy the current word. */
508 pszDst = bldProgStrTab_compressorCopyAndEscape(pszDst, pszSrc, cchWord);
509 if (!pszDst)
510 return false;
511 pszSrc += cchWord;
512 }
513 }
514
515 /* Just terminate it now. */
516 *pszDst = '\0';
517
518 /*
519 * Update the string.
520 */
521 size_t cchNew = pszDst - &szNew[0];
522 if (cchNew > pStr->cchString)
523 {
524 pStr->pszString = (char *)malloc(cchNew + 1);
525 if (!pStr->pszString)
526 {
527 fprintf(stderr, "Out of memory!\n");
528 return false;
529 }
530 }
531 memcpy(pStr->pszString, szNew, cchNew + 1);
532 pStr->cchString = cchNew;
533
534 return true;
535}
536
537
538/**
539 * For sorting the frequency fidning in descending order.
540 *
541 * Comparison operators are put outside to make older gcc versions (like 4.1.1
542 * on lnx64-rel) happy.
543 */
544class WordFreqSortEntry
545{
546public:
547 BLDPROGWORDFREQPAIR const *m_pPair;
548
549public:
550 WordFreqSortEntry(BLDPROGWORDFREQPAIR const *pPair) : m_pPair(pPair) {}
551};
552
553bool operator == (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
554{
555 return rLeft.m_pPair->second == rRight.m_pPair->second;
556}
557
558bool operator < (WordFreqSortEntry const &rLeft, WordFreqSortEntry const &rRight)
559{
560 return rLeft.m_pPair->second > rRight.m_pPair->second;
561}
562
563
564/**
565 * Compresses the vendor and product strings.
566 *
567 * This is very very simple (a lot less work than the string table for
568 * instance).
569 */
570static bool bldProgStrTab_compressorDoStringCompression(PBLDPROGSTRTAB pThis, bool fVerbose)
571{
572 /*
573 * Sort the frequency analyzis result and pick the top 127 ones.
574 */
575 std::vector<WordFreqSortEntry> SortMap;
576 for (BLDPROGWORDFREQMAP::iterator it = pThis->Frequencies.begin(); it != pThis->Frequencies.end(); ++it)
577 {
578 BLDPROGWORDFREQPAIR const &rPair = *it;
579 SortMap.push_back(WordFreqSortEntry(&rPair));
580 }
581
582 sort(SortMap.begin(), SortMap.end());
583
584 size_t cb = 0;
585 size_t i = 0;
586 for (std::vector<WordFreqSortEntry>::iterator it = SortMap.begin();
587 it != SortMap.end() && i < RT_ELEMENTS(pThis->aCompDict);
588 ++it, i++)
589 {
590 pThis->aCompDict[i].cchString = it->m_pPair->first.length();
591 pThis->aCompDict[i].pszString = (char *)malloc(pThis->aCompDict[i].cchString + 1);
592 if (pThis->aCompDict[i].pszString)
593 memcpy(pThis->aCompDict[i].pszString, it->m_pPair->first.c_str(), pThis->aCompDict[i].cchString + 1);
594 else
595 return false;
596 cb += it->m_pPair->second;
597 }
598
599 if (fVerbose)
600 printf("debug: Estimated string compression saving: %u bytes\n", (unsigned)cb);
601
602 /*
603 * Rework the strings.
604 */
605 size_t cchOld = 0;
606 size_t cchOldMax = 0;
607 size_t cchOldMin = BLDPROG_STRTAB_MAX_STRLEN;
608 size_t cchNew = 0;
609 size_t cchNewMax = 0;
610 size_t cchNewMin = BLDPROG_STRTAB_MAX_STRLEN;
611 i = pThis->cPendingStrings;
612 while (i-- > 0)
613 {
614 PBLDPROGSTRING pCurStr = pThis->papPendingStrings[i];
615 cchOld += pCurStr->cchString;
616 if (pCurStr->cchString > cchOldMax)
617 cchOldMax = pCurStr->cchString;
618 if (pCurStr->cchString < cchOldMin)
619 cchOldMin = pCurStr->cchString;
620
621 if (!bldProgStrTab_compressorFixupString(pThis, pCurStr))
622 return false;
623
624 cchNew += pCurStr->cchString;
625 if (pCurStr->cchString > cchNewMax)
626 cchNewMax = pCurStr->cchString;
627 if (pCurStr->cchString < cchNewMin)
628 cchNewMin = pCurStr->cchString;
629
630 bldProgStrTab_AddStringToHashTab(pThis, pCurStr);
631 }
632
633 /*
634 * Do debug stats.
635 */
636 if (fVerbose)
637 {
638 for (i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
639 cchNew += pThis->aCompDict[i].cchString + 1;
640
641 printf("debug: Strings: original: %u bytes; compressed: %u bytes;", (unsigned)cchOld, (unsigned)cchNew);
642 if (cchNew < cchOld)
643 printf(" saving %u bytes (%u%%)\n", (unsigned)(cchOld - cchNew), (unsigned)((cchOld - cchNew) * 100 / cchOld));
644 else
645 printf(" wasting %u bytes!\n", (unsigned)(cchOld - cchNew));
646 printf("debug: Original string lengths: average %u; min %u; max %u\n",
647 (unsigned)(cchOld / pThis->cPendingStrings), (unsigned)cchOldMin, (unsigned)cchOldMax);
648 printf("debug: Compressed string lengths: average %u; min %u; max %u\n",
649 (unsigned)(cchNew / pThis->cPendingStrings), (unsigned)cchNewMin, (unsigned)cchNewMax);
650 }
651
652 return true;
653}
654
655#endif /* BLDPROG_STRTAB_WITH_COMPRESSION */
656
657/**
658 * Inserts a string into g_apUniqueStrings.
659 * @param pThis The strint table compiler instance.
660 * @param pStr The string.
661 */
662static void bldProgStrTab_InsertUniqueString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
663{
664 size_t iIdx;
665 size_t iEnd = pThis->cSortedStrings;
666 if (iEnd)
667 {
668 size_t iStart = 0;
669 for (;;)
670 {
671 iIdx = iStart + (iEnd - iStart) / 2;
672 if (pThis->papSortedStrings[iIdx]->cchString < pStr->cchString)
673 {
674 if (iIdx <= iStart)
675 break;
676 iEnd = iIdx;
677 }
678 else if (pThis->papSortedStrings[iIdx]->cchString > pStr->cchString)
679 {
680 if (++iIdx >= iEnd)
681 break;
682 iStart = iIdx;
683 }
684 else
685 break;
686 }
687
688 if (iIdx != pThis->cSortedStrings)
689 memmove(&pThis->papSortedStrings[iIdx + 1], &pThis->papSortedStrings[iIdx],
690 (pThis->cSortedStrings - iIdx) * sizeof(pThis->papSortedStrings[iIdx]));
691 }
692 else
693 iIdx = 0;
694
695 pThis->papSortedStrings[iIdx] = pStr;
696 pThis->cSortedStrings++;
697}
698
699
700/**
701 * Compiles the string table after the string has been added.
702 *
703 * This will save space by dropping string terminators, eliminating duplicates
704 * and try find strings that are sub-strings of others.
705 *
706 * Will initialize the StrRef of all StrTabString instances.
707 *
708 * @returns success indicator (error flagged).
709 * @param pThis The strint table compiler instance.
710 */
711static bool BldProgStrTab_CompileIt(PBLDPROGSTRTAB pThis, bool fVerbose)
712{
713#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
714 /*
715 * Do the compression and add all the compressed strings to the table.
716 */
717 if (!bldProgStrTab_compressorDoStringCompression(pThis, fVerbose))
718 return false;
719
720 /*
721 * Add the dictionary strings.
722 */
723 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
724 bldProgStrTab_AddStringToHashTab(pThis, &pThis->aCompDict[i]);
725#endif
726 if (fVerbose)
727 printf("debug: %u unique strings (%u bytes), %u duplicates, %u collisions\n",
728 (unsigned)pThis->cUniqueStrings, (unsigned)pThis->cchUniqueStrings,
729 (unsigned)pThis->cDuplicateStrings, (unsigned)pThis->cCollisions);
730
731 /*
732 * Create papSortedStrings from the hash table. The table is sorted by
733 * string length, with the longer strings first.
734 */
735 pThis->papSortedStrings = (PBLDPROGSTRING *)malloc(sizeof(pThis->papSortedStrings[0]) * pThis->cUniqueStrings);
736 if (!pThis->papSortedStrings)
737 return false;
738 pThis->cSortedStrings = 0;
739 size_t idxHash = pThis->cStrHash;
740 while (idxHash-- > 0)
741 {
742 PBLDPROGSTRING pCur = pThis->papStrHash[idxHash];
743 if (pCur)
744 {
745 do
746 {
747 bldProgStrTab_InsertUniqueString(pThis, pCur);
748 pCur = pCur->pNextCollision;
749 } while (pCur);
750 }
751 }
752
753 /*
754 * Create the actual string table.
755 */
756 pThis->pachStrTab = (char *)malloc(pThis->cchUniqueStrings + 1);
757 if (!pThis->pachStrTab)
758 return false;
759 pThis->cchStrTab = 0;
760 for (size_t i = 0; i < pThis->cSortedStrings; i++)
761 {
762 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
763 const char * const pszCur = pCur->pszString;
764 size_t const cchCur = pCur->cchString;
765 size_t offStrTab = pThis->cchStrTab;
766
767 /*
768 * See if the string is a substring already found in the string table.
769 * Excluding the zero terminator increases the chances for this.
770 */
771 size_t cchLeft = pThis->cchStrTab >= cchCur ? pThis->cchStrTab - cchCur : 0;
772 const char *pchLeft = pThis->pachStrTab;
773 char const chFirst = *pszCur;
774 while (cchLeft > 0)
775 {
776 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
777 if (!pchCandidate)
778 break;
779 if (memcmp(pchCandidate, pszCur, cchCur) == 0)
780 {
781 offStrTab = pchCandidate - pThis->pachStrTab;
782 break;
783 }
784
785 cchLeft -= pchCandidate + 1 - pchLeft;
786 pchLeft = pchCandidate + 1;
787 }
788
789 if (offStrTab == pThis->cchStrTab)
790 {
791 /*
792 * See if the start of the string overlaps the end of the string table.
793 */
794 if (pThis->cchStrTab && cchCur > 1)
795 {
796 cchLeft = RT_MIN(pThis->cchStrTab, cchCur - 1);
797 pchLeft = &pThis->pachStrTab[pThis->cchStrTab - cchLeft];
798 while (cchLeft > 0)
799 {
800 const char *pchCandidate = (const char *)memchr(pchLeft, chFirst, cchLeft);
801 if (!pchCandidate)
802 break;
803 cchLeft -= pchCandidate - pchLeft;
804 pchLeft = pchCandidate;
805 if (memcmp(pchLeft, pszCur, cchLeft) == 0)
806 {
807 size_t cchToCopy = cchCur - cchLeft;
808 memcpy(&pThis->pachStrTab[offStrTab], &pszCur[cchLeft], cchToCopy);
809 pThis->cchStrTab += cchToCopy;
810 offStrTab = pchCandidate - pThis->pachStrTab;
811 break;
812 }
813 cchLeft--;
814 pchLeft++;
815 }
816 }
817
818 /*
819 * If we didn't have any luck above, just append the string.
820 */
821 if (offStrTab == pThis->cchStrTab)
822 {
823 memcpy(&pThis->pachStrTab[offStrTab], pszCur, cchCur);
824 pThis->cchStrTab += cchCur;
825 }
826 }
827
828 /*
829 * Set the string table offset for all the references to this string.
830 */
831 do
832 {
833 pCur->offStrTab = (uint32_t)offStrTab;
834 pCur = pCur->pNextRef;
835 } while (pCur != NULL);
836 }
837
838 if (fVerbose)
839 printf("debug: String table: %u bytes\n", (unsigned)pThis->cchStrTab);
840 return true;
841}
842
843
844#ifdef RT_STRICT
845/**
846 * Sanity checks a string table string.
847 * @param pThis The strint table compiler instance.
848 * @param pStr The string to check.
849 */
850static void BldProgStrTab_CheckStrTabString(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pStr)
851{
852 if (pStr->cchString != strlen(pStr->pszString))
853 abort();
854 if (pStr->offStrTab >= pThis->cchStrTab)
855 abort();
856 if (pStr->offStrTab + pStr->cchString > pThis->cchStrTab)
857 abort();
858 if (memcmp(pStr->pszString, &pThis->pachStrTab[pStr->offStrTab], pStr->cchString) != 0)
859 abort();
860}
861#endif
862
863
864/**
865 * Output the string table string in C string litteral fashion.
866 *
867 * @param pThis The strint table instance.
868 * @param pString The string to print.
869 * @param pOut The output stream.
870 */
871static void BldProgStrTab_PrintCStringLitteral(PBLDPROGSTRTAB pThis, PBLDPROGSTRING pString, FILE *pOut)
872{
873 const unsigned char *psz = (const unsigned char *)pString->pszString;
874 unsigned char uch;
875 while ((uch = *psz++) != '\0')
876 {
877 if (!(uch & 0x80))
878 {
879 if (uch != '\'' && uch != '\\')
880 fputc((char)uch, pOut);
881 else
882 {
883 fputc('\\', pOut);
884 fputc((char)uch, pOut);
885 }
886 }
887#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
888 else if (uch != 0xff)
889 fputs(pThis->aCompDict[uch & 0x7f].pszString, pOut);
890 else
891 {
892# ifdef BLDPROG_STRTAB_PURE_ASCII
893 abort();
894 fprintf(pOut, "\\x%02x", (unsigned)uch);
895# else
896 RTUNICP uc = RTStrGetCp((const char *)psz);
897 psz += RTStrCpSize(uc);
898 fprintf(pOut, "\\u%04x", uc);
899# endif
900 }
901#else
902 else
903 fprintf(pOut, "\\x%02", (unsigned)uch);
904#endif
905 }
906}
907
908
909/**
910 * Writes the string table code to the output stream.
911 *
912 * @param pThis The strint table compiler instance.
913 * @param pOut The output stream.
914 * @param pszScope The scoping ("static " or empty string),
915 * including trailing space.
916 * @param pszPrefix The variable prefix. Typically "g_" for C programs,
917 * whereas C++ classes normally use "class::s_g".
918 * @param pszBaseName The base name for the variables. The user
919 * accessible variable will end up as just the
920 * prefix and basename concatenated.
921 */
922static void BldProgStrTab_WriteStringTable(PBLDPROGSTRTAB pThis, FILE *pOut,
923 const char *pszScope, const char *pszPrefix, const char *pszBaseName)
924{
925#ifdef RT_STRICT
926 /*
927 * Do some quick sanity checks while we're here.
928 */
929# ifdef BLDPROG_STRTAB_WITH_COMPRESSION
930 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
931 BldProgStrTab_CheckStrTabString(pThis, &pThis->aCompDict[i]);
932# endif
933#endif
934
935 /*
936 * Create a table for speeding up the character categorization.
937 */
938 uint8_t abCharCat[256];
939 memset(abCharCat, 0, sizeof(abCharCat));
940 abCharCat[(unsigned char)'\\'] = 1;
941 abCharCat[(unsigned char)'\''] = 1;
942 for (unsigned i = 0; i < 0x20; i++)
943 abCharCat[i] = 2;
944 for (unsigned i = 0x7f; i < 0x100; i++)
945 abCharCat[i] = 2;
946
947 /*
948 * We follow the sorted string table, one string per line.
949 */
950 fprintf(pOut,
951 "#include <iprt/bldprog-strtab.h>\n"
952 "\n"
953 "static const char g_achStrTab%s[] =\n"
954 "{\n",
955 pszBaseName);
956
957 uint32_t off = 0;
958 for (uint32_t i = 0; i < pThis->cSortedStrings; i++)
959 {
960 PBLDPROGSTRING pCur = pThis->papSortedStrings[i];
961 uint32_t offEnd = pCur->offStrTab + (uint32_t)pCur->cchString;
962 if (offEnd > off)
963 {
964 /* Comment with a uncompressed and more readable version of the string. */
965 fprintf(pOut, off == pCur->offStrTab ? "/* 0x%05x = \"" : "/* 0X%05x = \"", off);
966 BldProgStrTab_PrintCStringLitteral(pThis, pCur, pOut);
967 fputs("\" */\n", pOut);
968
969 /* Must use char by char here or we may trigger the max string
970 length limit in the compiler, */
971 fputs(" ", pOut);
972 for (; off < offEnd; off++)
973 {
974 unsigned char uch = pThis->pachStrTab[off];
975 fputc('\'', pOut);
976 if (abCharCat[uch] == 0)
977 fputc(uch, pOut);
978 else if (abCharCat[uch] != 1)
979 fprintf(pOut, "\\x%02x", (unsigned)uch);
980 else
981 {
982 fputc('\\', pOut);
983 fputc(uch, pOut);
984 }
985 fputc('\'', pOut);
986 fputc(',', pOut);
987 }
988 fputc('\n', pOut);
989 }
990 }
991
992 fprintf(pOut,
993 "};\n"
994 "AssertCompile(sizeof(g_achStrTab%s) == %#x);\n\n",
995 pszBaseName, (unsigned)pThis->cchStrTab);
996
997#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
998 /*
999 * Write the compression dictionary.
1000 */
1001 fprintf(pOut,
1002 "static const RTBLDPROGSTRREF g_aCompDict%s[%u] = \n"
1003 "{\n",
1004 pszBaseName, (unsigned)RT_ELEMENTS(pThis->aCompDict));
1005 for (unsigned i = 0; i < RT_ELEMENTS(pThis->aCompDict); i++)
1006 fprintf(pOut, " { %#08x, %#04x }, // %s\n",
1007 pThis->aCompDict[i].offStrTab, (unsigned)pThis->aCompDict[i].cchString, pThis->aCompDict[i].pszString);
1008 fprintf(pOut, "};\n\n");
1009#endif
1010
1011
1012 /*
1013 * Write the string table data structure.
1014 */
1015 fprintf(pOut,
1016 "%sconst RTBLDPROGSTRTAB %s%s = \n"
1017 "{\n"
1018 " /*.pchStrTab = */ &g_achStrTab%s[0],\n"
1019 " /*.cchStrTab = */ sizeof(g_achStrTab%s),\n"
1020 ,
1021 pszScope, pszPrefix, pszBaseName, pszBaseName, pszBaseName);
1022#ifdef BLDPROG_STRTAB_WITH_COMPRESSION
1023 fprintf(pOut,
1024 " /*.cCompDict = */ %u,\n"
1025 " /*.paCompDict = */ &g_aCompDict%s[0]\n"
1026 "};\n"
1027 , (unsigned)RT_ELEMENTS(pThis->aCompDict), pszBaseName);
1028#else
1029 fprintf(pOut,
1030 " /*.cCompDict = */ 0,\n"
1031 " /*.paCompDict = */ NULL\n"
1032 "};\n");
1033#endif
1034}
1035
1036#endif /* __cplusplus && IN_RING3 */
1037
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette