VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/uniread.cpp@ 31157

Last change on this file since 31157 was 28876, checked in by vboxsync, 15 years ago

uniread.cpp: Updated to cope with version 5.2 of the spec. Preparing for extracting necessary decomposition and normalization information. Fixed Oracle (C).

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 39.6 KB
Line 
1/* $Id: uniread.cpp 28876 2010-04-28 19:01:33Z vboxsync $ */
2/** @file
3 * IPRT - Unicode Specification Reader.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/*******************************************************************************
28* Header Files *
29*******************************************************************************/
30#include <iprt/types.h>
31#include <iprt/stdarg.h>
32#include <iprt/ctype.h>
33
34#include <stdio.h>
35#include <string.h>
36#include <stdlib.h>
37
38
39/*******************************************************************************
40* Global Variables *
41*******************************************************************************/
/** When set, no output is produced on the primary output stream (see
 *  Stream1Printf).  Very useful when debugging this code. */
static bool g_fQuiet = false;
/** The name of the file we're currently parsing, as passed to OpenFile.
 *  Used as the location prefix in parse error messages; NULL after CloseFile. */
static const char *g_pszCurFile;
/** The current line number in g_pszCurFile.  Reset to 0 by OpenFile/CloseFile
 *  and incremented by GetLineFromFile for every read attempt. */
static unsigned g_iLine;
48
49
50/**
51 * Exit the program after printing a parse error.
52 *
53 * @param pszFormat The message.
54 * @param ... Format arguments.
55 */
56static void ParseError(const char *pszFormat, ...)
57{
58 va_list va;
59 va_start(va, pszFormat);
60 fprintf(stderr, "parse error: %s:%u: ", g_pszCurFile, g_iLine);
61 vfprintf(stderr, pszFormat, va);
62 va_end(va);
63 exit(1);
64}
65
/**
 * Strips a line in place.
 *
 * Leading spaces and tabs are skipped, everything from the first '#' onwards
 * (a comment) is cut off, and trailing spaces, tabs and line terminators are
 * removed by overwriting them with string terminators.
 *
 * @returns Pointer to the first non-blank char of the (modified) line.
 * @param   pszLine     The line string to strip.
 */
static char *StripLine(char *pszLine)
{
    /* Skip leading blanks. */
    for (; *pszLine == ' ' || *pszLine == '\t'; pszLine++)
    {
        /* nothing */
    }

    /* Cut off a trailing comment, or else locate the end of the string. */
    char *pszEnd = strchr(pszLine, '#');
    if (pszEnd != NULL)
        *pszEnd = '\0';
    else
        pszEnd = pszLine + strlen(pszLine);

    /* Trim trailing whitespace and line terminators. */
    while (pszEnd > pszLine)
    {
        char const chLast = pszEnd[-1];
        if (chLast != ' ' && chLast != '\t' && chLast != '\n' && chLast != '\r')
            break;
        *--pszEnd = '\0';
    }

    return pszLine;
}
97
98
/**
 * Checks whether a line carries no data and should be skipped.
 *
 * A line is skipped when, after any leading spaces, tabs and line
 * terminators, it is either empty or starts with the comment character '#'.
 *
 * @returns true if the line is blank or a comment, false otherwise.
 * @param   pszLine     The line to consider.
 */
static bool IsCommentOrBlankLine(const char *pszLine)
{
    for (;;)
    {
        switch (*pszLine)
        {
            /* Leading whitespace: keep looking. */
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                pszLine++;
                continue;

            /* Comment start or end of string: nothing to parse here. */
            case '#':
            case '\0':
                return true;

            default:
                return false;
        }
    }
}
110
111
/**
 * Gets the first semicolon-separated field of a string.
 *
 * The field is terminated in place (the ';' is overwritten) and stripped of
 * leading and trailing spaces, tabs and line terminators.
 *
 * @returns Pointer to the stripped first field.
 * @param   ppsz        Where to store the pointer to the start of the next
 *                      field.  If there is no further ';' this is set to the
 *                      string terminator.
 * @param   pszLine     The line string. (could also be *ppsz from a previous
 *                      FirstField/NextField call)
 */
static char *FirstField(char **ppsz, char *pszLine)
{
    /* Find the end of this field and publish where the next one starts. */
    char *pszEnd = strchr(pszLine, ';');
    if (pszEnd != NULL)
    {
        *pszEnd = '\0';
        *ppsz   = pszEnd + 1;
    }
    else
    {
        pszEnd = pszLine + strlen(pszLine);
        *ppsz  = pszEnd;
    }

    /* Strip leading whitespace. */
    for (;;)
    {
        char const chFirst = *pszLine;
        if (chFirst != ' ' && chFirst != '\t' && chFirst != '\r' && chFirst != '\n')
            break;
        pszLine++;
    }

    /* Strip trailing whitespace. */
    while (pszEnd > pszLine)
    {
        char const chLast = pszEnd[-1];
        if (chLast != ' ' && chLast != '\t' && chLast != '\n' && chLast != '\r')
            break;
        *--pszEnd = '\0';
    }

    return pszLine;
}
148
149
150/**
151 * Get the next field in a field enumeration.
152 *
153 * @returns Pointer to the next field.
154 * @param ppsz Where to get and store the string postition.
155 */
156static char *NextField(char **ppsz)
157{
158 return FirstField(ppsz, *ppsz);
159}
160
161
162/**
163 * Splits a decomposition field.
164 *
165 * This may start with a type that is enclosed in angle brackets.
166 *
167 * @returns Pointer to the mapping values following the type. @a *ppsz if empty.
168 * @param ppszType Pointer to the type field pointer. On input the type
169 * field contains the combined type and mapping string. On
170 * output this should only contain the type, no angle
171 * brackets. If no type specified, it is replaced with an
172 * empty string (const).
173 */
174static char *SplitDecompField(char **ppszType)
175{
176 /* Empty field? */
177 char *psz = *ppszType;
178 if (!*psz)
179 return psz;
180
181 /* No type? */
182 if (*psz != '<')
183 {
184 *ppszType = (char *)"";
185 return psz;
186 }
187
188 /* Split out the type. */
189 *ppszType = ++psz;
190 psz = strchr(psz, '>');
191 if (!psz)
192 {
193 ParseError("Bad Decomposition Type/Mappings\n");
194 return *ppszType;
195 }
196 *psz++ = '\0';
197
198 psz = StripLine(psz);
199 if (!*psz)
200 ParseError("Missing decomposition mappings\n");
201 return psz;
202}
203
204/**
205 * Converts a code point field to a number.
206 * @returns Code point.
207 * @param psz The field string.
208 */
209static RTUNICP ToNum(const char *psz)
210{
211 char *pszEnd = NULL;
212 unsigned long ul = strtoul(psz, &pszEnd, 16);
213 if (pszEnd && *pszEnd)
214 ParseError("failed converting '%s' to a number!\n", psz);
215 return (RTUNICP)ul;
216}
217
218
219/**
220 * Same as ToNum except that if the field is empty the Default is returned.
221 */
222static RTUNICP ToNumDefault(const char *psz, RTUNICP Default)
223{
224 if (*psz)
225 return ToNum(psz);
226 return Default;
227}
228
229
230/**
231 * Converts a code point range to numbers.
232 * @returns The start code point.\
233 * @returns ~(RTUNICP)0 on failure.
234 * @param psz The field string.
235 * @param pLast Where to store the last code point in the range.
236 */
237static RTUNICP ToRange(const char *psz, PRTUNICP pLast)
238{
239 char *pszEnd = NULL;
240 unsigned long ulStart = strtoul(psz, &pszEnd, 16);
241 unsigned long ulLast = ulStart;
242 if (pszEnd && *pszEnd)
243 {
244 if (*pszEnd == '.')
245 {
246 while (*pszEnd == '.')
247 pszEnd++;
248 ulLast = strtoul(pszEnd, &pszEnd, 16);
249 if (pszEnd && *pszEnd)
250 {
251 ParseError("failed converting '%s' to a number!\n", psz);
252 return ~(RTUNICP)0;
253 }
254 }
255 else
256 {
257 ParseError("failed converting '%s' to a number!\n", psz);
258 return ~(RTUNICP)0;
259 }
260 }
261 *pLast = (RTUNICP)ulLast;
262 return (RTUNICP)ulStart;
263
264}
265
266/**
267 * For converting the decompisition mappings field and similar.
268 *
269 * @returns Mapping array or NULL if none.
270 * @param psz The string to convert. Can be empty.
271 * @param pcEntries Where to store the number of entries.
272 * @param cMax The max number of entries.
273 */
274static PRTUNICP ToMapping(char *psz, unsigned *pcEntries, unsigned cMax)
275{
276 PRTUNICP paCps = NULL;
277 unsigned cAlloc = 0;
278 unsigned i = 0;
279
280 /* Convert the code points. */
281 while (psz)
282 {
283 /* skip leading spaces */
284 while (RT_C_IS_BLANK(*psz))
285 psz++;
286
287 /* the end? */
288 if (!*psz)
289 break;
290
291 /* room left? */
292 if (i >= cMax)
293 {
294 ParseError("Too many mappings.\n");
295 break;
296 }
297 if (i >= cAlloc)
298 {
299 cAlloc += 4;
300 paCps = (PRTUNICP)realloc(paCps, cAlloc * sizeof(paCps[0]));
301 if (!paCps)
302 {
303 fprintf(stderr, "out of memory (%u)\n", (unsigned)(cAlloc * sizeof(paCps[0])));
304 exit(1);
305 }
306 }
307
308 /* Find the end. */
309 char *pszThis = psz;
310 while (RT_C_IS_XDIGIT(*psz))
311 psz++;
312 if (*psz && !RT_C_IS_BLANK(*psz))
313 ParseError("Malformed mappings.\n");
314 if (*psz)
315 *psz++ = '\0';
316
317 /* Convert to number and add it. */
318 paCps[i++] = ToNum(pszThis);
319 }
320
321 *pcEntries = i;
322 return paCps;
323}
324
325
/**
 * Duplicates a string, sharing a constant for empty strings to save memory.
 *
 * Terminates the program with exit status 1 if the allocation fails.
 *
 * @returns Pointer to the string copy.  For an empty input this is a string
 *          literal which must not be modified or freed.
 * @param   pszStr      The string to duplicate.
 */
static char *DupStr(const char *pszStr)
{
    if (*pszStr == '\0')
        return (char *)"";

    char *pszCopy = strdup(pszStr);
    if (!pszCopy)
    {
        fprintf(stderr, "out of memory!\n");
        exit(1);
    }
    return pszCopy;
}
343
344
/**
 * Per code point information collected from the Unicode data files.
 *
 * g_aCPInfo holds one entry for every possible (and impossible) code point,
 * U+000000 thru U+10FFFF, indexed by code point.  (The original comment dates
 * the layout to Unicode 4.1.)
 */
struct CPINFO
{
    /** The code point this entry describes (same as the array index). */
    RTUNICP CodePoint;
    /** Simple upper case mapping; the code point itself if there is none. */
    RTUNICP SimpleUpperCaseMapping;
    /** Simple lower case mapping; the code point itself if there is none. */
    RTUNICP SimpleLowerCaseMapping;
    /** Simple title case mapping; the code point itself if there is none. */
    RTUNICP SimpleTitleCaseMapping;
    /** The canonical combining class field of UnicodeData.txt. */
    unsigned CanonicalCombiningClass;
    /** The decomposition type without angle brackets; empty string if none. */
    const char *pszDecompositionType;
    /** Number of entries in paDecompositionMapping. */
    unsigned cDecompositionMapping;
    /** The decomposition mapping code points; NULL if there are none. */
    PRTUNICP paDecompositionMapping;
    /** The character name; empty string for null entries. */
    const char *pszName;
    /** Set if this is an unused entry */
    unsigned fNullEntry : 1;

    /* Binary properties, set by ApplyProperty when the property file lists
       the correspondingly named property for this code point. */
    unsigned fAlphabetic : 1;
    unsigned fASCIIHexDigit : 1;
    unsigned fBidiControl : 1;
    unsigned fCaseIgnorable : 1;
    unsigned fCased : 1;
    unsigned fChangesWhenCasefolded : 1;
    unsigned fChangesWhenCasemapped : 1;
    unsigned fChangesWhenLowercased : 1;
    unsigned fChangesWhenTitlecased : 1;
    unsigned fChangesWhenUppercased : 1;
    unsigned fDash : 1;
    unsigned fDefaultIgnorableCodePoint : 1;
    unsigned fDeprecated : 1;
    unsigned fDiacritic : 1;
    unsigned fExtender : 1;
    unsigned fGraphemeBase : 1;
    unsigned fGraphemeExtend : 1;
    unsigned fGraphemeLink : 1;
    unsigned fHexDigit : 1;
    unsigned fHyphen : 1;
    unsigned fIDContinue : 1;
    unsigned fIdeographic : 1;
    unsigned fIDSBinaryOperator : 1;
    unsigned fIDStart : 1;
    unsigned fIDSTrinaryOperator : 1;
    unsigned fJoinControl : 1;
    unsigned fLogicalOrderException : 1;
    unsigned fLowercase : 1;
    unsigned fMath : 1;
    unsigned fNoncharacterCodePoint : 1;
    unsigned fOtherAlphabetic : 1;
    unsigned fOtherDefaultIgnorableCodePoint : 1;
    unsigned fOtherGraphemeExtend : 1;
    unsigned fOtherIDContinue : 1;
    unsigned fOtherIDStart : 1;
    unsigned fOtherLowercase : 1;
    unsigned fOtherMath : 1;
    unsigned fOtherUppercase : 1;
    unsigned fPatternSyntax : 1;
    unsigned fPatternWhiteSpace : 1;
    unsigned fQuotationMark : 1;
    unsigned fRadical : 1;
    unsigned fSoftDotted : 1;
    unsigned fSTerm : 1;
    unsigned fTerminalPunctuation : 1;
    unsigned fUnifiedIdeograph : 1;
    unsigned fUppercase : 1;
    unsigned fVariationSelector : 1;
    unsigned fWhiteSpace : 1;
    unsigned fXIDContinue : 1;
    unsigned fXIDStart : 1;

    /** @name DerivedNormalizationProps.txt
     * The fInvXXX_QC members hold the inverted quick check value as produced
     * by YesNoMaybePropertyValueInv: 0 = yes, 1 = no, 2 = maybe.
     * @{ */
    unsigned fFullCompositionExclusion : 1;
    unsigned fInvNFC_QC : 2; /**< If 1 (NFC_QC == N) then code point 100% sure not part of NFC string. */
    unsigned fInvNFD_QC : 2; /**< If 1 (NFD_QC == N) then code point 100% sure not part of NFD string. */
    unsigned fInvNFKC_QC : 2;
    unsigned fInvNFKD_QC : 2;
    unsigned fExpandsOnNFC : 1;
    unsigned fExpandsOnNFD : 1;
    unsigned fExpandsOnNFKC : 1;
    unsigned fExpandsOnNFKD : 1;
    /** @} */

    /* unprocessed stuff, so far.  These are kept as the raw field strings
       from UnicodeData.txt (empty string when absent). */
    const char *pszGeneralCategory;
    const char *pszBidiClass;
    const char *pszNumericType;
    const char *pszNumericValueD;
    const char *pszNumericValueN;
    const char *pszBidiMirrored;
    const char *pszUnicode1Name;
    const char *pszISOComment;
} g_aCPInfo[0x110000];
437
438
439/**
440 * Creates a 'null' entry at i.
441 * @param i The entry in question.
442 */
443static void NullEntry(unsigned i)
444{
445 g_aCPInfo[i].CodePoint = i;
446 g_aCPInfo[i].fNullEntry = 1;
447 g_aCPInfo[i].SimpleUpperCaseMapping = i;
448 g_aCPInfo[i].SimpleLowerCaseMapping = i;
449 g_aCPInfo[i].SimpleTitleCaseMapping = i;
450 g_aCPInfo[i].pszDecompositionType = "";
451 g_aCPInfo[i].cDecompositionMapping = 0;
452 g_aCPInfo[i].paDecompositionMapping = NULL;
453 g_aCPInfo[i].pszName = "";
454 g_aCPInfo[i].pszGeneralCategory = "";
455 g_aCPInfo[i].pszBidiClass = "";
456 g_aCPInfo[i].pszNumericType = "";
457 g_aCPInfo[i].pszNumericValueD = "";
458 g_aCPInfo[i].pszNumericValueN = "";
459 g_aCPInfo[i].pszBidiMirrored = "";
460 g_aCPInfo[i].pszUnicode1Name = "";
461 g_aCPInfo[i].pszISOComment = "";
462}
463
464
465/**
466 * Open a file for reading, optionally with a base path prefixed.
467 *
468 * @returns file stream on success, NULL w/ complaint on failure.
469 * @param pszBasePath The base path, can be NULL.
470 * @param pszFilename The name of the file to open.
471 */
472static FILE *OpenFile(const char *pszBasePath, const char *pszFilename)
473{
474 FILE *pFile;
475 if ( !pszBasePath
476 || *pszFilename == '/'
477#if defined(_MSC_VER) || defined(__OS2__)
478 || *pszFilename == '\\'
479 || (*pszFilename && pszFilename[1] == ':')
480#endif
481 )
482 {
483 pFile = fopen(pszFilename, "r");
484 if (!pFile)
485 fprintf(stderr, "uniread: failed to open '%s' for reading\n", pszFilename);
486 }
487 else
488 {
489 size_t cchBasePath = strlen(pszBasePath);
490 size_t cchFilename = strlen(pszFilename);
491 char *pszFullName = (char *)malloc(cchBasePath + 1 + cchFilename + 1);
492 if (!pszFullName)
493 {
494 fprintf(stderr, "uniread: failed to allocate %d bytes\n", (int)(cchBasePath + 1 + cchFilename + 1));
495 return NULL;
496 }
497
498 memcpy(pszFullName, pszBasePath, cchBasePath);
499 pszFullName[cchBasePath] = '/';
500 memcpy(&pszFullName[cchBasePath + 1], pszFilename, cchFilename + 1);
501
502 pFile = fopen(pszFullName, "r");
503 if (!pFile)
504 fprintf(stderr, "uniread: failed to open '%s' for reading\n", pszFullName);
505 free(pszFullName);
506 }
507 g_pszCurFile = pszFilename;
508 g_iLine = 0;
509 return pFile;
510}
511
512
513/**
514 * Wrapper around fgets that keep track of the line number.
515 *
516 * @returns See fgets.
517 * @param pszBuf The buffer. See fgets for output definition.
518 * @param cbBuf The buffer size.
519 * @param pFile The file to read from.
520 */
521static char *GetLineFromFile(char *pszBuf, int cbBuf, FILE *pFile)
522{
523 g_iLine++;
524 return fgets(pszBuf, cbBuf, pFile);
525}
526
527
528/**
529 * Closes a file opened by OpenFile
530 *
531 * @param pFile The file to close.
532 */
533static void CloseFile(FILE *pFile)
534{
535 g_pszCurFile = NULL;
536 g_iLine = 0;
537 fclose(pFile);
538}
539
540
541/**
542 * Read the UnicodeData.txt file.
543 * @returns 0 on success.
544 * @returns !0 on failure.
545 * @param pszBasePath The base path, can be NULL.
546 * @param pszFilename The name of the file.
547 */
548static int ReadUnicodeData(const char *pszBasePath, const char *pszFilename)
549{
550 /*
551 * Open input.
552 */
553 FILE *pFile = OpenFile(pszBasePath, pszFilename);
554 if (!pFile)
555 return 1;
556
557 /*
558 * Parse the input and spit out the output.
559 */
560 char szLine[4096];
561 RTUNICP i = 0;
562 while (GetLineFromFile(szLine, sizeof(szLine), pFile) != NULL)
563 {
564 if (IsCommentOrBlankLine(szLine))
565 continue;
566
567 char *pszCurField;
568 char *pszCodePoint = FirstField(&pszCurField, StripLine(szLine)); /* 0 */
569 char *pszName = NextField(&pszCurField); /* 1 */
570 char *pszGeneralCategory = NextField(&pszCurField); /* 2 */
571 char *pszCanonicalCombiningClass = NextField(&pszCurField); /* 3 */
572 char *pszBidiClass = NextField(&pszCurField); /* 4 */
573 char *pszDecompositionType = NextField(&pszCurField); /* 5 */
574 char *pszDecompositionMapping = SplitDecompField(&pszDecompositionType);
575 char *pszNumericType = NextField(&pszCurField); /* 6 */
576 char *pszNumericValueD = NextField(&pszCurField); /* 7 */
577 char *pszNumericValueN = NextField(&pszCurField); /* 8 */
578 char *pszBidiMirrored = NextField(&pszCurField); /* 9 */
579 char *pszUnicode1Name = NextField(&pszCurField); /* 10 */
580 char *pszISOComment = NextField(&pszCurField); /* 11 */
581 char *pszSimpleUpperCaseMapping = NextField(&pszCurField); /* 12 */
582 char *pszSimpleLowerCaseMapping = NextField(&pszCurField); /* 13 */
583 char *pszSimpleTitleCaseMapping = NextField(&pszCurField); /* 14 */
584
585 RTUNICP CodePoint = ToNum(pszCodePoint);
586 if (CodePoint >= RT_ELEMENTS(g_aCPInfo))
587 {
588 ParseError("U+05X is out of range\n", CodePoint);
589 continue;
590 }
591
592 /* catchup? */
593 while (i < CodePoint)
594 NullEntry(i++);
595 if (i != CodePoint)
596 {
597 ParseError("i=%d CodePoint=%u\n", i, CodePoint);
598 CloseFile(pFile);
599 return 1;
600 }
601
602 /* this one */
603 g_aCPInfo[i].CodePoint = i;
604 g_aCPInfo[i].fNullEntry = 0;
605 g_aCPInfo[i].pszName = DupStr(pszName);
606 g_aCPInfo[i].SimpleUpperCaseMapping = ToNumDefault(pszSimpleUpperCaseMapping, CodePoint);
607 g_aCPInfo[i].SimpleLowerCaseMapping = ToNumDefault(pszSimpleLowerCaseMapping, CodePoint);
608 g_aCPInfo[i].SimpleTitleCaseMapping = ToNumDefault(pszSimpleTitleCaseMapping, CodePoint);
609 g_aCPInfo[i].CanonicalCombiningClass = ToNum(pszCanonicalCombiningClass);
610 g_aCPInfo[i].pszDecompositionType = DupStr(pszDecompositionType);
611 g_aCPInfo[i].paDecompositionMapping = ToMapping(pszDecompositionMapping, &g_aCPInfo[i].cDecompositionMapping, 20);
612 g_aCPInfo[i].pszGeneralCategory = DupStr(pszGeneralCategory);
613 g_aCPInfo[i].pszBidiClass = DupStr(pszBidiClass);
614 g_aCPInfo[i].pszNumericType = DupStr(pszNumericType);
615 g_aCPInfo[i].pszNumericValueD = DupStr(pszNumericValueD);
616 g_aCPInfo[i].pszNumericValueN = DupStr(pszNumericValueN);
617 g_aCPInfo[i].pszBidiMirrored = DupStr(pszBidiMirrored);
618 g_aCPInfo[i].pszUnicode1Name = DupStr(pszUnicode1Name);
619 g_aCPInfo[i].pszISOComment = DupStr(pszISOComment);
620 i++;
621 }
622
623 /* catchup? */
624 while (i < RT_ELEMENTS(g_aCPInfo))
625 NullEntry(i++);
626 CloseFile(pFile);
627
628 return 0;
629}
630
631
632/**
633 * Generates excluded data.
634 *
635 * @returns 0 on success, exit code on failure.
636 */
637static int GenerateExcludedData(void)
638{
639 /*
640 * Hangul Syllables U+AC00 to U+D7A3.
641 */
642 for (RTUNICP i = 0xac00; i <= 0xd7a3; i++)
643 {
644 g_aCPInfo[i].fNullEntry = 0;
645 g_aCPInfo[i].fInvNFD_QC = 1;
646 /** @todo generate the decomposition: http://unicode.org/reports/tr15/#Hangul
647 * */
648 }
649
650 /** @todo
651 * CJK Ideographs Extension A (U+3400 - U+4DB5)
652 * CJK Ideographs (U+4E00 - U+9FA5)
653 * CJK Ideograph Extension B (U+20000 - U+2A6D6)
654 * CJK Ideograph Extension C (U+2A700 - U+2B734)
655 */
656
657 return 0;
658}
659
660
661
662/**
663 * Worker for ApplyProperty that handles a yes, no, maybe property value.
664 *
665 * @returns 0 (NO), 1 (YES), 2 (MAYBE).
666 * @param ppszNextField The field cursor, input and output.
667 */
668static int YesNoMaybePropertyValue(char **ppszNextField)
669{
670 if (!**ppszNextField)
671 {
672 ParseError("Missing Y/N/M field\n");
673 return 0;
674 }
675 char *psz = NextField(ppszNextField);
676 if (!strcmp(psz, "N"))
677 return 0;
678 if (!strcmp(psz, "Y"))
679 return 1;
680 if (!strcmp(psz, "M"))
681 return 2;
682 ParseError("Unexpected Y/N/M value: '%s'\n", psz);
683 return 0;
684}
685
686
687/**
688 * Inverted version of YesNoMaybePropertyValue
689 *
690 * @returns 1 (NO), 0 (YES), 2 (MAYBE).
691 * @param ppszNextField The field cursor, input and output.
692 */
693static int YesNoMaybePropertyValueInv(char **ppszNextField)
694{
695 unsigned rc = YesNoMaybePropertyValue(ppszNextField);
696 switch (rc)
697 {
698 case 0: return 1;
699 case 1: return 0;
700 default: return rc;
701 }
702}
703
704
/**
 * Applies a property to a code point.
 *
 * Binary properties set the corresponding flag in the code point's g_aCPInfo
 * entry.  The quick check properties (NFC_QC, NFD_QC, NFKC_QC, NFKD_QC)
 * additionally consume a Y/N/M value field and store it inverted.  A few
 * properties are recognized but ignored; anything else is a parse error.
 *
 * @param StartCP The code point.
 * @param pszProperty The property name.
 * @param pszNextField The rest of the line following the property field.
 *                     Only the quick check properties consume a field from
 *                     it; whatever remains afterwards must be empty, except
 *                     for the ignored properties which skip that check.
 */
static void ApplyProperty(RTUNICP StartCP, const char *pszProperty, char *pszNextField)
{
    if (StartCP >= RT_ELEMENTS(g_aCPInfo))
    {
        ParseError("U+%06X is out of the g_aCPInfo range.\n", StartCP);
        return;
    }
    struct CPINFO *pCPInfo = &g_aCPInfo[StartCP];
    /* string switch */
    if (!strcmp(pszProperty, "ASCII_Hex_Digit")) pCPInfo->fASCIIHexDigit = 1;
    else if (!strcmp(pszProperty, "Alphabetic")) pCPInfo->fAlphabetic = 1;
    else if (!strcmp(pszProperty, "Bidi_Control")) pCPInfo->fBidiControl = 1;
    else if (!strcmp(pszProperty, "Case_Ignorable")) pCPInfo->fCaseIgnorable = 1;
    else if (!strcmp(pszProperty, "Cased")) pCPInfo->fCased = 1;
    else if (!strcmp(pszProperty, "Changes_When_Casefolded")) pCPInfo->fChangesWhenCasefolded = 1;
    else if (!strcmp(pszProperty, "Changes_When_Casemapped")) pCPInfo->fChangesWhenCasemapped = 1;
    else if (!strcmp(pszProperty, "Changes_When_Lowercased")) pCPInfo->fChangesWhenLowercased = 1;
    else if (!strcmp(pszProperty, "Changes_When_Titlecased")) pCPInfo->fChangesWhenTitlecased = 1;
    else if (!strcmp(pszProperty, "Changes_When_Uppercased")) pCPInfo->fChangesWhenUppercased = 1;
    else if (!strcmp(pszProperty, "Dash")) pCPInfo->fDash = 1;
    else if (!strcmp(pszProperty, "Default_Ignorable_Code_Point")) pCPInfo->fDefaultIgnorableCodePoint = 1;
    else if (!strcmp(pszProperty, "Deprecated")) pCPInfo->fDeprecated = 1;
    else if (!strcmp(pszProperty, "Diacritic")) pCPInfo->fDiacritic = 1;
    else if (!strcmp(pszProperty, "Extender")) pCPInfo->fExtender = 1;
    else if (!strcmp(pszProperty, "Grapheme_Base")) pCPInfo->fGraphemeBase = 1;
    else if (!strcmp(pszProperty, "Grapheme_Extend")) pCPInfo->fGraphemeExtend = 1;
    else if (!strcmp(pszProperty, "Grapheme_Link")) pCPInfo->fGraphemeLink = 1;
    else if (!strcmp(pszProperty, "Hex_Digit")) pCPInfo->fHexDigit = 1;
    else if (!strcmp(pszProperty, "Hyphen")) pCPInfo->fHyphen = 1;
    else if (!strcmp(pszProperty, "ID_Continue")) pCPInfo->fIDContinue = 1;
    else if (!strcmp(pszProperty, "ID_Start")) pCPInfo->fIDStart = 1;
    else if (!strcmp(pszProperty, "Ideographic")) pCPInfo->fIdeographic = 1;
    else if (!strcmp(pszProperty, "IDS_Binary_Operator")) pCPInfo->fIDSBinaryOperator = 1;
    else if (!strcmp(pszProperty, "IDS_Trinary_Operator")) pCPInfo->fIDSTrinaryOperator = 1;
    else if (!strcmp(pszProperty, "Join_Control")) pCPInfo->fJoinControl = 1;
    else if (!strcmp(pszProperty, "Logical_Order_Exception")) pCPInfo->fLogicalOrderException = 1;
    else if (!strcmp(pszProperty, "Lowercase")) pCPInfo->fLowercase = 1;
    else if (!strcmp(pszProperty, "Math")) pCPInfo->fMath = 1;
    else if (!strcmp(pszProperty, "Noncharacter_Code_Point")) pCPInfo->fNoncharacterCodePoint = 1;
    else if (!strcmp(pszProperty, "Other_Alphabetic")) pCPInfo->fOtherAlphabetic = 1;
    else if (!strcmp(pszProperty, "Other_Default_Ignorable_Code_Point")) pCPInfo->fOtherDefaultIgnorableCodePoint = 1;
    else if (!strcmp(pszProperty, "Other_Grapheme_Extend")) pCPInfo->fOtherGraphemeExtend = 1;
    else if (!strcmp(pszProperty, "Other_ID_Continue")) pCPInfo->fOtherIDContinue = 1;
    else if (!strcmp(pszProperty, "Other_ID_Start")) pCPInfo->fOtherIDStart = 1;
    else if (!strcmp(pszProperty, "Other_Lowercase")) pCPInfo->fOtherLowercase = 1;
    else if (!strcmp(pszProperty, "Other_Math")) pCPInfo->fOtherMath = 1;
    else if (!strcmp(pszProperty, "Other_Uppercase")) pCPInfo->fOtherUppercase = 1;
    else if (!strcmp(pszProperty, "Pattern_Syntax")) pCPInfo->fPatternSyntax = 1;
    else if (!strcmp(pszProperty, "Pattern_White_Space")) pCPInfo->fPatternWhiteSpace = 1;
    else if (!strcmp(pszProperty, "Quotation_Mark")) pCPInfo->fQuotationMark = 1;
    else if (!strcmp(pszProperty, "Radical")) pCPInfo->fRadical = 1;
    else if (!strcmp(pszProperty, "Soft_Dotted")) pCPInfo->fSoftDotted = 1;
    else if (!strcmp(pszProperty, "STerm")) pCPInfo->fSTerm = 1;
    else if (!strcmp(pszProperty, "Terminal_Punctuation")) pCPInfo->fTerminalPunctuation = 1;
    else if (!strcmp(pszProperty, "Unified_Ideograph")) pCPInfo->fUnifiedIdeograph = 1;
    else if (!strcmp(pszProperty, "Uppercase")) pCPInfo->fUppercase = 1;
    else if (!strcmp(pszProperty, "Variation_Selector")) pCPInfo->fVariationSelector = 1;
    else if (!strcmp(pszProperty, "White_Space")) pCPInfo->fWhiteSpace = 1;
    else if (!strcmp(pszProperty, "XID_Continue")) pCPInfo->fXIDContinue = 1;
    else if (!strcmp(pszProperty, "XID_Start")) pCPInfo->fXIDStart = 1;
    /* DerivedNormalizationProps: */
    else if (!strcmp(pszProperty, "FC_NFKC")) return; /* ignored */
    else if (!strcmp(pszProperty, "Full_Composition_Exclusion")) pCPInfo->fFullCompositionExclusion = 1;
    /* The quick check values advance the local pszNextField cursor past the Y/N/M field. */
    else if (!strcmp(pszProperty, "NFC_QC")) pCPInfo->fInvNFC_QC = YesNoMaybePropertyValueInv(&pszNextField);
    else if (!strcmp(pszProperty, "NFD_QC")) pCPInfo->fInvNFD_QC = YesNoMaybePropertyValueInv(&pszNextField);
    else if (!strcmp(pszProperty, "NFKC_QC")) pCPInfo->fInvNFKC_QC = YesNoMaybePropertyValueInv(&pszNextField);
    else if (!strcmp(pszProperty, "NFKD_QC")) pCPInfo->fInvNFKD_QC = YesNoMaybePropertyValueInv(&pszNextField);
    else if (!strcmp(pszProperty, "Expands_On_NFC")) pCPInfo->fExpandsOnNFC = 1;
    else if (!strcmp(pszProperty, "Expands_On_NFD")) pCPInfo->fExpandsOnNFD = 1;
    else if (!strcmp(pszProperty, "Expands_On_NFKC")) pCPInfo->fExpandsOnNFKC = 1;
    else if (!strcmp(pszProperty, "Expands_On_NFKD")) pCPInfo->fExpandsOnNFKD = 1;
    else if (!strcmp(pszProperty, "NFKC_CF")) return; /*ignore */
    else if (!strcmp(pszProperty, "Changes_When_NFKC_Casefolded")) return; /*ignore */
    else
    {
        ParseError("Unknown property '%s'\n", pszProperty);
        return;
    }

    /* Every field must have been consumed by now; the ignored properties returned above. */
    if (pszNextField && *pszNextField)
        ParseError("Unexpected next field: '%s'\n", pszNextField);
}
793
794
795/**
796 * Reads a property file.
797 *
798 * There are several property files, this code can read all
799 * of those but will only make use of the properties it recognizes.
800 *
801 * @returns 0 on success.
802 * @returns !0 on failure.
803 * @param pszBasePath The base path, can be NULL.
804 * @param pszFilename The name of the file.
805 */
806static int ReadProperties(const char *pszBasePath, const char *pszFilename)
807{
808 /*
809 * Open input.
810 */
811 FILE *pFile = OpenFile(pszBasePath, pszFilename);
812 if (!pFile)
813 return 1;
814
815 /*
816 * Parse the input and spit out the output.
817 */
818 char szLine[4096];
819 while (GetLineFromFile(szLine, sizeof(szLine), pFile) != NULL)
820 {
821 if (IsCommentOrBlankLine(szLine))
822 continue;
823 char *pszCurField;
824 char *pszRange = FirstField(&pszCurField, StripLine(szLine));
825 char *pszProperty = NextField(&pszCurField);
826 if (!*pszProperty)
827 {
828 ParseError("no property field.\n");
829 continue;
830 }
831
832 RTUNICP LastCP;
833 RTUNICP StartCP = ToRange(pszRange, &LastCP);
834 if (StartCP == ~(RTUNICP)0)
835 continue;
836
837 while (StartCP <= LastCP)
838 ApplyProperty(StartCP++, pszProperty, pszCurField);
839 }
840
841 CloseFile(pFile);
842
843 return 0;
844}
845
846
/**
 * Appends a flag to the flag expression string.
 *
 * If the string isn't empty, the flag is separated from what is already
 * there by " | ".  The caller must make sure the buffer is large enough.
 *
 * @returns @a psz.
 * @param   psz         The flag string buffer to append to.
 * @param   pszFlag     The flag (name) to append.
 */
static char *AppendFlag(char *psz, const char *pszFlag)
{
    size_t const cchCur = strlen(psz);
    char        *pszDst = psz + cchCur;

    if (cchCur > 0)
    {
        memcpy(pszDst, " | ", 3);
        pszDst += 3;
    }
    strcpy(pszDst, pszFlag);
    return psz;
}
862
863/**
864 * Calcs the flags for a code point.
865 * @returns true if there is a flag.
866 * @returns false if the isn't.
867 */
868static bool CalcFlags(struct CPINFO *pInfo, char *pszFlags)
869{
870 pszFlags[0] = '\0';
871 /** @todo read the specs on this other vs standard stuff, and check out the finer points */
872 if (pInfo->fAlphabetic || pInfo->fOtherAlphabetic)
873 AppendFlag(pszFlags, "RTUNI_ALPHA");
874 if (pInfo->fHexDigit || pInfo->fASCIIHexDigit)
875 AppendFlag(pszFlags, "RTUNI_XDIGIT");
876 if (!strcmp(pInfo->pszGeneralCategory, "Nd"))
877 AppendFlag(pszFlags, "RTUNI_DDIGIT");
878 if (pInfo->fWhiteSpace)
879 AppendFlag(pszFlags, "RTUNI_WSPACE");
880 if (pInfo->fUppercase || pInfo->fOtherUppercase)
881 AppendFlag(pszFlags, "RTUNI_UPPER");
882 if (pInfo->fLowercase || pInfo->fOtherLowercase)
883 AppendFlag(pszFlags, "RTUNI_LOWER");
884 //if (pInfo->???)
885 // AppendFlag(pszFlags, "RTUNI_BSPACE");
886 if (pInfo->fInvNFD_QC != 0 || pInfo->fInvNFC_QC != 0)
887 {
888 AppendFlag(pszFlags, "RTUNI_QC_NFX");
889 if (!pInfo->paDecompositionMapping && pInfo->fInvNFD_QC)
890 fprintf(stderr, "uniread: U+%05X is QC_NFD but has no mappings.\n", pInfo->CodePoint);
891 else if (*pInfo->pszDecompositionType && pInfo->fInvNFD_QC)
892 fprintf(stderr, "uniread: U+%05X is QC_NFD but has no canonical mappings.\n", pInfo->CodePoint);
893 }
894 else if (pInfo->paDecompositionMapping && !*pInfo->pszDecompositionType)
895 fprintf(stderr, "uniread: U+%05X is not QC_NFX but has canonical mappings.\n", pInfo->CodePoint);
896
897 if (!*pszFlags)
898 {
899 pszFlags[0] = '0';
900 pszFlags[1] = '\0';
901 return false;
902 }
903 return true;
904}
905
906
907/**
908 * printf wrapper for the primary output stream.
909 *
910 * @returns See vfprintf.
911 * @param pszFormat The vfprintf format string.
912 * @param ... The format arguments.
913 */
914static int Stream1Printf(const char *pszFormat, ...)
915{
916 int cch;
917 va_list va;
918 va_start(va, pszFormat);
919 if (!g_fQuiet)
920 cch = vfprintf(stdout, pszFormat, va);
921 else
922 cch = strlen(pszFormat);
923 va_end(va);
924 return cch;
925}
926
927
/** The data store for stream two: output is collected here by Stream2Printf
 *  and written to the primary stream by Stream2Flush. */
static char g_szStream2[10240];
/** The current write offset into g_szStream2. */
static unsigned volatile g_offStream2 = 0;
931
932/**
933 * Initializes the 2nd steam.
934 */
935static void Stream2Init(void)
936{
937 g_szStream2[0] = '\0';
938 g_offStream2 = 0;
939}
940
941/**
942 * Flushes the 2nd stream to stdout.
943 */
944static int Stream2Flush(void)
945{
946 g_szStream2[g_offStream2] = '\0';
947 Stream1Printf("%s", g_szStream2);
948 Stream2Init();
949 return 0;
950}
951
952/**
953 * printf to the 2nd stream.
954 */
955static int Stream2Printf(const char *pszFormat, ...)
956{
957 unsigned offStream2 = g_offStream2;
958 va_list va;
959 va_start(va, pszFormat);
960 int cch = vsprintf(&g_szStream2[offStream2], pszFormat, va);
961 va_end(va);
962 offStream2 += cch;
963 if (offStream2 >= sizeof(g_szStream2))
964 {
965 fprintf(stderr, "error: stream2 overflow!\n");
966 exit(1);
967 }
968 g_offStream2 = offStream2;
969 return cch;
970}
971
972
/**
 * Print the unidata.cpp file header and include list.
 *
 * The header consists of a file comment naming the generator together with
 * its build date and time, the license text and the iprt/uni.h include.  It
 * goes to the primary output stream.
 *
 * @returns 0.
 * @param argv0 The program name to credit as the generator.
 */
int PrintHeader(const char *argv0)
{
    Stream1Printf("/** @file\n"
                  " *\n"
                  " * IPRT - Unicode Tables.\n"
                  " *\n"
                  " * Automatically Generated by %s (" __DATE__ " " __TIME__ ")\n"
                  " */\n"
                  "\n"
                  "/*\n"
                  " * Copyright (C) 2006-2010 Oracle Corporation\n"
                  " *\n"
                  " * This file is part of VirtualBox Open Source Edition (OSE), as\n"
                  " * available from http://www.virtualbox.org. This file is free software;\n"
                  " * you can redistribute it and/or modify it under the terms of the GNU\n"
                  " * General Public License (GPL) as published by the Free Software\n"
                  " * Foundation, in version 2 as it comes in the \"COPYING\" file of the\n"
                  " * VirtualBox OSE distribution. VirtualBox OSE is distributed in the\n"
                  " * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.\n"
                  " *\n"
                  " * The contents of this file may alternatively be used under the terms\n"
                  " * of the Common Development and Distribution License Version 1.0\n"
                  " * (CDDL) only, as it comes in the \"COPYING.CDDL\" file of the\n"
                  " * VirtualBox OSE distribution, in which case the provisions of the\n"
                  " * CDDL are applicable instead of those of the GPL.\n"
                  " *\n"
                  " * You may elect to license modified versions of this file under the\n"
                  " * terms and conditions of either the GPL or the CDDL or both.\n"
                  " */\n"
                  "\n"
                  "#include <iprt/uni.h>\n"
                  "\n",
                  argv0);
    return 0;
}
1011
1012
1013/**
1014 * Print the flag tables.
1015 */
1016int PrintFlags(void)
1017{
1018 /*
1019 * Print flags table.
1020 */
1021 Stream2Init();
1022 Stream2Printf("const RTUNIFLAGSRANGE g_aRTUniFlagRanges[] =\n"
1023 "{\n");
1024 RTUNICP i = 0;
1025 int iStart = -1;
1026 while (i < RT_ELEMENTS(g_aCPInfo))
1027 {
1028 /* figure how far off the next chunk is */
1029 char szFlags[256];
1030 unsigned iNonNull = i;
1031 while ( iNonNull < RT_ELEMENTS(g_aCPInfo)
1032 && iNonNull >= 256
1033 && (g_aCPInfo[iNonNull].fNullEntry || !CalcFlags(&g_aCPInfo[iNonNull], szFlags)) )
1034 iNonNull++;
1035 if (iNonNull - i > 4096 || iNonNull == RT_ELEMENTS(g_aCPInfo))
1036 {
1037 if (iStart >= 0)
1038 {
1039 Stream1Printf("};\n\n");
1040 Stream2Printf(" { 0x%06x, 0x%06x, &g_afRTUniFlags0x%06x[0] },\n", iStart, i, iStart);
1041 iStart = -1;
1042 }
1043 i = iNonNull;
1044 }
1045 else
1046 {
1047 if (iStart < 0)
1048 {
1049 Stream1Printf("static const uint8_t g_afRTUniFlags0x%06x[] = \n"
1050 "{\n", i);
1051 iStart = i;
1052 }
1053 CalcFlags(&g_aCPInfo[i], szFlags);
1054 Stream1Printf(" %50s, /* U+%06x: %s*/\n", szFlags, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
1055 i++;
1056 }
1057 }
1058 Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
1059 "};\n\n\n");
1060 Stream1Printf("\n");
1061 return Stream2Flush();
1062}
1063
1064
1065/**
1066 * Prints the upper case tables.
1067 */
1068static int PrintUpper(void)
1069{
1070 Stream2Init();
1071 Stream2Printf("const RTUNICASERANGE g_aRTUniUpperRanges[] =\n"
1072 "{\n");
1073 RTUNICP i = 0;
1074 int iStart = -1;
1075 while (i < RT_ELEMENTS(g_aCPInfo))
1076 {
1077 /* figure how far off the next chunk is */
1078 unsigned iSameCase = i;
1079 while ( g_aCPInfo[iSameCase].SimpleUpperCaseMapping == g_aCPInfo[iSameCase].CodePoint
1080 && iSameCase < RT_ELEMENTS(g_aCPInfo)
1081 && iSameCase >= 256)
1082 iSameCase++;
1083 if (iSameCase - i > 4096/sizeof(RTUNICP) || iSameCase == RT_ELEMENTS(g_aCPInfo))
1084 {
1085 if (iStart >= 0)
1086 {
1087 Stream1Printf("};\n\n");
1088 Stream2Printf(" { 0x%06x, 0x%06x, &g_afRTUniUpper0x%06x[0] },\n", iStart, i, iStart);
1089 iStart = -1;
1090 }
1091 i = iSameCase;
1092 }
1093 else
1094 {
1095 if (iStart < 0)
1096 {
1097 Stream1Printf("static const RTUNICP g_afRTUniUpper0x%06x[] = \n"
1098 "{\n", i);
1099 iStart = i;
1100 }
1101 Stream1Printf(" 0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleUpperCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
1102 i++;
1103 }
1104 }
1105 Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
1106 "};\n\n\n");
1107 Stream1Printf("\n");
1108 return Stream2Flush();
1109}
1110
1111
1112/**
1113 * Prints the lowercase tables.
1114 */
1115static int PrintLower(void)
1116{
1117 Stream2Init();
1118 Stream2Printf("const RTUNICASERANGE g_aRTUniLowerRanges[] =\n"
1119 "{\n");
1120 RTUNICP i = 0;
1121 int iStart = -1;
1122 while (i < RT_ELEMENTS(g_aCPInfo))
1123 {
1124 /* figure how far off the next chunk is */
1125 unsigned iSameCase = i;
1126 while ( g_aCPInfo[iSameCase].SimpleLowerCaseMapping == g_aCPInfo[iSameCase].CodePoint
1127 && iSameCase < RT_ELEMENTS(g_aCPInfo)
1128 && iSameCase >= 256)
1129 iSameCase++;
1130 if (iSameCase - i > 4096/sizeof(RTUNICP) || iSameCase == RT_ELEMENTS(g_aCPInfo))
1131 {
1132 if (iStart >= 0)
1133 {
1134 Stream1Printf("};\n\n");
1135 Stream2Printf(" { 0x%06x, 0x%06x, &g_afRTUniLower0x%06x[0] },\n", iStart, i, iStart);
1136 iStart = -1;
1137 }
1138 i = iSameCase;
1139 }
1140 else
1141 {
1142 if (iStart < 0)
1143 {
1144 Stream1Printf("static const RTUNICP g_afRTUniLower0x%06x[] = \n"
1145 "{\n", i);
1146 iStart = i;
1147 }
1148 Stream1Printf(" 0x%02x, /* U+%06x: %s*/\n",
1149 g_aCPInfo[i].SimpleLowerCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
1150 i++;
1151 }
1152 }
1153 Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
1154 "};\n\n\n");
1155 Stream1Printf("\n");
1156 return Stream2Flush();
1157}
1158
1159
1160int main(int argc, char **argv)
1161{
1162 /*
1163 * Parse args.
1164 */
1165 if (argc <= 1)
1166 {
1167 printf("usage: %s [-C|--dir <UCD-dir>] [UnicodeData.txt [DerivedCoreProperties.txt [PropList.txt] [DerivedNormalizationProps.txt]]]\n",
1168 argv[0]);
1169 return 1;
1170 }
1171
1172 const char *pszBaseDir = NULL;
1173 const char *pszUnicodeData = "UnicodeData.txt";
1174 const char *pszDerivedCoreProperties = "DerivedCoreProperties.txt";
1175 const char *pszPropList = "PropList.txt";
1176 const char *pszDerivedNormalizationProps = "DerivedNormalizationProps.txt";
1177 int iFile = 0;
1178 for (int argi = 1; argi < argc; argi++)
1179 {
1180 if (argv[argi][0] != '-')
1181 {
1182 switch (iFile++)
1183 {
1184 case 0: pszUnicodeData = argv[argi]; break;
1185 case 1: pszDerivedCoreProperties = argv[argi]; break;
1186 case 2: pszPropList = argv[argi]; break;
1187 case 3: pszDerivedNormalizationProps = argv[argi]; break;
1188 default:
1189 fprintf(stderr, "uniread: syntax error at '%s': too many filenames\n", argv[argi]);
1190 return 1;
1191 }
1192 }
1193 else if ( !strcmp(argv[argi], "--dir")
1194 || !strcmp(argv[argi], "-C"))
1195 {
1196 if (argi + 1 >= argc)
1197 {
1198 fprintf(stderr, "uniread: syntax error: '%s' is missing the directory name.\n", argv[argi]);
1199 return 1;
1200 }
1201 argi++;
1202 pszBaseDir = argv[argi];
1203 }
1204 else if ( !strcmp(argv[argi], "-q")
1205 || !strcmp(argv[argi], "--quiet"))
1206 g_fQuiet = true;
1207 else
1208 {
1209 fprintf(stderr, "uniread: syntax error at '%s': Unknown argument\n", argv[argi]);
1210 return 1;
1211 }
1212 }
1213
1214 /*
1215 * Read the data.
1216 */
1217 int rc = ReadUnicodeData(pszBaseDir, pszUnicodeData);
1218 if (rc)
1219 return rc;
1220 rc = GenerateExcludedData();
1221 if (rc)
1222 return rc;
1223 rc = ReadProperties(pszBaseDir, pszPropList);
1224 if (rc)
1225 return rc;
1226 rc = ReadProperties(pszBaseDir, pszDerivedCoreProperties);
1227 if (rc)
1228 return rc;
1229 rc = ReadProperties(pszBaseDir, pszDerivedNormalizationProps);
1230 if (rc)
1231 return rc;
1232
1233 /*
1234 * Print stuff.
1235 */
1236 rc = PrintHeader(argv[0]);
1237 if (rc)
1238 return rc;
1239 rc = PrintFlags();
1240 if (rc)
1241 return rc;
1242 rc = PrintUpper();
1243 if (rc)
1244 return rc;
1245 rc = PrintLower();
1246 if (rc)
1247 return rc;
1248
1249 /* done */
1250 fflush(stdout);
1251
1252 return rc;
1253}
1254
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette