Changeset 28876 in vbox for trunk/src/VBox/Runtime
- Timestamp:
- Apr 28, 2010 7:01:33 PM (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/string/uniread.cpp
r28800 r28876 30 30 #include <iprt/types.h> 31 31 #include <iprt/stdarg.h> 32 #include <iprt/ctype.h> 32 33 33 34 #include <stdio.h> … … 35 36 #include <stdlib.h> 36 37 38 39 /******************************************************************************* 40 * Global Variables * 41 *******************************************************************************/ 42 /** When set, no output is produced. Very useful when debugging ths code. */ 43 static bool g_fQuiet = false; 44 /** The file we're currently parsing. */ 45 static const char *g_pszCurFile; 46 /** The current line number. */ 47 static unsigned g_iLine; 48 49 50 /** 51 * Exit the program after printing a parse error. 52 * 53 * @param pszFormat The message. 54 * @param ... Format arguments. 55 */ 56 static void ParseError(const char *pszFormat, ...) 57 { 58 va_list va; 59 va_start(va, pszFormat); 60 fprintf(stderr, "parse error: %s:%u: ", g_pszCurFile, g_iLine); 61 vfprintf(stderr, pszFormat, va); 62 va_end(va); 63 exit(1); 64 } 37 65 38 66 /** … … 133 161 134 162 /** 163 * Splits a decomposition field. 164 * 165 * This may start with a type that is enclosed in angle brackets. 166 * 167 * @returns Pointer to the mapping values following the type. @a *ppsz if empty. 168 * @param ppszType Pointer to the type field pointer. On input the type 169 * field contains the combined type and mapping string. On 170 * output this should only contain the type, no angle 171 * brackets. If no type specified, it is replaced with an 172 * empty string (const). 173 */ 174 static char *SplitDecompField(char **ppszType) 175 { 176 /* Empty field? */ 177 char *psz = *ppszType; 178 if (!*psz) 179 return psz; 180 181 /* No type? */ 182 if (*psz != '<') 183 { 184 *ppszType = (char *)""; 185 return psz; 186 } 187 188 /* Split out the type. */ 189 *ppszType = ++psz; 190 psz = strchr(psz, '>'); 191 if (!psz) 192 { 193 ParseError("Bad Decomposition Type/Mappings\n"); 194 return *ppszType; 195 } 196 *psz++ = '\0'; 197 198 psz = StripLine(psz); 199 if (!*psz) 200 ParseError("Missing decomposition mappings\n"); 201 return psz; 202 } 203 204 /** 135 205 * Converts a code point field to a number. 136 206 * @returns Code point. … … 142 212 unsigned long ul = strtoul(psz, &pszEnd, 16); 143 213 if (pszEnd && *pszEnd) 144 fprintf(stderr, "warning:failed converting '%s' to a number!\n", psz);214 ParseError("failed converting '%s' to a number!\n", psz); 145 215 return (RTUNICP)ul; 146 216 } … … 179 249 if (pszEnd && *pszEnd) 180 250 { 181 fprintf(stderr, "warning:failed converting '%s' to a number!\n", psz);251 ParseError("failed converting '%s' to a number!\n", psz); 182 252 return ~(RTUNICP)0; 183 253 } … … 185 255 else 186 256 { 187 fprintf(stderr, "warning:failed converting '%s' to a number!\n", psz);257 ParseError("failed converting '%s' to a number!\n", psz); 188 258 return ~(RTUNICP)0; 189 259 } … … 192 262 return (RTUNICP)ulStart; 193 263 264 } 265 266 /** 267 * For converting the decompisition mappings field and similar. 268 * 269 * @returns Mapping array or NULL if none. 270 * @param psz The string to convert. Can be empty. 271 * @param pcEntries Where to store the number of entries. 272 * @param cMax The max number of entries. 273 */ 274 static PRTUNICP ToMapping(char *psz, unsigned *pcEntries, unsigned cMax) 275 { 276 PRTUNICP paCps = NULL; 277 unsigned cAlloc = 0; 278 unsigned i = 0; 279 280 /* Convert the code points. */ 281 while (psz) 282 { 283 /* skip leading spaces */ 284 while (RT_C_IS_BLANK(*psz)) 285 psz++; 286 287 /* the end? */ 288 if (!*psz) 289 break; 290 291 /* room left? */ 292 if (i >= cMax) 293 { 294 ParseError("Too many mappings.\n"); 295 break; 296 } 297 if (i >= cAlloc) 298 { 299 cAlloc += 4; 300 paCps = (PRTUNICP)realloc(paCps, cAlloc * sizeof(paCps[0])); 301 if (!paCps) 302 { 303 fprintf(stderr, "out of memory (%u)\n", (unsigned)(cAlloc * sizeof(paCps[0]))); 304 exit(1); 305 } 306 } 307 308 /* Find the end. */ 309 char *pszThis = psz; 310 while (RT_C_IS_XDIGIT(*psz)) 311 psz++; 312 if (*psz && !RT_C_IS_BLANK(*psz)) 313 ParseError("Malformed mappings.\n"); 314 if (*psz) 315 *psz++ = '\0'; 316 317 /* Convert to number and add it. */ 318 paCps[i++] = ToNum(pszThis); 319 } 320 321 *pcEntries = i; 322 return paCps; 194 323 } 195 324 … … 223 352 RTUNICP SimpleLowerCaseMapping; 224 353 RTUNICP SimpleTitleCaseMapping; 354 unsigned CanonicalCombiningClass; 355 const char *pszDecompositionType; 356 unsigned cDecompositionMapping; 357 PRTUNICP paDecompositionMapping; 225 358 const char *pszName; 226 359 /** Set if this is an unused entry */ … … 230 363 unsigned fASCIIHexDigit : 1; 231 364 unsigned fBidiControl : 1; 365 unsigned fCaseIgnorable : 1; 366 unsigned fCased : 1; 367 unsigned fChangesWhenCasefolded : 1; 368 unsigned fChangesWhenCasemapped : 1; 369 unsigned fChangesWhenLowercased : 1; 370 unsigned fChangesWhenTitlecased : 1; 371 unsigned fChangesWhenUppercased : 1; 232 372 unsigned fDash : 1; 233 373 unsigned fDefaultIgnorableCodePoint : 1; … … 272 412 unsigned fXIDStart : 1; 273 413 274 /* unprocess stuff, so far. */ 414 /** @name DerivedNormalizationProps.txt 415 * @{ */ 416 unsigned fFullCompositionExclusion : 1; 417 unsigned fInvNFC_QC : 2; /**< If 1 (NFC_QC == N) then code point 100% sure not part of NFC string. */ 418 unsigned fInvNFD_QC : 2; /**< If 1 (NFD_QC == N) then code point 100% sure not part of NFD string. */ 419 unsigned fInvNFKC_QC : 2; 420 unsigned fInvNFKD_QC : 2; 421 unsigned fExpandsOnNFC : 1; 422 unsigned fExpandsOnNFD : 1; 423 unsigned fExpandsOnNFKC : 1; 424 unsigned fExpandsOnNFKD : 1; 425 /** @} */ 426 427 /* unprocessed stuff, so far. */ 275 428 const char *pszGeneralCategory; 276 const char *pszCanonicalCombiningClass;277 429 const char *pszBidiClass; 278 const char *pszDecompositionType;279 const char *pszDecompositionMapping;280 430 const char *pszNumericType; 281 const char *pszNumericValue; 431 const char *pszNumericValueD; 432 const char *pszNumericValueN; 282 433 const char *pszBidiMirrored; 283 434 const char *pszUnicode1Name; 284 435 const char *pszISOComment; 285 } g_aCPInfo[0x f0000];436 } g_aCPInfo[0x110000]; 286 437 287 438 … … 294 445 g_aCPInfo[i].CodePoint = i; 295 446 g_aCPInfo[i].fNullEntry = 1; 296 g_aCPInfo[i].pszName = "";297 447 g_aCPInfo[i].SimpleUpperCaseMapping = i; 298 448 g_aCPInfo[i].SimpleLowerCaseMapping = i; 299 449 g_aCPInfo[i].SimpleTitleCaseMapping = i; 450 g_aCPInfo[i].pszDecompositionType = ""; 451 g_aCPInfo[i].cDecompositionMapping = 0; 452 g_aCPInfo[i].paDecompositionMapping = NULL; 453 g_aCPInfo[i].pszName = ""; 300 454 g_aCPInfo[i].pszGeneralCategory = ""; 301 g_aCPInfo[i].pszCanonicalCombiningClass = "";302 455 g_aCPInfo[i].pszBidiClass = ""; 303 g_aCPInfo[i].pszDecompositionType = "";304 g_aCPInfo[i].pszDecompositionMapping = "";305 456 g_aCPInfo[i].pszNumericType = ""; 306 g_aCPInfo[i].pszNumericValue = ""; 457 g_aCPInfo[i].pszNumericValueD = ""; 458 g_aCPInfo[i].pszNumericValueN = ""; 307 459 g_aCPInfo[i].pszBidiMirrored = ""; 308 460 g_aCPInfo[i].pszUnicode1Name = ""; … … 312 464 313 465 /** 466 * Open a file for reading, optionally with a base path prefixed. 467 * 468 * @returns file stream on success, NULL w/ complaint on failure. 469 * @param pszBasePath The base path, can be NULL. 470 * @param pszFilename The name of the file to open. 471 */ 472 static FILE *OpenFile(const char *pszBasePath, const char *pszFilename) 473 { 474 FILE *pFile; 475 if ( !pszBasePath 476 || *pszFilename == '/' 477 #if defined(_MSC_VER) || defined(__OS2__) 478 || *pszFilename == '\\' 479 || (*pszFilename && pszFilename[1] == ':') 480 #endif 481 ) 482 { 483 pFile = fopen(pszFilename, "r"); 484 if (!pFile) 485 fprintf(stderr, "uniread: failed to open '%s' for reading\n", pszFilename); 486 } 487 else 488 { 489 size_t cchBasePath = strlen(pszBasePath); 490 size_t cchFilename = strlen(pszFilename); 491 char *pszFullName = (char *)malloc(cchBasePath + 1 + cchFilename + 1); 492 if (!pszFullName) 493 { 494 fprintf(stderr, "uniread: failed to allocate %d bytes\n", (int)(cchBasePath + 1 + cchFilename + 1)); 495 return NULL; 496 } 497 498 memcpy(pszFullName, pszBasePath, cchBasePath); 499 pszFullName[cchBasePath] = '/'; 500 memcpy(&pszFullName[cchBasePath + 1], pszFilename, cchFilename + 1); 501 502 pFile = fopen(pszFullName, "r"); 503 if (!pFile) 504 fprintf(stderr, "uniread: failed to open '%s' for reading\n", pszFullName); 505 free(pszFullName); 506 } 507 g_pszCurFile = pszFilename; 508 g_iLine = 0; 509 return pFile; 510 } 511 512 513 /** 514 * Wrapper around fgets that keep track of the line number. 515 * 516 * @returns See fgets. 517 * @param pszBuf The buffer. See fgets for output definition. 518 * @param cbBuf The buffer size. 519 * @param pFile The file to read from. 520 */ 521 static char *GetLineFromFile(char *pszBuf, int cbBuf, FILE *pFile) 522 { 523 g_iLine++; 524 return fgets(pszBuf, cbBuf, pFile); 525 } 526 527 528 /** 529 * Closes a file opened by OpenFile 530 * 531 * @param pFile The file to close. 532 */ 533 static void CloseFile(FILE *pFile) 534 { 535 g_pszCurFile = NULL; 536 g_iLine = 0; 537 fclose(pFile); 538 } 539 540 541 /** 314 542 * Read the UnicodeData.txt file. 315 543 * @returns 0 on success. 316 544 * @returns !0 on failure. 317 * @param pszFilename The name of the file. 318 */ 319 static int ReadUnicodeData(const char *pszFilename) 545 * @param pszBasePath The base path, can be NULL. 546 * @param pszFilename The name of the file. 547 */ 548 static int ReadUnicodeData(const char *pszBasePath, const char *pszFilename) 320 549 { 321 550 /* 322 551 * Open input. 323 552 */ 324 FILE *pFile = fopen(pszFilename, "r");553 FILE *pFile = OpenFile(pszBasePath, pszFilename); 325 554 if (!pFile) 326 {327 printf("uniread: failed to open '%s' for reading\n", pszFilename);328 555 return 1; 329 }330 556 331 557 /* … … 334 560 char szLine[4096]; 335 561 RTUNICP i = 0; 336 while ( fgets(szLine, sizeof(szLine), pFile) != NULL)562 while (GetLineFromFile(szLine, sizeof(szLine), pFile) != NULL) 337 563 { 338 564 if (IsCommentOrBlankLine(szLine)) … … 346 572 char *pszBidiClass = NextField(&pszCurField); /* 4 */ 347 573 char *pszDecompositionType = NextField(&pszCurField); /* 5 */ 348 char *pszDecompositionMapping = NextField(&pszCurField); /* 6 */ 349 char *pszNumericType = NextField(&pszCurField); /* 7 */ 350 char *pszNumericValue = NextField(&pszCurField); /* 8 */ 574 char *pszDecompositionMapping = SplitDecompField(&pszDecompositionType); 575 char *pszNumericType = NextField(&pszCurField); /* 6 */ 576 char *pszNumericValueD = NextField(&pszCurField); /* 7 */ 577 char *pszNumericValueN = NextField(&pszCurField); /* 8 */ 351 578 char *pszBidiMirrored = NextField(&pszCurField); /* 9 */ 352 579 char *pszUnicode1Name = NextField(&pszCurField); /* 10 */ … … 358 585 RTUNICP CodePoint = ToNum(pszCodePoint); 359 586 if (CodePoint >= RT_ELEMENTS(g_aCPInfo)) 587 { 588 ParseError("U+05X is out of range\n", CodePoint); 360 589 continue; 590 } 361 591 362 592 /* catchup? */ … … 365 595 if (i != CodePoint) 366 596 { 367 fprintf(stderr, "unitest: error:i=%d CodePoint=%u\n", i, CodePoint);368 fclose(pFile);597 ParseError("i=%d CodePoint=%u\n", i, CodePoint); 598 CloseFile(pFile); 369 599 return 1; 370 600 } … … 377 607 g_aCPInfo[i].SimpleLowerCaseMapping = ToNumDefault(pszSimpleLowerCaseMapping, CodePoint); 378 608 g_aCPInfo[i].SimpleTitleCaseMapping = ToNumDefault(pszSimpleTitleCaseMapping, CodePoint); 609 g_aCPInfo[i].CanonicalCombiningClass = ToNum(pszCanonicalCombiningClass); 610 g_aCPInfo[i].pszDecompositionType = DupStr(pszDecompositionType); 611 g_aCPInfo[i].paDecompositionMapping = ToMapping(pszDecompositionMapping, &g_aCPInfo[i].cDecompositionMapping, 20); 379 612 g_aCPInfo[i].pszGeneralCategory = DupStr(pszGeneralCategory); 380 g_aCPInfo[i].pszCanonicalCombiningClass = DupStr(pszCanonicalCombiningClass);381 613 g_aCPInfo[i].pszBidiClass = DupStr(pszBidiClass); 382 g_aCPInfo[i].pszDecompositionType = DupStr(pszDecompositionType);383 g_aCPInfo[i].pszDecompositionMapping = DupStr(pszDecompositionMapping);384 614 g_aCPInfo[i].pszNumericType = DupStr(pszNumericType); 385 g_aCPInfo[i].pszNumericValue = DupStr(pszNumericValue); 615 g_aCPInfo[i].pszNumericValueD = DupStr(pszNumericValueD); 616 g_aCPInfo[i].pszNumericValueN = DupStr(pszNumericValueN); 386 617 g_aCPInfo[i].pszBidiMirrored = DupStr(pszBidiMirrored); 387 618 g_aCPInfo[i].pszUnicode1Name = DupStr(pszUnicode1Name); … … 389 620 i++; 390 621 } 622 391 623 /* catchup? */ 392 624 while (i < RT_ELEMENTS(g_aCPInfo)) 393 625 NullEntry(i++); 394 fclose(pFile);626 CloseFile(pFile); 395 627 396 628 return 0; 629 } 630 631 632 /** 633 * Generates excluded data. 634 * 635 * @returns 0 on success, exit code on failure. 636 */ 637 static int GenerateExcludedData(void) 638 { 639 /* 640 * Hangul Syllables U+AC00 to U+D7A3. 641 */ 642 for (RTUNICP i = 0xac00; i <= 0xd7a3; i++) 643 { 644 g_aCPInfo[i].fNullEntry = 0; 645 g_aCPInfo[i].fInvNFD_QC = 1; 646 /** @todo generate the decomposition: http://unicode.org/reports/tr15/#Hangul 647 * */ 648 } 649 650 /** @todo 651 * CJK Ideographs Extension A (U+3400 - U+4DB5) 652 * CJK Ideographs (U+4E00 - U+9FA5) 653 * CJK Ideograph Extension B (U+20000 - U+2A6D6) 654 * CJK Ideograph Extension C (U+2A700 - U+2B734) 655 */ 656 657 return 0; 658 } 659 660 661 662 /** 663 * Worker for ApplyProperty that handles a yes, no, maybe property value. 664 * 665 * @returns 0 (NO), 1 (YES), 2 (MAYBE). 666 * @param ppszNextField The field cursor, input and output. 667 */ 668 static int YesNoMaybePropertyValue(char **ppszNextField) 669 { 670 if (!**ppszNextField) 671 { 672 ParseError("Missing Y/N/M field\n"); 673 return 0; 674 } 675 char *psz = NextField(ppszNextField); 676 if (!strcmp(psz, "N")) 677 return 0; 678 if (!strcmp(psz, "Y")) 679 return 1; 680 if (!strcmp(psz, "M")) 681 return 2; 682 ParseError("Unexpected Y/N/M value: '%s'\n", psz); 683 return 0; 684 } 685 686 687 /** 688 * Inverted version of YesNoMaybePropertyValue 689 * 690 * @returns 1 (NO), 0 (YES), 2 (MAYBE). 691 * @param ppszNextField The field cursor, input and output. 692 */ 693 static int YesNoMaybePropertyValueInv(char **ppszNextField) 694 { 695 unsigned rc = YesNoMaybePropertyValue(ppszNextField); 696 switch (rc) 697 { 698 case 0: return 1; 699 case 1: return 0; 700 default: return rc; 701 } 397 702 } 398 703 … … 404 709 * @param pszProperty The property name. 405 710 */ 406 static void ApplyProperty(RTUNICP StartCP, const char *pszProperty )711 static void ApplyProperty(RTUNICP StartCP, const char *pszProperty, char *pszNextField) 407 712 { 408 713 if (StartCP >= RT_ELEMENTS(g_aCPInfo)) 714 { 715 ParseError("U+%06X is out of the g_aCPInfo range.\n", StartCP); 409 716 return; 717 } 410 718 struct CPINFO *pCPInfo = &g_aCPInfo[StartCP]; 411 719 /* string switch */ 412 if (!strcmp(pszProperty, "ASCII_Hex_Digit")) pCPInfo->fASCIIHexDigit = 1; 720 if (!strcmp(pszProperty, "ASCII_Hex_Digit")) pCPInfo->fASCIIHexDigit = 1; 721 else if (!strcmp(pszProperty, "Alphabetic")) pCPInfo->fAlphabetic = 1; 413 722 else if (!strcmp(pszProperty, "Bidi_Control")) pCPInfo->fBidiControl = 1; 723 else if (!strcmp(pszProperty, "Case_Ignorable")) pCPInfo->fCaseIgnorable = 1; 724 else if (!strcmp(pszProperty, "Cased")) pCPInfo->fCased = 1; 725 else if (!strcmp(pszProperty, "Changes_When_Casefolded")) pCPInfo->fChangesWhenCasefolded = 1; 726 else if (!strcmp(pszProperty, "Changes_When_Casemapped")) pCPInfo->fChangesWhenCasemapped = 1; 727 else if (!strcmp(pszProperty, "Changes_When_Lowercased")) pCPInfo->fChangesWhenLowercased = 1; 728 else if (!strcmp(pszProperty, "Changes_When_Titlecased")) pCPInfo->fChangesWhenTitlecased = 1; 729 else if (!strcmp(pszProperty, "Changes_When_Uppercased")) pCPInfo->fChangesWhenUppercased = 1; 414 730 else if (!strcmp(pszProperty, "Dash")) pCPInfo->fDash = 1; 731 else if (!strcmp(pszProperty, "Default_Ignorable_Code_Point")) pCPInfo->fDefaultIgnorableCodePoint = 1; 415 732 else if (!strcmp(pszProperty, "Deprecated")) pCPInfo->fDeprecated = 1; 416 733 else if (!strcmp(pszProperty, "Diacritic")) pCPInfo->fDiacritic = 1; 417 734 else if (!strcmp(pszProperty, "Extender")) pCPInfo->fExtender = 1; 735 else if (!strcmp(pszProperty, "Grapheme_Base")) pCPInfo->fGraphemeBase = 1; 736 else if (!strcmp(pszProperty, "Grapheme_Extend")) pCPInfo->fGraphemeExtend = 1; 418 737 else if (!strcmp(pszProperty, "Grapheme_Link")) pCPInfo->fGraphemeLink = 1; 419 738 else if (!strcmp(pszProperty, "Hex_Digit")) pCPInfo->fHexDigit = 1; 420 739 else if (!strcmp(pszProperty, "Hyphen")) pCPInfo->fHyphen = 1; 740 else if (!strcmp(pszProperty, "ID_Continue")) pCPInfo->fIDContinue = 1; 741 else if (!strcmp(pszProperty, "ID_Start")) pCPInfo->fIDStart = 1; 421 742 else if (!strcmp(pszProperty, "Ideographic")) pCPInfo->fIdeographic = 1; 422 743 else if (!strcmp(pszProperty, "IDS_Binary_Operator")) pCPInfo->fIDSBinaryOperator = 1; … … 424 745 else if (!strcmp(pszProperty, "Join_Control")) pCPInfo->fJoinControl = 1; 425 746 else if (!strcmp(pszProperty, "Logical_Order_Exception")) pCPInfo->fLogicalOrderException = 1; 747 else if (!strcmp(pszProperty, "Lowercase")) pCPInfo->fLowercase = 1; 748 else if (!strcmp(pszProperty, "Math")) pCPInfo->fMath = 1; 426 749 else if (!strcmp(pszProperty, "Noncharacter_Code_Point")) pCPInfo->fNoncharacterCodePoint = 1; 427 750 else if (!strcmp(pszProperty, "Other_Alphabetic")) pCPInfo->fOtherAlphabetic = 1; … … 433 756 else if (!strcmp(pszProperty, "Other_Math")) pCPInfo->fOtherMath = 1; 434 757 else if (!strcmp(pszProperty, "Other_Uppercase")) pCPInfo->fOtherUppercase = 1; 435 else if (!strcmp(pszProperty, "Alphabetic")) pCPInfo->fAlphabetic = 1;436 else if (!strcmp(pszProperty, "Default_Ignorable_Code_Point")) pCPInfo->fDefaultIgnorableCodePoint = 1;437 else if (!strcmp(pszProperty, "Grapheme_Base")) pCPInfo->fGraphemeBase = 1;438 else if (!strcmp(pszProperty, "Grapheme_Extend")) pCPInfo->fGraphemeExtend = 1;439 else if (!strcmp(pszProperty, "ID_Continue")) pCPInfo->fIDContinue = 1;440 else if (!strcmp(pszProperty, "ID_Start")) pCPInfo->fIDStart = 1;441 else if (!strcmp(pszProperty, "XID_Continue")) pCPInfo->fXIDContinue = 1;442 else if (!strcmp(pszProperty, "XID_Start")) pCPInfo->fXIDStart = 1;443 else if (!strcmp(pszProperty, "Lowercase")) pCPInfo->fLowercase = 1;444 else if (!strcmp(pszProperty, "Math")) pCPInfo->fMath = 1;445 else if (!strcmp(pszProperty, "Uppercase")) pCPInfo->fUppercase = 1;446 758 else if (!strcmp(pszProperty, "Pattern_Syntax")) pCPInfo->fPatternSyntax = 1; 447 759 else if (!strcmp(pszProperty, "Pattern_White_Space")) pCPInfo->fPatternWhiteSpace = 1; … … 452 764 else if (!strcmp(pszProperty, "Terminal_Punctuation")) pCPInfo->fTerminalPunctuation = 1; 453 765 else if (!strcmp(pszProperty, "Unified_Ideograph")) pCPInfo->fUnifiedIdeograph = 1; 766 else if (!strcmp(pszProperty, "Uppercase")) pCPInfo->fUppercase = 1; 454 767 else if (!strcmp(pszProperty, "Variation_Selector")) pCPInfo->fVariationSelector = 1; 455 768 else if (!strcmp(pszProperty, "White_Space")) pCPInfo->fWhiteSpace = 1; 769 else if (!strcmp(pszProperty, "XID_Continue")) pCPInfo->fXIDContinue = 1; 770 else if (!strcmp(pszProperty, "XID_Start")) pCPInfo->fXIDStart = 1; 771 /* DerivedNormalizationProps: */ 772 else if (!strcmp(pszProperty, "FC_NFKC")) return; /* ignored */ 773 else if (!strcmp(pszProperty, "Full_Composition_Exclusion")) pCPInfo->fFullCompositionExclusion = 1; 774 else if (!strcmp(pszProperty, "NFC_QC")) pCPInfo->fInvNFC_QC = YesNoMaybePropertyValueInv(&pszNextField); 775 else if (!strcmp(pszProperty, "NFD_QC")) pCPInfo->fInvNFD_QC = YesNoMaybePropertyValueInv(&pszNextField); 776 else if (!strcmp(pszProperty, "NFKC_QC")) pCPInfo->fInvNFKC_QC = YesNoMaybePropertyValueInv(&pszNextField); 777 else if (!strcmp(pszProperty, "NFKD_QC")) pCPInfo->fInvNFKD_QC = YesNoMaybePropertyValueInv(&pszNextField); 778 else if (!strcmp(pszProperty, "Expands_On_NFC")) pCPInfo->fExpandsOnNFC = 1; 779 else if (!strcmp(pszProperty, "Expands_On_NFD")) pCPInfo->fExpandsOnNFD = 1; 780 else if (!strcmp(pszProperty, "Expands_On_NFKC")) pCPInfo->fExpandsOnNFKC = 1; 781 else if (!strcmp(pszProperty, "Expands_On_NFKD")) pCPInfo->fExpandsOnNFKD = 1; 782 else if (!strcmp(pszProperty, "NFKC_CF")) return; /*ignore */ 783 else if (!strcmp(pszProperty, "Changes_When_NFKC_Casefolded")) return; /*ignore */ 456 784 else 457 fprintf(stderr, "uniread: Unknown property '%s'\n", pszProperty); 785 { 786 ParseError("Unknown property '%s'\n", pszProperty); 787 return; 788 } 789 790 if (pszNextField && *pszNextField) 791 ParseError("Unexpected next field: '%s'\n", pszNextField); 458 792 } 459 793 … … 467 801 * @returns 0 on success. 468 802 * @returns !0 on failure. 803 * @param pszBasePath The base path, can be NULL. 469 804 * @param pszFilename The name of the file. 470 805 */ 471 static int ReadProperties(const char *psz Filename)806 static int ReadProperties(const char *pszBasePath, const char *pszFilename) 472 807 { 473 808 /* 474 809 * Open input. 475 810 */ 476 FILE *pFile = fopen(pszFilename, "r");811 FILE *pFile = OpenFile(pszBasePath, pszFilename); 477 812 if (!pFile) 478 {479 printf("uniread: failed to open '%s' for reading\n", pszFilename);480 813 return 1; 481 }482 814 483 815 /* … … 485 817 */ 486 818 char szLine[4096]; 487 while ( fgets(szLine, sizeof(szLine), pFile) != NULL)819 while (GetLineFromFile(szLine, sizeof(szLine), pFile) != NULL) 488 820 { 489 821 if (IsCommentOrBlankLine(szLine)) 490 822 continue; 491 823 char *pszCurField; 492 char *pszRange = FirstField(&pszCurField, StripLine(szLine));824 char *pszRange = FirstField(&pszCurField, StripLine(szLine)); 493 825 char *pszProperty = NextField(&pszCurField); 494 826 if (!*pszProperty) 827 { 828 ParseError("no property field.\n"); 495 829 continue; 830 } 496 831 497 832 RTUNICP LastCP; … … 501 836 502 837 while (StartCP <= LastCP) 503 ApplyProperty(StartCP++, pszProperty );504 } 505 506 fclose(pFile);838 ApplyProperty(StartCP++, pszProperty, pszCurField); 839 } 840 841 CloseFile(pFile); 507 842 508 843 return 0; … … 547 882 if (pInfo->fLowercase || pInfo->fOtherLowercase) 548 883 AppendFlag(pszFlags, "RTUNI_LOWER"); 549 //if (pInfo->fNumeric) 550 // AppendFlag(pszFlags, "RTUNI_NUMERIC"); 884 //if (pInfo->???) 885 // AppendFlag(pszFlags, "RTUNI_BSPACE"); 886 if (pInfo->fInvNFD_QC != 0 || pInfo->fInvNFC_QC != 0) 887 { 888 AppendFlag(pszFlags, "RTUNI_QC_NFX"); 889 if (!pInfo->paDecompositionMapping && pInfo->fInvNFD_QC) 890 fprintf(stderr, "uniread: U+%05X is QC_NFD but has no mappings.\n", pInfo->CodePoint); 891 else if (*pInfo->pszDecompositionType && pInfo->fInvNFD_QC) 892 fprintf(stderr, "uniread: U+%05X is QC_NFD but has no canonical mappings.\n", pInfo->CodePoint); 893 } 894 else if (pInfo->paDecompositionMapping && !*pInfo->pszDecompositionType) 895 fprintf(stderr, "uniread: U+%05X is not QC_NFX but has canonical mappings.\n", pInfo->CodePoint); 896 551 897 if (!*pszFlags) 552 898 { … … 558 904 } 559 905 906 907 /** 908 * printf wrapper for the primary output stream. 909 * 910 * @returns See vfprintf. 911 * @param pszFormat The vfprintf format string. 912 * @param ... The format arguments. 913 */ 914 static int Stream1Printf(const char *pszFormat, ...) 915 { 916 int cch; 917 va_list va; 918 va_start(va, pszFormat); 919 if (!g_fQuiet) 920 cch = vfprintf(stdout, pszFormat, va); 921 else 922 cch = strlen(pszFormat); 923 va_end(va); 924 return cch; 925 } 926 927 560 928 /** the data store for stream two. */ 561 929 static char g_szStream2[10240]; 562 static unsigned g_offStream2 = 0;930 static unsigned volatile g_offStream2 = 0; 563 931 564 932 /** … … 576 944 static int Stream2Flush(void) 577 945 { 578 fwrite(g_szStream2, 1, g_offStream2, stdout); 946 g_szStream2[g_offStream2] = '\0'; 947 Stream1Printf("%s", g_szStream2); 948 Stream2Init(); 579 949 return 0; 580 950 } … … 585 955 static int Stream2Printf(const char *pszFormat, ...) 586 956 { 957 unsigned offStream2 = g_offStream2; 587 958 va_list va; 588 959 va_start(va, pszFormat); 589 int cch = vsprintf(&g_szStream2[ g_offStream2], pszFormat, va);960 int cch = vsprintf(&g_szStream2[offStream2], pszFormat, va); 590 961 va_end(va); 591 g_offStream2 += cch;592 if ( g_offStream2 >= sizeof(g_szStream2))962 offStream2 += cch; 963 if (offStream2 >= sizeof(g_szStream2)) 593 964 { 594 965 fprintf(stderr, "error: stream2 overflow!\n"); 595 966 exit(1); 596 967 } 968 g_offStream2 = offStream2; 597 969 return cch; 598 970 } … … 604 976 int PrintHeader(const char *argv0) 605 977 { 606 /* 607 * Print file header. 608 */ 609 printf("/** @file\n" 610 " *\n" 611 " * IPRT - Unicode Tables\n" 612 " *\n" 613 " * Automatically Generated by %s (" __DATE__ " " __TIME__ ")\n" 614 " */\n\n" 615 "/*\n" 616 " * Copyright (C) 2006-2008 Sun Microsystems, Inc.\n" 617 " *\n" 618 " * This file is part of VirtualBox Open Source Edition (OSE), as\n" 619 " * available from http://www.virtualbox.org. This file is free software;\n" 620 " * you can redistribute it and/or modify it under the terms of the GNU\n" 621 " * General Public License as published by the Free Software Foundation,\n" 622 " * in version 2 as it comes in the \"COPYING\" file of the VirtualBox OSE\n" 623 " * distribution. VirtualBox OSE is distributed in the hope that it will\n" 624 " * be useful, but WITHOUT ANY WARRANTY of any kind.\n" 625 " *\n" 626 "\n" 627 "#include <iprt/uni.h>\n" 628 "\n", 629 argv0); 978 Stream1Printf("/** @file\n" 979 " *\n" 980 " * IPRT - Unicode Tables.\n" 981 " *\n" 982 " * Automatically Generated by %s (" __DATE__ " " __TIME__ ")\n" 983 " */\n" 984 "\n" 985 "/*\n" 986 " * Copyright (C) 2006-2010 Oracle Corporation\n" 987 " *\n" 988 " * This file is part of VirtualBox Open Source Edition (OSE), as\n" 989 " * available from http://www.virtualbox.org. This file is free software;\n" 990 " * you can redistribute it and/or modify it under the terms of the GNU\n" 991 " * General Public License (GPL) as published by the Free Software\n" 992 " * Foundation, in version 2 as it comes in the \"COPYING\" file of the\n" 993 " * VirtualBox OSE distribution. VirtualBox OSE is distributed in the\n" 994 " * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.\n" 995 " *\n" 996 " * The contents of this file may alternatively be used under the terms\n" 997 " * of the Common Development and Distribution License Version 1.0\n" 998 " * (CDDL) only, as it comes in the \"COPYING.CDDL\" file of the\n" 999 " * VirtualBox OSE distribution, in which case the provisions of the\n" 1000 " * CDDL are applicable instead of those of the GPL.\n" 1001 " *\n" 1002 " * You may elect to license modified versions of this file under the\n" 1003 " * terms and conditions of either the GPL or the CDDL or both.\n" 1004 " */\n" 1005 "\n" 1006 "#include <iprt/uni.h>\n" 1007 "\n", 1008 argv0); 630 1009 return 0; 631 1010 } … … 650 1029 char szFlags[256]; 651 1030 unsigned iNonNull = i; 652 while ( (g_aCPInfo[iNonNull].fNullEntry || !CalcFlags(&g_aCPInfo[iNonNull], szFlags))653 && iNonNull < RT_ELEMENTS(g_aCPInfo)654 && iNonNull >= 256)1031 while ( iNonNull < RT_ELEMENTS(g_aCPInfo) 1032 && iNonNull >= 256 1033 && (g_aCPInfo[iNonNull].fNullEntry || !CalcFlags(&g_aCPInfo[iNonNull], szFlags)) ) 655 1034 iNonNull++; 656 1035 if (iNonNull - i > 4096 || iNonNull == RT_ELEMENTS(g_aCPInfo)) … … 658 1037 if (iStart >= 0) 659 1038 { 660 printf("};\n\n");1039 Stream1Printf("};\n\n"); 661 1040 Stream2Printf(" { 0x%06x, 0x%06x, &g_afRTUniFlags0x%06x[0] },\n", iStart, i, iStart); 662 1041 iStart = -1; … … 668 1047 if (iStart < 0) 669 1048 { 670 printf("static const uint8_t g_afRTUniFlags0x%06x[] = \n"671 "{\n", i);1049 Stream1Printf("static const uint8_t g_afRTUniFlags0x%06x[] = \n" 1050 "{\n", i); 672 1051 iStart = i; 673 1052 } 674 1053 CalcFlags(&g_aCPInfo[i], szFlags); 675 printf(" %50s, /* U+%06x: %s*/\n", szFlags, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);1054 Stream1Printf(" %50s, /* U+%06x: %s*/\n", szFlags, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName); 676 1055 i++; 677 1056 } … … 679 1058 Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n" 680 1059 "};\n\n\n"); 681 printf("\n");1060 Stream1Printf("\n"); 682 1061 return Stream2Flush(); 683 1062 } … … 706 1085 if (iStart >= 0) 707 1086 { 708 printf("};\n\n");1087 Stream1Printf("};\n\n"); 709 1088 Stream2Printf(" { 0x%06x, 0x%06x, &g_afRTUniUpper0x%06x[0] },\n", iStart, i, iStart); 710 1089 iStart = -1; … … 716 1095 if (iStart < 0) 717 1096 { 718 printf("static const RTUNICP g_afRTUniUpper0x%06x[] = \n"719 "{\n", i);1097 Stream1Printf("static const RTUNICP g_afRTUniUpper0x%06x[] = \n" 1098 "{\n", i); 720 1099 iStart = i; 721 1100 } 722 printf(" 0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleUpperCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);1101 Stream1Printf(" 0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleUpperCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName); 723 1102 i++; 724 1103 } … … 726 1105 Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n" 727 1106 "};\n\n\n"); 728 printf("\n");1107 Stream1Printf("\n"); 729 1108 return Stream2Flush(); 730 1109 } … … 753 1132 if (iStart >= 0) 754 1133 { 755 printf("};\n\n");1134 Stream1Printf("};\n\n"); 756 1135 Stream2Printf(" { 0x%06x, 0x%06x, &g_afRTUniLower0x%06x[0] },\n", iStart, i, iStart); 757 1136 iStart = -1; … … 763 1142 if (iStart < 0) 764 1143 { 765 printf("static const RTUNICP g_afRTUniLower0x%06x[] = \n"766 "{\n", i);1144 Stream1Printf("static const RTUNICP g_afRTUniLower0x%06x[] = \n" 1145 "{\n", i); 767 1146 iStart = i; 768 1147 } 769 printf(" 0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleLowerCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName); 1148 Stream1Printf(" 0x%02x, /* U+%06x: %s*/\n", 1149 g_aCPInfo[i].SimpleLowerCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName); 770 1150 i++; 771 1151 } … … 773 1153 Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n" 774 1154 "};\n\n\n"); 775 printf("\n");1155 Stream1Printf("\n"); 776 1156 return Stream2Flush(); 777 1157 } … … 785 1165 if (argc <= 1) 786 1166 { 787 printf("usage: %s [UnicodeData.txt [DerivedCoreProperties.txt [PropList.txt]]]\n", argv[0]); 1167 printf("usage: %s [-C|--dir <UCD-dir>] [UnicodeData.txt [DerivedCoreProperties.txt [PropList.txt] [DerivedNormalizationProps.txt]]]\n", 1168 argv[0]); 788 1169 return 1; 789 1170 } 790 1171 791 const char *pszUnicodeData = "UnicodeData.txt"; 792 const char *pszDerivedCoreProperties = "DerivedCoreProperties.txt"; 793 const char *pszPropList = "PropList.txt"; 1172 const char *pszBaseDir = NULL; 1173 const char *pszUnicodeData = "UnicodeData.txt"; 1174 const char *pszDerivedCoreProperties = "DerivedCoreProperties.txt"; 1175 const char *pszPropList = "PropList.txt"; 1176 const char *pszDerivedNormalizationProps = "DerivedNormalizationProps.txt"; 794 1177 int iFile = 0; 795 1178 for (int argi = 1; argi < argc; argi++) … … 799 1182 switch (iFile++) 800 1183 { 801 case 0: pszUnicodeData = argv[argi]; break; 802 case 1: pszDerivedCoreProperties = argv[argi]; break; 803 case 2: pszPropList = argv[argi]; break; 1184 case 0: pszUnicodeData = argv[argi]; break; 1185 case 1: pszDerivedCoreProperties = argv[argi]; break; 1186 case 2: pszPropList = argv[argi]; break; 1187 case 3: pszDerivedNormalizationProps = argv[argi]; break; 804 1188 default: 805 printf("uniread: syntax error at '%s': too many filenames\n", argv[argi]);1189 fprintf(stderr, "uniread: syntax error at '%s': too many filenames\n", argv[argi]); 806 1190 return 1; 807 1191 } 808 1192 } 1193 else if ( !strcmp(argv[argi], "--dir") 1194 || !strcmp(argv[argi], "-C")) 1195 { 1196 if (argi + 1 >= argc) 1197 { 1198 fprintf(stderr, "uniread: syntax error: '%s' is missing the directory name.\n", argv[argi]); 1199 return 1; 1200 } 1201 argi++; 1202 pszBaseDir = argv[argi]; 1203 } 1204 else if ( !strcmp(argv[argi], "-q") 1205 || !strcmp(argv[argi], "--quiet")) 1206 g_fQuiet = true; 809 1207 else 810 1208 { 811 printf("uniread: syntax error at '%s': Unknown argument\n", argv[argi]);1209 fprintf(stderr, "uniread: syntax error at '%s': Unknown argument\n", argv[argi]); 812 1210 return 1; 813 1211 } … … 817 1215 * Read the data. 818 1216 */ 819 int rc = ReadUnicodeData(psz UnicodeData);1217 int rc = ReadUnicodeData(pszBaseDir, pszUnicodeData); 820 1218 if (rc) 821 1219 return rc; 822 rc = ReadProperties(pszPropList);1220 rc = GenerateExcludedData(); 823 1221 if (rc) 824 1222 return rc; 825 rc = ReadProperties(pszDerivedCoreProperties); 1223 rc = ReadProperties(pszBaseDir, pszPropList); 1224 if (rc) 1225 return rc; 1226 rc = ReadProperties(pszBaseDir, pszDerivedCoreProperties); 1227 if (rc) 1228 return rc; 1229 rc = ReadProperties(pszBaseDir, pszDerivedNormalizationProps); 826 1230 if (rc) 827 1231 return rc;
Note:
See TracChangeset
for help on using the changeset viewer.