VirtualBox

Changeset 28876 in vbox for trunk/src/VBox/Runtime


Ignore:
Timestamp:
Apr 28, 2010 7:01:33 PM (15 years ago)
Author:
vboxsync
Message:

uniread.cpp: Updated to cope with version 5.2 of the spec. Preparing for extracting necessary decomposition and normalization information. Fixed Oracle (C).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/VBox/Runtime/common/string/uniread.cpp

    r28800 r28876  
    3030#include <iprt/types.h>
    3131#include <iprt/stdarg.h>
     32#include <iprt/ctype.h>
    3233
    3334#include <stdio.h>
     
    3536#include <stdlib.h>
    3637
     38
     39/*******************************************************************************
     40*   Global Variables                                                           *
     41*******************************************************************************/
     42/** When set, no output is produced.  Very useful when debugging this code. */
     43static bool g_fQuiet = false;
     44/** The file we're currently parsing. */
     45static const char *g_pszCurFile;
     46/** The current line number. */
     47static unsigned g_iLine;
     48
     49
     50/**
     51 * Exit the program after printing a parse error.
     52 *
     53 * @param   pszFormat           The message.
     54 * @param   ...                 Format arguments.
     55 */
     56static void ParseError(const char *pszFormat, ...)
     57{
     58    va_list va;
     59    va_start(va, pszFormat);
     60    fprintf(stderr, "parse error: %s:%u: ", g_pszCurFile, g_iLine);
     61    vfprintf(stderr, pszFormat, va);
     62    va_end(va);
     63    exit(1);
     64}
    3765
    3866/**
     
    133161
    134162/**
     163 * Splits a decomposition field.
     164 *
     165 * This may start with a type that is enclosed in angle brackets.
     166 *
     167 * @returns Pointer to the mapping values following the type. @a *ppszType if empty.
     168 * @param   ppszType    Pointer to the type field pointer.  On input the type
     169 *                      field contains the combined type and mapping string.  On
     170 *                      output this should only contain the type, no angle
     171 *                      brackets.  If no type specified, it is replaced with an
     172 *                      empty string (const).
     173 */
     174static char *SplitDecompField(char **ppszType)
     175{
     176    /* Empty field? */
     177    char *psz = *ppszType;
     178    if (!*psz)
     179        return psz;
     180
     181    /* No type? */
     182    if (*psz != '<')
     183    {
     184        *ppszType = (char *)"";
     185        return psz;
     186    }
     187
     188    /* Split out the type. */
     189    *ppszType = ++psz;
     190    psz = strchr(psz, '>');
     191    if (!psz)
     192    {
     193        ParseError("Bad Decomposition Type/Mappings\n");
     194        return *ppszType;
     195    }
     196    *psz++ = '\0';
     197
     198    psz = StripLine(psz);
     199    if (!*psz)
     200        ParseError("Missing decomposition mappings\n");
     201    return psz;
     202}
     203
     204/**
    135205 * Converts a code point field to a number.
    136206 * @returns Code point.
     
    142212    unsigned long ul = strtoul(psz, &pszEnd, 16);
    143213    if (pszEnd && *pszEnd)
    144         fprintf(stderr, "warning: failed converting '%s' to a number!\n", psz);
     214        ParseError("failed converting '%s' to a number!\n", psz);
    145215    return (RTUNICP)ul;
    146216}
     
    179249            if (pszEnd && *pszEnd)
    180250            {
    181                 fprintf(stderr, "warning: failed converting '%s' to a number!\n", psz);
     251                ParseError("failed converting '%s' to a number!\n", psz);
    182252                return ~(RTUNICP)0;
    183253            }
     
    185255        else
    186256        {
    187             fprintf(stderr, "warning: failed converting '%s' to a number!\n", psz);
     257            ParseError("failed converting '%s' to a number!\n", psz);
    188258            return ~(RTUNICP)0;
    189259        }
     
    192262    return (RTUNICP)ulStart;
    193263
     264}
     265
     266/**
     267 * For converting the decomposition mappings field and similar.
     268 *
     269 * @returns Mapping array or NULL if none.
     270 * @param   psz                 The string to convert.  Can be empty.
     271 * @param   pcEntries           Where to store the number of entries.
     272 * @param   cMax                The max number of entries.
     273 */
     274static PRTUNICP ToMapping(char *psz, unsigned *pcEntries, unsigned cMax)
     275{
     276    PRTUNICP paCps  = NULL;
     277    unsigned cAlloc = 0;
     278    unsigned i      = 0;
     279
     280    /* Convert the code points. */
     281    while (psz)
     282    {
     283        /* skip leading spaces */
     284        while (RT_C_IS_BLANK(*psz))
     285            psz++;
     286
     287        /* the end? */
     288        if (!*psz)
     289            break;
     290
     291        /* room left? */
     292        if (i >= cMax)
     293        {
     294            ParseError("Too many mappings.\n");
     295            break;
     296        }
     297        if (i >= cAlloc)
     298        {
     299            cAlloc += 4;
     300            paCps = (PRTUNICP)realloc(paCps, cAlloc * sizeof(paCps[0]));
     301            if (!paCps)
     302            {
     303                fprintf(stderr, "out of memory (%u)\n", (unsigned)(cAlloc * sizeof(paCps[0])));
     304                exit(1);
     305            }
     306        }
     307
     308        /* Find the end. */
     309        char *pszThis = psz;
     310        while (RT_C_IS_XDIGIT(*psz))
     311            psz++;
     312        if (*psz && !RT_C_IS_BLANK(*psz))
     313            ParseError("Malformed mappings.\n");
     314        if (*psz)
     315            *psz++ = '\0';
     316
     317        /* Convert to number and add it. */
     318        paCps[i++] = ToNum(pszThis);
     319    }
     320
     321    *pcEntries = i;
     322    return paCps;
    194323}
    195324
     
    223352    RTUNICP     SimpleLowerCaseMapping;
    224353    RTUNICP     SimpleTitleCaseMapping;
     354    unsigned    CanonicalCombiningClass;
     355    const char *pszDecompositionType;
     356    unsigned    cDecompositionMapping;
     357    PRTUNICP    paDecompositionMapping;
    225358    const char *pszName;
    226359    /** Set if this is an unused entry */
     
    230363    unsigned    fASCIIHexDigit : 1;
    231364    unsigned    fBidiControl : 1;
     365    unsigned    fCaseIgnorable : 1;
     366    unsigned    fCased : 1;
     367    unsigned    fChangesWhenCasefolded : 1;
     368    unsigned    fChangesWhenCasemapped : 1;
     369    unsigned    fChangesWhenLowercased : 1;
     370    unsigned    fChangesWhenTitlecased : 1;
     371    unsigned    fChangesWhenUppercased : 1;
    232372    unsigned    fDash : 1;
    233373    unsigned    fDefaultIgnorableCodePoint : 1;
     
    272412    unsigned    fXIDStart : 1;
    273413
    274     /* unprocess stuff, so far. */
     414    /** @name DerivedNormalizationProps.txt
     415     * @{ */
     416    unsigned    fFullCompositionExclusion : 1;
     417    unsigned    fInvNFC_QC : 2;     /**< If 1 (NFC_QC == N) then code point 100% sure not part of NFC string. */
     418    unsigned    fInvNFD_QC : 2;     /**< If 1 (NFD_QC == N) then code point 100% sure not part of NFD string. */
     419    unsigned    fInvNFKC_QC : 2;
     420    unsigned    fInvNFKD_QC : 2;
     421    unsigned    fExpandsOnNFC : 1;
     422    unsigned    fExpandsOnNFD : 1;
     423    unsigned    fExpandsOnNFKC : 1;
     424    unsigned    fExpandsOnNFKD : 1;
     425    /** @}  */
     426
     427    /* unprocessed stuff, so far. */
    275428    const char *pszGeneralCategory;
    276     const char *pszCanonicalCombiningClass;
    277429    const char *pszBidiClass;
    278     const char *pszDecompositionType;
    279     const char *pszDecompositionMapping;
    280430    const char *pszNumericType;
    281     const char *pszNumericValue;
     431    const char *pszNumericValueD;
     432    const char *pszNumericValueN;
    282433    const char *pszBidiMirrored;
    283434    const char *pszUnicode1Name;
    284435    const char *pszISOComment;
    285 } g_aCPInfo[0xf0000];
     436} g_aCPInfo[0x110000];
    286437
    287438
     
    294445    g_aCPInfo[i].CodePoint = i;
    295446    g_aCPInfo[i].fNullEntry = 1;
    296     g_aCPInfo[i].pszName = "";
    297447    g_aCPInfo[i].SimpleUpperCaseMapping = i;
    298448    g_aCPInfo[i].SimpleLowerCaseMapping = i;
    299449    g_aCPInfo[i].SimpleTitleCaseMapping = i;
     450    g_aCPInfo[i].pszDecompositionType = "";
     451    g_aCPInfo[i].cDecompositionMapping = 0;
     452    g_aCPInfo[i].paDecompositionMapping = NULL;
     453    g_aCPInfo[i].pszName = "";
    300454    g_aCPInfo[i].pszGeneralCategory = "";
    301     g_aCPInfo[i].pszCanonicalCombiningClass = "";
    302455    g_aCPInfo[i].pszBidiClass = "";
    303     g_aCPInfo[i].pszDecompositionType = "";
    304     g_aCPInfo[i].pszDecompositionMapping = "";
    305456    g_aCPInfo[i].pszNumericType = "";
    306     g_aCPInfo[i].pszNumericValue = "";
     457    g_aCPInfo[i].pszNumericValueD = "";
     458    g_aCPInfo[i].pszNumericValueN = "";
    307459    g_aCPInfo[i].pszBidiMirrored = "";
    308460    g_aCPInfo[i].pszUnicode1Name = "";
     
    312464
    313465/**
     466 * Open a file for reading, optionally with a base path prefixed.
     467 *
     468 * @returns file stream on success, NULL w/ complaint on failure.
     469 * @param   pszBasePath         The base path, can be NULL.
     470 * @param   pszFilename         The name of the file to open.
     471 */
     472static FILE *OpenFile(const char *pszBasePath, const char *pszFilename)
     473{
     474    FILE *pFile;
     475    if (   !pszBasePath
     476        || *pszFilename == '/'
     477#if defined(_MSC_VER) || defined(__OS2__)
     478        || *pszFilename == '\\'
     479        || (*pszFilename && pszFilename[1] == ':')
     480#endif
     481       )
     482    {
     483        pFile = fopen(pszFilename, "r");
     484        if (!pFile)
     485            fprintf(stderr, "uniread: failed to open '%s' for reading\n", pszFilename);
     486    }
     487    else
     488    {
     489        size_t cchBasePath = strlen(pszBasePath);
     490        size_t cchFilename = strlen(pszFilename);
     491        char  *pszFullName = (char *)malloc(cchBasePath + 1 + cchFilename + 1);
     492        if (!pszFullName)
     493        {
     494            fprintf(stderr, "uniread: failed to allocate %d bytes\n", (int)(cchBasePath + 1 + cchFilename + 1));
     495            return NULL;
     496        }
     497
     498        memcpy(pszFullName, pszBasePath, cchBasePath);
     499        pszFullName[cchBasePath] = '/';
     500        memcpy(&pszFullName[cchBasePath + 1], pszFilename, cchFilename + 1);
     501
     502        pFile = fopen(pszFullName, "r");
     503        if (!pFile)
     504            fprintf(stderr, "uniread: failed to open '%s' for reading\n", pszFullName);
     505        free(pszFullName);
     506    }
     507    g_pszCurFile = pszFilename;
     508    g_iLine      = 0;
     509    return pFile;
     510}
     511
     512
     513/**
     514 * Wrapper around fgets that keeps track of the line number.
     515 *
     516 * @returns See fgets.
     517 * @param   pszBuf              The buffer.  See fgets for output definition.
     518 * @param   cbBuf               The buffer size.
     519 * @param   pFile               The file to read from.
     520 */
     521static char *GetLineFromFile(char *pszBuf, int cbBuf, FILE *pFile)
     522{
     523    g_iLine++;
     524    return fgets(pszBuf, cbBuf, pFile);
     525}
     526
     527
     528/**
     529 * Closes a file opened by OpenFile
     530 *
     531 * @param   pFile               The file to close.
     532 */
     533static void CloseFile(FILE *pFile)
     534{
     535    g_pszCurFile = NULL;
     536    g_iLine = 0;
     537    fclose(pFile);
     538}
     539
     540
     541/**
    314542 * Read the UnicodeData.txt file.
    315543 * @returns 0 on success.
    316544 * @returns !0 on failure.
    317  * @param   pszFilename     The name of the file.
    318  */
    319 static int ReadUnicodeData(const char *pszFilename)
     545 * @param   pszBasePath         The base path, can be NULL.
     546 * @param   pszFilename         The name of the file.
     547 */
     548static int ReadUnicodeData(const char *pszBasePath, const char *pszFilename)
    320549{
    321550    /*
    322551     * Open input.
    323552     */
    324     FILE *pFile = fopen(pszFilename, "r");
     553    FILE *pFile = OpenFile(pszBasePath, pszFilename);
    325554    if (!pFile)
    326     {
    327         printf("uniread: failed to open '%s' for reading\n", pszFilename);
    328555        return 1;
    329     }
    330556
    331557    /*
     
    334560    char szLine[4096];
    335561    RTUNICP i = 0;
    336     while (fgets(szLine, sizeof(szLine), pFile) != NULL)
     562    while (GetLineFromFile(szLine, sizeof(szLine), pFile) != NULL)
    337563    {
    338564        if (IsCommentOrBlankLine(szLine))
     
    346572        char *pszBidiClass = NextField(&pszCurField);                     /* 4 */
    347573        char *pszDecompositionType = NextField(&pszCurField);             /* 5 */
    348         char *pszDecompositionMapping = NextField(&pszCurField);          /* 6 */
    349         char *pszNumericType = NextField(&pszCurField);                   /* 7 */
    350         char *pszNumericValue = NextField(&pszCurField);                  /* 8 */
     574        char *pszDecompositionMapping = SplitDecompField(&pszDecompositionType);
     575        char *pszNumericType = NextField(&pszCurField);                   /* 6 */
     576        char *pszNumericValueD = NextField(&pszCurField);                 /* 7 */
     577        char *pszNumericValueN = NextField(&pszCurField);                 /* 8 */
    351578        char *pszBidiMirrored = NextField(&pszCurField);                  /* 9 */
    352579        char *pszUnicode1Name = NextField(&pszCurField);                  /* 10 */
     
    358585        RTUNICP CodePoint = ToNum(pszCodePoint);
    359586        if (CodePoint >= RT_ELEMENTS(g_aCPInfo))
     587        {
     588            ParseError("U+05X is out of range\n", CodePoint);
    360589            continue;
     590        }
    361591
    362592        /* catchup? */
     
    365595        if (i != CodePoint)
    366596        {
    367             fprintf(stderr, "unitest: error: i=%d CodePoint=%u\n", i, CodePoint);
    368             fclose(pFile);
     597            ParseError("i=%d CodePoint=%u\n", i, CodePoint);
     598            CloseFile(pFile);
    369599            return 1;
    370600        }
     
    377607        g_aCPInfo[i].SimpleLowerCaseMapping     = ToNumDefault(pszSimpleLowerCaseMapping, CodePoint);
    378608        g_aCPInfo[i].SimpleTitleCaseMapping     = ToNumDefault(pszSimpleTitleCaseMapping, CodePoint);
     609        g_aCPInfo[i].CanonicalCombiningClass    = ToNum(pszCanonicalCombiningClass);
     610        g_aCPInfo[i].pszDecompositionType       = DupStr(pszDecompositionType);
     611        g_aCPInfo[i].paDecompositionMapping     = ToMapping(pszDecompositionMapping, &g_aCPInfo[i].cDecompositionMapping, 20);
    379612        g_aCPInfo[i].pszGeneralCategory         = DupStr(pszGeneralCategory);
    380         g_aCPInfo[i].pszCanonicalCombiningClass = DupStr(pszCanonicalCombiningClass);
    381613        g_aCPInfo[i].pszBidiClass               = DupStr(pszBidiClass);
    382         g_aCPInfo[i].pszDecompositionType       = DupStr(pszDecompositionType);
    383         g_aCPInfo[i].pszDecompositionMapping    = DupStr(pszDecompositionMapping);
    384614        g_aCPInfo[i].pszNumericType             = DupStr(pszNumericType);
    385         g_aCPInfo[i].pszNumericValue            = DupStr(pszNumericValue);
     615        g_aCPInfo[i].pszNumericValueD           = DupStr(pszNumericValueD);
     616        g_aCPInfo[i].pszNumericValueN           = DupStr(pszNumericValueN);
    386617        g_aCPInfo[i].pszBidiMirrored            = DupStr(pszBidiMirrored);
    387618        g_aCPInfo[i].pszUnicode1Name            = DupStr(pszUnicode1Name);
     
    389620        i++;
    390621    }
     622
    391623    /* catchup? */
    392624    while (i < RT_ELEMENTS(g_aCPInfo))
    393625        NullEntry(i++);
    394     fclose(pFile);
     626    CloseFile(pFile);
    395627
    396628    return 0;
     629}
     630
     631
     632/**
     633 * Generates excluded data.
     634 *
     635 * @returns 0 on success, exit code on failure.
     636 */
     637static int GenerateExcludedData(void)
     638{
     639    /*
     640     * Hangul Syllables U+AC00 to U+D7A3.
     641     */
     642    for (RTUNICP i = 0xac00; i <= 0xd7a3; i++)
     643    {
     644        g_aCPInfo[i].fNullEntry = 0;
     645        g_aCPInfo[i].fInvNFD_QC = 1;
     646        /** @todo generate the decomposition: http://unicode.org/reports/tr15/#Hangul
     647         *         */
     648    }
     649
     650    /** @todo
     651     * CJK Ideographs Extension A (U+3400 - U+4DB5)
     652     * CJK Ideographs (U+4E00 - U+9FA5)
     653     * CJK Ideograph Extension B (U+20000 - U+2A6D6)
     654     * CJK Ideograph Extension C (U+2A700 - U+2B734)
     655     */
     656
     657    return 0;
     658}
     659
     660
     661
     662/**
     663 * Worker for ApplyProperty that handles a yes, no, maybe property value.
     664 *
     665 * @returns 0 (NO), 1 (YES), 2 (MAYBE).
     666 * @param   ppszNextField   The field cursor, input and output.
     667 */
     668static int YesNoMaybePropertyValue(char **ppszNextField)
     669{
     670    if (!**ppszNextField)
     671    {
     672        ParseError("Missing Y/N/M field\n");
     673        return 0;
     674    }
     675    char *psz = NextField(ppszNextField);
     676    if (!strcmp(psz, "N"))
     677        return 0;
     678    if (!strcmp(psz, "Y"))
     679        return 1;
     680    if (!strcmp(psz, "M"))
     681        return 2;
     682    ParseError("Unexpected Y/N/M value: '%s'\n",  psz);
     683    return 0;
     684}
     685
     686
     687/**
     688 * Inverted version of YesNoMaybePropertyValue
     689 *
     690 * @returns 1 (NO), 0 (YES), 2 (MAYBE).
     691 * @param   ppszNextField   The field cursor, input and output.
     692 */
     693static int YesNoMaybePropertyValueInv(char **ppszNextField)
     694{
     695    unsigned rc = YesNoMaybePropertyValue(ppszNextField);
     696    switch (rc)
     697    {
     698        case 0:     return 1;
     699        case 1:     return 0;
     700        default:    return rc;
     701    }
    397702}
    398703
     
    404709 * @param   pszProperty The property name.
    405710 */
    406 static void ApplyProperty(RTUNICP StartCP, const char *pszProperty)
     711static void ApplyProperty(RTUNICP StartCP, const char *pszProperty, char *pszNextField)
    407712{
    408713    if (StartCP >= RT_ELEMENTS(g_aCPInfo))
     714    {
     715        ParseError("U+%06X is out of the g_aCPInfo range.\n", StartCP);
    409716        return;
     717    }
    410718    struct CPINFO *pCPInfo = &g_aCPInfo[StartCP];
    411719    /* string switch */
    412     if (!strcmp(pszProperty, "ASCII_Hex_Digit")) pCPInfo->fASCIIHexDigit = 1;
     720         if (!strcmp(pszProperty, "ASCII_Hex_Digit")) pCPInfo->fASCIIHexDigit = 1;
     721    else if (!strcmp(pszProperty, "Alphabetic")) pCPInfo->fAlphabetic = 1;
    413722    else if (!strcmp(pszProperty, "Bidi_Control")) pCPInfo->fBidiControl = 1;
     723    else if (!strcmp(pszProperty, "Case_Ignorable")) pCPInfo->fCaseIgnorable = 1;
     724    else if (!strcmp(pszProperty, "Cased")) pCPInfo->fCased = 1;
     725    else if (!strcmp(pszProperty, "Changes_When_Casefolded")) pCPInfo->fChangesWhenCasefolded = 1;
     726    else if (!strcmp(pszProperty, "Changes_When_Casemapped")) pCPInfo->fChangesWhenCasemapped = 1;
     727    else if (!strcmp(pszProperty, "Changes_When_Lowercased")) pCPInfo->fChangesWhenLowercased = 1;
     728    else if (!strcmp(pszProperty, "Changes_When_Titlecased")) pCPInfo->fChangesWhenTitlecased = 1;
     729    else if (!strcmp(pszProperty, "Changes_When_Uppercased")) pCPInfo->fChangesWhenUppercased = 1;
    414730    else if (!strcmp(pszProperty, "Dash")) pCPInfo->fDash = 1;
     731    else if (!strcmp(pszProperty, "Default_Ignorable_Code_Point")) pCPInfo->fDefaultIgnorableCodePoint = 1;
    415732    else if (!strcmp(pszProperty, "Deprecated")) pCPInfo->fDeprecated = 1;
    416733    else if (!strcmp(pszProperty, "Diacritic")) pCPInfo->fDiacritic = 1;
    417734    else if (!strcmp(pszProperty, "Extender")) pCPInfo->fExtender = 1;
     735    else if (!strcmp(pszProperty, "Grapheme_Base")) pCPInfo->fGraphemeBase = 1;
     736    else if (!strcmp(pszProperty, "Grapheme_Extend")) pCPInfo->fGraphemeExtend = 1;
    418737    else if (!strcmp(pszProperty, "Grapheme_Link")) pCPInfo->fGraphemeLink = 1;
    419738    else if (!strcmp(pszProperty, "Hex_Digit")) pCPInfo->fHexDigit = 1;
    420739    else if (!strcmp(pszProperty, "Hyphen")) pCPInfo->fHyphen = 1;
     740    else if (!strcmp(pszProperty, "ID_Continue")) pCPInfo->fIDContinue = 1;
     741    else if (!strcmp(pszProperty, "ID_Start")) pCPInfo->fIDStart = 1;
    421742    else if (!strcmp(pszProperty, "Ideographic")) pCPInfo->fIdeographic = 1;
    422743    else if (!strcmp(pszProperty, "IDS_Binary_Operator")) pCPInfo->fIDSBinaryOperator = 1;
     
    424745    else if (!strcmp(pszProperty, "Join_Control")) pCPInfo->fJoinControl = 1;
    425746    else if (!strcmp(pszProperty, "Logical_Order_Exception")) pCPInfo->fLogicalOrderException = 1;
     747    else if (!strcmp(pszProperty, "Lowercase")) pCPInfo->fLowercase = 1;
     748    else if (!strcmp(pszProperty, "Math")) pCPInfo->fMath = 1;
    426749    else if (!strcmp(pszProperty, "Noncharacter_Code_Point")) pCPInfo->fNoncharacterCodePoint = 1;
    427750    else if (!strcmp(pszProperty, "Other_Alphabetic")) pCPInfo->fOtherAlphabetic = 1;
     
    433756    else if (!strcmp(pszProperty, "Other_Math")) pCPInfo->fOtherMath = 1;
    434757    else if (!strcmp(pszProperty, "Other_Uppercase")) pCPInfo->fOtherUppercase = 1;
    435     else if (!strcmp(pszProperty, "Alphabetic")) pCPInfo->fAlphabetic = 1;
    436     else if (!strcmp(pszProperty, "Default_Ignorable_Code_Point")) pCPInfo->fDefaultIgnorableCodePoint = 1;
    437     else if (!strcmp(pszProperty, "Grapheme_Base")) pCPInfo->fGraphemeBase = 1;
    438     else if (!strcmp(pszProperty, "Grapheme_Extend")) pCPInfo->fGraphemeExtend = 1;
    439     else if (!strcmp(pszProperty, "ID_Continue")) pCPInfo->fIDContinue = 1;
    440     else if (!strcmp(pszProperty, "ID_Start")) pCPInfo->fIDStart = 1;
    441     else if (!strcmp(pszProperty, "XID_Continue")) pCPInfo->fXIDContinue = 1;
    442     else if (!strcmp(pszProperty, "XID_Start")) pCPInfo->fXIDStart = 1;
    443     else if (!strcmp(pszProperty, "Lowercase")) pCPInfo->fLowercase = 1;
    444     else if (!strcmp(pszProperty, "Math")) pCPInfo->fMath = 1;
    445     else if (!strcmp(pszProperty, "Uppercase")) pCPInfo->fUppercase = 1;
    446758    else if (!strcmp(pszProperty, "Pattern_Syntax")) pCPInfo->fPatternSyntax = 1;
    447759    else if (!strcmp(pszProperty, "Pattern_White_Space")) pCPInfo->fPatternWhiteSpace = 1;
     
    452764    else if (!strcmp(pszProperty, "Terminal_Punctuation")) pCPInfo->fTerminalPunctuation = 1;
    453765    else if (!strcmp(pszProperty, "Unified_Ideograph")) pCPInfo->fUnifiedIdeograph = 1;
     766    else if (!strcmp(pszProperty, "Uppercase")) pCPInfo->fUppercase = 1;
    454767    else if (!strcmp(pszProperty, "Variation_Selector")) pCPInfo->fVariationSelector = 1;
    455768    else if (!strcmp(pszProperty, "White_Space")) pCPInfo->fWhiteSpace = 1;
     769    else if (!strcmp(pszProperty, "XID_Continue")) pCPInfo->fXIDContinue = 1;
     770    else if (!strcmp(pszProperty, "XID_Start")) pCPInfo->fXIDStart = 1;
     771    /* DerivedNormalizationProps: */
     772    else if (!strcmp(pszProperty, "FC_NFKC")) return; /* ignored */
     773    else if (!strcmp(pszProperty, "Full_Composition_Exclusion")) pCPInfo->fFullCompositionExclusion = 1;
     774    else if (!strcmp(pszProperty, "NFC_QC"))  pCPInfo->fInvNFC_QC  = YesNoMaybePropertyValueInv(&pszNextField);
     775    else if (!strcmp(pszProperty, "NFD_QC"))  pCPInfo->fInvNFD_QC  = YesNoMaybePropertyValueInv(&pszNextField);
     776    else if (!strcmp(pszProperty, "NFKC_QC")) pCPInfo->fInvNFKC_QC = YesNoMaybePropertyValueInv(&pszNextField);
     777    else if (!strcmp(pszProperty, "NFKD_QC")) pCPInfo->fInvNFKD_QC = YesNoMaybePropertyValueInv(&pszNextField);
     778    else if (!strcmp(pszProperty, "Expands_On_NFC"))  pCPInfo->fExpandsOnNFC  = 1;
     779    else if (!strcmp(pszProperty, "Expands_On_NFD"))  pCPInfo->fExpandsOnNFD  = 1;
     780    else if (!strcmp(pszProperty, "Expands_On_NFKC")) pCPInfo->fExpandsOnNFKC = 1;
     781    else if (!strcmp(pszProperty, "Expands_On_NFKD")) pCPInfo->fExpandsOnNFKD = 1;
     782    else if (!strcmp(pszProperty, "NFKC_CF")) return; /*ignore */
     783    else if (!strcmp(pszProperty, "Changes_When_NFKC_Casefolded")) return; /*ignore */
    456784    else
    457         fprintf(stderr, "uniread: Unknown property '%s'\n", pszProperty);
     785    {
     786        ParseError("Unknown property '%s'\n", pszProperty);
     787        return;
     788    }
     789
     790    if (pszNextField && *pszNextField)
     791        ParseError("Unexpected next field: '%s'\n", pszNextField);
    458792}
    459793
     
    467801 * @returns 0 on success.
    468802 * @returns !0 on failure.
     803 * @param   pszBasePath         The base path, can be NULL.
    469804 * @param   pszFilename     The name of the file.
    470805 */
    471 static int ReadProperties(const char *pszFilename)
     806static int ReadProperties(const char *pszBasePath, const char *pszFilename)
    472807{
    473808    /*
    474809     * Open input.
    475810     */
    476     FILE *pFile = fopen(pszFilename, "r");
     811    FILE *pFile = OpenFile(pszBasePath, pszFilename);
    477812    if (!pFile)
    478     {
    479         printf("uniread: failed to open '%s' for reading\n", pszFilename);
    480813        return 1;
    481     }
    482814
    483815    /*
     
    485817     */
    486818    char szLine[4096];
    487     while (fgets(szLine, sizeof(szLine), pFile) != NULL)
     819    while (GetLineFromFile(szLine, sizeof(szLine), pFile) != NULL)
    488820    {
    489821        if (IsCommentOrBlankLine(szLine))
    490822            continue;
    491823        char *pszCurField;
    492         char *pszRange = FirstField(&pszCurField, StripLine(szLine));
     824        char *pszRange    = FirstField(&pszCurField, StripLine(szLine));
    493825        char *pszProperty = NextField(&pszCurField);
    494826        if (!*pszProperty)
     827        {
     828            ParseError("no property field.\n");
    495829            continue;
     830        }
    496831
    497832        RTUNICP LastCP;
     
    501836
    502837        while (StartCP <= LastCP)
    503             ApplyProperty(StartCP++, pszProperty);
    504     }
    505 
    506     fclose(pFile);
     838            ApplyProperty(StartCP++, pszProperty, pszCurField);
     839    }
     840
     841    CloseFile(pFile);
    507842
    508843    return 0;
     
    547882    if (pInfo->fLowercase || pInfo->fOtherLowercase)
    548883        AppendFlag(pszFlags, "RTUNI_LOWER");
    549     //if (pInfo->fNumeric)
    550     //    AppendFlag(pszFlags, "RTUNI_NUMERIC");
     884    //if (pInfo->???)
     885    //    AppendFlag(pszFlags, "RTUNI_BSPACE");
     886    if (pInfo->fInvNFD_QC != 0 || pInfo->fInvNFC_QC != 0)
     887    {
     888        AppendFlag(pszFlags, "RTUNI_QC_NFX");
     889        if (!pInfo->paDecompositionMapping && pInfo->fInvNFD_QC)
     890            fprintf(stderr, "uniread: U+%05X is QC_NFD but has no mappings.\n", pInfo->CodePoint);
     891        else if (*pInfo->pszDecompositionType && pInfo->fInvNFD_QC)
     892            fprintf(stderr, "uniread: U+%05X is QC_NFD but has no canonical mappings.\n", pInfo->CodePoint);
     893    }
     894    else if (pInfo->paDecompositionMapping && !*pInfo->pszDecompositionType)
     895        fprintf(stderr, "uniread: U+%05X is not QC_NFX but has canonical mappings.\n", pInfo->CodePoint);
     896
    551897    if (!*pszFlags)
    552898    {
     
    558904}
    559905
     906
     907/**
     908 * printf wrapper for the primary output stream.
     909 *
     910 * @returns See vfprintf.
     911 * @param   pszFormat           The vfprintf format string.
     912 * @param   ...                 The format arguments.
     913 */
     914static int Stream1Printf(const char *pszFormat, ...)
     915{
     916    int     cch;
     917    va_list va;
     918    va_start(va, pszFormat);
     919    if (!g_fQuiet)
     920        cch = vfprintf(stdout, pszFormat, va);
     921    else
     922        cch = strlen(pszFormat);
     923    va_end(va);
     924    return cch;
     925}
     926
     927
    560928/** the data store for stream two. */
    561929static char g_szStream2[10240];
    562 static unsigned g_offStream2 = 0;
     930static unsigned volatile g_offStream2 = 0;
    563931
    564932/**
     
    576944static int Stream2Flush(void)
    577945{
    578     fwrite(g_szStream2, 1, g_offStream2, stdout);
     946    g_szStream2[g_offStream2] = '\0';
     947    Stream1Printf("%s", g_szStream2);
     948    Stream2Init();
    579949    return 0;
    580950}
     
    585955static int Stream2Printf(const char *pszFormat, ...)
    586956{
     957    unsigned offStream2 = g_offStream2;
    587958    va_list va;
    588959    va_start(va, pszFormat);
    589     int cch = vsprintf(&g_szStream2[g_offStream2], pszFormat, va);
     960    int cch = vsprintf(&g_szStream2[offStream2], pszFormat, va);
    590961    va_end(va);
    591     g_offStream2 += cch;
    592     if (g_offStream2 >= sizeof(g_szStream2))
     962    offStream2 += cch;
     963    if (offStream2 >= sizeof(g_szStream2))
    593964    {
    594965        fprintf(stderr, "error: stream2 overflow!\n");
    595966        exit(1);
    596967    }
     968    g_offStream2 = offStream2;
    597969    return cch;
    598970}
     
    604976int PrintHeader(const char *argv0)
    605977{
    606     /*
    607      * Print file header.
    608      */
    609     printf("/** @file\n"
    610            " *\n"
    611            " * IPRT - Unicode Tables\n"
    612            " *\n"
    613            " *      Automatically Generated by %s (" __DATE__ " " __TIME__ ")\n"
    614            " */\n\n"
    615            "/*\n"
    616            " * Copyright (C) 2006-2008 Sun Microsystems, Inc.\n"
    617            " *\n"
    618            " * This file is part of VirtualBox Open Source Edition (OSE), as\n"
    619            " * available from http://www.virtualbox.org. This file is free software;\n"
    620            " * you can redistribute it and/or modify it under the terms of the GNU\n"
    621            " * General Public License as published by the Free Software Foundation,\n"
    622            " * in version 2 as it comes in the \"COPYING\" file of the VirtualBox OSE\n"
    623            " * distribution. VirtualBox OSE is distributed in the hope that it will\n"
    624            " * be useful, but WITHOUT ANY WARRANTY of any kind.\n"
    625            " *\n"
    626            "\n"
    627            "#include <iprt/uni.h>\n"
    628            "\n",
    629            argv0);
     978    Stream1Printf("/** @file\n"
     979                  " *\n"
     980                  " * IPRT - Unicode Tables.\n"
     981                  " *\n"
     982                  " * Automatically Generated by %s (" __DATE__ " " __TIME__ ")\n"
     983                  " */\n"
     984                  "\n"
     985                  "/*\n"
     986                  " * Copyright (C) 2006-2010 Oracle Corporation\n"
     987                  " *\n"
     988                  " * This file is part of VirtualBox Open Source Edition (OSE), as\n"
     989                  " * available from http://www.virtualbox.org. This file is free software;\n"
     990                  " * you can redistribute it and/or modify it under the terms of the GNU\n"
     991                  " * General Public License (GPL) as published by the Free Software\n"
     992                  " * Foundation, in version 2 as it comes in the \"COPYING\" file of the\n"
     993                  " * VirtualBox OSE distribution. VirtualBox OSE is distributed in the\n"
     994                  " * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.\n"
     995                  " *\n"
     996                  " * The contents of this file may alternatively be used under the terms\n"
     997                  " * of the Common Development and Distribution License Version 1.0\n"
     998                  " * (CDDL) only, as it comes in the \"COPYING.CDDL\" file of the\n"
     999                  " * VirtualBox OSE distribution, in which case the provisions of the\n"
     1000                  " * CDDL are applicable instead of those of the GPL.\n"
     1001                  " *\n"
     1002                  " * You may elect to license modified versions of this file under the\n"
     1003                  " * terms and conditions of either the GPL or the CDDL or both.\n"
     1004                  " */\n"
     1005                  "\n"
     1006                  "#include <iprt/uni.h>\n"
     1007                  "\n",
     1008                  argv0);
    6301009    return 0;
    6311010}
     
    6501029        char szFlags[256];
    6511030        unsigned iNonNull = i;
    652         while (     (g_aCPInfo[iNonNull].fNullEntry || !CalcFlags(&g_aCPInfo[iNonNull], szFlags))
    653                &&   iNonNull < RT_ELEMENTS(g_aCPInfo)
    654                &&   iNonNull >= 256)
     1031        while (   iNonNull < RT_ELEMENTS(g_aCPInfo)
     1032               && iNonNull >= 256
     1033               && (g_aCPInfo[iNonNull].fNullEntry || !CalcFlags(&g_aCPInfo[iNonNull], szFlags)) )
    6551034            iNonNull++;
    6561035        if (iNonNull - i > 4096 || iNonNull == RT_ELEMENTS(g_aCPInfo))
     
    6581037            if (iStart >= 0)
    6591038            {
    660                 printf("};\n\n");
     1039                Stream1Printf("};\n\n");
    6611040                Stream2Printf("    { 0x%06x, 0x%06x, &g_afRTUniFlags0x%06x[0] },\n", iStart, i, iStart);
    6621041                iStart = -1;
     
    6681047            if (iStart < 0)
    6691048            {
    670                 printf("static const uint8_t g_afRTUniFlags0x%06x[] = \n"
    671                        "{\n", i);
     1049                Stream1Printf("static const uint8_t g_afRTUniFlags0x%06x[] = \n"
     1050                              "{\n", i);
    6721051                iStart = i;
    6731052            }
    6741053            CalcFlags(&g_aCPInfo[i], szFlags);
    675             printf("    %50s, /* U+%06x: %s*/\n", szFlags, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
     1054            Stream1Printf("    %50s, /* U+%06x: %s*/\n", szFlags, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
    6761055            i++;
    6771056        }
     
    6791058    Stream2Printf("    { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
    6801059                  "};\n\n\n");
    681     printf("\n");
     1060    Stream1Printf("\n");
    6821061    return Stream2Flush();
    6831062}
     
    7061085            if (iStart >= 0)
    7071086            {
    708                 printf("};\n\n");
     1087                Stream1Printf("};\n\n");
    7091088                Stream2Printf("    { 0x%06x, 0x%06x, &g_afRTUniUpper0x%06x[0] },\n", iStart, i, iStart);
    7101089                iStart = -1;
     
    7161095            if (iStart < 0)
    7171096            {
    718                 printf("static const RTUNICP g_afRTUniUpper0x%06x[] = \n"
    719                        "{\n", i);
     1097                Stream1Printf("static const RTUNICP g_afRTUniUpper0x%06x[] = \n"
     1098                              "{\n", i);
    7201099                iStart = i;
    7211100            }
    722             printf("    0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleUpperCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
     1101            Stream1Printf("    0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleUpperCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
    7231102            i++;
    7241103        }
     
    7261105    Stream2Printf("    { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
    7271106                  "};\n\n\n");
    728     printf("\n");
     1107    Stream1Printf("\n");
    7291108    return Stream2Flush();
    7301109}
     
    7531132            if (iStart >= 0)
    7541133            {
    755                 printf("};\n\n");
     1134                Stream1Printf("};\n\n");
    7561135                Stream2Printf("    { 0x%06x, 0x%06x, &g_afRTUniLower0x%06x[0] },\n", iStart, i, iStart);
    7571136                iStart = -1;
     
    7631142            if (iStart < 0)
    7641143            {
    765                 printf("static const RTUNICP g_afRTUniLower0x%06x[] = \n"
    766                        "{\n", i);
     1144                Stream1Printf("static const RTUNICP g_afRTUniLower0x%06x[] = \n"
     1145                              "{\n", i);
    7671146                iStart = i;
    7681147            }
    769             printf("    0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleLowerCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
     1148            Stream1Printf("    0x%02x, /* U+%06x: %s*/\n",
     1149                          g_aCPInfo[i].SimpleLowerCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
    7701150            i++;
    7711151        }
     
    7731153    Stream2Printf("    { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
    7741154                  "};\n\n\n");
    775     printf("\n");
     1155    Stream1Printf("\n");
    7761156    return Stream2Flush();
    7771157}
     
    7851165    if (argc <= 1)
    7861166    {
    787         printf("usage: %s [UnicodeData.txt [DerivedCoreProperties.txt [PropList.txt]]]\n", argv[0]);
     1167        printf("usage: %s [-C|--dir <UCD-dir>] [UnicodeData.txt [DerivedCoreProperties.txt [PropList.txt] [DerivedNormalizationProps.txt]]]\n",
     1168                argv[0]);
    7881169        return 1;
    7891170    }
    7901171
    791     const char *pszUnicodeData              = "UnicodeData.txt";
    792     const char *pszDerivedCoreProperties    = "DerivedCoreProperties.txt";
    793     const char *pszPropList                 = "PropList.txt";
     1172    const char *pszBaseDir                      = NULL;
     1173    const char *pszUnicodeData                  = "UnicodeData.txt";
     1174    const char *pszDerivedCoreProperties        = "DerivedCoreProperties.txt";
     1175    const char *pszPropList                     = "PropList.txt";
     1176    const char *pszDerivedNormalizationProps    = "DerivedNormalizationProps.txt";
    7941177    int iFile = 0;
    7951178    for (int argi = 1;  argi < argc; argi++)
     
    7991182            switch (iFile++)
    8001183            {
    801                 case 0: pszUnicodeData = argv[argi]; break;
    802                 case 1: pszDerivedCoreProperties = argv[argi]; break;
    803                 case 2: pszPropList = argv[argi]; break;
     1184                case 0: pszUnicodeData                  = argv[argi]; break;
     1185                case 1: pszDerivedCoreProperties        = argv[argi]; break;
     1186                case 2: pszPropList                     = argv[argi]; break;
     1187                case 3: pszDerivedNormalizationProps    = argv[argi]; break;
    8041188                default:
    805                     printf("uniread: syntax error at '%s': too many filenames\n", argv[argi]);
     1189                    fprintf(stderr, "uniread: syntax error at '%s': too many filenames\n", argv[argi]);
    8061190                    return 1;
    8071191            }
    8081192        }
     1193        else if (   !strcmp(argv[argi], "--dir")
     1194                 || !strcmp(argv[argi], "-C"))
     1195        {
     1196            if (argi + 1 >= argc)
     1197            {
     1198                fprintf(stderr, "uniread: syntax error: '%s' is missing the directory name.\n", argv[argi]);
     1199                return 1;
     1200            }
     1201            argi++;
     1202            pszBaseDir = argv[argi];
     1203        }
     1204        else if (   !strcmp(argv[argi], "-q")
     1205                 || !strcmp(argv[argi], "--quiet"))
     1206            g_fQuiet = true;
    8091207        else
    8101208        {
    811             printf("uniread: syntax error at '%s': Unknown argument\n", argv[argi]);
     1209            fprintf(stderr, "uniread: syntax error at '%s': Unknown argument\n", argv[argi]);
    8121210            return 1;
    8131211        }
     
    8171215     * Read the data.
    8181216     */
    819     int rc = ReadUnicodeData(pszUnicodeData);
     1217    int rc = ReadUnicodeData(pszBaseDir, pszUnicodeData);
    8201218    if (rc)
    8211219        return rc;
    822     rc = ReadProperties(pszPropList);
     1220    rc = GenerateExcludedData();
    8231221    if (rc)
    8241222        return rc;
    825     rc = ReadProperties(pszDerivedCoreProperties);
     1223    rc = ReadProperties(pszBaseDir, pszPropList);
     1224    if (rc)
     1225        return rc;
     1226    rc = ReadProperties(pszBaseDir, pszDerivedCoreProperties);
     1227    if (rc)
     1228        return rc;
     1229    rc = ReadProperties(pszBaseDir, pszDerivedNormalizationProps);
    8261230    if (rc)
    8271231        return rc;
Note: See TracChangeset for help on using the changeset viewer.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette