VirtualBox

Changeset 51795 in vbox for trunk/src/VBox/Runtime/testcase


Ignore:
Timestamp:
Jul 2, 2014 1:01:50 AM (11 years ago)
Author:
vboxsync
Message:

UTF-8 case folding hacks to deal with U+0130, U+0131, U+017f and U+1fbe, since these don't have any roundtrip-compatible mappings, nor do they case fold into code points with the same UTF-8 length. They map to other characters in other blocks.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/VBox/Runtime/testcase/tstIprtMiniString.cpp

    r48935 r51795  
    372372}
    373373
     374#if 0
     375/**
     376 * Detects a few annoying Unicode code points with unstable case folding for UTF-8.
     377 *
     378 * Unicode 4.01, I think, introduced a few code points with lower/upper mappings
     379 * that have a different length when encoded as UTF-8.  This breaks some
     380 * assumptions we used to make.  Since it's just a handful of code points, we'll
     381 * detect them and ignore them here.  The actual case folding functions in
     382 * IPRT will of course deal with this in a more robust manner.
     383 *
     384 * @returns true if problematic, false if not (currently always false, the check is disabled).
     385 * @param   uc      The code point to check.
     386 */
     387static bool isUnevenUtf8FoldingCp(RTUNICP uc)
     388{
     389    RTUNICP ucLower = RTUniCpToLower(uc);
     390    RTUNICP ucUpper = RTUniCpToUpper(uc);
     391    //return RTUniCpCalcUtf8Len(ucLower) != RTUniCpCalcUtf8Len(ucUpper);
     392    return false;
     393}
     394#endif
     395
    374396static void test2(RTTEST hTest)
    375397{
     
    384406    } while (0)
    385407
    386     RTCString strTmp;
     408    RTCString strTmp, strExpect;
    387409    char szDst[16];
     410
     411    /* Some simple ascii stuff. */
     412    strTmp    = "abcdefghijklmnopqrstuvwxyz0123456ABCDEFGHIJKLMNOPQRSTUVWXYZ;-+/\\";
     413    strExpect = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456ABCDEFGHIJKLMNOPQRSTUVWXYZ;-+/\\";
     414    strTmp.toUpper();
     415    CHECK_EQUAL(strTmp, strExpect);
     416
     417    strTmp.toLower();
     418    strExpect = "abcdefghijklmnopqrstuvwxyz0123456abcdefghijklmnopqrstuvwxyz;-+/\\";
     419    CHECK_EQUAL(strTmp, strExpect);
     420
     421    strTmp    = "abcdefghijklmnopqrstuvwxyz0123456ABCDEFGHIJKLMNOPQRSTUVWXYZ;-+/\\";
     422    strTmp.toLower();
     423    CHECK_EQUAL(strTmp, strExpect);
    388424
    389425    /* Collect all upper and lower case code points. */
     
    396432    for (RTUNICP uc = 1; uc <= 0x10fffd; uc++)
    397433    {
     434        /* Unicode 4.01, I think, introduced a few codepoints with lower/upper mappings
     435           that aren't up for roundtrips and which case folding has a different UTF-8
     436           length.  We'll just skip them here as there are very few:
     437            - Dotless small i and dotless capital I folds into ASCII I and i.
     438            - The small letter long s folds to ASCII S.
     439            - Greek prosgegrammeni folds to iota, which is a letter with both upper
     440              and lower case foldings of its own. */
     441        if (uc == 0x131 || uc == 0x130 || uc == 0x17f || 0x1fbe)
     442            continue;
     443
    398444        if (RTUniCpIsLower(uc))
    399445        {
     
    445491        size_t const        cchLower2 = pszDstEnd - &szDst[0];
    446492        RTTESTI_CHECK_MSG(cchDst == cchLower2,
    447                           ("ucLower2=%#x %u bytes;  ucUpper=%#x %u bytes\n",
    448                            ucLower2, cchLower2, ucUpper, cchDst));
     493                          ("ucLower2=%#x %u bytes;  ucUpper=%#x %u bytes; ucLower=%#x\n",
     494                           ucLower2, cchLower2, ucUpper, cchDst, ucLower));
    449495    }
    450496    RTTESTI_CHECK(strlen(strUpper2.c_str()) == strUpper2.length());
Note: See TracChangeset for help on using the changeset viewer.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette