VirtualBox

Ignore:
Timestamp:
Dec 10, 2009 1:22:48 PM (15 years ago)
Author:
vboxsync
Message:

IPRT: splitting up utf-8.cpp

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/VBox/Runtime/common/string/utf-8.cpp

    r25000 r25296  
    55
    66/*
    7  * Copyright (C) 2006-2007 Sun Microsystems, Inc.
     7 * Copyright (C) 2006-2009 Sun Microsystems, Inc.
    88 *
    99 * This file is part of VirtualBox Open Source Edition (OSE), as
     
    11761176RT_EXPORT_SYMBOL(RTStrPrevCp);
    11771177
    1178 
    1179 /**
    1180  * Performs a case sensitive string compare between two UTF-8 strings.
    1181  *
    1182  * Encoding errors are ignored by the current implementation. So, the only
    1183  * difference between this and the CRT strcmp function is the handling of
    1184  * NULL arguments.
    1185  *
    1186  * @returns < 0 if the first string less than the second string.
    1187  * @returns 0 if the first string identical to the second string.
    1188  * @returns > 0 if the first string greater than the second string.
    1189  * @param   psz1        First UTF-8 string. Null is allowed.
    1190  * @param   psz2        Second UTF-8 string. Null is allowed.
    1191  */
    1192 RTDECL(int) RTStrCmp(const char *psz1, const char *psz2)
    1193 {
    1194     if (psz1 == psz2)
    1195         return 0;
    1196     if (!psz1)
    1197         return -1;
    1198     if (!psz2)
    1199         return 1;
    1200 
    1201     return strcmp(psz1, psz2);
    1202 }
    1203 RT_EXPORT_SYMBOL(RTStrCmp);
    1204 
    1205 
    1206 /**
    1207  * Performs a case sensitive string compare between two UTF-8 strings, given
    1208  * a maximum string length.
    1209  *
    1210  * Encoding errors are ignored by the current implementation. So, the only
    1211  * difference between this and the CRT strncmp function is the handling of
    1212  * NULL arguments.
    1213  *
    1214  * @returns < 0 if the first string less than the second string.
    1215  * @returns 0 if the first string identical to the second string.
    1216  * @returns > 0 if the first string greater than the second string.
    1217  * @param   psz1        First UTF-8 string. Null is allowed.
    1218  * @param   psz2        Second UTF-8 string. Null is allowed.
    1219  * @param   cchMax      The maximum string length
    1220  */
    1221 RTDECL(int) RTStrNCmp(const char *psz1, const char *psz2, size_t cchMax)
    1222 {
    1223     if (psz1 == psz2)
    1224         return 0;
    1225     if (!psz1)
    1226         return -1;
    1227     if (!psz2)
    1228         return 1;
    1229 
    1230     return strncmp(psz1, psz2, cchMax);
    1231 }
    1232 RT_EXPORT_SYMBOL(RTStrNCmp);
    1233 
    1234 
    1235 /**
    1236  * Performs a case insensitive string compare between two UTF-8 strings.
    1237  *
    1238  * This is a simplified compare, as only the simplified lower/upper case folding
    1239  * specified by the unicode specs are used. It does not consider character pairs
    1240  * as they are used in some languages, just simple upper & lower case compares.
    1241  *
    1242  * The result is the difference between the mismatching codepoints after they
    1243  * both have been lower cased.
    1244  *
    1245  * If the string encoding is invalid the function will assert (strict builds)
    1246  * and use RTStrCmp for the remainder of the string.
    1247  *
    1248  * @returns < 0 if the first string less than the second string.
    1249  * @returns 0 if the first string identical to the second string.
    1250  * @returns > 0 if the first string greater than the second string.
    1251  * @param   psz1        First UTF-8 string. Null is allowed.
    1252  * @param   psz2        Second UTF-8 string. Null is allowed.
    1253  */
    1254 RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
    1255 {
    1256     if (psz1 == psz2)
    1257         return 0;
    1258     if (!psz1)
    1259         return -1;
    1260     if (!psz2)
    1261         return 1;
    1262 
    1263     const char *pszStart1 = psz1;
    1264     for (;;)
    1265     {
    1266         /* Get the codepoints */
    1267         RTUNICP cp1;
    1268         int rc = RTStrGetCpEx(&psz1, &cp1);
    1269         if (RT_FAILURE(rc))
    1270         {
    1271             AssertRC(rc);
    1272             psz1--;
    1273             break;
    1274         }
    1275 
    1276         RTUNICP cp2;
    1277         rc = RTStrGetCpEx(&psz2, &cp2);
    1278         if (RT_FAILURE(rc))
    1279         {
    1280             AssertRC(rc);
    1281             psz2--;
    1282             psz1 = RTStrPrevCp(pszStart1, psz1);
    1283             break;
    1284         }
    1285 
    1286         /* compare */
    1287         int iDiff = cp1 - cp2;
    1288         if (iDiff)
    1289         {
    1290             iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
    1291             if (iDiff)
    1292             {
    1293                 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
    1294                 if (iDiff)
    1295                     return iDiff;
    1296             }
    1297         }
    1298 
    1299         /* hit the terminator? */
    1300         if (!cp1)
    1301             return 0;
    1302     }
    1303 
    1304     /* Hit some bad encoding, continue in case insensitive mode. */
    1305     return RTStrCmp(psz1, psz2);
    1306 }
    1307 RT_EXPORT_SYMBOL(RTStrICmp);
    1308 
    1309 
    1310 /**
    1311  * Performs a case insensitive string compare between two UTF-8 strings, given a
    1312  * maximum string length.
    1313  *
    1314  * This is a simplified compare, as only the simplified lower/upper case folding
    1315  * specified by the unicode specs are used. It does not consider character pairs
    1316  * as they are used in some languages, just simple upper & lower case compares.
    1317  *
    1318  * The result is the difference between the mismatching codepoints after they
    1319  * both have been lower cased.
    1320  *
    1321  * If the string encoding is invalid the function will assert (strict builds)
    1322  * and use RTStrCmp for the remainder of the string.
    1323  *
    1324  * @returns < 0 if the first string less than the second string.
    1325  * @returns 0 if the first string identical to the second string.
    1326  * @returns > 0 if the first string greater than the second string.
    1327  * @param   psz1        First UTF-8 string. Null is allowed.
    1328  * @param   psz2        Second UTF-8 string. Null is allowed.
    1329  * @param   cchMax      Maximum string length
    1330  */
    1331 RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
    1332 {
    1333     if (cchMax == 0)
    1334         return 0;
    1335     if (psz1 == psz2)
    1336         return 0;
    1337     if (!psz1)
    1338         return -1;
    1339     if (!psz2)
    1340         return 1;
    1341 
    1342     for (;;)
    1343     {
    1344         /* Get the codepoints */
    1345         RTUNICP cp1;
    1346         size_t cchMax2 = cchMax;
    1347         int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);
    1348         if (RT_FAILURE(rc))
    1349         {
    1350             AssertRC(rc);
    1351             psz1--;
    1352             cchMax++;
    1353             break;
    1354         }
    1355 
    1356         RTUNICP cp2;
    1357         rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);
    1358         if (RT_FAILURE(rc))
    1359         {
    1360             AssertRC(rc);
    1361             psz2--;
    1362             psz1 -= (cchMax - cchMax2 + 1);  /* This can't overflow, can it? */
    1363             cchMax = cchMax2 + 1;
    1364             break;
    1365         }
    1366 
    1367         /* compare */
    1368         int iDiff = cp1 - cp2;
    1369         if (iDiff)
    1370         {
    1371             iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
    1372             if (iDiff)
    1373             {
    1374                 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
    1375                 if (iDiff)
    1376                     return iDiff;
    1377             }
    1378         }
    1379 
    1380         /* hit the terminator? */
    1381         if (!cp1 || cchMax == 0)
    1382             return 0;
    1383     }
    1384 
    1385     /* Hit some bad encoding, continue in case insensitive mode. */
    1386     return RTStrNCmp(psz1, psz2, cchMax);
    1387 }
    1388 RT_EXPORT_SYMBOL(RTStrNICmp);
    1389 
    1390 
    1391 RTDECL(char *) RTStrStr(const char *pszHaystack, const char *pszNeedle)
    1392 {
    1393     /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
    1394     if (!pszHaystack)
    1395         return NULL;
    1396     if (!pszNeedle)
    1397         return NULL;
    1398 
    1399     /* The rest is CRT. */
    1400     return (char *)strstr(pszHaystack, pszNeedle);
    1401 }
    1402 RT_EXPORT_SYMBOL(RTStrStr);
    1403 
    1404 
    1405 RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
    1406 {
    1407     /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
    1408     if (!pszHaystack)
    1409         return NULL;
    1410     if (!pszNeedle)
    1411         return NULL;
    1412 
    1413     /* The empty string matches everything. */
    1414     if (!*pszNeedle)
    1415         return (char *)pszHaystack;
    1416 
    1417     /*
    1418      * The search strategy is to pick out the first char of the needle, fold it,
    1419      * and match it against the haystack code point by code point. When encountering
    1420      * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
    1421      */
    1422     const char * const pszNeedleStart = pszNeedle;
    1423     RTUNICP Cp0;
    1424     RTStrGetCpEx(&pszNeedle, &Cp0);     /* pszNeedle is advanced one code point. */
    1425     size_t const    cchNeedle   = strlen(pszNeedle);
    1426     size_t const    cchNeedleCp0= pszNeedle - pszNeedleStart;
    1427     RTUNICP const   Cp0Lower    = RTUniCpToLower(Cp0);
    1428     RTUNICP const   Cp0Upper    = RTUniCpToUpper(Cp0);
    1429     if (    Cp0Lower == Cp0Upper
    1430         &&  Cp0Lower == Cp0)
    1431     {
    1432         /* Cp0 is not a case sensitive char. */
    1433         for (;;)
    1434         {
    1435             RTUNICP Cp;
    1436             RTStrGetCpEx(&pszHaystack, &Cp);
    1437             if (!Cp)
    1438                 break;
    1439             if (    Cp == Cp0
    1440                 &&  !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
    1441                 return (char *)pszHaystack - cchNeedleCp0;
    1442         }
    1443     }
    1444     else if (   Cp0Lower == Cp0
    1445              || Cp0Upper != Cp0)
    1446     {
    1447         /* Cp0 is case sensitive */
    1448         for (;;)
    1449         {
    1450             RTUNICP Cp;
    1451             RTStrGetCpEx(&pszHaystack, &Cp);
    1452             if (!Cp)
    1453                 break;
    1454             if (    (   Cp == Cp0Upper
    1455                      || Cp == Cp0Lower)
    1456                 &&  !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
    1457                 return (char *)pszHaystack - cchNeedleCp0;
    1458         }
    1459     }
    1460     else
    1461     {
    1462         /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
    1463         for (;;)
    1464         {
    1465             RTUNICP Cp;
    1466             RTStrGetCpEx(&pszHaystack, &Cp);
    1467             if (!Cp)
    1468                 break;
    1469             if (    (   Cp == Cp0
    1470                      || Cp == Cp0Upper
    1471                      || Cp == Cp0Lower)
    1472                 &&  !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
    1473                 return (char *)pszHaystack - cchNeedleCp0;
    1474         }
    1475     }
    1476 
    1477 
    1478     return NULL;
    1479 }
    1480 RT_EXPORT_SYMBOL(RTStrIStr);
    1481 
    1482 
    1483 RTDECL(char *) RTStrToLower(char *psz)
    1484 {
    1485     /*
    1486      * Loop the code points in the string, converting them one by one.
    1487      * ASSUMES that the code points for upper and lower case are encoded
    1488      *         with the exact same length.
    1489      */
    1490     /** @todo Handled bad encodings correctly+quietly, remove assumption,
    1491      *        optimize. */
    1492     char *pszCur = psz;
    1493     while (*pszCur)
    1494     {
    1495         RTUNICP cp = RTStrGetCp(pszCur);
    1496         cp = RTUniCpToLower(cp);
    1497         pszCur = RTStrPutCp(pszCur, cp);
    1498     }
    1499     return psz;
    1500 }
    1501 RT_EXPORT_SYMBOL(RTStrToLower);
    1502 
    1503 
    1504 RTDECL(char *) RTStrToUpper(char *psz)
    1505 {
    1506     /*
    1507      * Loop the code points in the string, converting them one by one.
    1508      * ASSUMES that the code points for upper and lower case are encoded
    1509      *         with the exact same length.
    1510      */
    1511     /** @todo Handled bad encodings correctly+quietly, remove assumption,
    1512      *        optimize. */
    1513     char *pszCur = psz;
    1514     while(*pszCur)
    1515     {
    1516         RTUNICP cp = RTStrGetCp(pszCur);
    1517         cp = RTUniCpToUpper(cp);
    1518         pszCur = RTStrPutCp(pszCur, cp);
    1519     }
    1520     return psz;
    1521 }
    1522 RT_EXPORT_SYMBOL(RTStrToUpper);
    1523 
Note: See TracChangeset for help on using the changeset viewer.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette