Changeset 25296 in vbox for trunk/src/VBox/Runtime/common/string/utf-8.cpp
- Timestamp:
- Dec 10, 2009 1:22:48 PM (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/string/utf-8.cpp
r25000 r25296 5 5 6 6 /* 7 * Copyright (C) 2006-200 7Sun Microsystems, Inc.7 * Copyright (C) 2006-2009 Sun Microsystems, Inc. 8 8 * 9 9 * This file is part of VirtualBox Open Source Edition (OSE), as … … 1176 1176 RT_EXPORT_SYMBOL(RTStrPrevCp); 1177 1177 1178 1179 /**1180 * Performs a case sensitive string compare between two UTF-8 strings.1181 *1182 * Encoding errors are ignored by the current implementation. So, the only1183 * difference between this and the CRT strcmp function is the handling of1184 * NULL arguments.1185 *1186 * @returns < 0 if the first string less than the second string.1187 * @returns 0 if the first string identical to the second string.1188 * @returns > 0 if the first string greater than the second string.1189 * @param psz1 First UTF-8 string. Null is allowed.1190 * @param psz2 Second UTF-8 string. Null is allowed.1191 */1192 RTDECL(int) RTStrCmp(const char *psz1, const char *psz2)1193 {1194 if (psz1 == psz2)1195 return 0;1196 if (!psz1)1197 return -1;1198 if (!psz2)1199 return 1;1200 1201 return strcmp(psz1, psz2);1202 }1203 RT_EXPORT_SYMBOL(RTStrCmp);1204 1205 1206 /**1207 * Performs a case sensitive string compare between two UTF-8 strings, given1208 * a maximum string length.1209 *1210 * Encoding errors are ignored by the current implementation. So, the only1211 * difference between this and the CRT strncmp function is the handling of1212 * NULL arguments.1213 *1214 * @returns < 0 if the first string less than the second string.1215 * @returns 0 if the first string identical to the second string.1216 * @returns > 0 if the first string greater than the second string.1217 * @param psz1 First UTF-8 string. Null is allowed.1218 * @param psz2 Second UTF-8 string. Null is allowed.1219 * @param cchMax The maximum string length1220 */1221 RTDECL(int) RTStrNCmp(const char *psz1, const char *psz2, size_t cchMax)1222 {1223 if (psz1 == psz2)1224 return 0;1225 if (!psz1)1226 return -1;1227 if (!psz2)1228 return 1;1229 1230 return strncmp(psz1, psz2, cchMax);1231 }1232 RT_EXPORT_SYMBOL(RTStrNCmp);1233 1234 1235 /**1236 * Performs a case insensitive string compare between two UTF-8 strings.1237 *1238 * This is a simplified compare, as only the simplified lower/upper case folding1239 * specified by the unicode specs are used. It does not consider character pairs1240 * as they are used in some languages, just simple upper & lower case compares.1241 *1242 * The result is the difference between the mismatching codepoints after they1243 * both have been lower cased.1244 *1245 * If the string encoding is invalid the function will assert (strict builds)1246 * and use RTStrCmp for the remainder of the string.1247 *1248 * @returns < 0 if the first string less than the second string.1249 * @returns 0 if the first string identical to the second string.1250 * @returns > 0 if the first string greater than the second string.1251 * @param psz1 First UTF-8 string. Null is allowed.1252 * @param psz2 Second UTF-8 string. Null is allowed.1253 */1254 RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)1255 {1256 if (psz1 == psz2)1257 return 0;1258 if (!psz1)1259 return -1;1260 if (!psz2)1261 return 1;1262 1263 const char *pszStart1 = psz1;1264 for (;;)1265 {1266 /* Get the codepoints */1267 RTUNICP cp1;1268 int rc = RTStrGetCpEx(&psz1, &cp1);1269 if (RT_FAILURE(rc))1270 {1271 AssertRC(rc);1272 psz1--;1273 break;1274 }1275 1276 RTUNICP cp2;1277 rc = RTStrGetCpEx(&psz2, &cp2);1278 if (RT_FAILURE(rc))1279 {1280 AssertRC(rc);1281 psz2--;1282 psz1 = RTStrPrevCp(pszStart1, psz1);1283 break;1284 }1285 1286 /* compare */1287 int iDiff = cp1 - cp2;1288 if (iDiff)1289 {1290 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);1291 if (iDiff)1292 {1293 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */1294 if (iDiff)1295 return iDiff;1296 }1297 }1298 1299 /* hit the terminator? */1300 if (!cp1)1301 return 0;1302 }1303 1304 /* Hit some bad encoding, continue in case insensitive mode. */1305 return RTStrCmp(psz1, psz2);1306 }1307 RT_EXPORT_SYMBOL(RTStrICmp);1308 1309 1310 /**1311 * Performs a case insensitive string compare between two UTF-8 strings, given a1312 * maximum string length.1313 *1314 * This is a simplified compare, as only the simplified lower/upper case folding1315 * specified by the unicode specs are used. It does not consider character pairs1316 * as they are used in some languages, just simple upper & lower case compares.1317 *1318 * The result is the difference between the mismatching codepoints after they1319 * both have been lower cased.1320 *1321 * If the string encoding is invalid the function will assert (strict builds)1322 * and use RTStrCmp for the remainder of the string.1323 *1324 * @returns < 0 if the first string less than the second string.1325 * @returns 0 if the first string identical to the second string.1326 * @returns > 0 if the first string greater than the second string.1327 * @param psz1 First UTF-8 string. Null is allowed.1328 * @param psz2 Second UTF-8 string. Null is allowed.1329 * @param cchMax Maximum string length1330 */1331 RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)1332 {1333 if (cchMax == 0)1334 return 0;1335 if (psz1 == psz2)1336 return 0;1337 if (!psz1)1338 return -1;1339 if (!psz2)1340 return 1;1341 1342 for (;;)1343 {1344 /* Get the codepoints */1345 RTUNICP cp1;1346 size_t cchMax2 = cchMax;1347 int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);1348 if (RT_FAILURE(rc))1349 {1350 AssertRC(rc);1351 psz1--;1352 cchMax++;1353 break;1354 }1355 1356 RTUNICP cp2;1357 rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);1358 if (RT_FAILURE(rc))1359 {1360 AssertRC(rc);1361 psz2--;1362 psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */1363 cchMax = cchMax2 + 1;1364 break;1365 }1366 1367 /* compare */1368 int iDiff = cp1 - cp2;1369 if (iDiff)1370 {1371 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);1372 if (iDiff)1373 {1374 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */1375 if (iDiff)1376 return iDiff;1377 }1378 }1379 1380 /* hit the terminator? */1381 if (!cp1 || cchMax == 0)1382 return 0;1383 }1384 1385 /* Hit some bad encoding, continue in case insensitive mode. */1386 return RTStrNCmp(psz1, psz2, cchMax);1387 }1388 RT_EXPORT_SYMBOL(RTStrNICmp);1389 1390 1391 RTDECL(char *) RTStrStr(const char *pszHaystack, const char *pszNeedle)1392 {1393 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */1394 if (!pszHaystack)1395 return NULL;1396 if (!pszNeedle)1397 return NULL;1398 1399 /* The rest is CRT. */1400 return (char *)strstr(pszHaystack, pszNeedle);1401 }1402 RT_EXPORT_SYMBOL(RTStrStr);1403 1404 1405 RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)1406 {1407 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */1408 if (!pszHaystack)1409 return NULL;1410 if (!pszNeedle)1411 return NULL;1412 1413 /* The empty string matches everything. */1414 if (!*pszNeedle)1415 return (char *)pszHaystack;1416 1417 /*1418 * The search strategy is to pick out the first char of the needle, fold it,1419 * and match it against the haystack code point by code point. When encountering1420 * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.1421 */1422 const char * const pszNeedleStart = pszNeedle;1423 RTUNICP Cp0;1424 RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */1425 size_t const cchNeedle = strlen(pszNeedle);1426 size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;1427 RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);1428 RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);1429 if ( Cp0Lower == Cp0Upper1430 && Cp0Lower == Cp0)1431 {1432 /* Cp0 is not a case sensitive char. */1433 for (;;)1434 {1435 RTUNICP Cp;1436 RTStrGetCpEx(&pszHaystack, &Cp);1437 if (!Cp)1438 break;1439 if ( Cp == Cp01440 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))1441 return (char *)pszHaystack - cchNeedleCp0;1442 }1443 }1444 else if ( Cp0Lower == Cp01445 || Cp0Upper != Cp0)1446 {1447 /* Cp0 is case sensitive */1448 for (;;)1449 {1450 RTUNICP Cp;1451 RTStrGetCpEx(&pszHaystack, &Cp);1452 if (!Cp)1453 break;1454 if ( ( Cp == Cp0Upper1455 || Cp == Cp0Lower)1456 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))1457 return (char *)pszHaystack - cchNeedleCp0;1458 }1459 }1460 else1461 {1462 /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */1463 for (;;)1464 {1465 RTUNICP Cp;1466 RTStrGetCpEx(&pszHaystack, &Cp);1467 if (!Cp)1468 break;1469 if ( ( Cp == Cp01470 || Cp == Cp0Upper1471 || Cp == Cp0Lower)1472 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))1473 return (char *)pszHaystack - cchNeedleCp0;1474 }1475 }1476 1477 1478 return NULL;1479 }1480 RT_EXPORT_SYMBOL(RTStrIStr);1481 1482 1483 RTDECL(char *) RTStrToLower(char *psz)1484 {1485 /*1486 * Loop the code points in the string, converting them one by one.1487 * ASSUMES that the code points for upper and lower case are encoded1488 * with the exact same length.1489 */1490 /** @todo Handled bad encodings correctly+quietly, remove assumption,1491 * optimize. */1492 char *pszCur = psz;1493 while (*pszCur)1494 {1495 RTUNICP cp = RTStrGetCp(pszCur);1496 cp = RTUniCpToLower(cp);1497 pszCur = RTStrPutCp(pszCur, cp);1498 }1499 return psz;1500 }1501 RT_EXPORT_SYMBOL(RTStrToLower);1502 1503 1504 RTDECL(char *) RTStrToUpper(char *psz)1505 {1506 /*1507 * Loop the code points in the string, converting them one by one.1508 * ASSUMES that the code points for upper and lower case are encoded1509 * with the exact same length.1510 */1511 /** @todo Handled bad encodings correctly+quietly, remove assumption,1512 * optimize. */1513 char *pszCur = psz;1514 while(*pszCur)1515 {1516 RTUNICP cp = RTStrGetCp(pszCur);1517 cp = RTUniCpToUpper(cp);1518 pszCur = RTStrPutCp(pszCur, cp);1519 }1520 return psz;1521 }1522 RT_EXPORT_SYMBOL(RTStrToUpper);1523
Note:
See TracChangeset
for help on using the changeset viewer.