Changeset 13927 in vbox for trunk/src/VBox/Runtime
- Timestamp:
- Nov 6, 2008 6:07:59 PM (16 years ago)
- Location:
- trunk/src/VBox/Runtime
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/string/utf-8.cpp
r10951 r13927 916 916 917 917 918 /** 919 * Handle invalid encodings passed to RTStrGetCpNEx(). 920 * @returns rc 921 * @param ppsz The pointer to the the string position point. 922 * @param pCp Where to store RTUNICP_INVALID. 923 * @param pcch Pointer to the string length. 924 * @param rc The iprt error code. 925 */ 926 static int rtStrGetCpNExFailure(const char **ppsz, PRTUNICP pCp, size_t *pcch, int rc) 927 { 928 /* 929 * Try find a valid encoding. 930 */ 931 (*ppsz)++; /** @todo code this! */ 932 (*pcch)--; 933 *pCp = RTUNICP_INVALID; 934 return rc; 935 } 936 937 938 RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, PRTUNICP pCp, size_t *pcch) 939 { 940 const unsigned char *puch = (const unsigned char *)*ppsz; 941 const unsigned char uch = *puch; 942 RTUNICP uc; 943 944 if (*pcch == 0) 945 { 946 *pCp = RTUNICP_INVALID; 947 return VERR_INVALID_UTF8_ENCODING; 948 } 949 /* ASCII ? */ 950 if (!(uch & RT_BIT(7))) 951 { 952 uc = uch; 953 puch++; 954 } 955 else if (uch & RT_BIT(6)) 956 { 957 /* figure the length and validate the first octet. */ 958 unsigned cb; 959 if (!(uch & RT_BIT(5))) 960 cb = 2; 961 else if (!(uch & RT_BIT(4))) 962 cb = 3; 963 else if (!(uch & RT_BIT(3))) 964 cb = 4; 965 else if (!(uch & RT_BIT(2))) 966 cb = 5; 967 else if (!(uch & RT_BIT(1))) 968 cb = 6; 969 else 970 { 971 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch)); 972 return rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING); 973 } 974 975 if (cb > *pcch) 976 return rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING); 977 978 /* validate the rest */ 979 switch (cb) 980 { 981 case 6: 982 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch), 983 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING)); 984 case 5: 985 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch), 986 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING)); 987 case 4: 988 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch), 989 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING)); 990 case 3: 991 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch), 992 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING)); 993 case 2: 994 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch), 995 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING)); 996 break; 997 } 998 999 /* get and validate the code point. */ 1000 switch (cb) 1001 { 1002 case 6: 1003 uc = (puch[5] & 0x3f) 1004 | ((RTUNICP)(puch[4] & 0x3f) << 6) 1005 | ((RTUNICP)(puch[3] & 0x3f) << 12) 1006 | ((RTUNICP)(puch[2] & 0x3f) << 18) 1007 | ((RTUNICP)(puch[1] & 0x3f) << 24) 1008 | ((RTUNICP)(uch & 0x01) << 30); 1009 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff, 1010 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch), 1011 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING)); 1012 break; 1013 case 5: 1014 uc = (puch[4] & 0x3f) 1015 | ((RTUNICP)(puch[3] & 0x3f) << 6) 1016 | ((RTUNICP)(puch[2] & 0x3f) << 12) 1017 | ((RTUNICP)(puch[1] & 0x3f) << 18) 1018 | ((RTUNICP)(uch & 0x03) << 24); 1019 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff, 1020 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch), 1021 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING)); 1022 break; 1023 case 4: 1024 uc = (puch[3] & 0x3f) 1025 | ((RTUNICP)(puch[2] & 0x3f) << 6) 1026 | ((RTUNICP)(puch[1] & 0x3f) << 12) 1027 | ((RTUNICP)(uch & 0x07) << 18); 1028 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff, 1029 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch), 1030 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING)); 1031 break; 1032 case 3: 1033 uc = (puch[2] & 0x3f) 1034 | ((RTUNICP)(puch[1] & 0x3f) << 6) 1035 | ((RTUNICP)(uch & 0x0f) << 12); 1036 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd, 1037 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch), 1038 rtStrGetCpNExFailure(ppsz, pCp, pcch, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING)); 1039 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff, 1040 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch), 1041 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_CODE_POINT_SURROGATE)); 1042 break; 1043 case 2: 1044 uc = (puch[1] & 0x3f) 1045 | ((RTUNICP)(uch & 0x1f) << 6); 1046 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff, 1047 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch), 1048 rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING)); 1049 break; 1050 default: /* impossible, but GCC is bitching. */ 1051 uc = RTUNICP_INVALID; 1052 break; 1053 } 1054 puch += cb; 1055 (*pcch) -= cb; 1056 } 1057 else 1058 { 1059 /* 6th bit is always set. */ 1060 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch)); 1061 return rtStrGetCpNExFailure(ppsz, pCp, pcch, VERR_INVALID_UTF8_ENCODING); 1062 } 1063 *pCp = uc; 1064 *ppsz = (const char *)puch; 1065 return VINF_SUCCESS; 1066 } 1067 1068 918 1069 RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc) 919 1070 { … … 1031 1182 1032 1183 return strcmp(psz1, psz2); 1184 } 1185 1186 1187 /** 1188 * Performs a case sensitive string compare between two UTF-8 strings, given 1189 * a maximum string length. 1190 * 1191 * Encoding errors are ignored by the current implementation. So, the only 1192 * difference between this and the CRT strncmp function is the handling of 1193 * NULL arguments. 1194 * 1195 * @returns < 0 if the first string less than the second string. 1196 * @returns 0 if the first string identical to the second string. 1197 * @returns > 0 if the first string greater than the second string. 1198 * @param psz1 First UTF-8 string. Null is allowed. 1199 * @param psz2 Second UTF-8 string. Null is allowed. 1200 * @param cchMax The maximum string length 1201 */ 1202 RTDECL(int) RTStrNCmp(const char *psz1, const char *psz2, size_t cchMax) 1203 { 1204 if (psz1 == psz2) 1205 return 0; 1206 if (!psz1) 1207 return -1; 1208 if (!psz2) 1209 return 1; 1210 1211 return strncmp(psz1, psz2, cchMax); 1033 1212 } 1034 1213 … … 1114 1293 #endif 1115 1294 } 1295 1296 1297 /** 1298 * Performs a case insensitive string compare between two UTF-8 strings, given a 1299 * maximum string length. 1300 * 1301 * This is a simplified compare, as only the simplified lower/upper case folding 1302 * specified by the unicode specs are used. It does not consider character pairs 1303 * as they are used in some languages, just simple upper & lower case compares. 1304 * 1305 * The result is the difference between the mismatching codepoints after they 1306 * both have been lower cased. 1307 * 1308 * If the string encoding is invalid the function will assert (strict builds) 1309 * and use RTStrCmp for the remainder of the string. 1310 * 1311 * @returns < 0 if the first string less than the second string. 1312 * @returns 0 if the first string identical to the second string. 1313 * @returns > 0 if the first string greater than the second string. 1314 * @param psz1 First UTF-8 string. Null is allowed. 1315 * @param psz2 Second UTF-8 string. Null is allowed. 1316 * @param cchMax Maximum string length 1317 */ 1318 RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax) 1319 { 1320 if (psz1 == psz2) 1321 return 0; 1322 if (!psz1) 1323 return -1; 1324 if (!psz2) 1325 return 1; 1326 if (cchMax == 0) 1327 return 0; 1328 1329 #if 1 /* new */ 1330 const char *pszStart1 = psz1; 1331 for (;;) 1332 { 1333 /* Get the codepoints */ 1334 RTUNICP cp1; 1335 size_t cchMax2 = cchMax; 1336 int rc = RTStrGetCpNEx(&psz1, &cp1, &cchMax); 1337 if (RT_FAILURE(rc)) 1338 { 1339 AssertRC(rc); 1340 psz1--; 1341 cchMax++; 1342 break; 1343 } 1344 1345 RTUNICP cp2; 1346 rc = RTStrGetCpNEx(&psz2, &cp2, &cchMax2); 1347 if (RT_FAILURE(rc)) 1348 { 1349 AssertRC(rc); 1350 psz2--; 1351 psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */ 1352 cchMax = cchMax2 + 1; 1353 break; 1354 } 1355 1356 /* compare */ 1357 int iDiff = cp1 - cp2; 1358 if (iDiff) 1359 { 1360 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2); 1361 if (iDiff) 1362 { 1363 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */ 1364 if (iDiff) 1365 return iDiff; 1366 } 1367 } 1368 1369 /* hit the terminator? */ 1370 if (!cp1 || cchMax == 0) 1371 return 0; 1372 } 1373 1374 /* Hit some bad encoding, continue in case insensitive mode. */ 1375 return RTStrNCmp(psz1, psz2, cchMax); 1376 #else /* old */ 1377 #ifdef RT_OS_WINDOWS 1378 return strnicmp(psz1, psz2, cchMax); 1379 #else /* !RT_OS_WINDOWS */ 1380 return strncasecmp(psz1, psz2, cchMax); 1381 #endif /* !RT_OS_WINDOWS */ 1382 #endif 1383 } 1384 -
trunk/src/VBox/Runtime/testcase/tstUtf8.cpp
r13837 r13927 832 832 833 833 834 CHECK_DIFF(RTStrNCmp(NULL, NULL, RTSTR_MAX), == ); 835 CHECK_DIFF(RTStrNCmp(NULL, "", RTSTR_MAX), < ); 836 CHECK_DIFF(RTStrNCmp("", NULL, RTSTR_MAX), > ); 837 CHECK_DIFF(RTStrNCmp("", "", RTSTR_MAX), == ); 838 CHECK_DIFF(RTStrNCmp("abcdef", "abcdef", RTSTR_MAX), == ); 839 CHECK_DIFF(RTStrNCmp("abcdef", "abcde", RTSTR_MAX), > ); 840 CHECK_DIFF(RTStrNCmp("abcde", "abcdef", RTSTR_MAX), < ); 841 CHECK_DIFF(RTStrNCmp("abcdeg", "abcdef", RTSTR_MAX), > ); 842 CHECK_DIFF(RTStrNCmp("abcdef", "abcdeg", RTSTR_MAX), < ); 843 CHECK_DIFF(RTStrNCmp("abcdeF", "abcdef", RTSTR_MAX), < ); 844 CHECK_DIFF(RTStrNCmp("abcdef", "abcdeF", RTSTR_MAX), > ); 845 846 CHECK_DIFF(RTStrNCmp("abcdef", "fedcba", 0), ==); 847 CHECK_DIFF(RTStrNCmp("abcdef", "abcdeF", 5), ==); 848 CHECK_DIFF(RTStrNCmp("abcdef", "abcdeF", 6), > ); 849 850 834 851 CHECK_DIFF(RTStrICmp(NULL, NULL), == ); 835 852 CHECK_DIFF(RTStrICmp(NULL, ""), < ); … … 849 866 CHECK_DIFF(RTStrICmp("AbCdEg", "aBcDeF"), > ); 850 867 CHECK_DIFF(RTStrICmp("AbCdEG", "aBcDef"), > ); /* diff performed on the lower case cp. */ 868 869 870 871 CHECK_DIFF(RTStrNICmp(NULL, NULL, RTSTR_MAX), == ); 872 CHECK_DIFF(RTStrNICmp(NULL, "", RTSTR_MAX), < ); 873 CHECK_DIFF(RTStrNICmp("", NULL, RTSTR_MAX), > ); 874 CHECK_DIFF(RTStrNICmp("", "", RTSTR_MAX), == ); 875 CHECK_DIFF(RTStrNICmp("abcdef", "abcdef", RTSTR_MAX), == ); 876 CHECK_DIFF(RTStrNICmp("abcdef", "abcde", RTSTR_MAX), > ); 877 CHECK_DIFF(RTStrNICmp("abcde", "abcdef", RTSTR_MAX), < ); 878 CHECK_DIFF(RTStrNICmp("abcdeg", "abcdef", RTSTR_MAX), > ); 879 CHECK_DIFF(RTStrNICmp("abcdef", "abcdeg", RTSTR_MAX), < ); 880 881 CHECK_DIFF(RTStrNICmp("abcdeF", "abcdef", RTSTR_MAX), == ); 882 CHECK_DIFF(RTStrNICmp("abcdef", "abcdeF", RTSTR_MAX), ==); 883 CHECK_DIFF(RTStrNICmp("ABCDEF", "abcdef", RTSTR_MAX), ==); 884 CHECK_DIFF(RTStrNICmp("abcdef", "ABCDEF", RTSTR_MAX), ==); 885 CHECK_DIFF(RTStrNICmp("AbCdEf", "aBcDeF", RTSTR_MAX), ==); 886 CHECK_DIFF(RTStrNICmp("AbCdEg", "aBcDeF", RTSTR_MAX), > ); 887 CHECK_DIFF(RTStrNICmp("AbCdEG", "aBcDef", RTSTR_MAX), > ); /* diff performed on the lower case cp. */ 888 889 CHECK_DIFF(RTStrNICmp("ABCDEF", "fedcba", 0), ==); 890 CHECK_DIFF(RTStrNICmp("AbCdEg", "aBcDeF", 5), ==); 891 CHECK_DIFF(RTStrNICmp("AbCdEg", "aBcDeF", 6), > ); 892 CHECK_DIFF(RTStrNICmp("AbCdEG", "aBcDef", 6), > ); /* diff performed on the lower case cp. */ 893 /* We should continue using byte comparison when we hit the invalid CP. Will assert in debug builds. */ 894 // CHECK_DIFF(RTStrNICmp("AbCd\xff""eg", "aBcD\xff""eF", 6), ==); 851 895 } 852 896
Note:
See TracChangeset
for help on using the changeset viewer.