Changeset 31246 in vbox
- Timestamp:
- Jul 30, 2010 1:24:53 PM (14 years ago)
- Location:
- trunk
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/include/iprt/string.h
r31221 r31246 29 29 #include <iprt/cdefs.h> 30 30 #include <iprt/types.h> 31 #include <iprt/assert.h> 31 32 #include <iprt/stdarg.h> 33 #include <iprt/uni.h> /* for RTUNICP_INVALID */ 32 34 #include <iprt/err.h> /* for VINF_SUCCESS */ 33 35 #if defined(RT_OS_LINUX) && defined(__KERNEL__) … … 1157 1159 1158 1160 /** 1161 * Get the UTF-8 size in characters of a given Unicode code point. The code 1162 * point is expected to be a valid Unicode one, but not necessarily in the 1163 * range supported by UTF-8. 1164 * 1165 * @returns the size in characters, or zero if there is no UTF-8 encoding 1166 */ 1167 DECLINLINE(size_t) RTStrCpSize(RTUNICP CodePoint) 1168 { 1169 if (CodePoint < 0x80) 1170 return 1; 1171 if (CodePoint < 0x800) 1172 return 2; 1173 if (CodePoint < 0x10000) 1174 return 3; 1175 if (CodePoint < 0x11000) 1176 return 4; 1177 return 0; 1178 } 1179 1180 /** 1159 1181 * Put the unicode code point at the given string position 1160 1182 * and return the pointer to the char following it. … … 1207 1229 */ 1208 1230 RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz); 1231 1232 /** 1233 * Get the unicode code point at the given string position. 1234 * 1235 * @returns unicode code point. 1236 * @returns RTUNICP_INVALID if the encoding is invalid. 1237 * @param psz The string. 1238 */ 1239 DECLINLINE(RTUNICP) RTLatin1GetCp(const char *psz) 1240 { 1241 return *(const unsigned char *)psz; 1242 } 1243 1244 /** 1245 * Get the unicode code point at the given string position. 1246 * 1247 * @returns iprt status code. 1248 * @param ppsz Pointer to the string pointer. This will be updated to 1249 * point to the char following the current code point. 1250 * This is advanced one character forward on failure. 1251 * @param pCp Where to store the code point. 1252 * RTUNICP_INVALID is stored here on failure. 1253 * 1254 * @remark We optimize this operation by using an inline function for 1255 * the most frequent and simplest sequence, the rest is 1256 * handled by RTStrGetCpExInternal(). 1257 */ 1258 DECLINLINE(int) RTLatin1GetCpEx(const char **ppsz, PRTUNICP pCp) 1259 { 1260 const unsigned char uch = **(const unsigned char **)ppsz; 1261 (*ppsz)++; 1262 *pCp = uch; 1263 return VINF_SUCCESS; 1264 } 1265 1266 /** 1267 * Get the unicode code point at the given string position for a string of a 1268 * given maximum length. 1269 * 1270 * @returns iprt status code. 1271 * @retval VERR_END_OF_STRING if *pcch is 0. *pCp is set to RTUNICP_INVALID. 1272 * 1273 * @param ppsz Pointer to the string pointer. This will be updated to 1274 * point to the char following the current code point. 1275 * @param pcch Pointer to the maximum string length. This will be 1276 * decremented by the size of the code point found. 1277 * @param pCp Where to store the code point. 1278 * RTUNICP_INVALID is stored here on failure. 1279 */ 1280 DECLINLINE(int) RTLatin1GetCpNEx(const char **ppsz, size_t *pcch, PRTUNICP pCp) 1281 { 1282 if (RT_LIKELY(*pcch != 0)) 1283 { 1284 const unsigned char uch = **(const unsigned char **)ppsz; 1285 (*ppsz)++; 1286 (*pcch)--; 1287 *pCp = uch; 1288 return VINF_SUCCESS; 1289 } 1290 *pCp = RTUNICP_INVALID; 1291 return VERR_END_OF_STRING; 1292 } 1293 1294 /** 1295 * Get the Latin-1 size in characters of a given Unicode code point. The code 1296 * point is expected to be a valid Unicode one, but not necessarily in the 1297 * range supported by Latin-1. 1298 * 1299 * @returns the size in characters, or zero if there is no Latin-1 encoding 1300 */ 1301 DECLINLINE(size_t) RTLatin1CpSize(RTUNICP CodePoint) 1302 { 1303 if (CodePoint < 0x100) 1304 return 1; 1305 return 0; 1306 } 1307 1308 /** 1309 * Put the unicode code point at the given string position 1310 * and return the pointer to the char following it. 1311 * 1312 * This function will not consider anything at or following the 1313 * buffer area pointed to by psz. It is therefore not suitable for 1314 * inserting code points into a string, only appending/overwriting. 1315 * 1316 * @returns pointer to the char following the written code point. 1317 * @param psz The string. 1318 * @param CodePoint The code point to write. 1319 * This should not be RTUNICP_INVALID or any other 1320 * character out of the Latin-1 range. 1321 */ 1322 DECLINLINE(char *) RTLatin1PutCp(char *psz, RTUNICP CodePoint) 1323 { 1324 AssertReturn(CodePoint < 0x100, NULL); 1325 *psz++ = (unsigned char)CodePoint; 1326 return psz; 1327 } 1328 1329 /** 1330 * Skips ahead, past the current code point. 1331 * 1332 * @returns Pointer to the char after the current code point. 1333 * @param psz Pointer to the current code point. 1334 * @remark This will not move the next valid code point, only past the current one. 1335 */ 1336 DECLINLINE(char *) RTLatin1NextCp(const char *psz) 1337 { 1338 psz++; 1339 return (char *)psz; 1340 } 1341 1342 /** 1343 * Skips back to the previous code point. 1344 * 1345 * @returns Pointer to the char before the current code point. 1346 * @returns pszStart on failure. 1347 * @param pszStart Pointer to the start of the string. 1348 * @param psz Pointer to the current code point. 1349 */ 1350 DECLINLINE(char *) RTLatin1PrevCp(const char *psz) 1351 { 1352 psz--; 1353 return (char *)psz; 1354 } 1209 1355 1210 1356 -
trunk/src/VBox/Runtime/common/string/utf-8.cpp
r31229 r31246 805 805 { 806 806 size_t cch = 0; 807 while (cchIn > 0) 808 { 809 char ch = *psz++; cchIn--; 810 if (!ch) 807 while (true) 808 { 809 RTUNICP Cp; 810 size_t cchCp; 811 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp); 812 if (Cp == 0 || rc == VERR_END_OF_STRING) 811 813 break; 812 if (!(ch & 0x80)) 813 cch++; 814 else 815 cch += 2; 816 } 817 814 if (RT_FAILURE(rc)) 815 return rc; 816 cchCp = RTStrCpSize(Cp); 817 if (cchCp == 0) 818 return VERR_NO_TRANSLATION; 819 cch += cchCp; 820 } 818 821 819 822 /* done */ … … 832 835 * @param psz Where to store the UTF-8 string. 833 836 * @param cch The size of the UTF-8 buffer, excluding the terminator. 834 * @param pcch Where to store the number of octets actually encoded.835 837 */ 836 static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch, size_t *pcch) 837 { 838 unsigned char *puch = (unsigned char *)psz; 839 int rc = VINF_SUCCESS; 840 while (cchIn > 0) 841 { 842 unsigned char ch = (unsigned char) *pszIn++; cchIn--; 843 if (!ch) 838 static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch) 839 { 840 int rc = VINF_SUCCESS; 841 842 while (true) 843 { 844 RTUNICP Cp; 845 size_t cchCp; 846 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp); 847 if (Cp == 0 || RT_FAILURE(rc)) 844 848 break; 845 if (!(ch & 0x80)) 846 { 847 if (RT_UNLIKELY(cch < 1)) 848 { 849 RTStrAssertMsgFailed(("Buffer overflow! 1\n")); 850 rc = VERR_BUFFER_OVERFLOW; 851 break; 852 } 853 cch--; 854 *puch++ = (unsigned char)ch; 855 } 856 else 857 { 858 if (RT_UNLIKELY(cch < 2)) 859 { 860 RTStrAssertMsgFailed(("Buffer overflow! 2\n")); 861 rc = VERR_BUFFER_OVERFLOW; 862 break; 863 } 864 cch -= 2; 865 *puch++ = 0xc0 | (ch >> 6); 866 *puch++ = 0x80 | (ch & 0x3f); 867 } 849 cchCp = RTStrCpSize(Cp); 850 if (RT_UNLIKELY(cch < cchCp)) 851 { 852 RTStrAssertMsgFailed(("Buffer overflow! 1\n")); 853 rc = VERR_BUFFER_OVERFLOW; 854 break; 855 } 856 psz = RTStrPutCp(psz, Cp); 857 cch -= cchCp; 868 858 } 869 859 870 860 /* done */ 871 *puch = '\0'; 872 *pcch = (char *)puch - psz; 861 if (rc == VERR_END_OF_STRING) 862 rc = VINF_SUCCESS; 863 *psz = '\0'; 873 864 return rc; 874 865 } … … 898 889 if (pszResult) 899 890 { 900 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch , &cch);891 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch); 901 892 if (RT_SUCCESS(rc)) 902 893 { … … 955 946 if (pszResult) 956 947 { 957 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1 , &cch);948 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1); 958 949 if (RT_SUCCESS(rc)) 959 950 { … … 998 989 * @returns IPRT status code. 999 990 * @param psz Pointer to the UTF-8 string. 1000 * @param cch 991 * @param cchIn The max length of the string. (btw cch = cb) 1001 992 * Use RTSTR_MAX if all of the string is to be examined. 1002 993 * @param pcch Where to store the length of the Latin-1 string in bytes. 1003 994 */ 1004 static int rtUtf8CalcLatin1Length(const char *psz, size_t cch , size_t *pcch)1005 { 1006 size_t cchOut= 0;995 static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch) 996 { 997 size_t cch = 0; 1007 998 while (true) 1008 999 { 1009 1000 RTUNICP Cp; 1010 int rc = RTStrGetCpNEx(&psz, &cch, &Cp); 1001 size_t cchCp; 1002 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp); 1011 1003 if (Cp == 0 || rc == VERR_END_OF_STRING) 1012 1004 break; 1013 1005 if (RT_FAILURE(rc)) 1014 1006 return rc; 1015 if (Cp >= 0x100) 1007 cchCp = RTLatin1CpSize(Cp); 1008 if (cchCp == 0) 1016 1009 return VERR_NO_TRANSLATION; 1017 cch Out++;1010 cch += cchCp; 1018 1011 } 1019 1012 1020 1013 /* done */ 1021 *pcch = cch Out;1014 *pcch = cch; 1022 1015 return VINF_SUCCESS; 1023 1016 } … … 1030 1023 * 1031 1024 * @returns iprt status code. 1032 * @param psz 1033 * @param cch 1025 * @param pszIn The UTF-8 string to recode. This is a valid encoding. 1026 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string. 1034 1027 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'. 1035 * @param psz OutWhere to store the Latin-1 string.1036 * @param cch OutThe number of characters the pszOut buffer can hold, excluding the terminator ('\\0').1028 * @param psz Where to store the Latin-1 string. 1029 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0'). 1037 1030 */ 1038 static int rtUtf8RecodeAsLatin1(const char *psz , size_t cch, char *pszOut, size_t cchOut)1039 { 1040 int rc= VINF_SUCCESS;1041 const unsigned char *puch = (const unsigned char *)psz; 1042 unsigned char *puchOut = (unsigned char *)pszOut;1043 while (cch > 0)1044 {1045 /* read the next char and check for terminator. */1046 const unsigned char uch = *puch;1047 if ( !uch)1031 static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch) 1032 { 1033 int rc = VINF_SUCCESS; 1034 1035 while (true) 1036 { 1037 RTUNICP Cp; 1038 size_t cchCp; 1039 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp); 1040 if (Cp == 0 || RT_FAILURE(rc)) 1048 1041 break; 1049 1050 /* check for output overflow */1051 if (RT_UNLIKELY(cchOut < 1))1052 {1042 cchCp = RTLatin1CpSize(Cp); 1043 if (RT_UNLIKELY(cch < cchCp)) 1044 { 1045 RTStrAssertMsgFailed(("Buffer overflow! 1\n")); 1053 1046 rc = VERR_BUFFER_OVERFLOW; 1054 1047 break; 1055 1048 } 1056 cchOut--; 1057 1058 /* decode and recode the code point */ 1059 if (!(uch & RT_BIT(7))) 1060 { 1061 *puchOut++ = uch; 1062 puch++; 1063 cch--; 1064 } 1065 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6))) 1066 { 1067 uint16_t uc = (puch[1] & 0x3f) 1068 | ((uint16_t)(uch & 0x1f) << 6); 1069 if (uc >= 0x100) 1070 { 1071 rc = VERR_NO_TRANSLATION; 1072 break; 1073 } 1074 *puchOut++ = uc; 1075 puch += 2; 1076 cch -= 2; 1077 } 1078 else 1079 { 1080 rc = VERR_NO_TRANSLATION; 1081 break; 1082 } 1049 psz = RTLatin1PutCp(psz, Cp); 1050 cch -= cchCp; 1083 1051 } 1084 1052 1085 1053 /* done */ 1086 *puchOut = '\0'; 1087 return rc; 1088 } 1054 if (rc == VERR_END_OF_STRING) 1055 rc = VINF_SUCCESS; 1056 *psz = '\0'; 1057 return rc; 1058 } 1059 1089 1060 1090 1061
Note:
See TracChangeset
for help on using the changeset viewer.