- Timestamp:
- Jul 29, 2010 4:03:53 PM (15 years ago)
- svn:sync-xref-src-repo-rev:
- 64229
- Location:
- trunk
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/include/iprt/string.h
r31199 r31221 829 829 * @param psz The string. 830 830 */ 831 #define RTStrCalcLatin1Len(psz) RTStrUniLen(psz) 831 RTDECL(size_t) RTStrCalcLatin1Len(const char *psz); 832 832 833 833 /** … … 843 843 * This is undefined on failure. 844 844 */ 845 #define RTStrCalcLatin1LenEx(psz, cch, pcch) RTStrUniLenEx(psz, cch, pcch) 845 RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cch, size_t *pcwc); 846 846 847 847 /** -
trunk/src/VBox/Runtime/common/string/utf-8.cpp
r31213 r31221 840 840 while (cchIn > 0) 841 841 { 842 char ch = *pszIn++; cchIn--; 842 unsigned char ch = (unsigned char) *pszIn++; cchIn--; 843 843 if (!ch) 844 844 break; … … 1000 1000 * @param cch The max length of the string. (btw cch = cb) 1001 1001 * Use RTSTR_MAX if all of the string is to be examined. 1002 * @param pcwc Where to store the length of the Latin-1 string in bytes. 1002 * @param pcch Where to store the length of the Latin-1 string in bytes. 1003 1003 */ 1004 1004 static int rtUtf8CalcLatin1Length(const char *psz, size_t cch, size_t *pcch) 1005 1005 { 1006 /* We re-encode to one byte per unicode code point. */ 1007 return RTStrUniLenEx(psz, cch, pcch); 1006 const unsigned char *puch = (const unsigned char *)psz; 1007 size_t cchOut = 0; 1008 while (cch > 0) 1009 { 1010 const unsigned char uch = *puch; 1011 if (!uch) 1012 break; 1013 if (!(uch & RT_BIT(7))) 1014 { 1015 /* one ASCII byte */ 1016 cchOut++; 1017 puch++; 1018 cch--; 1019 } 1020 else 1021 { 1022 /* figure sequence length and validate the first byte */ 1023 unsigned cb; 1024 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6))) 1025 cb = 2; 1026 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) 1027 cb = 3; 1028 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) 1029 cb = 4; 1030 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) 1031 cb = 5; 1032 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) 1033 cb = 6; 1034 else 1035 { 1036 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch)); 1037 return VERR_INVALID_UTF8_ENCODING; 1038 } 1039 1040 /* check length */ 1041 if (cb > cch) 1042 
{ 1043 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch)); 1044 return VERR_INVALID_UTF8_ENCODING; 1045 } 1046 1047 /* validate the rest */ 1048 switch (cb) 1049 { 1050 case 6: 1051 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1052 case 5: 1053 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1054 case 4: 1055 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1056 case 3: 1057 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1058 case 2: 1059 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1060 break; 1061 } 1062 1063 /* validate the code point. 
*/ 1064 RTUNICP uc; 1065 switch (cb) 1066 { 1067 case 6: 1068 uc = (puch[5] & 0x3f) 1069 | ((RTUNICP)(puch[4] & 0x3f) << 6) 1070 | ((RTUNICP)(puch[3] & 0x3f) << 12) 1071 | ((RTUNICP)(puch[2] & 0x3f) << 18) 1072 | ((RTUNICP)(puch[1] & 0x3f) << 24) 1073 | ((RTUNICP)(uch & 0x01) << 30); 1074 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff, 1075 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1076 break; 1077 case 5: 1078 uc = (puch[4] & 0x3f) 1079 | ((RTUNICP)(puch[3] & 0x3f) << 6) 1080 | ((RTUNICP)(puch[2] & 0x3f) << 12) 1081 | ((RTUNICP)(puch[1] & 0x3f) << 18) 1082 | ((RTUNICP)(uch & 0x03) << 24); 1083 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff, 1084 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1085 break; 1086 case 4: 1087 uc = (puch[3] & 0x3f) 1088 | ((RTUNICP)(puch[2] & 0x3f) << 6) 1089 | ((RTUNICP)(puch[1] & 0x3f) << 12) 1090 | ((RTUNICP)(uch & 0x07) << 18); 1091 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff, 1092 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1093 break; 1094 case 3: 1095 uc = (puch[2] & 0x3f) 1096 | ((RTUNICP)(puch[1] & 0x3f) << 6) 1097 | ((RTUNICP)(uch & 0x0f) << 12); 1098 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd, 1099 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), 1100 uc == 0xffff || uc == 0xfffe ? 
VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING); 1101 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff, 1102 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE); 1103 break; 1104 case 2: 1105 uc = (puch[1] & 0x3f) 1106 | ((RTUNICP)(uch & 0x1f) << 6); 1107 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff, 1108 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1109 break; 1110 } 1111 1112 /* does this code point have a Latin-1 translation? */ 1113 if (cb > 2 || uch > 0xC3) 1114 return VERR_NO_TRANSLATION; 1115 1116 /* advance */ 1117 cch -= cb; 1118 puch += cb; 1119 cchOut++; 1120 } 1121 } 1122 1123 /* done */ 1124 *pcch = cchOut; 1125 return VINF_SUCCESS; 1008 1126 } 1009 1127 … … 1174 1292 } 1175 1293 RT_EXPORT_SYMBOL(RTStrToLatin1Tag); 1294 1295 1296 RTDECL(size_t) RTStrCalcLatin1Len(const char *psz) 1297 { 1298 size_t cch; 1299 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch); 1300 return RT_SUCCESS(rc) ? cch : 0; 1301 } 1302 RT_EXPORT_SYMBOL(RTStrCalcLatin1Len); 1303 1304 1305 RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch) 1306 { 1307 size_t cch; 1308 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch); 1309 if (pcch) 1310 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0; 1311 return rc; 1312 } 1313 RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx); 1176 1314 1177 1315
Note:
See TracChangeset
for help on using the changeset viewer.