Changeset 31229 in vbox
- Timestamp: Jul 29, 2010 9:29:35 PM (15 years ago)
- svn:sync-xref-src-repo-rev: 64240
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/string/utf-8.cpp
r31221 r31229 1004 1004 static int rtUtf8CalcLatin1Length(const char *psz, size_t cch, size_t *pcch) 1005 1005 { 1006 const unsigned char *puch = (const unsigned char *)psz;1007 1006 size_t cchOut = 0; 1008 while (cch > 0) 1009 { 1010 const unsigned char uch = *puch; 1011 if (!uch) 1007 while (true) 1008 { 1009 RTUNICP Cp; 1010 int rc = RTStrGetCpNEx(&psz, &cch, &Cp); 1011 if (Cp == 0 || rc == VERR_END_OF_STRING) 1012 1012 break; 1013 if (!(uch & RT_BIT(7))) 1014 { 1015 /* one ASCII byte */ 1016 cchOut++; 1017 puch++; 1018 cch--; 1019 } 1020 else 1021 { 1022 /* figure sequence length and validate the first byte */ 1023 unsigned cb; 1024 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6))) 1025 cb = 2; 1026 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) 1027 cb = 3; 1028 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) 1029 cb = 4; 1030 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) 1031 cb = 5; 1032 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) 1033 cb = 6; 1034 else 1035 { 1036 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch)); 1037 return VERR_INVALID_UTF8_ENCODING; 1038 } 1039 1040 /* check length */ 1041 if (cb > cch) 1042 { 1043 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch)); 1044 return VERR_INVALID_UTF8_ENCODING; 1045 } 1046 1047 /* validate the rest */ 1048 switch (cb) 1049 { 1050 case 6: 1051 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1052 case 5: 1053 
RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1054 case 4: 1055 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1056 case 3: 1057 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1058 case 2: 1059 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1060 break; 1061 } 1062 1063 /* validate the code point. */ 1064 RTUNICP uc; 1065 switch (cb) 1066 { 1067 case 6: 1068 uc = (puch[5] & 0x3f) 1069 | ((RTUNICP)(puch[4] & 0x3f) << 6) 1070 | ((RTUNICP)(puch[3] & 0x3f) << 12) 1071 | ((RTUNICP)(puch[2] & 0x3f) << 18) 1072 | ((RTUNICP)(puch[1] & 0x3f) << 24) 1073 | ((RTUNICP)(uch & 0x01) << 30); 1074 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff, 1075 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1076 break; 1077 case 5: 1078 uc = (puch[4] & 0x3f) 1079 | ((RTUNICP)(puch[3] & 0x3f) << 6) 1080 | ((RTUNICP)(puch[2] & 0x3f) << 12) 1081 | ((RTUNICP)(puch[1] & 0x3f) << 18) 1082 | ((RTUNICP)(uch & 0x03) << 24); 1083 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff, 1084 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1085 break; 1086 case 4: 1087 uc = (puch[3] & 0x3f) 1088 | ((RTUNICP)(puch[2] & 0x3f) << 6) 1089 | ((RTUNICP)(puch[1] & 0x3f) << 12) 1090 | ((RTUNICP)(uch & 0x07) << 18); 1091 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff, 1092 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1093 break; 1094 case 3: 1095 uc = (puch[2] & 0x3f) 1096 | ((RTUNICP)(puch[1] & 0x3f) << 6) 1097 | 
((RTUNICP)(uch & 0x0f) << 12); 1098 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd, 1099 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), 1100 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING); 1101 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff, 1102 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE); 1103 break; 1104 case 2: 1105 uc = (puch[1] & 0x3f) 1106 | ((RTUNICP)(uch & 0x1f) << 6); 1107 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff, 1108 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING); 1109 break; 1110 } 1111 1112 /* does this code point have a Latin-1 translation? */ 1113 if (cb > 2 || uch > 0xC3) 1114 return VERR_NO_TRANSLATION; 1115 1116 /* advance */ 1117 cch -= cb; 1118 puch += cb; 1119 cchOut++; 1120 } 1013 if (RT_FAILURE(rc)) 1014 return rc; 1015 if (Cp >= 0x100) 1016 return VERR_NO_TRANSLATION; 1017 cchOut++; 1121 1018 } 1122 1019
Note: See TracChangeset for help on using the changeset viewer.