Changeset 31199 in vbox for trunk/src/VBox/Runtime/common/string
- Timestamp:
- Jul 29, 2010 10:54:16 AM (14 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/string/utf-8.cpp
r31157 r31199 792 792 } 793 793 RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx); 794 795 796 /** 797 * Calculates the length of the UTF-8 encoding of a Latin-1 string. 798 * 799 * @returns iprt status code. 800 * @param psz The Latin-1 string. 801 * @param cchIn The max length of the Latin-1 string to consider. 802 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw) 803 */ 804 static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch) 805 { 806 size_t cch = 0; 807 while (cchIn > 0) 808 { 809 char ch = *psz++; cchIn--; 810 if (!ch) 811 break; 812 if (!(ch & 0x80)) 813 cch++; 814 else 815 cch += 2; 816 } 817 818 819 /* done */ 820 *pcch = cch; 821 return VINF_SUCCESS; 822 } 823 824 825 /** 826 * Recodes a Latin-1 string as UTF-8. 827 * 828 * @returns iprt status code. 829 * @param psz The Latin-1 string. 830 * @param cchIn The number of characters to process from psz. The recoding 831 * will stop when cch or '\\0' is reached. 832 * @param psz Where to store the UTF-8 string. 833 * @param cch The size of the UTF-8 buffer, excluding the terminator. 834 * @param pcch Where to store the number of octets actually encoded. 835 */ 836 static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch, size_t *pcch) 837 { 838 unsigned char *puch = (unsigned char *)psz; 839 int rc = VINF_SUCCESS; 840 while (cchIn > 0) 841 { 842 char ch = *pszIn++; cchIn--; 843 if (!ch) 844 break; 845 if (!(ch & 0x80)) 846 { 847 if (RT_UNLIKELY(cch < 1)) 848 { 849 RTStrAssertMsgFailed(("Buffer overflow! 1\n")); 850 rc = VERR_BUFFER_OVERFLOW; 851 break; 852 } 853 cch--; 854 *puch++ = (unsigned char)ch; 855 } 856 else 857 { 858 if (RT_UNLIKELY(cch < 2)) 859 { 860 RTStrAssertMsgFailed(("Buffer overflow! 2\n")); 861 rc = VERR_BUFFER_OVERFLOW; 862 break; 863 } 864 cch -= 2; 865 *puch++ = 0xc0 | (ch >> 6); 866 *puch++ = 0x80 | (ch & 0x3f); 867 } 868 } 869 870 /* done */ 871 *puch = '\0'; 872 *pcch = (char *)puch - psz; 873 return rc; 874 } 875 876 877 878 RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag) 879 { 880 /* 881 * Validate input. 882 */ 883 Assert(VALID_PTR(ppszString)); 884 Assert(VALID_PTR(pszString)); 885 *ppszString = NULL; 886 887 /* 888 * Calculate the length of the UTF-8 encoding of the Latin-1 string. 889 */ 890 size_t cch; 891 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch); 892 if (RT_SUCCESS(rc)) 893 { 894 /* 895 * Allocate buffer and recode it. 896 */ 897 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag); 898 if (pszResult) 899 { 900 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch, &cch); 901 if (RT_SUCCESS(rc)) 902 { 903 *ppszString = pszResult; 904 return rc; 905 } 906 907 RTMemFree(pszResult); 908 } 909 else 910 rc = VERR_NO_STR_MEMORY; 911 } 912 return rc; 913 } 914 RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag); 915 916 917 RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag) 918 { 919 /* 920 * Validate input. 921 */ 922 Assert(VALID_PTR(pszString)); 923 Assert(VALID_PTR(ppsz)); 924 Assert(!pcch || VALID_PTR(pcch)); 925 926 /* 927 * Calculate the length of the UTF-8 encoding of the Latin-1 string. 928 */ 929 size_t cchResult; 930 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult); 931 if (RT_SUCCESS(rc)) 932 { 933 if (pcch) 934 *pcch = cchResult; 935 936 /* 937 * Check buffer size / Allocate buffer and recode it. 938 */ 939 bool fShouldFree; 940 char *pszResult; 941 if (cch > 0 && *ppsz) 942 { 943 fShouldFree = false; 944 if (RT_UNLIKELY(cch <= cchResult)) 945 return VERR_BUFFER_OVERFLOW; 946 pszResult = *ppsz; 947 } 948 else 949 { 950 *ppsz = NULL; 951 fShouldFree = true; 952 cch = RT_MAX(cch, cchResult + 1); 953 pszResult = (char *)RTStrAllocTag(cch, pszTag); 954 } 955 if (pszResult) 956 { 957 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1, &cch); 958 if (RT_SUCCESS(rc)) 959 { 960 *ppsz = pszResult; 961 return rc; 962 } 963 964 if (fShouldFree) 965 RTStrFree(pszResult); 966 } 967 else 968 rc = VERR_NO_STR_MEMORY; 969 } 970 return rc; 971 } 972 RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag); 973 974 975 RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz) 976 { 977 size_t cch; 978 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch); 979 return RT_SUCCESS(rc) ? cch : 0; 980 } 981 RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len); 982 983 984 RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch) 985 { 986 size_t cch; 987 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch); 988 if (pcch) 989 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0; 990 return rc; 991 } 992 RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx); 993 994 995 /** 996 * Calculates the Latin-1 length of a string, validating the encoding while doing so. 997 * 998 * @returns IPRT status code. 999 * @param psz Pointer to the UTF-8 string. 1000 * @param cch The max length of the string. (btw cch = cb) 1001 * Use RTSTR_MAX if all of the string is to be examined. 1002 * @param pcwc Where to store the length of the Latin-1 string in bytes. 1003 */ 1004 static int rtUtf8CalcLatin1Length(const char *psz, size_t cch, size_t *pcch) 1005 { 1006 /* We re-encode to one byte per unicode code point. */ 1007 return RTStrUniLenEx(psz, cch, pcch); 1008 } 1009 1010 1011 /** 1012 * Recodes a valid UTF-8 string as Latin-1. 1013 * 1014 * Since we know the input is valid, we do *not* perform encoding or length checks. 1015 * 1016 * @returns iprt status code. 1017 * @param psz The UTF-8 string to recode. This is a valid encoding. 1018 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string. 1019 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'. 1020 * @param pszOut Where to store the Latin-1 string. 1021 * @param cchOut The number of characters the pszOut buffer can hold, excluding the terminator ('\\0'). 1022 */ 1023 static int rtUtf8RecodeAsLatin1(const char *psz, size_t cch, char *pszOut, size_t cchOut) 1024 { 1025 int rc = VINF_SUCCESS; 1026 const unsigned char *puch = (const unsigned char *)psz; 1027 unsigned char *puchOut = (unsigned char *)pszOut; 1028 while (cch > 0) 1029 { 1030 /* read the next char and check for terminator. */ 1031 const unsigned char uch = *puch; 1032 if (!uch) 1033 break; 1034 1035 /* check for output overflow */ 1036 if (RT_UNLIKELY(cchOut < 1)) 1037 { 1038 rc = VERR_BUFFER_OVERFLOW; 1039 break; 1040 } 1041 cchOut--; 1042 1043 /* decode and recode the code point */ 1044 if (!(uch & RT_BIT(7))) 1045 { 1046 *puchOut++ = uch; 1047 puch++; 1048 cch--; 1049 } 1050 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6))) 1051 { 1052 uint16_t uc = (puch[1] & 0x3f) 1053 | ((uint16_t)(uch & 0x1f) << 6); 1054 *puchOut++ = uc < 0x100 ? uc : '?'; 1055 puch += 2; 1056 cch -= 2; 1057 } 1058 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) 1059 { 1060 *puchOut++ = '?'; 1061 puch += 3; 1062 cch -= 3; 1063 } 1064 else 1065 { 1066 *puchOut++ = '?'; 1067 puch += 4; 1068 cch -= 4; 1069 } 1070 } 1071 1072 /* done */ 1073 *puchOut = '\0'; 1074 return rc; 1075 } 1076 1077 1078 RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag) 1079 { 1080 /* 1081 * Validate input. 1082 */ 1083 Assert(VALID_PTR(ppszString)); 1084 Assert(VALID_PTR(pszString)); 1085 *ppszString = NULL; 1086 1087 /* 1088 * Validate the UTF-8 input and calculate the length of the Latin-1 string. 1089 */ 1090 size_t cch; 1091 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch); 1092 if (RT_SUCCESS(rc)) 1093 { 1094 /* 1095 * Allocate buffer. 1096 */ 1097 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag); 1098 if (psz) 1099 { 1100 /* 1101 * Encode the UTF-16 string. 1102 */ 1103 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch); 1104 if (RT_SUCCESS(rc)) 1105 { 1106 *ppszString = psz; 1107 return rc; 1108 } 1109 RTMemFree(psz); 1110 } 1111 else 1112 rc = VERR_NO_STR_MEMORY; 1113 } 1114 return rc; 1115 } 1116 RT_EXPORT_SYMBOL(RTStrToLatin1Tag); 1117 1118 1119 RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString, 1120 char **ppsz, size_t cch, size_t *pcch, const char *pszTag) 1121 { 1122 /* 1123 * Validate input. 1124 */ 1125 Assert(VALID_PTR(pszString)); 1126 Assert(VALID_PTR(ppsz)); 1127 Assert(!pcch || VALID_PTR(pcch)); 1128 1129 /* 1130 * Validate the UTF-8 input and calculate the length of the UTF-16 string. 1131 */ 1132 size_t cchResult; 1133 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult); 1134 if (RT_SUCCESS(rc)) 1135 { 1136 if (pcch) 1137 *pcch = cchResult; 1138 1139 /* 1140 * Check buffer size / Allocate buffer. 1141 */ 1142 bool fShouldFree; 1143 char *pszResult; 1144 if (cch > 0 && *ppsz) 1145 { 1146 fShouldFree = false; 1147 if (cch <= cchResult) 1148 return VERR_BUFFER_OVERFLOW; 1149 pszResult = *ppsz; 1150 } 1151 else 1152 { 1153 *ppsz = NULL; 1154 fShouldFree = true; 1155 cch = RT_MAX(cchResult + 1, cch); 1156 pszResult = (char *)RTMemAllocTag(cch, pszTag); 1157 } 1158 if (pszResult) 1159 { 1160 /* 1161 * Encode the Latin-1 string. 1162 */ 1163 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1); 1164 if (RT_SUCCESS(rc)) 1165 { 1166 *ppsz = pszResult; 1167 return rc; 1168 } 1169 if (fShouldFree) 1170 RTMemFree(pszResult); 1171 } 1172 else 1173 rc = VERR_NO_STR_MEMORY; 1174 } 1175 return rc; 1176 } 1177 RT_EXPORT_SYMBOL(RTStrToLatin1Tag); 794 1178 795 1179
Note:
See TracChangeset
for help on using the changeset viewer.