Changeset 31199 in vbox
- Timestamp:
- Jul 29, 2010 10:54:16 AM (15 years ago)
- Location:
- trunk
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/include/iprt/string.h
r31157 r31199 818 818 819 819 /** 820 * Calculates the length of the string in Latin-1 characters. 821 * 822 * This function will validate the string, and incorrectly encoded UTF-8 823 * strings will be rejected. The primary purpose of this function is to 824 * help allocate buffers for RTStrToLatin1Ex of the correct size. For most 825 * other purposes RTStrCalcLatin1LenEx() should be used. 826 * 827 * @returns Number of Latin-1 characters. 828 * @returns 0 if the string was incorrectly encoded. 829 * @param psz The string. 830 */ 831 #define RTStrCalcLatin1Len(psz) RTStrUniLen(psz) 832 833 /** 834 * Calculates the length of the string in Latin-1 characters. 835 * 836 * This function will validate the string, and incorrectly encoded UTF-8 837 * strings will be rejected. 838 * 839 * @returns iprt status code. 840 * @param psz The string. 841 * @param cch The max string length. Use RTSTR_MAX to process the entire string. 842 * @param pcch Where to store the string length. Optional. 843 * This is undefined on failure. 844 */ 845 #define RTStrCalcLatin1LenEx(psz, cch, pcch) RTStrUniLenEx(psz, cch, pcch) 846 847 /** 848 * Translate a UTF-8 string into a Latin-1 allocating the result buffer (default 849 * tag). 850 * 851 * @returns iprt status code. 852 * @param pszString UTF-8 string to convert. 853 * @param ppszString Receives pointer to the allocated Latin-1 string. 854 * The returned string must be freed using RTStrFree(). 855 */ 856 #define RTStrToLatin1(pszString, ppszString) RTStrToLatin1Tag((pszString), (ppszString), RTSTR_TAG) 857 858 /** 859 * Translate a UTF-8 string into a Latin-1 allocating the result buffer (custom 860 * tag). 861 * 862 * @returns iprt status code. 863 * @param pszString UTF-8 string to convert. 864 * @param ppszString Receives pointer to the allocated Latin-1 string. 865 * The returned string must be freed using RTStrFree(). 866 * @param pszTag Allocation tag used for statistics and such. 867 */ 868 RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag); 869 870 /** 871 * Translates pszString from UTF-8 to Latin-1, allocating the result buffer if requested. 872 * 873 * @returns iprt status code. 874 * @param pszString UTF-8 string to convert. 875 * @param cchString The maximum size in chars (the type) to convert. The conversion stop 876 * when it reaches cchString or the string terminator ('\\0'). 877 * Use RTSTR_MAX to translate the entire string. 878 * @param ppsz If cch is non-zero, this must either be pointing to pointer to 879 * a buffer of the specified size, or pointer to a NULL pointer. 880 * If *ppsz is NULL or cch is zero a buffer of at least cch items 881 * will be allocated to hold the translated string. 882 * If a buffer was requested it must be freed using RTStrFree(). 883 * @param cch The buffer size in bytes. This includes the terminator. 884 * @param pcch Where to store the length of the translated string, 885 * excluding the terminator. (Optional) 886 * 887 * This may be set under some error conditions, 888 * however, only for VERR_BUFFER_OVERFLOW and 889 * VERR_NO_STR_MEMORY will it contain a valid string 890 * length that can be used to resize the buffer. 891 */ 892 #define RTStrToLatin1Ex(pszString, cchString, ppsz, cch, pcch) \ 893 RTStrToLatin1ExTag((pszString), (cchString), (ppsz), (cch), (pcch), RTSTR_TAG) 894 895 /** 896 * Translates pszString from UTF-8 to Latin1, allocating the result buffer if 897 * requested (custom tag). 898 * 899 * @returns iprt status code. 900 * @param pszString UTF-8 string to convert. 901 * @param cchString The maximum size in chars (the type) to convert. The conversion stop 902 * when it reaches cchString or the string terminator ('\\0'). 903 * Use RTSTR_MAX to translate the entire string. 904 * @param ppsz If cch is non-zero, this must either be pointing to pointer to 905 * a buffer of the specified size, or pointer to a NULL pointer. 906 * If *ppsz is NULL or cch is zero a buffer of at least cch items 907 * will be allocated to hold the translated string. 908 * If a buffer was requested it must be freed using RTStrFree(). 909 * @param cch The buffer size in bytes. This includes the terminator. 910 * @param pcch Where to store the length of the translated string, 911 * excluding the terminator. (Optional) 912 * 913 * This may be set under some error conditions, 914 * however, only for VERR_BUFFER_OVERFLOW and 915 * VERR_NO_STR_MEMORY will it contain a valid string 916 * length that can be used to resize the buffer. 917 * @param pszTag Allocation tag used for statistics and such. 918 */ 919 RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag); 920 921 922 /** 923 * Translate a Latin1 string into a UTF-8 allocating the result buffer (default 924 * tag). 925 * 926 * @returns iprt status code. 927 * @param pszString Latin1 string to convert. 928 * @param ppszString Receives pointer of allocated UTF-8 string on 929 * success, and is always set to NULL on failure. 930 * The returned pointer must be freed using RTStrFree(). 931 */ 932 #define RTLatin1ToUtf8(pszString, ppszString) RTLatin1ToUtf8Tag((pszString), (ppszString), RTSTR_TAG) 933 934 /** 935 * Translate a Latin-1 string into a UTF-8 allocating the result buffer. 936 * 937 * @returns iprt status code. 938 * @param pszString Latin-1 string to convert. 939 * @param ppszString Receives pointer of allocated UTF-8 string on 940 * success, and is always set to NULL on failure. 941 * The returned pointer must be freed using RTStrFree(). 942 * @param pszTag Allocation tag used for statistics and such. 943 */ 944 RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag); 945 946 /** 947 * Translates Latin-1 to UTF-8 using buffer provided by the caller or a fittingly 948 * sized buffer allocated by the function (default tag). 949 * 950 * @returns iprt status code. 951 * @param pszString The Latin-1 string to convert. 952 * @param cchString The number of Latin-1 characters to translate from pszString. 953 * The translation will stop when reaching cchString or the terminator ('\\0'). 954 * Use RTSTR_MAX to translate the entire string. 955 * @param ppsz If cch is non-zero, this must either be pointing to a pointer to 956 * a buffer of the specified size, or pointer to a NULL pointer. 957 * If *ppsz is NULL or cch is zero a buffer of at least cch chars 958 * will be allocated to hold the translated string. 959 * If a buffer was requested it must be freed using RTStrFree(). 960 * @param cch The buffer size in chars (the type). This includes the terminator. 961 * @param pcch Where to store the length of the translated string, 962 * excluding the terminator. (Optional) 963 * 964 * This may be set under some error conditions, 965 * however, only for VERR_BUFFER_OVERFLOW and 966 * VERR_NO_STR_MEMORY will it contain a valid string 967 * length that can be used to resize the buffer. 968 */ 969 #define RTLatin1ToUtf8Ex(pszString, cchString, ppsz, cch, pcch) \ 970 RTLatin1ToUtf8ExTag((pszString), (cchString), (ppsz), (cch), (pcch), RTSTR_TAG) 971 972 /** 973 * Translates Latin1 to UTF-8 using buffer provided by the caller or a fittingly 974 * sized buffer allocated by the function (custom tag). 975 * 976 * @returns iprt status code. 977 * @param pszString The Latin1 string to convert. 978 * @param cchString The number of Latin1 characters to translate from pwszString. 979 * The translation will stop when reaching cchString or the terminator ('\\0'). 980 * Use RTSTR_MAX to translate the entire string. 981 * @param ppsz If cch is non-zero, this must either be pointing to a pointer to 982 * a buffer of the specified size, or pointer to a NULL pointer. 983 * If *ppsz is NULL or cch is zero a buffer of at least cch chars 984 * will be allocated to hold the translated string. 985 * If a buffer was requested it must be freed using RTStrFree(). 986 * @param cch The buffer size in chars (the type). This includes the terminator. 987 * @param pcch Where to store the length of the translated string, 988 * excluding the terminator. (Optional) 989 * 990 * This may be set under some error conditions, 991 * however, only for VERR_BUFFER_OVERFLOW and 992 * VERR_NO_STR_MEMORY will it contain a valid string 993 * length that can be used to resize the buffer. 994 * @param pszTag Allocation tag used for statistics and such. 995 */ 996 RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag); 997 998 /** 999 * Calculates the length of the Latin-1 string in UTF-8 chars (bytes). 1000 * 1001 * The primary purpose of this function is to help allocate buffers for 1002 * RTLatin1ToUtf8() of the correct size. For most other purposes 1003 * RTLatin1ToUtf8Ex() should be used. 1004 * 1005 * @returns Number of char (bytes). 1006 * @returns 0 if the string was incorrectly encoded. 1007 * @param psz The Latin-1 string. 1008 */ 1009 RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz); 1010 1011 /** 1012 * Calculates the length of the Latin-1 string in UTF-8 chars (bytes). 1013 * 1014 * @returns iprt status code. 1015 * @param psz The string. 1016 * @param cch The max string length. Use RTSTR_MAX to process the entire string. 1017 * @param pcch Where to store the string length (in bytes). Optional. 1018 * This is undefined on failure. 1019 */ 1020 RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cch, size_t *pcch); 1021 1022 /** 820 1023 * Get the unicode code point at the given string position. 821 1024 * -
trunk/src/VBox/Runtime/common/string/utf-8.cpp
r31157 r31199 792 792 } 793 793 RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx); 794 795 796 /** 797 * Calculates the length of the UTF-8 encoding of a Latin-1 string. 798 * 799 * @returns iprt status code. 800 * @param psz The Latin-1 string. 801 * @param cchIn The max length of the Latin-1 string to consider. 802 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw) 803 */ 804 static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch) 805 { 806 size_t cch = 0; 807 while (cchIn > 0) 808 { 809 char ch = *psz++; cchIn--; 810 if (!ch) 811 break; 812 if (!(ch & 0x80)) 813 cch++; 814 else 815 cch += 2; 816 } 817 818 819 /* done */ 820 *pcch = cch; 821 return VINF_SUCCESS; 822 } 823 824 825 /** 826 * Recodes a Latin-1 string as UTF-8. 827 * 828 * @returns iprt status code. 829 * @param psz The Latin-1 string. 830 * @param cchIn The number of characters to process from psz. The recoding 831 * will stop when cch or '\\0' is reached. 832 * @param psz Where to store the UTF-8 string. 833 * @param cch The size of the UTF-8 buffer, excluding the terminator. 834 * @param pcch Where to store the number of octets actually encoded. 835 */ 836 static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch, size_t *pcch) 837 { 838 unsigned char *puch = (unsigned char *)psz; 839 int rc = VINF_SUCCESS; 840 while (cchIn > 0) 841 { 842 char ch = *pszIn++; cchIn--; 843 if (!ch) 844 break; 845 if (!(ch & 0x80)) 846 { 847 if (RT_UNLIKELY(cch < 1)) 848 { 849 RTStrAssertMsgFailed(("Buffer overflow! 1\n")); 850 rc = VERR_BUFFER_OVERFLOW; 851 break; 852 } 853 cch--; 854 *puch++ = (unsigned char)ch; 855 } 856 else 857 { 858 if (RT_UNLIKELY(cch < 2)) 859 { 860 RTStrAssertMsgFailed(("Buffer overflow! 2\n")); 861 rc = VERR_BUFFER_OVERFLOW; 862 break; 863 } 864 cch -= 2; 865 *puch++ = 0xc0 | (ch >> 6); 866 *puch++ = 0x80 | (ch & 0x3f); 867 } 868 } 869 870 /* done */ 871 *puch = '\0'; 872 *pcch = (char *)puch - psz; 873 return rc; 874 } 875 876 877 878 RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag) 879 { 880 /* 881 * Validate input. 882 */ 883 Assert(VALID_PTR(ppszString)); 884 Assert(VALID_PTR(pszString)); 885 *ppszString = NULL; 886 887 /* 888 * Calculate the length of the UTF-8 encoding of the Latin-1 string. 889 */ 890 size_t cch; 891 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch); 892 if (RT_SUCCESS(rc)) 893 { 894 /* 895 * Allocate buffer and recode it. 896 */ 897 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag); 898 if (pszResult) 899 { 900 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch, &cch); 901 if (RT_SUCCESS(rc)) 902 { 903 *ppszString = pszResult; 904 return rc; 905 } 906 907 RTMemFree(pszResult); 908 } 909 else 910 rc = VERR_NO_STR_MEMORY; 911 } 912 return rc; 913 } 914 RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag); 915 916 917 RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag) 918 { 919 /* 920 * Validate input. 921 */ 922 Assert(VALID_PTR(pszString)); 923 Assert(VALID_PTR(ppsz)); 924 Assert(!pcch || VALID_PTR(pcch)); 925 926 /* 927 * Calculate the length of the UTF-8 encoding of the Latin-1 string. 928 */ 929 size_t cchResult; 930 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult); 931 if (RT_SUCCESS(rc)) 932 { 933 if (pcch) 934 *pcch = cchResult; 935 936 /* 937 * Check buffer size / Allocate buffer and recode it. 938 */ 939 bool fShouldFree; 940 char *pszResult; 941 if (cch > 0 && *ppsz) 942 { 943 fShouldFree = false; 944 if (RT_UNLIKELY(cch <= cchResult)) 945 return VERR_BUFFER_OVERFLOW; 946 pszResult = *ppsz; 947 } 948 else 949 { 950 *ppsz = NULL; 951 fShouldFree = true; 952 cch = RT_MAX(cch, cchResult + 1); 953 pszResult = (char *)RTStrAllocTag(cch, pszTag); 954 } 955 if (pszResult) 956 { 957 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1, &cch); 958 if (RT_SUCCESS(rc)) 959 { 960 *ppsz = pszResult; 961 return rc; 962 } 963 964 if (fShouldFree) 965 RTStrFree(pszResult); 966 } 967 else 968 rc = VERR_NO_STR_MEMORY; 969 } 970 return rc; 971 } 972 RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag); 973 974 975 RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz) 976 { 977 size_t cch; 978 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch); 979 return RT_SUCCESS(rc) ? cch : 0; 980 } 981 RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len); 982 983 984 RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch) 985 { 986 size_t cch; 987 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch); 988 if (pcch) 989 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0; 990 return rc; 991 } 992 RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx); 993 994 995 /** 996 * Calculates the Latin-1 length of a string, validating the encoding while doing so. 997 * 998 * @returns IPRT status code. 999 * @param psz Pointer to the UTF-8 string. 1000 * @param cch The max length of the string. (btw cch = cb) 1001 * Use RTSTR_MAX if all of the string is to be examined. 1002 * @param pcwc Where to store the length of the Latin-1 string in bytes. 1003 */ 1004 static int rtUtf8CalcLatin1Length(const char *psz, size_t cch, size_t *pcch) 1005 { 1006 /* We re-encode to one byte per unicode code point. */ 1007 return RTStrUniLenEx(psz, cch, pcch); 1008 } 1009 1010 1011 /** 1012 * Recodes a valid UTF-8 string as Latin-1. 1013 * 1014 * Since we know the input is valid, we do *not* perform encoding or length checks. 1015 * 1016 * @returns iprt status code. 1017 * @param psz The UTF-8 string to recode. This is a valid encoding. 1018 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string. 1019 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'. 1020 * @param pszOut Where to store the Latin-1 string. 1021 * @param cchOut The number of characters the pszOut buffer can hold, excluding the terminator ('\\0'). 1022 */ 1023 static int rtUtf8RecodeAsLatin1(const char *psz, size_t cch, char *pszOut, size_t cchOut) 1024 { 1025 int rc = VINF_SUCCESS; 1026 const unsigned char *puch = (const unsigned char *)psz; 1027 unsigned char *puchOut = (unsigned char *)pszOut; 1028 while (cch > 0) 1029 { 1030 /* read the next char and check for terminator. */ 1031 const unsigned char uch = *puch; 1032 if (!uch) 1033 break; 1034 1035 /* check for output overflow */ 1036 if (RT_UNLIKELY(cchOut < 1)) 1037 { 1038 rc = VERR_BUFFER_OVERFLOW; 1039 break; 1040 } 1041 cchOut--; 1042 1043 /* decode and recode the code point */ 1044 if (!(uch & RT_BIT(7))) 1045 { 1046 *puchOut++ = uch; 1047 puch++; 1048 cch--; 1049 } 1050 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6))) 1051 { 1052 uint16_t uc = (puch[1] & 0x3f) 1053 | ((uint16_t)(uch & 0x1f) << 6); 1054 *puchOut++ = uc < 0x100 ? uc : '?'; 1055 puch += 2; 1056 cch -= 2; 1057 } 1058 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) 1059 { 1060 *puchOut++ = '?'; 1061 puch += 3; 1062 cch -= 3; 1063 } 1064 else 1065 { 1066 *puchOut++ = '?'; 1067 puch += 4; 1068 cch -= 4; 1069 } 1070 } 1071 1072 /* done */ 1073 *puchOut = '\0'; 1074 return rc; 1075 } 1076 1077 1078 RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag) 1079 { 1080 /* 1081 * Validate input. 1082 */ 1083 Assert(VALID_PTR(ppszString)); 1084 Assert(VALID_PTR(pszString)); 1085 *ppszString = NULL; 1086 1087 /* 1088 * Validate the UTF-8 input and calculate the length of the Latin-1 string. 1089 */ 1090 size_t cch; 1091 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch); 1092 if (RT_SUCCESS(rc)) 1093 { 1094 /* 1095 * Allocate buffer. 1096 */ 1097 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag); 1098 if (psz) 1099 { 1100 /* 1101 * Encode the UTF-16 string. 1102 */ 1103 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch); 1104 if (RT_SUCCESS(rc)) 1105 { 1106 *ppszString = psz; 1107 return rc; 1108 } 1109 RTMemFree(psz); 1110 } 1111 else 1112 rc = VERR_NO_STR_MEMORY; 1113 } 1114 return rc; 1115 } 1116 RT_EXPORT_SYMBOL(RTStrToLatin1Tag); 1117 1118 1119 RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString, 1120 char **ppsz, size_t cch, size_t *pcch, const char *pszTag) 1121 { 1122 /* 1123 * Validate input. 1124 */ 1125 Assert(VALID_PTR(pszString)); 1126 Assert(VALID_PTR(ppsz)); 1127 Assert(!pcch || VALID_PTR(pcch)); 1128 1129 /* 1130 * Validate the UTF-8 input and calculate the length of the UTF-16 string. 1131 */ 1132 size_t cchResult; 1133 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult); 1134 if (RT_SUCCESS(rc)) 1135 { 1136 if (pcch) 1137 *pcch = cchResult; 1138 1139 /* 1140 * Check buffer size / Allocate buffer. 1141 */ 1142 bool fShouldFree; 1143 char *pszResult; 1144 if (cch > 0 && *ppsz) 1145 { 1146 fShouldFree = false; 1147 if (cch <= cchResult) 1148 return VERR_BUFFER_OVERFLOW; 1149 pszResult = *ppsz; 1150 } 1151 else 1152 { 1153 *ppsz = NULL; 1154 fShouldFree = true; 1155 cch = RT_MAX(cchResult + 1, cch); 1156 pszResult = (char *)RTMemAllocTag(cch, pszTag); 1157 } 1158 if (pszResult) 1159 { 1160 /* 1161 * Encode the Latin-1 string. 1162 */ 1163 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1); 1164 if (RT_SUCCESS(rc)) 1165 { 1166 *ppsz = pszResult; 1167 return rc; 1168 } 1169 if (fShouldFree) 1170 RTMemFree(pszResult); 1171 } 1172 else 1173 rc = VERR_NO_STR_MEMORY; 1174 } 1175 return rc; 1176 } 1177 RT_EXPORT_SYMBOL(RTStrToLatin1Tag); 794 1178 795 1179
Note:
See TracChangeset
for help on using the changeset viewer.