Changeset 21714 in vbox for trunk/src/VBox/Runtime/common/string
- Timestamp:
- Jul 17, 2009 11:22:40 PM (16 years ago)
- svn:sync-xref-src-repo-rev:
- 50274
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/string/utf-16.cpp
r21337 r21714 649 649 RT_EXPORT_SYMBOL(RTUtf16PutCpInternal); 650 650 651 652 /** 653 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding. 654 * 655 * @returns iprt status code. 656 * @param pwsz The UTF-16 string. 657 * @param cwc The max length of the UTF-16 string to consider. 658 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw) 659 */ 660 static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) 661 { 662 int rc = VINF_SUCCESS; 663 size_t cch = 0; 664 while (cwc > 0) 665 { 666 RTUTF16 wc = *pwsz++; cwc--; 667 if (!wc) 668 break; 669 else if (wc < 0xd800 || wc > 0xdfff) 670 { 671 if (wc < 0xfffe) 672 ++cch; 673 else 674 { 675 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc)); 676 rc = VERR_CODE_POINT_ENDIAN_INDICATOR; 677 break; 678 } 679 } 680 else 681 { 682 if (wc >= 0xdc00) 683 { 684 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc)); 685 rc = VERR_INVALID_UTF16_ENCODING; 686 break; 687 } 688 if (cwc <= 0) 689 { 690 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc)); 691 rc = VERR_INVALID_UTF16_ENCODING; 692 break; 693 } 694 wc = *pwsz++; cwc--; 695 if (wc < 0xdc00 || wc > 0xdfff) 696 { 697 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc)); 698 rc = VERR_INVALID_UTF16_ENCODING; 699 break; 700 } 701 ++cch; 702 } 703 } 704 705 706 /* done */ 707 *pcch = cch; 708 return rc; 709 } 710 711 712 /** 713 * Recodes an valid UTF-16 string as Latin1. 714 * 715 * @returns iprt status code. 716 * @param pwsz The UTF-16 string. 717 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding 718 * will stop when cwc or '\\0' is reached. 719 * @param psz Where to store the Latin1 string. 720 * @param cch The size of the Latin1 buffer, excluding the terminator. 721 * @param pcch Where to store the number of octets actually encoded. 722 */ 723 static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch) 724 { 725 unsigned char *pwch = (unsigned char *)psz; 726 int rc = VINF_SUCCESS; 727 while (cwc > 0) 728 { 729 RTUTF16 wc = *pwsz++; cwc--; 730 if (!wc) 731 break; 732 else if (wc < 0xd800 || wc > 0xdfff) 733 { 734 if (wc < 0x100) 735 { 736 if (cch < 1) 737 { 738 RTStrAssertMsgFailed(("Buffer overflow! 1\n")); 739 rc = VERR_BUFFER_OVERFLOW; 740 break; 741 } 742 cch--; 743 *pwch++ = (char)wc; 744 } 745 else if (wc < 0xfffe) 746 { 747 if (cch < 1) 748 { 749 RTStrAssertMsgFailed(("Buffer overflow! 3\n")); 750 rc = VERR_BUFFER_OVERFLOW; 751 break; 752 } 753 cch--; 754 *pwch++ = '?'; 755 } 756 else 757 { 758 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc)); 759 rc = VERR_CODE_POINT_ENDIAN_INDICATOR; 760 break; 761 } 762 } 763 else 764 { 765 if (wc >= 0xdc00) 766 { 767 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc)); 768 rc = VERR_INVALID_UTF16_ENCODING; 769 break; 770 } 771 if (cwc <= 0) 772 { 773 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc)); 774 rc = VERR_INVALID_UTF16_ENCODING; 775 break; 776 } 777 RTUTF16 wc2 = *pwsz++; cwc--; 778 if (wc2 < 0xdc00 || wc2 > 0xdfff) 779 { 780 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc)); 781 rc = VERR_INVALID_UTF16_ENCODING; 782 break; 783 } 784 if (cch < 1) 785 { 786 RTStrAssertMsgFailed(("Buffer overflow! 4\n")); 787 rc = VERR_BUFFER_OVERFLOW; 788 break; 789 } 790 cch--; 791 *pwch++ = '?'; 792 } 793 } 794 795 /* done */ 796 *pwch = '\0'; 797 *pcch = (char *)pwch - psz; 798 return rc; 799 } 800 801 802 RTDECL(int) RTUtf16ToLatin1(PCRTUTF16 pwszString, char **ppszString) 803 { 804 /* 805 * Validate input. 806 */ 807 Assert(VALID_PTR(ppszString)); 808 Assert(VALID_PTR(pwszString)); 809 *ppszString = NULL; 810 811 /* 812 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it. 813 */ 814 size_t cch; 815 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch); 816 if (RT_SUCCESS(rc)) 817 { 818 /* 819 * Allocate buffer and recode it. 820 */ 821 char *pszResult = (char *)RTMemAlloc(cch + 1); 822 if (pszResult) 823 { 824 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch, &cch); 825 if (RT_SUCCESS(rc)) 826 { 827 *ppszString = pszResult; 828 return rc; 829 } 830 831 RTMemFree(pszResult); 832 } 833 else 834 rc = VERR_NO_STR_MEMORY; 835 } 836 return rc; 837 } 838 RT_EXPORT_SYMBOL(RTUtf16ToLatin1); 839 840 841 RTDECL(int) RTUtf16ToLatin1Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch) 842 { 843 /* 844 * Validate input. 845 */ 846 Assert(VALID_PTR(pwszString)); 847 Assert(VALID_PTR(ppsz)); 848 Assert(!pcch || VALID_PTR(pcch)); 849 850 /* 851 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it. 852 */ 853 size_t cchResult; 854 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult); 855 if (RT_SUCCESS(rc)) 856 { 857 if (pcch) 858 *pcch = cchResult; 859 860 /* 861 * Check buffer size / Allocate buffer and recode it. 862 */ 863 bool fShouldFree; 864 char *pszResult; 865 if (cch > 0 && *ppsz) 866 { 867 fShouldFree = false; 868 if (cch <= cchResult) 869 return VERR_BUFFER_OVERFLOW; 870 pszResult = *ppsz; 871 } 872 else 873 { 874 *ppsz = NULL; 875 fShouldFree = true; 876 cch = RT_MAX(cch, cchResult + 1); 877 pszResult = (char *)RTMemAlloc(cch); 878 } 879 if (pszResult) 880 { 881 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1, &cch); 882 if (RT_SUCCESS(rc)) 883 { 884 *ppsz = pszResult; 885 return rc; 886 } 887 888 if (fShouldFree) 889 RTMemFree(pszResult); 890 } 891 else 892 rc = VERR_NO_STR_MEMORY; 893 } 894 return rc; 895 } 896 RT_EXPORT_SYMBOL(RTUtf16ToLatin1Ex); 897 898 899 RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz) 900 { 901 size_t cch; 902 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch); 903 return RT_SUCCESS(rc) ? cch : 0; 904 } 905 RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len); 906 907 908 RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) 909 { 910 size_t cch; 911 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch); 912 if (pcch) 913 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0; 914 return rc; 915 } 916 RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx); 917 918 919 /** 920 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the 921 * original length, but the function saves us nasty comments to that effect 922 * all over the place. 923 * 924 * @returns IPRT status code. 925 * @param psz Pointer to the Latin1 string. 926 * @param cch The max length of the string. (btw cch = cb) 927 * Use RTSTR_MAX if all of the string is to be examined.s 928 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters. 929 */ 930 static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc) 931 { 932 *pcwc = RTStrNLen(psz, cch); 933 return VINF_SUCCESS; 934 } 935 936 937 /** 938 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to 939 * sixteen bits, as Unicode is a superset of Latin1. 940 * 941 * Since we know the input is valid, we do *not* perform length checks. 942 * 943 * @returns iprt status code. 944 * @param psz The Latin1 string to recode. 945 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string. 946 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'. 947 * @param pwsz Where to store the UTF-16 string. 948 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0'). 949 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator. 950 */ 951 static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc) 952 { 953 int rc = VINF_SUCCESS; 954 const unsigned char *puch = (const unsigned char *)psz; 955 const PRTUTF16 pwszEnd = pwsz + cwc; 956 PRTUTF16 pwc = pwsz; 957 Assert(pwszEnd >= pwc); 958 while (cch > 0) 959 { 960 /* read the next char and check for terminator. */ 961 const unsigned char uch = *puch; 962 if (!uch) 963 break; 964 965 /* check for output overflow */ 966 if (pwc >= pwszEnd) 967 { 968 rc = VERR_BUFFER_OVERFLOW; 969 break; 970 } 971 972 /* expand the code point */ 973 *pwc++ = uch; 974 puch++; 975 cch--; 976 } 977 978 /* done */ 979 *pwc = '\0'; 980 *pcwc = pwc - pwsz; 981 return rc; 982 } 983 984 985 RTDECL(int) RTLatin1ToUtf16(const char *pszString, PRTUTF16 *ppwszString) 986 { 987 /* 988 * Validate input. 989 */ 990 Assert(VALID_PTR(ppwszString)); 991 Assert(VALID_PTR(pszString)); 992 *ppwszString = NULL; 993 994 /* 995 * Validate the input and calculate the length of the UTF-16 string. 996 */ 997 size_t cwc; 998 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc); 999 if (RT_SUCCESS(rc)) 1000 { 1001 /* 1002 * Allocate buffer. 1003 */ 1004 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16)); 1005 if (pwsz) 1006 { 1007 /* 1008 * Encode the UTF-16 string. 1009 */ 1010 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc); 1011 if (RT_SUCCESS(rc)) 1012 { 1013 *ppwszString = pwsz; 1014 return rc; 1015 } 1016 RTMemFree(pwsz); 1017 } 1018 else 1019 rc = VERR_NO_UTF16_MEMORY; 1020 } 1021 return rc; 1022 } 1023 RT_EXPORT_SYMBOL(RTLatin1ToUtf16); 1024 1025 1026 RTDECL(int) RTLatin1ToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc) 1027 { 1028 /* 1029 * Validate input. 1030 */ 1031 Assert(VALID_PTR(pszString)); 1032 Assert(VALID_PTR(ppwsz)); 1033 Assert(!pcwc || VALID_PTR(pcwc)); 1034 1035 /* 1036 * Validate the input and calculate the length of the UTF-16 string. 1037 */ 1038 size_t cwcResult; 1039 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult); 1040 if (RT_SUCCESS(rc)) 1041 { 1042 if (pcwc) 1043 *pcwc = cwcResult; 1044 1045 /* 1046 * Check buffer size / Allocate buffer. 1047 */ 1048 bool fShouldFree; 1049 PRTUTF16 pwszResult; 1050 if (cwc > 0 && *ppwsz) 1051 { 1052 fShouldFree = false; 1053 if (cwc <= cwcResult) 1054 return VERR_BUFFER_OVERFLOW; 1055 pwszResult = *ppwsz; 1056 } 1057 else 1058 { 1059 *ppwsz = NULL; 1060 fShouldFree = true; 1061 cwc = RT_MAX(cwcResult + 1, cwc); 1062 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16)); 1063 } 1064 if (pwszResult) 1065 { 1066 /* 1067 * Encode the UTF-16 string. 1068 */ 1069 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult); 1070 if (RT_SUCCESS(rc)) 1071 { 1072 *ppwsz = pwszResult; 1073 return rc; 1074 } 1075 if (fShouldFree) 1076 RTMemFree(pwszResult); 1077 } 1078 else 1079 rc = VERR_NO_UTF16_MEMORY; 1080 } 1081 return rc; 1082 } 1083 RT_EXPORT_SYMBOL(RTLatin1ToUtf16Ex); 1084 1085 1086 RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz) 1087 { 1088 size_t cwc; 1089 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc); 1090 return RT_SUCCESS(rc) ? cwc : 0; 1091 } 1092 RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len); 1093 1094 1095 RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc) 1096 { 1097 size_t cwc; 1098 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc); 1099 if (pcwc) 1100 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0; 1101 return rc; 1102 } 1103 RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
Note:
See TracChangeset
for help on using the changeset viewer.