Changeset 68316 in vbox for trunk/src/VBox/Runtime/common/string/utf-16.cpp
- Timestamp:
- Aug 7, 2017 2:19:34 PM (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/string/utf-16.cpp
r67391 r68316 369 369 370 370 /** 371 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding. 371 * Validate the UTF-16BE encoding and calculates the length of an UTF-8 372 * encoding. 372 373 * 373 374 * @returns iprt status code. 374 * @param pwsz The UTF-16 string.375 * @param cwc The max length of the UTF-16 string to consider.375 * @param pwsz The UTF-16BE string. 376 * @param cwc The max length of the UTF-16BE string to consider. 376 377 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw) 377 378 * 378 * @note rtUtf16 BigCalcUtf8Length is a copy of this.379 * @note rtUtf16LittleCalcUtf8Length | s/RT_LE2H_U16/RT_BE2H_U16/g 379 380 */ 380 static int rtUtf16 CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)381 static int rtUtf16BigCalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) 381 382 { 382 383 int rc = VINF_SUCCESS; … … 387 388 if (!wc) 388 389 break; 390 wc = RT_BE2H_U16(wc); 389 391 if (wc < 0xd800 || wc > 0xdfff) 390 392 { … … 417 419 } 418 420 wc = *pwsz++; cwc--; 421 wc = RT_BE2H_U16(wc); 419 422 if (wc < 0xdc00 || wc > 0xdfff) 420 423 { … … 435 438 436 439 /** 437 * Validate the UTF-16 BE encoding and calculates the length of an UTF-8440 * Validate the UTF-16LE encoding and calculates the length of an UTF-8 438 441 * encoding. 439 442 * 440 443 * @returns iprt status code. 441 * @param pwsz The UTF-16 string.442 * @param cwc The max length of the UTF-16 BE string to consider.444 * @param pwsz The UTF-16LE string. 445 * @param cwc The max length of the UTF-16LE string to consider. 443 446 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw) 444 447 * 445 * @note Code is a copy of rtUtf16CalcUtf8Length, but with two RT_BE2H_U16 446 * invocations inserted. 
448 * @note rtUtf16BigCalcUtf8Length | s/RT_BE2H_U16/RT_LE2H_U16/g 447 449 */ 448 static int rtUtf16 BigCalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)450 static int rtUtf16LittleCalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) 449 451 { 450 452 int rc = VINF_SUCCESS; … … 455 457 if (!wc) 456 458 break; 457 wc = RT_ BE2H_U16(wc);459 wc = RT_LE2H_U16(wc); 458 460 if (wc < 0xd800 || wc > 0xdfff) 459 461 { … … 486 488 } 487 489 wc = *pwsz++; cwc--; 488 wc = RT_ BE2H_U16(wc);490 wc = RT_LE2H_U16(wc); 489 491 if (wc < 0xdc00 || wc > 0xdfff) 490 492 { … … 500 502 /* done */ 501 503 *pcch = cch; 502 return rc;503 }504 505 506 /**507 * Recodes an valid UTF-16 string as UTF-8.508 *509 * @returns iprt status code.510 * @param pwsz The UTF-16 string.511 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding512 * will stop when cwc or '\\0' is reached.513 * @param psz Where to store the UTF-8 string.514 * @param cch The size of the UTF-8 buffer, excluding the terminator.515 * @param pcch Where to store the number of octets actually encoded.516 * @note rtUtf16BigRecodeAsUtf8 is a copy of this.517 */518 static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)519 {520 unsigned char *pwch = (unsigned char *)psz;521 int rc = VINF_SUCCESS;522 while (cwc > 0)523 {524 RTUTF16 wc = *pwsz++; cwc--;525 if (!wc)526 break;527 if (wc < 0xd800 || wc > 0xdfff)528 {529 if (wc < 0x80)530 {531 if (RT_UNLIKELY(cch < 1))532 {533 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));534 rc = VERR_BUFFER_OVERFLOW;535 break;536 }537 cch--;538 *pwch++ = (unsigned char)wc;539 }540 else if (wc < 0x800)541 {542 if (RT_UNLIKELY(cch < 2))543 {544 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));545 rc = VERR_BUFFER_OVERFLOW;546 break;547 }548 cch -= 2;549 *pwch++ = 0xc0 | (wc >> 6);550 *pwch++ = 0x80 | (wc & 0x3f);551 }552 else if (wc < 0xfffe)553 {554 if (RT_UNLIKELY(cch < 3))555 {556 RTStrAssertMsgFailed(("Buffer overflow! 
3\n"));557 rc = VERR_BUFFER_OVERFLOW;558 break;559 }560 cch -= 3;561 *pwch++ = 0xe0 | (wc >> 12);562 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);563 *pwch++ = 0x80 | (wc & 0x3f);564 }565 else566 {567 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));568 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;569 break;570 }571 }572 else573 {574 if (wc >= 0xdc00)575 {576 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));577 rc = VERR_INVALID_UTF16_ENCODING;578 break;579 }580 if (cwc <= 0)581 {582 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));583 rc = VERR_INVALID_UTF16_ENCODING;584 break;585 }586 RTUTF16 wc2 = *pwsz++; cwc--;587 if (wc2 < 0xdc00 || wc2 > 0xdfff)588 {589 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));590 rc = VERR_INVALID_UTF16_ENCODING;591 break;592 }593 uint32_t CodePoint = 0x10000594 + ( ((wc & 0x3ff) << 10)595 | (wc2 & 0x3ff));596 if (RT_UNLIKELY(cch < 4))597 {598 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));599 rc = VERR_BUFFER_OVERFLOW;600 break;601 }602 cch -= 4;603 *pwch++ = 0xf0 | (CodePoint >> 18);604 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);605 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);606 *pwch++ = 0x80 | (CodePoint & 0x3f);607 }608 }609 610 /* done */611 *pwch = '\0';612 *pcch = (char *)pwch - psz;613 504 return rc; 614 505 } … … 626 517 * @param pcch Where to store the number of octets actually encoded. 627 518 * 628 * @note Copy of rtUtf16RecodeAsUtf8 with a few RT_BE2H_U16 invocations 629 * insterted. 519 * @note rtUtf16LittleRecodeAsUtf8 == s/RT_BE2H_U16/RT_LE2H_U16/g 630 520 */ 631 521 static int rtUtf16BigRecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch) … … 730 620 731 621 622 /** 623 * Recodes an valid UTF-16LE string as UTF-8. 624 * 625 * @returns iprt status code. 626 * @param pwsz The UTF-16LE string. 627 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding 628 * will stop when cwc or '\\0' is reached. 
629 * @param psz Where to store the UTF-8 string. 630 * @param cch The size of the UTF-8 buffer, excluding the terminator. 631 * @param pcch Where to store the number of octets actually encoded. 632 * 633 * @note rtUtf16BigRecodeAsUtf8 == s/RT_LE2H_U16/RT_BE2H_U16/g 634 */ 635 static int rtUtf16LittleRecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch) 636 { 637 unsigned char *pwch = (unsigned char *)psz; 638 int rc = VINF_SUCCESS; 639 while (cwc > 0) 640 { 641 RTUTF16 wc = *pwsz++; cwc--; 642 if (!wc) 643 break; 644 wc = RT_LE2H_U16(wc); 645 if (wc < 0xd800 || wc > 0xdfff) 646 { 647 if (wc < 0x80) 648 { 649 if (RT_UNLIKELY(cch < 1)) 650 { 651 RTStrAssertMsgFailed(("Buffer overflow! 1\n")); 652 rc = VERR_BUFFER_OVERFLOW; 653 break; 654 } 655 cch--; 656 *pwch++ = (unsigned char)wc; 657 } 658 else if (wc < 0x800) 659 { 660 if (RT_UNLIKELY(cch < 2)) 661 { 662 RTStrAssertMsgFailed(("Buffer overflow! 2\n")); 663 rc = VERR_BUFFER_OVERFLOW; 664 break; 665 } 666 cch -= 2; 667 *pwch++ = 0xc0 | (wc >> 6); 668 *pwch++ = 0x80 | (wc & 0x3f); 669 } 670 else if (wc < 0xfffe) 671 { 672 if (RT_UNLIKELY(cch < 3)) 673 { 674 RTStrAssertMsgFailed(("Buffer overflow! 3\n")); 675 rc = VERR_BUFFER_OVERFLOW; 676 break; 677 } 678 cch -= 3; 679 *pwch++ = 0xe0 | (wc >> 12); 680 *pwch++ = 0x80 | ((wc >> 6) & 0x3f); 681 *pwch++ = 0x80 | (wc & 0x3f); 682 } 683 else 684 { 685 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc)); 686 rc = VERR_CODE_POINT_ENDIAN_INDICATOR; 687 break; 688 } 689 } 690 else 691 { 692 if (wc >= 0xdc00) 693 { 694 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc)); 695 rc = VERR_INVALID_UTF16_ENCODING; 696 break; 697 } 698 if (cwc <= 0) 699 { 700 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc)); 701 rc = VERR_INVALID_UTF16_ENCODING; 702 break; 703 } 704 RTUTF16 wc2 = *pwsz++; cwc--; 705 wc2 = RT_LE2H_U16(wc2); 706 if (wc2 < 0xdc00 || wc2 > 0xdfff) 707 { 708 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! 
wc=%#x\n", wc)); 709 rc = VERR_INVALID_UTF16_ENCODING; 710 break; 711 } 712 uint32_t CodePoint = 0x10000 713 + ( ((wc & 0x3ff) << 10) 714 | (wc2 & 0x3ff)); 715 if (RT_UNLIKELY(cch < 4)) 716 { 717 RTStrAssertMsgFailed(("Buffer overflow! 4\n")); 718 rc = VERR_BUFFER_OVERFLOW; 719 break; 720 } 721 cch -= 4; 722 *pwch++ = 0xf0 | (CodePoint >> 18); 723 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f); 724 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f); 725 *pwch++ = 0x80 | (CodePoint & 0x3f); 726 } 727 } 728 729 /* done */ 730 *pwch = '\0'; 731 *pcch = (char *)pwch - psz; 732 return rc; 733 } 734 735 732 736 733 737 RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag) … … 744 748 */ 745 749 size_t cch; 746 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch); 750 #ifdef RT_BIG_ENDIAN 751 int rc = rtUtf16BigCalcUtf8Length(pwszString, RTSTR_MAX, &cch); 752 #else 753 int rc = rtUtf16LittleCalcUtf8Length(pwszString, RTSTR_MAX, &cch); 754 #endif 747 755 if (RT_SUCCESS(rc)) 748 756 { … … 753 761 if (pszResult) 754 762 { 755 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch); 763 #ifdef RT_BIG_ENDIAN 764 rc = rtUtf16BigRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch); 765 #else 766 rc = rtUtf16LittleRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch); 767 #endif 756 768 if (RT_SUCCESS(rc)) 757 769 { … … 809 821 810 822 823 RTDECL(int) RTUtf16LittleToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag) 824 { 825 /* 826 * Validate input. 827 */ 828 Assert(VALID_PTR(ppszString)); 829 Assert(VALID_PTR(pwszString)); 830 *ppszString = NULL; 831 832 /* 833 * Validate the UTF-16LE string and calculate the length of the UTF-8 encoding of it. 834 */ 835 size_t cch; 836 int rc = rtUtf16LittleCalcUtf8Length(pwszString, RTSTR_MAX, &cch); 837 if (RT_SUCCESS(rc)) 838 { 839 /* 840 * Allocate buffer and recode it. 
841 */ 842 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag); 843 if (pszResult) 844 { 845 rc = rtUtf16LittleRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch); 846 if (RT_SUCCESS(rc)) 847 { 848 *ppszString = pszResult; 849 return rc; 850 } 851 852 RTMemFree(pszResult); 853 } 854 else 855 rc = VERR_NO_STR_MEMORY; 856 } 857 return rc; 858 } 859 RT_EXPORT_SYMBOL(RTUtf16BigToUtf8Tag); 860 861 811 862 RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag) 812 863 { … … 822 873 */ 823 874 size_t cchResult; 824 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult); 875 #ifdef RT_BIG_ENDIAN 876 int rc = rtUtf16BigCalcUtf8Length(pwszString, cwcString, &cchResult); 877 #else 878 int rc = rtUtf16LittleCalcUtf8Length(pwszString, cwcString, &cchResult); 879 #endif 825 880 if (RT_SUCCESS(rc)) 826 881 { … … 849 904 if (pszResult) 850 905 { 851 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch); 906 #ifdef RT_BIG_ENDIAN 907 rc = rtUtf16BigRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch); 908 #else 909 rc = rtUtf16LittleRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch); 910 #endif 852 911 if (RT_SUCCESS(rc)) 853 912 { … … 925 984 926 985 986 RTDECL(int) RTUtf16LittleToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, 987 const char *pszTag) 988 { 989 /* 990 * Validate input. 991 */ 992 AssertPtr(pwszString); 993 AssertPtr(ppsz); 994 AssertPtrNull(pcch); 995 996 /* 997 * Validate the UTF-16LE string and calculate the length of the UTF-8 encoding of it. 998 */ 999 size_t cchResult; 1000 int rc = rtUtf16LittleCalcUtf8Length(pwszString, cwcString, &cchResult); 1001 if (RT_SUCCESS(rc)) 1002 { 1003 if (pcch) 1004 *pcch = cchResult; 1005 1006 /* 1007 * Check buffer size / Allocate buffer and recode it. 
1008 */ 1009 bool fShouldFree; 1010 char *pszResult; 1011 if (cch > 0 && *ppsz) 1012 { 1013 fShouldFree = false; 1014 if (RT_UNLIKELY(cch <= cchResult)) 1015 return VERR_BUFFER_OVERFLOW; 1016 pszResult = *ppsz; 1017 } 1018 else 1019 { 1020 *ppsz = NULL; 1021 fShouldFree = true; 1022 cch = RT_MAX(cch, cchResult + 1); 1023 pszResult = (char *)RTStrAllocTag(cch, pszTag); 1024 } 1025 if (pszResult) 1026 { 1027 rc = rtUtf16LittleRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch); 1028 if (RT_SUCCESS(rc)) 1029 { 1030 *ppsz = pszResult; 1031 return rc; 1032 } 1033 1034 if (fShouldFree) 1035 RTStrFree(pszResult); 1036 } 1037 else 1038 rc = VERR_NO_STR_MEMORY; 1039 } 1040 return rc; 1041 } 1042 RT_EXPORT_SYMBOL(RTUtf16BigToUtf8ExTag); 1043 1044 927 1045 RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz) 928 1046 { 929 1047 size_t cch; 930 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch); 1048 #ifdef RT_BIG_ENDIAN 1049 int rc = rtUtf16BigCalcUtf8Length(pwsz, RTSTR_MAX, &cch); 1050 #else 1051 int rc = rtUtf16LittleCalcUtf8Length(pwsz, RTSTR_MAX, &cch); 1052 #endif 931 1053 return RT_SUCCESS(rc) ? cch : 0; 932 1054 } … … 934 1056 935 1057 1058 RTDECL(size_t) RTUtf16BigCalcUtf8Len(PCRTUTF16 pwsz) 1059 { 1060 size_t cch; 1061 int rc = rtUtf16BigCalcUtf8Length(pwsz, RTSTR_MAX, &cch); 1062 return RT_SUCCESS(rc) ? cch : 0; 1063 } 1064 RT_EXPORT_SYMBOL(RTUtf16BigCalcUtf8Len); 1065 1066 1067 RTDECL(size_t) RTUtf16LittleCalcUtf8Len(PCRTUTF16 pwsz) 1068 { 1069 size_t cch; 1070 int rc = rtUtf16LittleCalcUtf8Length(pwsz, RTSTR_MAX, &cch); 1071 return RT_SUCCESS(rc) ? 
cch : 0; 1072 } 1073 RT_EXPORT_SYMBOL(RTUtf16LittleCalcUtf8Len); 1074 1075 936 1076 RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) 937 1077 { 938 1078 size_t cch; 939 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch); 1079 #ifdef RT_BIG_ENDIAN 1080 int rc = rtUtf16BigCalcUtf8Length(pwsz, cwc, &cch); 1081 #else 1082 int rc = rtUtf16LittleCalcUtf8Length(pwsz, cwc, &cch); 1083 #endif 940 1084 if (pcch) 941 1085 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0; … … 943 1087 } 944 1088 RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx); 1089 1090 1091 RTDECL(int) RTUtf16BigCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) 1092 { 1093 size_t cch; 1094 int rc = rtUtf16BigCalcUtf8Length(pwsz, cwc, &cch); 1095 if (pcch) 1096 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0; 1097 return rc; 1098 } 1099 RT_EXPORT_SYMBOL(RTUtf16BigCalcUtf8LenEx); 1100 1101 1102 RTDECL(int) RTUtf16LittleCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) 1103 { 1104 size_t cch; 1105 int rc = rtUtf16LittleCalcUtf8Length(pwsz, cwc, &cch); 1106 if (pcch) 1107 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0; 1108 return rc; 1109 } 1110 RT_EXPORT_SYMBOL(RTUtf16LittleCalcUtf8LenEx); 945 1111 946 1112
Note:
See TracChangeset
for help on using the changeset viewer.