/* $Id: utf-16.cpp 62930 2016-08-03 16:11:43Z vboxsync $ */ /** @file * IPRT - UTF-16. */ /* * Copyright (C) 2006-2016 Oracle Corporation * * This file is part of VirtualBox Open Source Edition (OSE), as * available from http://www.virtualbox.org. This file is free software; * you can redistribute it and/or modify it under the terms of the GNU * General Public License (GPL) as published by the Free Software * Foundation, in version 2 as it comes in the "COPYING" file of the * VirtualBox OSE distribution. VirtualBox OSE is distributed in the * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. * * The contents of this file may alternatively be used under the terms * of the Common Development and Distribution License Version 1.0 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the * VirtualBox OSE distribution, in which case the provisions of the * CDDL are applicable instead of those of the GPL. * * You may elect to license modified versions of this file under the * terms and conditions of either the GPL or the CDDL or both. */ /********************************************************************************************************************************* * Header Files * *********************************************************************************************************************************/ #include #include "internal/iprt.h" #include #include #include #include #include "internal/string.h" /** * Get get length in code points of an UTF-16 encoded string, validating the * string while doing so. * * @returns IPRT status code. * @param pwsz Pointer to the UTF-16 string. * @param cwc The max length of the string in UTF-16 units. Use * RTSTR_MAX if all of the string is to be examined. * @param pcuc Where to store the length in unicode code points. * @param pcwcActual Where to store the actual size of the UTF-16 string * on success. Optional. */ static int rtUtf16Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcuc, size_t *pcwcActual) { PCRTUTF16 pwszStart = pwsz; size_t cCodePoints = 0; while (cwc > 0) { RTUTF16 wc = *pwsz; if (!wc) break; if (wc < 0xd800 || wc > 0xdfff) { cCodePoints++; pwsz++; cwc--; } /* Surrogate pair: */ else if (wc >= 0xdc00) { RTStrAssertMsgFailed(("Lone UTF-16 trail surrogate: %#x (%.*Rhxs)\n", wc, RT_MIN(cwc * 2, 10), pwsz)); return VERR_INVALID_UTF16_ENCODING; } else if (cwc < 2) { RTStrAssertMsgFailed(("Lone UTF-16 lead surrogate: %#x\n", wc)); return VERR_INVALID_UTF16_ENCODING; } else { RTUTF16 wcTrail = pwsz[1]; if (wcTrail < 0xdc00 || wcTrail > 0xdfff) { RTStrAssertMsgFailed(("Invalid UTF-16 trail surrogate: %#x (lead %#x)\n", wcTrail, wc)); return VERR_INVALID_UTF16_ENCODING; } cCodePoints++; pwsz += 2; cwc -= 2; } } /* done */ *pcuc = cCodePoints; if (pcwcActual) *pcwcActual = pwsz - pwszStart; return VINF_SUCCESS; } RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag) { if (cb > sizeof(RTUTF16)) cb = RT_ALIGN_Z(cb, sizeof(RTUTF16)); else cb = sizeof(RTUTF16); PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag); if (pwsz) *pwsz = '\0'; return pwsz; } RT_EXPORT_SYMBOL(RTUtf16AllocTag); RTDECL(int) RTUtf16ReallocTag(PRTUTF16 *ppwsz, size_t cbNew, const char *pszTag) { PRTUTF16 pwszOld = *ppwsz; cbNew = RT_ALIGN_Z(cbNew, sizeof(RTUTF16)); if (!cbNew) { RTMemFree(pwszOld); *ppwsz = NULL; } else if (pwszOld) { PRTUTF16 pwszNew = (PRTUTF16)RTMemReallocTag(pwszOld, cbNew, pszTag); if (!pwszNew) return VERR_NO_STR_MEMORY; pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0'; *ppwsz = pwszNew; } else { PRTUTF16 pwszNew = (PRTUTF16)RTMemAllocTag(cbNew, pszTag); if (!pwszNew) return VERR_NO_UTF16_MEMORY; pwszNew[0] = '\0'; pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0'; *ppwsz = pwszNew; } return VINF_SUCCESS; } RT_EXPORT_SYMBOL(RTUtf16ReallocTag); RTDECL(void) RTUtf16Free(PRTUTF16 pwszString) { if (pwszString) RTMemTmpFree(pwszString); } RT_EXPORT_SYMBOL(RTUtf16Free); RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag) { Assert(pwszString); size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16); PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag); if (pwsz) memcpy(pwsz, pwszString, cb); return pwsz; } RT_EXPORT_SYMBOL(RTUtf16DupTag); RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag) { Assert(pwszString); size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16); PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag); if (pwsz) { memcpy(pwsz, pwszString, cb); *ppwszString = pwsz; return VINF_SUCCESS; } return VERR_NO_MEMORY; } RT_EXPORT_SYMBOL(RTUtf16DupExTag); RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString) { if (!pwszString) return 0; PCRTUTF16 pwsz = pwszString; while (*pwsz) pwsz++; return pwsz - pwszString; } RT_EXPORT_SYMBOL(RTUtf16Len); RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2) { if (pwsz1 == pwsz2) return 0; if (!pwsz1) return -1; if (!pwsz2) return 1; for (;;) { register RTUTF16 wcs = *pwsz1; register int iDiff = wcs - *pwsz2; if (iDiff || !wcs) return iDiff; pwsz1++; pwsz2++; } } RT_EXPORT_SYMBOL(RTUtf16Cmp); RTDECL(int) RTUtf16CmpUtf8(PCRTUTF16 pwsz1, const char *psz2) { /* * NULL and empty strings are all the same. */ if (!pwsz1) return !psz2 || !*psz2 ? 0 : -1; if (!psz2) return !*pwsz1 ? 0 : 1; /* * Compare with a UTF-8 string by enumerating them char by char. */ for (;;) { RTUNICP uc1; int rc = RTUtf16GetCpEx(&pwsz1, &uc1); AssertRCReturn(rc, 1); RTUNICP uc2; rc = RTStrGetCpEx(&psz2, &uc2); AssertRCReturn(rc, -1); if (uc1 == uc2) { if (uc1) continue; return 0; } return uc1 < uc2 ? -1 : 1; } } RT_EXPORT_SYMBOL(RTUtf16CmpUtf8); RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz) { return RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0); } RT_EXPORT_SYMBOL(RTUtf16ValidateEncoding); RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags) { AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)), VERR_INVALID_PARAMETER); AssertPtr(pwsz); /* * Use rtUtf16Length for the job. */ size_t cwcActual = 0; /* Shut up cc1plus. */ size_t cCpsIgnored; int rc = rtUtf16Length(pwsz, cwc, &cCpsIgnored, &cwcActual); if (RT_SUCCESS(rc)) { if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH) { if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED) cwcActual++; if (cwcActual == cwc) rc = VINF_SUCCESS; else if (cwcActual < cwc) rc = VERR_BUFFER_UNDERFLOW; else rc = VERR_BUFFER_OVERFLOW; } else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED) && cwcActual >= cwc) rc = VERR_BUFFER_OVERFLOW; } return rc; } RT_EXPORT_SYMBOL(RTUtf16ValidateEncodingEx); RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz) { int rc = RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0); return RT_SUCCESS(rc); } RT_EXPORT_SYMBOL(RTUtf16IsValidEncoding); /** * Helper for RTUtf16PurgeComplementSet. * * @returns true if @a Cp is valid, false if not. * @param Cp The code point to validate. * @param puszValidPairs Pair of valid code point sets. * @param cValidPairs Number of pairs. */ DECLINLINE(bool) rtUtf16PurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs) { while (cValidPairs-- > 0) { if ( Cp >= puszValidPairs[0] && Cp <= puszValidPairs[1]) return true; puszValidPairs += 2; } return false; } RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidPairs, char chReplacement) { AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1); /* * Calc valid pairs and check that we've got an even number. */ uint32_t cValidPairs = 0; while (puszValidPairs[cValidPairs * 2]) { AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1); AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1], ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1])); cValidPairs++; } /* * Do the replacing. */ ssize_t cReplacements = 0; for (;;) { PRTUTF16 pwszCur = pwsz; RTUNICP Cp; int rc = RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp); if (RT_SUCCESS(rc)) { if (Cp) { if (!rtUtf16PurgeIsInSet(Cp, puszValidPairs, cValidPairs)) { for (; pwszCur != pwsz; ++pwszCur) *pwszCur = chReplacement; ++cReplacements; } } else break; } else return -1; } return cReplacements; } RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet); /** * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding. * * @returns iprt status code. * @param pwsz The UTF-16 string. * @param cwc The max length of the UTF-16 string to consider. * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw) */ static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) { int rc = VINF_SUCCESS; size_t cch = 0; while (cwc > 0) { RTUTF16 wc = *pwsz++; cwc--; if (!wc) break; else if (wc < 0xd800 || wc > 0xdfff) { if (wc < 0x80) cch++; else if (wc < 0x800) cch += 2; else if (wc < 0xfffe) cch += 3; else { RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc)); rc = VERR_CODE_POINT_ENDIAN_INDICATOR; break; } } else { if (wc >= 0xdc00) { RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc)); rc = VERR_INVALID_UTF16_ENCODING; break; } if (cwc <= 0) { RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc)); rc = VERR_INVALID_UTF16_ENCODING; break; } wc = *pwsz++; cwc--; if (wc < 0xdc00 || wc > 0xdfff) { RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc)); rc = VERR_INVALID_UTF16_ENCODING; break; } cch += 4; } } /* done */ *pcch = cch; return rc; } /** * Recodes an valid UTF-16 string as UTF-8. * * @returns iprt status code. * @param pwsz The UTF-16 string. * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding * will stop when cwc or '\\0' is reached. * @param psz Where to store the UTF-8 string. * @param cch The size of the UTF-8 buffer, excluding the terminator. * @param pcch Where to store the number of octets actually encoded. */ static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch) { unsigned char *pwch = (unsigned char *)psz; int rc = VINF_SUCCESS; while (cwc > 0) { RTUTF16 wc = *pwsz++; cwc--; if (!wc) break; else if (wc < 0xd800 || wc > 0xdfff) { if (wc < 0x80) { if (RT_UNLIKELY(cch < 1)) { RTStrAssertMsgFailed(("Buffer overflow! 1\n")); rc = VERR_BUFFER_OVERFLOW; break; } cch--; *pwch++ = (unsigned char)wc; } else if (wc < 0x800) { if (RT_UNLIKELY(cch < 2)) { RTStrAssertMsgFailed(("Buffer overflow! 2\n")); rc = VERR_BUFFER_OVERFLOW; break; } cch -= 2; *pwch++ = 0xc0 | (wc >> 6); *pwch++ = 0x80 | (wc & 0x3f); } else if (wc < 0xfffe) { if (RT_UNLIKELY(cch < 3)) { RTStrAssertMsgFailed(("Buffer overflow! 3\n")); rc = VERR_BUFFER_OVERFLOW; break; } cch -= 3; *pwch++ = 0xe0 | (wc >> 12); *pwch++ = 0x80 | ((wc >> 6) & 0x3f); *pwch++ = 0x80 | (wc & 0x3f); } else { RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc)); rc = VERR_CODE_POINT_ENDIAN_INDICATOR; break; } } else { if (wc >= 0xdc00) { RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc)); rc = VERR_INVALID_UTF16_ENCODING; break; } if (cwc <= 0) { RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc)); rc = VERR_INVALID_UTF16_ENCODING; break; } RTUTF16 wc2 = *pwsz++; cwc--; if (wc2 < 0xdc00 || wc2 > 0xdfff) { RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc)); rc = VERR_INVALID_UTF16_ENCODING; break; } uint32_t CodePoint = 0x10000 + ( ((wc & 0x3ff) << 10) | (wc2 & 0x3ff)); if (RT_UNLIKELY(cch < 4)) { RTStrAssertMsgFailed(("Buffer overflow! 4\n")); rc = VERR_BUFFER_OVERFLOW; break; } cch -= 4; *pwch++ = 0xf0 | (CodePoint >> 18); *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f); *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f); *pwch++ = 0x80 | (CodePoint & 0x3f); } } /* done */ *pwch = '\0'; *pcch = (char *)pwch - psz; return rc; } RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag) { /* * Validate input. */ Assert(VALID_PTR(ppszString)); Assert(VALID_PTR(pwszString)); *ppszString = NULL; /* * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it. */ size_t cch; int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch); if (RT_SUCCESS(rc)) { /* * Allocate buffer and recode it. */ char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag); if (pszResult) { rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch); if (RT_SUCCESS(rc)) { *ppszString = pszResult; return rc; } RTMemFree(pszResult); } else rc = VERR_NO_STR_MEMORY; } return rc; } RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag); RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag) { /* * Validate input. */ Assert(VALID_PTR(pwszString)); Assert(VALID_PTR(ppsz)); Assert(!pcch || VALID_PTR(pcch)); /* * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it. */ size_t cchResult; int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult); if (RT_SUCCESS(rc)) { if (pcch) *pcch = cchResult; /* * Check buffer size / Allocate buffer and recode it. */ bool fShouldFree; char *pszResult; if (cch > 0 && *ppsz) { fShouldFree = false; if (RT_UNLIKELY(cch <= cchResult)) return VERR_BUFFER_OVERFLOW; pszResult = *ppsz; } else { *ppsz = NULL; fShouldFree = true; cch = RT_MAX(cch, cchResult + 1); pszResult = (char *)RTStrAllocTag(cch, pszTag); } if (pszResult) { rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch); if (RT_SUCCESS(rc)) { *ppsz = pszResult; return rc; } if (fShouldFree) RTStrFree(pszResult); } else rc = VERR_NO_STR_MEMORY; } return rc; } RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag); RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz) { size_t cch; int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch); return RT_SUCCESS(rc) ? cch : 0; } RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len); RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch) { size_t cch; int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch); if (pcch) *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0; return rc; } RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx); RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz) { const RTUTF16 wc = *pwsz; /* simple */ if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe)) return wc; if (wc < 0xfffe) { /* surrogate pair */ if (wc < 0xdc00) { const RTUTF16 wc2 = pwsz[1]; if (wc2 >= 0xdc00 && wc2 <= 0xdfff) { RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff)); return uc; } RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2)); } else RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc)); } else RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc)); return RTUNICP_INVALID; } RT_EXPORT_SYMBOL(RTUtf16GetCpInternal); RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp) { const RTUTF16 wc = **ppwsz; /* simple */ if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe)) { (*ppwsz)++; *pCp = wc; return VINF_SUCCESS; } int rc; if (wc < 0xfffe) { /* surrogate pair */ if (wc < 0xdc00) { const RTUTF16 wc2 = (*ppwsz)[1]; if (wc2 >= 0xdc00 && wc2 <= 0xdfff) { RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff)); *pCp = uc; (*ppwsz) += 2; return VINF_SUCCESS; } RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2)); } else RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc)); rc = VERR_INVALID_UTF16_ENCODING; } else { RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc)); rc = VERR_CODE_POINT_ENDIAN_INDICATOR; } *pCp = RTUNICP_INVALID; (*ppwsz)++; return rc; } RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal); RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint) { /* simple */ if ( CodePoint < 0xd800 || ( CodePoint > 0xdfff && CodePoint < 0xfffe)) { *pwsz++ = (RTUTF16)CodePoint; return pwsz; } /* surrogate pair */ if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff) { CodePoint -= 0x10000; *pwsz++ = 0xd800 | (CodePoint >> 10); *pwsz++ = 0xdc00 | (CodePoint & 0x3ff); return pwsz; } /* invalid code point. */ RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint)); *pwsz++ = 0x7f; return pwsz; } RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);