Changeset 31246 in vbox

Timestamp:

Jul 30, 2010 1:24:53 PM (14 years ago)

Author:

vboxsync

Message:

Runtime/string: clean up the UTF-8 <-> Latin1 APIs a bit more

Location:

trunk

Files:

: 2 edited

include/iprt/string.h (modified) (3 diffs)
src/VBox/Runtime/common/string/utf-8.cpp (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/include/iprt/string.h

-              r31221
+              r31246
 #include <iprt/cdefs.h>
 #include <iprt/types.h>
+#include <iprt/assert.h>
 #include <iprt/stdarg.h>
+#include <iprt/uni.h> /* for RTUNICP_INVALID */
 #include <iprt/err.h> /* for VINF_SUCCESS */
 #if defined(RT_OS_LINUX) && defined(__KERNEL__)
 …
 /**
+ * Get the number of chars (bytes) needed to encode a given Unicode code point
+ * in UTF-8.
+ *
+ * The code point is expected to be a valid Unicode one, but not necessarily
+ * one that UTF-8 can represent.  Surrogates are not checked for here.
+ *
+ * @returns the encoded size in chars (1 to 4), or zero if there is no UTF-8
+ *          encoding, i.e. the value lies beyond U+10FFFF.
+ * @param   CodePoint   The code point to get the encoded size of.
+ */
+DECLINLINE(size_t) RTStrCpSize(RTUNICP CodePoint)
+{
+    if (CodePoint < 0x80)
+        return 1;
+    if (CodePoint < 0x800)
+        return 2;
+    if (CodePoint < 0x10000)
+        return 3;
+    /* The Unicode code space ends at U+10FFFF, so everything in the
+       supplementary planes up to and including that value takes four chars. */
+    if (CodePoint < 0x110000)
+        return 4;
+    return 0;
+}
+/**
  * Put the unicode code point at the given string position
  * and return the pointer to the char following it.
 …
  */
 RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz);
+/**
+ * Get the unicode code point at the given Latin-1 string position.
+ *
+ * The char is read as an unsigned value and returned unchanged, so there is
+ * no failure path.
+ *
+ * @returns unicode code point (0 to 0xff).
+ * @param   psz         The Latin-1 string.
+ */
+DECLINLINE(RTUNICP) RTLatin1GetCp(const char *psz)
+{
+    const unsigned char *puch = (const unsigned char *)psz;
+    return puch[0];
+}
+/**
+ * Get the unicode code point at the given Latin-1 string position and advance
+ * the string pointer past it.
+ *
+ * @returns VINF_SUCCESS (this function has no failure path).
+ * @param   ppsz        Pointer to the string pointer. This will be updated to
+ *                      point to the char following the current code point.
+ * @param   pCp         Where to store the code point.
+ *
+ * @remark  No terminator check is made; a zero char is returned like any
+ *          other code point and the string pointer is still advanced.
+ */
+DECLINLINE(int) RTLatin1GetCpEx(const char **ppsz, PRTUNICP pCp)
+{
+    const char         *pszCur = *ppsz;
+    const unsigned char uchCur = *(const unsigned char *)pszCur;
+    *ppsz = pszCur + 1;
+    *pCp  = uchCur;
+    return VINF_SUCCESS;
+}
+/**
+ * Get the unicode code point at the given Latin-1 string position, for a
+ * string limited to a given maximum length.
+ *
+ * @returns iprt status code.
+ * @retval  VINF_SUCCESS if a char was consumed.
+ * @retval  VERR_END_OF_STRING if *pcch is 0. *pCp is set to RTUNICP_INVALID
+ *          and neither *ppsz nor *pcch is touched.
+ *
+ * @param   ppsz        Pointer to the string pointer. This will be updated to
+ *                      point to the char following the current code point.
+ * @param   pcch        Pointer to the maximum string length.  This will be
+ *                      decremented by one for the char consumed.
+ * @param   pCp         Where to store the code point.
+ *                      RTUNICP_INVALID is stored here on failure.
+ *
+ * @remark  No terminator check is made here; a zero char is consumed and
+ *          returned like any other code point.
+ */
+DECLINLINE(int) RTLatin1GetCpNEx(const char **ppsz, size_t *pcch, PRTUNICP pCp)
+{
+    int rc;
+    if (RT_LIKELY(*pcch != 0))
+    {
+        const char         *pszCur = *ppsz;
+        const unsigned char uchCur = *(const unsigned char *)pszCur;
+        *ppsz  = pszCur + 1;
+        *pcch -= 1;
+        *pCp   = uchCur;
+        rc = VINF_SUCCESS;
+    }
+    else
+    {
+        /* Nothing left within the length limit. */
+        *pCp = RTUNICP_INVALID;
+        rc = VERR_END_OF_STRING;
+    }
+    return rc;
+}
+/**
+ * Get the number of chars needed to encode a given Unicode code point in
+ * Latin-1.
+ *
+ * The code point is expected to be a valid Unicode one, but not necessarily
+ * one that Latin-1 can represent.
+ *
+ * @returns 1 if the code point is below 0x100, or zero if there is no
+ *          Latin-1 encoding for it.
+ * @param   CodePoint   The code point to get the encoded size of.
+ */
+DECLINLINE(size_t) RTLatin1CpSize(RTUNICP CodePoint)
+{
+    return CodePoint < 0x100 ? 1 : 0;
+}
+/**
+ * Write the unicode code point at the given Latin-1 string position and
+ * return the pointer to the char following it.
+ *
+ * Only the single char at psz is written; nothing following it is looked at
+ * or moved.  The function is therefore suitable for appending/overwriting
+ * only, not for inserting code points into a string.
+ *
+ * @returns pointer to the char following the written code point.
+ * @returns NULL (via AssertReturn, nothing written) if the code point is
+ *          outside the Latin-1 range.
+ * @param   psz         Where to write the char.
+ * @param   CodePoint   The code point to write.
+ *                      This should not be RTUNICP_INVALID or any other
+ *                      value of 0x100 or above.
+ */
+DECLINLINE(char *) RTLatin1PutCp(char *psz, RTUNICP CodePoint)
+{
+    AssertReturn(CodePoint < 0x100, NULL);
+    psz[0] = (unsigned char)CodePoint;
+    return psz + 1;
+}
+/**
+ * Skips ahead, past the current code point.
+ *
+ * @returns Pointer to the char after the current code point.
+ * @param   psz     Pointer to the current code point.
+ * @remark  This always moves exactly one char forward; no check for the
+ *          string terminator is made.
+ */
+DECLINLINE(char *) RTLatin1NextCp(const char *psz)
+{
+    return (char *)(psz + 1);
+}
+/**
+ * Skips back to the previous code point.
+ *
+ * @returns Pointer to the char before the current code point.
+ * @param   psz         Pointer to the current code point.
+ * @remark  This always moves exactly one char backward.  The start of the
+ *          string is not known to this function, so the caller must make
+ *          sure psz does not already point at the first char.
+ */
+DECLINLINE(char *) RTLatin1PrevCp(const char *psz)
+{
+    return (char *)(psz - 1);
+}

trunk/src/VBox/Runtime/common/string/utf-8.cpp

-              r31229
+              r31246
+{
     size_t  cch = 0;
+    while (cchIn > 0)
+    {
+        char ch = *psz++; cchIn--;
+        if (!ch)
+    while (true)
+    {
+        RTUNICP Cp;
+        size_t cchCp;
+        int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
+        if (Cp == 0 || rc == VERR_END_OF_STRING)
             break;
+        if (!(ch & 0x80))
+            cch++;
+        else
+            cch += 2;
+    }
+        if (RT_FAILURE(rc))
+            return rc;
+        cchCp = RTStrCpSize(Cp);
+        if (cchCp == 0)
+            return VERR_NO_TRANSLATION;
+        cch += cchCp;
+    }
     /* done */
 …
  * @param   psz         Where to store the UTF-8 string.
  * @param   cch         The size of the UTF-8 buffer, excluding the terminator.
- * @param   pcch        Where to store the number of octets actually encoded.
  */
+static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch, size_t *pcch)
+{
+    unsigned char  *puch = (unsigned char *)psz;
+    int             rc = VINF_SUCCESS;
+    while (cchIn > 0)
+    {
+        unsigned char ch = (unsigned char) *pszIn++; cchIn--;
+        if (!ch)
+static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
+{
+    int   rc  = VINF_SUCCESS;
+    while (true)
+    {
+        RTUNICP Cp;
+        size_t cchCp;
+        rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
+        if (Cp == 0 || RT_FAILURE(rc))
             break;
+        if (!(ch & 0x80))
+        {
+            if (RT_UNLIKELY(cch < 1))
+            {
+                RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
+                rc = VERR_BUFFER_OVERFLOW;
+                break;
+            }
+            cch--;
+            *puch++ = (unsigned char)ch;
+        }
+        else
+        {
+            if (RT_UNLIKELY(cch < 2))
+            {
+                RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
+                rc = VERR_BUFFER_OVERFLOW;
+                break;
+            }
+            cch -= 2;
+            *puch++ = 0xc0 | (ch >> 6);
+            *puch++ = 0x80 | (ch & 0x3f);
+        }
+        cchCp = RTStrCpSize(Cp);
+        if (RT_UNLIKELY(cch < cchCp))
+        {
+            RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
+            rc = VERR_BUFFER_OVERFLOW;
+            break;
+        }
+        psz = RTStrPutCp(psz, Cp);
+        cch -= cchCp;
+    }
     /* done */
+    *puch = '\0';
+    *pcch = (char *)puch - psz;
+    if (rc == VERR_END_OF_STRING)
+        rc = VINF_SUCCESS;
+    *psz = '\0';
     return rc;
+}
 …
         if (pszResult)
+        {
             rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch, &cch);
+            rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
             if (RT_SUCCESS(rc))
+            {
 …
         if (pszResult)
+        {
             rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1, &cch);
+            rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
             if (RT_SUCCESS(rc))
+            {
 …
  * @returns IPRT status code.
  * @param   psz     Pointer to the UTF-8 string.
  * @param   cch     The max length of the string. (btw cch = cb)
+ * @param   cchIn   The max length of the string. (btw cch = cb)
  *                  Use RTSTR_MAX if all of the string is to be examined.
  * @param   pcch    Where to store the length of the Latin-1 string in bytes.
  */
 static int rtUtf8CalcLatin1Length(const char *psz, size_t cch, size_t *pcch)
+{
     size_t cchOut = 0;
+static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
+{
+    size_t  cch = 0;
     while (true)
+    {
         RTUNICP Cp;
+        int rc = RTStrGetCpNEx(&psz, &cch, &Cp);
+        size_t cchCp;
+        int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
         if (Cp == 0 || rc == VERR_END_OF_STRING)
             break;
         if (RT_FAILURE(rc))
             return rc;
+        if (Cp >= 0x100)
+        cchCp = RTLatin1CpSize(Cp);
+        if (cchCp == 0)
             return VERR_NO_TRANSLATION;
         cchOut++;
+        cch += cchCp;
+    }
     /* done */
     *pcch = cchOut;
+    *pcch = cch;
     return VINF_SUCCESS;
+}
 …
+ *
  * @returns iprt status code.
  * @param   psz     The UTF-8 string to recode. This is a valid encoding.
  * @param   cch     The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
+ * @param   pszIn   The UTF-8 string to recode. This is a valid encoding.
+ * @param   cchIn   The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
  *                  The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
  * @param   pszOut  Where to store the Latin-1 string.
  * @param   cchOut  The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
+ * @param   psz     Where to store the Latin-1 string.
+ * @param   cch     The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
  */
 static int rtUtf8RecodeAsLatin1(const char *psz, size_t cch, char *pszOut, size_t cchOut)
+{
     int                     rc      = VINF_SUCCESS;
+    const unsigned char    *puch    = (const unsigned char *)psz;
     unsigned char          *puchOut = (unsigned char *)pszOut;
     while (cch > 0)
+    {
         /* read the next char and check for terminator. */
         const unsigned char uch = *puch;
         if (!uch)
+static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
+{
+    int   rc  = VINF_SUCCESS;
+    while (true)
+    {
+        RTUNICP Cp;
+        size_t cchCp;
+        rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
+        if (Cp == 0 || RT_FAILURE(rc))
             break;
         /* check for output overflow */
         if (RT_UNLIKELY(cchOut < 1))
+        {
+        cchCp = RTLatin1CpSize(Cp);
+        if (RT_UNLIKELY(cch < cchCp))
+        {
+            RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
             rc = VERR_BUFFER_OVERFLOW;
             break;
+        }
+        cchOut--;
+        /* decode and recode the code point */
+        if (!(uch & RT_BIT(7)))
+        {
+            *puchOut++ = uch;
+            puch++;
+            cch--;
+        }
+        else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
+        {
+            uint16_t uc = (puch[1] & 0x3f)
+                    | ((uint16_t)(uch     & 0x1f) << 6);
+            if (uc >= 0x100)
+            {
+                rc = VERR_NO_TRANSLATION;
+                break;
+            }
+            *puchOut++ = uc;
+            puch += 2;
+            cch -= 2;
+        }
+        else
+        {
+            rc = VERR_NO_TRANSLATION;
+            break;
+        }
+        psz = RTLatin1PutCp(psz, Cp);
+        cch -= cchCp;
+    }
     /* done */
+    *puchOut = '\0';
+    return rc;
+}
+    if (rc == VERR_END_OF_STRING)
+        rc = VINF_SUCCESS;
+    *psz = '\0';
+    return rc;
+}

Note: See TracChangeset for help on using the changeset viewer.

Changeset 31246 in vbox

Legend:

trunk/include/iprt/string.h

trunk/src/VBox/Runtime/common/string/utf-8.cpp

Download in other formats: