utf-8.cpp

Timestamp:

Dec 10, 2009 1:22:48 PM (15 years ago)

Author:

vboxsync

Message:

IPRT: splitting up utf-8.cpp

File:

1 edited

trunk/src/VBox/Runtime/common/string/utf-8.cpp (modified) (2 diffs)

Legend:

* Unmodified
* Added
* Removed

trunk/src/VBox/Runtime/common/string/utf-8.cpp

-              r25000
+              r25296
 /*
  * Copyright (C) 2006-2007 Sun Microsystems, Inc.
+ * Copyright (C) 2006-2009 Sun Microsystems, Inc.
+ *
  * This file is part of VirtualBox Open Source Edition (OSE), as
 …
 RT_EXPORT_SYMBOL(RTStrPrevCp);
-/**
- * Performs a case sensitive string compare between two UTF-8 strings.
+ *
- * Encoding errors are ignored by the current implementation. So, the only
- * difference between this and the CRT strcmp function is the handling of
- * NULL arguments.
+ *
- * @returns < 0 if the first string less than the second string.
- * @returns 0 if the first string identical to the second string.
- * @returns > 0 if the first string greater than the second string.
- * @param   psz1        First UTF-8 string. Null is allowed.
- * @param   psz2        Second UTF-8 string. Null is allowed.
- */
-RTDECL(int) RTStrCmp(const char *psz1, const char *psz2)
+{
-    if (psz1 == psz2)
-        return 0;
-    if (!psz1)
-        return -1;
-    if (!psz2)
-        return 1;
-    return strcmp(psz1, psz2);
+}
-RT_EXPORT_SYMBOL(RTStrCmp);
-/**
- * Performs a case sensitive string compare between two UTF-8 strings, given
- * a maximum string length.
+ *
- * Encoding errors are ignored by the current implementation. So, the only
- * difference between this and the CRT strncmp function is the handling of
- * NULL arguments.
+ *
- * @returns < 0 if the first string less than the second string.
- * @returns 0 if the first string identical to the second string.
- * @returns > 0 if the first string greater than the second string.
- * @param   psz1        First UTF-8 string. Null is allowed.
- * @param   psz2        Second UTF-8 string. Null is allowed.
- * @param   cchMax      The maximum string length
- */
-RTDECL(int) RTStrNCmp(const char *psz1, const char *psz2, size_t cchMax)
+{
-    if (psz1 == psz2)
-        return 0;
-    if (!psz1)
-        return -1;
-    if (!psz2)
-        return 1;
-    return strncmp(psz1, psz2, cchMax);
+}
-RT_EXPORT_SYMBOL(RTStrNCmp);
-/**
- * Performs a case insensitive string compare between two UTF-8 strings.
+ *
- * This is a simplified compare, as only the simplified lower/upper case folding
- * specified by the unicode specs are used. It does not consider character pairs
- * as they are used in some languages, just simple upper & lower case compares.
+ *
- * The result is the difference between the mismatching codepoints after they
- * both have been lower cased.
+ *
- * If the string encoding is invalid the function will assert (strict builds)
- * and use RTStrCmp for the remainder of the string.
+ *
- * @returns < 0 if the first string less than the second string.
- * @returns 0 if the first string identical to the second string.
- * @returns > 0 if the first string greater than the second string.
- * @param   psz1        First UTF-8 string. Null is allowed.
- * @param   psz2        Second UTF-8 string. Null is allowed.
- */
-RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
+{
-    if (psz1 == psz2)
-        return 0;
-    if (!psz1)
-        return -1;
-    if (!psz2)
-        return 1;
-    const char *pszStart1 = psz1;
-    for (;;)
+    {
-        /* Get the codepoints */
-        RTUNICP cp1;
-        int rc = RTStrGetCpEx(&psz1, &cp1);
-        if (RT_FAILURE(rc))
+        {
-            AssertRC(rc);
-            psz1--;
-            break;
+        }
-        RTUNICP cp2;
-        rc = RTStrGetCpEx(&psz2, &cp2);
-        if (RT_FAILURE(rc))
+        {
-            AssertRC(rc);
-            psz2--;
-            psz1 = RTStrPrevCp(pszStart1, psz1);
-            break;
+        }
-        /* compare */
-        int iDiff = cp1 - cp2;
-        if (iDiff)
+        {
-            iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
-            if (iDiff)
+            {
-                iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
-                if (iDiff)
-                    return iDiff;
+            }
+        }
-        /* hit the terminator? */
-        if (!cp1)
-            return 0;
+    }
-    /* Hit some bad encoding, continue in case insensitive mode. */
-    return RTStrCmp(psz1, psz2);
+}
-RT_EXPORT_SYMBOL(RTStrICmp);
-/**
- * Performs a case insensitive string compare between two UTF-8 strings, given a
- * maximum string length.
+ *
- * This is a simplified compare, as only the simplified lower/upper case folding
- * specified by the unicode specs are used. It does not consider character pairs
- * as they are used in some languages, just simple upper & lower case compares.
+ *
- * The result is the difference between the mismatching codepoints after they
- * both have been lower cased.
+ *
- * If the string encoding is invalid the function will assert (strict builds)
- * and use RTStrCmp for the remainder of the string.
+ *
- * @returns < 0 if the first string less than the second string.
- * @returns 0 if the first string identical to the second string.
- * @returns > 0 if the first string greater than the second string.
- * @param   psz1        First UTF-8 string. Null is allowed.
- * @param   psz2        Second UTF-8 string. Null is allowed.
- * @param   cchMax      Maximum string length
- */
-RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
+{
-    if (cchMax == 0)
-        return 0;
-    if (psz1 == psz2)
-        return 0;
-    if (!psz1)
-        return -1;
-    if (!psz2)
-        return 1;
-    for (;;)
+    {
-        /* Get the codepoints */
-        RTUNICP cp1;
-        size_t cchMax2 = cchMax;
-        int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);
-        if (RT_FAILURE(rc))
+        {
-            AssertRC(rc);
-            psz1--;
-            cchMax++;
-            break;
+        }
-        RTUNICP cp2;
-        rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);
-        if (RT_FAILURE(rc))
+        {
-            AssertRC(rc);
-            psz2--;
-            psz1 -= (cchMax - cchMax2 + 1);  /* This can't overflow, can it? */
-            cchMax = cchMax2 + 1;
-            break;
+        }
-        /* compare */
-        int iDiff = cp1 - cp2;
-        if (iDiff)
+        {
-            iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
-            if (iDiff)
+            {
-                iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
-                if (iDiff)
-                    return iDiff;
+            }
+        }
-        /* hit the terminator? */
-        if (!cp1 || cchMax == 0)
-            return 0;
+    }
-    /* Hit some bad encoding, continue in case insensitive mode. */
-    return RTStrNCmp(psz1, psz2, cchMax);
+}
-RT_EXPORT_SYMBOL(RTStrNICmp);
-RTDECL(char *) RTStrStr(const char *pszHaystack, const char *pszNeedle)
+{
-    /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
-    if (!pszHaystack)
-        return NULL;
-    if (!pszNeedle)
-        return NULL;
-    /* The rest is CRT. */
-    return (char *)strstr(pszHaystack, pszNeedle);
+}
-RT_EXPORT_SYMBOL(RTStrStr);
-RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
+{
-    /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
-    if (!pszHaystack)
-        return NULL;
-    if (!pszNeedle)
-        return NULL;
-    /* The empty string matches everything. */
-    if (!*pszNeedle)
-        return (char *)pszHaystack;
-    /*
-     * The search strategy is to pick out the first char of the needle, fold it,
-     * and match it against the haystack code point by code point. When encountering
-     * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
-     */
-    const char * const pszNeedleStart = pszNeedle;
-    RTUNICP Cp0;
-    RTStrGetCpEx(&pszNeedle, &Cp0);     /* pszNeedle is advanced one code point. */
-    size_t const    cchNeedle   = strlen(pszNeedle);
-    size_t const    cchNeedleCp0= pszNeedle - pszNeedleStart;
-    RTUNICP const   Cp0Lower    = RTUniCpToLower(Cp0);
-    RTUNICP const   Cp0Upper    = RTUniCpToUpper(Cp0);
-    if (    Cp0Lower == Cp0Upper
-        &&  Cp0Lower == Cp0)
+    {
-        /* Cp0 is not a case sensitive char. */
-        for (;;)
+        {
-            RTUNICP Cp;
-            RTStrGetCpEx(&pszHaystack, &Cp);
-            if (!Cp)
-                break;
-            if (    Cp == Cp0
-                &&  !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
-                return (char *)pszHaystack - cchNeedleCp0;
+        }
+    }
-    else if (   Cp0Lower == Cp0
-             || Cp0Upper != Cp0)
+    {
-        /* Cp0 is case sensitive */
-        for (;;)
+        {
-            RTUNICP Cp;
-            RTStrGetCpEx(&pszHaystack, &Cp);
-            if (!Cp)
-                break;
-            if (    (   Cp == Cp0Upper
-                     || Cp == Cp0Lower)
-                &&  !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
-                return (char *)pszHaystack - cchNeedleCp0;
+        }
+    }
-    else
+    {
-        /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
-        for (;;)
+        {
-            RTUNICP Cp;
-            RTStrGetCpEx(&pszHaystack, &Cp);
-            if (!Cp)
-                break;
-            if (    (   Cp == Cp0
-                     || Cp == Cp0Upper
-                     || Cp == Cp0Lower)
-                &&  !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
-                return (char *)pszHaystack - cchNeedleCp0;
+        }
+    }
-    return NULL;
+}
-RT_EXPORT_SYMBOL(RTStrIStr);
-RTDECL(char *) RTStrToLower(char *psz)
+{
-    /*
-     * Loop the code points in the string, converting them one by one.
-     * ASSUMES that the code points for upper and lower case are encoded
-     *         with the exact same length.
-     */
-    /** @todo Handled bad encodings correctly+quietly, remove assumption,
-     *        optimize. */
-    char *pszCur = psz;
-    while (*pszCur)
+    {
-        RTUNICP cp = RTStrGetCp(pszCur);
-        cp = RTUniCpToLower(cp);
-        pszCur = RTStrPutCp(pszCur, cp);
+    }
-    return psz;
+}
-RT_EXPORT_SYMBOL(RTStrToLower);
-RTDECL(char *) RTStrToUpper(char *psz)
+{
-    /*
-     * Loop the code points in the string, converting them one by one.
-     * ASSUMES that the code points for upper and lower case are encoded
-     *         with the exact same length.
-     */
-    /** @todo Handled bad encodings correctly+quietly, remove assumption,
-     *        optimize. */
-    char *pszCur = psz;
-    while(*pszCur)
+    {
-        RTUNICP cp = RTStrGetCp(pszCur);
-        cp = RTUniCpToUpper(cp);
-        pszCur = RTStrPutCp(pszCur, cp);
+    }
-    return psz;
+}
-RT_EXPORT_SYMBOL(RTStrToUpper);

Note: See TracChangeset for help on using the changeset viewer.

Changeset 25296 in vbox for trunk/src/VBox/Runtime/common/string/utf-8.cpp

Legend:

trunk/src/VBox/Runtime/common/string/utf-8.cpp

Download in other formats: