Changeset 35567 in vbox

Timestamp:

Jan 14, 2011 2:16:45 PM (14 years ago)

Author:

vboxsync

svn:sync-xref-src-repo-rev:

69451

Message:

IPRT: fix rare crash in MiniString::substr(); rename substr() to substrCP() and add a substr that operates on bytes, not codepoints; more to come

Location:

trunk

Files:

: 3 edited

include/iprt/cpp/ministring.h (modified) (5 diffs)
src/VBox/Runtime/common/string/ministring.cpp (modified) (2 diffs)
src/VBox/Runtime/testcase/tstIprtMiniString.cpp (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

trunk/include/iprt/cpp/ministring.h

-              r35128
+              r35567
      * String length in bytes.
+     *
      * Returns the length of the member string, which is equal to strlen(c_str()).
      * In other words, this does not count unicode codepoints but returns the number
      * of bytes.  This is always cached so calling this is cheap and requires no
+     * Returns the length of the member string in bytes, which is equal to strlen(c_str()).
+     * In other words, this does not count unicode codepoints; use utf8length() for that.
+     * The byte length is always cached so calling this is cheap and requires no
      * strlen() invocation.
+     *
 …
+    {
         return m_cch;
+    }
+    /**
+     * String length in UTF-8 codepoints.
+     *
+     * As opposed to length(), which returns the length in bytes, this counts the number
+     * of UTF-8 codepoints. This is *not* cached so calling this is expensive.
+     *
+     * @returns Number of codepoints in the member string.
+     */
+    size_t utf8length() const
+    {
+        return m_psz ? RTStrUniLen(m_psz) : 0; // a null m_psz is reported as zero codepoints; RTStrUniLen is only called on a non-null buffer
+    }
 …
      * Find the given substring.
+     *
      * Looks for pcszFind in "this" starting at "pos" and returns its position,
      * counting from the beginning of "this" at 0.
+     * Looks for pcszFind in "this" starting at "pos" and returns its position
+     * as a byte (not codepoint) offset, counting from the beginning of "this" at 0.
+     *
      * @param   pcszFind        The substring to find.
 …
      * Returns a substring of "this" as a new Utf8Str.
+     *
+     * Works exactly like its equivalent in std::string except that this interprets
+     * pos and n as unicode codepoints instead of bytes.  With the default
+     * parameters "0" and "npos", this always copies the entire string.
+     * Works exactly like its equivalent in std::string. With the default
+     * parameters "0" and "npos", this always copies the entire string. The
+     * "pos" and "n" arguments represent bytes; it is the caller's responsibility
+     * to ensure that the offsets do not copy invalid UTF-8 sequences. When
+     * used in conjunction with find() and length(), this will work.
+     *
+     * @param   pos             Byte offset of the first byte to copy from "this", counting from 0.
+     * @param   n               Number of bytes to copy, starting with the one at "pos".
+     *                          The copying will stop if the null terminator is encountered before
+     *                          n bytes have been copied.
+     */
+    iprt::MiniString substr(size_t pos = 0, size_t n = npos) const
+    {
+        return MiniString(*this, pos, n); // delegates to the (source, pos, n) constructor; pos and n are passed through unchanged
+    }
+    /**
+     * Returns a substring of "this" as a new Utf8Str. As opposed to substr(),
+     * this variant takes codepoint offsets instead of byte offsets.
+     *
      * @param   pos             Index of first unicode codepoint to copy from
 …
      *                          terminator is encountered before n codepoints have
      *                          been copied.
+     *
+     * @remarks This works on code points, not bytes!
+     */
+    iprt::MiniString substr(size_t pos = 0, size_t n = npos) const;
+     */
+    iprt::MiniString substrCP(size_t pos = 0, size_t n = npos) const;
     /**

trunk/src/VBox/Runtime/common/string/ministring.cpp

-              r35128
+              r35567
+}
 MiniString MiniString::substr(size_t pos /*= 0*/, size_t n /*= npos*/)
+MiniString MiniString::substrCP(size_t pos /*= 0*/, size_t n /*= npos*/)
     const
+{
 …
                 size_t cbCopy = psz - pFirst;
+                ret.reserve(cbCopy + 1); // may throw bad_alloc
+#ifndef RT_EXCEPTIONS_ENABLED
+                AssertRelease(capacity() >= cbCopy + 1);
+#endif
+                memcpy(ret.m_psz, pFirst, cbCopy);
+                ret.m_cch = cbCopy;
+                ret.m_psz[cbCopy] = '\0';
+                if (cbCopy)
+                {
+                    ret.reserve(cbCopy + 1); // may throw bad_alloc
+#ifndef RT_EXCEPTIONS_ENABLED
+                    AssertRelease(capacity() >= cbCopy + 1);
+#endif
+                    memcpy(ret.m_psz, pFirst, cbCopy);
+                    ret.m_cch = cbCopy;
+                    ret.m_psz[cbCopy] = '\0';
+                }
+            }
+        }

trunk/src/VBox/Runtime/testcase/tstIprtMiniString.cpp

-              r33862
+              r35567
     CHECK_EQUAL(SubStr15, "cdef");
+    /* substr() and substrCP() functions */
+    iprt::MiniString strTest("");
+    CHECK_EQUAL(strTest.substr(0), "");
+    CHECK_EQUAL(strTest.substrCP(0), "");
+    CHECK_EQUAL(strTest.substr(1), "");
+    CHECK_EQUAL(strTest.substrCP(1), "");
+    /* now let's have some non-ASCII to chew on */
+    strTest = "abcdefßäbcdef";
+            // 13 codepoints, but 15 bytes (excluding null terminator);
+            // "ß" and "ä" consume two bytes each
+    CHECK_EQUAL(strTest.substr(0),   strTest.c_str());
+    CHECK_EQUAL(strTest.substrCP(0), strTest.c_str());
+    CHECK_EQUAL(strTest.substr(2),   "cdefßäbcdef");
+    CHECK_EQUAL(strTest.substrCP(2), "cdefßäbcdef");
+    CHECK_EQUAL(strTest.substr(2, 2),   "cd");
+    CHECK_EQUAL(strTest.substrCP(2, 2), "cd");
+    CHECK_EQUAL(strTest.substr(6),   "ßäbcdef");
+    CHECK_EQUAL(strTest.substrCP(6), "ßäbcdef");
+    CHECK_EQUAL(strTest.substr(6, 2),   "ß");           // UTF-8 "ß" consumes two bytes
+    CHECK_EQUAL(strTest.substrCP(6, 1), "ß");
+    CHECK_EQUAL(strTest.substr(8),   "äbcdef");         // UTF-8 "ß" consumes two bytes
+    CHECK_EQUAL(strTest.substrCP(7), "äbcdef");
+    CHECK_EQUAL(strTest.substr(8, 3),   "äb");          // UTF-8 "ä" consumes two bytes
+    CHECK_EQUAL(strTest.substrCP(7, 2), "äb");
+    CHECK_EQUAL(strTest.substr(14, 1),   "f");
+    CHECK_EQUAL(strTest.substrCP(12, 1), "f");
+    CHECK_EQUAL(strTest.substr(15, 1),   "");
+    CHECK_EQUAL(strTest.substrCP(13, 1), "");
+    CHECK_EQUAL(strTest.substr(16, 1),   "");
+    CHECK_EQUAL(strTest.substrCP(15, 1), "");
     /* special constructor and assignment arguments */

Note: See TracChangeset for help on using the changeset viewer.

Changeset 35567 in vbox

Legend:

trunk/include/iprt/cpp/ministring.h

trunk/src/VBox/Runtime/common/string/ministring.cpp

trunk/src/VBox/Runtime/testcase/tstIprtMiniString.cpp

Download in other formats: