VirtualBox

Changeset 31246 in vbox


Ignore:
Timestamp:
Jul 30, 2010 1:24:53 PM (14 years ago)
Author:
vboxsync
Message:

Runtime/string: clean up the UTF-8 <-> Latin1 APIs a bit more

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/include/iprt/string.h

    r31221 r31246  
    2929#include <iprt/cdefs.h>
    3030#include <iprt/types.h>
     31#include <iprt/assert.h>
    3132#include <iprt/stdarg.h>
     33#include <iprt/uni.h> /* for RTUNICP_INVALID */
    3234#include <iprt/err.h> /* for VINF_SUCCESS */
    3335#if defined(RT_OS_LINUX) && defined(__KERNEL__)
     
    11571159
    11581160/**
     1161 * Get the UTF-8 size in characters of a given Unicode code point.  The code
     1162 * point is expected to be a valid Unicode one, but not necessarily in the
     1163 * range supported by UTF-8.
     1164 *
     1165 * @returns the size in characters, or zero if there is no UTF-8 encoding
     1166 */
     1167DECLINLINE(size_t) RTStrCpSize(RTUNICP CodePoint)
     1168{
     1169    if (CodePoint < 0x80)
     1170        return 1;
     1171    if (CodePoint < 0x800)
     1172        return 2;
     1173    if (CodePoint < 0x10000)
     1174        return 3;
     1175    if (CodePoint < 0x11000)
     1176        return 4;
     1177    return 0;
     1178}
     1179
     1180/**
    11591181 * Put the unicode code point at the given string position
    11601182 * and return the pointer to the char following it.
     
    12071229 */
    12081230RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz);
     1231
     1232/**
     1233 * Get the unicode code point at the given string position.
     1234 *
     1235 * @returns unicode code point.
     1236 * @returns RTUNICP_INVALID if the encoding is invalid.
     1237 * @param   psz         The string.
     1238 */
     1239DECLINLINE(RTUNICP) RTLatin1GetCp(const char *psz)
     1240{
     1241    return *(const unsigned char *)psz;
     1242}
     1243
     1244/**
     1245 * Get the unicode code point at the given string position.
     1246 *
     1247 * @returns iprt status code.
     1248 * @param   ppsz        Pointer to the string pointer. This will be updated to
     1249 *                      point to the char following the current code point.
     1250 *                      This is advanced one character forward on failure.
     1251 * @param   pCp         Where to store the code point.
     1252 *                      RTUNICP_INVALID is stored here on failure.
     1253 *
     1254 * @remark  We optimize this operation by using an inline function for
     1255 *          the most frequent and simplest sequence, the rest is
     1256 *          handled by RTStrGetCpExInternal().
     1257 */
     1258DECLINLINE(int) RTLatin1GetCpEx(const char **ppsz, PRTUNICP pCp)
     1259{
     1260    const unsigned char uch = **(const unsigned char **)ppsz;
     1261    (*ppsz)++;
     1262    *pCp = uch;
     1263    return VINF_SUCCESS;
     1264}
     1265
     1266/**
     1267 * Get the unicode code point at the given string position for a string of a
     1268 * given maximum length.
     1269 *
     1270 * @returns iprt status code.
     1271 * @retval  VERR_END_OF_STRING if *pcch is 0. *pCp is set to RTUNICP_INVALID.
     1272 *
     1273 * @param   ppsz        Pointer to the string pointer. This will be updated to
     1274 *                      point to the char following the current code point.
     1275 * @param   pcch        Pointer to the maximum string length.  This will be
     1276 *                      decremented by the size of the code point found.
     1277 * @param   pCp         Where to store the code point.
     1278 *                      RTUNICP_INVALID is stored here on failure.
     1279 */
     1280DECLINLINE(int) RTLatin1GetCpNEx(const char **ppsz, size_t *pcch, PRTUNICP pCp)
     1281{
     1282    if (RT_LIKELY(*pcch != 0))
     1283    {
     1284        const unsigned char uch = **(const unsigned char **)ppsz;
     1285        (*ppsz)++;
     1286        (*pcch)--;
     1287        *pCp = uch;
     1288        return VINF_SUCCESS;
     1289    }
     1290    *pCp = RTUNICP_INVALID;
     1291    return VERR_END_OF_STRING;
     1292}
     1293
     1294/**
     1295 * Get the Latin-1 size in characters of a given Unicode code point.  The code
     1296 * point is expected to be a valid Unicode one, but not necessarily in the
     1297 * range supported by Latin-1.
     1298 *
     1299 * @returns the size in characters, or zero if there is no Latin-1 encoding
     1300 */
     1301DECLINLINE(size_t) RTLatin1CpSize(RTUNICP CodePoint)
     1302{
     1303    if (CodePoint < 0x100)
     1304        return 1;
     1305    return 0;
     1306}
     1307
     1308/**
     1309 * Put the unicode code point at the given string position
     1310 * and return the pointer to the char following it.
     1311 *
     1312 * This function will not consider anything at or following the
     1313 * buffer area pointed to by psz. It is therefore not suitable for
     1314 * inserting code points into a string, only appending/overwriting.
     1315 *
     1316 * @returns pointer to the char following the written code point.
     1317 * @param   psz         The string.
     1318 * @param   CodePoint   The code point to write.
     1319 *                      This should not be RTUNICP_INVALID or any other
     1320 *                      character out of the Latin-1 range.
     1321 */
     1322DECLINLINE(char *) RTLatin1PutCp(char *psz, RTUNICP CodePoint)
     1323{
     1324    AssertReturn(CodePoint < 0x100, NULL);
     1325    *psz++ = (unsigned char)CodePoint;
     1326    return psz;
     1327}
     1328
     1329/**
     1330 * Skips ahead, past the current code point.
     1331 *
     1332 * @returns Pointer to the char after the current code point.
     1333 * @param   psz     Pointer to the current code point.
     1334 * @remark  This will not move the next valid code point, only past the current one.
     1335 */
     1336DECLINLINE(char *) RTLatin1NextCp(const char *psz)
     1337{
     1338    psz++;
     1339    return (char *)psz;
     1340}
     1341
     1342/**
     1343 * Skips back to the previous code point.
     1344 *
     1345 * @returns Pointer to the char before the current code point.
     1346 * @returns pszStart on failure.
     1347 * @param   pszStart    Pointer to the start of the string.
     1348 * @param   psz         Pointer to the current code point.
     1349 */
     1350DECLINLINE(char *) RTLatin1PrevCp(const char *psz)
     1351{
     1352    psz--;
     1353    return (char *)psz;
     1354}
    12091355
    12101356
  • trunk/src/VBox/Runtime/common/string/utf-8.cpp

    r31229 r31246  
    805805{
    806806    size_t  cch = 0;
    807     while (cchIn > 0)
    808     {
    809         char ch = *psz++; cchIn--;
    810         if (!ch)
     807    while (true)
     808    {
     809        RTUNICP Cp;
     810        size_t cchCp;
     811        int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
     812        if (Cp == 0 || rc == VERR_END_OF_STRING)
    811813            break;
    812         if (!(ch & 0x80))
    813             cch++;
    814         else
    815             cch += 2;
    816     }
    817 
     814        if (RT_FAILURE(rc))
     815            return rc;
     816        cchCp = RTStrCpSize(Cp);
     817        if (cchCp == 0)
     818            return VERR_NO_TRANSLATION;
     819        cch += cchCp;
     820    }
    818821
    819822    /* done */
     
    832835 * @param   psz         Where to store the UTF-8 string.
    833836 * @param   cch         The size of the UTF-8 buffer, excluding the terminator.
    834  * @param   pcch        Where to store the number of octets actually encoded.
    835837 */
    836 static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch, size_t *pcch)
    837 {
    838     unsigned char  *puch = (unsigned char *)psz;
    839     int             rc = VINF_SUCCESS;
    840     while (cchIn > 0)
    841     {
    842         unsigned char ch = (unsigned char) *pszIn++; cchIn--;
    843         if (!ch)
     838static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
     839{
     840    int   rc  = VINF_SUCCESS;
     841
     842    while (true)
     843    {
     844        RTUNICP Cp;
     845        size_t cchCp;
     846        rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
     847        if (Cp == 0 || RT_FAILURE(rc))
    844848            break;
    845         if (!(ch & 0x80))
    846         {
    847             if (RT_UNLIKELY(cch < 1))
    848             {
    849                 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
    850                 rc = VERR_BUFFER_OVERFLOW;
    851                 break;
    852             }
    853             cch--;
    854             *puch++ = (unsigned char)ch;
    855         }
    856         else
    857         {
    858             if (RT_UNLIKELY(cch < 2))
    859             {
    860                 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
    861                 rc = VERR_BUFFER_OVERFLOW;
    862                 break;
    863             }
    864             cch -= 2;
    865             *puch++ = 0xc0 | (ch >> 6);
    866             *puch++ = 0x80 | (ch & 0x3f);
    867         }
     849        cchCp = RTStrCpSize(Cp);
     850        if (RT_UNLIKELY(cch < cchCp))
     851        {
     852            RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
     853            rc = VERR_BUFFER_OVERFLOW;
     854            break;
     855        }
     856        psz = RTStrPutCp(psz, Cp);
     857        cch -= cchCp;
    868858    }
    869859
    870860    /* done */
    871     *puch = '\0';
    872     *pcch = (char *)puch - psz;
     861    if (rc == VERR_END_OF_STRING)
     862        rc = VINF_SUCCESS;
     863    *psz = '\0';
    873864    return rc;
    874865}
     
    898889        if (pszResult)
    899890        {
    900             rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch, &cch);
     891            rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
    901892            if (RT_SUCCESS(rc))
    902893            {
     
    955946        if (pszResult)
    956947        {
    957             rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1, &cch);
     948            rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
    958949            if (RT_SUCCESS(rc))
    959950            {
     
    998989 * @returns IPRT status code.
    999990 * @param   psz     Pointer to the UTF-8 string.
    1000  * @param   cch     The max length of the string. (btw cch = cb)
     991 * @param   cchIn   The max length of the string. (btw cch = cb)
    1001992 *                  Use RTSTR_MAX if all of the string is to be examined.
    1002993 * @param   pcch    Where to store the length of the Latin-1 string in bytes.
    1003994 */
    1004 static int rtUtf8CalcLatin1Length(const char *psz, size_t cch, size_t *pcch)
    1005 {
    1006     size_t cchOut = 0;
     995static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
     996{
     997    size_t  cch = 0;
    1007998    while (true)
    1008999    {
    10091000        RTUNICP Cp;
    1010         int rc = RTStrGetCpNEx(&psz, &cch, &Cp);
     1001        size_t cchCp;
     1002        int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
    10111003        if (Cp == 0 || rc == VERR_END_OF_STRING)
    10121004            break;
    10131005        if (RT_FAILURE(rc))
    10141006            return rc;
    1015         if (Cp >= 0x100)
     1007        cchCp = RTLatin1CpSize(Cp);
     1008        if (cchCp == 0)
    10161009            return VERR_NO_TRANSLATION;
    1017         cchOut++;
     1010        cch += cchCp;
    10181011    }
    10191012
    10201013    /* done */
    1021     *pcch = cchOut;
     1014    *pcch = cch;
    10221015    return VINF_SUCCESS;
    10231016}
     
    10301023 *
    10311024 * @returns iprt status code.
    1032  * @param   psz     The UTF-8 string to recode. This is a valid encoding.
    1033  * @param   cch     The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
     1025 * @param   pszIn   The UTF-8 string to recode. This is a valid encoding.
     1026 * @param   cchIn   The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
    10341027 *                  The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
    1035  * @param   pszOut  Where to store the Latin-1 string.
    1036  * @param   cchOut  The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
     1028 * @param   psz     Where to store the Latin-1 string.
     1029 * @param   cch     The number of characters the psz buffer can hold, excluding the terminator ('\\0').
    10371030 */
    1038 static int rtUtf8RecodeAsLatin1(const char *psz, size_t cch, char *pszOut, size_t cchOut)
    1039 {
    1040     int                     rc      = VINF_SUCCESS;
    1041     const unsigned char    *puch    = (const unsigned char *)psz;
    1042     unsigned char          *puchOut = (unsigned char *)pszOut;
    1043     while (cch > 0)
    1044     {
    1045         /* read the next char and check for terminator. */
    1046         const unsigned char uch = *puch;
    1047         if (!uch)
     1031static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
     1032{
     1033    int   rc  = VINF_SUCCESS;
     1034
     1035    while (true)
     1036    {
     1037        RTUNICP Cp;
     1038        size_t cchCp;
     1039        rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
     1040        if (Cp == 0 || RT_FAILURE(rc))
    10481041            break;
    1049 
    1050         /* check for output overflow */
    1051         if (RT_UNLIKELY(cchOut < 1))
    1052         {
     1042        cchCp = RTLatin1CpSize(Cp);
     1043        if (RT_UNLIKELY(cch < cchCp))
     1044        {
     1045            RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
    10531046            rc = VERR_BUFFER_OVERFLOW;
    10541047            break;
    10551048        }
    1056         cchOut--;
    1057 
    1058         /* decode and recode the code point */
    1059         if (!(uch & RT_BIT(7)))
    1060         {
    1061             *puchOut++ = uch;
    1062             puch++;
    1063             cch--;
    1064         }
    1065         else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
    1066         {
    1067             uint16_t uc = (puch[1] & 0x3f)
    1068                     | ((uint16_t)(uch     & 0x1f) << 6);
    1069             if (uc >= 0x100)
    1070             {
    1071                 rc = VERR_NO_TRANSLATION;
    1072                 break;
    1073             }
    1074             *puchOut++ = uc;
    1075             puch += 2;
    1076             cch -= 2;
    1077         }
    1078         else
    1079         {
    1080             rc = VERR_NO_TRANSLATION;
    1081             break;
    1082         }
     1049        psz = RTLatin1PutCp(psz, Cp);
     1050        cch -= cchCp;
    10831051    }
    10841052
    10851053    /* done */
    1086     *puchOut = '\0';
    1087     return rc;
    1088 }
     1054    if (rc == VERR_END_OF_STRING)
     1055        rc = VINF_SUCCESS;
     1056    *psz = '\0';
     1057    return rc;
     1058}
     1059
    10891060
    10901061
Note: See TracChangeset for help on using the changeset viewer.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette