Changeset 25296 in vbox for trunk/src/VBox/Runtime/common/string/RTStrStr.cpp
- Timestamp:
- Dec 10, 2009 1:22:48 PM (15 years ago)
- File:
-
- 1 copied
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/string/RTStrStr.cpp
r25278 r25296 1 1 /* $Id$ */ 2 2 /** @file 3 * IPRT - UTF-8 Decoding.3 * IPRT - RTStrStr. 4 4 */ 5 5 6 6 /* 7 * Copyright (C) 2006-200 7Sun Microsystems, Inc.7 * Copyright (C) 2006-2009 Sun Microsystems, Inc. 8 8 * 9 9 * This file is part of VirtualBox Open Source Edition (OSE), as … … 36 36 #include "internal/iprt.h" 37 37 38 #include <iprt/uni.h>39 #include <iprt/alloc.h>40 #include <iprt/assert.h>41 #include <iprt/err.h>42 #include "internal/string.h"43 44 45 46 /**47 * Get get length in code points of a UTF-8 encoded string.48 * The string is validated while doing this.49 *50 * @returns IPRT status code.51 * @param psz Pointer to the UTF-8 string.52 * @param cch The max length of the string. (btw cch = cb)53 * Use RTSTR_MAX if all of the string is to be examined.54 * @param pcuc Where to store the length in unicode code points.55 * @param pcchActual Where to store the actual size of the UTF-8 string56 * on success (cch = cb again). Optional.57 */58 static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)59 {60 const unsigned char *puch = (const unsigned char *)psz;61 size_t cCodePoints = 0;62 while (cch > 0)63 {64 const unsigned char uch = *puch;65 if (!uch)66 break;67 if (uch & RT_BIT(7))68 {69 /* figure sequence length and validate the first byte */70 unsigned cb;71 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))72 cb = 2;73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))74 cb = 3;75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))76 cb = 4;77 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))78 cb = 5;79 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))80 cb = 6;81 else82 {83 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));84 return VERR_INVALID_UTF8_ENCODING;85 }86 87 /* check length */88 if (cb > cch)89 {90 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));91 return VERR_INVALID_UTF8_ENCODING;92 }93 94 /* validate the rest */95 switch (cb)96 {97 case 6:98 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);99 case 5:100 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);101 case 4:102 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);103 case 3:104 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);105 case 2:106 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);107 break;108 }109 110 /* validate the code point. */111 RTUNICP uc;112 switch (cb)113 {114 case 6:115 uc = (puch[5] & 0x3f)116 | ((RTUNICP)(puch[4] & 0x3f) << 6)117 | ((RTUNICP)(puch[3] & 0x3f) << 12)118 | ((RTUNICP)(puch[2] & 0x3f) << 18)119 | ((RTUNICP)(puch[1] & 0x3f) << 24)120 | ((RTUNICP)(uch & 0x01) << 30);121 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,122 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);123 break;124 case 5:125 uc = (puch[4] & 0x3f)126 | ((RTUNICP)(puch[3] & 0x3f) << 6)127 | ((RTUNICP)(puch[2] & 0x3f) << 12)128 | ((RTUNICP)(puch[1] & 0x3f) << 18)129 | ((RTUNICP)(uch & 0x03) << 24);130 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,131 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);132 break;133 case 4:134 uc = (puch[3] & 0x3f)135 | ((RTUNICP)(puch[2] & 0x3f) << 6)136 | ((RTUNICP)(puch[1] & 0x3f) << 12)137 | ((RTUNICP)(uch & 0x07) << 18);138 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,139 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);140 break;141 case 3:142 uc = (puch[2] & 0x3f)143 | ((RTUNICP)(puch[1] & 0x3f) << 6)144 | ((RTUNICP)(uch & 0x0f) << 12);145 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,146 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),147 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);148 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,149 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);150 break;151 case 2:152 uc = (puch[1] & 0x3f)153 | ((RTUNICP)(uch & 0x1f) << 6);154 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,155 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);156 break;157 }158 159 /* advance */160 cch -= cb;161 puch += cb;162 }163 else164 {165 /* one ASCII byte */166 puch++;167 cch--;168 }169 cCodePoints++;170 }171 172 /* done */173 *pcuc = cCodePoints;174 if (pcchActual)175 *pcchActual = puch - (unsigned char const *)psz;176 return VINF_SUCCESS;177 }178 179 180 /**181 * Decodes and UTF-8 string into an array of unicode code point.182 *183 * Since we know the input is valid, we do *not* perform encoding or length checks.184 *185 * @returns iprt status code.186 * @param psz The UTF-8 string to recode. This is a valid encoding.187 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.188 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.189 * @param paCps Where to store the code points array.190 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').191 */192 static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)193 {194 int rc = VINF_SUCCESS;195 const unsigned char *puch = (const unsigned char *)psz;196 PRTUNICP pCp = paCps;197 while (cch > 0)198 {199 /* read the next char and check for terminator. */200 const unsigned char uch = *puch;201 if (!uch)202 break;203 204 /* check for output overflow */205 if (RT_UNLIKELY(cCps < 1))206 {207 rc = VERR_BUFFER_OVERFLOW;208 break;209 }210 cCps--;211 212 /* decode and recode the code point */213 if (!(uch & RT_BIT(7)))214 {215 *pCp++ = uch;216 puch++;217 cch--;218 }219 #ifdef RT_STRICT220 else if (!(uch & RT_BIT(6)))221 AssertMsgFailed(("Internal error!\n"));222 #endif223 else if (!(uch & RT_BIT(5)))224 {225 *pCp++ = (puch[1] & 0x3f)226 | ((uint16_t)(uch & 0x1f) << 6);227 puch += 2;228 cch -= 2;229 }230 else if (!(uch & RT_BIT(4)))231 {232 *pCp++ = (puch[2] & 0x3f)233 | ((uint16_t)(puch[1] & 0x3f) << 6)234 | ((uint16_t)(uch & 0x0f) << 12);235 puch += 3;236 cch -= 3;237 }238 else if (!(uch & RT_BIT(3)))239 {240 *pCp++ = (puch[3] & 0x3f)241 | ((RTUNICP)(puch[2] & 0x3f) << 6)242 | ((RTUNICP)(puch[1] & 0x3f) << 12)243 | ((RTUNICP)(uch & 0x07) << 18);244 puch += 4;245 cch -= 4;246 }247 else if (!(uch & RT_BIT(2)))248 {249 *pCp++ = (puch[4] & 0x3f)250 | ((RTUNICP)(puch[3] & 0x3f) << 6)251 | ((RTUNICP)(puch[2] & 0x3f) << 12)252 | ((RTUNICP)(puch[1] & 0x3f) << 18)253 | ((RTUNICP)(uch & 0x03) << 24);254 puch += 5;255 cch -= 6;256 }257 else258 {259 Assert(!(uch & RT_BIT(1)));260 *pCp++ = (puch[5] & 0x3f)261 | ((RTUNICP)(puch[4] & 0x3f) << 6)262 | ((RTUNICP)(puch[3] & 0x3f) << 12)263 | ((RTUNICP)(puch[2] & 0x3f) << 18)264 | ((RTUNICP)(puch[1] & 0x3f) << 24)265 | ((RTUNICP)(uch & 0x01) << 30);266 puch += 6;267 cch -= 6;268 }269 }270 271 /* done */272 *pCp = 0;273 return rc;274 }275 276 277 RTDECL(size_t) RTStrUniLen(const char *psz)278 {279 size_t cCodePoints;280 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);281 return RT_SUCCESS(rc) ? cCodePoints : 0;282 }283 RT_EXPORT_SYMBOL(RTStrUniLen);284 285 286 RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)287 {288 size_t cCodePoints;289 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);290 if (pcCps)291 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;292 return rc;293 }294 RT_EXPORT_SYMBOL(RTStrUniLenEx);295 296 297 RTDECL(int) RTStrValidateEncoding(const char *psz)298 {299 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);300 }301 RT_EXPORT_SYMBOL(RTStrValidateEncoding);302 303 304 RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)305 {306 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);307 AssertPtr(psz);308 309 /*310 * Use rtUtf8Length for the job.311 */312 size_t cchActual;313 size_t cCpsIgnored;314 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);315 if (RT_SUCCESS(rc))316 {317 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)318 && cchActual >= cch)319 rc = VERR_BUFFER_OVERFLOW;320 }321 return rc;322 323 324 return RTStrUniLenEx(psz, cch, &cCpsIgnored);325 }326 RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);327 328 329 RTDECL(bool) RTStrIsValidEncoding(const char *psz)330 {331 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);332 return RT_SUCCESS(rc);333 }334 RT_EXPORT_SYMBOL(RTStrIsValidEncoding);335 336 337 RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)338 {339 /*340 * Validate input.341 */342 Assert(VALID_PTR(pszString));343 Assert(VALID_PTR(ppaCps));344 *ppaCps = NULL;345 346 /*347 * Validate the UTF-8 input and count its code points.348 */349 size_t cCps;350 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);351 if (RT_SUCCESS(rc))352 {353 /*354 * Allocate buffer.355 */356 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));357 if (paCps)358 {359 /*360 * Decode the string.361 */362 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);363 if (RT_SUCCESS(rc))364 {365 *ppaCps = paCps;366 return rc;367 }368 RTMemFree(paCps);369 }370 else371 rc = VERR_NO_CODE_POINT_MEMORY;372 }373 return rc;374 }375 RT_EXPORT_SYMBOL(RTStrToUni);376 377 378 RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)379 {380 /*381 * Validate input.382 */383 Assert(VALID_PTR(pszString));384 Assert(VALID_PTR(ppaCps));385 Assert(!pcCps || VALID_PTR(pcCps));386 387 /*388 * Validate the UTF-8 input and count the code points.389 */390 size_t cCpsResult;391 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);392 if (RT_SUCCESS(rc))393 {394 if (pcCps)395 *pcCps = cCpsResult;396 397 /*398 * Check buffer size / Allocate buffer.399 */400 bool fShouldFree;401 PRTUNICP paCpsResult;402 if (cCps > 0 && *ppaCps)403 {404 fShouldFree = false;405 if (cCps <= cCpsResult)406 return VERR_BUFFER_OVERFLOW;407 paCpsResult = *ppaCps;408 }409 else410 {411 *ppaCps = NULL;412 fShouldFree = true;413 cCps = RT_MAX(cCpsResult + 1, cCps);414 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));415 }416 if (paCpsResult)417 {418 /*419 * Encode the UTF-16 string.420 */421 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);422 if (RT_SUCCESS(rc))423 {424 *ppaCps = paCpsResult;425 return rc;426 }427 if (fShouldFree)428 RTMemFree(paCpsResult);429 }430 else431 rc = VERR_NO_CODE_POINT_MEMORY;432 }433 return rc;434 }435 RT_EXPORT_SYMBOL(RTStrToUniEx);436 437 438 /**439 * Calculates the UTF-16 length of a string, validating the encoding while doing so.440 *441 * @returns IPRT status code.442 * @param psz Pointer to the UTF-8 string.443 * @param cch The max length of the string. (btw cch = cb)444 * Use RTSTR_MAX if all of the string is to be examined.s445 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.446 */447 static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)448 {449 const unsigned char *puch = (const unsigned char *)psz;450 size_t cwc = 0;451 while (cch > 0)452 {453 const unsigned char uch = *puch;454 if (!uch)455 break;456 if (!(uch & RT_BIT(7)))457 {458 /* one ASCII byte */459 cwc++;460 puch++;461 cch--;462 }463 else464 {465 /* figure sequence length and validate the first byte */466 unsigned cb;467 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))468 cb = 2;469 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))470 cb = 3;471 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))472 cb = 4;473 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))474 cb = 5;475 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))476 cb = 6;477 else478 {479 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));480 return VERR_INVALID_UTF8_ENCODING;481 }482 483 /* check length */484 if (cb > cch)485 {486 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));487 return VERR_INVALID_UTF8_ENCODING;488 }489 490 /* validate the rest */491 switch (cb)492 {493 case 6:494 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);495 case 5:496 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);497 case 4:498 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);499 case 3:500 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);501 case 2:502 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);503 break;504 }505 506 /* validate the code point. */507 RTUNICP uc;508 switch (cb)509 {510 case 6:511 uc = (puch[5] & 0x3f)512 | ((RTUNICP)(puch[4] & 0x3f) << 6)513 | ((RTUNICP)(puch[3] & 0x3f) << 12)514 | ((RTUNICP)(puch[2] & 0x3f) << 18)515 | ((RTUNICP)(puch[1] & 0x3f) << 24)516 | ((RTUNICP)(uch & 0x01) << 30);517 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,518 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);519 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));520 return VERR_CANT_RECODE_AS_UTF16;521 case 5:522 uc = (puch[4] & 0x3f)523 | ((RTUNICP)(puch[3] & 0x3f) << 6)524 | ((RTUNICP)(puch[2] & 0x3f) << 12)525 | ((RTUNICP)(puch[1] & 0x3f) << 18)526 | ((RTUNICP)(uch & 0x03) << 24);527 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,528 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);529 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));530 return VERR_CANT_RECODE_AS_UTF16;531 case 4:532 uc = (puch[3] & 0x3f)533 | ((RTUNICP)(puch[2] & 0x3f) << 6)534 | ((RTUNICP)(puch[1] & 0x3f) << 12)535 | ((RTUNICP)(uch & 0x07) << 18);536 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,537 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);538 RTStrAssertMsgReturn(uc <= 0x0010ffff,539 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);540 cwc++;541 break;542 case 3:543 uc = (puch[2] & 0x3f)544 | ((RTUNICP)(puch[1] & 0x3f) << 6)545 | ((RTUNICP)(uch & 0x0f) << 12);546 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,547 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),548 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);549 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,550 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);551 break;552 case 2:553 uc = (puch[1] & 0x3f)554 | ((RTUNICP)(uch & 0x1f) << 6);555 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,556 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);557 break;558 }559 560 /* advance */561 cch -= cb;562 puch += cb;563 cwc++;564 }565 }566 567 /* done */568 *pcwc = cwc;569 return VINF_SUCCESS;570 }571 572 573 /**574 * Recodes a valid UTF-8 string as UTF-16.575 *576 * Since we know the input is valid, we do *not* perform encoding or length checks.577 *578 * @returns iprt status code.579 * @param psz The UTF-8 string to recode. This is a valid encoding.580 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.581 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.582 * @param pwsz Where to store the UTF-16 string.583 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').584 */585 static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)586 {587 int rc = VINF_SUCCESS;588 const unsigned char *puch = (const unsigned char *)psz;589 PRTUTF16 pwc = pwsz;590 while (cch > 0)591 {592 /* read the next char and check for terminator. */593 const unsigned char uch = *puch;594 if (!uch)595 break;596 597 /* check for output overflow */598 if (RT_UNLIKELY(cwc < 1))599 {600 rc = VERR_BUFFER_OVERFLOW;601 break;602 }603 cwc--;604 605 /* decode and recode the code point */606 if (!(uch & RT_BIT(7)))607 {608 *pwc++ = uch;609 puch++;610 cch--;611 }612 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))613 {614 uint16_t uc = (puch[1] & 0x3f)615 | ((uint16_t)(uch & 0x1f) << 6);616 *pwc++ = uc;617 puch += 2;618 cch -= 2;619 }620 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))621 {622 uint16_t uc = (puch[2] & 0x3f)623 | ((uint16_t)(puch[1] & 0x3f) << 6)624 | ((uint16_t)(uch & 0x0f) << 12);625 *pwc++ = uc;626 puch += 3;627 cch -= 3;628 }629 else630 {631 /* generate surrugate pair */632 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));633 RTUNICP uc = (puch[3] & 0x3f)634 | ((RTUNICP)(puch[2] & 0x3f) << 6)635 | ((RTUNICP)(puch[1] & 0x3f) << 12)636 | ((RTUNICP)(uch & 0x07) << 18);637 if (RT_UNLIKELY(cwc < 1))638 {639 rc = VERR_BUFFER_OVERFLOW;640 break;641 }642 cwc--;643 644 uc -= 0x10000;645 *pwc++ = 0xd800 | (uc >> 10);646 *pwc++ = 0xdc00 | (uc & 0x3ff);647 puch += 4;648 cch -= 4;649 }650 }651 652 /* done */653 *pwc = '\0';654 return rc;655 }656 657 658 RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)659 {660 /*661 * Validate input.662 */663 Assert(VALID_PTR(ppwszString));664 Assert(VALID_PTR(pszString));665 *ppwszString = NULL;666 667 /*668 * Validate the UTF-8 input and calculate the length of the UTF-16 string.669 */670 size_t cwc;671 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);672 if (RT_SUCCESS(rc))673 {674 /*675 * Allocate buffer.676 */677 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));678 if (pwsz)679 {680 /*681 * Encode the UTF-16 string.682 */683 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);684 if (RT_SUCCESS(rc))685 {686 *ppwszString = pwsz;687 return rc;688 }689 RTMemFree(pwsz);690 }691 else692 rc = VERR_NO_UTF16_MEMORY;693 }694 return rc;695 }696 RT_EXPORT_SYMBOL(RTStrToUtf16);697 698 699 RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)700 {701 /*702 * Validate input.703 */704 Assert(VALID_PTR(pszString));705 Assert(VALID_PTR(ppwsz));706 Assert(!pcwc || VALID_PTR(pcwc));707 708 /*709 * Validate the UTF-8 input and calculate the length of the UTF-16 string.710 */711 size_t cwcResult;712 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);713 if (RT_SUCCESS(rc))714 {715 if (pcwc)716 *pcwc = cwcResult;717 718 /*719 * Check buffer size / Allocate buffer.720 */721 bool fShouldFree;722 PRTUTF16 pwszResult;723 if (cwc > 0 && *ppwsz)724 {725 fShouldFree = false;726 if (cwc <= cwcResult)727 return VERR_BUFFER_OVERFLOW;728 pwszResult = *ppwsz;729 }730 else731 {732 *ppwsz = NULL;733 fShouldFree = true;734 cwc = RT_MAX(cwcResult + 1, cwc);735 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));736 }737 if (pwszResult)738 {739 /*740 * Encode the UTF-16 string.741 */742 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);743 if (RT_SUCCESS(rc))744 {745 *ppwsz = pwszResult;746 return rc;747 }748 if (fShouldFree)749 RTMemFree(pwszResult);750 }751 else752 rc = VERR_NO_UTF16_MEMORY;753 }754 return rc;755 }756 RT_EXPORT_SYMBOL(RTStrToUtf16Ex);757 758 759 RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)760 {761 size_t cwc;762 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);763 return RT_SUCCESS(rc) ? cwc : 0;764 }765 RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);766 767 768 RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)769 {770 size_t cwc;771 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);772 if (pcwc)773 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;774 return rc;775 }776 RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);777 778 779 /**780 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().781 * @returns rc782 * @param ppsz The pointer to the string position point.783 * @param pCp Where to store RTUNICP_INVALID.784 * @param rc The iprt error code.785 */786 static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)787 {788 /*789 * Try find a valid encoding.790 */791 (*ppsz)++; /** @todo code this! */792 *pCp = RTUNICP_INVALID;793 return rc;794 }795 796 797 RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)798 {799 RTUNICP Cp;800 RTStrGetCpExInternal(&psz, &Cp);801 return Cp;802 }803 RT_EXPORT_SYMBOL(RTStrGetCpInternal);804 805 806 RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)807 {808 const unsigned char *puch = (const unsigned char *)*ppsz;809 const unsigned char uch = *puch;810 RTUNICP uc;811 812 /* ASCII ? */813 if (!(uch & RT_BIT(7)))814 {815 uc = uch;816 puch++;817 }818 else if (uch & RT_BIT(6))819 {820 /* figure the length and validate the first octet. */821 unsigned cb;822 if (!(uch & RT_BIT(5)))823 cb = 2;824 else if (!(uch & RT_BIT(4)))825 cb = 3;826 else if (!(uch & RT_BIT(3)))827 cb = 4;828 else if (!(uch & RT_BIT(2)))829 cb = 5;830 else if (!(uch & RT_BIT(1)))831 cb = 6;832 else833 {834 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));835 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);836 }837 838 /* validate the rest */839 switch (cb)840 {841 case 6:842 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),843 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));844 case 5:845 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),846 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));847 case 4:848 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),849 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));850 case 3:851 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),852 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));853 case 2:854 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),855 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));856 break;857 }858 859 /* get and validate the code point. */860 switch (cb)861 {862 case 6:863 uc = (puch[5] & 0x3f)864 | ((RTUNICP)(puch[4] & 0x3f) << 6)865 | ((RTUNICP)(puch[3] & 0x3f) << 12)866 | ((RTUNICP)(puch[2] & 0x3f) << 18)867 | ((RTUNICP)(puch[1] & 0x3f) << 24)868 | ((RTUNICP)(uch & 0x01) << 30);869 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,870 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),871 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));872 break;873 case 5:874 uc = (puch[4] & 0x3f)875 | ((RTUNICP)(puch[3] & 0x3f) << 6)876 | ((RTUNICP)(puch[2] & 0x3f) << 12)877 | ((RTUNICP)(puch[1] & 0x3f) << 18)878 | ((RTUNICP)(uch & 0x03) << 24);879 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,880 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),881 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));882 break;883 case 4:884 uc = (puch[3] & 0x3f)885 | ((RTUNICP)(puch[2] & 0x3f) << 6)886 | ((RTUNICP)(puch[1] & 0x3f) << 12)887 | ((RTUNICP)(uch & 0x07) << 18);888 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,889 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),890 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));891 break;892 case 3:893 uc = (puch[2] & 0x3f)894 | ((RTUNICP)(puch[1] & 0x3f) << 6)895 | ((RTUNICP)(uch & 0x0f) << 12);896 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,897 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),898 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));899 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,900 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),901 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));902 break;903 case 2:904 uc = (puch[1] & 0x3f)905 | ((RTUNICP)(uch & 0x1f) << 6);906 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,907 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),908 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));909 break;910 default: /* impossible, but GCC is bitching. */911 uc = RTUNICP_INVALID;912 break;913 }914 puch += cb;915 }916 else917 {918 /* 6th bit is always set. */919 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));920 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);921 }922 *pCp = uc;923 *ppsz = (const char *)puch;924 return VINF_SUCCESS;925 }926 RT_EXPORT_SYMBOL(RTStrGetCpExInternal);927 928 929 /**930 * Handle invalid encodings passed to RTStrGetCpNEx().931 * @returns rc932 * @param ppsz The pointer to the string position point.933 * @param pcch Pointer to the string length.934 * @param pCp Where to store RTUNICP_INVALID.935 * @param rc The iprt error code.936 */937 static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)938 {939 /*940 * Try find a valid encoding.941 */942 (*ppsz)++; /** @todo code this! */943 (*pcch)--;944 *pCp = RTUNICP_INVALID;945 return rc;946 }947 948 949 RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)950 {951 const unsigned char *puch = (const unsigned char *)*ppsz;952 const unsigned char uch = *puch;953 size_t cch = *pcch;954 RTUNICP uc;955 956 if (cch == 0)957 {958 *pCp = RTUNICP_INVALID;959 return VERR_END_OF_STRING;960 }961 962 /* ASCII ? */963 if (!(uch & RT_BIT(7)))964 {965 uc = uch;966 puch++;967 cch--;968 }969 else if (uch & RT_BIT(6))970 {971 /* figure the length and validate the first octet. */972 unsigned cb;973 if (!(uch & RT_BIT(5)))974 cb = 2;975 else if (!(uch & RT_BIT(4)))976 cb = 3;977 else if (!(uch & RT_BIT(3)))978 cb = 4;979 else if (!(uch & RT_BIT(2)))980 cb = 5;981 else if (!(uch & RT_BIT(1)))982 cb = 6;983 else984 {985 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));986 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);987 }988 989 if (cb > cch)990 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);991 992 /* validate the rest */993 switch (cb)994 {995 case 6:996 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),997 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));998 case 5:999 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),1000 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));1001 case 4:1002 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),1003 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));1004 case 3:1005 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),1006 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));1007 case 2:1008 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),1009 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));1010 break;1011 }1012 1013 /* get and validate the code point. */1014 switch (cb)1015 {1016 case 6:1017 uc = (puch[5] & 0x3f)1018 | ((RTUNICP)(puch[4] & 0x3f) << 6)1019 | ((RTUNICP)(puch[3] & 0x3f) << 12)1020 | ((RTUNICP)(puch[2] & 0x3f) << 18)1021 | ((RTUNICP)(puch[1] & 0x3f) << 24)1022 | ((RTUNICP)(uch & 0x01) << 30);1023 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,1024 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),1025 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));1026 break;1027 case 5:1028 uc = (puch[4] & 0x3f)1029 | ((RTUNICP)(puch[3] & 0x3f) << 6)1030 | ((RTUNICP)(puch[2] & 0x3f) << 12)1031 | ((RTUNICP)(puch[1] & 0x3f) << 18)1032 | ((RTUNICP)(uch & 0x03) << 24);1033 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,1034 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),1035 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));1036 break;1037 case 4:1038 uc = (puch[3] & 0x3f)1039 | ((RTUNICP)(puch[2] & 0x3f) << 6)1040 | ((RTUNICP)(puch[1] & 0x3f) << 12)1041 | ((RTUNICP)(uch & 0x07) << 18);1042 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,1043 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),1044 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));1045 break;1046 case 3:1047 uc = (puch[2] & 0x3f)1048 | ((RTUNICP)(puch[1] & 0x3f) << 6)1049 | ((RTUNICP)(uch & 0x0f) << 12);1050 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,1051 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),1052 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));1053 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,1054 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),1055 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));1056 break;1057 case 2:1058 uc = (puch[1] & 0x3f)1059 | ((RTUNICP)(uch & 0x1f) << 6);1060 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,1061 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),1062 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));1063 break;1064 default: /* impossible, but GCC is bitching. */1065 uc = RTUNICP_INVALID;1066 break;1067 }1068 puch += cb;1069 cch -= cb;1070 }1071 else1072 {1073 /* 6th bit is always set. */1074 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));1075 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);1076 }1077 *pCp = uc;1078 *ppsz = (const char *)puch;1079 (*pcch) = cch;1080 return VINF_SUCCESS;1081 }1082 RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);1083 1084 1085 RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)1086 {1087 unsigned char *puch = (unsigned char *)psz;1088 if (uc < 0x80)1089 *puch++ = (unsigned char )uc;1090 else if (uc < 0x00000800)1091 {1092 *puch++ = 0xc0 | (uc >> 6);1093 *puch++ = 0x80 | (uc & 0x3f);1094 }1095 else if (uc < 0x00010000)1096 {1097 if ( uc < 0x0000d80001098 || ( uc > 0x0000dfff1099 && uc < 0x0000fffe))1100 {1101 *puch++ = 0xe0 | (uc >> 12);1102 *puch++ = 0x80 | ((uc >> 6) & 0x3f);1103 *puch++ = 0x80 | (uc & 0x3f);1104 }1105 else1106 {1107 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));1108 *puch++ = 0x7f;1109 }1110 }1111 else if (uc < 0x00200000)1112 {1113 *puch++ = 0xf0 | (uc >> 18);1114 *puch++ = 0x80 | ((uc >> 12) & 0x3f);1115 *puch++ = 0x80 | ((uc >> 6) & 0x3f);1116 *puch++ = 0x80 | (uc & 0x3f);1117 }1118 else if (uc < 0x04000000)1119 {1120 *puch++ = 0xf8 | (uc >> 24);1121 *puch++ = 0x80 | ((uc >> 18) & 0x3f);1122 *puch++ = 0x80 | ((uc >> 12) & 0x3f);1123 *puch++ = 0x80 | ((uc >> 6) & 0x3f);1124 *puch++ = 0x80 | (uc & 0x3f);1125 }1126 else if (uc <= 0x7fffffff)1127 {1128 *puch++ = 0xfc | (uc >> 30);1129 *puch++ = 0x80 | ((uc >> 24) & 0x3f);1130 *puch++ = 0x80 | ((uc >> 18) & 0x3f);1131 *puch++ = 0x80 | ((uc >> 12) & 0x3f);1132 *puch++ = 0x80 | ((uc >> 6) & 0x3f);1133 *puch++ = 0x80 | (uc & 0x3f);1134 }1135 else1136 {1137 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));1138 *puch++ = 0x7f;1139 }1140 1141 return (char *)puch;1142 }1143 RT_EXPORT_SYMBOL(RTStrPutCpInternal);1144 1145 1146 RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)1147 {1148 if (pszStart < psz)1149 {1150 /* simple char? */1151 const unsigned char *puch = (const unsigned char *)psz;1152 unsigned uch = *--puch;1153 if (!(uch & RT_BIT(7)))1154 return (char *)puch;1155 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);1156 1157 /* two or more. */1158 uint32_t uMask = 0xffffffc0;1159 while ( (const unsigned char *)pszStart < puch1160 && !(uMask & 1))1161 {1162 uch = *--puch;1163 if ((uch & 0xc0) != 0x80)1164 {1165 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),1166 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),1167 (char *)pszStart);1168 return (char *)puch;1169 }1170 uMask >>= 1;1171 }1172 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));1173 }1174 return (char *)pszStart;1175 }1176 RT_EXPORT_SYMBOL(RTStrPrevCp);1177 1178 1179 /**1180 * Performs a case sensitive string compare between two UTF-8 strings.1181 *1182 * Encoding errors are ignored by the current implementation. So, the only1183 * difference between this and the CRT strcmp function is the handling of1184 * NULL arguments.1185 *1186 * @returns < 0 if the first string less than the second string.1187 * @returns 0 if the first string identical to the second string.1188 * @returns > 0 if the first string greater than the second string.1189 * @param psz1 First UTF-8 string. Null is allowed.1190 * @param psz2 Second UTF-8 string. Null is allowed.1191 */1192 RTDECL(int) RTStrCmp(const char *psz1, const char *psz2)1193 {1194 if (psz1 == psz2)1195 return 0;1196 if (!psz1)1197 return -1;1198 if (!psz2)1199 return 1;1200 1201 return strcmp(psz1, psz2);1202 }1203 RT_EXPORT_SYMBOL(RTStrCmp);1204 1205 1206 /**1207 * Performs a case sensitive string compare between two UTF-8 strings, given1208 * a maximum string length.1209 *1210 * Encoding errors are ignored by the current implementation. So, the only1211 * difference between this and the CRT strncmp function is the handling of1212 * NULL arguments.1213 *1214 * @returns < 0 if the first string less than the second string.1215 * @returns 0 if the first string identical to the second string.1216 * @returns > 0 if the first string greater than the second string.1217 * @param psz1 First UTF-8 string. Null is allowed.1218 * @param psz2 Second UTF-8 string. Null is allowed.1219 * @param cchMax The maximum string length1220 */1221 RTDECL(int) RTStrNCmp(const char *psz1, const char *psz2, size_t cchMax)1222 {1223 if (psz1 == psz2)1224 return 0;1225 if (!psz1)1226 return -1;1227 if (!psz2)1228 return 1;1229 1230 return strncmp(psz1, psz2, cchMax);1231 }1232 RT_EXPORT_SYMBOL(RTStrNCmp);1233 1234 1235 /**1236 * Performs a case insensitive string compare between two UTF-8 strings.1237 *1238 * This is a simplified compare, as only the simplified lower/upper case folding1239 * specified by the unicode specs are used. It does not consider character pairs1240 * as they are used in some languages, just simple upper & lower case compares.1241 *1242 * The result is the difference between the mismatching codepoints after they1243 * both have been lower cased.1244 *1245 * If the string encoding is invalid the function will assert (strict builds)1246 * and use RTStrCmp for the remainder of the string.1247 *1248 * @returns < 0 if the first string less than the second string.1249 * @returns 0 if the first string identical to the second string.1250 * @returns > 0 if the first string greater than the second string.1251 * @param psz1 First UTF-8 string. Null is allowed.1252 * @param psz2 Second UTF-8 string. Null is allowed.1253 */1254 RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)1255 {1256 if (psz1 == psz2)1257 return 0;1258 if (!psz1)1259 return -1;1260 if (!psz2)1261 return 1;1262 1263 const char *pszStart1 = psz1;1264 for (;;)1265 {1266 /* Get the codepoints */1267 RTUNICP cp1;1268 int rc = RTStrGetCpEx(&psz1, &cp1);1269 if (RT_FAILURE(rc))1270 {1271 AssertRC(rc);1272 psz1--;1273 break;1274 }1275 1276 RTUNICP cp2;1277 rc = RTStrGetCpEx(&psz2, &cp2);1278 if (RT_FAILURE(rc))1279 {1280 AssertRC(rc);1281 psz2--;1282 psz1 = RTStrPrevCp(pszStart1, psz1);1283 break;1284 }1285 1286 /* compare */1287 int iDiff = cp1 - cp2;1288 if (iDiff)1289 {1290 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);1291 if (iDiff)1292 {1293 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */1294 if (iDiff)1295 return iDiff;1296 }1297 }1298 1299 /* hit the terminator? */1300 if (!cp1)1301 return 0;1302 }1303 1304 /* Hit some bad encoding, continue in case insensitive mode. */1305 return RTStrCmp(psz1, psz2);1306 }1307 RT_EXPORT_SYMBOL(RTStrICmp);1308 1309 1310 /**1311 * Performs a case insensitive string compare between two UTF-8 strings, given a1312 * maximum string length.1313 *1314 * This is a simplified compare, as only the simplified lower/upper case folding1315 * specified by the unicode specs are used. It does not consider character pairs1316 * as they are used in some languages, just simple upper & lower case compares.1317 *1318 * The result is the difference between the mismatching codepoints after they1319 * both have been lower cased.1320 *1321 * If the string encoding is invalid the function will assert (strict builds)1322 * and use RTStrCmp for the remainder of the string.1323 *1324 * @returns < 0 if the first string less than the second string.1325 * @returns 0 if the first string identical to the second string.1326 * @returns > 0 if the first string greater than the second string.1327 * @param psz1 First UTF-8 string. Null is allowed.1328 * @param psz2 Second UTF-8 string. Null is allowed.1329 * @param cchMax Maximum string length1330 */1331 RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)1332 {1333 if (cchMax == 0)1334 return 0;1335 if (psz1 == psz2)1336 return 0;1337 if (!psz1)1338 return -1;1339 if (!psz2)1340 return 1;1341 1342 for (;;)1343 {1344 /* Get the codepoints */1345 RTUNICP cp1;1346 size_t cchMax2 = cchMax;1347 int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);1348 if (RT_FAILURE(rc))1349 {1350 AssertRC(rc);1351 psz1--;1352 cchMax++;1353 break;1354 }1355 1356 RTUNICP cp2;1357 rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);1358 if (RT_FAILURE(rc))1359 {1360 AssertRC(rc);1361 psz2--;1362 psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */1363 cchMax = cchMax2 + 1;1364 break;1365 }1366 1367 /* compare */1368 int iDiff = cp1 - cp2;1369 if (iDiff)1370 {1371 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);1372 if (iDiff)1373 {1374 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */1375 if (iDiff)1376 return iDiff;1377 }1378 }1379 1380 /* hit the terminator? */1381 if (!cp1 || cchMax == 0)1382 return 0;1383 }1384 1385 /* Hit some bad encoding, continue in case insensitive mode. */1386 return RTStrNCmp(psz1, psz2, cchMax);1387 }1388 RT_EXPORT_SYMBOL(RTStrNICmp);1389 1390 38 1391 39 RTDECL(char *) RTStrStr(const char *pszHaystack, const char *pszNeedle) … … 1402 50 RT_EXPORT_SYMBOL(RTStrStr); 1403 51 1404 1405 RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)1406 {1407 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */1408 if (!pszHaystack)1409 return NULL;1410 if (!pszNeedle)1411 return NULL;1412 1413 /* The empty string matches everything. */1414 if (!*pszNeedle)1415 return (char *)pszHaystack;1416 1417 /*1418 * The search strategy is to pick out the first char of the needle, fold it,1419 * and match it against the haystack code point by code point. When encountering1420 * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.1421 */1422 const char * const pszNeedleStart = pszNeedle;1423 RTUNICP Cp0;1424 RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */1425 size_t const cchNeedle = strlen(pszNeedle);1426 size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;1427 RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);1428 RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);1429 if ( Cp0Lower == Cp0Upper1430 && Cp0Lower == Cp0)1431 {1432 /* Cp0 is not a case sensitive char. */1433 for (;;)1434 {1435 RTUNICP Cp;1436 RTStrGetCpEx(&pszHaystack, &Cp);1437 if (!Cp)1438 break;1439 if ( Cp == Cp01440 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))1441 return (char *)pszHaystack - cchNeedleCp0;1442 }1443 }1444 else if ( Cp0Lower == Cp01445 || Cp0Upper != Cp0)1446 {1447 /* Cp0 is case sensitive */1448 for (;;)1449 {1450 RTUNICP Cp;1451 RTStrGetCpEx(&pszHaystack, &Cp);1452 if (!Cp)1453 break;1454 if ( ( Cp == Cp0Upper1455 || Cp == Cp0Lower)1456 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))1457 return (char *)pszHaystack - cchNeedleCp0;1458 }1459 }1460 else1461 {1462 /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */1463 for (;;)1464 {1465 RTUNICP Cp;1466 RTStrGetCpEx(&pszHaystack, &Cp);1467 if (!Cp)1468 break;1469 if ( ( Cp == Cp01470 || Cp == Cp0Upper1471 || Cp == Cp0Lower)1472 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))1473 return (char *)pszHaystack - cchNeedleCp0;1474 }1475 }1476 1477 1478 return NULL;1479 }1480 RT_EXPORT_SYMBOL(RTStrIStr);1481 1482 1483 RTDECL(char *) RTStrToLower(char *psz)1484 {1485 /*1486 * Loop the code points in the string, converting them one by one.1487 * ASSUMES that the code points for upper and lower case are encoded1488 * with the exact same length.1489 */1490 /** @todo Handled bad encodings correctly+quietly, remove assumption,1491 * optimize. */1492 char *pszCur = psz;1493 while (*pszCur)1494 {1495 RTUNICP cp = RTStrGetCp(pszCur);1496 cp = RTUniCpToLower(cp);1497 pszCur = RTStrPutCp(pszCur, cp);1498 }1499 return psz;1500 }1501 RT_EXPORT_SYMBOL(RTStrToLower);1502 1503 1504 RTDECL(char *) RTStrToUpper(char *psz)1505 {1506 /*1507 * Loop the code points in the string, converting them one by one.1508 * ASSUMES that the code points for upper and lower case are encoded1509 * with the exact same length.1510 */1511 /** @todo Handled bad encodings correctly+quietly, remove assumption,1512 * optimize. */1513 char *pszCur = psz;1514 while(*pszCur)1515 {1516 RTUNICP cp = RTStrGetCp(pszCur);1517 cp = RTUniCpToUpper(cp);1518 pszCur = RTStrPutCp(pszCur, cp);1519 }1520 return psz;1521 }1522 RT_EXPORT_SYMBOL(RTStrToUpper);1523
Note:
See TracChangeset
for help on using the changeset viewer.