VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 45256

Last change on this file since 45256 was 44528, checked in by vboxsync, 12 years ago

header (C) fixes

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 54.5 KB
Line 
1/* $Id: utf-8.cpp 44528 2013-02-04 14:27:54Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2012 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Get get length in code points of a UTF-8 encoded string.
44 * The string is validated while doing this.
45 *
46 * @returns IPRT status code.
47 * @param psz Pointer to the UTF-8 string.
48 * @param cch The max length of the string. (btw cch = cb)
49 * Use RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcchActual Where to store the actual size of the UTF-8 string
52 * on success (cch = cb again). Optional.
53 */
54DECLHIDDEN(int) rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66/** @todo RT_USE_RTC_3629 */
67 unsigned cb;
68 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
69 cb = 2;
70 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
71 cb = 3;
72 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
73 cb = 4;
74 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
75 cb = 5;
76 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
77 cb = 6;
78 else
79 {
80 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81 return VERR_INVALID_UTF8_ENCODING;
82 }
83
84 /* check length */
85 if (cb > cch)
86 {
87 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88 return VERR_INVALID_UTF8_ENCODING;
89 }
90
91 /* validate the rest */
92 switch (cb)
93 {
94 case 6:
95 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96 case 5:
97 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98 case 4:
99 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100 case 3:
101 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102 case 2:
103 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104 break;
105 }
106
107 /* validate the code point. */
108 RTUNICP uc;
109 switch (cb)
110 {
111 case 6:
112 uc = (puch[5] & 0x3f)
113 | ((RTUNICP)(puch[4] & 0x3f) << 6)
114 | ((RTUNICP)(puch[3] & 0x3f) << 12)
115 | ((RTUNICP)(puch[2] & 0x3f) << 18)
116 | ((RTUNICP)(puch[1] & 0x3f) << 24)
117 | ((RTUNICP)(uch & 0x01) << 30);
118 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120 break;
121 case 5:
122 uc = (puch[4] & 0x3f)
123 | ((RTUNICP)(puch[3] & 0x3f) << 6)
124 | ((RTUNICP)(puch[2] & 0x3f) << 12)
125 | ((RTUNICP)(puch[1] & 0x3f) << 18)
126 | ((RTUNICP)(uch & 0x03) << 24);
127 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129 break;
130 case 4:
131 uc = (puch[3] & 0x3f)
132 | ((RTUNICP)(puch[2] & 0x3f) << 6)
133 | ((RTUNICP)(puch[1] & 0x3f) << 12)
134 | ((RTUNICP)(uch & 0x07) << 18);
135 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137 break;
138 case 3:
139 uc = (puch[2] & 0x3f)
140 | ((RTUNICP)(puch[1] & 0x3f) << 6)
141 | ((RTUNICP)(uch & 0x0f) << 12);
142 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
146 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147 break;
148 case 2:
149 uc = (puch[1] & 0x3f)
150 | ((RTUNICP)(uch & 0x1f) << 6);
151 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153 break;
154 }
155
156 /* advance */
157 cch -= cb;
158 puch += cb;
159 }
160 else
161 {
162 /* one ASCII byte */
163 puch++;
164 cch--;
165 }
166 cCodePoints++;
167 }
168
169 /* done */
170 *pcuc = cCodePoints;
171 if (pcchActual)
172 *pcchActual = puch - (unsigned char const *)psz;
173 return VINF_SUCCESS;
174}
175
176
177/**
178 * Decodes and UTF-8 string into an array of unicode code point.
179 *
180 * Since we know the input is valid, we do *not* perform encoding or length checks.
181 *
182 * @returns iprt status code.
183 * @param psz The UTF-8 string to recode. This is a valid encoding.
184 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186 * @param paCps Where to store the code points array.
187 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188 */
189static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190{
191 int rc = VINF_SUCCESS;
192 const unsigned char *puch = (const unsigned char *)psz;
193 PRTUNICP pCp = paCps;
194 while (cch > 0)
195 {
196 /* read the next char and check for terminator. */
197 const unsigned char uch = *puch;
198 if (!uch)
199 break;
200
201 /* check for output overflow */
202 if (RT_UNLIKELY(cCps < 1))
203 {
204 rc = VERR_BUFFER_OVERFLOW;
205 break;
206 }
207 cCps--;
208
209 /* decode and recode the code point */
210 if (!(uch & RT_BIT(7)))
211 {
212 *pCp++ = uch;
213 puch++;
214 cch--;
215 }
216#ifdef RT_STRICT
217 else if (!(uch & RT_BIT(6)))
218 AssertMsgFailed(("Internal error!\n"));
219#endif
220 else if (!(uch & RT_BIT(5)))
221 {
222 *pCp++ = (puch[1] & 0x3f)
223 | ((uint16_t)(uch & 0x1f) << 6);
224 puch += 2;
225 cch -= 2;
226 }
227 else if (!(uch & RT_BIT(4)))
228 {
229 *pCp++ = (puch[2] & 0x3f)
230 | ((uint16_t)(puch[1] & 0x3f) << 6)
231 | ((uint16_t)(uch & 0x0f) << 12);
232 puch += 3;
233 cch -= 3;
234 }
235 else if (!(uch & RT_BIT(3)))
236 {
237 *pCp++ = (puch[3] & 0x3f)
238 | ((RTUNICP)(puch[2] & 0x3f) << 6)
239 | ((RTUNICP)(puch[1] & 0x3f) << 12)
240 | ((RTUNICP)(uch & 0x07) << 18);
241 puch += 4;
242 cch -= 4;
243 }
244 else if (!(uch & RT_BIT(2)))
245 {
246 *pCp++ = (puch[4] & 0x3f)
247 | ((RTUNICP)(puch[3] & 0x3f) << 6)
248 | ((RTUNICP)(puch[2] & 0x3f) << 12)
249 | ((RTUNICP)(puch[1] & 0x3f) << 18)
250 | ((RTUNICP)(uch & 0x03) << 24);
251 puch += 5;
252 cch -= 6;
253 }
254 else
255 {
256 Assert(!(uch & RT_BIT(1)));
257 *pCp++ = (puch[5] & 0x3f)
258 | ((RTUNICP)(puch[4] & 0x3f) << 6)
259 | ((RTUNICP)(puch[3] & 0x3f) << 12)
260 | ((RTUNICP)(puch[2] & 0x3f) << 18)
261 | ((RTUNICP)(puch[1] & 0x3f) << 24)
262 | ((RTUNICP)(uch & 0x01) << 30);
263 puch += 6;
264 cch -= 6;
265 }
266 }
267
268 /* done */
269 *pCp = 0;
270 return rc;
271}
272
273
274RTDECL(size_t) RTStrUniLen(const char *psz)
275{
276 size_t cCodePoints;
277 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278 return RT_SUCCESS(rc) ? cCodePoints : 0;
279}
280RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
284{
285 size_t cCodePoints;
286 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287 if (pcCps)
288 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289 return rc;
290}
291RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294RTDECL(int) RTStrValidateEncoding(const char *psz)
295{
296 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297}
298RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302{
303 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
304 AssertPtr(psz);
305
306 /*
307 * Use rtUtf8Length for the job.
308 */
309 size_t cchActual;
310 size_t cCpsIgnored;
311 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
312 if (RT_SUCCESS(rc))
313 {
314 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
315 && cchActual >= cch)
316 rc = VERR_BUFFER_OVERFLOW;
317 }
318 return rc;
319}
320RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
321
322
323RTDECL(bool) RTStrIsValidEncoding(const char *psz)
324{
325 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
326 return RT_SUCCESS(rc);
327}
328RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
329
330
331RTDECL(size_t) RTStrPurgeEncoding(char *psz)
332{
333 size_t cErrors = 0;
334 for (;;)
335 {
336 RTUNICP Cp;
337 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
338 if (RT_SUCCESS(rc))
339 {
340 if (!Cp)
341 break;
342 }
343 else
344 {
345 psz[-1] = '?';
346 cErrors++;
347 }
348 }
349 return cErrors;
350}
351RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
352
353
354RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidSet, char chReplacement)
355{
356 size_t cReplacements = 0;
357 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
358 for (;;)
359 {
360 RTUNICP Cp;
361 PCRTUNICP pCp;
362 char *pszOld = psz;
363 if (RT_FAILURE(RTStrGetCpEx((const char **)&psz, &Cp)))
364 return -1;
365 if (!Cp)
366 break;
367 for (pCp = puszValidSet; *pCp; pCp += 2)
368 {
369 AssertReturn(*(pCp + 1), -1);
370 if (*pCp <= Cp && *(pCp + 1) >= Cp) /* No, I won't do * and ++. */
371 break;
372 }
373 if (!*pCp)
374 {
375 for (; pszOld != psz; ++pszOld)
376 *pszOld = chReplacement;
377 ++cReplacements;
378 }
379 }
380 return cReplacements;
381}
382RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
383
384
385RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
386{
387 /*
388 * Validate input.
389 */
390 Assert(VALID_PTR(pszString));
391 Assert(VALID_PTR(ppaCps));
392 *ppaCps = NULL;
393
394 /*
395 * Validate the UTF-8 input and count its code points.
396 */
397 size_t cCps;
398 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
399 if (RT_SUCCESS(rc))
400 {
401 /*
402 * Allocate buffer.
403 */
404 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
405 if (paCps)
406 {
407 /*
408 * Decode the string.
409 */
410 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
411 if (RT_SUCCESS(rc))
412 {
413 *ppaCps = paCps;
414 return rc;
415 }
416 RTMemFree(paCps);
417 }
418 else
419 rc = VERR_NO_CODE_POINT_MEMORY;
420 }
421 return rc;
422}
423RT_EXPORT_SYMBOL(RTStrToUni);
424
425
426RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
427{
428 /*
429 * Validate input.
430 */
431 Assert(VALID_PTR(pszString));
432 Assert(VALID_PTR(ppaCps));
433 Assert(!pcCps || VALID_PTR(pcCps));
434
435 /*
436 * Validate the UTF-8 input and count the code points.
437 */
438 size_t cCpsResult;
439 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
440 if (RT_SUCCESS(rc))
441 {
442 if (pcCps)
443 *pcCps = cCpsResult;
444
445 /*
446 * Check buffer size / Allocate buffer.
447 */
448 bool fShouldFree;
449 PRTUNICP paCpsResult;
450 if (cCps > 0 && *ppaCps)
451 {
452 fShouldFree = false;
453 if (cCps <= cCpsResult)
454 return VERR_BUFFER_OVERFLOW;
455 paCpsResult = *ppaCps;
456 }
457 else
458 {
459 *ppaCps = NULL;
460 fShouldFree = true;
461 cCps = RT_MAX(cCpsResult + 1, cCps);
462 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
463 }
464 if (paCpsResult)
465 {
466 /*
467 * Encode the UTF-16 string.
468 */
469 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
470 if (RT_SUCCESS(rc))
471 {
472 *ppaCps = paCpsResult;
473 return rc;
474 }
475 if (fShouldFree)
476 RTMemFree(paCpsResult);
477 }
478 else
479 rc = VERR_NO_CODE_POINT_MEMORY;
480 }
481 return rc;
482}
483RT_EXPORT_SYMBOL(RTStrToUniEx);
484
485
486/**
487 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
488 *
489 * @returns IPRT status code.
490 * @param psz Pointer to the UTF-8 string.
491 * @param cch The max length of the string. (btw cch = cb)
492 * Use RTSTR_MAX if all of the string is to be examined.
493 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
494 */
495static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
496{
497 const unsigned char *puch = (const unsigned char *)psz;
498 size_t cwc = 0;
499 while (cch > 0)
500 {
501 const unsigned char uch = *puch;
502 if (!uch)
503 break;
504 if (!(uch & RT_BIT(7)))
505 {
506 /* one ASCII byte */
507 cwc++;
508 puch++;
509 cch--;
510 }
511 else
512 {
513 /* figure sequence length and validate the first byte */
514 unsigned cb;
515 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
516 cb = 2;
517 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
518 cb = 3;
519 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
520 cb = 4;
521 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
522 cb = 5;
523 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
524 cb = 6;
525 else
526 {
527 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
528 return VERR_INVALID_UTF8_ENCODING;
529 }
530
531 /* check length */
532 if (cb > cch)
533 {
534 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
535 return VERR_INVALID_UTF8_ENCODING;
536 }
537
538 /* validate the rest */
539 switch (cb)
540 {
541 case 6:
542 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
543 case 5:
544 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
545 case 4:
546 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
547 case 3:
548 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
549 case 2:
550 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
551 break;
552 }
553
554 /* validate the code point. */
555 RTUNICP uc;
556 switch (cb)
557 {
558 case 6:
559 uc = (puch[5] & 0x3f)
560 | ((RTUNICP)(puch[4] & 0x3f) << 6)
561 | ((RTUNICP)(puch[3] & 0x3f) << 12)
562 | ((RTUNICP)(puch[2] & 0x3f) << 18)
563 | ((RTUNICP)(puch[1] & 0x3f) << 24)
564 | ((RTUNICP)(uch & 0x01) << 30);
565 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
566 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
567 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
568 return VERR_CANT_RECODE_AS_UTF16;
569 case 5:
570 uc = (puch[4] & 0x3f)
571 | ((RTUNICP)(puch[3] & 0x3f) << 6)
572 | ((RTUNICP)(puch[2] & 0x3f) << 12)
573 | ((RTUNICP)(puch[1] & 0x3f) << 18)
574 | ((RTUNICP)(uch & 0x03) << 24);
575 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
576 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
577 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
578 return VERR_CANT_RECODE_AS_UTF16;
579 case 4:
580 uc = (puch[3] & 0x3f)
581 | ((RTUNICP)(puch[2] & 0x3f) << 6)
582 | ((RTUNICP)(puch[1] & 0x3f) << 12)
583 | ((RTUNICP)(uch & 0x07) << 18);
584 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
585 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
586 RTStrAssertMsgReturn(uc <= 0x0010ffff,
587 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
588 cwc++;
589 break;
590 case 3:
591 uc = (puch[2] & 0x3f)
592 | ((RTUNICP)(puch[1] & 0x3f) << 6)
593 | ((RTUNICP)(uch & 0x0f) << 12);
594 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
595 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
596 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
597 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
598 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
599 break;
600 case 2:
601 uc = (puch[1] & 0x3f)
602 | ((RTUNICP)(uch & 0x1f) << 6);
603 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
604 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
605 break;
606 }
607
608 /* advance */
609 cch -= cb;
610 puch += cb;
611 cwc++;
612 }
613 }
614
615 /* done */
616 *pcwc = cwc;
617 return VINF_SUCCESS;
618}
619
620
621/**
622 * Recodes a valid UTF-8 string as UTF-16.
623 *
624 * Since we know the input is valid, we do *not* perform encoding or length checks.
625 *
626 * @returns iprt status code.
627 * @param psz The UTF-8 string to recode. This is a valid encoding.
628 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
629 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
630 * @param pwsz Where to store the UTF-16 string.
631 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
632 */
633static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
634{
635 int rc = VINF_SUCCESS;
636 const unsigned char *puch = (const unsigned char *)psz;
637 PRTUTF16 pwc = pwsz;
638 while (cch > 0)
639 {
640 /* read the next char and check for terminator. */
641 const unsigned char uch = *puch;
642 if (!uch)
643 break;
644
645 /* check for output overflow */
646 if (RT_UNLIKELY(cwc < 1))
647 {
648 rc = VERR_BUFFER_OVERFLOW;
649 break;
650 }
651 cwc--;
652
653 /* decode and recode the code point */
654 if (!(uch & RT_BIT(7)))
655 {
656 *pwc++ = uch;
657 puch++;
658 cch--;
659 }
660 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
661 {
662 uint16_t uc = (puch[1] & 0x3f)
663 | ((uint16_t)(uch & 0x1f) << 6);
664 *pwc++ = uc;
665 puch += 2;
666 cch -= 2;
667 }
668 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
669 {
670 uint16_t uc = (puch[2] & 0x3f)
671 | ((uint16_t)(puch[1] & 0x3f) << 6)
672 | ((uint16_t)(uch & 0x0f) << 12);
673 *pwc++ = uc;
674 puch += 3;
675 cch -= 3;
676 }
677 else
678 {
679 /* generate surrogate pair */
680 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
681 RTUNICP uc = (puch[3] & 0x3f)
682 | ((RTUNICP)(puch[2] & 0x3f) << 6)
683 | ((RTUNICP)(puch[1] & 0x3f) << 12)
684 | ((RTUNICP)(uch & 0x07) << 18);
685 if (RT_UNLIKELY(cwc < 1))
686 {
687 rc = VERR_BUFFER_OVERFLOW;
688 break;
689 }
690 cwc--;
691
692 uc -= 0x10000;
693 *pwc++ = 0xd800 | (uc >> 10);
694 *pwc++ = 0xdc00 | (uc & 0x3ff);
695 puch += 4;
696 cch -= 4;
697 }
698 }
699
700 /* done */
701 *pwc = '\0';
702 return rc;
703}
704
705
706RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
707{
708 /*
709 * Validate input.
710 */
711 Assert(VALID_PTR(ppwszString));
712 Assert(VALID_PTR(pszString));
713 *ppwszString = NULL;
714
715 /*
716 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
717 */
718 size_t cwc;
719 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
720 if (RT_SUCCESS(rc))
721 {
722 /*
723 * Allocate buffer.
724 */
725 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
726 if (pwsz)
727 {
728 /*
729 * Encode the UTF-16 string.
730 */
731 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
732 if (RT_SUCCESS(rc))
733 {
734 *ppwszString = pwsz;
735 return rc;
736 }
737 RTMemFree(pwsz);
738 }
739 else
740 rc = VERR_NO_UTF16_MEMORY;
741 }
742 return rc;
743}
744RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
745
746
747RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
748 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
749{
750 /*
751 * Validate input.
752 */
753 Assert(VALID_PTR(pszString));
754 Assert(VALID_PTR(ppwsz));
755 Assert(!pcwc || VALID_PTR(pcwc));
756
757 /*
758 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
759 */
760 size_t cwcResult;
761 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
762 if (RT_SUCCESS(rc))
763 {
764 if (pcwc)
765 *pcwc = cwcResult;
766
767 /*
768 * Check buffer size / Allocate buffer.
769 */
770 bool fShouldFree;
771 PRTUTF16 pwszResult;
772 if (cwc > 0 && *ppwsz)
773 {
774 fShouldFree = false;
775 if (cwc <= cwcResult)
776 return VERR_BUFFER_OVERFLOW;
777 pwszResult = *ppwsz;
778 }
779 else
780 {
781 *ppwsz = NULL;
782 fShouldFree = true;
783 cwc = RT_MAX(cwcResult + 1, cwc);
784 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
785 }
786 if (pwszResult)
787 {
788 /*
789 * Encode the UTF-16 string.
790 */
791 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
792 if (RT_SUCCESS(rc))
793 {
794 *ppwsz = pwszResult;
795 return rc;
796 }
797 if (fShouldFree)
798 RTMemFree(pwszResult);
799 }
800 else
801 rc = VERR_NO_UTF16_MEMORY;
802 }
803 return rc;
804}
805RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
806
807
808RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
809{
810 size_t cwc;
811 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
812 return RT_SUCCESS(rc) ? cwc : 0;
813}
814RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
815
816
817RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
818{
819 size_t cwc;
820 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
821 if (pcwc)
822 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
823 return rc;
824}
825RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
826
827
828/**
829 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
830 *
831 * @returns iprt status code.
832 * @param psz The Latin-1 string.
833 * @param cchIn The max length of the Latin-1 string to consider.
834 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
835 */
836static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
837{
838 size_t cch = 0;
839 for (;;)
840 {
841 RTUNICP Cp;
842 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
843 if (Cp == 0 || rc == VERR_END_OF_STRING)
844 break;
845 if (RT_FAILURE(rc))
846 return rc;
847 cch += RTStrCpSize(Cp); /* cannot fail */
848 }
849
850 /* done */
851 *pcch = cch;
852 return VINF_SUCCESS;
853}
854
855
856/**
857 * Recodes a Latin-1 string as UTF-8.
858 *
859 * @returns iprt status code.
860 * @param psz The Latin-1 string.
861 * @param cchIn The number of characters to process from psz. The recoding
862 * will stop when cch or '\\0' is reached.
863 * @param psz Where to store the UTF-8 string.
864 * @param cch The size of the UTF-8 buffer, excluding the terminator.
865 */
866static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
867{
868 int rc;
869 for (;;)
870 {
871 RTUNICP Cp;
872 size_t cchCp;
873 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
874 if (Cp == 0 || RT_FAILURE(rc))
875 break;
876 cchCp = RTStrCpSize(Cp);
877 if (RT_UNLIKELY(cch < cchCp))
878 {
879 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
880 rc = VERR_BUFFER_OVERFLOW;
881 break;
882 }
883 cch -= cchCp;
884 psz = RTStrPutCp(psz, Cp);
885 }
886
887 /* done */
888 if (rc == VERR_END_OF_STRING)
889 rc = VINF_SUCCESS;
890 *psz = '\0';
891 return rc;
892}
893
894
895
896RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
897{
898 /*
899 * Validate input.
900 */
901 Assert(VALID_PTR(ppszString));
902 Assert(VALID_PTR(pszString));
903 *ppszString = NULL;
904
905 /*
906 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
907 */
908 size_t cch;
909 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
910 if (RT_SUCCESS(rc))
911 {
912 /*
913 * Allocate buffer and recode it.
914 */
915 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
916 if (pszResult)
917 {
918 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
919 if (RT_SUCCESS(rc))
920 {
921 *ppszString = pszResult;
922 return rc;
923 }
924
925 RTMemFree(pszResult);
926 }
927 else
928 rc = VERR_NO_STR_MEMORY;
929 }
930 return rc;
931}
932RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
933
934
935RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
936{
937 /*
938 * Validate input.
939 */
940 Assert(VALID_PTR(pszString));
941 Assert(VALID_PTR(ppsz));
942 Assert(!pcch || VALID_PTR(pcch));
943
944 /*
945 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
946 */
947 size_t cchResult;
948 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
949 if (RT_SUCCESS(rc))
950 {
951 if (pcch)
952 *pcch = cchResult;
953
954 /*
955 * Check buffer size / Allocate buffer and recode it.
956 */
957 bool fShouldFree;
958 char *pszResult;
959 if (cch > 0 && *ppsz)
960 {
961 fShouldFree = false;
962 if (RT_UNLIKELY(cch <= cchResult))
963 return VERR_BUFFER_OVERFLOW;
964 pszResult = *ppsz;
965 }
966 else
967 {
968 *ppsz = NULL;
969 fShouldFree = true;
970 cch = RT_MAX(cch, cchResult + 1);
971 pszResult = (char *)RTStrAllocTag(cch, pszTag);
972 }
973 if (pszResult)
974 {
975 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
976 if (RT_SUCCESS(rc))
977 {
978 *ppsz = pszResult;
979 return rc;
980 }
981
982 if (fShouldFree)
983 RTStrFree(pszResult);
984 }
985 else
986 rc = VERR_NO_STR_MEMORY;
987 }
988 return rc;
989}
990RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
991
992
993RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
994{
995 size_t cch;
996 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
997 return RT_SUCCESS(rc) ? cch : 0;
998}
999RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1000
1001
1002RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
1003{
1004 size_t cch;
1005 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1006 if (pcch)
1007 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1008 return rc;
1009}
1010RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1011
1012
1013/**
1014 * Calculates the Latin-1 length of a string, validating the encoding while
1015 * doing so.
1016 *
1017 * @returns IPRT status code.
1018 * @param psz Pointer to the UTF-8 string.
1019 * @param cchIn The max length of the string. (btw cch = cb)
1020 * Use RTSTR_MAX if all of the string is to be examined.
1021 * @param pcch Where to store the length of the Latin-1 string in bytes.
1022 */
1023static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
1024{
1025 size_t cch = 0;
1026 for (;;)
1027 {
1028 RTUNICP Cp;
1029 size_t cchCp;
1030 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1031 if (Cp == 0 || rc == VERR_END_OF_STRING)
1032 break;
1033 if (RT_FAILURE(rc))
1034 return rc;
1035 cchCp = RTLatin1CpSize(Cp);
1036 if (cchCp == 0)
1037 return VERR_NO_TRANSLATION;
1038 cch += cchCp;
1039 }
1040
1041 /* done */
1042 *pcch = cch;
1043 return VINF_SUCCESS;
1044}
1045
1046
1047/**
1048 * Recodes a valid UTF-8 string as Latin-1.
1049 *
1050 * Since we know the input is valid, we do *not* perform encoding or length checks.
1051 *
1052 * @returns iprt status code.
1053 * @param pszIn The UTF-8 string to recode. This is a valid encoding.
1054 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1055 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1056 * @param psz Where to store the Latin-1 string.
1057 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1058 */
1059static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1060{
1061 int rc;
1062 for (;;)
1063 {
1064 RTUNICP Cp;
1065 size_t cchCp;
1066 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1067 if (Cp == 0 || RT_FAILURE(rc))
1068 break;
1069 cchCp = RTLatin1CpSize(Cp);
1070 if (RT_UNLIKELY(cch < cchCp))
1071 {
1072 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1073 rc = VERR_BUFFER_OVERFLOW;
1074 break;
1075 }
1076 cch -= cchCp;
1077 psz = RTLatin1PutCp(psz, Cp);
1078 }
1079
1080 /* done */
1081 if (rc == VERR_END_OF_STRING)
1082 rc = VINF_SUCCESS;
1083 *psz = '\0';
1084 return rc;
1085}
1086
1087
1088
1089RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1090{
1091 /*
1092 * Validate input.
1093 */
1094 Assert(VALID_PTR(ppszString));
1095 Assert(VALID_PTR(pszString));
1096 *ppszString = NULL;
1097
1098 /*
1099 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1100 */
1101 size_t cch;
1102 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1103 if (RT_SUCCESS(rc))
1104 {
1105 /*
1106 * Allocate buffer.
1107 */
1108 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1109 if (psz)
1110 {
1111 /*
1112 * Encode the UTF-16 string.
1113 */
1114 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1115 if (RT_SUCCESS(rc))
1116 {
1117 *ppszString = psz;
1118 return rc;
1119 }
1120 RTMemFree(psz);
1121 }
1122 else
1123 rc = VERR_NO_STR_MEMORY;
1124 }
1125 return rc;
1126}
1127RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1128
1129
1130RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1131 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1132{
1133 /*
1134 * Validate input.
1135 */
1136 Assert(VALID_PTR(pszString));
1137 Assert(VALID_PTR(ppsz));
1138 Assert(!pcch || VALID_PTR(pcch));
1139
1140 /*
1141 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1142 */
1143 size_t cchResult;
1144 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1145 if (RT_SUCCESS(rc))
1146 {
1147 if (pcch)
1148 *pcch = cchResult;
1149
1150 /*
1151 * Check buffer size / Allocate buffer.
1152 */
1153 bool fShouldFree;
1154 char *pszResult;
1155 if (cch > 0 && *ppsz)
1156 {
1157 fShouldFree = false;
1158 if (cch <= cchResult)
1159 return VERR_BUFFER_OVERFLOW;
1160 pszResult = *ppsz;
1161 }
1162 else
1163 {
1164 *ppsz = NULL;
1165 fShouldFree = true;
1166 cch = RT_MAX(cchResult + 1, cch);
1167 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1168 }
1169 if (pszResult)
1170 {
1171 /*
1172 * Encode the Latin-1 string.
1173 */
1174 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1175 if (RT_SUCCESS(rc))
1176 {
1177 *ppsz = pszResult;
1178 return rc;
1179 }
1180 if (fShouldFree)
1181 RTMemFree(pszResult);
1182 }
1183 else
1184 rc = VERR_NO_STR_MEMORY;
1185 }
1186 return rc;
1187}
1188RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1189
1190
1191RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1192{
1193 size_t cch;
1194 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1195 return RT_SUCCESS(rc) ? cch : 0;
1196}
1197RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1198
1199
1200RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1201{
1202 size_t cch;
1203 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1204 if (pcch)
1205 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1206 return rc;
1207}
1208RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1209
1210
1211/**
1212 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1213 * @returns rc
1214 * @param ppsz The pointer to the string position point.
1215 * @param pCp Where to store RTUNICP_INVALID.
1216 * @param rc The iprt error code.
1217 */
1218static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1219{
1220 /*
1221 * Try find a valid encoding.
1222 */
1223 (*ppsz)++; /** @todo code this! */
1224 *pCp = RTUNICP_INVALID;
1225 return rc;
1226}
1227
1228
1229RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1230{
1231 RTUNICP Cp;
1232 RTStrGetCpExInternal(&psz, &Cp);
1233 return Cp;
1234}
1235RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1236
1237
1238RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1239{
1240 const unsigned char *puch = (const unsigned char *)*ppsz;
1241 const unsigned char uch = *puch;
1242 RTUNICP uc;
1243
1244 /* ASCII ? */
1245 if (!(uch & RT_BIT(7)))
1246 {
1247 uc = uch;
1248 puch++;
1249 }
1250 else if (uch & RT_BIT(6))
1251 {
1252 /* figure the length and validate the first octet. */
1253/** @todo RT_USE_RTC_3629 */
1254 unsigned cb;
1255 if (!(uch & RT_BIT(5)))
1256 cb = 2;
1257 else if (!(uch & RT_BIT(4)))
1258 cb = 3;
1259 else if (!(uch & RT_BIT(3)))
1260 cb = 4;
1261 else if (!(uch & RT_BIT(2)))
1262 cb = 5;
1263 else if (!(uch & RT_BIT(1)))
1264 cb = 6;
1265 else
1266 {
1267 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1268 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1269 }
1270
1271 /* validate the rest */
1272 switch (cb)
1273 {
1274 case 6:
1275 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1276 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1277 case 5:
1278 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1279 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1280 case 4:
1281 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1282 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1283 case 3:
1284 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1285 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1286 case 2:
1287 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1288 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1289 break;
1290 }
1291
1292 /* get and validate the code point. */
1293 switch (cb)
1294 {
1295 case 6:
1296 uc = (puch[5] & 0x3f)
1297 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1298 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1299 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1300 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1301 | ((RTUNICP)(uch & 0x01) << 30);
1302 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1303 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1304 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1305 break;
1306 case 5:
1307 uc = (puch[4] & 0x3f)
1308 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1309 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1310 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1311 | ((RTUNICP)(uch & 0x03) << 24);
1312 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1313 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1314 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1315 break;
1316 case 4:
1317 uc = (puch[3] & 0x3f)
1318 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1319 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1320 | ((RTUNICP)(uch & 0x07) << 18);
1321 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1322 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1323 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1324 break;
1325 case 3:
1326 uc = (puch[2] & 0x3f)
1327 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1328 | ((RTUNICP)(uch & 0x0f) << 12);
1329 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1330 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1331 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1332 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1333 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1334 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1335 break;
1336 case 2:
1337 uc = (puch[1] & 0x3f)
1338 | ((RTUNICP)(uch & 0x1f) << 6);
1339 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1340 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1341 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1342 break;
1343 default: /* impossible, but GCC is bitching. */
1344 uc = RTUNICP_INVALID;
1345 break;
1346 }
1347 puch += cb;
1348 }
1349 else
1350 {
1351 /* 6th bit is always set. */
1352 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1353 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1354 }
1355 *pCp = uc;
1356 *ppsz = (const char *)puch;
1357 return VINF_SUCCESS;
1358}
1359RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1360
1361
1362/**
1363 * Handle invalid encodings passed to RTStrGetCpNEx().
1364 * @returns rc
1365 * @param ppsz The pointer to the string position point.
1366 * @param pcch Pointer to the string length.
1367 * @param pCp Where to store RTUNICP_INVALID.
1368 * @param rc The iprt error code.
1369 */
1370static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1371{
1372 /*
1373 * Try find a valid encoding.
1374 */
1375 (*ppsz)++; /** @todo code this! */
1376 (*pcch)--;
1377 *pCp = RTUNICP_INVALID;
1378 return rc;
1379}
1380
1381
1382RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1383{
1384 const unsigned char *puch = (const unsigned char *)*ppsz;
1385 const unsigned char uch = *puch;
1386 size_t cch = *pcch;
1387 RTUNICP uc;
1388
1389 if (cch == 0)
1390 {
1391 *pCp = RTUNICP_INVALID;
1392 return VERR_END_OF_STRING;
1393 }
1394
1395 /* ASCII ? */
1396 if (!(uch & RT_BIT(7)))
1397 {
1398 uc = uch;
1399 puch++;
1400 cch--;
1401 }
1402 else if (uch & RT_BIT(6))
1403 {
1404 /* figure the length and validate the first octet. */
1405/** @todo RT_USE_RTC_3629 */
1406 unsigned cb;
1407 if (!(uch & RT_BIT(5)))
1408 cb = 2;
1409 else if (!(uch & RT_BIT(4)))
1410 cb = 3;
1411 else if (!(uch & RT_BIT(3)))
1412 cb = 4;
1413 else if (!(uch & RT_BIT(2)))
1414 cb = 5;
1415 else if (!(uch & RT_BIT(1)))
1416 cb = 6;
1417 else
1418 {
1419 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1420 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1421 }
1422
1423 if (cb > cch)
1424 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1425
1426 /* validate the rest */
1427 switch (cb)
1428 {
1429 case 6:
1430 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1431 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1432 case 5:
1433 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1434 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1435 case 4:
1436 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1437 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1438 case 3:
1439 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1440 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1441 case 2:
1442 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1443 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1444 break;
1445 }
1446
1447 /* get and validate the code point. */
1448 switch (cb)
1449 {
1450 case 6:
1451 uc = (puch[5] & 0x3f)
1452 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1453 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1454 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1455 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1456 | ((RTUNICP)(uch & 0x01) << 30);
1457 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1458 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1459 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1460 break;
1461 case 5:
1462 uc = (puch[4] & 0x3f)
1463 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1464 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1465 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1466 | ((RTUNICP)(uch & 0x03) << 24);
1467 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1468 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1469 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1470 break;
1471 case 4:
1472 uc = (puch[3] & 0x3f)
1473 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1474 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1475 | ((RTUNICP)(uch & 0x07) << 18);
1476 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1477 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1478 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1479 break;
1480 case 3:
1481 uc = (puch[2] & 0x3f)
1482 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1483 | ((RTUNICP)(uch & 0x0f) << 12);
1484 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1485 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1486 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1487 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1488 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1489 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1490 break;
1491 case 2:
1492 uc = (puch[1] & 0x3f)
1493 | ((RTUNICP)(uch & 0x1f) << 6);
1494 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1495 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1496 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1497 break;
1498 default: /* impossible, but GCC is bitching. */
1499 uc = RTUNICP_INVALID;
1500 break;
1501 }
1502 puch += cb;
1503 cch -= cb;
1504 }
1505 else
1506 {
1507 /* 6th bit is always set. */
1508 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1509 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1510 }
1511 *pCp = uc;
1512 *ppsz = (const char *)puch;
1513 (*pcch) = cch;
1514 return VINF_SUCCESS;
1515}
1516RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1517
1518
1519RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1520{
1521 unsigned char *puch = (unsigned char *)psz;
1522 if (uc < 0x80)
1523 *puch++ = (unsigned char )uc;
1524 else if (uc < 0x00000800)
1525 {
1526 *puch++ = 0xc0 | (uc >> 6);
1527 *puch++ = 0x80 | (uc & 0x3f);
1528 }
1529 else if (uc < 0x00010000)
1530 {
1531/** @todo RT_USE_RTC_3629 */
1532 if ( uc < 0x0000d8000
1533 || ( uc > 0x0000dfff
1534 && uc < 0x0000fffe))
1535 {
1536 *puch++ = 0xe0 | (uc >> 12);
1537 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1538 *puch++ = 0x80 | (uc & 0x3f);
1539 }
1540 else
1541 {
1542 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1543 *puch++ = 0x7f;
1544 }
1545 }
1546/** @todo RT_USE_RTC_3629 */
1547 else if (uc < 0x00200000)
1548 {
1549 *puch++ = 0xf0 | (uc >> 18);
1550 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1551 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1552 *puch++ = 0x80 | (uc & 0x3f);
1553 }
1554 else if (uc < 0x04000000)
1555 {
1556 *puch++ = 0xf8 | (uc >> 24);
1557 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1558 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1559 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1560 *puch++ = 0x80 | (uc & 0x3f);
1561 }
1562 else if (uc <= 0x7fffffff)
1563 {
1564 *puch++ = 0xfc | (uc >> 30);
1565 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1566 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1567 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1568 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1569 *puch++ = 0x80 | (uc & 0x3f);
1570 }
1571 else
1572 {
1573 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1574 *puch++ = 0x7f;
1575 }
1576
1577 return (char *)puch;
1578}
1579RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1580
1581
1582RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1583{
1584 if (pszStart < psz)
1585 {
1586 /* simple char? */
1587 const unsigned char *puch = (const unsigned char *)psz;
1588 unsigned uch = *--puch;
1589 if (!(uch & RT_BIT(7)))
1590 return (char *)puch;
1591 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1592
1593 /* two or more. */
1594 uint32_t uMask = 0xffffffc0;
1595 while ( (const unsigned char *)pszStart < puch
1596 && !(uMask & 1))
1597 {
1598 uch = *--puch;
1599 if ((uch & 0xc0) != 0x80)
1600 {
1601 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1602 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1603 (char *)pszStart);
1604 return (char *)puch;
1605 }
1606 uMask >>= 1;
1607 }
1608 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1609 }
1610 return (char *)pszStart;
1611}
1612RT_EXPORT_SYMBOL(RTStrPrevCp);
1613
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette