VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 59706

Last change on this file since 59706 was 57944, checked in by vboxsync, 9 years ago

iprt: More doxygen corrections.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 55.1 KB
Line 
1/* $Id: utf-8.cpp 57944 2015-09-29 15:07:09Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2015 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Get get length in code points of a UTF-8 encoded string.
44 * The string is validated while doing this.
45 *
46 * @returns IPRT status code.
47 * @param psz Pointer to the UTF-8 string.
48 * @param cch The max length of the string. (btw cch = cb)
49 * Use RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcchActual Where to store the actual size of the UTF-8 string
52 * on success (cch = cb again). Optional.
53 */
54DECLHIDDEN(int) rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66/** @todo RT_USE_RTC_3629 */
67 unsigned cb;
68 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
69 cb = 2;
70 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
71 cb = 3;
72 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
73 cb = 4;
74 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
75 cb = 5;
76 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
77 cb = 6;
78 else
79 {
80 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81 return VERR_INVALID_UTF8_ENCODING;
82 }
83
84 /* check length */
85 if (cb > cch)
86 {
87 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88 return VERR_INVALID_UTF8_ENCODING;
89 }
90
91 /* validate the rest */
92 switch (cb)
93 {
94 case 6:
95 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96 case 5:
97 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98 case 4:
99 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100 case 3:
101 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102 case 2:
103 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104 break;
105 }
106
107 /* validate the code point. */
108 RTUNICP uc;
109 switch (cb)
110 {
111 case 6:
112 uc = (puch[5] & 0x3f)
113 | ((RTUNICP)(puch[4] & 0x3f) << 6)
114 | ((RTUNICP)(puch[3] & 0x3f) << 12)
115 | ((RTUNICP)(puch[2] & 0x3f) << 18)
116 | ((RTUNICP)(puch[1] & 0x3f) << 24)
117 | ((RTUNICP)(uch & 0x01) << 30);
118 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120 break;
121 case 5:
122 uc = (puch[4] & 0x3f)
123 | ((RTUNICP)(puch[3] & 0x3f) << 6)
124 | ((RTUNICP)(puch[2] & 0x3f) << 12)
125 | ((RTUNICP)(puch[1] & 0x3f) << 18)
126 | ((RTUNICP)(uch & 0x03) << 24);
127 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129 break;
130 case 4:
131 uc = (puch[3] & 0x3f)
132 | ((RTUNICP)(puch[2] & 0x3f) << 6)
133 | ((RTUNICP)(puch[1] & 0x3f) << 12)
134 | ((RTUNICP)(uch & 0x07) << 18);
135 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137 break;
138 case 3:
139 uc = (puch[2] & 0x3f)
140 | ((RTUNICP)(puch[1] & 0x3f) << 6)
141 | ((RTUNICP)(uch & 0x0f) << 12);
142 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
146 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147 break;
148 case 2:
149 uc = (puch[1] & 0x3f)
150 | ((RTUNICP)(uch & 0x1f) << 6);
151 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153 break;
154 }
155
156 /* advance */
157 cch -= cb;
158 puch += cb;
159 }
160 else
161 {
162 /* one ASCII byte */
163 puch++;
164 cch--;
165 }
166 cCodePoints++;
167 }
168
169 /* done */
170 *pcuc = cCodePoints;
171 if (pcchActual)
172 *pcchActual = puch - (unsigned char const *)psz;
173 return VINF_SUCCESS;
174}
175
176
177/**
178 * Decodes and UTF-8 string into an array of unicode code point.
179 *
180 * Since we know the input is valid, we do *not* perform encoding or length checks.
181 *
182 * @returns iprt status code.
183 * @param psz The UTF-8 string to recode. This is a valid encoding.
184 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186 * @param paCps Where to store the code points array.
187 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188 */
189static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190{
191 int rc = VINF_SUCCESS;
192 const unsigned char *puch = (const unsigned char *)psz;
193 PRTUNICP pCp = paCps;
194 while (cch > 0)
195 {
196 /* read the next char and check for terminator. */
197 const unsigned char uch = *puch;
198 if (!uch)
199 break;
200
201 /* check for output overflow */
202 if (RT_UNLIKELY(cCps < 1))
203 {
204 rc = VERR_BUFFER_OVERFLOW;
205 break;
206 }
207 cCps--;
208
209 /* decode and recode the code point */
210 if (!(uch & RT_BIT(7)))
211 {
212 *pCp++ = uch;
213 puch++;
214 cch--;
215 }
216#ifdef RT_STRICT
217 else if (!(uch & RT_BIT(6)))
218 AssertMsgFailed(("Internal error!\n"));
219#endif
220 else if (!(uch & RT_BIT(5)))
221 {
222 *pCp++ = (puch[1] & 0x3f)
223 | ((uint16_t)(uch & 0x1f) << 6);
224 puch += 2;
225 cch -= 2;
226 }
227 else if (!(uch & RT_BIT(4)))
228 {
229 *pCp++ = (puch[2] & 0x3f)
230 | ((uint16_t)(puch[1] & 0x3f) << 6)
231 | ((uint16_t)(uch & 0x0f) << 12);
232 puch += 3;
233 cch -= 3;
234 }
235 else if (!(uch & RT_BIT(3)))
236 {
237 *pCp++ = (puch[3] & 0x3f)
238 | ((RTUNICP)(puch[2] & 0x3f) << 6)
239 | ((RTUNICP)(puch[1] & 0x3f) << 12)
240 | ((RTUNICP)(uch & 0x07) << 18);
241 puch += 4;
242 cch -= 4;
243 }
244 else if (!(uch & RT_BIT(2)))
245 {
246 *pCp++ = (puch[4] & 0x3f)
247 | ((RTUNICP)(puch[3] & 0x3f) << 6)
248 | ((RTUNICP)(puch[2] & 0x3f) << 12)
249 | ((RTUNICP)(puch[1] & 0x3f) << 18)
250 | ((RTUNICP)(uch & 0x03) << 24);
251 puch += 5;
252 cch -= 6;
253 }
254 else
255 {
256 Assert(!(uch & RT_BIT(1)));
257 *pCp++ = (puch[5] & 0x3f)
258 | ((RTUNICP)(puch[4] & 0x3f) << 6)
259 | ((RTUNICP)(puch[3] & 0x3f) << 12)
260 | ((RTUNICP)(puch[2] & 0x3f) << 18)
261 | ((RTUNICP)(puch[1] & 0x3f) << 24)
262 | ((RTUNICP)(uch & 0x01) << 30);
263 puch += 6;
264 cch -= 6;
265 }
266 }
267
268 /* done */
269 *pCp = 0;
270 return rc;
271}
272
273
274RTDECL(size_t) RTStrUniLen(const char *psz)
275{
276 size_t cCodePoints;
277 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278 return RT_SUCCESS(rc) ? cCodePoints : 0;
279}
280RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
284{
285 size_t cCodePoints;
286 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287 if (pcCps)
288 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289 return rc;
290}
291RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294RTDECL(int) RTStrValidateEncoding(const char *psz)
295{
296 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297}
298RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302{
303 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
304 VERR_INVALID_PARAMETER);
305 AssertPtr(psz);
306
307 /*
308 * Use rtUtf8Length for the job.
309 */
310 size_t cchActual;
311 size_t cCpsIgnored;
312 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
313 if (RT_SUCCESS(rc))
314 {
315 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
316 {
317 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
318 cchActual++;
319 if (cchActual == cch)
320 rc = VINF_SUCCESS;
321 else if (cchActual < cch)
322 rc = VERR_BUFFER_UNDERFLOW;
323 else
324 rc = VERR_BUFFER_OVERFLOW;
325 }
326 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
327 && cchActual >= cch)
328 rc = VERR_BUFFER_OVERFLOW;
329 }
330 return rc;
331}
332RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
333
334
335RTDECL(bool) RTStrIsValidEncoding(const char *psz)
336{
337 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
338 return RT_SUCCESS(rc);
339}
340RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
341
342
343RTDECL(size_t) RTStrPurgeEncoding(char *psz)
344{
345 size_t cErrors = 0;
346 for (;;)
347 {
348 RTUNICP Cp;
349 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
350 if (RT_SUCCESS(rc))
351 {
352 if (!Cp)
353 break;
354 }
355 else
356 {
357 psz[-1] = '?';
358 cErrors++;
359 }
360 }
361 return cErrors;
362}
363RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
364
365
366RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidSet, char chReplacement)
367{
368 size_t cReplacements = 0;
369 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
370 for (;;)
371 {
372 RTUNICP Cp;
373 PCRTUNICP pCp;
374 char *pszOld = psz;
375 if (RT_FAILURE(RTStrGetCpEx((const char **)&psz, &Cp)))
376 return -1;
377 if (!Cp)
378 break;
379 for (pCp = puszValidSet; *pCp; pCp += 2)
380 {
381 AssertReturn(*(pCp + 1), -1);
382 if (*pCp <= Cp && *(pCp + 1) >= Cp) /* No, I won't do * and ++. */
383 break;
384 }
385 if (!*pCp)
386 {
387 for (; pszOld != psz; ++pszOld)
388 *pszOld = chReplacement;
389 ++cReplacements;
390 }
391 }
392 return cReplacements;
393}
394RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
395
396
397RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
398{
399 /*
400 * Validate input.
401 */
402 Assert(VALID_PTR(pszString));
403 Assert(VALID_PTR(ppaCps));
404 *ppaCps = NULL;
405
406 /*
407 * Validate the UTF-8 input and count its code points.
408 */
409 size_t cCps;
410 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
411 if (RT_SUCCESS(rc))
412 {
413 /*
414 * Allocate buffer.
415 */
416 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
417 if (paCps)
418 {
419 /*
420 * Decode the string.
421 */
422 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
423 if (RT_SUCCESS(rc))
424 {
425 *ppaCps = paCps;
426 return rc;
427 }
428 RTMemFree(paCps);
429 }
430 else
431 rc = VERR_NO_CODE_POINT_MEMORY;
432 }
433 return rc;
434}
435RT_EXPORT_SYMBOL(RTStrToUni);
436
437
438RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
439{
440 /*
441 * Validate input.
442 */
443 Assert(VALID_PTR(pszString));
444 Assert(VALID_PTR(ppaCps));
445 Assert(!pcCps || VALID_PTR(pcCps));
446
447 /*
448 * Validate the UTF-8 input and count the code points.
449 */
450 size_t cCpsResult;
451 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
452 if (RT_SUCCESS(rc))
453 {
454 if (pcCps)
455 *pcCps = cCpsResult;
456
457 /*
458 * Check buffer size / Allocate buffer.
459 */
460 bool fShouldFree;
461 PRTUNICP paCpsResult;
462 if (cCps > 0 && *ppaCps)
463 {
464 fShouldFree = false;
465 if (cCps <= cCpsResult)
466 return VERR_BUFFER_OVERFLOW;
467 paCpsResult = *ppaCps;
468 }
469 else
470 {
471 *ppaCps = NULL;
472 fShouldFree = true;
473 cCps = RT_MAX(cCpsResult + 1, cCps);
474 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
475 }
476 if (paCpsResult)
477 {
478 /*
479 * Encode the UTF-16 string.
480 */
481 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
482 if (RT_SUCCESS(rc))
483 {
484 *ppaCps = paCpsResult;
485 return rc;
486 }
487 if (fShouldFree)
488 RTMemFree(paCpsResult);
489 }
490 else
491 rc = VERR_NO_CODE_POINT_MEMORY;
492 }
493 return rc;
494}
495RT_EXPORT_SYMBOL(RTStrToUniEx);
496
497
498/**
499 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
500 *
501 * @returns IPRT status code.
502 * @param psz Pointer to the UTF-8 string.
503 * @param cch The max length of the string. (btw cch = cb)
504 * Use RTSTR_MAX if all of the string is to be examined.
505 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
506 */
507static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
508{
509 const unsigned char *puch = (const unsigned char *)psz;
510 size_t cwc = 0;
511 while (cch > 0)
512 {
513 const unsigned char uch = *puch;
514 if (!uch)
515 break;
516 if (!(uch & RT_BIT(7)))
517 {
518 /* one ASCII byte */
519 cwc++;
520 puch++;
521 cch--;
522 }
523 else
524 {
525 /* figure sequence length and validate the first byte */
526 unsigned cb;
527 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
528 cb = 2;
529 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
530 cb = 3;
531 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
532 cb = 4;
533 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
534 cb = 5;
535 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
536 cb = 6;
537 else
538 {
539 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
540 return VERR_INVALID_UTF8_ENCODING;
541 }
542
543 /* check length */
544 if (cb > cch)
545 {
546 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
547 return VERR_INVALID_UTF8_ENCODING;
548 }
549
550 /* validate the rest */
551 switch (cb)
552 {
553 case 6:
554 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
555 case 5:
556 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
557 case 4:
558 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
559 case 3:
560 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
561 case 2:
562 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
563 break;
564 }
565
566 /* validate the code point. */
567 RTUNICP uc;
568 switch (cb)
569 {
570 case 6:
571 uc = (puch[5] & 0x3f)
572 | ((RTUNICP)(puch[4] & 0x3f) << 6)
573 | ((RTUNICP)(puch[3] & 0x3f) << 12)
574 | ((RTUNICP)(puch[2] & 0x3f) << 18)
575 | ((RTUNICP)(puch[1] & 0x3f) << 24)
576 | ((RTUNICP)(uch & 0x01) << 30);
577 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
578 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
579 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
580 return VERR_CANT_RECODE_AS_UTF16;
581 case 5:
582 uc = (puch[4] & 0x3f)
583 | ((RTUNICP)(puch[3] & 0x3f) << 6)
584 | ((RTUNICP)(puch[2] & 0x3f) << 12)
585 | ((RTUNICP)(puch[1] & 0x3f) << 18)
586 | ((RTUNICP)(uch & 0x03) << 24);
587 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
588 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
589 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
590 return VERR_CANT_RECODE_AS_UTF16;
591 case 4:
592 uc = (puch[3] & 0x3f)
593 | ((RTUNICP)(puch[2] & 0x3f) << 6)
594 | ((RTUNICP)(puch[1] & 0x3f) << 12)
595 | ((RTUNICP)(uch & 0x07) << 18);
596 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
597 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
598 RTStrAssertMsgReturn(uc <= 0x0010ffff,
599 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
600 cwc++;
601 break;
602 case 3:
603 uc = (puch[2] & 0x3f)
604 | ((RTUNICP)(puch[1] & 0x3f) << 6)
605 | ((RTUNICP)(uch & 0x0f) << 12);
606 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
607 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
608 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
609 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
610 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
611 break;
612 case 2:
613 uc = (puch[1] & 0x3f)
614 | ((RTUNICP)(uch & 0x1f) << 6);
615 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
616 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
617 break;
618 }
619
620 /* advance */
621 cch -= cb;
622 puch += cb;
623 cwc++;
624 }
625 }
626
627 /* done */
628 *pcwc = cwc;
629 return VINF_SUCCESS;
630}
631
632
633/**
634 * Recodes a valid UTF-8 string as UTF-16.
635 *
636 * Since we know the input is valid, we do *not* perform encoding or length checks.
637 *
638 * @returns iprt status code.
639 * @param psz The UTF-8 string to recode. This is a valid encoding.
640 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
641 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
642 * @param pwsz Where to store the UTF-16 string.
643 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
644 */
645static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
646{
647 int rc = VINF_SUCCESS;
648 const unsigned char *puch = (const unsigned char *)psz;
649 PRTUTF16 pwc = pwsz;
650 while (cch > 0)
651 {
652 /* read the next char and check for terminator. */
653 const unsigned char uch = *puch;
654 if (!uch)
655 break;
656
657 /* check for output overflow */
658 if (RT_UNLIKELY(cwc < 1))
659 {
660 rc = VERR_BUFFER_OVERFLOW;
661 break;
662 }
663 cwc--;
664
665 /* decode and recode the code point */
666 if (!(uch & RT_BIT(7)))
667 {
668 *pwc++ = uch;
669 puch++;
670 cch--;
671 }
672 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
673 {
674 uint16_t uc = (puch[1] & 0x3f)
675 | ((uint16_t)(uch & 0x1f) << 6);
676 *pwc++ = uc;
677 puch += 2;
678 cch -= 2;
679 }
680 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
681 {
682 uint16_t uc = (puch[2] & 0x3f)
683 | ((uint16_t)(puch[1] & 0x3f) << 6)
684 | ((uint16_t)(uch & 0x0f) << 12);
685 *pwc++ = uc;
686 puch += 3;
687 cch -= 3;
688 }
689 else
690 {
691 /* generate surrogate pair */
692 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
693 RTUNICP uc = (puch[3] & 0x3f)
694 | ((RTUNICP)(puch[2] & 0x3f) << 6)
695 | ((RTUNICP)(puch[1] & 0x3f) << 12)
696 | ((RTUNICP)(uch & 0x07) << 18);
697 if (RT_UNLIKELY(cwc < 1))
698 {
699 rc = VERR_BUFFER_OVERFLOW;
700 break;
701 }
702 cwc--;
703
704 uc -= 0x10000;
705 *pwc++ = 0xd800 | (uc >> 10);
706 *pwc++ = 0xdc00 | (uc & 0x3ff);
707 puch += 4;
708 cch -= 4;
709 }
710 }
711
712 /* done */
713 *pwc = '\0';
714 return rc;
715}
716
717
718RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
719{
720 /*
721 * Validate input.
722 */
723 Assert(VALID_PTR(ppwszString));
724 Assert(VALID_PTR(pszString));
725 *ppwszString = NULL;
726
727 /*
728 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
729 */
730 size_t cwc;
731 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
732 if (RT_SUCCESS(rc))
733 {
734 /*
735 * Allocate buffer.
736 */
737 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
738 if (pwsz)
739 {
740 /*
741 * Encode the UTF-16 string.
742 */
743 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
744 if (RT_SUCCESS(rc))
745 {
746 *ppwszString = pwsz;
747 return rc;
748 }
749 RTMemFree(pwsz);
750 }
751 else
752 rc = VERR_NO_UTF16_MEMORY;
753 }
754 return rc;
755}
756RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
757
758
759RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
760 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
761{
762 /*
763 * Validate input.
764 */
765 Assert(VALID_PTR(pszString));
766 Assert(VALID_PTR(ppwsz));
767 Assert(!pcwc || VALID_PTR(pcwc));
768
769 /*
770 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
771 */
772 size_t cwcResult;
773 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
774 if (RT_SUCCESS(rc))
775 {
776 if (pcwc)
777 *pcwc = cwcResult;
778
779 /*
780 * Check buffer size / Allocate buffer.
781 */
782 bool fShouldFree;
783 PRTUTF16 pwszResult;
784 if (cwc > 0 && *ppwsz)
785 {
786 fShouldFree = false;
787 if (cwc <= cwcResult)
788 return VERR_BUFFER_OVERFLOW;
789 pwszResult = *ppwsz;
790 }
791 else
792 {
793 *ppwsz = NULL;
794 fShouldFree = true;
795 cwc = RT_MAX(cwcResult + 1, cwc);
796 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
797 }
798 if (pwszResult)
799 {
800 /*
801 * Encode the UTF-16 string.
802 */
803 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
804 if (RT_SUCCESS(rc))
805 {
806 *ppwsz = pwszResult;
807 return rc;
808 }
809 if (fShouldFree)
810 RTMemFree(pwszResult);
811 }
812 else
813 rc = VERR_NO_UTF16_MEMORY;
814 }
815 return rc;
816}
817RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
818
819
820RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
821{
822 size_t cwc;
823 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
824 return RT_SUCCESS(rc) ? cwc : 0;
825}
826RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
827
828
829RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
830{
831 size_t cwc;
832 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
833 if (pcwc)
834 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
835 return rc;
836}
837RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
838
839
840/**
841 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
842 *
843 * @returns iprt status code.
844 * @param psz The Latin-1 string.
845 * @param cchIn The max length of the Latin-1 string to consider.
846 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
847 */
848static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
849{
850 size_t cch = 0;
851 for (;;)
852 {
853 RTUNICP Cp;
854 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
855 if (Cp == 0 || rc == VERR_END_OF_STRING)
856 break;
857 if (RT_FAILURE(rc))
858 return rc;
859 cch += RTStrCpSize(Cp); /* cannot fail */
860 }
861
862 /* done */
863 *pcch = cch;
864 return VINF_SUCCESS;
865}
866
867
868/**
869 * Recodes a Latin-1 string as UTF-8.
870 *
871 * @returns iprt status code.
872 * @param pszIn The Latin-1 string.
873 * @param cchIn The number of characters to process from psz. The recoding
874 * will stop when cch or '\\0' is reached.
875 * @param psz Where to store the UTF-8 string.
876 * @param cch The size of the UTF-8 buffer, excluding the terminator.
877 */
878static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
879{
880 int rc;
881 for (;;)
882 {
883 RTUNICP Cp;
884 size_t cchCp;
885 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
886 if (Cp == 0 || RT_FAILURE(rc))
887 break;
888 cchCp = RTStrCpSize(Cp);
889 if (RT_UNLIKELY(cch < cchCp))
890 {
891 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
892 rc = VERR_BUFFER_OVERFLOW;
893 break;
894 }
895 cch -= cchCp;
896 psz = RTStrPutCp(psz, Cp);
897 }
898
899 /* done */
900 if (rc == VERR_END_OF_STRING)
901 rc = VINF_SUCCESS;
902 *psz = '\0';
903 return rc;
904}
905
906
907
908RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
909{
910 /*
911 * Validate input.
912 */
913 Assert(VALID_PTR(ppszString));
914 Assert(VALID_PTR(pszString));
915 *ppszString = NULL;
916
917 /*
918 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
919 */
920 size_t cch;
921 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
922 if (RT_SUCCESS(rc))
923 {
924 /*
925 * Allocate buffer and recode it.
926 */
927 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
928 if (pszResult)
929 {
930 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
931 if (RT_SUCCESS(rc))
932 {
933 *ppszString = pszResult;
934 return rc;
935 }
936
937 RTMemFree(pszResult);
938 }
939 else
940 rc = VERR_NO_STR_MEMORY;
941 }
942 return rc;
943}
944RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
945
946
947RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
948{
949 /*
950 * Validate input.
951 */
952 Assert(VALID_PTR(pszString));
953 Assert(VALID_PTR(ppsz));
954 Assert(!pcch || VALID_PTR(pcch));
955
956 /*
957 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
958 */
959 size_t cchResult;
960 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
961 if (RT_SUCCESS(rc))
962 {
963 if (pcch)
964 *pcch = cchResult;
965
966 /*
967 * Check buffer size / Allocate buffer and recode it.
968 */
969 bool fShouldFree;
970 char *pszResult;
971 if (cch > 0 && *ppsz)
972 {
973 fShouldFree = false;
974 if (RT_UNLIKELY(cch <= cchResult))
975 return VERR_BUFFER_OVERFLOW;
976 pszResult = *ppsz;
977 }
978 else
979 {
980 *ppsz = NULL;
981 fShouldFree = true;
982 cch = RT_MAX(cch, cchResult + 1);
983 pszResult = (char *)RTStrAllocTag(cch, pszTag);
984 }
985 if (pszResult)
986 {
987 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
988 if (RT_SUCCESS(rc))
989 {
990 *ppsz = pszResult;
991 return rc;
992 }
993
994 if (fShouldFree)
995 RTStrFree(pszResult);
996 }
997 else
998 rc = VERR_NO_STR_MEMORY;
999 }
1000 return rc;
1001}
1002RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1003
1004
1005RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1006{
1007 size_t cch;
1008 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1009 return RT_SUCCESS(rc) ? cch : 0;
1010}
1011RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1012
1013
1014RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
1015{
1016 size_t cch;
1017 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1018 if (pcch)
1019 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1020 return rc;
1021}
1022RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1023
1024
1025/**
1026 * Calculates the Latin-1 length of a string, validating the encoding while
1027 * doing so.
1028 *
1029 * @returns IPRT status code.
1030 * @param psz Pointer to the UTF-8 string.
1031 * @param cchIn The max length of the string. (btw cch = cb)
1032 * Use RTSTR_MAX if all of the string is to be examined.
1033 * @param pcch Where to store the length of the Latin-1 string in bytes.
1034 */
1035static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
1036{
1037 size_t cch = 0;
1038 for (;;)
1039 {
1040 RTUNICP Cp;
1041 size_t cchCp;
1042 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1043 if (Cp == 0 || rc == VERR_END_OF_STRING)
1044 break;
1045 if (RT_FAILURE(rc))
1046 return rc;
1047 cchCp = RTLatin1CpSize(Cp);
1048 if (cchCp == 0)
1049 return VERR_NO_TRANSLATION;
1050 cch += cchCp;
1051 }
1052
1053 /* done */
1054 *pcch = cch;
1055 return VINF_SUCCESS;
1056}
1057
1058
1059/**
1060 * Recodes a valid UTF-8 string as Latin-1.
1061 *
1062 * Since we know the input is valid, we do *not* perform encoding or length checks.
1063 *
1064 * @returns iprt status code.
1065 * @param pszIn The UTF-8 string to recode. This is a valid encoding.
1066 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1067 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1068 * @param psz Where to store the Latin-1 string.
1069 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1070 */
1071static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1072{
1073 int rc;
1074 for (;;)
1075 {
1076 RTUNICP Cp;
1077 size_t cchCp;
1078 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1079 if (Cp == 0 || RT_FAILURE(rc))
1080 break;
1081 cchCp = RTLatin1CpSize(Cp);
1082 if (RT_UNLIKELY(cch < cchCp))
1083 {
1084 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1085 rc = VERR_BUFFER_OVERFLOW;
1086 break;
1087 }
1088 cch -= cchCp;
1089 psz = RTLatin1PutCp(psz, Cp);
1090 }
1091
1092 /* done */
1093 if (rc == VERR_END_OF_STRING)
1094 rc = VINF_SUCCESS;
1095 *psz = '\0';
1096 return rc;
1097}
1098
1099
1100
1101RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1102{
1103 /*
1104 * Validate input.
1105 */
1106 Assert(VALID_PTR(ppszString));
1107 Assert(VALID_PTR(pszString));
1108 *ppszString = NULL;
1109
1110 /*
1111 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1112 */
1113 size_t cch;
1114 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1115 if (RT_SUCCESS(rc))
1116 {
1117 /*
1118 * Allocate buffer.
1119 */
1120 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1121 if (psz)
1122 {
1123 /*
1124 * Encode the UTF-16 string.
1125 */
1126 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1127 if (RT_SUCCESS(rc))
1128 {
1129 *ppszString = psz;
1130 return rc;
1131 }
1132 RTMemFree(psz);
1133 }
1134 else
1135 rc = VERR_NO_STR_MEMORY;
1136 }
1137 return rc;
1138}
1139RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1140
1141
1142RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1143 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1144{
1145 /*
1146 * Validate input.
1147 */
1148 Assert(VALID_PTR(pszString));
1149 Assert(VALID_PTR(ppsz));
1150 Assert(!pcch || VALID_PTR(pcch));
1151
1152 /*
1153 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1154 */
1155 size_t cchResult;
1156 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1157 if (RT_SUCCESS(rc))
1158 {
1159 if (pcch)
1160 *pcch = cchResult;
1161
1162 /*
1163 * Check buffer size / Allocate buffer.
1164 */
1165 bool fShouldFree;
1166 char *pszResult;
1167 if (cch > 0 && *ppsz)
1168 {
1169 fShouldFree = false;
1170 if (cch <= cchResult)
1171 return VERR_BUFFER_OVERFLOW;
1172 pszResult = *ppsz;
1173 }
1174 else
1175 {
1176 *ppsz = NULL;
1177 fShouldFree = true;
1178 cch = RT_MAX(cchResult + 1, cch);
1179 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1180 }
1181 if (pszResult)
1182 {
1183 /*
1184 * Encode the Latin-1 string.
1185 */
1186 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1187 if (RT_SUCCESS(rc))
1188 {
1189 *ppsz = pszResult;
1190 return rc;
1191 }
1192 if (fShouldFree)
1193 RTMemFree(pszResult);
1194 }
1195 else
1196 rc = VERR_NO_STR_MEMORY;
1197 }
1198 return rc;
1199}
1200RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1201
1202
1203RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1204{
1205 size_t cch;
1206 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1207 return RT_SUCCESS(rc) ? cch : 0;
1208}
1209RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1210
1211
1212RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1213{
1214 size_t cch;
1215 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1216 if (pcch)
1217 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1218 return rc;
1219}
1220RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1221
1222
1223/**
1224 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1225 * @returns rc
1226 * @param ppsz The pointer to the string position point.
1227 * @param pCp Where to store RTUNICP_INVALID.
1228 * @param rc The iprt error code.
1229 */
1230static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1231{
1232 /*
1233 * Try find a valid encoding.
1234 */
1235 (*ppsz)++; /** @todo code this! */
1236 *pCp = RTUNICP_INVALID;
1237 return rc;
1238}
1239
1240
1241RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1242{
1243 RTUNICP Cp;
1244 RTStrGetCpExInternal(&psz, &Cp);
1245 return Cp;
1246}
1247RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1248
1249
1250RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1251{
1252 const unsigned char *puch = (const unsigned char *)*ppsz;
1253 const unsigned char uch = *puch;
1254 RTUNICP uc;
1255
1256 /* ASCII ? */
1257 if (!(uch & RT_BIT(7)))
1258 {
1259 uc = uch;
1260 puch++;
1261 }
1262 else if (uch & RT_BIT(6))
1263 {
1264 /* figure the length and validate the first octet. */
1265/** @todo RT_USE_RTC_3629 */
1266 unsigned cb;
1267 if (!(uch & RT_BIT(5)))
1268 cb = 2;
1269 else if (!(uch & RT_BIT(4)))
1270 cb = 3;
1271 else if (!(uch & RT_BIT(3)))
1272 cb = 4;
1273 else if (!(uch & RT_BIT(2)))
1274 cb = 5;
1275 else if (!(uch & RT_BIT(1)))
1276 cb = 6;
1277 else
1278 {
1279 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1280 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1281 }
1282
1283 /* validate the rest */
1284 switch (cb)
1285 {
1286 case 6:
1287 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1288 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1289 case 5:
1290 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1291 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1292 case 4:
1293 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1294 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1295 case 3:
1296 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1297 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1298 case 2:
1299 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1300 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1301 break;
1302 }
1303
1304 /* get and validate the code point. */
1305 switch (cb)
1306 {
1307 case 6:
1308 uc = (puch[5] & 0x3f)
1309 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1310 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1311 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1312 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1313 | ((RTUNICP)(uch & 0x01) << 30);
1314 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1315 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1316 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1317 break;
1318 case 5:
1319 uc = (puch[4] & 0x3f)
1320 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1321 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1322 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1323 | ((RTUNICP)(uch & 0x03) << 24);
1324 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1325 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1326 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1327 break;
1328 case 4:
1329 uc = (puch[3] & 0x3f)
1330 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1331 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1332 | ((RTUNICP)(uch & 0x07) << 18);
1333 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1334 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1335 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1336 break;
1337 case 3:
1338 uc = (puch[2] & 0x3f)
1339 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1340 | ((RTUNICP)(uch & 0x0f) << 12);
1341 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1342 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1343 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1344 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1345 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1346 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1347 break;
1348 case 2:
1349 uc = (puch[1] & 0x3f)
1350 | ((RTUNICP)(uch & 0x1f) << 6);
1351 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1352 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1353 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1354 break;
1355 default: /* impossible, but GCC is bitching. */
1356 uc = RTUNICP_INVALID;
1357 break;
1358 }
1359 puch += cb;
1360 }
1361 else
1362 {
1363 /* 6th bit is always set. */
1364 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1365 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1366 }
1367 *pCp = uc;
1368 *ppsz = (const char *)puch;
1369 return VINF_SUCCESS;
1370}
1371RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1372
1373
1374/**
1375 * Handle invalid encodings passed to RTStrGetCpNEx().
1376 * @returns rc
1377 * @param ppsz The pointer to the string position point.
1378 * @param pcch Pointer to the string length.
1379 * @param pCp Where to store RTUNICP_INVALID.
1380 * @param rc The iprt error code.
1381 */
1382static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1383{
1384 /*
1385 * Try find a valid encoding.
1386 */
1387 (*ppsz)++; /** @todo code this! */
1388 (*pcch)--;
1389 *pCp = RTUNICP_INVALID;
1390 return rc;
1391}
1392
1393
1394RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1395{
1396 const unsigned char *puch = (const unsigned char *)*ppsz;
1397 const unsigned char uch = *puch;
1398 size_t cch = *pcch;
1399 RTUNICP uc;
1400
1401 if (cch == 0)
1402 {
1403 *pCp = RTUNICP_INVALID;
1404 return VERR_END_OF_STRING;
1405 }
1406
1407 /* ASCII ? */
1408 if (!(uch & RT_BIT(7)))
1409 {
1410 uc = uch;
1411 puch++;
1412 cch--;
1413 }
1414 else if (uch & RT_BIT(6))
1415 {
1416 /* figure the length and validate the first octet. */
1417/** @todo RT_USE_RTC_3629 */
1418 unsigned cb;
1419 if (!(uch & RT_BIT(5)))
1420 cb = 2;
1421 else if (!(uch & RT_BIT(4)))
1422 cb = 3;
1423 else if (!(uch & RT_BIT(3)))
1424 cb = 4;
1425 else if (!(uch & RT_BIT(2)))
1426 cb = 5;
1427 else if (!(uch & RT_BIT(1)))
1428 cb = 6;
1429 else
1430 {
1431 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1432 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1433 }
1434
1435 if (cb > cch)
1436 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1437
1438 /* validate the rest */
1439 switch (cb)
1440 {
1441 case 6:
1442 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1443 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1444 case 5:
1445 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1446 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1447 case 4:
1448 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1449 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1450 case 3:
1451 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1452 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1453 case 2:
1454 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1455 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1456 break;
1457 }
1458
1459 /* get and validate the code point. */
1460 switch (cb)
1461 {
1462 case 6:
1463 uc = (puch[5] & 0x3f)
1464 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1465 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1466 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1467 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1468 | ((RTUNICP)(uch & 0x01) << 30);
1469 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1470 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1471 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1472 break;
1473 case 5:
1474 uc = (puch[4] & 0x3f)
1475 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1476 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1477 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1478 | ((RTUNICP)(uch & 0x03) << 24);
1479 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1480 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1481 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1482 break;
1483 case 4:
1484 uc = (puch[3] & 0x3f)
1485 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1486 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1487 | ((RTUNICP)(uch & 0x07) << 18);
1488 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1489 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1490 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1491 break;
1492 case 3:
1493 uc = (puch[2] & 0x3f)
1494 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1495 | ((RTUNICP)(uch & 0x0f) << 12);
1496 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1497 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1498 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1499 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1500 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1501 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1502 break;
1503 case 2:
1504 uc = (puch[1] & 0x3f)
1505 | ((RTUNICP)(uch & 0x1f) << 6);
1506 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1507 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1508 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1509 break;
1510 default: /* impossible, but GCC is bitching. */
1511 uc = RTUNICP_INVALID;
1512 break;
1513 }
1514 puch += cb;
1515 cch -= cb;
1516 }
1517 else
1518 {
1519 /* 6th bit is always set. */
1520 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1521 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1522 }
1523 *pCp = uc;
1524 *ppsz = (const char *)puch;
1525 (*pcch) = cch;
1526 return VINF_SUCCESS;
1527}
1528RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1529
1530
1531RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1532{
1533 unsigned char *puch = (unsigned char *)psz;
1534 if (uc < 0x80)
1535 *puch++ = (unsigned char )uc;
1536 else if (uc < 0x00000800)
1537 {
1538 *puch++ = 0xc0 | (uc >> 6);
1539 *puch++ = 0x80 | (uc & 0x3f);
1540 }
1541 else if (uc < 0x00010000)
1542 {
1543/** @todo RT_USE_RTC_3629 */
1544 if ( uc < 0x0000d8000
1545 || ( uc > 0x0000dfff
1546 && uc < 0x0000fffe))
1547 {
1548 *puch++ = 0xe0 | (uc >> 12);
1549 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1550 *puch++ = 0x80 | (uc & 0x3f);
1551 }
1552 else
1553 {
1554 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1555 *puch++ = 0x7f;
1556 }
1557 }
1558/** @todo RT_USE_RTC_3629 */
1559 else if (uc < 0x00200000)
1560 {
1561 *puch++ = 0xf0 | (uc >> 18);
1562 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1563 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1564 *puch++ = 0x80 | (uc & 0x3f);
1565 }
1566 else if (uc < 0x04000000)
1567 {
1568 *puch++ = 0xf8 | (uc >> 24);
1569 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1570 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1571 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1572 *puch++ = 0x80 | (uc & 0x3f);
1573 }
1574 else if (uc <= 0x7fffffff)
1575 {
1576 *puch++ = 0xfc | (uc >> 30);
1577 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1578 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1579 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1580 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1581 *puch++ = 0x80 | (uc & 0x3f);
1582 }
1583 else
1584 {
1585 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1586 *puch++ = 0x7f;
1587 }
1588
1589 return (char *)puch;
1590}
1591RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1592
1593
1594RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1595{
1596 if (pszStart < psz)
1597 {
1598 /* simple char? */
1599 const unsigned char *puch = (const unsigned char *)psz;
1600 unsigned uch = *--puch;
1601 if (!(uch & RT_BIT(7)))
1602 return (char *)puch;
1603 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1604
1605 /* two or more. */
1606 uint32_t uMask = 0xffffffc0;
1607 while ( (const unsigned char *)pszStart < puch
1608 && !(uMask & 1))
1609 {
1610 uch = *--puch;
1611 if ((uch & 0xc0) != 0x80)
1612 {
1613 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1614 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1615 (char *)pszStart);
1616 return (char *)puch;
1617 }
1618 uMask >>= 1;
1619 }
1620 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1621 }
1622 return (char *)pszStart;
1623}
1624RT_EXPORT_SYMBOL(RTStrPrevCp);
1625
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette