VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 76734

Last change on this file since 76734 was 76553, checked in by vboxsync, 6 years ago

scm --update-copyright-year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 70.7 KB
Line 
1/* $Id: utf-8.cpp 76553 2019-01-01 01:45:53Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2019 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include <iprt/latin1.h>
33#include "internal/iprt.h"
34
35#include <iprt/uni.h>
36#include <iprt/asm.h>
37#include <iprt/alloc.h>
38#include <iprt/assert.h>
39#include <iprt/err.h>
40#include "internal/string.h"
41
42
43
44/**
45 * Get get length in code points of a UTF-8 encoded string.
46 * The string is validated while doing this.
47 *
48 * @returns IPRT status code.
49 * @param psz Pointer to the UTF-8 string.
50 * @param cch The max length of the string. (btw cch = cb)
51 * Use RTSTR_MAX if all of the string is to be examined.
52 * @param pcuc Where to store the length in unicode code points.
53 * @param pcchActual Where to store the actual size of the UTF-8 string
54 * on success (cch = cb again). Optional.
55 */
56DECLHIDDEN(int) rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
57{
58 const unsigned char *puch = (const unsigned char *)psz;
59 size_t cCodePoints = 0;
60 while (cch > 0)
61 {
62 const unsigned char uch = *puch;
63 if (!uch)
64 break;
65 if (uch & RT_BIT(7))
66 {
67 /* figure sequence length and validate the first byte */
68/** @todo RT_USE_RTC_3629 */
69 unsigned cb;
70 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
71 cb = 2;
72 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
73 cb = 3;
74 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
75 cb = 4;
76 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
77 cb = 5;
78 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
79 cb = 6;
80 else
81 {
82 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
83 return VERR_INVALID_UTF8_ENCODING;
84 }
85
86 /* check length */
87 if (cb > cch)
88 {
89 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
90 return VERR_INVALID_UTF8_ENCODING;
91 }
92
93 /* validate the rest */
94 switch (cb)
95 {
96 case 6:
97 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98 RT_FALL_THRU();
99 case 5:
100 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101 RT_FALL_THRU();
102 case 4:
103 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104 RT_FALL_THRU();
105 case 3:
106 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
107 RT_FALL_THRU();
108 case 2:
109 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
110 break;
111 }
112
113 /* validate the code point. */
114 RTUNICP uc;
115 switch (cb)
116 {
117 case 6:
118 uc = (puch[5] & 0x3f)
119 | ((RTUNICP)(puch[4] & 0x3f) << 6)
120 | ((RTUNICP)(puch[3] & 0x3f) << 12)
121 | ((RTUNICP)(puch[2] & 0x3f) << 18)
122 | ((RTUNICP)(puch[1] & 0x3f) << 24)
123 | ((RTUNICP)(uch & 0x01) << 30);
124 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
125 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
126 break;
127 case 5:
128 uc = (puch[4] & 0x3f)
129 | ((RTUNICP)(puch[3] & 0x3f) << 6)
130 | ((RTUNICP)(puch[2] & 0x3f) << 12)
131 | ((RTUNICP)(puch[1] & 0x3f) << 18)
132 | ((RTUNICP)(uch & 0x03) << 24);
133 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
134 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
135 break;
136 case 4:
137 uc = (puch[3] & 0x3f)
138 | ((RTUNICP)(puch[2] & 0x3f) << 6)
139 | ((RTUNICP)(puch[1] & 0x3f) << 12)
140 | ((RTUNICP)(uch & 0x07) << 18);
141 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
142 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
143 break;
144 case 3:
145 uc = (puch[2] & 0x3f)
146 | ((RTUNICP)(puch[1] & 0x3f) << 6)
147 | ((RTUNICP)(uch & 0x0f) << 12);
148 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
149 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
150 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
151 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
152 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
153 break;
154 case 2:
155 uc = (puch[1] & 0x3f)
156 | ((RTUNICP)(uch & 0x1f) << 6);
157 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
158 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
159 break;
160 }
161
162 /* advance */
163 cch -= cb;
164 puch += cb;
165 }
166 else
167 {
168 /* one ASCII byte */
169 puch++;
170 cch--;
171 }
172 cCodePoints++;
173 }
174
175 /* done */
176 *pcuc = cCodePoints;
177 if (pcchActual)
178 *pcchActual = puch - (unsigned char const *)psz;
179 return VINF_SUCCESS;
180}
181
182
183/**
184 * Decodes and UTF-8 string into an array of unicode code point.
185 *
186 * Since we know the input is valid, we do *not* perform encoding or length checks.
187 *
188 * @returns iprt status code.
189 * @param psz The UTF-8 string to recode. This is a valid encoding.
190 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
191 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
192 * @param paCps Where to store the code points array.
193 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
194 */
195static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
196{
197 int rc = VINF_SUCCESS;
198 const unsigned char *puch = (const unsigned char *)psz;
199 PRTUNICP pCp = paCps;
200 while (cch > 0)
201 {
202 /* read the next char and check for terminator. */
203 const unsigned char uch = *puch;
204 if (uch)
205 { /* we only break once, so consider this the likely branch. */ }
206 else
207 break;
208
209 /* check for output overflow */
210 if (RT_LIKELY(cCps >= 1))
211 { /* likely */ }
212 else
213 {
214 rc = VERR_BUFFER_OVERFLOW;
215 break;
216 }
217 cCps--;
218
219 /* decode and recode the code point */
220 if (!(uch & RT_BIT(7)))
221 {
222 *pCp++ = uch;
223 puch++;
224 cch--;
225 }
226#ifdef RT_STRICT
227 else if (!(uch & RT_BIT(6)))
228 AssertMsgFailed(("Internal error!\n"));
229#endif
230 else if (!(uch & RT_BIT(5)))
231 {
232 *pCp++ = (puch[1] & 0x3f)
233 | ((uint16_t)(uch & 0x1f) << 6);
234 puch += 2;
235 cch -= 2;
236 }
237 else if (!(uch & RT_BIT(4)))
238 {
239 *pCp++ = (puch[2] & 0x3f)
240 | ((uint16_t)(puch[1] & 0x3f) << 6)
241 | ((uint16_t)(uch & 0x0f) << 12);
242 puch += 3;
243 cch -= 3;
244 }
245 else if (!(uch & RT_BIT(3)))
246 {
247 *pCp++ = (puch[3] & 0x3f)
248 | ((RTUNICP)(puch[2] & 0x3f) << 6)
249 | ((RTUNICP)(puch[1] & 0x3f) << 12)
250 | ((RTUNICP)(uch & 0x07) << 18);
251 puch += 4;
252 cch -= 4;
253 }
254 else if (!(uch & RT_BIT(2)))
255 {
256 *pCp++ = (puch[4] & 0x3f)
257 | ((RTUNICP)(puch[3] & 0x3f) << 6)
258 | ((RTUNICP)(puch[2] & 0x3f) << 12)
259 | ((RTUNICP)(puch[1] & 0x3f) << 18)
260 | ((RTUNICP)(uch & 0x03) << 24);
261 puch += 5;
262 cch -= 6;
263 }
264 else
265 {
266 Assert(!(uch & RT_BIT(1)));
267 *pCp++ = (puch[5] & 0x3f)
268 | ((RTUNICP)(puch[4] & 0x3f) << 6)
269 | ((RTUNICP)(puch[3] & 0x3f) << 12)
270 | ((RTUNICP)(puch[2] & 0x3f) << 18)
271 | ((RTUNICP)(puch[1] & 0x3f) << 24)
272 | ((RTUNICP)(uch & 0x01) << 30);
273 puch += 6;
274 cch -= 6;
275 }
276 }
277
278 /* done */
279 *pCp = 0;
280 return rc;
281}
282
283
284RTDECL(size_t) RTStrUniLen(const char *psz)
285{
286 size_t cCodePoints;
287 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
288 return RT_SUCCESS(rc) ? cCodePoints : 0;
289}
290RT_EXPORT_SYMBOL(RTStrUniLen);
291
292
293RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
294{
295 size_t cCodePoints;
296 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
297 if (pcCps)
298 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
299 return rc;
300}
301RT_EXPORT_SYMBOL(RTStrUniLenEx);
302
303
304RTDECL(int) RTStrValidateEncoding(const char *psz)
305{
306 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
307}
308RT_EXPORT_SYMBOL(RTStrValidateEncoding);
309
310
311RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
312{
313 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
314 VERR_INVALID_PARAMETER);
315 AssertPtr(psz);
316
317 /*
318 * Use rtUtf8Length for the job.
319 */
320 size_t cchActual;
321 size_t cCpsIgnored;
322 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
323 if (RT_SUCCESS(rc))
324 {
325 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
326 {
327 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
328 cchActual++;
329 if (cchActual == cch)
330 rc = VINF_SUCCESS;
331 else if (cchActual < cch)
332 rc = VERR_BUFFER_UNDERFLOW;
333 else
334 rc = VERR_BUFFER_OVERFLOW;
335 }
336 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
337 && cchActual >= cch)
338 rc = VERR_BUFFER_OVERFLOW;
339 }
340 return rc;
341}
342RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
343
344
345RTDECL(bool) RTStrIsValidEncoding(const char *psz)
346{
347 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
348 return RT_SUCCESS(rc);
349}
350RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
351
352
353RTDECL(size_t) RTStrPurgeEncoding(char *psz)
354{
355 size_t cErrors = 0;
356 for (;;)
357 {
358 RTUNICP Cp;
359 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
360 if (RT_SUCCESS(rc))
361 {
362 if (!Cp)
363 break;
364 }
365 else
366 {
367 psz[-1] = '?';
368 cErrors++;
369 }
370 }
371 return cErrors;
372}
373RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
374
375
376/**
377 * Helper for RTStrPurgeComplementSet.
378 *
379 * @returns true if @a Cp is valid, false if not.
380 * @param Cp The code point to validate.
381 * @param puszValidPairs Pair of valid code point sets.
382 * @param cValidPairs Number of pairs.
383 */
384DECLINLINE(bool) rtStrPurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
385{
386 while (cValidPairs-- > 0)
387 {
388 if ( Cp >= puszValidPairs[0]
389 && Cp <= puszValidPairs[1])
390 return true;
391 puszValidPairs += 2;
392 }
393 return false;
394}
395
396
397RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidPairs, char chReplacement)
398{
399 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
400
401 /*
402 * Calc valid pairs and check that we've got an even number.
403 */
404 uint32_t cValidPairs = 0;
405 while (puszValidPairs[cValidPairs * 2])
406 {
407 AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
408 AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
409 ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
410 cValidPairs++;
411 }
412
413 /*
414 * Do the replacing.
415 */
416 ssize_t cReplacements = 0;
417 for (;;)
418 {
419 char *pszCur = psz;
420 RTUNICP Cp;
421 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
422 if (RT_SUCCESS(rc))
423 {
424 if (Cp)
425 {
426 if (!rtStrPurgeIsInSet(Cp, puszValidPairs, cValidPairs))
427 {
428 for (; pszCur != psz; ++pszCur)
429 *pszCur = chReplacement;
430 ++cReplacements;
431 }
432 }
433 else
434 break;
435 }
436 else
437 return -1;
438 }
439 return cReplacements;
440}
441RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
442
443
444RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
445{
446 /*
447 * Validate input.
448 */
449 Assert(VALID_PTR(pszString));
450 Assert(VALID_PTR(ppaCps));
451 *ppaCps = NULL;
452
453 /*
454 * Validate the UTF-8 input and count its code points.
455 */
456 size_t cCps;
457 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
458 if (RT_SUCCESS(rc))
459 {
460 /*
461 * Allocate buffer.
462 */
463 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
464 if (paCps)
465 {
466 /*
467 * Decode the string.
468 */
469 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
470 if (RT_SUCCESS(rc))
471 {
472 *ppaCps = paCps;
473 return rc;
474 }
475 RTMemFree(paCps);
476 }
477 else
478 rc = VERR_NO_CODE_POINT_MEMORY;
479 }
480 return rc;
481}
482RT_EXPORT_SYMBOL(RTStrToUni);
483
484
485RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
486{
487 /*
488 * Validate input.
489 */
490 Assert(VALID_PTR(pszString));
491 Assert(VALID_PTR(ppaCps));
492 Assert(!pcCps || VALID_PTR(pcCps));
493
494 /*
495 * Validate the UTF-8 input and count the code points.
496 */
497 size_t cCpsResult;
498 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
499 if (RT_SUCCESS(rc))
500 {
501 if (pcCps)
502 *pcCps = cCpsResult;
503
504 /*
505 * Check buffer size / Allocate buffer.
506 */
507 bool fShouldFree;
508 PRTUNICP paCpsResult;
509 if (cCps > 0 && *ppaCps)
510 {
511 fShouldFree = false;
512 if (cCps <= cCpsResult)
513 return VERR_BUFFER_OVERFLOW;
514 paCpsResult = *ppaCps;
515 }
516 else
517 {
518 *ppaCps = NULL;
519 fShouldFree = true;
520 cCps = RT_MAX(cCpsResult + 1, cCps);
521 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
522 }
523 if (paCpsResult)
524 {
525 /*
526 * Encode the UTF-16 string.
527 */
528 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
529 if (RT_SUCCESS(rc))
530 {
531 *ppaCps = paCpsResult;
532 return rc;
533 }
534 if (fShouldFree)
535 RTMemFree(paCpsResult);
536 }
537 else
538 rc = VERR_NO_CODE_POINT_MEMORY;
539 }
540 return rc;
541}
542RT_EXPORT_SYMBOL(RTStrToUniEx);
543
544
545/**
546 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
547 *
548 * @returns IPRT status code.
549 * @param psz Pointer to the UTF-8 string.
550 * @param cch The max length of the string. (btw cch = cb)
551 * @param pcwc Where to store the length of the UTF-16 string as a number
552 * of RTUTF16 characters.
553 * @sa rtUtf8CalcUtf16Length
554 */
555static int rtUtf8CalcUtf16LengthN(const char *psz, size_t cch, size_t *pcwc)
556{
557 const unsigned char *puch = (const unsigned char *)psz;
558 size_t cwc = 0;
559 while (cch > 0)
560 {
561 const unsigned char uch = *puch;
562 if (!(uch & RT_BIT(7)))
563 {
564 /* one ASCII byte */
565 if (uch)
566 {
567 cwc++;
568 puch++;
569 cch--;
570 }
571 else
572 break;
573 }
574 else
575 {
576 /*
577 * Multibyte sequence is more complicated when we have length
578 * restrictions on the input.
579 */
580 /* figure sequence length and validate the first byte */
581 unsigned cb;
582 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
583 cb = 2;
584 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
585 cb = 3;
586 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
587 cb = 4;
588 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
589 cb = 5;
590 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
591 cb = 6;
592 else
593 {
594 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
595 return VERR_INVALID_UTF8_ENCODING;
596 }
597
598 /* check length */
599 if (cb > cch)
600 {
601 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
602 return VERR_INVALID_UTF8_ENCODING;
603 }
604
605 /* validate the rest */
606 switch (cb)
607 {
608 case 6:
609 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
610 RT_FALL_THRU();
611 case 5:
612 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
613 RT_FALL_THRU();
614 case 4:
615 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
616 RT_FALL_THRU();
617 case 3:
618 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
619 RT_FALL_THRU();
620 case 2:
621 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
622 break;
623 }
624
625 /* validate the code point. */
626 RTUNICP uc;
627 switch (cb)
628 {
629 case 6:
630 uc = (puch[5] & 0x3f)
631 | ((RTUNICP)(puch[4] & 0x3f) << 6)
632 | ((RTUNICP)(puch[3] & 0x3f) << 12)
633 | ((RTUNICP)(puch[2] & 0x3f) << 18)
634 | ((RTUNICP)(puch[1] & 0x3f) << 24)
635 | ((RTUNICP)(uch & 0x01) << 30);
636 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
637 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
638 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
639 return VERR_CANT_RECODE_AS_UTF16;
640 case 5:
641 uc = (puch[4] & 0x3f)
642 | ((RTUNICP)(puch[3] & 0x3f) << 6)
643 | ((RTUNICP)(puch[2] & 0x3f) << 12)
644 | ((RTUNICP)(puch[1] & 0x3f) << 18)
645 | ((RTUNICP)(uch & 0x03) << 24);
646 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
647 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
648 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
649 return VERR_CANT_RECODE_AS_UTF16;
650 case 4:
651 uc = (puch[3] & 0x3f)
652 | ((RTUNICP)(puch[2] & 0x3f) << 6)
653 | ((RTUNICP)(puch[1] & 0x3f) << 12)
654 | ((RTUNICP)(uch & 0x07) << 18);
655 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
656 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
657 RTStrAssertMsgReturn(uc <= 0x0010ffff,
658 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
659 cwc++;
660 break;
661 case 3:
662 uc = (puch[2] & 0x3f)
663 | ((RTUNICP)(puch[1] & 0x3f) << 6)
664 | ((RTUNICP)(uch & 0x0f) << 12);
665 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
666 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
667 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
668 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
669 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
670 break;
671 case 2:
672 uc = (puch[1] & 0x3f)
673 | ((RTUNICP)(uch & 0x1f) << 6);
674 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
675 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
676 break;
677 }
678
679 /* advance */
680 cch -= cb;
681 puch += cb;
682 cwc++;
683 }
684 }
685
686 /* done */
687 *pcwc = cwc;
688 return VINF_SUCCESS;
689}
690
691
692/**
693 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
694 *
695 * @returns IPRT status code.
696 * @param psz Pointer to the UTF-8 string.
697 * @param pcwc Where to store the length of the UTF-16 string as a number
698 * of RTUTF16 characters.
699 * @sa rtUtf8CalcUtf16LengthN
700 */
701static int rtUtf8CalcUtf16Length(const char *psz, size_t *pcwc)
702{
703 const unsigned char *puch = (const unsigned char *)psz;
704 size_t cwc = 0;
705 for (;;)
706 {
707 const unsigned char uch = *puch;
708 if (!(uch & RT_BIT(7)))
709 {
710 /* one ASCII byte */
711 if (uch)
712 {
713 cwc++;
714 puch++;
715 }
716 else
717 break;
718 }
719 else
720 {
721 /*
722 * Figure sequence length, implicitly validate the first byte.
723 * Then validate the additional bytes.
724 * Finally validate the code point.
725 */
726 unsigned cb;
727 RTUNICP uc;
728 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
729 {
730 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
731 uc = (puch[1] & 0x3f)
732 | ((RTUNICP)(uch & 0x1f) << 6);
733 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
734 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
735 cb = 2;
736 }
737 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
738 {
739 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
740 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
741 uc = (puch[2] & 0x3f)
742 | ((RTUNICP)(puch[1] & 0x3f) << 6)
743 | ((RTUNICP)(uch & 0x0f) << 12);
744 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
745 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
746 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
747 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
748 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
749 cb = 3;
750 }
751 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
752 {
753 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
754 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
755 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
756 uc = (puch[3] & 0x3f)
757 | ((RTUNICP)(puch[2] & 0x3f) << 6)
758 | ((RTUNICP)(puch[1] & 0x3f) << 12)
759 | ((RTUNICP)(uch & 0x07) << 18);
760 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
761 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
762 RTStrAssertMsgReturn(uc <= 0x0010ffff,
763 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
764 cwc++;
765 cb = 4;
766 }
767 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
768 {
769 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
770 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
771 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
772 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
773 uc = (puch[4] & 0x3f)
774 | ((RTUNICP)(puch[3] & 0x3f) << 6)
775 | ((RTUNICP)(puch[2] & 0x3f) << 12)
776 | ((RTUNICP)(puch[1] & 0x3f) << 18)
777 | ((RTUNICP)(uch & 0x03) << 24);
778 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
779 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
780 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
781 return VERR_CANT_RECODE_AS_UTF16;
782 //cb = 5;
783 }
784 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
785 {
786 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
787 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
788 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
789 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
790 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
791 uc = (puch[5] & 0x3f)
792 | ((RTUNICP)(puch[4] & 0x3f) << 6)
793 | ((RTUNICP)(puch[3] & 0x3f) << 12)
794 | ((RTUNICP)(puch[2] & 0x3f) << 18)
795 | ((RTUNICP)(puch[1] & 0x3f) << 24)
796 | ((RTUNICP)(uch & 0x01) << 30);
797 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
798 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
799 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
800 return VERR_CANT_RECODE_AS_UTF16;
801 //cb = 6;
802 }
803 else
804 {
805 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
806 return VERR_INVALID_UTF8_ENCODING;
807 }
808
809 /* advance */
810 puch += cb;
811 cwc++;
812 }
813 }
814
815 /* done */
816 *pcwc = cwc;
817 return VINF_SUCCESS;
818}
819
820
821
822/**
823 * Recodes a valid UTF-8 string as UTF-16.
824 *
825 * Since we know the input is valid, we do *not* perform encoding or length checks.
826 *
827 * @returns iprt status code.
828 * @param psz The UTF-8 string to recode. This is a valid encoding.
829 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
830 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
831 * @param pwsz Where to store the UTF-16 string.
832 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
833 *
834 * @note rtUtf8RecodeAsUtf16Big is a duplicate with RT_H2BE_U16 applied.
835 */
836static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
837{
838 int rc = VINF_SUCCESS;
839 const unsigned char *puch = (const unsigned char *)psz;
840 PRTUTF16 pwc = pwsz;
841 while (cch > 0)
842 {
843 /* read the next char and check for terminator. */
844 const unsigned char uch = *puch;
845 if (uch)
846 { /* we only break once, so consider this the likely branch. */ }
847 else
848 break;
849
850 /* check for output overflow */
851 if (RT_LIKELY(cwc >= 1))
852 { /* likely */ }
853 else
854 {
855 rc = VERR_BUFFER_OVERFLOW;
856 break;
857 }
858 cwc--;
859
860 /* decode and recode the code point */
861 if (!(uch & RT_BIT(7)))
862 {
863 *pwc++ = uch;
864 puch++;
865 cch--;
866 }
867 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
868 {
869 uint16_t uc = (puch[1] & 0x3f)
870 | ((uint16_t)(uch & 0x1f) << 6);
871 *pwc++ = uc;
872 puch += 2;
873 cch -= 2;
874 }
875 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
876 {
877 uint16_t uc = (puch[2] & 0x3f)
878 | ((uint16_t)(puch[1] & 0x3f) << 6)
879 | ((uint16_t)(uch & 0x0f) << 12);
880 *pwc++ = uc;
881 puch += 3;
882 cch -= 3;
883 }
884 else
885 {
886 /* generate surrogate pair */
887 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
888 RTUNICP uc = (puch[3] & 0x3f)
889 | ((RTUNICP)(puch[2] & 0x3f) << 6)
890 | ((RTUNICP)(puch[1] & 0x3f) << 12)
891 | ((RTUNICP)(uch & 0x07) << 18);
892 if (RT_UNLIKELY(cwc < 1))
893 {
894 rc = VERR_BUFFER_OVERFLOW;
895 break;
896 }
897 cwc--;
898
899 uc -= 0x10000;
900 *pwc++ = 0xd800 | (uc >> 10);
901 *pwc++ = 0xdc00 | (uc & 0x3ff);
902 puch += 4;
903 cch -= 4;
904 }
905 }
906
907 /* done */
908 *pwc = '\0';
909 return rc;
910}
911
912
913/**
914 * Recodes a valid UTF-8 string as UTF-16BE.
915 *
916 * Since we know the input is valid, we do *not* perform encoding or length checks.
917 *
918 * @returns iprt status code.
919 * @param psz The UTF-8 string to recode. This is a valid encoding.
920 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
921 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
922 * @param pwsz Where to store the UTF-16BE string.
923 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
924 *
925 * @note This is a copy of rtUtf8RecodeAsUtf16 with RT_H2BE_U16 applied.
926 */
927static int rtUtf8RecodeAsUtf16Big(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
928{
929 int rc = VINF_SUCCESS;
930 const unsigned char *puch = (const unsigned char *)psz;
931 PRTUTF16 pwc = pwsz;
932 while (cch > 0)
933 {
934 /* read the next char and check for terminator. */
935 const unsigned char uch = *puch;
936 if (uch)
937 { /* we only break once, so consider this the likely branch. */ }
938 else
939 break;
940
941 /* check for output overflow */
942 if (RT_LIKELY(cwc >= 1))
943 { /* likely */ }
944 else
945 {
946 rc = VERR_BUFFER_OVERFLOW;
947 break;
948 }
949 cwc--;
950
951 /* decode and recode the code point */
952 if (!(uch & RT_BIT(7)))
953 {
954 *pwc++ = RT_H2BE_U16((RTUTF16)uch);
955 puch++;
956 cch--;
957 }
958 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
959 {
960 uint16_t uc = (puch[1] & 0x3f)
961 | ((uint16_t)(uch & 0x1f) << 6);
962 *pwc++ = RT_H2BE_U16(uc);
963 puch += 2;
964 cch -= 2;
965 }
966 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
967 {
968 uint16_t uc = (puch[2] & 0x3f)
969 | ((uint16_t)(puch[1] & 0x3f) << 6)
970 | ((uint16_t)(uch & 0x0f) << 12);
971 *pwc++ = RT_H2BE_U16(uc);
972 puch += 3;
973 cch -= 3;
974 }
975 else
976 {
977 /* generate surrogate pair */
978 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
979 RTUNICP uc = (puch[3] & 0x3f)
980 | ((RTUNICP)(puch[2] & 0x3f) << 6)
981 | ((RTUNICP)(puch[1] & 0x3f) << 12)
982 | ((RTUNICP)(uch & 0x07) << 18);
983 if (RT_UNLIKELY(cwc < 1))
984 {
985 rc = VERR_BUFFER_OVERFLOW;
986 break;
987 }
988 cwc--;
989
990 uc -= 0x10000;
991 *pwc++ = RT_H2BE_U16(0xd800 | (uc >> 10));
992 *pwc++ = RT_H2BE_U16(0xdc00 | (uc & 0x3ff));
993 puch += 4;
994 cch -= 4;
995 }
996 }
997
998 /* done */
999 *pwc = '\0';
1000 return rc;
1001}
1002
1003
1004RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
1005{
1006 /*
1007 * Validate input.
1008 */
1009 Assert(VALID_PTR(ppwszString));
1010 Assert(VALID_PTR(pszString));
1011 *ppwszString = NULL;
1012
1013 /*
1014 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1015 */
1016 size_t cwc;
1017 int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1018 if (RT_SUCCESS(rc))
1019 {
1020 /*
1021 * Allocate buffer.
1022 */
1023 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1024 if (pwsz)
1025 {
1026 /*
1027 * Encode the UTF-16 string.
1028 */
1029 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1030 if (RT_SUCCESS(rc))
1031 {
1032 *ppwszString = pwsz;
1033 return rc;
1034 }
1035 RTMemFree(pwsz);
1036 }
1037 else
1038 rc = VERR_NO_UTF16_MEMORY;
1039 }
1040 return rc;
1041}
1042RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
1043
1044
1045RTDECL(int) RTStrToUtf16BigTag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
1046{
1047 /*
1048 * Validate input.
1049 */
1050 Assert(VALID_PTR(ppwszString));
1051 Assert(VALID_PTR(pszString));
1052 *ppwszString = NULL;
1053
1054 /*
1055 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1056 */
1057 size_t cwc;
1058 int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1059 if (RT_SUCCESS(rc))
1060 {
1061 /*
1062 * Allocate buffer.
1063 */
1064 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1065 if (pwsz)
1066 {
1067 /*
1068 * Encode the UTF-16 string.
1069 */
1070 rc = rtUtf8RecodeAsUtf16Big(pszString, RTSTR_MAX, pwsz, cwc);
1071 if (RT_SUCCESS(rc))
1072 {
1073 *ppwszString = pwsz;
1074 return rc;
1075 }
1076 RTMemFree(pwsz);
1077 }
1078 else
1079 rc = VERR_NO_UTF16_MEMORY;
1080 }
1081 return rc;
1082}
1083RT_EXPORT_SYMBOL(RTStrToUtf16BigTag);
1084
1085
1086RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
1087 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1088{
1089 /*
1090 * Validate input.
1091 */
1092 Assert(VALID_PTR(pszString));
1093 Assert(VALID_PTR(ppwsz));
1094 Assert(!pcwc || VALID_PTR(pcwc));
1095
1096 /*
1097 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1098 */
1099 size_t cwcResult;
1100 int rc;
1101 if (cchString != RTSTR_MAX)
1102 rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1103 else
1104 rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1105 if (RT_SUCCESS(rc))
1106 {
1107 if (pcwc)
1108 *pcwc = cwcResult;
1109
1110 /*
1111 * Check buffer size / Allocate buffer.
1112 */
1113 bool fShouldFree;
1114 PRTUTF16 pwszResult;
1115 if (cwc > 0 && *ppwsz)
1116 {
1117 fShouldFree = false;
1118 if (cwc <= cwcResult)
1119 return VERR_BUFFER_OVERFLOW;
1120 pwszResult = *ppwsz;
1121 }
1122 else
1123 {
1124 *ppwsz = NULL;
1125 fShouldFree = true;
1126 cwc = RT_MAX(cwcResult + 1, cwc);
1127 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1128 }
1129 if (pwszResult)
1130 {
1131 /*
1132 * Encode the UTF-16 string.
1133 */
1134 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1135 if (RT_SUCCESS(rc))
1136 {
1137 *ppwsz = pwszResult;
1138 return rc;
1139 }
1140 if (fShouldFree)
1141 RTMemFree(pwszResult);
1142 }
1143 else
1144 rc = VERR_NO_UTF16_MEMORY;
1145 }
1146 return rc;
1147}
1148RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
1149
1150
1151RTDECL(int) RTStrToUtf16BigExTag(const char *pszString, size_t cchString,
1152 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1153{
1154 /*
1155 * Validate input.
1156 */
1157 Assert(VALID_PTR(pszString));
1158 Assert(VALID_PTR(ppwsz));
1159 Assert(!pcwc || VALID_PTR(pcwc));
1160
1161 /*
1162 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1163 */
1164 size_t cwcResult;
1165 int rc;
1166 if (cchString != RTSTR_MAX)
1167 rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1168 else
1169 rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1170 if (RT_SUCCESS(rc))
1171 {
1172 if (pcwc)
1173 *pcwc = cwcResult;
1174
1175 /*
1176 * Check buffer size / Allocate buffer.
1177 */
1178 bool fShouldFree;
1179 PRTUTF16 pwszResult;
1180 if (cwc > 0 && *ppwsz)
1181 {
1182 fShouldFree = false;
1183 if (cwc <= cwcResult)
1184 return VERR_BUFFER_OVERFLOW;
1185 pwszResult = *ppwsz;
1186 }
1187 else
1188 {
1189 *ppwsz = NULL;
1190 fShouldFree = true;
1191 cwc = RT_MAX(cwcResult + 1, cwc);
1192 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1193 }
1194 if (pwszResult)
1195 {
1196 /*
1197 * Encode the UTF-16BE string.
1198 */
1199 rc = rtUtf8RecodeAsUtf16Big(pszString, cchString, pwszResult, cwc - 1);
1200 if (RT_SUCCESS(rc))
1201 {
1202 *ppwsz = pwszResult;
1203 return rc;
1204 }
1205 if (fShouldFree)
1206 RTMemFree(pwszResult);
1207 }
1208 else
1209 rc = VERR_NO_UTF16_MEMORY;
1210 }
1211 return rc;
1212}
1213RT_EXPORT_SYMBOL(RTStrToUtf16BigExTag);
1214
1215
1216RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
1217{
1218 size_t cwc;
1219 int rc = rtUtf8CalcUtf16Length(psz, &cwc);
1220 return RT_SUCCESS(rc) ? cwc : 0;
1221}
1222RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
1223
1224
1225RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1226{
1227 size_t cwc;
1228 int rc;
1229 if (cch != RTSTR_MAX)
1230 rc = rtUtf8CalcUtf16LengthN(psz, cch, &cwc);
1231 else
1232 rc = rtUtf8CalcUtf16Length(psz, &cwc);
1233 if (pcwc)
1234 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1235 return rc;
1236}
1237RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
1238
1239
1240/**
1241 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
1242 *
1243 * @returns iprt status code.
1244 * @param psz The Latin-1 string.
1245 * @param cchIn The max length of the Latin-1 string to consider.
1246 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
1247 */
1248static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
1249{
1250 size_t cch = 0;
1251 for (;;)
1252 {
1253 RTUNICP Cp;
1254 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
1255 if (Cp == 0 || rc == VERR_END_OF_STRING)
1256 break;
1257 if (RT_FAILURE(rc))
1258 return rc;
1259 cch += RTStrCpSize(Cp); /* cannot fail */
1260 }
1261
1262 /* done */
1263 *pcch = cch;
1264 return VINF_SUCCESS;
1265}
1266
1267
1268/**
1269 * Recodes a Latin-1 string as UTF-8.
1270 *
1271 * @returns iprt status code.
1272 * @param pszIn The Latin-1 string.
1273 * @param cchIn The number of characters to process from psz. The recoding
1274 * will stop when cch or '\\0' is reached.
1275 * @param psz Where to store the UTF-8 string.
1276 * @param cch The size of the UTF-8 buffer, excluding the terminator.
1277 */
1278static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1279{
1280 int rc;
1281 for (;;)
1282 {
1283 RTUNICP Cp;
1284 size_t cchCp;
1285 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
1286 if (Cp == 0 || RT_FAILURE(rc))
1287 break;
1288 cchCp = RTStrCpSize(Cp);
1289 if (RT_UNLIKELY(cch < cchCp))
1290 {
1291 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1292 rc = VERR_BUFFER_OVERFLOW;
1293 break;
1294 }
1295 cch -= cchCp;
1296 psz = RTStrPutCp(psz, Cp);
1297 }
1298
1299 /* done */
1300 if (rc == VERR_END_OF_STRING)
1301 rc = VINF_SUCCESS;
1302 *psz = '\0';
1303 return rc;
1304}
1305
1306
1307
1308RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
1309{
1310 /*
1311 * Validate input.
1312 */
1313 Assert(VALID_PTR(ppszString));
1314 Assert(VALID_PTR(pszString));
1315 *ppszString = NULL;
1316
1317 /*
1318 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
1319 */
1320 size_t cch;
1321 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
1322 if (RT_SUCCESS(rc))
1323 {
1324 /*
1325 * Allocate buffer and recode it.
1326 */
1327 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
1328 if (pszResult)
1329 {
1330 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
1331 if (RT_SUCCESS(rc))
1332 {
1333 *ppszString = pszResult;
1334 return rc;
1335 }
1336
1337 RTMemFree(pszResult);
1338 }
1339 else
1340 rc = VERR_NO_STR_MEMORY;
1341 }
1342 return rc;
1343}
1344RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
1345
1346
1347RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1348{
1349 /*
1350 * Validate input.
1351 */
1352 Assert(VALID_PTR(pszString));
1353 Assert(VALID_PTR(ppsz));
1354 Assert(!pcch || VALID_PTR(pcch));
1355
1356 /*
1357 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
1358 */
1359 size_t cchResult;
1360 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
1361 if (RT_SUCCESS(rc))
1362 {
1363 if (pcch)
1364 *pcch = cchResult;
1365
1366 /*
1367 * Check buffer size / Allocate buffer and recode it.
1368 */
1369 bool fShouldFree;
1370 char *pszResult;
1371 if (cch > 0 && *ppsz)
1372 {
1373 fShouldFree = false;
1374 if (RT_UNLIKELY(cch <= cchResult))
1375 return VERR_BUFFER_OVERFLOW;
1376 pszResult = *ppsz;
1377 }
1378 else
1379 {
1380 *ppsz = NULL;
1381 fShouldFree = true;
1382 cch = RT_MAX(cch, cchResult + 1);
1383 pszResult = (char *)RTStrAllocTag(cch, pszTag);
1384 }
1385 if (pszResult)
1386 {
1387 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
1388 if (RT_SUCCESS(rc))
1389 {
1390 *ppsz = pszResult;
1391 return rc;
1392 }
1393
1394 if (fShouldFree)
1395 RTStrFree(pszResult);
1396 }
1397 else
1398 rc = VERR_NO_STR_MEMORY;
1399 }
1400 return rc;
1401}
1402RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1403
1404
1405RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1406{
1407 size_t cch;
1408 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1409 return RT_SUCCESS(rc) ? cch : 0;
1410}
1411RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1412
1413
1414RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
1415{
1416 size_t cch;
1417 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1418 if (pcch)
1419 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1420 return rc;
1421}
1422RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1423
1424
1425/**
1426 * Calculates the Latin-1 length of a string, validating the encoding while
1427 * doing so.
1428 *
1429 * @returns IPRT status code.
1430 * @param psz Pointer to the UTF-8 string.
1431 * @param cchIn The max length of the string. (btw cch = cb)
1432 * Use RTSTR_MAX if all of the string is to be examined.
1433 * @param pcch Where to store the length of the Latin-1 string in bytes.
1434 */
1435static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
1436{
1437 size_t cch = 0;
1438 for (;;)
1439 {
1440 RTUNICP Cp;
1441 size_t cchCp;
1442 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1443 if (Cp == 0 || rc == VERR_END_OF_STRING)
1444 break;
1445 if (RT_FAILURE(rc))
1446 return rc;
1447 cchCp = RTLatin1CpSize(Cp);
1448 if (cchCp == 0)
1449 return VERR_NO_TRANSLATION;
1450 cch += cchCp;
1451 }
1452
1453 /* done */
1454 *pcch = cch;
1455 return VINF_SUCCESS;
1456}
1457
1458
1459/**
1460 * Recodes a valid UTF-8 string as Latin-1.
1461 *
1462 * Since we know the input is valid, we do *not* perform encoding or length checks.
1463 *
1464 * @returns iprt status code.
1465 * @param pszIn The UTF-8 string to recode. This is a valid encoding.
1466 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1467 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1468 * @param psz Where to store the Latin-1 string.
1469 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1470 */
1471static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1472{
1473 int rc;
1474 for (;;)
1475 {
1476 RTUNICP Cp;
1477 size_t cchCp;
1478 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1479 if (Cp == 0 || RT_FAILURE(rc))
1480 break;
1481 cchCp = RTLatin1CpSize(Cp);
1482 if (RT_UNLIKELY(cch < cchCp))
1483 {
1484 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1485 rc = VERR_BUFFER_OVERFLOW;
1486 break;
1487 }
1488 cch -= cchCp;
1489 psz = RTLatin1PutCp(psz, Cp);
1490 }
1491
1492 /* done */
1493 if (rc == VERR_END_OF_STRING)
1494 rc = VINF_SUCCESS;
1495 *psz = '\0';
1496 return rc;
1497}
1498
1499
1500
1501RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1502{
1503 /*
1504 * Validate input.
1505 */
1506 Assert(VALID_PTR(ppszString));
1507 Assert(VALID_PTR(pszString));
1508 *ppszString = NULL;
1509
1510 /*
1511 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1512 */
1513 size_t cch;
1514 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1515 if (RT_SUCCESS(rc))
1516 {
1517 /*
1518 * Allocate buffer.
1519 */
1520 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1521 if (psz)
1522 {
1523 /*
1524 * Encode the UTF-16 string.
1525 */
1526 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1527 if (RT_SUCCESS(rc))
1528 {
1529 *ppszString = psz;
1530 return rc;
1531 }
1532 RTMemFree(psz);
1533 }
1534 else
1535 rc = VERR_NO_STR_MEMORY;
1536 }
1537 return rc;
1538}
1539RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1540
1541
1542RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1543 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1544{
1545 /*
1546 * Validate input.
1547 */
1548 Assert(VALID_PTR(pszString));
1549 Assert(VALID_PTR(ppsz));
1550 Assert(!pcch || VALID_PTR(pcch));
1551
1552 /*
1553 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1554 */
1555 size_t cchResult;
1556 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1557 if (RT_SUCCESS(rc))
1558 {
1559 if (pcch)
1560 *pcch = cchResult;
1561
1562 /*
1563 * Check buffer size / Allocate buffer.
1564 */
1565 bool fShouldFree;
1566 char *pszResult;
1567 if (cch > 0 && *ppsz)
1568 {
1569 fShouldFree = false;
1570 if (cch <= cchResult)
1571 return VERR_BUFFER_OVERFLOW;
1572 pszResult = *ppsz;
1573 }
1574 else
1575 {
1576 *ppsz = NULL;
1577 fShouldFree = true;
1578 cch = RT_MAX(cchResult + 1, cch);
1579 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1580 }
1581 if (pszResult)
1582 {
1583 /*
1584 * Encode the Latin-1 string.
1585 */
1586 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1587 if (RT_SUCCESS(rc))
1588 {
1589 *ppsz = pszResult;
1590 return rc;
1591 }
1592 if (fShouldFree)
1593 RTMemFree(pszResult);
1594 }
1595 else
1596 rc = VERR_NO_STR_MEMORY;
1597 }
1598 return rc;
1599}
1600RT_EXPORT_SYMBOL(RTStrToLatin1ExTag);
1601
1602
1603RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1604{
1605 size_t cch;
1606 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1607 return RT_SUCCESS(rc) ? cch : 0;
1608}
1609RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1610
1611
1612RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1613{
1614 size_t cch;
1615 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1616 if (pcch)
1617 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1618 return rc;
1619}
1620RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1621
1622
1623/**
1624 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1625 * @returns rc
1626 * @param ppsz The pointer to the string position point.
1627 * @param pCp Where to store RTUNICP_INVALID.
1628 * @param rc The iprt error code.
1629 */
1630static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1631{
1632 /*
1633 * Try find a valid encoding.
1634 */
1635 (*ppsz)++; /** @todo code this! */
1636 *pCp = RTUNICP_INVALID;
1637 return rc;
1638}
1639
1640
1641RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1642{
1643 RTUNICP Cp;
1644 RTStrGetCpExInternal(&psz, &Cp);
1645 return Cp;
1646}
1647RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1648
1649
1650RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1651{
1652 const unsigned char *puch = (const unsigned char *)*ppsz;
1653 const unsigned char uch = *puch;
1654 RTUNICP uc;
1655
1656 /* ASCII ? */
1657 if (!(uch & RT_BIT(7)))
1658 {
1659 uc = uch;
1660 puch++;
1661 }
1662 else if (uch & RT_BIT(6))
1663 {
1664 /* figure the length and validate the first octet. */
1665/** @todo RT_USE_RTC_3629 */
1666 unsigned cb;
1667 if (!(uch & RT_BIT(5)))
1668 cb = 2;
1669 else if (!(uch & RT_BIT(4)))
1670 cb = 3;
1671 else if (!(uch & RT_BIT(3)))
1672 cb = 4;
1673 else if (!(uch & RT_BIT(2)))
1674 cb = 5;
1675 else if (!(uch & RT_BIT(1)))
1676 cb = 6;
1677 else
1678 {
1679 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1680 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1681 }
1682
1683 /* validate the rest */
1684 switch (cb)
1685 {
1686 case 6:
1687 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1688 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1689 RT_FALL_THRU();
1690 case 5:
1691 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1692 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1693 RT_FALL_THRU();
1694 case 4:
1695 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1696 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1697 RT_FALL_THRU();
1698 case 3:
1699 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1700 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1701 RT_FALL_THRU();
1702 case 2:
1703 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1704 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1705 break;
1706 }
1707
1708 /* get and validate the code point. */
1709 switch (cb)
1710 {
1711 case 6:
1712 uc = (puch[5] & 0x3f)
1713 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1714 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1715 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1716 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1717 | ((RTUNICP)(uch & 0x01) << 30);
1718 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1719 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1720 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1721 break;
1722 case 5:
1723 uc = (puch[4] & 0x3f)
1724 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1725 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1726 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1727 | ((RTUNICP)(uch & 0x03) << 24);
1728 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1729 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1730 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1731 break;
1732 case 4:
1733 uc = (puch[3] & 0x3f)
1734 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1735 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1736 | ((RTUNICP)(uch & 0x07) << 18);
1737 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1738 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1739 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1740 break;
1741 case 3:
1742 uc = (puch[2] & 0x3f)
1743 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1744 | ((RTUNICP)(uch & 0x0f) << 12);
1745 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1746 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1747 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1748 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1749 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1750 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1751 break;
1752 case 2:
1753 uc = (puch[1] & 0x3f)
1754 | ((RTUNICP)(uch & 0x1f) << 6);
1755 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1756 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1757 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1758 break;
1759 default: /* impossible, but GCC is bitching. */
1760 uc = RTUNICP_INVALID;
1761 break;
1762 }
1763 puch += cb;
1764 }
1765 else
1766 {
1767 /* 6th bit is always set. */
1768 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1769 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1770 }
1771 *pCp = uc;
1772 *ppsz = (const char *)puch;
1773 return VINF_SUCCESS;
1774}
1775RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1776
1777
1778/**
1779 * Handle invalid encodings passed to RTStrGetCpNEx().
1780 * @returns rc
1781 * @param ppsz The pointer to the string position point.
1782 * @param pcch Pointer to the string length.
1783 * @param pCp Where to store RTUNICP_INVALID.
1784 * @param rc The iprt error code.
1785 */
1786static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1787{
1788 /*
1789 * Try find a valid encoding.
1790 */
1791 (*ppsz)++; /** @todo code this! */
1792 (*pcch)--;
1793 *pCp = RTUNICP_INVALID;
1794 return rc;
1795}
1796
1797
1798RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1799{
1800 const unsigned char *puch = (const unsigned char *)*ppsz;
1801 const unsigned char uch = *puch;
1802 size_t cch = *pcch;
1803 RTUNICP uc;
1804
1805 if (cch == 0)
1806 {
1807 *pCp = RTUNICP_INVALID;
1808 return VERR_END_OF_STRING;
1809 }
1810
1811 /* ASCII ? */
1812 if (!(uch & RT_BIT(7)))
1813 {
1814 uc = uch;
1815 puch++;
1816 cch--;
1817 }
1818 else if (uch & RT_BIT(6))
1819 {
1820 /* figure the length and validate the first octet. */
1821/** @todo RT_USE_RTC_3629 */
1822 unsigned cb;
1823 if (!(uch & RT_BIT(5)))
1824 cb = 2;
1825 else if (!(uch & RT_BIT(4)))
1826 cb = 3;
1827 else if (!(uch & RT_BIT(3)))
1828 cb = 4;
1829 else if (!(uch & RT_BIT(2)))
1830 cb = 5;
1831 else if (!(uch & RT_BIT(1)))
1832 cb = 6;
1833 else
1834 {
1835 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1836 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1837 }
1838
1839 if (cb > cch)
1840 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1841
1842 /* validate the rest */
1843 switch (cb)
1844 {
1845 case 6:
1846 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1847 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1848 RT_FALL_THRU();
1849 case 5:
1850 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1851 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1852 RT_FALL_THRU();
1853 case 4:
1854 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1855 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1856 RT_FALL_THRU();
1857 case 3:
1858 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1859 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1860 RT_FALL_THRU();
1861 case 2:
1862 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1863 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1864 break;
1865 }
1866
1867 /* get and validate the code point. */
1868 switch (cb)
1869 {
1870 case 6:
1871 uc = (puch[5] & 0x3f)
1872 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1873 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1874 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1875 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1876 | ((RTUNICP)(uch & 0x01) << 30);
1877 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1878 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1879 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1880 break;
1881 case 5:
1882 uc = (puch[4] & 0x3f)
1883 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1884 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1885 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1886 | ((RTUNICP)(uch & 0x03) << 24);
1887 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1888 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1889 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1890 break;
1891 case 4:
1892 uc = (puch[3] & 0x3f)
1893 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1894 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1895 | ((RTUNICP)(uch & 0x07) << 18);
1896 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1897 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1898 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1899 break;
1900 case 3:
1901 uc = (puch[2] & 0x3f)
1902 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1903 | ((RTUNICP)(uch & 0x0f) << 12);
1904 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1905 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1906 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1907 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1908 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1909 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1910 break;
1911 case 2:
1912 uc = (puch[1] & 0x3f)
1913 | ((RTUNICP)(uch & 0x1f) << 6);
1914 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1915 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1916 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1917 break;
1918 default: /* impossible, but GCC is bitching. */
1919 uc = RTUNICP_INVALID;
1920 break;
1921 }
1922 puch += cb;
1923 cch -= cb;
1924 }
1925 else
1926 {
1927 /* 6th bit is always set. */
1928 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1929 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1930 }
1931 *pCp = uc;
1932 *ppsz = (const char *)puch;
1933 (*pcch) = cch;
1934 return VINF_SUCCESS;
1935}
1936RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1937
1938
1939RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1940{
1941 unsigned char *puch = (unsigned char *)psz;
1942 if (uc < 0x80)
1943 *puch++ = (unsigned char )uc;
1944 else if (uc < 0x00000800)
1945 {
1946 *puch++ = 0xc0 | (uc >> 6);
1947 *puch++ = 0x80 | (uc & 0x3f);
1948 }
1949 else if (uc < 0x00010000)
1950 {
1951/** @todo RT_USE_RTC_3629 */
1952 if ( uc < 0x0000d8000
1953 || ( uc > 0x0000dfff
1954 && uc < 0x0000fffe))
1955 {
1956 *puch++ = 0xe0 | (uc >> 12);
1957 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1958 *puch++ = 0x80 | (uc & 0x3f);
1959 }
1960 else
1961 {
1962 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1963 *puch++ = 0x7f;
1964 }
1965 }
1966/** @todo RT_USE_RTC_3629 */
1967 else if (uc < 0x00200000)
1968 {
1969 *puch++ = 0xf0 | (uc >> 18);
1970 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1971 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1972 *puch++ = 0x80 | (uc & 0x3f);
1973 }
1974 else if (uc < 0x04000000)
1975 {
1976 *puch++ = 0xf8 | (uc >> 24);
1977 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1978 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1979 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1980 *puch++ = 0x80 | (uc & 0x3f);
1981 }
1982 else if (uc <= 0x7fffffff)
1983 {
1984 *puch++ = 0xfc | (uc >> 30);
1985 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1986 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1987 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1988 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1989 *puch++ = 0x80 | (uc & 0x3f);
1990 }
1991 else
1992 {
1993 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1994 *puch++ = 0x7f;
1995 }
1996
1997 return (char *)puch;
1998}
1999RT_EXPORT_SYMBOL(RTStrPutCpInternal);
2000
2001
2002RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
2003{
2004 if (pszStart < psz)
2005 {
2006 /* simple char? */
2007 const unsigned char *puch = (const unsigned char *)psz;
2008 unsigned uch = *--puch;
2009 if (!(uch & RT_BIT(7)))
2010 return (char *)puch;
2011 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
2012
2013 /* two or more. */
2014 uint32_t uMask = 0xffffffc0;
2015 while ( (const unsigned char *)pszStart < puch
2016 && !(uMask & 1))
2017 {
2018 uch = *--puch;
2019 if ((uch & 0xc0) != 0x80)
2020 {
2021 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
2022 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
2023 (char *)pszStart);
2024 return (char *)puch;
2025 }
2026 uMask >>= 1;
2027 }
2028 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
2029 }
2030 return (char *)pszStart;
2031}
2032RT_EXPORT_SYMBOL(RTStrPrevCp);
2033
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette