VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 72778

Last change on this file since 72778 was 70156, checked in by vboxsync, 7 years ago

Runtime: fix Utf-8 for Linux R0.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 70.7 KB
Line 
1/* $Id: utf-8.cpp 70156 2017-12-15 15:52:10Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2017 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/asm.h>
36#include <iprt/alloc.h>
37#include <iprt/assert.h>
38#include <iprt/err.h>
39#include "internal/string.h"
40
41
42
43/**
44 * Get get length in code points of a UTF-8 encoded string.
45 * The string is validated while doing this.
46 *
47 * @returns IPRT status code.
48 * @param psz Pointer to the UTF-8 string.
49 * @param cch The max length of the string. (btw cch = cb)
50 * Use RTSTR_MAX if all of the string is to be examined.
51 * @param pcuc Where to store the length in unicode code points.
52 * @param pcchActual Where to store the actual size of the UTF-8 string
53 * on success (cch = cb again). Optional.
54 */
55DECLHIDDEN(int) rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
56{
57 const unsigned char *puch = (const unsigned char *)psz;
58 size_t cCodePoints = 0;
59 while (cch > 0)
60 {
61 const unsigned char uch = *puch;
62 if (!uch)
63 break;
64 if (uch & RT_BIT(7))
65 {
66 /* figure sequence length and validate the first byte */
67/** @todo RT_USE_RTC_3629 */
68 unsigned cb;
69 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
70 cb = 2;
71 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
72 cb = 3;
73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
74 cb = 4;
75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
76 cb = 5;
77 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
78 cb = 6;
79 else
80 {
81 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
82 return VERR_INVALID_UTF8_ENCODING;
83 }
84
85 /* check length */
86 if (cb > cch)
87 {
88 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
89 return VERR_INVALID_UTF8_ENCODING;
90 }
91
92 /* validate the rest */
93 switch (cb)
94 {
95 case 6:
96 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97 RT_FALL_THRU();
98 case 5:
99 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100 RT_FALL_THRU();
101 case 4:
102 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103 RT_FALL_THRU();
104 case 3:
105 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
106 RT_FALL_THRU();
107 case 2:
108 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
109 break;
110 }
111
112 /* validate the code point. */
113 RTUNICP uc;
114 switch (cb)
115 {
116 case 6:
117 uc = (puch[5] & 0x3f)
118 | ((RTUNICP)(puch[4] & 0x3f) << 6)
119 | ((RTUNICP)(puch[3] & 0x3f) << 12)
120 | ((RTUNICP)(puch[2] & 0x3f) << 18)
121 | ((RTUNICP)(puch[1] & 0x3f) << 24)
122 | ((RTUNICP)(uch & 0x01) << 30);
123 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
124 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
125 break;
126 case 5:
127 uc = (puch[4] & 0x3f)
128 | ((RTUNICP)(puch[3] & 0x3f) << 6)
129 | ((RTUNICP)(puch[2] & 0x3f) << 12)
130 | ((RTUNICP)(puch[1] & 0x3f) << 18)
131 | ((RTUNICP)(uch & 0x03) << 24);
132 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
133 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
134 break;
135 case 4:
136 uc = (puch[3] & 0x3f)
137 | ((RTUNICP)(puch[2] & 0x3f) << 6)
138 | ((RTUNICP)(puch[1] & 0x3f) << 12)
139 | ((RTUNICP)(uch & 0x07) << 18);
140 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
141 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
142 break;
143 case 3:
144 uc = (puch[2] & 0x3f)
145 | ((RTUNICP)(puch[1] & 0x3f) << 6)
146 | ((RTUNICP)(uch & 0x0f) << 12);
147 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
148 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
149 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
150 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
151 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
152 break;
153 case 2:
154 uc = (puch[1] & 0x3f)
155 | ((RTUNICP)(uch & 0x1f) << 6);
156 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
157 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
158 break;
159 }
160
161 /* advance */
162 cch -= cb;
163 puch += cb;
164 }
165 else
166 {
167 /* one ASCII byte */
168 puch++;
169 cch--;
170 }
171 cCodePoints++;
172 }
173
174 /* done */
175 *pcuc = cCodePoints;
176 if (pcchActual)
177 *pcchActual = puch - (unsigned char const *)psz;
178 return VINF_SUCCESS;
179}
180
181
182/**
183 * Decodes and UTF-8 string into an array of unicode code point.
184 *
185 * Since we know the input is valid, we do *not* perform encoding or length checks.
186 *
187 * @returns iprt status code.
188 * @param psz The UTF-8 string to recode. This is a valid encoding.
189 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
190 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
191 * @param paCps Where to store the code points array.
192 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
193 */
194static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
195{
196 int rc = VINF_SUCCESS;
197 const unsigned char *puch = (const unsigned char *)psz;
198 PRTUNICP pCp = paCps;
199 while (cch > 0)
200 {
201 /* read the next char and check for terminator. */
202 const unsigned char uch = *puch;
203 if (uch)
204 { /* we only break once, so consider this the likely branch. */ }
205 else
206 break;
207
208 /* check for output overflow */
209 if (RT_LIKELY(cCps >= 1))
210 { /* likely */ }
211 else
212 {
213 rc = VERR_BUFFER_OVERFLOW;
214 break;
215 }
216 cCps--;
217
218 /* decode and recode the code point */
219 if (!(uch & RT_BIT(7)))
220 {
221 *pCp++ = uch;
222 puch++;
223 cch--;
224 }
225#ifdef RT_STRICT
226 else if (!(uch & RT_BIT(6)))
227 AssertMsgFailed(("Internal error!\n"));
228#endif
229 else if (!(uch & RT_BIT(5)))
230 {
231 *pCp++ = (puch[1] & 0x3f)
232 | ((uint16_t)(uch & 0x1f) << 6);
233 puch += 2;
234 cch -= 2;
235 }
236 else if (!(uch & RT_BIT(4)))
237 {
238 *pCp++ = (puch[2] & 0x3f)
239 | ((uint16_t)(puch[1] & 0x3f) << 6)
240 | ((uint16_t)(uch & 0x0f) << 12);
241 puch += 3;
242 cch -= 3;
243 }
244 else if (!(uch & RT_BIT(3)))
245 {
246 *pCp++ = (puch[3] & 0x3f)
247 | ((RTUNICP)(puch[2] & 0x3f) << 6)
248 | ((RTUNICP)(puch[1] & 0x3f) << 12)
249 | ((RTUNICP)(uch & 0x07) << 18);
250 puch += 4;
251 cch -= 4;
252 }
253 else if (!(uch & RT_BIT(2)))
254 {
255 *pCp++ = (puch[4] & 0x3f)
256 | ((RTUNICP)(puch[3] & 0x3f) << 6)
257 | ((RTUNICP)(puch[2] & 0x3f) << 12)
258 | ((RTUNICP)(puch[1] & 0x3f) << 18)
259 | ((RTUNICP)(uch & 0x03) << 24);
260 puch += 5;
261 cch -= 6;
262 }
263 else
264 {
265 Assert(!(uch & RT_BIT(1)));
266 *pCp++ = (puch[5] & 0x3f)
267 | ((RTUNICP)(puch[4] & 0x3f) << 6)
268 | ((RTUNICP)(puch[3] & 0x3f) << 12)
269 | ((RTUNICP)(puch[2] & 0x3f) << 18)
270 | ((RTUNICP)(puch[1] & 0x3f) << 24)
271 | ((RTUNICP)(uch & 0x01) << 30);
272 puch += 6;
273 cch -= 6;
274 }
275 }
276
277 /* done */
278 *pCp = 0;
279 return rc;
280}
281
282
283RTDECL(size_t) RTStrUniLen(const char *psz)
284{
285 size_t cCodePoints;
286 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
287 return RT_SUCCESS(rc) ? cCodePoints : 0;
288}
289RT_EXPORT_SYMBOL(RTStrUniLen);
290
291
292RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
293{
294 size_t cCodePoints;
295 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
296 if (pcCps)
297 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
298 return rc;
299}
300RT_EXPORT_SYMBOL(RTStrUniLenEx);
301
302
303RTDECL(int) RTStrValidateEncoding(const char *psz)
304{
305 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
306}
307RT_EXPORT_SYMBOL(RTStrValidateEncoding);
308
309
310RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
311{
312 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
313 VERR_INVALID_PARAMETER);
314 AssertPtr(psz);
315
316 /*
317 * Use rtUtf8Length for the job.
318 */
319 size_t cchActual;
320 size_t cCpsIgnored;
321 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
322 if (RT_SUCCESS(rc))
323 {
324 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
325 {
326 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
327 cchActual++;
328 if (cchActual == cch)
329 rc = VINF_SUCCESS;
330 else if (cchActual < cch)
331 rc = VERR_BUFFER_UNDERFLOW;
332 else
333 rc = VERR_BUFFER_OVERFLOW;
334 }
335 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
336 && cchActual >= cch)
337 rc = VERR_BUFFER_OVERFLOW;
338 }
339 return rc;
340}
341RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
342
343
344RTDECL(bool) RTStrIsValidEncoding(const char *psz)
345{
346 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
347 return RT_SUCCESS(rc);
348}
349RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
350
351
352RTDECL(size_t) RTStrPurgeEncoding(char *psz)
353{
354 size_t cErrors = 0;
355 for (;;)
356 {
357 RTUNICP Cp;
358 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
359 if (RT_SUCCESS(rc))
360 {
361 if (!Cp)
362 break;
363 }
364 else
365 {
366 psz[-1] = '?';
367 cErrors++;
368 }
369 }
370 return cErrors;
371}
372RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
373
374
375/**
376 * Helper for RTStrPurgeComplementSet.
377 *
378 * @returns true if @a Cp is valid, false if not.
379 * @param Cp The code point to validate.
380 * @param puszValidPairs Pair of valid code point sets.
381 * @param cValidPairs Number of pairs.
382 */
383DECLINLINE(bool) rtStrPurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
384{
385 while (cValidPairs-- > 0)
386 {
387 if ( Cp >= puszValidPairs[0]
388 && Cp <= puszValidPairs[1])
389 return true;
390 puszValidPairs += 2;
391 }
392 return false;
393}
394
395
396RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidPairs, char chReplacement)
397{
398 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
399
400 /*
401 * Calc valid pairs and check that we've got an even number.
402 */
403 uint32_t cValidPairs = 0;
404 while (puszValidPairs[cValidPairs * 2])
405 {
406 AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
407 AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
408 ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
409 cValidPairs++;
410 }
411
412 /*
413 * Do the replacing.
414 */
415 ssize_t cReplacements = 0;
416 for (;;)
417 {
418 char *pszCur = psz;
419 RTUNICP Cp;
420 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
421 if (RT_SUCCESS(rc))
422 {
423 if (Cp)
424 {
425 if (!rtStrPurgeIsInSet(Cp, puszValidPairs, cValidPairs))
426 {
427 for (; pszCur != psz; ++pszCur)
428 *pszCur = chReplacement;
429 ++cReplacements;
430 }
431 }
432 else
433 break;
434 }
435 else
436 return -1;
437 }
438 return cReplacements;
439}
440RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
441
442
443RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
444{
445 /*
446 * Validate input.
447 */
448 Assert(VALID_PTR(pszString));
449 Assert(VALID_PTR(ppaCps));
450 *ppaCps = NULL;
451
452 /*
453 * Validate the UTF-8 input and count its code points.
454 */
455 size_t cCps;
456 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
457 if (RT_SUCCESS(rc))
458 {
459 /*
460 * Allocate buffer.
461 */
462 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
463 if (paCps)
464 {
465 /*
466 * Decode the string.
467 */
468 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
469 if (RT_SUCCESS(rc))
470 {
471 *ppaCps = paCps;
472 return rc;
473 }
474 RTMemFree(paCps);
475 }
476 else
477 rc = VERR_NO_CODE_POINT_MEMORY;
478 }
479 return rc;
480}
481RT_EXPORT_SYMBOL(RTStrToUni);
482
483
484RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
485{
486 /*
487 * Validate input.
488 */
489 Assert(VALID_PTR(pszString));
490 Assert(VALID_PTR(ppaCps));
491 Assert(!pcCps || VALID_PTR(pcCps));
492
493 /*
494 * Validate the UTF-8 input and count the code points.
495 */
496 size_t cCpsResult;
497 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
498 if (RT_SUCCESS(rc))
499 {
500 if (pcCps)
501 *pcCps = cCpsResult;
502
503 /*
504 * Check buffer size / Allocate buffer.
505 */
506 bool fShouldFree;
507 PRTUNICP paCpsResult;
508 if (cCps > 0 && *ppaCps)
509 {
510 fShouldFree = false;
511 if (cCps <= cCpsResult)
512 return VERR_BUFFER_OVERFLOW;
513 paCpsResult = *ppaCps;
514 }
515 else
516 {
517 *ppaCps = NULL;
518 fShouldFree = true;
519 cCps = RT_MAX(cCpsResult + 1, cCps);
520 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
521 }
522 if (paCpsResult)
523 {
524 /*
525 * Encode the UTF-16 string.
526 */
527 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
528 if (RT_SUCCESS(rc))
529 {
530 *ppaCps = paCpsResult;
531 return rc;
532 }
533 if (fShouldFree)
534 RTMemFree(paCpsResult);
535 }
536 else
537 rc = VERR_NO_CODE_POINT_MEMORY;
538 }
539 return rc;
540}
541RT_EXPORT_SYMBOL(RTStrToUniEx);
542
543
544/**
545 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
546 *
547 * @returns IPRT status code.
548 * @param psz Pointer to the UTF-8 string.
549 * @param cch The max length of the string. (btw cch = cb)
550 * @param pcwc Where to store the length of the UTF-16 string as a number
551 * of RTUTF16 characters.
552 * @sa rtUtf8CalcUtf16Length
553 */
554static int rtUtf8CalcUtf16LengthN(const char *psz, size_t cch, size_t *pcwc)
555{
556 const unsigned char *puch = (const unsigned char *)psz;
557 size_t cwc = 0;
558 while (cch > 0)
559 {
560 const unsigned char uch = *puch;
561 if (!(uch & RT_BIT(7)))
562 {
563 /* one ASCII byte */
564 if (uch)
565 {
566 cwc++;
567 puch++;
568 cch--;
569 }
570 else
571 break;
572 }
573 else
574 {
575 /*
576 * Multibyte sequence is more complicated when we have length
577 * restrictions on the input.
578 */
579 /* figure sequence length and validate the first byte */
580 unsigned cb;
581 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
582 cb = 2;
583 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
584 cb = 3;
585 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
586 cb = 4;
587 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
588 cb = 5;
589 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
590 cb = 6;
591 else
592 {
593 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
594 return VERR_INVALID_UTF8_ENCODING;
595 }
596
597 /* check length */
598 if (cb > cch)
599 {
600 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
601 return VERR_INVALID_UTF8_ENCODING;
602 }
603
604 /* validate the rest */
605 switch (cb)
606 {
607 case 6:
608 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
609 RT_FALL_THRU();
610 case 5:
611 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
612 RT_FALL_THRU();
613 case 4:
614 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
615 RT_FALL_THRU();
616 case 3:
617 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
618 RT_FALL_THRU();
619 case 2:
620 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
621 break;
622 }
623
624 /* validate the code point. */
625 RTUNICP uc;
626 switch (cb)
627 {
628 case 6:
629 uc = (puch[5] & 0x3f)
630 | ((RTUNICP)(puch[4] & 0x3f) << 6)
631 | ((RTUNICP)(puch[3] & 0x3f) << 12)
632 | ((RTUNICP)(puch[2] & 0x3f) << 18)
633 | ((RTUNICP)(puch[1] & 0x3f) << 24)
634 | ((RTUNICP)(uch & 0x01) << 30);
635 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
636 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
637 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
638 return VERR_CANT_RECODE_AS_UTF16;
639 case 5:
640 uc = (puch[4] & 0x3f)
641 | ((RTUNICP)(puch[3] & 0x3f) << 6)
642 | ((RTUNICP)(puch[2] & 0x3f) << 12)
643 | ((RTUNICP)(puch[1] & 0x3f) << 18)
644 | ((RTUNICP)(uch & 0x03) << 24);
645 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
646 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
647 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
648 return VERR_CANT_RECODE_AS_UTF16;
649 case 4:
650 uc = (puch[3] & 0x3f)
651 | ((RTUNICP)(puch[2] & 0x3f) << 6)
652 | ((RTUNICP)(puch[1] & 0x3f) << 12)
653 | ((RTUNICP)(uch & 0x07) << 18);
654 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
655 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
656 RTStrAssertMsgReturn(uc <= 0x0010ffff,
657 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
658 cwc++;
659 break;
660 case 3:
661 uc = (puch[2] & 0x3f)
662 | ((RTUNICP)(puch[1] & 0x3f) << 6)
663 | ((RTUNICP)(uch & 0x0f) << 12);
664 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
665 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
666 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
667 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
668 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
669 break;
670 case 2:
671 uc = (puch[1] & 0x3f)
672 | ((RTUNICP)(uch & 0x1f) << 6);
673 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
674 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
675 break;
676 }
677
678 /* advance */
679 cch -= cb;
680 puch += cb;
681 cwc++;
682 }
683 }
684
685 /* done */
686 *pcwc = cwc;
687 return VINF_SUCCESS;
688}
689
690
691/**
692 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
693 *
694 * @returns IPRT status code.
695 * @param psz Pointer to the UTF-8 string.
696 * @param pcwc Where to store the length of the UTF-16 string as a number
697 * of RTUTF16 characters.
698 * @sa rtUtf8CalcUtf16LengthN
699 */
700static int rtUtf8CalcUtf16Length(const char *psz, size_t *pcwc)
701{
702 const unsigned char *puch = (const unsigned char *)psz;
703 size_t cwc = 0;
704 for (;;)
705 {
706 const unsigned char uch = *puch;
707 if (!(uch & RT_BIT(7)))
708 {
709 /* one ASCII byte */
710 if (uch)
711 {
712 cwc++;
713 puch++;
714 }
715 else
716 break;
717 }
718 else
719 {
720 /*
721 * Figure sequence length, implicitly validate the first byte.
722 * Then validate the additional bytes.
723 * Finally validate the code point.
724 */
725 unsigned cb;
726 RTUNICP uc;
727 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
728 {
729 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
730 uc = (puch[1] & 0x3f)
731 | ((RTUNICP)(uch & 0x1f) << 6);
732 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
733 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
734 cb = 2;
735 }
736 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
737 {
738 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
739 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
740 uc = (puch[2] & 0x3f)
741 | ((RTUNICP)(puch[1] & 0x3f) << 6)
742 | ((RTUNICP)(uch & 0x0f) << 12);
743 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
744 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
745 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
746 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
747 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
748 cb = 3;
749 }
750 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
751 {
752 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
753 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
754 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
755 uc = (puch[3] & 0x3f)
756 | ((RTUNICP)(puch[2] & 0x3f) << 6)
757 | ((RTUNICP)(puch[1] & 0x3f) << 12)
758 | ((RTUNICP)(uch & 0x07) << 18);
759 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
760 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
761 RTStrAssertMsgReturn(uc <= 0x0010ffff,
762 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
763 cwc++;
764 cb = 4;
765 }
766 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
767 {
768 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
769 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
770 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
771 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
772 uc = (puch[4] & 0x3f)
773 | ((RTUNICP)(puch[3] & 0x3f) << 6)
774 | ((RTUNICP)(puch[2] & 0x3f) << 12)
775 | ((RTUNICP)(puch[1] & 0x3f) << 18)
776 | ((RTUNICP)(uch & 0x03) << 24);
777 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
778 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
779 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
780 return VERR_CANT_RECODE_AS_UTF16;
781 //cb = 5;
782 }
783 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
784 {
785 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
786 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
787 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
788 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
789 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
790 uc = (puch[5] & 0x3f)
791 | ((RTUNICP)(puch[4] & 0x3f) << 6)
792 | ((RTUNICP)(puch[3] & 0x3f) << 12)
793 | ((RTUNICP)(puch[2] & 0x3f) << 18)
794 | ((RTUNICP)(puch[1] & 0x3f) << 24)
795 | ((RTUNICP)(uch & 0x01) << 30);
796 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
797 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
798 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
799 return VERR_CANT_RECODE_AS_UTF16;
800 //cb = 6;
801 }
802 else
803 {
804 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
805 return VERR_INVALID_UTF8_ENCODING;
806 }
807
808 /* advance */
809 puch += cb;
810 cwc++;
811 }
812 }
813
814 /* done */
815 *pcwc = cwc;
816 return VINF_SUCCESS;
817}
818
819
820
821/**
822 * Recodes a valid UTF-8 string as UTF-16.
823 *
824 * Since we know the input is valid, we do *not* perform encoding or length checks.
825 *
826 * @returns iprt status code.
827 * @param psz The UTF-8 string to recode. This is a valid encoding.
828 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
829 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
830 * @param pwsz Where to store the UTF-16 string.
831 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
832 *
833 * @note rtUtf8RecodeAsUtf16Big is a duplicate with RT_H2BE_U16 applied.
834 */
835static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
836{
837 int rc = VINF_SUCCESS;
838 const unsigned char *puch = (const unsigned char *)psz;
839 PRTUTF16 pwc = pwsz;
840 while (cch > 0)
841 {
842 /* read the next char and check for terminator. */
843 const unsigned char uch = *puch;
844 if (uch)
845 { /* we only break once, so consider this the likely branch. */ }
846 else
847 break;
848
849 /* check for output overflow */
850 if (RT_LIKELY(cwc >= 1))
851 { /* likely */ }
852 else
853 {
854 rc = VERR_BUFFER_OVERFLOW;
855 break;
856 }
857 cwc--;
858
859 /* decode and recode the code point */
860 if (!(uch & RT_BIT(7)))
861 {
862 *pwc++ = uch;
863 puch++;
864 cch--;
865 }
866 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
867 {
868 uint16_t uc = (puch[1] & 0x3f)
869 | ((uint16_t)(uch & 0x1f) << 6);
870 *pwc++ = uc;
871 puch += 2;
872 cch -= 2;
873 }
874 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
875 {
876 uint16_t uc = (puch[2] & 0x3f)
877 | ((uint16_t)(puch[1] & 0x3f) << 6)
878 | ((uint16_t)(uch & 0x0f) << 12);
879 *pwc++ = uc;
880 puch += 3;
881 cch -= 3;
882 }
883 else
884 {
885 /* generate surrogate pair */
886 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
887 RTUNICP uc = (puch[3] & 0x3f)
888 | ((RTUNICP)(puch[2] & 0x3f) << 6)
889 | ((RTUNICP)(puch[1] & 0x3f) << 12)
890 | ((RTUNICP)(uch & 0x07) << 18);
891 if (RT_UNLIKELY(cwc < 1))
892 {
893 rc = VERR_BUFFER_OVERFLOW;
894 break;
895 }
896 cwc--;
897
898 uc -= 0x10000;
899 *pwc++ = 0xd800 | (uc >> 10);
900 *pwc++ = 0xdc00 | (uc & 0x3ff);
901 puch += 4;
902 cch -= 4;
903 }
904 }
905
906 /* done */
907 *pwc = '\0';
908 return rc;
909}
910
911
912/**
913 * Recodes a valid UTF-8 string as UTF-16BE.
914 *
915 * Since we know the input is valid, we do *not* perform encoding or length checks.
916 *
917 * @returns iprt status code.
918 * @param psz The UTF-8 string to recode. This is a valid encoding.
919 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
920 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
921 * @param pwsz Where to store the UTF-16BE string.
922 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
923 *
924 * @note This is a copy of rtUtf8RecodeAsUtf16 with RT_H2BE_U16 applied.
925 */
926static int rtUtf8RecodeAsUtf16Big(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
927{
928 int rc = VINF_SUCCESS;
929 const unsigned char *puch = (const unsigned char *)psz;
930 PRTUTF16 pwc = pwsz;
931 while (cch > 0)
932 {
933 /* read the next char and check for terminator. */
934 const unsigned char uch = *puch;
935 if (uch)
936 { /* we only break once, so consider this the likely branch. */ }
937 else
938 break;
939
940 /* check for output overflow */
941 if (RT_LIKELY(cwc >= 1))
942 { /* likely */ }
943 else
944 {
945 rc = VERR_BUFFER_OVERFLOW;
946 break;
947 }
948 cwc--;
949
950 /* decode and recode the code point */
951 if (!(uch & RT_BIT(7)))
952 {
953 *pwc++ = RT_H2BE_U16((RTUTF16)uch);
954 puch++;
955 cch--;
956 }
957 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
958 {
959 uint16_t uc = (puch[1] & 0x3f)
960 | ((uint16_t)(uch & 0x1f) << 6);
961 *pwc++ = RT_H2BE_U16(uc);
962 puch += 2;
963 cch -= 2;
964 }
965 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
966 {
967 uint16_t uc = (puch[2] & 0x3f)
968 | ((uint16_t)(puch[1] & 0x3f) << 6)
969 | ((uint16_t)(uch & 0x0f) << 12);
970 *pwc++ = RT_H2BE_U16(uc);
971 puch += 3;
972 cch -= 3;
973 }
974 else
975 {
976 /* generate surrogate pair */
977 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
978 RTUNICP uc = (puch[3] & 0x3f)
979 | ((RTUNICP)(puch[2] & 0x3f) << 6)
980 | ((RTUNICP)(puch[1] & 0x3f) << 12)
981 | ((RTUNICP)(uch & 0x07) << 18);
982 if (RT_UNLIKELY(cwc < 1))
983 {
984 rc = VERR_BUFFER_OVERFLOW;
985 break;
986 }
987 cwc--;
988
989 uc -= 0x10000;
990 *pwc++ = RT_H2BE_U16(0xd800 | (uc >> 10));
991 *pwc++ = RT_H2BE_U16(0xdc00 | (uc & 0x3ff));
992 puch += 4;
993 cch -= 4;
994 }
995 }
996
997 /* done */
998 *pwc = '\0';
999 return rc;
1000}
1001
1002
1003RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
1004{
1005 /*
1006 * Validate input.
1007 */
1008 Assert(VALID_PTR(ppwszString));
1009 Assert(VALID_PTR(pszString));
1010 *ppwszString = NULL;
1011
1012 /*
1013 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1014 */
1015 size_t cwc;
1016 int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1017 if (RT_SUCCESS(rc))
1018 {
1019 /*
1020 * Allocate buffer.
1021 */
1022 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1023 if (pwsz)
1024 {
1025 /*
1026 * Encode the UTF-16 string.
1027 */
1028 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1029 if (RT_SUCCESS(rc))
1030 {
1031 *ppwszString = pwsz;
1032 return rc;
1033 }
1034 RTMemFree(pwsz);
1035 }
1036 else
1037 rc = VERR_NO_UTF16_MEMORY;
1038 }
1039 return rc;
1040}
1041RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
1042
1043
1044RTDECL(int) RTStrToUtf16BigTag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
1045{
1046 /*
1047 * Validate input.
1048 */
1049 Assert(VALID_PTR(ppwszString));
1050 Assert(VALID_PTR(pszString));
1051 *ppwszString = NULL;
1052
1053 /*
1054 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1055 */
1056 size_t cwc;
1057 int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1058 if (RT_SUCCESS(rc))
1059 {
1060 /*
1061 * Allocate buffer.
1062 */
1063 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1064 if (pwsz)
1065 {
1066 /*
1067 * Encode the UTF-16 string.
1068 */
1069 rc = rtUtf8RecodeAsUtf16Big(pszString, RTSTR_MAX, pwsz, cwc);
1070 if (RT_SUCCESS(rc))
1071 {
1072 *ppwszString = pwsz;
1073 return rc;
1074 }
1075 RTMemFree(pwsz);
1076 }
1077 else
1078 rc = VERR_NO_UTF16_MEMORY;
1079 }
1080 return rc;
1081}
1082RT_EXPORT_SYMBOL(RTStrToUtf16BigTag);
1083
1084
1085RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
1086 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1087{
1088 /*
1089 * Validate input.
1090 */
1091 Assert(VALID_PTR(pszString));
1092 Assert(VALID_PTR(ppwsz));
1093 Assert(!pcwc || VALID_PTR(pcwc));
1094
1095 /*
1096 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1097 */
1098 size_t cwcResult;
1099 int rc;
1100 if (cchString != RTSTR_MAX)
1101 rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1102 else
1103 rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1104 if (RT_SUCCESS(rc))
1105 {
1106 if (pcwc)
1107 *pcwc = cwcResult;
1108
1109 /*
1110 * Check buffer size / Allocate buffer.
1111 */
1112 bool fShouldFree;
1113 PRTUTF16 pwszResult;
1114 if (cwc > 0 && *ppwsz)
1115 {
1116 fShouldFree = false;
1117 if (cwc <= cwcResult)
1118 return VERR_BUFFER_OVERFLOW;
1119 pwszResult = *ppwsz;
1120 }
1121 else
1122 {
1123 *ppwsz = NULL;
1124 fShouldFree = true;
1125 cwc = RT_MAX(cwcResult + 1, cwc);
1126 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1127 }
1128 if (pwszResult)
1129 {
1130 /*
1131 * Encode the UTF-16 string.
1132 */
1133 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1134 if (RT_SUCCESS(rc))
1135 {
1136 *ppwsz = pwszResult;
1137 return rc;
1138 }
1139 if (fShouldFree)
1140 RTMemFree(pwszResult);
1141 }
1142 else
1143 rc = VERR_NO_UTF16_MEMORY;
1144 }
1145 return rc;
1146}
1147RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
1148
1149
1150RTDECL(int) RTStrToUtf16BigExTag(const char *pszString, size_t cchString,
1151 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1152{
1153 /*
1154 * Validate input.
1155 */
1156 Assert(VALID_PTR(pszString));
1157 Assert(VALID_PTR(ppwsz));
1158 Assert(!pcwc || VALID_PTR(pcwc));
1159
1160 /*
1161 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1162 */
1163 size_t cwcResult;
1164 int rc;
1165 if (cchString != RTSTR_MAX)
1166 rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1167 else
1168 rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1169 if (RT_SUCCESS(rc))
1170 {
1171 if (pcwc)
1172 *pcwc = cwcResult;
1173
1174 /*
1175 * Check buffer size / Allocate buffer.
1176 */
1177 bool fShouldFree;
1178 PRTUTF16 pwszResult;
1179 if (cwc > 0 && *ppwsz)
1180 {
1181 fShouldFree = false;
1182 if (cwc <= cwcResult)
1183 return VERR_BUFFER_OVERFLOW;
1184 pwszResult = *ppwsz;
1185 }
1186 else
1187 {
1188 *ppwsz = NULL;
1189 fShouldFree = true;
1190 cwc = RT_MAX(cwcResult + 1, cwc);
1191 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1192 }
1193 if (pwszResult)
1194 {
1195 /*
1196 * Encode the UTF-16BE string.
1197 */
1198 rc = rtUtf8RecodeAsUtf16Big(pszString, cchString, pwszResult, cwc - 1);
1199 if (RT_SUCCESS(rc))
1200 {
1201 *ppwsz = pwszResult;
1202 return rc;
1203 }
1204 if (fShouldFree)
1205 RTMemFree(pwszResult);
1206 }
1207 else
1208 rc = VERR_NO_UTF16_MEMORY;
1209 }
1210 return rc;
1211}
1212RT_EXPORT_SYMBOL(RTStrToUtf16BigExTag);
1213
1214
1215RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
1216{
1217 size_t cwc;
1218 int rc = rtUtf8CalcUtf16Length(psz, &cwc);
1219 return RT_SUCCESS(rc) ? cwc : 0;
1220}
1221RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
1222
1223
1224RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1225{
1226 size_t cwc;
1227 int rc;
1228 if (cch != RTSTR_MAX)
1229 rc = rtUtf8CalcUtf16LengthN(psz, cch, &cwc);
1230 else
1231 rc = rtUtf8CalcUtf16Length(psz, &cwc);
1232 if (pcwc)
1233 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1234 return rc;
1235}
1236RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
1237
1238
1239/**
1240 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
1241 *
1242 * @returns iprt status code.
1243 * @param psz The Latin-1 string.
1244 * @param cchIn The max length of the Latin-1 string to consider.
1245 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
1246 */
1247static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
1248{
1249 size_t cch = 0;
1250 for (;;)
1251 {
1252 RTUNICP Cp;
1253 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
1254 if (Cp == 0 || rc == VERR_END_OF_STRING)
1255 break;
1256 if (RT_FAILURE(rc))
1257 return rc;
1258 cch += RTStrCpSize(Cp); /* cannot fail */
1259 }
1260
1261 /* done */
1262 *pcch = cch;
1263 return VINF_SUCCESS;
1264}
1265
1266
1267/**
1268 * Recodes a Latin-1 string as UTF-8.
1269 *
1270 * @returns iprt status code.
1271 * @param pszIn The Latin-1 string.
1272 * @param cchIn The number of characters to process from psz. The recoding
1273 * will stop when cch or '\\0' is reached.
1274 * @param psz Where to store the UTF-8 string.
1275 * @param cch The size of the UTF-8 buffer, excluding the terminator.
1276 */
1277static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1278{
1279 int rc;
1280 for (;;)
1281 {
1282 RTUNICP Cp;
1283 size_t cchCp;
1284 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
1285 if (Cp == 0 || RT_FAILURE(rc))
1286 break;
1287 cchCp = RTStrCpSize(Cp);
1288 if (RT_UNLIKELY(cch < cchCp))
1289 {
1290 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1291 rc = VERR_BUFFER_OVERFLOW;
1292 break;
1293 }
1294 cch -= cchCp;
1295 psz = RTStrPutCp(psz, Cp);
1296 }
1297
1298 /* done */
1299 if (rc == VERR_END_OF_STRING)
1300 rc = VINF_SUCCESS;
1301 *psz = '\0';
1302 return rc;
1303}
1304
1305
1306
1307RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
1308{
1309 /*
1310 * Validate input.
1311 */
1312 Assert(VALID_PTR(ppszString));
1313 Assert(VALID_PTR(pszString));
1314 *ppszString = NULL;
1315
1316 /*
1317 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
1318 */
1319 size_t cch;
1320 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
1321 if (RT_SUCCESS(rc))
1322 {
1323 /*
1324 * Allocate buffer and recode it.
1325 */
1326 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
1327 if (pszResult)
1328 {
1329 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
1330 if (RT_SUCCESS(rc))
1331 {
1332 *ppszString = pszResult;
1333 return rc;
1334 }
1335
1336 RTMemFree(pszResult);
1337 }
1338 else
1339 rc = VERR_NO_STR_MEMORY;
1340 }
1341 return rc;
1342}
1343RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
1344
1345
1346RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1347{
1348 /*
1349 * Validate input.
1350 */
1351 Assert(VALID_PTR(pszString));
1352 Assert(VALID_PTR(ppsz));
1353 Assert(!pcch || VALID_PTR(pcch));
1354
1355 /*
1356 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
1357 */
1358 size_t cchResult;
1359 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
1360 if (RT_SUCCESS(rc))
1361 {
1362 if (pcch)
1363 *pcch = cchResult;
1364
1365 /*
1366 * Check buffer size / Allocate buffer and recode it.
1367 */
1368 bool fShouldFree;
1369 char *pszResult;
1370 if (cch > 0 && *ppsz)
1371 {
1372 fShouldFree = false;
1373 if (RT_UNLIKELY(cch <= cchResult))
1374 return VERR_BUFFER_OVERFLOW;
1375 pszResult = *ppsz;
1376 }
1377 else
1378 {
1379 *ppsz = NULL;
1380 fShouldFree = true;
1381 cch = RT_MAX(cch, cchResult + 1);
1382 pszResult = (char *)RTStrAllocTag(cch, pszTag);
1383 }
1384 if (pszResult)
1385 {
1386 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
1387 if (RT_SUCCESS(rc))
1388 {
1389 *ppsz = pszResult;
1390 return rc;
1391 }
1392
1393 if (fShouldFree)
1394 RTStrFree(pszResult);
1395 }
1396 else
1397 rc = VERR_NO_STR_MEMORY;
1398 }
1399 return rc;
1400}
1401RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1402
1403
1404RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1405{
1406 size_t cch;
1407 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1408 return RT_SUCCESS(rc) ? cch : 0;
1409}
1410RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1411
1412
1413RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
1414{
1415 size_t cch;
1416 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1417 if (pcch)
1418 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1419 return rc;
1420}
1421RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1422
1423
1424/**
1425 * Calculates the Latin-1 length of a string, validating the encoding while
1426 * doing so.
1427 *
1428 * @returns IPRT status code.
1429 * @param psz Pointer to the UTF-8 string.
1430 * @param cchIn The max length of the string. (btw cch = cb)
1431 * Use RTSTR_MAX if all of the string is to be examined.
1432 * @param pcch Where to store the length of the Latin-1 string in bytes.
1433 */
1434static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
1435{
1436 size_t cch = 0;
1437 for (;;)
1438 {
1439 RTUNICP Cp;
1440 size_t cchCp;
1441 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1442 if (Cp == 0 || rc == VERR_END_OF_STRING)
1443 break;
1444 if (RT_FAILURE(rc))
1445 return rc;
1446 cchCp = RTLatin1CpSize(Cp);
1447 if (cchCp == 0)
1448 return VERR_NO_TRANSLATION;
1449 cch += cchCp;
1450 }
1451
1452 /* done */
1453 *pcch = cch;
1454 return VINF_SUCCESS;
1455}
1456
1457
1458/**
1459 * Recodes a valid UTF-8 string as Latin-1.
1460 *
1461 * Since we know the input is valid, we do *not* perform encoding or length checks.
1462 *
1463 * @returns iprt status code.
1464 * @param pszIn The UTF-8 string to recode. This is a valid encoding.
1465 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1466 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1467 * @param psz Where to store the Latin-1 string.
1468 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1469 */
1470static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1471{
1472 int rc;
1473 for (;;)
1474 {
1475 RTUNICP Cp;
1476 size_t cchCp;
1477 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1478 if (Cp == 0 || RT_FAILURE(rc))
1479 break;
1480 cchCp = RTLatin1CpSize(Cp);
1481 if (RT_UNLIKELY(cch < cchCp))
1482 {
1483 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1484 rc = VERR_BUFFER_OVERFLOW;
1485 break;
1486 }
1487 cch -= cchCp;
1488 psz = RTLatin1PutCp(psz, Cp);
1489 }
1490
1491 /* done */
1492 if (rc == VERR_END_OF_STRING)
1493 rc = VINF_SUCCESS;
1494 *psz = '\0';
1495 return rc;
1496}
1497
1498
1499
1500RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1501{
1502 /*
1503 * Validate input.
1504 */
1505 Assert(VALID_PTR(ppszString));
1506 Assert(VALID_PTR(pszString));
1507 *ppszString = NULL;
1508
1509 /*
1510 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1511 */
1512 size_t cch;
1513 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1514 if (RT_SUCCESS(rc))
1515 {
1516 /*
1517 * Allocate buffer.
1518 */
1519 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1520 if (psz)
1521 {
1522 /*
1523 * Encode the UTF-16 string.
1524 */
1525 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1526 if (RT_SUCCESS(rc))
1527 {
1528 *ppszString = psz;
1529 return rc;
1530 }
1531 RTMemFree(psz);
1532 }
1533 else
1534 rc = VERR_NO_STR_MEMORY;
1535 }
1536 return rc;
1537}
1538RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1539
1540
1541RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1542 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1543{
1544 /*
1545 * Validate input.
1546 */
1547 Assert(VALID_PTR(pszString));
1548 Assert(VALID_PTR(ppsz));
1549 Assert(!pcch || VALID_PTR(pcch));
1550
1551 /*
1552 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1553 */
1554 size_t cchResult;
1555 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1556 if (RT_SUCCESS(rc))
1557 {
1558 if (pcch)
1559 *pcch = cchResult;
1560
1561 /*
1562 * Check buffer size / Allocate buffer.
1563 */
1564 bool fShouldFree;
1565 char *pszResult;
1566 if (cch > 0 && *ppsz)
1567 {
1568 fShouldFree = false;
1569 if (cch <= cchResult)
1570 return VERR_BUFFER_OVERFLOW;
1571 pszResult = *ppsz;
1572 }
1573 else
1574 {
1575 *ppsz = NULL;
1576 fShouldFree = true;
1577 cch = RT_MAX(cchResult + 1, cch);
1578 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1579 }
1580 if (pszResult)
1581 {
1582 /*
1583 * Encode the Latin-1 string.
1584 */
1585 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1586 if (RT_SUCCESS(rc))
1587 {
1588 *ppsz = pszResult;
1589 return rc;
1590 }
1591 if (fShouldFree)
1592 RTMemFree(pszResult);
1593 }
1594 else
1595 rc = VERR_NO_STR_MEMORY;
1596 }
1597 return rc;
1598}
1599RT_EXPORT_SYMBOL(RTStrToLatin1ExTag);
1600
1601
1602RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1603{
1604 size_t cch;
1605 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1606 return RT_SUCCESS(rc) ? cch : 0;
1607}
1608RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1609
1610
1611RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1612{
1613 size_t cch;
1614 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1615 if (pcch)
1616 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1617 return rc;
1618}
1619RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1620
1621
1622/**
1623 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1624 * @returns rc
1625 * @param ppsz The pointer to the string position point.
1626 * @param pCp Where to store RTUNICP_INVALID.
1627 * @param rc The iprt error code.
1628 */
1629static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1630{
1631 /*
1632 * Try find a valid encoding.
1633 */
1634 (*ppsz)++; /** @todo code this! */
1635 *pCp = RTUNICP_INVALID;
1636 return rc;
1637}
1638
1639
1640RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1641{
1642 RTUNICP Cp;
1643 RTStrGetCpExInternal(&psz, &Cp);
1644 return Cp;
1645}
1646RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1647
1648
1649RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1650{
1651 const unsigned char *puch = (const unsigned char *)*ppsz;
1652 const unsigned char uch = *puch;
1653 RTUNICP uc;
1654
1655 /* ASCII ? */
1656 if (!(uch & RT_BIT(7)))
1657 {
1658 uc = uch;
1659 puch++;
1660 }
1661 else if (uch & RT_BIT(6))
1662 {
1663 /* figure the length and validate the first octet. */
1664/** @todo RT_USE_RTC_3629 */
1665 unsigned cb;
1666 if (!(uch & RT_BIT(5)))
1667 cb = 2;
1668 else if (!(uch & RT_BIT(4)))
1669 cb = 3;
1670 else if (!(uch & RT_BIT(3)))
1671 cb = 4;
1672 else if (!(uch & RT_BIT(2)))
1673 cb = 5;
1674 else if (!(uch & RT_BIT(1)))
1675 cb = 6;
1676 else
1677 {
1678 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1679 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1680 }
1681
1682 /* validate the rest */
1683 switch (cb)
1684 {
1685 case 6:
1686 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1687 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1688 RT_FALL_THRU();
1689 case 5:
1690 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1691 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1692 RT_FALL_THRU();
1693 case 4:
1694 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1695 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1696 RT_FALL_THRU();
1697 case 3:
1698 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1699 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1700 RT_FALL_THRU();
1701 case 2:
1702 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1703 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1704 break;
1705 }
1706
1707 /* get and validate the code point. */
1708 switch (cb)
1709 {
1710 case 6:
1711 uc = (puch[5] & 0x3f)
1712 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1713 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1714 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1715 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1716 | ((RTUNICP)(uch & 0x01) << 30);
1717 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1718 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1719 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1720 break;
1721 case 5:
1722 uc = (puch[4] & 0x3f)
1723 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1724 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1725 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1726 | ((RTUNICP)(uch & 0x03) << 24);
1727 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1728 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1729 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1730 break;
1731 case 4:
1732 uc = (puch[3] & 0x3f)
1733 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1734 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1735 | ((RTUNICP)(uch & 0x07) << 18);
1736 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1737 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1738 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1739 break;
1740 case 3:
1741 uc = (puch[2] & 0x3f)
1742 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1743 | ((RTUNICP)(uch & 0x0f) << 12);
1744 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1745 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1746 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1747 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1748 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1749 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1750 break;
1751 case 2:
1752 uc = (puch[1] & 0x3f)
1753 | ((RTUNICP)(uch & 0x1f) << 6);
1754 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1755 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1756 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1757 break;
1758 default: /* impossible, but GCC is bitching. */
1759 uc = RTUNICP_INVALID;
1760 break;
1761 }
1762 puch += cb;
1763 }
1764 else
1765 {
1766 /* 6th bit is always set. */
1767 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1768 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1769 }
1770 *pCp = uc;
1771 *ppsz = (const char *)puch;
1772 return VINF_SUCCESS;
1773}
1774RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1775
1776
1777/**
1778 * Handle invalid encodings passed to RTStrGetCpNEx().
1779 * @returns rc
1780 * @param ppsz The pointer to the string position point.
1781 * @param pcch Pointer to the string length.
1782 * @param pCp Where to store RTUNICP_INVALID.
1783 * @param rc The iprt error code.
1784 */
1785static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1786{
1787 /*
1788 * Try find a valid encoding.
1789 */
1790 (*ppsz)++; /** @todo code this! */
1791 (*pcch)--;
1792 *pCp = RTUNICP_INVALID;
1793 return rc;
1794}
1795
1796
1797RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1798{
1799 const unsigned char *puch = (const unsigned char *)*ppsz;
1800 const unsigned char uch = *puch;
1801 size_t cch = *pcch;
1802 RTUNICP uc;
1803
1804 if (cch == 0)
1805 {
1806 *pCp = RTUNICP_INVALID;
1807 return VERR_END_OF_STRING;
1808 }
1809
1810 /* ASCII ? */
1811 if (!(uch & RT_BIT(7)))
1812 {
1813 uc = uch;
1814 puch++;
1815 cch--;
1816 }
1817 else if (uch & RT_BIT(6))
1818 {
1819 /* figure the length and validate the first octet. */
1820/** @todo RT_USE_RTC_3629 */
1821 unsigned cb;
1822 if (!(uch & RT_BIT(5)))
1823 cb = 2;
1824 else if (!(uch & RT_BIT(4)))
1825 cb = 3;
1826 else if (!(uch & RT_BIT(3)))
1827 cb = 4;
1828 else if (!(uch & RT_BIT(2)))
1829 cb = 5;
1830 else if (!(uch & RT_BIT(1)))
1831 cb = 6;
1832 else
1833 {
1834 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1835 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1836 }
1837
1838 if (cb > cch)
1839 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1840
1841 /* validate the rest */
1842 switch (cb)
1843 {
1844 case 6:
1845 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1846 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1847 RT_FALL_THRU();
1848 case 5:
1849 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1850 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1851 RT_FALL_THRU();
1852 case 4:
1853 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1854 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1855 RT_FALL_THRU();
1856 case 3:
1857 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1858 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1859 RT_FALL_THRU();
1860 case 2:
1861 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1862 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1863 break;
1864 }
1865
1866 /* get and validate the code point. */
1867 switch (cb)
1868 {
1869 case 6:
1870 uc = (puch[5] & 0x3f)
1871 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1872 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1873 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1874 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1875 | ((RTUNICP)(uch & 0x01) << 30);
1876 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1877 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1878 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1879 break;
1880 case 5:
1881 uc = (puch[4] & 0x3f)
1882 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1883 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1884 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1885 | ((RTUNICP)(uch & 0x03) << 24);
1886 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1887 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1888 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1889 break;
1890 case 4:
1891 uc = (puch[3] & 0x3f)
1892 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1893 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1894 | ((RTUNICP)(uch & 0x07) << 18);
1895 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1896 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1897 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1898 break;
1899 case 3:
1900 uc = (puch[2] & 0x3f)
1901 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1902 | ((RTUNICP)(uch & 0x0f) << 12);
1903 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1904 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1905 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1906 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1907 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1908 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1909 break;
1910 case 2:
1911 uc = (puch[1] & 0x3f)
1912 | ((RTUNICP)(uch & 0x1f) << 6);
1913 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1914 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1915 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1916 break;
1917 default: /* impossible, but GCC is bitching. */
1918 uc = RTUNICP_INVALID;
1919 break;
1920 }
1921 puch += cb;
1922 cch -= cb;
1923 }
1924 else
1925 {
1926 /* 6th bit is always set. */
1927 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1928 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1929 }
1930 *pCp = uc;
1931 *ppsz = (const char *)puch;
1932 (*pcch) = cch;
1933 return VINF_SUCCESS;
1934}
1935RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1936
1937
1938RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1939{
1940 unsigned char *puch = (unsigned char *)psz;
1941 if (uc < 0x80)
1942 *puch++ = (unsigned char )uc;
1943 else if (uc < 0x00000800)
1944 {
1945 *puch++ = 0xc0 | (uc >> 6);
1946 *puch++ = 0x80 | (uc & 0x3f);
1947 }
1948 else if (uc < 0x00010000)
1949 {
1950/** @todo RT_USE_RTC_3629 */
1951 if ( uc < 0x0000d8000
1952 || ( uc > 0x0000dfff
1953 && uc < 0x0000fffe))
1954 {
1955 *puch++ = 0xe0 | (uc >> 12);
1956 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1957 *puch++ = 0x80 | (uc & 0x3f);
1958 }
1959 else
1960 {
1961 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1962 *puch++ = 0x7f;
1963 }
1964 }
1965/** @todo RT_USE_RTC_3629 */
1966 else if (uc < 0x00200000)
1967 {
1968 *puch++ = 0xf0 | (uc >> 18);
1969 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1970 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1971 *puch++ = 0x80 | (uc & 0x3f);
1972 }
1973 else if (uc < 0x04000000)
1974 {
1975 *puch++ = 0xf8 | (uc >> 24);
1976 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1977 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1978 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1979 *puch++ = 0x80 | (uc & 0x3f);
1980 }
1981 else if (uc <= 0x7fffffff)
1982 {
1983 *puch++ = 0xfc | (uc >> 30);
1984 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1985 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1986 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1987 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1988 *puch++ = 0x80 | (uc & 0x3f);
1989 }
1990 else
1991 {
1992 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1993 *puch++ = 0x7f;
1994 }
1995
1996 return (char *)puch;
1997}
1998RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1999
2000
2001RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
2002{
2003 if (pszStart < psz)
2004 {
2005 /* simple char? */
2006 const unsigned char *puch = (const unsigned char *)psz;
2007 unsigned uch = *--puch;
2008 if (!(uch & RT_BIT(7)))
2009 return (char *)puch;
2010 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
2011
2012 /* two or more. */
2013 uint32_t uMask = 0xffffffc0;
2014 while ( (const unsigned char *)pszStart < puch
2015 && !(uMask & 1))
2016 {
2017 uch = *--puch;
2018 if ((uch & 0xc0) != 0x80)
2019 {
2020 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
2021 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
2022 (char *)pszStart);
2023 return (char *)puch;
2024 }
2025 uMask >>= 1;
2026 }
2027 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
2028 }
2029 return (char *)pszStart;
2030}
2031RT_EXPORT_SYMBOL(RTStrPrevCp);
2032
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette