VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 64797

Last change on this file since 64797 was 64633, checked in by vboxsync, 8 years ago

utf-8.cpp: Duplicate rtUtf8CalcUtf16Length so we can optimize the common case of RTSTR_MAX. Also rearranged some string terminator checks to try improve our mojo.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 64.4 KB
Line 
1/* $Id: utf-8.cpp 64633 2016-11-10 15:03:17Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2016 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Get get length in code points of a UTF-8 encoded string.
44 * The string is validated while doing this.
45 *
46 * @returns IPRT status code.
47 * @param psz Pointer to the UTF-8 string.
48 * @param cch The max length of the string. (btw cch = cb)
49 * Use RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcchActual Where to store the actual size of the UTF-8 string
52 * on success (cch = cb again). Optional.
53 */
54DECLHIDDEN(int) rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66/** @todo RT_USE_RTC_3629 */
67 unsigned cb;
68 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
69 cb = 2;
70 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
71 cb = 3;
72 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
73 cb = 4;
74 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
75 cb = 5;
76 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
77 cb = 6;
78 else
79 {
80 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81 return VERR_INVALID_UTF8_ENCODING;
82 }
83
84 /* check length */
85 if (cb > cch)
86 {
87 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88 return VERR_INVALID_UTF8_ENCODING;
89 }
90
91 /* validate the rest */
92 switch (cb)
93 {
94 case 6:
95 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96 case 5:
97 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98 case 4:
99 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100 case 3:
101 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102 case 2:
103 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104 break;
105 }
106
107 /* validate the code point. */
108 RTUNICP uc;
109 switch (cb)
110 {
111 case 6:
112 uc = (puch[5] & 0x3f)
113 | ((RTUNICP)(puch[4] & 0x3f) << 6)
114 | ((RTUNICP)(puch[3] & 0x3f) << 12)
115 | ((RTUNICP)(puch[2] & 0x3f) << 18)
116 | ((RTUNICP)(puch[1] & 0x3f) << 24)
117 | ((RTUNICP)(uch & 0x01) << 30);
118 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120 break;
121 case 5:
122 uc = (puch[4] & 0x3f)
123 | ((RTUNICP)(puch[3] & 0x3f) << 6)
124 | ((RTUNICP)(puch[2] & 0x3f) << 12)
125 | ((RTUNICP)(puch[1] & 0x3f) << 18)
126 | ((RTUNICP)(uch & 0x03) << 24);
127 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129 break;
130 case 4:
131 uc = (puch[3] & 0x3f)
132 | ((RTUNICP)(puch[2] & 0x3f) << 6)
133 | ((RTUNICP)(puch[1] & 0x3f) << 12)
134 | ((RTUNICP)(uch & 0x07) << 18);
135 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137 break;
138 case 3:
139 uc = (puch[2] & 0x3f)
140 | ((RTUNICP)(puch[1] & 0x3f) << 6)
141 | ((RTUNICP)(uch & 0x0f) << 12);
142 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
146 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147 break;
148 case 2:
149 uc = (puch[1] & 0x3f)
150 | ((RTUNICP)(uch & 0x1f) << 6);
151 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153 break;
154 }
155
156 /* advance */
157 cch -= cb;
158 puch += cb;
159 }
160 else
161 {
162 /* one ASCII byte */
163 puch++;
164 cch--;
165 }
166 cCodePoints++;
167 }
168
169 /* done */
170 *pcuc = cCodePoints;
171 if (pcchActual)
172 *pcchActual = puch - (unsigned char const *)psz;
173 return VINF_SUCCESS;
174}
175
176
177/**
178 * Decodes and UTF-8 string into an array of unicode code point.
179 *
180 * Since we know the input is valid, we do *not* perform encoding or length checks.
181 *
182 * @returns iprt status code.
183 * @param psz The UTF-8 string to recode. This is a valid encoding.
184 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186 * @param paCps Where to store the code points array.
187 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188 */
189static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190{
191 int rc = VINF_SUCCESS;
192 const unsigned char *puch = (const unsigned char *)psz;
193 PRTUNICP pCp = paCps;
194 while (cch > 0)
195 {
196 /* read the next char and check for terminator. */
197 const unsigned char uch = *puch;
198 if (uch)
199 { /* we only break once, so consider this the likely branch. */ }
200 else
201 break;
202
203 /* check for output overflow */
204 if (RT_LIKELY(cCps >= 1))
205 { /* likely */ }
206 else
207 {
208 rc = VERR_BUFFER_OVERFLOW;
209 break;
210 }
211 cCps--;
212
213 /* decode and recode the code point */
214 if (!(uch & RT_BIT(7)))
215 {
216 *pCp++ = uch;
217 puch++;
218 cch--;
219 }
220#ifdef RT_STRICT
221 else if (!(uch & RT_BIT(6)))
222 AssertMsgFailed(("Internal error!\n"));
223#endif
224 else if (!(uch & RT_BIT(5)))
225 {
226 *pCp++ = (puch[1] & 0x3f)
227 | ((uint16_t)(uch & 0x1f) << 6);
228 puch += 2;
229 cch -= 2;
230 }
231 else if (!(uch & RT_BIT(4)))
232 {
233 *pCp++ = (puch[2] & 0x3f)
234 | ((uint16_t)(puch[1] & 0x3f) << 6)
235 | ((uint16_t)(uch & 0x0f) << 12);
236 puch += 3;
237 cch -= 3;
238 }
239 else if (!(uch & RT_BIT(3)))
240 {
241 *pCp++ = (puch[3] & 0x3f)
242 | ((RTUNICP)(puch[2] & 0x3f) << 6)
243 | ((RTUNICP)(puch[1] & 0x3f) << 12)
244 | ((RTUNICP)(uch & 0x07) << 18);
245 puch += 4;
246 cch -= 4;
247 }
248 else if (!(uch & RT_BIT(2)))
249 {
250 *pCp++ = (puch[4] & 0x3f)
251 | ((RTUNICP)(puch[3] & 0x3f) << 6)
252 | ((RTUNICP)(puch[2] & 0x3f) << 12)
253 | ((RTUNICP)(puch[1] & 0x3f) << 18)
254 | ((RTUNICP)(uch & 0x03) << 24);
255 puch += 5;
256 cch -= 6;
257 }
258 else
259 {
260 Assert(!(uch & RT_BIT(1)));
261 *pCp++ = (puch[5] & 0x3f)
262 | ((RTUNICP)(puch[4] & 0x3f) << 6)
263 | ((RTUNICP)(puch[3] & 0x3f) << 12)
264 | ((RTUNICP)(puch[2] & 0x3f) << 18)
265 | ((RTUNICP)(puch[1] & 0x3f) << 24)
266 | ((RTUNICP)(uch & 0x01) << 30);
267 puch += 6;
268 cch -= 6;
269 }
270 }
271
272 /* done */
273 *pCp = 0;
274 return rc;
275}
276
277
278RTDECL(size_t) RTStrUniLen(const char *psz)
279{
280 size_t cCodePoints;
281 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
282 return RT_SUCCESS(rc) ? cCodePoints : 0;
283}
284RT_EXPORT_SYMBOL(RTStrUniLen);
285
286
287RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
288{
289 size_t cCodePoints;
290 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
291 if (pcCps)
292 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
293 return rc;
294}
295RT_EXPORT_SYMBOL(RTStrUniLenEx);
296
297
298RTDECL(int) RTStrValidateEncoding(const char *psz)
299{
300 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
301}
302RT_EXPORT_SYMBOL(RTStrValidateEncoding);
303
304
305RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
306{
307 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
308 VERR_INVALID_PARAMETER);
309 AssertPtr(psz);
310
311 /*
312 * Use rtUtf8Length for the job.
313 */
314 size_t cchActual;
315 size_t cCpsIgnored;
316 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
317 if (RT_SUCCESS(rc))
318 {
319 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
320 {
321 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
322 cchActual++;
323 if (cchActual == cch)
324 rc = VINF_SUCCESS;
325 else if (cchActual < cch)
326 rc = VERR_BUFFER_UNDERFLOW;
327 else
328 rc = VERR_BUFFER_OVERFLOW;
329 }
330 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
331 && cchActual >= cch)
332 rc = VERR_BUFFER_OVERFLOW;
333 }
334 return rc;
335}
336RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
337
338
339RTDECL(bool) RTStrIsValidEncoding(const char *psz)
340{
341 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
342 return RT_SUCCESS(rc);
343}
344RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
345
346
347RTDECL(size_t) RTStrPurgeEncoding(char *psz)
348{
349 size_t cErrors = 0;
350 for (;;)
351 {
352 RTUNICP Cp;
353 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
354 if (RT_SUCCESS(rc))
355 {
356 if (!Cp)
357 break;
358 }
359 else
360 {
361 psz[-1] = '?';
362 cErrors++;
363 }
364 }
365 return cErrors;
366}
367RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
368
369
370/**
371 * Helper for RTStrPurgeComplementSet.
372 *
373 * @returns true if @a Cp is valid, false if not.
374 * @param Cp The code point to validate.
375 * @param puszValidPairs Pair of valid code point sets.
376 * @param cValidPairs Number of pairs.
377 */
378DECLINLINE(bool) rtStrPurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
379{
380 while (cValidPairs-- > 0)
381 {
382 if ( Cp >= puszValidPairs[0]
383 && Cp <= puszValidPairs[1])
384 return true;
385 puszValidPairs += 2;
386 }
387 return false;
388}
389
390
391RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidPairs, char chReplacement)
392{
393 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
394
395 /*
396 * Calc valid pairs and check that we've got an even number.
397 */
398 uint32_t cValidPairs = 0;
399 while (puszValidPairs[cValidPairs * 2])
400 {
401 AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
402 AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
403 ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
404 cValidPairs++;
405 }
406
407 /*
408 * Do the replacing.
409 */
410 ssize_t cReplacements = 0;
411 for (;;)
412 {
413 char *pszCur = psz;
414 RTUNICP Cp;
415 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
416 if (RT_SUCCESS(rc))
417 {
418 if (Cp)
419 {
420 if (!rtStrPurgeIsInSet(Cp, puszValidPairs, cValidPairs))
421 {
422 for (; pszCur != psz; ++pszCur)
423 *pszCur = chReplacement;
424 ++cReplacements;
425 }
426 }
427 else
428 break;
429 }
430 else
431 return -1;
432 }
433 return cReplacements;
434}
435RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
436
437
438RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
439{
440 /*
441 * Validate input.
442 */
443 Assert(VALID_PTR(pszString));
444 Assert(VALID_PTR(ppaCps));
445 *ppaCps = NULL;
446
447 /*
448 * Validate the UTF-8 input and count its code points.
449 */
450 size_t cCps;
451 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
452 if (RT_SUCCESS(rc))
453 {
454 /*
455 * Allocate buffer.
456 */
457 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
458 if (paCps)
459 {
460 /*
461 * Decode the string.
462 */
463 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
464 if (RT_SUCCESS(rc))
465 {
466 *ppaCps = paCps;
467 return rc;
468 }
469 RTMemFree(paCps);
470 }
471 else
472 rc = VERR_NO_CODE_POINT_MEMORY;
473 }
474 return rc;
475}
476RT_EXPORT_SYMBOL(RTStrToUni);
477
478
479RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
480{
481 /*
482 * Validate input.
483 */
484 Assert(VALID_PTR(pszString));
485 Assert(VALID_PTR(ppaCps));
486 Assert(!pcCps || VALID_PTR(pcCps));
487
488 /*
489 * Validate the UTF-8 input and count the code points.
490 */
491 size_t cCpsResult;
492 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
493 if (RT_SUCCESS(rc))
494 {
495 if (pcCps)
496 *pcCps = cCpsResult;
497
498 /*
499 * Check buffer size / Allocate buffer.
500 */
501 bool fShouldFree;
502 PRTUNICP paCpsResult;
503 if (cCps > 0 && *ppaCps)
504 {
505 fShouldFree = false;
506 if (cCps <= cCpsResult)
507 return VERR_BUFFER_OVERFLOW;
508 paCpsResult = *ppaCps;
509 }
510 else
511 {
512 *ppaCps = NULL;
513 fShouldFree = true;
514 cCps = RT_MAX(cCpsResult + 1, cCps);
515 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
516 }
517 if (paCpsResult)
518 {
519 /*
520 * Encode the UTF-16 string.
521 */
522 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
523 if (RT_SUCCESS(rc))
524 {
525 *ppaCps = paCpsResult;
526 return rc;
527 }
528 if (fShouldFree)
529 RTMemFree(paCpsResult);
530 }
531 else
532 rc = VERR_NO_CODE_POINT_MEMORY;
533 }
534 return rc;
535}
536RT_EXPORT_SYMBOL(RTStrToUniEx);
537
538
539/**
540 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
541 *
542 * @returns IPRT status code.
543 * @param psz Pointer to the UTF-8 string.
544 * @param cch The max length of the string. (btw cch = cb)
545 * @param pcwc Where to store the length of the UTF-16 string as a number
546 * of RTUTF16 characters.
547 * @sa rtUtf8CalcUtf16Length
548 */
549static int rtUtf8CalcUtf16LengthN(const char *psz, size_t cch, size_t *pcwc)
550{
551 const unsigned char *puch = (const unsigned char *)psz;
552 size_t cwc = 0;
553 while (cch > 0)
554 {
555 const unsigned char uch = *puch;
556 if (!(uch & RT_BIT(7)))
557 {
558 /* one ASCII byte */
559 if (uch)
560 {
561 cwc++;
562 puch++;
563 cch--;
564 }
565 else
566 break;
567 }
568 else
569 {
570 /*
571 * Multibyte sequence is more complicated when we have length
572 * restrictions on the input.
573 */
574 /* figure sequence length and validate the first byte */
575 unsigned cb;
576 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
577 cb = 2;
578 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
579 cb = 3;
580 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
581 cb = 4;
582 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
583 cb = 5;
584 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
585 cb = 6;
586 else
587 {
588 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
589 return VERR_INVALID_UTF8_ENCODING;
590 }
591
592 /* check length */
593 if (cb > cch)
594 {
595 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
596 return VERR_INVALID_UTF8_ENCODING;
597 }
598
599 /* validate the rest */
600 switch (cb)
601 {
602 case 6:
603 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
604 case 5:
605 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
606 case 4:
607 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
608 case 3:
609 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
610 case 2:
611 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
612 break;
613 }
614
615 /* validate the code point. */
616 RTUNICP uc;
617 switch (cb)
618 {
619 case 6:
620 uc = (puch[5] & 0x3f)
621 | ((RTUNICP)(puch[4] & 0x3f) << 6)
622 | ((RTUNICP)(puch[3] & 0x3f) << 12)
623 | ((RTUNICP)(puch[2] & 0x3f) << 18)
624 | ((RTUNICP)(puch[1] & 0x3f) << 24)
625 | ((RTUNICP)(uch & 0x01) << 30);
626 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
627 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
628 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
629 return VERR_CANT_RECODE_AS_UTF16;
630 case 5:
631 uc = (puch[4] & 0x3f)
632 | ((RTUNICP)(puch[3] & 0x3f) << 6)
633 | ((RTUNICP)(puch[2] & 0x3f) << 12)
634 | ((RTUNICP)(puch[1] & 0x3f) << 18)
635 | ((RTUNICP)(uch & 0x03) << 24);
636 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
637 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
638 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
639 return VERR_CANT_RECODE_AS_UTF16;
640 case 4:
641 uc = (puch[3] & 0x3f)
642 | ((RTUNICP)(puch[2] & 0x3f) << 6)
643 | ((RTUNICP)(puch[1] & 0x3f) << 12)
644 | ((RTUNICP)(uch & 0x07) << 18);
645 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
646 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
647 RTStrAssertMsgReturn(uc <= 0x0010ffff,
648 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
649 cwc++;
650 break;
651 case 3:
652 uc = (puch[2] & 0x3f)
653 | ((RTUNICP)(puch[1] & 0x3f) << 6)
654 | ((RTUNICP)(uch & 0x0f) << 12);
655 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
656 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
657 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
658 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
659 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
660 break;
661 case 2:
662 uc = (puch[1] & 0x3f)
663 | ((RTUNICP)(uch & 0x1f) << 6);
664 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
665 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
666 break;
667 }
668
669 /* advance */
670 cch -= cb;
671 puch += cb;
672 cwc++;
673 }
674 }
675
676 /* done */
677 *pcwc = cwc;
678 return VINF_SUCCESS;
679}
680
681
682/**
683 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
684 *
685 * @returns IPRT status code.
686 * @param psz Pointer to the UTF-8 string.
687 * @param pcwc Where to store the length of the UTF-16 string as a number
688 * of RTUTF16 characters.
689 * @sa rtUtf8CalcUtf16LengthN
690 */
691static int rtUtf8CalcUtf16Length(const char *psz, size_t *pcwc)
692{
693 const unsigned char *puch = (const unsigned char *)psz;
694 size_t cwc = 0;
695 for (;;)
696 {
697 const unsigned char uch = *puch;
698 if (!(uch & RT_BIT(7)))
699 {
700 /* one ASCII byte */
701 if (uch)
702 {
703 cwc++;
704 puch++;
705 }
706 else
707 break;
708 }
709 else
710 {
711 /*
712 * Figure sequence length, implicitly validate the first byte.
713 * Then validate the additional bytes.
714 * Finally validate the code point.
715 */
716 unsigned cb;
717 RTUNICP uc;
718 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
719 {
720 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
721 uc = (puch[1] & 0x3f)
722 | ((RTUNICP)(uch & 0x1f) << 6);
723 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
724 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
725 cb = 2;
726 }
727 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
728 {
729 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
730 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
731 uc = (puch[2] & 0x3f)
732 | ((RTUNICP)(puch[1] & 0x3f) << 6)
733 | ((RTUNICP)(uch & 0x0f) << 12);
734 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
735 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
736 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
737 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
738 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
739 cb = 3;
740 }
741 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
742 {
743 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
744 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
745 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
746 uc = (puch[3] & 0x3f)
747 | ((RTUNICP)(puch[2] & 0x3f) << 6)
748 | ((RTUNICP)(puch[1] & 0x3f) << 12)
749 | ((RTUNICP)(uch & 0x07) << 18);
750 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
751 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
752 RTStrAssertMsgReturn(uc <= 0x0010ffff,
753 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
754 cwc++;
755 cb = 4;
756 }
757 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
758 {
759 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
760 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
761 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
762 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
763 uc = (puch[4] & 0x3f)
764 | ((RTUNICP)(puch[3] & 0x3f) << 6)
765 | ((RTUNICP)(puch[2] & 0x3f) << 12)
766 | ((RTUNICP)(puch[1] & 0x3f) << 18)
767 | ((RTUNICP)(uch & 0x03) << 24);
768 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
769 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
770 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
771 return VERR_CANT_RECODE_AS_UTF16;
772 //cb = 5;
773 }
774 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
775 {
776 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
777 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
778 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
779 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
780 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
781 uc = (puch[5] & 0x3f)
782 | ((RTUNICP)(puch[4] & 0x3f) << 6)
783 | ((RTUNICP)(puch[3] & 0x3f) << 12)
784 | ((RTUNICP)(puch[2] & 0x3f) << 18)
785 | ((RTUNICP)(puch[1] & 0x3f) << 24)
786 | ((RTUNICP)(uch & 0x01) << 30);
787 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
788 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
789 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
790 return VERR_CANT_RECODE_AS_UTF16;
791 //cb = 6;
792 }
793 else
794 {
795 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
796 return VERR_INVALID_UTF8_ENCODING;
797 }
798
799 /* advance */
800 puch += cb;
801 cwc++;
802 }
803 }
804
805 /* done */
806 *pcwc = cwc;
807 return VINF_SUCCESS;
808}
809
810
811
812/**
813 * Recodes a valid UTF-8 string as UTF-16.
814 *
815 * Since we know the input is valid, we do *not* perform encoding or length checks.
816 *
817 * @returns iprt status code.
818 * @param psz The UTF-8 string to recode. This is a valid encoding.
819 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
820 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
821 * @param pwsz Where to store the UTF-16 string.
822 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
823 */
824static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
825{
826 int rc = VINF_SUCCESS;
827 const unsigned char *puch = (const unsigned char *)psz;
828 PRTUTF16 pwc = pwsz;
829 while (cch > 0)
830 {
831 /* read the next char and check for terminator. */
832 const unsigned char uch = *puch;
833 if (uch)
834 { /* we only break once, so consider this the likely branch. */ }
835 else
836 break;
837
838 /* check for output overflow */
839 if (RT_LIKELY(cwc >= 1))
840 { /* likely */ }
841 else
842 {
843 rc = VERR_BUFFER_OVERFLOW;
844 break;
845 }
846 cwc--;
847
848 /* decode and recode the code point */
849 if (!(uch & RT_BIT(7)))
850 {
851 *pwc++ = uch;
852 puch++;
853 cch--;
854 }
855 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
856 {
857 uint16_t uc = (puch[1] & 0x3f)
858 | ((uint16_t)(uch & 0x1f) << 6);
859 *pwc++ = uc;
860 puch += 2;
861 cch -= 2;
862 }
863 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
864 {
865 uint16_t uc = (puch[2] & 0x3f)
866 | ((uint16_t)(puch[1] & 0x3f) << 6)
867 | ((uint16_t)(uch & 0x0f) << 12);
868 *pwc++ = uc;
869 puch += 3;
870 cch -= 3;
871 }
872 else
873 {
874 /* generate surrogate pair */
875 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
876 RTUNICP uc = (puch[3] & 0x3f)
877 | ((RTUNICP)(puch[2] & 0x3f) << 6)
878 | ((RTUNICP)(puch[1] & 0x3f) << 12)
879 | ((RTUNICP)(uch & 0x07) << 18);
880 if (RT_UNLIKELY(cwc < 1))
881 {
882 rc = VERR_BUFFER_OVERFLOW;
883 break;
884 }
885 cwc--;
886
887 uc -= 0x10000;
888 *pwc++ = 0xd800 | (uc >> 10);
889 *pwc++ = 0xdc00 | (uc & 0x3ff);
890 puch += 4;
891 cch -= 4;
892 }
893 }
894
895 /* done */
896 *pwc = '\0';
897 return rc;
898}
899
900
901RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
902{
903 /*
904 * Validate input.
905 */
906 Assert(VALID_PTR(ppwszString));
907 Assert(VALID_PTR(pszString));
908 *ppwszString = NULL;
909
910 /*
911 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
912 */
913 size_t cwc;
914 int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
915 if (RT_SUCCESS(rc))
916 {
917 /*
918 * Allocate buffer.
919 */
920 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
921 if (pwsz)
922 {
923 /*
924 * Encode the UTF-16 string.
925 */
926 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
927 if (RT_SUCCESS(rc))
928 {
929 *ppwszString = pwsz;
930 return rc;
931 }
932 RTMemFree(pwsz);
933 }
934 else
935 rc = VERR_NO_UTF16_MEMORY;
936 }
937 return rc;
938}
939RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
940
941
942RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
943 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
944{
945 /*
946 * Validate input.
947 */
948 Assert(VALID_PTR(pszString));
949 Assert(VALID_PTR(ppwsz));
950 Assert(!pcwc || VALID_PTR(pcwc));
951
952 /*
953 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
954 */
955 size_t cwcResult;
956 int rc;
957 if (cchString != RTSTR_MAX)
958 rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
959 else
960 rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
961 if (RT_SUCCESS(rc))
962 {
963 if (pcwc)
964 *pcwc = cwcResult;
965
966 /*
967 * Check buffer size / Allocate buffer.
968 */
969 bool fShouldFree;
970 PRTUTF16 pwszResult;
971 if (cwc > 0 && *ppwsz)
972 {
973 fShouldFree = false;
974 if (cwc <= cwcResult)
975 return VERR_BUFFER_OVERFLOW;
976 pwszResult = *ppwsz;
977 }
978 else
979 {
980 *ppwsz = NULL;
981 fShouldFree = true;
982 cwc = RT_MAX(cwcResult + 1, cwc);
983 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
984 }
985 if (pwszResult)
986 {
987 /*
988 * Encode the UTF-16 string.
989 */
990 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
991 if (RT_SUCCESS(rc))
992 {
993 *ppwsz = pwszResult;
994 return rc;
995 }
996 if (fShouldFree)
997 RTMemFree(pwszResult);
998 }
999 else
1000 rc = VERR_NO_UTF16_MEMORY;
1001 }
1002 return rc;
1003}
1004RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
1005
1006
1007RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
1008{
1009 size_t cwc;
1010 int rc = rtUtf8CalcUtf16Length(psz, &cwc);
1011 return RT_SUCCESS(rc) ? cwc : 0;
1012}
1013RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
1014
1015
1016RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1017{
1018 size_t cwc;
1019 int rc;
1020 if (cch != RTSTR_MAX)
1021 rc = rtUtf8CalcUtf16LengthN(psz, cch, &cwc);
1022 else
1023 rc = rtUtf8CalcUtf16Length(psz, &cwc);
1024 if (pcwc)
1025 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1026 return rc;
1027}
1028RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
1029
1030
1031/**
1032 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
1033 *
1034 * @returns iprt status code.
1035 * @param psz The Latin-1 string.
1036 * @param cchIn The max length of the Latin-1 string to consider.
1037 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
1038 */
1039static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
1040{
1041 size_t cch = 0;
1042 for (;;)
1043 {
1044 RTUNICP Cp;
1045 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
1046 if (Cp == 0 || rc == VERR_END_OF_STRING)
1047 break;
1048 if (RT_FAILURE(rc))
1049 return rc;
1050 cch += RTStrCpSize(Cp); /* cannot fail */
1051 }
1052
1053 /* done */
1054 *pcch = cch;
1055 return VINF_SUCCESS;
1056}
1057
1058
1059/**
1060 * Recodes a Latin-1 string as UTF-8.
1061 *
1062 * @returns iprt status code.
1063 * @param pszIn The Latin-1 string.
1064 * @param cchIn The number of characters to process from psz. The recoding
1065 * will stop when cch or '\\0' is reached.
1066 * @param psz Where to store the UTF-8 string.
1067 * @param cch The size of the UTF-8 buffer, excluding the terminator.
1068 */
1069static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1070{
1071 int rc;
1072 for (;;)
1073 {
1074 RTUNICP Cp;
1075 size_t cchCp;
1076 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
1077 if (Cp == 0 || RT_FAILURE(rc))
1078 break;
1079 cchCp = RTStrCpSize(Cp);
1080 if (RT_UNLIKELY(cch < cchCp))
1081 {
1082 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1083 rc = VERR_BUFFER_OVERFLOW;
1084 break;
1085 }
1086 cch -= cchCp;
1087 psz = RTStrPutCp(psz, Cp);
1088 }
1089
1090 /* done */
1091 if (rc == VERR_END_OF_STRING)
1092 rc = VINF_SUCCESS;
1093 *psz = '\0';
1094 return rc;
1095}
1096
1097
1098
1099RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
1100{
1101 /*
1102 * Validate input.
1103 */
1104 Assert(VALID_PTR(ppszString));
1105 Assert(VALID_PTR(pszString));
1106 *ppszString = NULL;
1107
1108 /*
1109 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
1110 */
1111 size_t cch;
1112 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
1113 if (RT_SUCCESS(rc))
1114 {
1115 /*
1116 * Allocate buffer and recode it.
1117 */
1118 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
1119 if (pszResult)
1120 {
1121 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
1122 if (RT_SUCCESS(rc))
1123 {
1124 *ppszString = pszResult;
1125 return rc;
1126 }
1127
1128 RTMemFree(pszResult);
1129 }
1130 else
1131 rc = VERR_NO_STR_MEMORY;
1132 }
1133 return rc;
1134}
1135RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
1136
1137
1138RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1139{
1140 /*
1141 * Validate input.
1142 */
1143 Assert(VALID_PTR(pszString));
1144 Assert(VALID_PTR(ppsz));
1145 Assert(!pcch || VALID_PTR(pcch));
1146
1147 /*
1148 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
1149 */
1150 size_t cchResult;
1151 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
1152 if (RT_SUCCESS(rc))
1153 {
1154 if (pcch)
1155 *pcch = cchResult;
1156
1157 /*
1158 * Check buffer size / Allocate buffer and recode it.
1159 */
1160 bool fShouldFree;
1161 char *pszResult;
1162 if (cch > 0 && *ppsz)
1163 {
1164 fShouldFree = false;
1165 if (RT_UNLIKELY(cch <= cchResult))
1166 return VERR_BUFFER_OVERFLOW;
1167 pszResult = *ppsz;
1168 }
1169 else
1170 {
1171 *ppsz = NULL;
1172 fShouldFree = true;
1173 cch = RT_MAX(cch, cchResult + 1);
1174 pszResult = (char *)RTStrAllocTag(cch, pszTag);
1175 }
1176 if (pszResult)
1177 {
1178 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
1179 if (RT_SUCCESS(rc))
1180 {
1181 *ppsz = pszResult;
1182 return rc;
1183 }
1184
1185 if (fShouldFree)
1186 RTStrFree(pszResult);
1187 }
1188 else
1189 rc = VERR_NO_STR_MEMORY;
1190 }
1191 return rc;
1192}
1193RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1194
1195
1196RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1197{
1198 size_t cch;
1199 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1200 return RT_SUCCESS(rc) ? cch : 0;
1201}
1202RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1203
1204
1205RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
1206{
1207 size_t cch;
1208 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1209 if (pcch)
1210 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1211 return rc;
1212}
1213RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1214
1215
1216/**
1217 * Calculates the Latin-1 length of a string, validating the encoding while
1218 * doing so.
1219 *
1220 * @returns IPRT status code.
1221 * @param psz Pointer to the UTF-8 string.
1222 * @param cchIn The max length of the string. (btw cch = cb)
1223 * Use RTSTR_MAX if all of the string is to be examined.
1224 * @param pcch Where to store the length of the Latin-1 string in bytes.
1225 */
1226static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
1227{
1228 size_t cch = 0;
1229 for (;;)
1230 {
1231 RTUNICP Cp;
1232 size_t cchCp;
1233 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1234 if (Cp == 0 || rc == VERR_END_OF_STRING)
1235 break;
1236 if (RT_FAILURE(rc))
1237 return rc;
1238 cchCp = RTLatin1CpSize(Cp);
1239 if (cchCp == 0)
1240 return VERR_NO_TRANSLATION;
1241 cch += cchCp;
1242 }
1243
1244 /* done */
1245 *pcch = cch;
1246 return VINF_SUCCESS;
1247}
1248
1249
1250/**
1251 * Recodes a valid UTF-8 string as Latin-1.
1252 *
1253 * Since we know the input is valid, we do *not* perform encoding or length checks.
1254 *
1255 * @returns iprt status code.
1256 * @param pszIn The UTF-8 string to recode. This is a valid encoding.
1257 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1258 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1259 * @param psz Where to store the Latin-1 string.
1260 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1261 */
1262static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1263{
1264 int rc;
1265 for (;;)
1266 {
1267 RTUNICP Cp;
1268 size_t cchCp;
1269 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1270 if (Cp == 0 || RT_FAILURE(rc))
1271 break;
1272 cchCp = RTLatin1CpSize(Cp);
1273 if (RT_UNLIKELY(cch < cchCp))
1274 {
1275 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1276 rc = VERR_BUFFER_OVERFLOW;
1277 break;
1278 }
1279 cch -= cchCp;
1280 psz = RTLatin1PutCp(psz, Cp);
1281 }
1282
1283 /* done */
1284 if (rc == VERR_END_OF_STRING)
1285 rc = VINF_SUCCESS;
1286 *psz = '\0';
1287 return rc;
1288}
1289
1290
1291
1292RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1293{
1294 /*
1295 * Validate input.
1296 */
1297 Assert(VALID_PTR(ppszString));
1298 Assert(VALID_PTR(pszString));
1299 *ppszString = NULL;
1300
1301 /*
1302 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1303 */
1304 size_t cch;
1305 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1306 if (RT_SUCCESS(rc))
1307 {
1308 /*
1309 * Allocate buffer.
1310 */
1311 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1312 if (psz)
1313 {
1314 /*
1315 * Encode the UTF-16 string.
1316 */
1317 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1318 if (RT_SUCCESS(rc))
1319 {
1320 *ppszString = psz;
1321 return rc;
1322 }
1323 RTMemFree(psz);
1324 }
1325 else
1326 rc = VERR_NO_STR_MEMORY;
1327 }
1328 return rc;
1329}
1330RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1331
1332
1333RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1334 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1335{
1336 /*
1337 * Validate input.
1338 */
1339 Assert(VALID_PTR(pszString));
1340 Assert(VALID_PTR(ppsz));
1341 Assert(!pcch || VALID_PTR(pcch));
1342
1343 /*
1344 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1345 */
1346 size_t cchResult;
1347 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1348 if (RT_SUCCESS(rc))
1349 {
1350 if (pcch)
1351 *pcch = cchResult;
1352
1353 /*
1354 * Check buffer size / Allocate buffer.
1355 */
1356 bool fShouldFree;
1357 char *pszResult;
1358 if (cch > 0 && *ppsz)
1359 {
1360 fShouldFree = false;
1361 if (cch <= cchResult)
1362 return VERR_BUFFER_OVERFLOW;
1363 pszResult = *ppsz;
1364 }
1365 else
1366 {
1367 *ppsz = NULL;
1368 fShouldFree = true;
1369 cch = RT_MAX(cchResult + 1, cch);
1370 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1371 }
1372 if (pszResult)
1373 {
1374 /*
1375 * Encode the Latin-1 string.
1376 */
1377 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1378 if (RT_SUCCESS(rc))
1379 {
1380 *ppsz = pszResult;
1381 return rc;
1382 }
1383 if (fShouldFree)
1384 RTMemFree(pszResult);
1385 }
1386 else
1387 rc = VERR_NO_STR_MEMORY;
1388 }
1389 return rc;
1390}
1391RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1392
1393
1394RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1395{
1396 size_t cch;
1397 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1398 return RT_SUCCESS(rc) ? cch : 0;
1399}
1400RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1401
1402
1403RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1404{
1405 size_t cch;
1406 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1407 if (pcch)
1408 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1409 return rc;
1410}
1411RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1412
1413
1414/**
1415 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1416 * @returns rc
1417 * @param ppsz The pointer to the string position point.
1418 * @param pCp Where to store RTUNICP_INVALID.
1419 * @param rc The iprt error code.
1420 */
1421static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1422{
1423 /*
1424 * Try find a valid encoding.
1425 */
1426 (*ppsz)++; /** @todo code this! */
1427 *pCp = RTUNICP_INVALID;
1428 return rc;
1429}
1430
1431
1432RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1433{
1434 RTUNICP Cp;
1435 RTStrGetCpExInternal(&psz, &Cp);
1436 return Cp;
1437}
1438RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1439
1440
1441RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1442{
1443 const unsigned char *puch = (const unsigned char *)*ppsz;
1444 const unsigned char uch = *puch;
1445 RTUNICP uc;
1446
1447 /* ASCII ? */
1448 if (!(uch & RT_BIT(7)))
1449 {
1450 uc = uch;
1451 puch++;
1452 }
1453 else if (uch & RT_BIT(6))
1454 {
1455 /* figure the length and validate the first octet. */
1456/** @todo RT_USE_RTC_3629 */
1457 unsigned cb;
1458 if (!(uch & RT_BIT(5)))
1459 cb = 2;
1460 else if (!(uch & RT_BIT(4)))
1461 cb = 3;
1462 else if (!(uch & RT_BIT(3)))
1463 cb = 4;
1464 else if (!(uch & RT_BIT(2)))
1465 cb = 5;
1466 else if (!(uch & RT_BIT(1)))
1467 cb = 6;
1468 else
1469 {
1470 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1471 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1472 }
1473
1474 /* validate the rest */
1475 switch (cb)
1476 {
1477 case 6:
1478 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1479 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1480 case 5:
1481 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1482 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1483 case 4:
1484 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1485 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1486 case 3:
1487 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1488 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1489 case 2:
1490 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1491 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1492 break;
1493 }
1494
1495 /* get and validate the code point. */
1496 switch (cb)
1497 {
1498 case 6:
1499 uc = (puch[5] & 0x3f)
1500 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1501 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1502 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1503 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1504 | ((RTUNICP)(uch & 0x01) << 30);
1505 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1506 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1507 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1508 break;
1509 case 5:
1510 uc = (puch[4] & 0x3f)
1511 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1512 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1513 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1514 | ((RTUNICP)(uch & 0x03) << 24);
1515 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1516 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1517 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1518 break;
1519 case 4:
1520 uc = (puch[3] & 0x3f)
1521 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1522 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1523 | ((RTUNICP)(uch & 0x07) << 18);
1524 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1525 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1526 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1527 break;
1528 case 3:
1529 uc = (puch[2] & 0x3f)
1530 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1531 | ((RTUNICP)(uch & 0x0f) << 12);
1532 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1533 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1534 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1535 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1536 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1537 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1538 break;
1539 case 2:
1540 uc = (puch[1] & 0x3f)
1541 | ((RTUNICP)(uch & 0x1f) << 6);
1542 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1543 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1544 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1545 break;
1546 default: /* impossible, but GCC is bitching. */
1547 uc = RTUNICP_INVALID;
1548 break;
1549 }
1550 puch += cb;
1551 }
1552 else
1553 {
1554 /* 6th bit is always set. */
1555 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1556 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1557 }
1558 *pCp = uc;
1559 *ppsz = (const char *)puch;
1560 return VINF_SUCCESS;
1561}
1562RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1563
1564
1565/**
1566 * Handle invalid encodings passed to RTStrGetCpNEx().
1567 * @returns rc
1568 * @param ppsz The pointer to the string position point.
1569 * @param pcch Pointer to the string length.
1570 * @param pCp Where to store RTUNICP_INVALID.
1571 * @param rc The iprt error code.
1572 */
1573static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1574{
1575 /*
1576 * Try find a valid encoding.
1577 */
1578 (*ppsz)++; /** @todo code this! */
1579 (*pcch)--;
1580 *pCp = RTUNICP_INVALID;
1581 return rc;
1582}
1583
1584
1585RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1586{
1587 const unsigned char *puch = (const unsigned char *)*ppsz;
1588 const unsigned char uch = *puch;
1589 size_t cch = *pcch;
1590 RTUNICP uc;
1591
1592 if (cch == 0)
1593 {
1594 *pCp = RTUNICP_INVALID;
1595 return VERR_END_OF_STRING;
1596 }
1597
1598 /* ASCII ? */
1599 if (!(uch & RT_BIT(7)))
1600 {
1601 uc = uch;
1602 puch++;
1603 cch--;
1604 }
1605 else if (uch & RT_BIT(6))
1606 {
1607 /* figure the length and validate the first octet. */
1608/** @todo RT_USE_RTC_3629 */
1609 unsigned cb;
1610 if (!(uch & RT_BIT(5)))
1611 cb = 2;
1612 else if (!(uch & RT_BIT(4)))
1613 cb = 3;
1614 else if (!(uch & RT_BIT(3)))
1615 cb = 4;
1616 else if (!(uch & RT_BIT(2)))
1617 cb = 5;
1618 else if (!(uch & RT_BIT(1)))
1619 cb = 6;
1620 else
1621 {
1622 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1623 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1624 }
1625
1626 if (cb > cch)
1627 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1628
1629 /* validate the rest */
1630 switch (cb)
1631 {
1632 case 6:
1633 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1634 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1635 case 5:
1636 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1637 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1638 case 4:
1639 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1640 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1641 case 3:
1642 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1643 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1644 case 2:
1645 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1646 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1647 break;
1648 }
1649
1650 /* get and validate the code point. */
1651 switch (cb)
1652 {
1653 case 6:
1654 uc = (puch[5] & 0x3f)
1655 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1656 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1657 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1658 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1659 | ((RTUNICP)(uch & 0x01) << 30);
1660 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1661 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1662 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1663 break;
1664 case 5:
1665 uc = (puch[4] & 0x3f)
1666 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1667 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1668 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1669 | ((RTUNICP)(uch & 0x03) << 24);
1670 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1671 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1672 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1673 break;
1674 case 4:
1675 uc = (puch[3] & 0x3f)
1676 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1677 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1678 | ((RTUNICP)(uch & 0x07) << 18);
1679 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1680 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1681 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1682 break;
1683 case 3:
1684 uc = (puch[2] & 0x3f)
1685 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1686 | ((RTUNICP)(uch & 0x0f) << 12);
1687 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1688 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1689 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1690 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1691 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1692 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1693 break;
1694 case 2:
1695 uc = (puch[1] & 0x3f)
1696 | ((RTUNICP)(uch & 0x1f) << 6);
1697 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1698 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1699 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1700 break;
1701 default: /* impossible, but GCC is bitching. */
1702 uc = RTUNICP_INVALID;
1703 break;
1704 }
1705 puch += cb;
1706 cch -= cb;
1707 }
1708 else
1709 {
1710 /* 6th bit is always set. */
1711 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1712 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1713 }
1714 *pCp = uc;
1715 *ppsz = (const char *)puch;
1716 (*pcch) = cch;
1717 return VINF_SUCCESS;
1718}
1719RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1720
1721
1722RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1723{
1724 unsigned char *puch = (unsigned char *)psz;
1725 if (uc < 0x80)
1726 *puch++ = (unsigned char )uc;
1727 else if (uc < 0x00000800)
1728 {
1729 *puch++ = 0xc0 | (uc >> 6);
1730 *puch++ = 0x80 | (uc & 0x3f);
1731 }
1732 else if (uc < 0x00010000)
1733 {
1734/** @todo RT_USE_RTC_3629 */
1735 if ( uc < 0x0000d8000
1736 || ( uc > 0x0000dfff
1737 && uc < 0x0000fffe))
1738 {
1739 *puch++ = 0xe0 | (uc >> 12);
1740 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1741 *puch++ = 0x80 | (uc & 0x3f);
1742 }
1743 else
1744 {
1745 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1746 *puch++ = 0x7f;
1747 }
1748 }
1749/** @todo RT_USE_RTC_3629 */
1750 else if (uc < 0x00200000)
1751 {
1752 *puch++ = 0xf0 | (uc >> 18);
1753 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1754 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1755 *puch++ = 0x80 | (uc & 0x3f);
1756 }
1757 else if (uc < 0x04000000)
1758 {
1759 *puch++ = 0xf8 | (uc >> 24);
1760 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1761 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1762 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1763 *puch++ = 0x80 | (uc & 0x3f);
1764 }
1765 else if (uc <= 0x7fffffff)
1766 {
1767 *puch++ = 0xfc | (uc >> 30);
1768 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1769 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1770 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1771 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1772 *puch++ = 0x80 | (uc & 0x3f);
1773 }
1774 else
1775 {
1776 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1777 *puch++ = 0x7f;
1778 }
1779
1780 return (char *)puch;
1781}
1782RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1783
1784
1785RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1786{
1787 if (pszStart < psz)
1788 {
1789 /* simple char? */
1790 const unsigned char *puch = (const unsigned char *)psz;
1791 unsigned uch = *--puch;
1792 if (!(uch & RT_BIT(7)))
1793 return (char *)puch;
1794 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1795
1796 /* two or more. */
1797 uint32_t uMask = 0xffffffc0;
1798 while ( (const unsigned char *)pszStart < puch
1799 && !(uMask & 1))
1800 {
1801 uch = *--puch;
1802 if ((uch & 0xc0) != 0x80)
1803 {
1804 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1805 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1806 (char *)pszStart);
1807 return (char *)puch;
1808 }
1809 uMask >>= 1;
1810 }
1811 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1812 }
1813 return (char *)pszStart;
1814}
1815RT_EXPORT_SYMBOL(RTStrPrevCp);
1816
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette