VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 30080

Last change on this file since 30080 was 28903, checked in by vboxsync, 15 years ago

IPRT: iconv cache.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 43.2 KB
Line 
1/* $Id: utf-8.cpp 28903 2010-04-29 14:58:12Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2009 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Get get length in code points of a UTF-8 encoded string.
44 * The string is validated while doing this.
45 *
46 * @returns IPRT status code.
47 * @param psz Pointer to the UTF-8 string.
48 * @param cch The max length of the string. (btw cch = cb)
49 * Use RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcchActual Where to store the actual size of the UTF-8 string
52 * on success (cch = cb again). Optional.
53 */
54int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66 unsigned cb;
67 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
68 cb = 2;
69 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
70 cb = 3;
71 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
72 cb = 4;
73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
74 cb = 5;
75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
76 cb = 6;
77 else
78 {
79 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
80 return VERR_INVALID_UTF8_ENCODING;
81 }
82
83 /* check length */
84 if (cb > cch)
85 {
86 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
87 return VERR_INVALID_UTF8_ENCODING;
88 }
89
90 /* validate the rest */
91 switch (cb)
92 {
93 case 6:
94 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
95 case 5:
96 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97 case 4:
98 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99 case 3:
100 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101 case 2:
102 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103 break;
104 }
105
106 /* validate the code point. */
107 RTUNICP uc;
108 switch (cb)
109 {
110 case 6:
111 uc = (puch[5] & 0x3f)
112 | ((RTUNICP)(puch[4] & 0x3f) << 6)
113 | ((RTUNICP)(puch[3] & 0x3f) << 12)
114 | ((RTUNICP)(puch[2] & 0x3f) << 18)
115 | ((RTUNICP)(puch[1] & 0x3f) << 24)
116 | ((RTUNICP)(uch & 0x01) << 30);
117 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
118 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
119 break;
120 case 5:
121 uc = (puch[4] & 0x3f)
122 | ((RTUNICP)(puch[3] & 0x3f) << 6)
123 | ((RTUNICP)(puch[2] & 0x3f) << 12)
124 | ((RTUNICP)(puch[1] & 0x3f) << 18)
125 | ((RTUNICP)(uch & 0x03) << 24);
126 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
127 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
128 break;
129 case 4:
130 uc = (puch[3] & 0x3f)
131 | ((RTUNICP)(puch[2] & 0x3f) << 6)
132 | ((RTUNICP)(puch[1] & 0x3f) << 12)
133 | ((RTUNICP)(uch & 0x07) << 18);
134 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
135 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
136 break;
137 case 3:
138 uc = (puch[2] & 0x3f)
139 | ((RTUNICP)(puch[1] & 0x3f) << 6)
140 | ((RTUNICP)(uch & 0x0f) << 12);
141 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
142 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
143 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
144 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
145 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
146 break;
147 case 2:
148 uc = (puch[1] & 0x3f)
149 | ((RTUNICP)(uch & 0x1f) << 6);
150 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
151 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
152 break;
153 }
154
155 /* advance */
156 cch -= cb;
157 puch += cb;
158 }
159 else
160 {
161 /* one ASCII byte */
162 puch++;
163 cch--;
164 }
165 cCodePoints++;
166 }
167
168 /* done */
169 *pcuc = cCodePoints;
170 if (pcchActual)
171 *pcchActual = puch - (unsigned char const *)psz;
172 return VINF_SUCCESS;
173}
174
175
176/**
177 * Decodes and UTF-8 string into an array of unicode code point.
178 *
179 * Since we know the input is valid, we do *not* perform encoding or length checks.
180 *
181 * @returns iprt status code.
182 * @param psz The UTF-8 string to recode. This is a valid encoding.
183 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
184 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
185 * @param paCps Where to store the code points array.
186 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
187 */
188static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
189{
190 int rc = VINF_SUCCESS;
191 const unsigned char *puch = (const unsigned char *)psz;
192 PRTUNICP pCp = paCps;
193 while (cch > 0)
194 {
195 /* read the next char and check for terminator. */
196 const unsigned char uch = *puch;
197 if (!uch)
198 break;
199
200 /* check for output overflow */
201 if (RT_UNLIKELY(cCps < 1))
202 {
203 rc = VERR_BUFFER_OVERFLOW;
204 break;
205 }
206 cCps--;
207
208 /* decode and recode the code point */
209 if (!(uch & RT_BIT(7)))
210 {
211 *pCp++ = uch;
212 puch++;
213 cch--;
214 }
215#ifdef RT_STRICT
216 else if (!(uch & RT_BIT(6)))
217 AssertMsgFailed(("Internal error!\n"));
218#endif
219 else if (!(uch & RT_BIT(5)))
220 {
221 *pCp++ = (puch[1] & 0x3f)
222 | ((uint16_t)(uch & 0x1f) << 6);
223 puch += 2;
224 cch -= 2;
225 }
226 else if (!(uch & RT_BIT(4)))
227 {
228 *pCp++ = (puch[2] & 0x3f)
229 | ((uint16_t)(puch[1] & 0x3f) << 6)
230 | ((uint16_t)(uch & 0x0f) << 12);
231 puch += 3;
232 cch -= 3;
233 }
234 else if (!(uch & RT_BIT(3)))
235 {
236 *pCp++ = (puch[3] & 0x3f)
237 | ((RTUNICP)(puch[2] & 0x3f) << 6)
238 | ((RTUNICP)(puch[1] & 0x3f) << 12)
239 | ((RTUNICP)(uch & 0x07) << 18);
240 puch += 4;
241 cch -= 4;
242 }
243 else if (!(uch & RT_BIT(2)))
244 {
245 *pCp++ = (puch[4] & 0x3f)
246 | ((RTUNICP)(puch[3] & 0x3f) << 6)
247 | ((RTUNICP)(puch[2] & 0x3f) << 12)
248 | ((RTUNICP)(puch[1] & 0x3f) << 18)
249 | ((RTUNICP)(uch & 0x03) << 24);
250 puch += 5;
251 cch -= 6;
252 }
253 else
254 {
255 Assert(!(uch & RT_BIT(1)));
256 *pCp++ = (puch[5] & 0x3f)
257 | ((RTUNICP)(puch[4] & 0x3f) << 6)
258 | ((RTUNICP)(puch[3] & 0x3f) << 12)
259 | ((RTUNICP)(puch[2] & 0x3f) << 18)
260 | ((RTUNICP)(puch[1] & 0x3f) << 24)
261 | ((RTUNICP)(uch & 0x01) << 30);
262 puch += 6;
263 cch -= 6;
264 }
265 }
266
267 /* done */
268 *pCp = 0;
269 return rc;
270}
271
272
273RTDECL(size_t) RTStrUniLen(const char *psz)
274{
275 size_t cCodePoints;
276 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
277 return RT_SUCCESS(rc) ? cCodePoints : 0;
278}
279RT_EXPORT_SYMBOL(RTStrUniLen);
280
281
282RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
283{
284 size_t cCodePoints;
285 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
286 if (pcCps)
287 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
288 return rc;
289}
290RT_EXPORT_SYMBOL(RTStrUniLenEx);
291
292
293RTDECL(int) RTStrValidateEncoding(const char *psz)
294{
295 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
296}
297RT_EXPORT_SYMBOL(RTStrValidateEncoding);
298
299
300RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
301{
302 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
303 AssertPtr(psz);
304
305 /*
306 * Use rtUtf8Length for the job.
307 */
308 size_t cchActual;
309 size_t cCpsIgnored;
310 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
311 if (RT_SUCCESS(rc))
312 {
313 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
314 && cchActual >= cch)
315 rc = VERR_BUFFER_OVERFLOW;
316 }
317 return rc;
318}
319RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
320
321
322RTDECL(bool) RTStrIsValidEncoding(const char *psz)
323{
324 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
325 return RT_SUCCESS(rc);
326}
327RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
328
329
330RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
331{
332 /*
333 * Validate input.
334 */
335 Assert(VALID_PTR(pszString));
336 Assert(VALID_PTR(ppaCps));
337 *ppaCps = NULL;
338
339 /*
340 * Validate the UTF-8 input and count its code points.
341 */
342 size_t cCps;
343 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
344 if (RT_SUCCESS(rc))
345 {
346 /*
347 * Allocate buffer.
348 */
349 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
350 if (paCps)
351 {
352 /*
353 * Decode the string.
354 */
355 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
356 if (RT_SUCCESS(rc))
357 {
358 *ppaCps = paCps;
359 return rc;
360 }
361 RTMemFree(paCps);
362 }
363 else
364 rc = VERR_NO_CODE_POINT_MEMORY;
365 }
366 return rc;
367}
368RT_EXPORT_SYMBOL(RTStrToUni);
369
370
371RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
372{
373 /*
374 * Validate input.
375 */
376 Assert(VALID_PTR(pszString));
377 Assert(VALID_PTR(ppaCps));
378 Assert(!pcCps || VALID_PTR(pcCps));
379
380 /*
381 * Validate the UTF-8 input and count the code points.
382 */
383 size_t cCpsResult;
384 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
385 if (RT_SUCCESS(rc))
386 {
387 if (pcCps)
388 *pcCps = cCpsResult;
389
390 /*
391 * Check buffer size / Allocate buffer.
392 */
393 bool fShouldFree;
394 PRTUNICP paCpsResult;
395 if (cCps > 0 && *ppaCps)
396 {
397 fShouldFree = false;
398 if (cCps <= cCpsResult)
399 return VERR_BUFFER_OVERFLOW;
400 paCpsResult = *ppaCps;
401 }
402 else
403 {
404 *ppaCps = NULL;
405 fShouldFree = true;
406 cCps = RT_MAX(cCpsResult + 1, cCps);
407 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
408 }
409 if (paCpsResult)
410 {
411 /*
412 * Encode the UTF-16 string.
413 */
414 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
415 if (RT_SUCCESS(rc))
416 {
417 *ppaCps = paCpsResult;
418 return rc;
419 }
420 if (fShouldFree)
421 RTMemFree(paCpsResult);
422 }
423 else
424 rc = VERR_NO_CODE_POINT_MEMORY;
425 }
426 return rc;
427}
428RT_EXPORT_SYMBOL(RTStrToUniEx);
429
430
431/**
432 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
433 *
434 * @returns IPRT status code.
435 * @param psz Pointer to the UTF-8 string.
436 * @param cch The max length of the string. (btw cch = cb)
437 * Use RTSTR_MAX if all of the string is to be examined.
438 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
439 */
440static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
441{
442 const unsigned char *puch = (const unsigned char *)psz;
443 size_t cwc = 0;
444 while (cch > 0)
445 {
446 const unsigned char uch = *puch;
447 if (!uch)
448 break;
449 if (!(uch & RT_BIT(7)))
450 {
451 /* one ASCII byte */
452 cwc++;
453 puch++;
454 cch--;
455 }
456 else
457 {
458 /* figure sequence length and validate the first byte */
459 unsigned cb;
460 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
461 cb = 2;
462 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
463 cb = 3;
464 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
465 cb = 4;
466 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
467 cb = 5;
468 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
469 cb = 6;
470 else
471 {
472 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
473 return VERR_INVALID_UTF8_ENCODING;
474 }
475
476 /* check length */
477 if (cb > cch)
478 {
479 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
480 return VERR_INVALID_UTF8_ENCODING;
481 }
482
483 /* validate the rest */
484 switch (cb)
485 {
486 case 6:
487 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
488 case 5:
489 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
490 case 4:
491 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
492 case 3:
493 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
494 case 2:
495 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
496 break;
497 }
498
499 /* validate the code point. */
500 RTUNICP uc;
501 switch (cb)
502 {
503 case 6:
504 uc = (puch[5] & 0x3f)
505 | ((RTUNICP)(puch[4] & 0x3f) << 6)
506 | ((RTUNICP)(puch[3] & 0x3f) << 12)
507 | ((RTUNICP)(puch[2] & 0x3f) << 18)
508 | ((RTUNICP)(puch[1] & 0x3f) << 24)
509 | ((RTUNICP)(uch & 0x01) << 30);
510 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
511 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
512 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
513 return VERR_CANT_RECODE_AS_UTF16;
514 case 5:
515 uc = (puch[4] & 0x3f)
516 | ((RTUNICP)(puch[3] & 0x3f) << 6)
517 | ((RTUNICP)(puch[2] & 0x3f) << 12)
518 | ((RTUNICP)(puch[1] & 0x3f) << 18)
519 | ((RTUNICP)(uch & 0x03) << 24);
520 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
521 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
522 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
523 return VERR_CANT_RECODE_AS_UTF16;
524 case 4:
525 uc = (puch[3] & 0x3f)
526 | ((RTUNICP)(puch[2] & 0x3f) << 6)
527 | ((RTUNICP)(puch[1] & 0x3f) << 12)
528 | ((RTUNICP)(uch & 0x07) << 18);
529 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
530 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
531 RTStrAssertMsgReturn(uc <= 0x0010ffff,
532 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
533 cwc++;
534 break;
535 case 3:
536 uc = (puch[2] & 0x3f)
537 | ((RTUNICP)(puch[1] & 0x3f) << 6)
538 | ((RTUNICP)(uch & 0x0f) << 12);
539 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
540 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
541 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
542 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
543 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
544 break;
545 case 2:
546 uc = (puch[1] & 0x3f)
547 | ((RTUNICP)(uch & 0x1f) << 6);
548 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
549 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
550 break;
551 }
552
553 /* advance */
554 cch -= cb;
555 puch += cb;
556 cwc++;
557 }
558 }
559
560 /* done */
561 *pcwc = cwc;
562 return VINF_SUCCESS;
563}
564
565
566/**
567 * Recodes a valid UTF-8 string as UTF-16.
568 *
569 * Since we know the input is valid, we do *not* perform encoding or length checks.
570 *
571 * @returns iprt status code.
572 * @param psz The UTF-8 string to recode. This is a valid encoding.
573 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
574 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
575 * @param pwsz Where to store the UTF-16 string.
576 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
577 */
578static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
579{
580 int rc = VINF_SUCCESS;
581 const unsigned char *puch = (const unsigned char *)psz;
582 PRTUTF16 pwc = pwsz;
583 while (cch > 0)
584 {
585 /* read the next char and check for terminator. */
586 const unsigned char uch = *puch;
587 if (!uch)
588 break;
589
590 /* check for output overflow */
591 if (RT_UNLIKELY(cwc < 1))
592 {
593 rc = VERR_BUFFER_OVERFLOW;
594 break;
595 }
596 cwc--;
597
598 /* decode and recode the code point */
599 if (!(uch & RT_BIT(7)))
600 {
601 *pwc++ = uch;
602 puch++;
603 cch--;
604 }
605 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
606 {
607 uint16_t uc = (puch[1] & 0x3f)
608 | ((uint16_t)(uch & 0x1f) << 6);
609 *pwc++ = uc;
610 puch += 2;
611 cch -= 2;
612 }
613 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
614 {
615 uint16_t uc = (puch[2] & 0x3f)
616 | ((uint16_t)(puch[1] & 0x3f) << 6)
617 | ((uint16_t)(uch & 0x0f) << 12);
618 *pwc++ = uc;
619 puch += 3;
620 cch -= 3;
621 }
622 else
623 {
624 /* generate surrugate pair */
625 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
626 RTUNICP uc = (puch[3] & 0x3f)
627 | ((RTUNICP)(puch[2] & 0x3f) << 6)
628 | ((RTUNICP)(puch[1] & 0x3f) << 12)
629 | ((RTUNICP)(uch & 0x07) << 18);
630 if (RT_UNLIKELY(cwc < 1))
631 {
632 rc = VERR_BUFFER_OVERFLOW;
633 break;
634 }
635 cwc--;
636
637 uc -= 0x10000;
638 *pwc++ = 0xd800 | (uc >> 10);
639 *pwc++ = 0xdc00 | (uc & 0x3ff);
640 puch += 4;
641 cch -= 4;
642 }
643 }
644
645 /* done */
646 *pwc = '\0';
647 return rc;
648}
649
650
651RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
652{
653 /*
654 * Validate input.
655 */
656 Assert(VALID_PTR(ppwszString));
657 Assert(VALID_PTR(pszString));
658 *ppwszString = NULL;
659
660 /*
661 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
662 */
663 size_t cwc;
664 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
665 if (RT_SUCCESS(rc))
666 {
667 /*
668 * Allocate buffer.
669 */
670 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
671 if (pwsz)
672 {
673 /*
674 * Encode the UTF-16 string.
675 */
676 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
677 if (RT_SUCCESS(rc))
678 {
679 *ppwszString = pwsz;
680 return rc;
681 }
682 RTMemFree(pwsz);
683 }
684 else
685 rc = VERR_NO_UTF16_MEMORY;
686 }
687 return rc;
688}
689RT_EXPORT_SYMBOL(RTStrToUtf16);
690
691
692RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
693{
694 /*
695 * Validate input.
696 */
697 Assert(VALID_PTR(pszString));
698 Assert(VALID_PTR(ppwsz));
699 Assert(!pcwc || VALID_PTR(pcwc));
700
701 /*
702 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
703 */
704 size_t cwcResult;
705 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
706 if (RT_SUCCESS(rc))
707 {
708 if (pcwc)
709 *pcwc = cwcResult;
710
711 /*
712 * Check buffer size / Allocate buffer.
713 */
714 bool fShouldFree;
715 PRTUTF16 pwszResult;
716 if (cwc > 0 && *ppwsz)
717 {
718 fShouldFree = false;
719 if (cwc <= cwcResult)
720 return VERR_BUFFER_OVERFLOW;
721 pwszResult = *ppwsz;
722 }
723 else
724 {
725 *ppwsz = NULL;
726 fShouldFree = true;
727 cwc = RT_MAX(cwcResult + 1, cwc);
728 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
729 }
730 if (pwszResult)
731 {
732 /*
733 * Encode the UTF-16 string.
734 */
735 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
736 if (RT_SUCCESS(rc))
737 {
738 *ppwsz = pwszResult;
739 return rc;
740 }
741 if (fShouldFree)
742 RTMemFree(pwszResult);
743 }
744 else
745 rc = VERR_NO_UTF16_MEMORY;
746 }
747 return rc;
748}
749RT_EXPORT_SYMBOL(RTStrToUtf16Ex);
750
751
752RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
753{
754 size_t cwc;
755 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
756 return RT_SUCCESS(rc) ? cwc : 0;
757}
758RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
759
760
761RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
762{
763 size_t cwc;
764 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
765 if (pcwc)
766 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
767 return rc;
768}
769RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
770
771
772/**
773 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
774 * @returns rc
775 * @param ppsz The pointer to the string position point.
776 * @param pCp Where to store RTUNICP_INVALID.
777 * @param rc The iprt error code.
778 */
779static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
780{
781 /*
782 * Try find a valid encoding.
783 */
784 (*ppsz)++; /** @todo code this! */
785 *pCp = RTUNICP_INVALID;
786 return rc;
787}
788
789
790RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
791{
792 RTUNICP Cp;
793 RTStrGetCpExInternal(&psz, &Cp);
794 return Cp;
795}
796RT_EXPORT_SYMBOL(RTStrGetCpInternal);
797
798
799RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
800{
801 const unsigned char *puch = (const unsigned char *)*ppsz;
802 const unsigned char uch = *puch;
803 RTUNICP uc;
804
805 /* ASCII ? */
806 if (!(uch & RT_BIT(7)))
807 {
808 uc = uch;
809 puch++;
810 }
811 else if (uch & RT_BIT(6))
812 {
813 /* figure the length and validate the first octet. */
814 unsigned cb;
815 if (!(uch & RT_BIT(5)))
816 cb = 2;
817 else if (!(uch & RT_BIT(4)))
818 cb = 3;
819 else if (!(uch & RT_BIT(3)))
820 cb = 4;
821 else if (!(uch & RT_BIT(2)))
822 cb = 5;
823 else if (!(uch & RT_BIT(1)))
824 cb = 6;
825 else
826 {
827 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
828 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
829 }
830
831 /* validate the rest */
832 switch (cb)
833 {
834 case 6:
835 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
836 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
837 case 5:
838 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
839 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
840 case 4:
841 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
842 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
843 case 3:
844 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
845 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
846 case 2:
847 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
848 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
849 break;
850 }
851
852 /* get and validate the code point. */
853 switch (cb)
854 {
855 case 6:
856 uc = (puch[5] & 0x3f)
857 | ((RTUNICP)(puch[4] & 0x3f) << 6)
858 | ((RTUNICP)(puch[3] & 0x3f) << 12)
859 | ((RTUNICP)(puch[2] & 0x3f) << 18)
860 | ((RTUNICP)(puch[1] & 0x3f) << 24)
861 | ((RTUNICP)(uch & 0x01) << 30);
862 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
863 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
864 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
865 break;
866 case 5:
867 uc = (puch[4] & 0x3f)
868 | ((RTUNICP)(puch[3] & 0x3f) << 6)
869 | ((RTUNICP)(puch[2] & 0x3f) << 12)
870 | ((RTUNICP)(puch[1] & 0x3f) << 18)
871 | ((RTUNICP)(uch & 0x03) << 24);
872 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
873 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
874 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
875 break;
876 case 4:
877 uc = (puch[3] & 0x3f)
878 | ((RTUNICP)(puch[2] & 0x3f) << 6)
879 | ((RTUNICP)(puch[1] & 0x3f) << 12)
880 | ((RTUNICP)(uch & 0x07) << 18);
881 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
882 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
883 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
884 break;
885 case 3:
886 uc = (puch[2] & 0x3f)
887 | ((RTUNICP)(puch[1] & 0x3f) << 6)
888 | ((RTUNICP)(uch & 0x0f) << 12);
889 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
890 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
891 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
892 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
893 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
894 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
895 break;
896 case 2:
897 uc = (puch[1] & 0x3f)
898 | ((RTUNICP)(uch & 0x1f) << 6);
899 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
900 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
901 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
902 break;
903 default: /* impossible, but GCC is bitching. */
904 uc = RTUNICP_INVALID;
905 break;
906 }
907 puch += cb;
908 }
909 else
910 {
911 /* 6th bit is always set. */
912 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
913 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
914 }
915 *pCp = uc;
916 *ppsz = (const char *)puch;
917 return VINF_SUCCESS;
918}
919RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
920
921
922/**
923 * Handle invalid encodings passed to RTStrGetCpNEx().
924 * @returns rc
925 * @param ppsz The pointer to the string position point.
926 * @param pcch Pointer to the string length.
927 * @param pCp Where to store RTUNICP_INVALID.
928 * @param rc The iprt error code.
929 */
930static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
931{
932 /*
933 * Try find a valid encoding.
934 */
935 (*ppsz)++; /** @todo code this! */
936 (*pcch)--;
937 *pCp = RTUNICP_INVALID;
938 return rc;
939}
940
941
942RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
943{
944 const unsigned char *puch = (const unsigned char *)*ppsz;
945 const unsigned char uch = *puch;
946 size_t cch = *pcch;
947 RTUNICP uc;
948
949 if (cch == 0)
950 {
951 *pCp = RTUNICP_INVALID;
952 return VERR_END_OF_STRING;
953 }
954
955 /* ASCII ? */
956 if (!(uch & RT_BIT(7)))
957 {
958 uc = uch;
959 puch++;
960 cch--;
961 }
962 else if (uch & RT_BIT(6))
963 {
964 /* figure the length and validate the first octet. */
965 unsigned cb;
966 if (!(uch & RT_BIT(5)))
967 cb = 2;
968 else if (!(uch & RT_BIT(4)))
969 cb = 3;
970 else if (!(uch & RT_BIT(3)))
971 cb = 4;
972 else if (!(uch & RT_BIT(2)))
973 cb = 5;
974 else if (!(uch & RT_BIT(1)))
975 cb = 6;
976 else
977 {
978 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
979 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
980 }
981
982 if (cb > cch)
983 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
984
985 /* validate the rest */
986 switch (cb)
987 {
988 case 6:
989 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
990 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
991 case 5:
992 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
993 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
994 case 4:
995 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
996 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
997 case 3:
998 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
999 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1000 case 2:
1001 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1002 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1003 break;
1004 }
1005
1006 /* get and validate the code point. */
1007 switch (cb)
1008 {
1009 case 6:
1010 uc = (puch[5] & 0x3f)
1011 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1012 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1013 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1014 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1015 | ((RTUNICP)(uch & 0x01) << 30);
1016 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1017 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1018 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1019 break;
1020 case 5:
1021 uc = (puch[4] & 0x3f)
1022 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1023 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1024 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1025 | ((RTUNICP)(uch & 0x03) << 24);
1026 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1027 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1028 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1029 break;
1030 case 4:
1031 uc = (puch[3] & 0x3f)
1032 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1033 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1034 | ((RTUNICP)(uch & 0x07) << 18);
1035 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1036 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1037 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1038 break;
1039 case 3:
1040 uc = (puch[2] & 0x3f)
1041 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1042 | ((RTUNICP)(uch & 0x0f) << 12);
1043 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1044 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1045 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1046 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1047 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1048 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1049 break;
1050 case 2:
1051 uc = (puch[1] & 0x3f)
1052 | ((RTUNICP)(uch & 0x1f) << 6);
1053 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1054 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1055 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1056 break;
1057 default: /* impossible, but GCC is bitching. */
1058 uc = RTUNICP_INVALID;
1059 break;
1060 }
1061 puch += cb;
1062 cch -= cb;
1063 }
1064 else
1065 {
1066 /* 6th bit is always set. */
1067 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1068 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1069 }
1070 *pCp = uc;
1071 *ppsz = (const char *)puch;
1072 (*pcch) = cch;
1073 return VINF_SUCCESS;
1074}
1075RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1076
1077
1078RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1079{
1080 unsigned char *puch = (unsigned char *)psz;
1081 if (uc < 0x80)
1082 *puch++ = (unsigned char )uc;
1083 else if (uc < 0x00000800)
1084 {
1085 *puch++ = 0xc0 | (uc >> 6);
1086 *puch++ = 0x80 | (uc & 0x3f);
1087 }
1088 else if (uc < 0x00010000)
1089 {
1090 if ( uc < 0x0000d8000
1091 || ( uc > 0x0000dfff
1092 && uc < 0x0000fffe))
1093 {
1094 *puch++ = 0xe0 | (uc >> 12);
1095 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1096 *puch++ = 0x80 | (uc & 0x3f);
1097 }
1098 else
1099 {
1100 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1101 *puch++ = 0x7f;
1102 }
1103 }
1104 else if (uc < 0x00200000)
1105 {
1106 *puch++ = 0xf0 | (uc >> 18);
1107 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1108 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1109 *puch++ = 0x80 | (uc & 0x3f);
1110 }
1111 else if (uc < 0x04000000)
1112 {
1113 *puch++ = 0xf8 | (uc >> 24);
1114 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1115 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1116 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1117 *puch++ = 0x80 | (uc & 0x3f);
1118 }
1119 else if (uc <= 0x7fffffff)
1120 {
1121 *puch++ = 0xfc | (uc >> 30);
1122 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1123 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1124 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1125 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1126 *puch++ = 0x80 | (uc & 0x3f);
1127 }
1128 else
1129 {
1130 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1131 *puch++ = 0x7f;
1132 }
1133
1134 return (char *)puch;
1135}
1136RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1137
1138
1139RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1140{
1141 if (pszStart < psz)
1142 {
1143 /* simple char? */
1144 const unsigned char *puch = (const unsigned char *)psz;
1145 unsigned uch = *--puch;
1146 if (!(uch & RT_BIT(7)))
1147 return (char *)puch;
1148 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1149
1150 /* two or more. */
1151 uint32_t uMask = 0xffffffc0;
1152 while ( (const unsigned char *)pszStart < puch
1153 && !(uMask & 1))
1154 {
1155 uch = *--puch;
1156 if ((uch & 0xc0) != 0x80)
1157 {
1158 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1159 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1160 (char *)pszStart);
1161 return (char *)puch;
1162 }
1163 uMask >>= 1;
1164 }
1165 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1166 }
1167 return (char *)pszStart;
1168}
1169RT_EXPORT_SYMBOL(RTStrPrevCp);
1170
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette