VirtualBox

source: vbox/trunk/src/VBox/Runtime/utf-8.cpp@ 3672

Last change on this file since 3672 was 3672, checked in by vboxsync, 17 years ago

RT_OS_* and RT_ARCH_* for Runtime/ and Support/

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 35.4 KB
Line 
1/* $Id: utf-8.cpp 3672 2007-07-17 12:39:30Z vboxsync $ */
2/** @file
3 * innotek Portable Runtime - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2007 innotek GmbH
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License as published by the Free Software Foundation,
13 * in version 2 as it comes in the "COPYING" file of the VirtualBox OSE
14 * distribution. VirtualBox OSE is distributed in the hope that it will
15 * be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * If you received this file as part of a commercial VirtualBox
18 * distribution, then only the terms of your commercial VirtualBox
19 * license agreement apply instead of the previous paragraph.
20 */
21
22
23/*******************************************************************************
24* Header Files *
25*******************************************************************************/
26#include <iprt/string.h>
27#include <iprt/uni.h>
28#include <iprt/alloc.h>
29#include <iprt/assert.h>
30#include <iprt/err.h>
31#include "internal/string.h"
32
33
34
35/**
36 * Get get length in code points of a UTF-8 endcoded string.
37 * The string is validated while doing this.
38 *
39 * @returns IPRT status code.
40 * @param psz Pointer to the UTF-8 string.
41 * @param cch The max length of the string. (btw cch = cb)
42 * Use RTSTR_MAX if all of the string is to be examined.s
43 * @param pcuc Where to store the length in unicode code points.
44 */
45static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc)
46{
47 const unsigned char *puch = (const unsigned char *)psz;
48 size_t cCodePoints = 0;
49 while (cch > 0)
50 {
51 const unsigned char uch = *puch;
52 if (!uch)
53 break;
54 if (uch & BIT(7))
55 {
56 /* figure sequence length and validate the first byte */
57 unsigned cb;
58 if ((uch & (BIT(7) | BIT(6) | BIT(5))) == (BIT(7) | BIT(6)))
59 cb = 2;
60 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4))) == (BIT(7) | BIT(6) | BIT(5)))
61 cb = 3;
62 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4)))
63 cb = 4;
64 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3)))
65 cb = 5;
66 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2) | BIT(1))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2)))
67 cb = 6;
68 else
69 {
70 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
71 return VERR_INVALID_UTF8_ENCODING;
72 }
73
74 /* check length */
75 if (cb > cch)
76 {
77 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
78 return VERR_INVALID_UTF8_ENCODING;
79 }
80
81 /* validate the rest */
82 switch (cb)
83 {
84 case 6:
85 RTStrAssertMsgReturn((puch[5] & (BIT(7) | BIT(6))) == BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
86 case 5:
87 RTStrAssertMsgReturn((puch[4] & (BIT(7) | BIT(6))) == BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
88 case 4:
89 RTStrAssertMsgReturn((puch[3] & (BIT(7) | BIT(6))) == BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
90 case 3:
91 RTStrAssertMsgReturn((puch[2] & (BIT(7) | BIT(6))) == BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
92 case 2:
93 RTStrAssertMsgReturn((puch[1] & (BIT(7) | BIT(6))) == BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
94 break;
95 }
96
97 /* validate the code point. */
98 RTUNICP uc;
99 switch (cb)
100 {
101 case 6:
102 uc = (puch[5] & 0x3f)
103 | ((RTUNICP)(puch[4] & 0x3f) << 6)
104 | ((RTUNICP)(puch[3] & 0x3f) << 12)
105 | ((RTUNICP)(puch[2] & 0x3f) << 18)
106 | ((RTUNICP)(puch[1] & 0x3f) << 24)
107 | ((RTUNICP)(uch & 0x01) << 30);
108 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
109 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
110 break;
111 case 5:
112 uc = (puch[4] & 0x3f)
113 | ((RTUNICP)(puch[3] & 0x3f) << 6)
114 | ((RTUNICP)(puch[2] & 0x3f) << 12)
115 | ((RTUNICP)(puch[1] & 0x3f) << 18)
116 | ((RTUNICP)(uch & 0x03) << 24);
117 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
118 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
119 break;
120 case 4:
121 uc = (puch[3] & 0x3f)
122 | ((RTUNICP)(puch[2] & 0x3f) << 6)
123 | ((RTUNICP)(puch[1] & 0x3f) << 12)
124 | ((RTUNICP)(uch & 0x07) << 18);
125 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
126 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
127 break;
128 case 3:
129 uc = (puch[2] & 0x3f)
130 | ((RTUNICP)(puch[1] & 0x3f) << 6)
131 | ((RTUNICP)(uch & 0x0f) << 12);
132 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
133 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
134 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
135 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
136 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
137 break;
138 case 2:
139 uc = (puch[1] & 0x3f)
140 | ((RTUNICP)(uch & 0x1f) << 6);
141 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
142 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
143 break;
144 }
145
146 /* advance */
147 cch -= cb;
148 puch += cb;
149 }
150 else
151 {
152 /* one ASCII byte */
153 puch++;
154 cch--;
155 }
156 cCodePoints++;
157 }
158
159 /* done */
160 *pcuc = cCodePoints;
161 return VINF_SUCCESS;
162}
163
164
165/**
166 * Decodes and UTF-8 string into an array of unicode code point.
167 *
168 * Since we know the input is valid, we do *not* perform encoding or length checks.
169 *
170 * @returns iprt status code.
171 * @param psz The UTF-8 string to recode. This is a valid encoding.
172 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
173 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
174 * @param paCps Where to store the code points array.
175 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
176 * @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
177 */
178static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t *pcCps)
179{
180 int rc = VINF_SUCCESS;
181 const unsigned char *puch = (const unsigned char *)psz;
182 const PRTUNICP pCpEnd = paCps + cCps;
183 PRTUNICP pCp = paCps;
184 Assert(pCpEnd >= pCp);
185 while (cch > 0)
186 {
187 /* read the next char and check for terminator. */
188 const unsigned char uch = *puch;
189 if (!uch)
190 break;
191
192 /* check for output overflow */
193 if (pCp >= pCpEnd)
194 {
195 rc = VERR_BUFFER_OVERFLOW;
196 break;
197 }
198
199 /* decode and recode the code point */
200 if (!(uch & BIT(7)))
201 {
202 *pCp++ = uch;
203 puch++;
204 cch--;
205 }
206#ifdef RT_STRICT
207 else if (!(uch & BIT(6)))
208 AssertMsgFailed(("Internal error!\n"));
209#endif
210 else if (!(uch & BIT(5)))
211 {
212 *pCp++ = (puch[1] & 0x3f)
213 | ((uint16_t)(uch & 0x1f) << 6);
214 puch += 2;
215 cch -= 2;
216 }
217 else if (!(uch & BIT(4)))
218 {
219 *pCp++ = (puch[2] & 0x3f)
220 | ((uint16_t)(puch[1] & 0x3f) << 6)
221 | ((uint16_t)(uch & 0x0f) << 12);
222 puch += 3;
223 cch -= 3;
224 }
225 else if (!(uch & BIT(3)))
226 {
227 *pCp++ = (puch[3] & 0x3f)
228 | ((RTUNICP)(puch[2] & 0x3f) << 6)
229 | ((RTUNICP)(puch[1] & 0x3f) << 12)
230 | ((RTUNICP)(uch & 0x07) << 18);
231 puch += 4;
232 cch -= 4;
233 }
234 else if (!(uch & BIT(2)))
235 {
236 *pCp++ = (puch[4] & 0x3f)
237 | ((RTUNICP)(puch[3] & 0x3f) << 6)
238 | ((RTUNICP)(puch[2] & 0x3f) << 12)
239 | ((RTUNICP)(puch[1] & 0x3f) << 18)
240 | ((RTUNICP)(uch & 0x03) << 24);
241 puch += 5;
242 cch -= 6;
243 }
244 else
245 {
246 Assert(!(uch & BIT(1)));
247 *pCp++ = (puch[5] & 0x3f)
248 | ((RTUNICP)(puch[4] & 0x3f) << 6)
249 | ((RTUNICP)(puch[3] & 0x3f) << 12)
250 | ((RTUNICP)(puch[2] & 0x3f) << 18)
251 | ((RTUNICP)(puch[1] & 0x3f) << 24)
252 | ((RTUNICP)(uch & 0x01) << 30);
253 puch += 6;
254 cch -= 6;
255 }
256 }
257
258 /* done */
259 *pCp = 0;
260 *pcCps = pCp - paCps;
261 return rc;
262}
263
264
265RTDECL(size_t) RTStrUniLen(const char *psz)
266{
267 size_t cCodePoints;
268 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints);
269 return RT_SUCCESS(rc) ? cCodePoints : 0;
270}
271
272
273RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
274{
275 size_t cCodePoints;
276 int rc = rtUtf8Length(psz, cch, &cCodePoints);
277 if (pcCps)
278 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
279 return rc;
280}
281
282
283RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
284{
285 /*
286 * Validate input.
287 */
288 Assert(VALID_PTR(pszString));
289 Assert(VALID_PTR(ppaCps));
290 *ppaCps = NULL;
291
292 /*
293 * Validate the UTF-8 input and count its code points.
294 */
295 size_t cCps;
296 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps);
297 if (RT_SUCCESS(rc))
298 {
299 /*
300 * Allocate buffer.
301 */
302 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
303 if (paCps)
304 {
305 /*
306 * Decode the string.
307 */
308 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
309 if (RT_SUCCESS(rc))
310 {
311 *ppaCps = paCps;
312 return rc;
313 }
314 RTMemFree(paCps);
315 }
316 else
317 rc = VERR_NO_CODE_POINT_MEMORY;
318 }
319 return rc;
320}
321
322
323RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
324{
325 /*
326 * Validate input.
327 */
328 Assert(VALID_PTR(pszString));
329 Assert(VALID_PTR(ppaCps));
330 Assert(!pcCps || VALID_PTR(pcCps));
331
332 /*
333 * Validate the UTF-8 input and count the code points.
334 */
335 size_t cCpsResult;
336 int rc = rtUtf8Length(pszString, cchString, &cCpsResult);
337 if (RT_SUCCESS(rc))
338 {
339 if (pcCps)
340 *pcCps = cCpsResult;
341
342 /*
343 * Check buffer size / Allocate buffer.
344 */
345 bool fShouldFree;
346 PRTUNICP paCpsResult;
347 if (cCps > 0 && *ppaCps)
348 {
349 fShouldFree = false;
350 if (cCps <= cCpsResult)
351 return VERR_BUFFER_OVERFLOW;
352 paCpsResult = *ppaCps;
353 }
354 else
355 {
356 *ppaCps = NULL;
357 fShouldFree = true;
358 cCps = RT_MAX(cCpsResult + 1, cCps);
359 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
360 }
361 if (paCpsResult)
362 {
363 /*
364 * Encode the UTF-16 string.
365 */
366 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
367 if (RT_SUCCESS(rc))
368 {
369 *ppaCps = paCpsResult;
370 return rc;
371 }
372 if (fShouldFree)
373 RTMemFree(paCpsResult);
374 }
375 else
376 rc = VERR_NO_CODE_POINT_MEMORY;
377 }
378 return rc;
379}
380
381
382/**
383 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
384 *
385 * @returns IPRT status code.
386 * @param psz Pointer to the UTF-8 string.
387 * @param cch The max length of the string. (btw cch = cb)
388 * Use RTSTR_MAX if all of the string is to be examined.s
389 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
390 */
391static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
392{
393 const unsigned char *puch = (const unsigned char *)psz;
394 size_t cwc = 0;
395 while (cch > 0)
396 {
397 const unsigned char uch = *puch;
398 if (!uch)
399 break;
400 if (!(uch & BIT(7)))
401 {
402 /* one ASCII byte */
403 cwc++;
404 puch++;
405 cch--;
406 }
407 else
408 {
409 /* figure sequence length and validate the first byte */
410 unsigned cb;
411 if ((uch & (BIT(7) | BIT(6) | BIT(5))) == (BIT(7) | BIT(6)))
412 cb = 2;
413 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4))) == (BIT(7) | BIT(6) | BIT(5)))
414 cb = 3;
415 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4)))
416 cb = 4;
417 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3)))
418 cb = 5;
419 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2) | BIT(1))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2)))
420 cb = 6;
421 else
422 {
423 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
424 return VERR_INVALID_UTF8_ENCODING;
425 }
426
427 /* check length */
428 if (cb > cch)
429 {
430 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
431 return VERR_INVALID_UTF8_ENCODING;
432 }
433
434 /* validate the rest */
435 switch (cb)
436 {
437 case 6:
438 RTStrAssertMsgReturn((puch[5] & (BIT(7) | BIT(6))) == BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
439 case 5:
440 RTStrAssertMsgReturn((puch[4] & (BIT(7) | BIT(6))) == BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
441 case 4:
442 RTStrAssertMsgReturn((puch[3] & (BIT(7) | BIT(6))) == BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
443 case 3:
444 RTStrAssertMsgReturn((puch[2] & (BIT(7) | BIT(6))) == BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
445 case 2:
446 RTStrAssertMsgReturn((puch[1] & (BIT(7) | BIT(6))) == BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
447 break;
448 }
449
450 /* validate the code point. */
451 RTUNICP uc;
452 switch (cb)
453 {
454 case 6:
455 uc = (puch[5] & 0x3f)
456 | ((RTUNICP)(puch[4] & 0x3f) << 6)
457 | ((RTUNICP)(puch[3] & 0x3f) << 12)
458 | ((RTUNICP)(puch[2] & 0x3f) << 18)
459 | ((RTUNICP)(puch[1] & 0x3f) << 24)
460 | ((RTUNICP)(uch & 0x01) << 30);
461 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
462 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
463 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
464 return VERR_CANT_RECODE_AS_UTF16;
465 case 5:
466 uc = (puch[4] & 0x3f)
467 | ((RTUNICP)(puch[3] & 0x3f) << 6)
468 | ((RTUNICP)(puch[2] & 0x3f) << 12)
469 | ((RTUNICP)(puch[1] & 0x3f) << 18)
470 | ((RTUNICP)(uch & 0x03) << 24);
471 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
472 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
473 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
474 return VERR_CANT_RECODE_AS_UTF16;
475 case 4:
476 uc = (puch[3] & 0x3f)
477 | ((RTUNICP)(puch[2] & 0x3f) << 6)
478 | ((RTUNICP)(puch[1] & 0x3f) << 12)
479 | ((RTUNICP)(uch & 0x07) << 18);
480 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
481 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
482 RTStrAssertMsgReturn(uc <= 0x0010ffff,
483 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
484 cwc++;
485 break;
486 case 3:
487 uc = (puch[2] & 0x3f)
488 | ((RTUNICP)(puch[1] & 0x3f) << 6)
489 | ((RTUNICP)(uch & 0x0f) << 12);
490 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
491 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
492 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
493 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
494 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
495 break;
496 case 2:
497 uc = (puch[1] & 0x3f)
498 | ((RTUNICP)(uch & 0x1f) << 6);
499 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
500 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
501 break;
502 }
503
504 /* advance */
505 cch -= cb;
506 puch += cb;
507 cwc++;
508 }
509 }
510
511 /* done */
512 *pcwc = cwc;
513 return VINF_SUCCESS;
514}
515
516
517/**
518 * Recodes a valid UTF-8 string as UTF-16.
519 *
520 * Since we know the input is valid, we do *not* perform encoding or length checks.
521 *
522 * @returns iprt status code.
523 * @param psz The UTF-8 string to recode. This is a valid encoding.
524 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
525 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
526 * @param pwsz Where to store the UTF-16 string.
527 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
528 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
529 */
530static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc)
531{
532 int rc = VINF_SUCCESS;
533 const unsigned char *puch = (const unsigned char *)psz;
534 const PRTUTF16 pwszEnd = pwsz + cwc;
535 PRTUTF16 pwc = pwsz;
536 Assert(pwszEnd >= pwc);
537 while (cch > 0)
538 {
539 /* read the next char and check for terminator. */
540 const unsigned char uch = *puch;
541 if (!uch)
542 break;
543
544 /* check for output overflow */
545 if (pwc >= pwszEnd)
546 {
547 rc = VERR_BUFFER_OVERFLOW;
548 break;
549 }
550
551 /* decode and recode the code point */
552 if (!(uch & BIT(7)))
553 {
554 *pwc++ = uch;
555 puch++;
556 cch--;
557 }
558 else if ((uch & (BIT(7) | BIT(6) | BIT(5))) == (BIT(7) | BIT(6)))
559 {
560 uint16_t uc = (puch[1] & 0x3f)
561 | ((uint16_t)(uch & 0x1f) << 6);
562 *pwc++ = uc;
563 puch += 2;
564 cch -= 2;
565 }
566 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4))) == (BIT(7) | BIT(6) | BIT(5)))
567 {
568 uint16_t uc = (puch[2] & 0x3f)
569 | ((uint16_t)(puch[1] & 0x3f) << 6)
570 | ((uint16_t)(uch & 0x0f) << 12);
571 *pwc++ = uc;
572 puch += 3;
573 cch -= 3;
574 }
575 else
576 {
577 /* generate surrugate pair */
578 Assert((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4)));
579 RTUNICP uc = (puch[3] & 0x3f)
580 | ((RTUNICP)(puch[2] & 0x3f) << 6)
581 | ((RTUNICP)(puch[1] & 0x3f) << 12)
582 | ((RTUNICP)(uch & 0x07) << 18);
583 if (pwc + 1 >= pwszEnd)
584 {
585 rc = VERR_BUFFER_OVERFLOW;
586 break;
587 }
588 uc -= 0x10000;
589 *pwc++ = 0xd800 | (uc >> 10);
590 *pwc++ = 0xdc00 | (uc & 0x3ff);
591 puch += 4;
592 cch -= 4;
593 }
594 }
595
596 /* done */
597 *pwc = '\0';
598 *pcwc = pwc - pwsz;
599 return rc;
600}
601
602
603RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
604{
605 /*
606 * Validate input.
607 */
608 Assert(VALID_PTR(ppwszString));
609 Assert(VALID_PTR(pszString));
610 *ppwszString = NULL;
611
612 /*
613 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
614 */
615 size_t cwc;
616 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
617 if (RT_SUCCESS(rc))
618 {
619 /*
620 * Allocate buffer.
621 */
622 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
623 if (pwsz)
624 {
625 /*
626 * Encode the UTF-16 string.
627 */
628 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
629 if (RT_SUCCESS(rc))
630 {
631 *ppwszString = pwsz;
632 return rc;
633 }
634 RTMemFree(pwsz);
635 }
636 else
637 rc = VERR_NO_UTF16_MEMORY;
638 }
639 return rc;
640}
641
642
643RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
644{
645 /*
646 * Validate input.
647 */
648 Assert(VALID_PTR(pszString));
649 Assert(VALID_PTR(ppwsz));
650 Assert(!pcwc || VALID_PTR(pcwc));
651
652 /*
653 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
654 */
655 size_t cwcResult;
656 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
657 if (RT_SUCCESS(rc))
658 {
659 if (pcwc)
660 *pcwc = cwcResult;
661
662 /*
663 * Check buffer size / Allocate buffer.
664 */
665 bool fShouldFree;
666 PRTUTF16 pwszResult;
667 if (cwc > 0 && *ppwsz)
668 {
669 fShouldFree = false;
670 if (cwc <= cwcResult)
671 return VERR_BUFFER_OVERFLOW;
672 pwszResult = *ppwsz;
673 }
674 else
675 {
676 *ppwsz = NULL;
677 fShouldFree = true;
678 cwc = RT_MAX(cwcResult + 1, cwc);
679 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
680 }
681 if (pwszResult)
682 {
683 /*
684 * Encode the UTF-16 string.
685 */
686 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
687 if (RT_SUCCESS(rc))
688 {
689 *ppwsz = pwszResult;
690 return rc;
691 }
692 if (fShouldFree)
693 RTMemFree(pwszResult);
694 }
695 else
696 rc = VERR_NO_UTF16_MEMORY;
697 }
698 return rc;
699}
700
701
702RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
703{
704 size_t cwc;
705 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
706 return RT_SUCCESS(rc) ? cwc : 0;
707}
708
709
710RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
711{
712 size_t cwc;
713 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
714 if (pcwc)
715 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
716 return rc;
717}
718
719
720/**
721 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
722 * @returns rc
723 * @param ppsz The pointer to the the string position point.
724 * @param pCp Where to store RTUNICP_INVALID.
725 * @param rc The iprt error code.
726 */
727static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
728{
729 /*
730 * Try find a valid encoding.
731 */
732 (*ppsz)++; /** @todo code this! */
733 *pCp = RTUNICP_INVALID;
734 return rc;
735}
736
737
738RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
739{
740 RTUNICP Cp;
741 RTStrGetCpExInternal(&psz, &Cp);
742 return Cp;
743}
744
745
746RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
747{
748 const unsigned char *puch = (const unsigned char *)*ppsz;
749 const unsigned char uch = *puch;
750 RTUNICP uc;
751
752 /* ASCII ? */
753 if (!(uch & BIT(7)))
754 {
755 uc = uch;
756 puch++;
757 }
758 else if (uch & BIT(6))
759 {
760 /* figure the length and validate the first octet. */
761 unsigned cb;
762 if (!(uch & BIT(5)))
763 cb = 2;
764 else if (!(uch & BIT(4)))
765 cb = 3;
766 else if (!(uch & BIT(3)))
767 cb = 4;
768 else if (!(uch & BIT(2)))
769 cb = 5;
770 else if (!(uch & BIT(1)))
771 cb = 6;
772 else
773 {
774 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
775 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
776 }
777
778 /* validate the rest */
779 switch (cb)
780 {
781 case 6:
782 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
783 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
784 case 5:
785 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
786 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
787 case 4:
788 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
789 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
790 case 3:
791 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
792 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
793 case 2:
794 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
795 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
796 break;
797 }
798
799 /* get and validate the code point. */
800 switch (cb)
801 {
802 case 6:
803 uc = (puch[5] & 0x3f)
804 | ((RTUNICP)(puch[4] & 0x3f) << 6)
805 | ((RTUNICP)(puch[3] & 0x3f) << 12)
806 | ((RTUNICP)(puch[2] & 0x3f) << 18)
807 | ((RTUNICP)(puch[1] & 0x3f) << 24)
808 | ((RTUNICP)(uch & 0x01) << 30);
809 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
810 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
811 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
812 break;
813 case 5:
814 uc = (puch[4] & 0x3f)
815 | ((RTUNICP)(puch[3] & 0x3f) << 6)
816 | ((RTUNICP)(puch[2] & 0x3f) << 12)
817 | ((RTUNICP)(puch[1] & 0x3f) << 18)
818 | ((RTUNICP)(uch & 0x03) << 24);
819 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
820 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
821 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
822 break;
823 case 4:
824 uc = (puch[3] & 0x3f)
825 | ((RTUNICP)(puch[2] & 0x3f) << 6)
826 | ((RTUNICP)(puch[1] & 0x3f) << 12)
827 | ((RTUNICP)(uch & 0x07) << 18);
828 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
829 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
830 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
831 break;
832 case 3:
833 uc = (puch[2] & 0x3f)
834 | ((RTUNICP)(puch[1] & 0x3f) << 6)
835 | ((RTUNICP)(uch & 0x0f) << 12);
836 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
837 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
838 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
839 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
840 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
841 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
842 break;
843 case 2:
844 uc = (puch[1] & 0x3f)
845 | ((RTUNICP)(uch & 0x1f) << 6);
846 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
847 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
848 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
849 break;
850 default: /* impossible, but GCC is bitching. */
851 uc = RTUNICP_INVALID;
852 break;
853 }
854 puch += cb;
855 }
856 else
857 {
858 /* 6th bit is always set. */
859 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
860 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
861 }
862 *pCp = uc;
863 *ppsz = (const char *)puch;
864 return VINF_SUCCESS;
865}
866
867
868RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
869{
870 unsigned char *puch = (unsigned char *)psz;
871 if (uc < 0x80)
872 *puch++ = (unsigned char )uc;
873 else if (uc < 0x00000800)
874 {
875 *puch++ = 0xc0 | (uc >> 6);
876 *puch++ = 0x80 | (uc & 0x3f);
877 }
878 else if (uc < 0x00010000)
879 {
880 if ( uc < 0x0000d8000
881 || ( uc > 0x0000dfff
882 && uc < 0x0000fffe))
883 {
884 *puch++ = 0xe0 | (uc >> 12);
885 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
886 *puch++ = 0x80 | (uc & 0x3f);
887 }
888 else
889 {
890 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
891 *puch++ = 0x7f;
892 }
893 }
894 else if (uc < 0x00200000)
895 {
896 *puch++ = 0xf0 | (uc >> 18);
897 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
898 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
899 *puch++ = 0x80 | (uc & 0x3f);
900 }
901 else if (uc < 0x04000000)
902 {
903 *puch++ = 0xf1 | (uc >> 24);
904 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
905 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
906 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
907 *puch++ = 0x80 | (uc & 0x3f);
908 }
909 else if (uc <= 0x7fffffff)
910 {
911 *puch++ = 0xf3 | (uc >> 30);
912 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
913 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
914 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
915 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
916 *puch++ = 0x80 | (uc & 0x3f);
917 }
918 else
919 {
920 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
921 *puch++ = 0x7f;
922 }
923
924 return (char *)puch;
925}
926
927
928RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
929{
930 if (pszStart < psz)
931 {
932 /* simple char? */
933 const unsigned char *puch = (const unsigned char *)psz;
934 unsigned uch = *--puch;
935 if (!(uch & BIT(7)))
936 return (char *)puch;
937 RTStrAssertMsgReturn(!(uch & BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
938
939 /* two or more. */
940 uint32_t uMask = 0xffffffc0;
941 while ( (const unsigned char *)pszStart < puch
942 && !(uMask & 1))
943 {
944 unsigned uch = *--puch;
945 if ((uch & 0xc0) != 0x80)
946 {
947 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
948 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
949 (char *)pszStart);
950 return (char *)puch;
951 }
952 uMask >>= 1;
953 }
954 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
955 }
956 return (char *)pszStart;
957}
958
959
960/**
961 * Performs a case insensitive string compare between two UTF-8 strings.
962 *
963 * This is a simplified compare, as only the simplified lower/upper case folding
964 * specified by the unicode specs are used. It does not consider character pairs
965 * as they are used in some languages, just simple upper & lower case compares.
966 *
967 * @returns < 0 if the first string less than the second string.
968 * @returns 0 if the first string identical to the second string.
969 * @returns > 0 if the first string greater than the second string.
970 * @param psz1 First UTF-8 string.
971 * @param psz2 Second UTF-8 string.
972 */
973RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
974{
975 /** @todo implement proper UTF-8 case-insensitive string comparison. */
976#ifdef RT_OS_WINDOWS
977 return stricmp(psz1, psz2);
978#else /* !RT_OS_WINDOWS */
979 return strcasecmp(psz1, psz2);
980#endif /* !RT_OS_WINDOWS */
981}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette