VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 7389

Last change on this file since 7389 was 5999, checked in by vboxsync, 17 years ago

The Giant CDDL Dual-License Header Change.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 36.2 KB
Line 
1/* $Id: utf-8.cpp 5999 2007-12-07 15:05:06Z vboxsync $ */
2/** @file
3 * innotek Portable Runtime - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2007 innotek GmbH
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include <iprt/uni.h>
33#include <iprt/alloc.h>
34#include <iprt/assert.h>
35#include <iprt/err.h>
36#include "internal/string.h"
37
38
39
40/**
41 * Get get length in code points of a UTF-8 encoded string.
42 * The string is validated while doing this.
43 *
44 * @returns IPRT status code.
45 * @param psz Pointer to the UTF-8 string.
46 * @param cch The max length of the string. (btw cch = cb)
47 * Use RTSTR_MAX if all of the string is to be examined.s
48 * @param pcuc Where to store the length in unicode code points.
49 */
50static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc)
51{
52 const unsigned char *puch = (const unsigned char *)psz;
53 size_t cCodePoints = 0;
54 while (cch > 0)
55 {
56 const unsigned char uch = *puch;
57 if (!uch)
58 break;
59 if (uch & RT_BIT(7))
60 {
61 /* figure sequence length and validate the first byte */
62 unsigned cb;
63 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
64 cb = 2;
65 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
66 cb = 3;
67 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
68 cb = 4;
69 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
70 cb = 5;
71 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
72 cb = 6;
73 else
74 {
75 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
76 return VERR_INVALID_UTF8_ENCODING;
77 }
78
79 /* check length */
80 if (cb > cch)
81 {
82 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
83 return VERR_INVALID_UTF8_ENCODING;
84 }
85
86 /* validate the rest */
87 switch (cb)
88 {
89 case 6:
90 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
91 case 5:
92 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
93 case 4:
94 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
95 case 3:
96 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97 case 2:
98 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99 break;
100 }
101
102 /* validate the code point. */
103 RTUNICP uc;
104 switch (cb)
105 {
106 case 6:
107 uc = (puch[5] & 0x3f)
108 | ((RTUNICP)(puch[4] & 0x3f) << 6)
109 | ((RTUNICP)(puch[3] & 0x3f) << 12)
110 | ((RTUNICP)(puch[2] & 0x3f) << 18)
111 | ((RTUNICP)(puch[1] & 0x3f) << 24)
112 | ((RTUNICP)(uch & 0x01) << 30);
113 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
114 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
115 break;
116 case 5:
117 uc = (puch[4] & 0x3f)
118 | ((RTUNICP)(puch[3] & 0x3f) << 6)
119 | ((RTUNICP)(puch[2] & 0x3f) << 12)
120 | ((RTUNICP)(puch[1] & 0x3f) << 18)
121 | ((RTUNICP)(uch & 0x03) << 24);
122 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
123 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
124 break;
125 case 4:
126 uc = (puch[3] & 0x3f)
127 | ((RTUNICP)(puch[2] & 0x3f) << 6)
128 | ((RTUNICP)(puch[1] & 0x3f) << 12)
129 | ((RTUNICP)(uch & 0x07) << 18);
130 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
131 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
132 break;
133 case 3:
134 uc = (puch[2] & 0x3f)
135 | ((RTUNICP)(puch[1] & 0x3f) << 6)
136 | ((RTUNICP)(uch & 0x0f) << 12);
137 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
138 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
139 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
140 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
141 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
142 break;
143 case 2:
144 uc = (puch[1] & 0x3f)
145 | ((RTUNICP)(uch & 0x1f) << 6);
146 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
147 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
148 break;
149 }
150
151 /* advance */
152 cch -= cb;
153 puch += cb;
154 }
155 else
156 {
157 /* one ASCII byte */
158 puch++;
159 cch--;
160 }
161 cCodePoints++;
162 }
163
164 /* done */
165 *pcuc = cCodePoints;
166 return VINF_SUCCESS;
167}
168
169
170/**
171 * Decodes and UTF-8 string into an array of unicode code point.
172 *
173 * Since we know the input is valid, we do *not* perform encoding or length checks.
174 *
175 * @returns iprt status code.
176 * @param psz The UTF-8 string to recode. This is a valid encoding.
177 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
178 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
179 * @param paCps Where to store the code points array.
180 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
181 * @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
182 */
183static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t *pcCps)
184{
185 int rc = VINF_SUCCESS;
186 const unsigned char *puch = (const unsigned char *)psz;
187 const PRTUNICP pCpEnd = paCps + cCps;
188 PRTUNICP pCp = paCps;
189 Assert(pCpEnd >= pCp);
190 while (cch > 0)
191 {
192 /* read the next char and check for terminator. */
193 const unsigned char uch = *puch;
194 if (!uch)
195 break;
196
197 /* check for output overflow */
198 if (pCp >= pCpEnd)
199 {
200 rc = VERR_BUFFER_OVERFLOW;
201 break;
202 }
203
204 /* decode and recode the code point */
205 if (!(uch & RT_BIT(7)))
206 {
207 *pCp++ = uch;
208 puch++;
209 cch--;
210 }
211#ifdef RT_STRICT
212 else if (!(uch & RT_BIT(6)))
213 AssertMsgFailed(("Internal error!\n"));
214#endif
215 else if (!(uch & RT_BIT(5)))
216 {
217 *pCp++ = (puch[1] & 0x3f)
218 | ((uint16_t)(uch & 0x1f) << 6);
219 puch += 2;
220 cch -= 2;
221 }
222 else if (!(uch & RT_BIT(4)))
223 {
224 *pCp++ = (puch[2] & 0x3f)
225 | ((uint16_t)(puch[1] & 0x3f) << 6)
226 | ((uint16_t)(uch & 0x0f) << 12);
227 puch += 3;
228 cch -= 3;
229 }
230 else if (!(uch & RT_BIT(3)))
231 {
232 *pCp++ = (puch[3] & 0x3f)
233 | ((RTUNICP)(puch[2] & 0x3f) << 6)
234 | ((RTUNICP)(puch[1] & 0x3f) << 12)
235 | ((RTUNICP)(uch & 0x07) << 18);
236 puch += 4;
237 cch -= 4;
238 }
239 else if (!(uch & RT_BIT(2)))
240 {
241 *pCp++ = (puch[4] & 0x3f)
242 | ((RTUNICP)(puch[3] & 0x3f) << 6)
243 | ((RTUNICP)(puch[2] & 0x3f) << 12)
244 | ((RTUNICP)(puch[1] & 0x3f) << 18)
245 | ((RTUNICP)(uch & 0x03) << 24);
246 puch += 5;
247 cch -= 6;
248 }
249 else
250 {
251 Assert(!(uch & RT_BIT(1)));
252 *pCp++ = (puch[5] & 0x3f)
253 | ((RTUNICP)(puch[4] & 0x3f) << 6)
254 | ((RTUNICP)(puch[3] & 0x3f) << 12)
255 | ((RTUNICP)(puch[2] & 0x3f) << 18)
256 | ((RTUNICP)(puch[1] & 0x3f) << 24)
257 | ((RTUNICP)(uch & 0x01) << 30);
258 puch += 6;
259 cch -= 6;
260 }
261 }
262
263 /* done */
264 *pCp = 0;
265 *pcCps = pCp - paCps;
266 return rc;
267}
268
269
270RTDECL(size_t) RTStrUniLen(const char *psz)
271{
272 size_t cCodePoints;
273 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints);
274 return RT_SUCCESS(rc) ? cCodePoints : 0;
275}
276
277
278RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
279{
280 size_t cCodePoints;
281 int rc = rtUtf8Length(psz, cch, &cCodePoints);
282 if (pcCps)
283 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
284 return rc;
285}
286
287
288RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
289{
290 /*
291 * Validate input.
292 */
293 Assert(VALID_PTR(pszString));
294 Assert(VALID_PTR(ppaCps));
295 *ppaCps = NULL;
296
297 /*
298 * Validate the UTF-8 input and count its code points.
299 */
300 size_t cCps;
301 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps);
302 if (RT_SUCCESS(rc))
303 {
304 /*
305 * Allocate buffer.
306 */
307 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
308 if (paCps)
309 {
310 /*
311 * Decode the string.
312 */
313 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
314 if (RT_SUCCESS(rc))
315 {
316 *ppaCps = paCps;
317 return rc;
318 }
319 RTMemFree(paCps);
320 }
321 else
322 rc = VERR_NO_CODE_POINT_MEMORY;
323 }
324 return rc;
325}
326
327
328RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
329{
330 /*
331 * Validate input.
332 */
333 Assert(VALID_PTR(pszString));
334 Assert(VALID_PTR(ppaCps));
335 Assert(!pcCps || VALID_PTR(pcCps));
336
337 /*
338 * Validate the UTF-8 input and count the code points.
339 */
340 size_t cCpsResult;
341 int rc = rtUtf8Length(pszString, cchString, &cCpsResult);
342 if (RT_SUCCESS(rc))
343 {
344 if (pcCps)
345 *pcCps = cCpsResult;
346
347 /*
348 * Check buffer size / Allocate buffer.
349 */
350 bool fShouldFree;
351 PRTUNICP paCpsResult;
352 if (cCps > 0 && *ppaCps)
353 {
354 fShouldFree = false;
355 if (cCps <= cCpsResult)
356 return VERR_BUFFER_OVERFLOW;
357 paCpsResult = *ppaCps;
358 }
359 else
360 {
361 *ppaCps = NULL;
362 fShouldFree = true;
363 cCps = RT_MAX(cCpsResult + 1, cCps);
364 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
365 }
366 if (paCpsResult)
367 {
368 /*
369 * Encode the UTF-16 string.
370 */
371 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
372 if (RT_SUCCESS(rc))
373 {
374 *ppaCps = paCpsResult;
375 return rc;
376 }
377 if (fShouldFree)
378 RTMemFree(paCpsResult);
379 }
380 else
381 rc = VERR_NO_CODE_POINT_MEMORY;
382 }
383 return rc;
384}
385
386
387/**
388 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
389 *
390 * @returns IPRT status code.
391 * @param psz Pointer to the UTF-8 string.
392 * @param cch The max length of the string. (btw cch = cb)
393 * Use RTSTR_MAX if all of the string is to be examined.s
394 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
395 */
396static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
397{
398 const unsigned char *puch = (const unsigned char *)psz;
399 size_t cwc = 0;
400 while (cch > 0)
401 {
402 const unsigned char uch = *puch;
403 if (!uch)
404 break;
405 if (!(uch & RT_BIT(7)))
406 {
407 /* one ASCII byte */
408 cwc++;
409 puch++;
410 cch--;
411 }
412 else
413 {
414 /* figure sequence length and validate the first byte */
415 unsigned cb;
416 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
417 cb = 2;
418 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
419 cb = 3;
420 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
421 cb = 4;
422 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
423 cb = 5;
424 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
425 cb = 6;
426 else
427 {
428 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
429 return VERR_INVALID_UTF8_ENCODING;
430 }
431
432 /* check length */
433 if (cb > cch)
434 {
435 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
436 return VERR_INVALID_UTF8_ENCODING;
437 }
438
439 /* validate the rest */
440 switch (cb)
441 {
442 case 6:
443 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
444 case 5:
445 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
446 case 4:
447 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
448 case 3:
449 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
450 case 2:
451 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
452 break;
453 }
454
455 /* validate the code point. */
456 RTUNICP uc;
457 switch (cb)
458 {
459 case 6:
460 uc = (puch[5] & 0x3f)
461 | ((RTUNICP)(puch[4] & 0x3f) << 6)
462 | ((RTUNICP)(puch[3] & 0x3f) << 12)
463 | ((RTUNICP)(puch[2] & 0x3f) << 18)
464 | ((RTUNICP)(puch[1] & 0x3f) << 24)
465 | ((RTUNICP)(uch & 0x01) << 30);
466 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
467 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
468 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
469 return VERR_CANT_RECODE_AS_UTF16;
470 case 5:
471 uc = (puch[4] & 0x3f)
472 | ((RTUNICP)(puch[3] & 0x3f) << 6)
473 | ((RTUNICP)(puch[2] & 0x3f) << 12)
474 | ((RTUNICP)(puch[1] & 0x3f) << 18)
475 | ((RTUNICP)(uch & 0x03) << 24);
476 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
477 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
478 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
479 return VERR_CANT_RECODE_AS_UTF16;
480 case 4:
481 uc = (puch[3] & 0x3f)
482 | ((RTUNICP)(puch[2] & 0x3f) << 6)
483 | ((RTUNICP)(puch[1] & 0x3f) << 12)
484 | ((RTUNICP)(uch & 0x07) << 18);
485 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
486 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
487 RTStrAssertMsgReturn(uc <= 0x0010ffff,
488 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
489 cwc++;
490 break;
491 case 3:
492 uc = (puch[2] & 0x3f)
493 | ((RTUNICP)(puch[1] & 0x3f) << 6)
494 | ((RTUNICP)(uch & 0x0f) << 12);
495 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
496 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
497 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
498 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
499 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
500 break;
501 case 2:
502 uc = (puch[1] & 0x3f)
503 | ((RTUNICP)(uch & 0x1f) << 6);
504 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
505 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
506 break;
507 }
508
509 /* advance */
510 cch -= cb;
511 puch += cb;
512 cwc++;
513 }
514 }
515
516 /* done */
517 *pcwc = cwc;
518 return VINF_SUCCESS;
519}
520
521
522/**
523 * Recodes a valid UTF-8 string as UTF-16.
524 *
525 * Since we know the input is valid, we do *not* perform encoding or length checks.
526 *
527 * @returns iprt status code.
528 * @param psz The UTF-8 string to recode. This is a valid encoding.
529 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
530 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
531 * @param pwsz Where to store the UTF-16 string.
532 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
533 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
534 */
535static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc)
536{
537 int rc = VINF_SUCCESS;
538 const unsigned char *puch = (const unsigned char *)psz;
539 const PRTUTF16 pwszEnd = pwsz + cwc;
540 PRTUTF16 pwc = pwsz;
541 Assert(pwszEnd >= pwc);
542 while (cch > 0)
543 {
544 /* read the next char and check for terminator. */
545 const unsigned char uch = *puch;
546 if (!uch)
547 break;
548
549 /* check for output overflow */
550 if (pwc >= pwszEnd)
551 {
552 rc = VERR_BUFFER_OVERFLOW;
553 break;
554 }
555
556 /* decode and recode the code point */
557 if (!(uch & RT_BIT(7)))
558 {
559 *pwc++ = uch;
560 puch++;
561 cch--;
562 }
563 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
564 {
565 uint16_t uc = (puch[1] & 0x3f)
566 | ((uint16_t)(uch & 0x1f) << 6);
567 *pwc++ = uc;
568 puch += 2;
569 cch -= 2;
570 }
571 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
572 {
573 uint16_t uc = (puch[2] & 0x3f)
574 | ((uint16_t)(puch[1] & 0x3f) << 6)
575 | ((uint16_t)(uch & 0x0f) << 12);
576 *pwc++ = uc;
577 puch += 3;
578 cch -= 3;
579 }
580 else
581 {
582 /* generate surrugate pair */
583 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
584 RTUNICP uc = (puch[3] & 0x3f)
585 | ((RTUNICP)(puch[2] & 0x3f) << 6)
586 | ((RTUNICP)(puch[1] & 0x3f) << 12)
587 | ((RTUNICP)(uch & 0x07) << 18);
588 if (pwc + 1 >= pwszEnd)
589 {
590 rc = VERR_BUFFER_OVERFLOW;
591 break;
592 }
593 uc -= 0x10000;
594 *pwc++ = 0xd800 | (uc >> 10);
595 *pwc++ = 0xdc00 | (uc & 0x3ff);
596 puch += 4;
597 cch -= 4;
598 }
599 }
600
601 /* done */
602 *pwc = '\0';
603 *pcwc = pwc - pwsz;
604 return rc;
605}
606
607
608RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
609{
610 /*
611 * Validate input.
612 */
613 Assert(VALID_PTR(ppwszString));
614 Assert(VALID_PTR(pszString));
615 *ppwszString = NULL;
616
617 /*
618 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
619 */
620 size_t cwc;
621 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
622 if (RT_SUCCESS(rc))
623 {
624 /*
625 * Allocate buffer.
626 */
627 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
628 if (pwsz)
629 {
630 /*
631 * Encode the UTF-16 string.
632 */
633 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
634 if (RT_SUCCESS(rc))
635 {
636 *ppwszString = pwsz;
637 return rc;
638 }
639 RTMemFree(pwsz);
640 }
641 else
642 rc = VERR_NO_UTF16_MEMORY;
643 }
644 return rc;
645}
646
647
648RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
649{
650 /*
651 * Validate input.
652 */
653 Assert(VALID_PTR(pszString));
654 Assert(VALID_PTR(ppwsz));
655 Assert(!pcwc || VALID_PTR(pcwc));
656
657 /*
658 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
659 */
660 size_t cwcResult;
661 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
662 if (RT_SUCCESS(rc))
663 {
664 if (pcwc)
665 *pcwc = cwcResult;
666
667 /*
668 * Check buffer size / Allocate buffer.
669 */
670 bool fShouldFree;
671 PRTUTF16 pwszResult;
672 if (cwc > 0 && *ppwsz)
673 {
674 fShouldFree = false;
675 if (cwc <= cwcResult)
676 return VERR_BUFFER_OVERFLOW;
677 pwszResult = *ppwsz;
678 }
679 else
680 {
681 *ppwsz = NULL;
682 fShouldFree = true;
683 cwc = RT_MAX(cwcResult + 1, cwc);
684 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
685 }
686 if (pwszResult)
687 {
688 /*
689 * Encode the UTF-16 string.
690 */
691 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
692 if (RT_SUCCESS(rc))
693 {
694 *ppwsz = pwszResult;
695 return rc;
696 }
697 if (fShouldFree)
698 RTMemFree(pwszResult);
699 }
700 else
701 rc = VERR_NO_UTF16_MEMORY;
702 }
703 return rc;
704}
705
706
707RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
708{
709 size_t cwc;
710 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
711 return RT_SUCCESS(rc) ? cwc : 0;
712}
713
714
715RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
716{
717 size_t cwc;
718 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
719 if (pcwc)
720 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
721 return rc;
722}
723
724
725/**
726 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
727 * @returns rc
728 * @param ppsz The pointer to the the string position point.
729 * @param pCp Where to store RTUNICP_INVALID.
730 * @param rc The iprt error code.
731 */
732static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
733{
734 /*
735 * Try find a valid encoding.
736 */
737 (*ppsz)++; /** @todo code this! */
738 *pCp = RTUNICP_INVALID;
739 return rc;
740}
741
742
743RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
744{
745 RTUNICP Cp;
746 RTStrGetCpExInternal(&psz, &Cp);
747 return Cp;
748}
749
750
751RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
752{
753 const unsigned char *puch = (const unsigned char *)*ppsz;
754 const unsigned char uch = *puch;
755 RTUNICP uc;
756
757 /* ASCII ? */
758 if (!(uch & RT_BIT(7)))
759 {
760 uc = uch;
761 puch++;
762 }
763 else if (uch & RT_BIT(6))
764 {
765 /* figure the length and validate the first octet. */
766 unsigned cb;
767 if (!(uch & RT_BIT(5)))
768 cb = 2;
769 else if (!(uch & RT_BIT(4)))
770 cb = 3;
771 else if (!(uch & RT_BIT(3)))
772 cb = 4;
773 else if (!(uch & RT_BIT(2)))
774 cb = 5;
775 else if (!(uch & RT_BIT(1)))
776 cb = 6;
777 else
778 {
779 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
780 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
781 }
782
783 /* validate the rest */
784 switch (cb)
785 {
786 case 6:
787 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
788 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
789 case 5:
790 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
791 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
792 case 4:
793 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
794 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
795 case 3:
796 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
797 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
798 case 2:
799 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
800 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
801 break;
802 }
803
804 /* get and validate the code point. */
805 switch (cb)
806 {
807 case 6:
808 uc = (puch[5] & 0x3f)
809 | ((RTUNICP)(puch[4] & 0x3f) << 6)
810 | ((RTUNICP)(puch[3] & 0x3f) << 12)
811 | ((RTUNICP)(puch[2] & 0x3f) << 18)
812 | ((RTUNICP)(puch[1] & 0x3f) << 24)
813 | ((RTUNICP)(uch & 0x01) << 30);
814 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
815 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
816 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
817 break;
818 case 5:
819 uc = (puch[4] & 0x3f)
820 | ((RTUNICP)(puch[3] & 0x3f) << 6)
821 | ((RTUNICP)(puch[2] & 0x3f) << 12)
822 | ((RTUNICP)(puch[1] & 0x3f) << 18)
823 | ((RTUNICP)(uch & 0x03) << 24);
824 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
825 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
826 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
827 break;
828 case 4:
829 uc = (puch[3] & 0x3f)
830 | ((RTUNICP)(puch[2] & 0x3f) << 6)
831 | ((RTUNICP)(puch[1] & 0x3f) << 12)
832 | ((RTUNICP)(uch & 0x07) << 18);
833 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
834 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
835 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
836 break;
837 case 3:
838 uc = (puch[2] & 0x3f)
839 | ((RTUNICP)(puch[1] & 0x3f) << 6)
840 | ((RTUNICP)(uch & 0x0f) << 12);
841 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
842 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
843 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
844 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
845 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
846 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
847 break;
848 case 2:
849 uc = (puch[1] & 0x3f)
850 | ((RTUNICP)(uch & 0x1f) << 6);
851 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
852 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
853 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
854 break;
855 default: /* impossible, but GCC is bitching. */
856 uc = RTUNICP_INVALID;
857 break;
858 }
859 puch += cb;
860 }
861 else
862 {
863 /* 6th bit is always set. */
864 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
865 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
866 }
867 *pCp = uc;
868 *ppsz = (const char *)puch;
869 return VINF_SUCCESS;
870}
871
872
873RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
874{
875 unsigned char *puch = (unsigned char *)psz;
876 if (uc < 0x80)
877 *puch++ = (unsigned char )uc;
878 else if (uc < 0x00000800)
879 {
880 *puch++ = 0xc0 | (uc >> 6);
881 *puch++ = 0x80 | (uc & 0x3f);
882 }
883 else if (uc < 0x00010000)
884 {
885 if ( uc < 0x0000d8000
886 || ( uc > 0x0000dfff
887 && uc < 0x0000fffe))
888 {
889 *puch++ = 0xe0 | (uc >> 12);
890 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
891 *puch++ = 0x80 | (uc & 0x3f);
892 }
893 else
894 {
895 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
896 *puch++ = 0x7f;
897 }
898 }
899 else if (uc < 0x00200000)
900 {
901 *puch++ = 0xf0 | (uc >> 18);
902 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
903 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
904 *puch++ = 0x80 | (uc & 0x3f);
905 }
906 else if (uc < 0x04000000)
907 {
908 *puch++ = 0xf1 | (uc >> 24);
909 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
910 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
911 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
912 *puch++ = 0x80 | (uc & 0x3f);
913 }
914 else if (uc <= 0x7fffffff)
915 {
916 *puch++ = 0xf3 | (uc >> 30);
917 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
918 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
919 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
920 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
921 *puch++ = 0x80 | (uc & 0x3f);
922 }
923 else
924 {
925 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
926 *puch++ = 0x7f;
927 }
928
929 return (char *)puch;
930}
931
932
933RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
934{
935 if (pszStart < psz)
936 {
937 /* simple char? */
938 const unsigned char *puch = (const unsigned char *)psz;
939 unsigned uch = *--puch;
940 if (!(uch & RT_BIT(7)))
941 return (char *)puch;
942 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
943
944 /* two or more. */
945 uint32_t uMask = 0xffffffc0;
946 while ( (const unsigned char *)pszStart < puch
947 && !(uMask & 1))
948 {
949 unsigned uch = *--puch;
950 if ((uch & 0xc0) != 0x80)
951 {
952 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
953 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
954 (char *)pszStart);
955 return (char *)puch;
956 }
957 uMask >>= 1;
958 }
959 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
960 }
961 return (char *)pszStart;
962}
963
964
965/**
966 * Performs a case insensitive string compare between two UTF-8 strings.
967 *
968 * This is a simplified compare, as only the simplified lower/upper case folding
969 * specified by the unicode specs are used. It does not consider character pairs
970 * as they are used in some languages, just simple upper & lower case compares.
971 *
972 * @returns < 0 if the first string less than the second string.
973 * @returns 0 if the first string identical to the second string.
974 * @returns > 0 if the first string greater than the second string.
975 * @param psz1 First UTF-8 string.
976 * @param psz2 Second UTF-8 string.
977 */
978RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
979{
980 /** @todo implement proper UTF-8 case-insensitive string comparison. */
981#ifdef RT_OS_WINDOWS
982 return stricmp(psz1, psz2);
983#else /* !RT_OS_WINDOWS */
984 return strcasecmp(psz1, psz2);
985#endif /* !RT_OS_WINDOWS */
986}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette