VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 10808

Last change on this file since 10808 was 10106, checked in by vboxsync, 17 years ago

Added RTStrValidateEncoding, RTStrValidateEncodingEx and RTStrIsValidEncoding for explicit UTF-8 validation.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 38.9 KB
Line 
1/* $Id: utf-8.cpp 10106 2008-07-02 13:40:07Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31
32/*******************************************************************************
33* Header Files *
34*******************************************************************************/
35#include <iprt/string.h>
36#include <iprt/uni.h>
37#include <iprt/alloc.h>
38#include <iprt/assert.h>
39#include <iprt/err.h>
40#include "internal/string.h"
41
42
43
44/**
45 * Get get length in code points of a UTF-8 encoded string.
46 * The string is validated while doing this.
47 *
48 * @returns IPRT status code.
49 * @param psz Pointer to the UTF-8 string.
50 * @param cch The max length of the string. (btw cch = cb)
51 * Use RTSTR_MAX if all of the string is to be examined.s
52 * @param pcuc Where to store the length in unicode code points.
53 */
54static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66 unsigned cb;
67 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
68 cb = 2;
69 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
70 cb = 3;
71 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
72 cb = 4;
73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
74 cb = 5;
75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
76 cb = 6;
77 else
78 {
79 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
80 return VERR_INVALID_UTF8_ENCODING;
81 }
82
83 /* check length */
84 if (cb > cch)
85 {
86 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
87 return VERR_INVALID_UTF8_ENCODING;
88 }
89
90 /* validate the rest */
91 switch (cb)
92 {
93 case 6:
94 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
95 case 5:
96 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97 case 4:
98 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99 case 3:
100 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101 case 2:
102 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103 break;
104 }
105
106 /* validate the code point. */
107 RTUNICP uc;
108 switch (cb)
109 {
110 case 6:
111 uc = (puch[5] & 0x3f)
112 | ((RTUNICP)(puch[4] & 0x3f) << 6)
113 | ((RTUNICP)(puch[3] & 0x3f) << 12)
114 | ((RTUNICP)(puch[2] & 0x3f) << 18)
115 | ((RTUNICP)(puch[1] & 0x3f) << 24)
116 | ((RTUNICP)(uch & 0x01) << 30);
117 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
118 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
119 break;
120 case 5:
121 uc = (puch[4] & 0x3f)
122 | ((RTUNICP)(puch[3] & 0x3f) << 6)
123 | ((RTUNICP)(puch[2] & 0x3f) << 12)
124 | ((RTUNICP)(puch[1] & 0x3f) << 18)
125 | ((RTUNICP)(uch & 0x03) << 24);
126 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
127 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
128 break;
129 case 4:
130 uc = (puch[3] & 0x3f)
131 | ((RTUNICP)(puch[2] & 0x3f) << 6)
132 | ((RTUNICP)(puch[1] & 0x3f) << 12)
133 | ((RTUNICP)(uch & 0x07) << 18);
134 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
135 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
136 break;
137 case 3:
138 uc = (puch[2] & 0x3f)
139 | ((RTUNICP)(puch[1] & 0x3f) << 6)
140 | ((RTUNICP)(uch & 0x0f) << 12);
141 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
142 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
143 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
144 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
145 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
146 break;
147 case 2:
148 uc = (puch[1] & 0x3f)
149 | ((RTUNICP)(uch & 0x1f) << 6);
150 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
151 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
152 break;
153 }
154
155 /* advance */
156 cch -= cb;
157 puch += cb;
158 }
159 else
160 {
161 /* one ASCII byte */
162 puch++;
163 cch--;
164 }
165 cCodePoints++;
166 }
167
168 /* done */
169 *pcuc = cCodePoints;
170 return VINF_SUCCESS;
171}
172
173
174/**
175 * Decodes and UTF-8 string into an array of unicode code point.
176 *
177 * Since we know the input is valid, we do *not* perform encoding or length checks.
178 *
179 * @returns iprt status code.
180 * @param psz The UTF-8 string to recode. This is a valid encoding.
181 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
182 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
183 * @param paCps Where to store the code points array.
184 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
185 * @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
186 */
187static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t *pcCps)
188{
189 int rc = VINF_SUCCESS;
190 const unsigned char *puch = (const unsigned char *)psz;
191 const PRTUNICP pCpEnd = paCps + cCps;
192 PRTUNICP pCp = paCps;
193 Assert(pCpEnd >= pCp);
194 while (cch > 0)
195 {
196 /* read the next char and check for terminator. */
197 const unsigned char uch = *puch;
198 if (!uch)
199 break;
200
201 /* check for output overflow */
202 if (pCp >= pCpEnd)
203 {
204 rc = VERR_BUFFER_OVERFLOW;
205 break;
206 }
207
208 /* decode and recode the code point */
209 if (!(uch & RT_BIT(7)))
210 {
211 *pCp++ = uch;
212 puch++;
213 cch--;
214 }
215#ifdef RT_STRICT
216 else if (!(uch & RT_BIT(6)))
217 AssertMsgFailed(("Internal error!\n"));
218#endif
219 else if (!(uch & RT_BIT(5)))
220 {
221 *pCp++ = (puch[1] & 0x3f)
222 | ((uint16_t)(uch & 0x1f) << 6);
223 puch += 2;
224 cch -= 2;
225 }
226 else if (!(uch & RT_BIT(4)))
227 {
228 *pCp++ = (puch[2] & 0x3f)
229 | ((uint16_t)(puch[1] & 0x3f) << 6)
230 | ((uint16_t)(uch & 0x0f) << 12);
231 puch += 3;
232 cch -= 3;
233 }
234 else if (!(uch & RT_BIT(3)))
235 {
236 *pCp++ = (puch[3] & 0x3f)
237 | ((RTUNICP)(puch[2] & 0x3f) << 6)
238 | ((RTUNICP)(puch[1] & 0x3f) << 12)
239 | ((RTUNICP)(uch & 0x07) << 18);
240 puch += 4;
241 cch -= 4;
242 }
243 else if (!(uch & RT_BIT(2)))
244 {
245 *pCp++ = (puch[4] & 0x3f)
246 | ((RTUNICP)(puch[3] & 0x3f) << 6)
247 | ((RTUNICP)(puch[2] & 0x3f) << 12)
248 | ((RTUNICP)(puch[1] & 0x3f) << 18)
249 | ((RTUNICP)(uch & 0x03) << 24);
250 puch += 5;
251 cch -= 6;
252 }
253 else
254 {
255 Assert(!(uch & RT_BIT(1)));
256 *pCp++ = (puch[5] & 0x3f)
257 | ((RTUNICP)(puch[4] & 0x3f) << 6)
258 | ((RTUNICP)(puch[3] & 0x3f) << 12)
259 | ((RTUNICP)(puch[2] & 0x3f) << 18)
260 | ((RTUNICP)(puch[1] & 0x3f) << 24)
261 | ((RTUNICP)(uch & 0x01) << 30);
262 puch += 6;
263 cch -= 6;
264 }
265 }
266
267 /* done */
268 *pCp = 0;
269 *pcCps = pCp - paCps;
270 return rc;
271}
272
273
274RTDECL(size_t) RTStrUniLen(const char *psz)
275{
276 size_t cCodePoints;
277 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints);
278 return RT_SUCCESS(rc) ? cCodePoints : 0;
279}
280
281
282RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
283{
284 size_t cCodePoints;
285 int rc = rtUtf8Length(psz, cch, &cCodePoints);
286 if (pcCps)
287 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
288 return rc;
289}
290
291
292RTDECL(int) RTStrValidateEncoding(const char *psz)
293{
294 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
295}
296
297
298RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, unsigned fFlags)
299{
300 NOREF(fFlags);
301 size_t cCpsIgnored;
302 return RTStrUniLenEx(psz, cch, &cCpsIgnored);
303}
304
305
306RTDECL(bool) RTStrIsValidEncoding(const char *psz)
307{
308 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
309 return RT_SUCCESS(rc);
310}
311
312
313RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
314{
315 /*
316 * Validate input.
317 */
318 Assert(VALID_PTR(pszString));
319 Assert(VALID_PTR(ppaCps));
320 *ppaCps = NULL;
321
322 /*
323 * Validate the UTF-8 input and count its code points.
324 */
325 size_t cCps;
326 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps);
327 if (RT_SUCCESS(rc))
328 {
329 /*
330 * Allocate buffer.
331 */
332 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
333 if (paCps)
334 {
335 /*
336 * Decode the string.
337 */
338 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
339 if (RT_SUCCESS(rc))
340 {
341 *ppaCps = paCps;
342 return rc;
343 }
344 RTMemFree(paCps);
345 }
346 else
347 rc = VERR_NO_CODE_POINT_MEMORY;
348 }
349 return rc;
350}
351
352
353RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
354{
355 /*
356 * Validate input.
357 */
358 Assert(VALID_PTR(pszString));
359 Assert(VALID_PTR(ppaCps));
360 Assert(!pcCps || VALID_PTR(pcCps));
361
362 /*
363 * Validate the UTF-8 input and count the code points.
364 */
365 size_t cCpsResult;
366 int rc = rtUtf8Length(pszString, cchString, &cCpsResult);
367 if (RT_SUCCESS(rc))
368 {
369 if (pcCps)
370 *pcCps = cCpsResult;
371
372 /*
373 * Check buffer size / Allocate buffer.
374 */
375 bool fShouldFree;
376 PRTUNICP paCpsResult;
377 if (cCps > 0 && *ppaCps)
378 {
379 fShouldFree = false;
380 if (cCps <= cCpsResult)
381 return VERR_BUFFER_OVERFLOW;
382 paCpsResult = *ppaCps;
383 }
384 else
385 {
386 *ppaCps = NULL;
387 fShouldFree = true;
388 cCps = RT_MAX(cCpsResult + 1, cCps);
389 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
390 }
391 if (paCpsResult)
392 {
393 /*
394 * Encode the UTF-16 string.
395 */
396 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
397 if (RT_SUCCESS(rc))
398 {
399 *ppaCps = paCpsResult;
400 return rc;
401 }
402 if (fShouldFree)
403 RTMemFree(paCpsResult);
404 }
405 else
406 rc = VERR_NO_CODE_POINT_MEMORY;
407 }
408 return rc;
409}
410
411
412/**
413 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
414 *
415 * @returns IPRT status code.
416 * @param psz Pointer to the UTF-8 string.
417 * @param cch The max length of the string. (btw cch = cb)
418 * Use RTSTR_MAX if all of the string is to be examined.s
419 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
420 */
421static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
422{
423 const unsigned char *puch = (const unsigned char *)psz;
424 size_t cwc = 0;
425 while (cch > 0)
426 {
427 const unsigned char uch = *puch;
428 if (!uch)
429 break;
430 if (!(uch & RT_BIT(7)))
431 {
432 /* one ASCII byte */
433 cwc++;
434 puch++;
435 cch--;
436 }
437 else
438 {
439 /* figure sequence length and validate the first byte */
440 unsigned cb;
441 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
442 cb = 2;
443 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
444 cb = 3;
445 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
446 cb = 4;
447 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
448 cb = 5;
449 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
450 cb = 6;
451 else
452 {
453 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
454 return VERR_INVALID_UTF8_ENCODING;
455 }
456
457 /* check length */
458 if (cb > cch)
459 {
460 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
461 return VERR_INVALID_UTF8_ENCODING;
462 }
463
464 /* validate the rest */
465 switch (cb)
466 {
467 case 6:
468 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
469 case 5:
470 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
471 case 4:
472 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
473 case 3:
474 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
475 case 2:
476 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
477 break;
478 }
479
480 /* validate the code point. */
481 RTUNICP uc;
482 switch (cb)
483 {
484 case 6:
485 uc = (puch[5] & 0x3f)
486 | ((RTUNICP)(puch[4] & 0x3f) << 6)
487 | ((RTUNICP)(puch[3] & 0x3f) << 12)
488 | ((RTUNICP)(puch[2] & 0x3f) << 18)
489 | ((RTUNICP)(puch[1] & 0x3f) << 24)
490 | ((RTUNICP)(uch & 0x01) << 30);
491 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
492 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
493 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
494 return VERR_CANT_RECODE_AS_UTF16;
495 case 5:
496 uc = (puch[4] & 0x3f)
497 | ((RTUNICP)(puch[3] & 0x3f) << 6)
498 | ((RTUNICP)(puch[2] & 0x3f) << 12)
499 | ((RTUNICP)(puch[1] & 0x3f) << 18)
500 | ((RTUNICP)(uch & 0x03) << 24);
501 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
502 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
503 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
504 return VERR_CANT_RECODE_AS_UTF16;
505 case 4:
506 uc = (puch[3] & 0x3f)
507 | ((RTUNICP)(puch[2] & 0x3f) << 6)
508 | ((RTUNICP)(puch[1] & 0x3f) << 12)
509 | ((RTUNICP)(uch & 0x07) << 18);
510 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
511 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
512 RTStrAssertMsgReturn(uc <= 0x0010ffff,
513 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
514 cwc++;
515 break;
516 case 3:
517 uc = (puch[2] & 0x3f)
518 | ((RTUNICP)(puch[1] & 0x3f) << 6)
519 | ((RTUNICP)(uch & 0x0f) << 12);
520 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
521 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
522 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
523 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
524 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
525 break;
526 case 2:
527 uc = (puch[1] & 0x3f)
528 | ((RTUNICP)(uch & 0x1f) << 6);
529 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
530 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
531 break;
532 }
533
534 /* advance */
535 cch -= cb;
536 puch += cb;
537 cwc++;
538 }
539 }
540
541 /* done */
542 *pcwc = cwc;
543 return VINF_SUCCESS;
544}
545
546
547/**
548 * Recodes a valid UTF-8 string as UTF-16.
549 *
550 * Since we know the input is valid, we do *not* perform encoding or length checks.
551 *
552 * @returns iprt status code.
553 * @param psz The UTF-8 string to recode. This is a valid encoding.
554 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
555 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
556 * @param pwsz Where to store the UTF-16 string.
557 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
558 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
559 */
560static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc)
561{
562 int rc = VINF_SUCCESS;
563 const unsigned char *puch = (const unsigned char *)psz;
564 const PRTUTF16 pwszEnd = pwsz + cwc;
565 PRTUTF16 pwc = pwsz;
566 Assert(pwszEnd >= pwc);
567 while (cch > 0)
568 {
569 /* read the next char and check for terminator. */
570 const unsigned char uch = *puch;
571 if (!uch)
572 break;
573
574 /* check for output overflow */
575 if (pwc >= pwszEnd)
576 {
577 rc = VERR_BUFFER_OVERFLOW;
578 break;
579 }
580
581 /* decode and recode the code point */
582 if (!(uch & RT_BIT(7)))
583 {
584 *pwc++ = uch;
585 puch++;
586 cch--;
587 }
588 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
589 {
590 uint16_t uc = (puch[1] & 0x3f)
591 | ((uint16_t)(uch & 0x1f) << 6);
592 *pwc++ = uc;
593 puch += 2;
594 cch -= 2;
595 }
596 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
597 {
598 uint16_t uc = (puch[2] & 0x3f)
599 | ((uint16_t)(puch[1] & 0x3f) << 6)
600 | ((uint16_t)(uch & 0x0f) << 12);
601 *pwc++ = uc;
602 puch += 3;
603 cch -= 3;
604 }
605 else
606 {
607 /* generate surrugate pair */
608 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
609 RTUNICP uc = (puch[3] & 0x3f)
610 | ((RTUNICP)(puch[2] & 0x3f) << 6)
611 | ((RTUNICP)(puch[1] & 0x3f) << 12)
612 | ((RTUNICP)(uch & 0x07) << 18);
613 if (pwc + 1 >= pwszEnd)
614 {
615 rc = VERR_BUFFER_OVERFLOW;
616 break;
617 }
618 uc -= 0x10000;
619 *pwc++ = 0xd800 | (uc >> 10);
620 *pwc++ = 0xdc00 | (uc & 0x3ff);
621 puch += 4;
622 cch -= 4;
623 }
624 }
625
626 /* done */
627 *pwc = '\0';
628 *pcwc = pwc - pwsz;
629 return rc;
630}
631
632
633RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
634{
635 /*
636 * Validate input.
637 */
638 Assert(VALID_PTR(ppwszString));
639 Assert(VALID_PTR(pszString));
640 *ppwszString = NULL;
641
642 /*
643 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
644 */
645 size_t cwc;
646 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
647 if (RT_SUCCESS(rc))
648 {
649 /*
650 * Allocate buffer.
651 */
652 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
653 if (pwsz)
654 {
655 /*
656 * Encode the UTF-16 string.
657 */
658 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
659 if (RT_SUCCESS(rc))
660 {
661 *ppwszString = pwsz;
662 return rc;
663 }
664 RTMemFree(pwsz);
665 }
666 else
667 rc = VERR_NO_UTF16_MEMORY;
668 }
669 return rc;
670}
671
672
673RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
674{
675 /*
676 * Validate input.
677 */
678 Assert(VALID_PTR(pszString));
679 Assert(VALID_PTR(ppwsz));
680 Assert(!pcwc || VALID_PTR(pcwc));
681
682 /*
683 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
684 */
685 size_t cwcResult;
686 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
687 if (RT_SUCCESS(rc))
688 {
689 if (pcwc)
690 *pcwc = cwcResult;
691
692 /*
693 * Check buffer size / Allocate buffer.
694 */
695 bool fShouldFree;
696 PRTUTF16 pwszResult;
697 if (cwc > 0 && *ppwsz)
698 {
699 fShouldFree = false;
700 if (cwc <= cwcResult)
701 return VERR_BUFFER_OVERFLOW;
702 pwszResult = *ppwsz;
703 }
704 else
705 {
706 *ppwsz = NULL;
707 fShouldFree = true;
708 cwc = RT_MAX(cwcResult + 1, cwc);
709 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
710 }
711 if (pwszResult)
712 {
713 /*
714 * Encode the UTF-16 string.
715 */
716 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
717 if (RT_SUCCESS(rc))
718 {
719 *ppwsz = pwszResult;
720 return rc;
721 }
722 if (fShouldFree)
723 RTMemFree(pwszResult);
724 }
725 else
726 rc = VERR_NO_UTF16_MEMORY;
727 }
728 return rc;
729}
730
731
732RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
733{
734 size_t cwc;
735 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
736 return RT_SUCCESS(rc) ? cwc : 0;
737}
738
739
740RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
741{
742 size_t cwc;
743 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
744 if (pcwc)
745 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
746 return rc;
747}
748
749
750/**
751 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
752 * @returns rc
753 * @param ppsz The pointer to the the string position point.
754 * @param pCp Where to store RTUNICP_INVALID.
755 * @param rc The iprt error code.
756 */
757static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
758{
759 /*
760 * Try find a valid encoding.
761 */
762 (*ppsz)++; /** @todo code this! */
763 *pCp = RTUNICP_INVALID;
764 return rc;
765}
766
767
768RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
769{
770 RTUNICP Cp;
771 RTStrGetCpExInternal(&psz, &Cp);
772 return Cp;
773}
774
775
776RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
777{
778 const unsigned char *puch = (const unsigned char *)*ppsz;
779 const unsigned char uch = *puch;
780 RTUNICP uc;
781
782 /* ASCII ? */
783 if (!(uch & RT_BIT(7)))
784 {
785 uc = uch;
786 puch++;
787 }
788 else if (uch & RT_BIT(6))
789 {
790 /* figure the length and validate the first octet. */
791 unsigned cb;
792 if (!(uch & RT_BIT(5)))
793 cb = 2;
794 else if (!(uch & RT_BIT(4)))
795 cb = 3;
796 else if (!(uch & RT_BIT(3)))
797 cb = 4;
798 else if (!(uch & RT_BIT(2)))
799 cb = 5;
800 else if (!(uch & RT_BIT(1)))
801 cb = 6;
802 else
803 {
804 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
805 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
806 }
807
808 /* validate the rest */
809 switch (cb)
810 {
811 case 6:
812 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
813 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
814 case 5:
815 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
816 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
817 case 4:
818 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
819 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
820 case 3:
821 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
822 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
823 case 2:
824 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
825 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
826 break;
827 }
828
829 /* get and validate the code point. */
830 switch (cb)
831 {
832 case 6:
833 uc = (puch[5] & 0x3f)
834 | ((RTUNICP)(puch[4] & 0x3f) << 6)
835 | ((RTUNICP)(puch[3] & 0x3f) << 12)
836 | ((RTUNICP)(puch[2] & 0x3f) << 18)
837 | ((RTUNICP)(puch[1] & 0x3f) << 24)
838 | ((RTUNICP)(uch & 0x01) << 30);
839 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
840 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
841 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
842 break;
843 case 5:
844 uc = (puch[4] & 0x3f)
845 | ((RTUNICP)(puch[3] & 0x3f) << 6)
846 | ((RTUNICP)(puch[2] & 0x3f) << 12)
847 | ((RTUNICP)(puch[1] & 0x3f) << 18)
848 | ((RTUNICP)(uch & 0x03) << 24);
849 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
850 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
851 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
852 break;
853 case 4:
854 uc = (puch[3] & 0x3f)
855 | ((RTUNICP)(puch[2] & 0x3f) << 6)
856 | ((RTUNICP)(puch[1] & 0x3f) << 12)
857 | ((RTUNICP)(uch & 0x07) << 18);
858 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
859 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
860 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
861 break;
862 case 3:
863 uc = (puch[2] & 0x3f)
864 | ((RTUNICP)(puch[1] & 0x3f) << 6)
865 | ((RTUNICP)(uch & 0x0f) << 12);
866 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
867 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
868 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
869 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
870 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
871 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
872 break;
873 case 2:
874 uc = (puch[1] & 0x3f)
875 | ((RTUNICP)(uch & 0x1f) << 6);
876 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
877 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
878 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
879 break;
880 default: /* impossible, but GCC is bitching. */
881 uc = RTUNICP_INVALID;
882 break;
883 }
884 puch += cb;
885 }
886 else
887 {
888 /* 6th bit is always set. */
889 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
890 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
891 }
892 *pCp = uc;
893 *ppsz = (const char *)puch;
894 return VINF_SUCCESS;
895}
896
897
898RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
899{
900 unsigned char *puch = (unsigned char *)psz;
901 if (uc < 0x80)
902 *puch++ = (unsigned char )uc;
903 else if (uc < 0x00000800)
904 {
905 *puch++ = 0xc0 | (uc >> 6);
906 *puch++ = 0x80 | (uc & 0x3f);
907 }
908 else if (uc < 0x00010000)
909 {
910 if ( uc < 0x0000d8000
911 || ( uc > 0x0000dfff
912 && uc < 0x0000fffe))
913 {
914 *puch++ = 0xe0 | (uc >> 12);
915 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
916 *puch++ = 0x80 | (uc & 0x3f);
917 }
918 else
919 {
920 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
921 *puch++ = 0x7f;
922 }
923 }
924 else if (uc < 0x00200000)
925 {
926 *puch++ = 0xf0 | (uc >> 18);
927 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
928 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
929 *puch++ = 0x80 | (uc & 0x3f);
930 }
931 else if (uc < 0x04000000)
932 {
933 *puch++ = 0xf1 | (uc >> 24);
934 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
935 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
936 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
937 *puch++ = 0x80 | (uc & 0x3f);
938 }
939 else if (uc <= 0x7fffffff)
940 {
941 *puch++ = 0xf3 | (uc >> 30);
942 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
943 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
944 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
945 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
946 *puch++ = 0x80 | (uc & 0x3f);
947 }
948 else
949 {
950 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
951 *puch++ = 0x7f;
952 }
953
954 return (char *)puch;
955}
956
957
958RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
959{
960 if (pszStart < psz)
961 {
962 /* simple char? */
963 const unsigned char *puch = (const unsigned char *)psz;
964 unsigned uch = *--puch;
965 if (!(uch & RT_BIT(7)))
966 return (char *)puch;
967 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
968
969 /* two or more. */
970 uint32_t uMask = 0xffffffc0;
971 while ( (const unsigned char *)pszStart < puch
972 && !(uMask & 1))
973 {
974 unsigned uch = *--puch;
975 if ((uch & 0xc0) != 0x80)
976 {
977 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
978 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
979 (char *)pszStart);
980 return (char *)puch;
981 }
982 uMask >>= 1;
983 }
984 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
985 }
986 return (char *)pszStart;
987}
988
989
990/**
991 * Performs a case sensitive string compare between two UTF-8 strings.
992 *
993 * Encoding errors are ignored by the current implementation. So, the only
994 * difference between this and the CRT strcmp function is the handling of
995 * NULL arguments.
996 *
997 * @returns < 0 if the first string less than the second string.
998 * @returns 0 if the first string identical to the second string.
999 * @returns > 0 if the first string greater than the second string.
1000 * @param psz1 First UTF-8 string. Null is allowed.
1001 * @param psz2 Second UTF-8 string. Null is allowed.
1002 */
1003RTDECL(int) RTStrCmp(const char *psz1, const char *psz2)
1004{
1005 if (psz1 == psz2)
1006 return 0;
1007 if (!psz1)
1008 return -1;
1009 if (!psz2)
1010 return 1;
1011
1012 return strcmp(psz1, psz2);
1013}
1014
1015
1016/**
1017 * Performs a case insensitive string compare between two UTF-8 strings.
1018 *
1019 * This is a simplified compare, as only the simplified lower/upper case folding
1020 * specified by the unicode specs are used. It does not consider character pairs
1021 * as they are used in some languages, just simple upper & lower case compares.
1022 *
1023 * The result is the difference between the mismatching codepoints after they
1024 * both have been lower cased.
1025 *
1026 * If the string encoding is invalid the function will assert (strict builds)
1027 * and use RTStrCmp for the remainder of the string.
1028 *
1029 * @returns < 0 if the first string less than the second string.
1030 * @returns 0 if the first string identical to the second string.
1031 * @returns > 0 if the first string greater than the second string.
1032 * @param psz1 First UTF-8 string. Null is allowed.
1033 * @param psz2 Second UTF-8 string. Null is allowed.
1034 */
1035RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
1036{
1037 if (psz1 == psz2)
1038 return 0;
1039 if (!psz1)
1040 return -1;
1041 if (!psz2)
1042 return 1;
1043
1044#if 1 /* new */
1045 const char *pszStart1 = psz1;
1046 for (;;)
1047 {
1048 /* Get the codepoints */
1049 RTUNICP cp1;
1050 int rc = RTStrGetCpEx(&psz1, &cp1);
1051 if (RT_FAILURE(rc))
1052 {
1053 AssertRC(rc);
1054 psz1--;
1055 break;
1056 }
1057
1058 RTUNICP cp2;
1059 rc = RTStrGetCpEx(&psz2, &cp2);
1060 if (RT_FAILURE(rc))
1061 {
1062 AssertRC(rc);
1063 psz2--;
1064 psz1 = RTStrPrevCp(pszStart1, psz1);
1065 break;
1066 }
1067
1068 /* compare */
1069 int iDiff = cp1 - cp2;
1070 if (iDiff)
1071 {
1072 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1073 if (iDiff)
1074 {
1075 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1076 if (iDiff)
1077 return iDiff;
1078 }
1079 }
1080
1081 /* hit the terminator? */
1082 if (!cp1)
1083 return 0;
1084 }
1085
1086 /* Hit some bad encoding, continue in case insensitive mode. */
1087 return RTStrCmp(psz1, psz2);
1088#else /* old */
1089#ifdef RT_OS_WINDOWS
1090 return stricmp(psz1, psz2);
1091#else /* !RT_OS_WINDOWS */
1092 return strcasecmp(psz1, psz2);
1093#endif /* !RT_OS_WINDOWS */
1094#endif
1095}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette