VirtualBox

source: vbox/trunk/src/VBox/Runtime/utf-8.cpp@ 5042

Last change on this file since 5042 was 4071, checked in by vboxsync, 17 years ago

Biggest check-in ever. New source code headers for all (C) innotek files.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 35.3 KB
Line 
1/* $Id: utf-8.cpp 4071 2007-08-07 17:07:59Z vboxsync $ */
2/** @file
3 * innotek Portable Runtime - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2007 innotek GmbH
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License as published by the Free Software Foundation,
13 * in version 2 as it comes in the "COPYING" file of the VirtualBox OSE
14 * distribution. VirtualBox OSE is distributed in the hope that it will
15 * be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/*******************************************************************************
20* Header Files *
21*******************************************************************************/
22#include <iprt/string.h>
23#include <iprt/uni.h>
24#include <iprt/alloc.h>
25#include <iprt/assert.h>
26#include <iprt/err.h>
27#include "internal/string.h"
28
29
30
31/**
32 * Get get length in code points of a UTF-8 encoded string.
33 * The string is validated while doing this.
34 *
35 * @returns IPRT status code.
36 * @param psz Pointer to the UTF-8 string.
37 * @param cch The max length of the string. (btw cch = cb)
38 * Use RTSTR_MAX if all of the string is to be examined.s
39 * @param pcuc Where to store the length in unicode code points.
40 */
41static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc)
42{
43 const unsigned char *puch = (const unsigned char *)psz;
44 size_t cCodePoints = 0;
45 while (cch > 0)
46 {
47 const unsigned char uch = *puch;
48 if (!uch)
49 break;
50 if (uch & BIT(7))
51 {
52 /* figure sequence length and validate the first byte */
53 unsigned cb;
54 if ((uch & (BIT(7) | BIT(6) | BIT(5))) == (BIT(7) | BIT(6)))
55 cb = 2;
56 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4))) == (BIT(7) | BIT(6) | BIT(5)))
57 cb = 3;
58 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4)))
59 cb = 4;
60 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3)))
61 cb = 5;
62 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2) | BIT(1))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2)))
63 cb = 6;
64 else
65 {
66 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
67 return VERR_INVALID_UTF8_ENCODING;
68 }
69
70 /* check length */
71 if (cb > cch)
72 {
73 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
74 return VERR_INVALID_UTF8_ENCODING;
75 }
76
77 /* validate the rest */
78 switch (cb)
79 {
80 case 6:
81 RTStrAssertMsgReturn((puch[5] & (BIT(7) | BIT(6))) == BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
82 case 5:
83 RTStrAssertMsgReturn((puch[4] & (BIT(7) | BIT(6))) == BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
84 case 4:
85 RTStrAssertMsgReturn((puch[3] & (BIT(7) | BIT(6))) == BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
86 case 3:
87 RTStrAssertMsgReturn((puch[2] & (BIT(7) | BIT(6))) == BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
88 case 2:
89 RTStrAssertMsgReturn((puch[1] & (BIT(7) | BIT(6))) == BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
90 break;
91 }
92
93 /* validate the code point. */
94 RTUNICP uc;
95 switch (cb)
96 {
97 case 6:
98 uc = (puch[5] & 0x3f)
99 | ((RTUNICP)(puch[4] & 0x3f) << 6)
100 | ((RTUNICP)(puch[3] & 0x3f) << 12)
101 | ((RTUNICP)(puch[2] & 0x3f) << 18)
102 | ((RTUNICP)(puch[1] & 0x3f) << 24)
103 | ((RTUNICP)(uch & 0x01) << 30);
104 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
105 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
106 break;
107 case 5:
108 uc = (puch[4] & 0x3f)
109 | ((RTUNICP)(puch[3] & 0x3f) << 6)
110 | ((RTUNICP)(puch[2] & 0x3f) << 12)
111 | ((RTUNICP)(puch[1] & 0x3f) << 18)
112 | ((RTUNICP)(uch & 0x03) << 24);
113 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
114 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
115 break;
116 case 4:
117 uc = (puch[3] & 0x3f)
118 | ((RTUNICP)(puch[2] & 0x3f) << 6)
119 | ((RTUNICP)(puch[1] & 0x3f) << 12)
120 | ((RTUNICP)(uch & 0x07) << 18);
121 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
122 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
123 break;
124 case 3:
125 uc = (puch[2] & 0x3f)
126 | ((RTUNICP)(puch[1] & 0x3f) << 6)
127 | ((RTUNICP)(uch & 0x0f) << 12);
128 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
129 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
130 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
131 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
132 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
133 break;
134 case 2:
135 uc = (puch[1] & 0x3f)
136 | ((RTUNICP)(uch & 0x1f) << 6);
137 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
138 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
139 break;
140 }
141
142 /* advance */
143 cch -= cb;
144 puch += cb;
145 }
146 else
147 {
148 /* one ASCII byte */
149 puch++;
150 cch--;
151 }
152 cCodePoints++;
153 }
154
155 /* done */
156 *pcuc = cCodePoints;
157 return VINF_SUCCESS;
158}
159
160
161/**
162 * Decodes and UTF-8 string into an array of unicode code point.
163 *
164 * Since we know the input is valid, we do *not* perform encoding or length checks.
165 *
166 * @returns iprt status code.
167 * @param psz The UTF-8 string to recode. This is a valid encoding.
168 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
169 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
170 * @param paCps Where to store the code points array.
171 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
172 * @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
173 */
174static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t *pcCps)
175{
176 int rc = VINF_SUCCESS;
177 const unsigned char *puch = (const unsigned char *)psz;
178 const PRTUNICP pCpEnd = paCps + cCps;
179 PRTUNICP pCp = paCps;
180 Assert(pCpEnd >= pCp);
181 while (cch > 0)
182 {
183 /* read the next char and check for terminator. */
184 const unsigned char uch = *puch;
185 if (!uch)
186 break;
187
188 /* check for output overflow */
189 if (pCp >= pCpEnd)
190 {
191 rc = VERR_BUFFER_OVERFLOW;
192 break;
193 }
194
195 /* decode and recode the code point */
196 if (!(uch & BIT(7)))
197 {
198 *pCp++ = uch;
199 puch++;
200 cch--;
201 }
202#ifdef RT_STRICT
203 else if (!(uch & BIT(6)))
204 AssertMsgFailed(("Internal error!\n"));
205#endif
206 else if (!(uch & BIT(5)))
207 {
208 *pCp++ = (puch[1] & 0x3f)
209 | ((uint16_t)(uch & 0x1f) << 6);
210 puch += 2;
211 cch -= 2;
212 }
213 else if (!(uch & BIT(4)))
214 {
215 *pCp++ = (puch[2] & 0x3f)
216 | ((uint16_t)(puch[1] & 0x3f) << 6)
217 | ((uint16_t)(uch & 0x0f) << 12);
218 puch += 3;
219 cch -= 3;
220 }
221 else if (!(uch & BIT(3)))
222 {
223 *pCp++ = (puch[3] & 0x3f)
224 | ((RTUNICP)(puch[2] & 0x3f) << 6)
225 | ((RTUNICP)(puch[1] & 0x3f) << 12)
226 | ((RTUNICP)(uch & 0x07) << 18);
227 puch += 4;
228 cch -= 4;
229 }
230 else if (!(uch & BIT(2)))
231 {
232 *pCp++ = (puch[4] & 0x3f)
233 | ((RTUNICP)(puch[3] & 0x3f) << 6)
234 | ((RTUNICP)(puch[2] & 0x3f) << 12)
235 | ((RTUNICP)(puch[1] & 0x3f) << 18)
236 | ((RTUNICP)(uch & 0x03) << 24);
237 puch += 5;
238 cch -= 6;
239 }
240 else
241 {
242 Assert(!(uch & BIT(1)));
243 *pCp++ = (puch[5] & 0x3f)
244 | ((RTUNICP)(puch[4] & 0x3f) << 6)
245 | ((RTUNICP)(puch[3] & 0x3f) << 12)
246 | ((RTUNICP)(puch[2] & 0x3f) << 18)
247 | ((RTUNICP)(puch[1] & 0x3f) << 24)
248 | ((RTUNICP)(uch & 0x01) << 30);
249 puch += 6;
250 cch -= 6;
251 }
252 }
253
254 /* done */
255 *pCp = 0;
256 *pcCps = pCp - paCps;
257 return rc;
258}
259
260
261RTDECL(size_t) RTStrUniLen(const char *psz)
262{
263 size_t cCodePoints;
264 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints);
265 return RT_SUCCESS(rc) ? cCodePoints : 0;
266}
267
268
269RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
270{
271 size_t cCodePoints;
272 int rc = rtUtf8Length(psz, cch, &cCodePoints);
273 if (pcCps)
274 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
275 return rc;
276}
277
278
279RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
280{
281 /*
282 * Validate input.
283 */
284 Assert(VALID_PTR(pszString));
285 Assert(VALID_PTR(ppaCps));
286 *ppaCps = NULL;
287
288 /*
289 * Validate the UTF-8 input and count its code points.
290 */
291 size_t cCps;
292 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps);
293 if (RT_SUCCESS(rc))
294 {
295 /*
296 * Allocate buffer.
297 */
298 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
299 if (paCps)
300 {
301 /*
302 * Decode the string.
303 */
304 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
305 if (RT_SUCCESS(rc))
306 {
307 *ppaCps = paCps;
308 return rc;
309 }
310 RTMemFree(paCps);
311 }
312 else
313 rc = VERR_NO_CODE_POINT_MEMORY;
314 }
315 return rc;
316}
317
318
319RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
320{
321 /*
322 * Validate input.
323 */
324 Assert(VALID_PTR(pszString));
325 Assert(VALID_PTR(ppaCps));
326 Assert(!pcCps || VALID_PTR(pcCps));
327
328 /*
329 * Validate the UTF-8 input and count the code points.
330 */
331 size_t cCpsResult;
332 int rc = rtUtf8Length(pszString, cchString, &cCpsResult);
333 if (RT_SUCCESS(rc))
334 {
335 if (pcCps)
336 *pcCps = cCpsResult;
337
338 /*
339 * Check buffer size / Allocate buffer.
340 */
341 bool fShouldFree;
342 PRTUNICP paCpsResult;
343 if (cCps > 0 && *ppaCps)
344 {
345 fShouldFree = false;
346 if (cCps <= cCpsResult)
347 return VERR_BUFFER_OVERFLOW;
348 paCpsResult = *ppaCps;
349 }
350 else
351 {
352 *ppaCps = NULL;
353 fShouldFree = true;
354 cCps = RT_MAX(cCpsResult + 1, cCps);
355 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
356 }
357 if (paCpsResult)
358 {
359 /*
360 * Encode the UTF-16 string.
361 */
362 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
363 if (RT_SUCCESS(rc))
364 {
365 *ppaCps = paCpsResult;
366 return rc;
367 }
368 if (fShouldFree)
369 RTMemFree(paCpsResult);
370 }
371 else
372 rc = VERR_NO_CODE_POINT_MEMORY;
373 }
374 return rc;
375}
376
377
378/**
379 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
380 *
381 * @returns IPRT status code.
382 * @param psz Pointer to the UTF-8 string.
383 * @param cch The max length of the string. (btw cch = cb)
384 * Use RTSTR_MAX if all of the string is to be examined.s
385 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
386 */
387static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
388{
389 const unsigned char *puch = (const unsigned char *)psz;
390 size_t cwc = 0;
391 while (cch > 0)
392 {
393 const unsigned char uch = *puch;
394 if (!uch)
395 break;
396 if (!(uch & BIT(7)))
397 {
398 /* one ASCII byte */
399 cwc++;
400 puch++;
401 cch--;
402 }
403 else
404 {
405 /* figure sequence length and validate the first byte */
406 unsigned cb;
407 if ((uch & (BIT(7) | BIT(6) | BIT(5))) == (BIT(7) | BIT(6)))
408 cb = 2;
409 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4))) == (BIT(7) | BIT(6) | BIT(5)))
410 cb = 3;
411 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4)))
412 cb = 4;
413 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3)))
414 cb = 5;
415 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2) | BIT(1))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3) | BIT(2)))
416 cb = 6;
417 else
418 {
419 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
420 return VERR_INVALID_UTF8_ENCODING;
421 }
422
423 /* check length */
424 if (cb > cch)
425 {
426 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
427 return VERR_INVALID_UTF8_ENCODING;
428 }
429
430 /* validate the rest */
431 switch (cb)
432 {
433 case 6:
434 RTStrAssertMsgReturn((puch[5] & (BIT(7) | BIT(6))) == BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
435 case 5:
436 RTStrAssertMsgReturn((puch[4] & (BIT(7) | BIT(6))) == BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
437 case 4:
438 RTStrAssertMsgReturn((puch[3] & (BIT(7) | BIT(6))) == BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
439 case 3:
440 RTStrAssertMsgReturn((puch[2] & (BIT(7) | BIT(6))) == BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
441 case 2:
442 RTStrAssertMsgReturn((puch[1] & (BIT(7) | BIT(6))) == BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
443 break;
444 }
445
446 /* validate the code point. */
447 RTUNICP uc;
448 switch (cb)
449 {
450 case 6:
451 uc = (puch[5] & 0x3f)
452 | ((RTUNICP)(puch[4] & 0x3f) << 6)
453 | ((RTUNICP)(puch[3] & 0x3f) << 12)
454 | ((RTUNICP)(puch[2] & 0x3f) << 18)
455 | ((RTUNICP)(puch[1] & 0x3f) << 24)
456 | ((RTUNICP)(uch & 0x01) << 30);
457 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
458 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
459 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
460 return VERR_CANT_RECODE_AS_UTF16;
461 case 5:
462 uc = (puch[4] & 0x3f)
463 | ((RTUNICP)(puch[3] & 0x3f) << 6)
464 | ((RTUNICP)(puch[2] & 0x3f) << 12)
465 | ((RTUNICP)(puch[1] & 0x3f) << 18)
466 | ((RTUNICP)(uch & 0x03) << 24);
467 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
468 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
469 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
470 return VERR_CANT_RECODE_AS_UTF16;
471 case 4:
472 uc = (puch[3] & 0x3f)
473 | ((RTUNICP)(puch[2] & 0x3f) << 6)
474 | ((RTUNICP)(puch[1] & 0x3f) << 12)
475 | ((RTUNICP)(uch & 0x07) << 18);
476 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
477 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
478 RTStrAssertMsgReturn(uc <= 0x0010ffff,
479 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
480 cwc++;
481 break;
482 case 3:
483 uc = (puch[2] & 0x3f)
484 | ((RTUNICP)(puch[1] & 0x3f) << 6)
485 | ((RTUNICP)(uch & 0x0f) << 12);
486 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
487 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
488 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
489 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
490 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
491 break;
492 case 2:
493 uc = (puch[1] & 0x3f)
494 | ((RTUNICP)(uch & 0x1f) << 6);
495 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
496 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
497 break;
498 }
499
500 /* advance */
501 cch -= cb;
502 puch += cb;
503 cwc++;
504 }
505 }
506
507 /* done */
508 *pcwc = cwc;
509 return VINF_SUCCESS;
510}
511
512
513/**
514 * Recodes a valid UTF-8 string as UTF-16.
515 *
516 * Since we know the input is valid, we do *not* perform encoding or length checks.
517 *
518 * @returns iprt status code.
519 * @param psz The UTF-8 string to recode. This is a valid encoding.
520 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
521 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
522 * @param pwsz Where to store the UTF-16 string.
523 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
524 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
525 */
526static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc)
527{
528 int rc = VINF_SUCCESS;
529 const unsigned char *puch = (const unsigned char *)psz;
530 const PRTUTF16 pwszEnd = pwsz + cwc;
531 PRTUTF16 pwc = pwsz;
532 Assert(pwszEnd >= pwc);
533 while (cch > 0)
534 {
535 /* read the next char and check for terminator. */
536 const unsigned char uch = *puch;
537 if (!uch)
538 break;
539
540 /* check for output overflow */
541 if (pwc >= pwszEnd)
542 {
543 rc = VERR_BUFFER_OVERFLOW;
544 break;
545 }
546
547 /* decode and recode the code point */
548 if (!(uch & BIT(7)))
549 {
550 *pwc++ = uch;
551 puch++;
552 cch--;
553 }
554 else if ((uch & (BIT(7) | BIT(6) | BIT(5))) == (BIT(7) | BIT(6)))
555 {
556 uint16_t uc = (puch[1] & 0x3f)
557 | ((uint16_t)(uch & 0x1f) << 6);
558 *pwc++ = uc;
559 puch += 2;
560 cch -= 2;
561 }
562 else if ((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4))) == (BIT(7) | BIT(6) | BIT(5)))
563 {
564 uint16_t uc = (puch[2] & 0x3f)
565 | ((uint16_t)(puch[1] & 0x3f) << 6)
566 | ((uint16_t)(uch & 0x0f) << 12);
567 *pwc++ = uc;
568 puch += 3;
569 cch -= 3;
570 }
571 else
572 {
573 /* generate surrugate pair */
574 Assert((uch & (BIT(7) | BIT(6) | BIT(5) | BIT(4) | BIT(3))) == (BIT(7) | BIT(6) | BIT(5) | BIT(4)));
575 RTUNICP uc = (puch[3] & 0x3f)
576 | ((RTUNICP)(puch[2] & 0x3f) << 6)
577 | ((RTUNICP)(puch[1] & 0x3f) << 12)
578 | ((RTUNICP)(uch & 0x07) << 18);
579 if (pwc + 1 >= pwszEnd)
580 {
581 rc = VERR_BUFFER_OVERFLOW;
582 break;
583 }
584 uc -= 0x10000;
585 *pwc++ = 0xd800 | (uc >> 10);
586 *pwc++ = 0xdc00 | (uc & 0x3ff);
587 puch += 4;
588 cch -= 4;
589 }
590 }
591
592 /* done */
593 *pwc = '\0';
594 *pcwc = pwc - pwsz;
595 return rc;
596}
597
598
599RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
600{
601 /*
602 * Validate input.
603 */
604 Assert(VALID_PTR(ppwszString));
605 Assert(VALID_PTR(pszString));
606 *ppwszString = NULL;
607
608 /*
609 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
610 */
611 size_t cwc;
612 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
613 if (RT_SUCCESS(rc))
614 {
615 /*
616 * Allocate buffer.
617 */
618 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
619 if (pwsz)
620 {
621 /*
622 * Encode the UTF-16 string.
623 */
624 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
625 if (RT_SUCCESS(rc))
626 {
627 *ppwszString = pwsz;
628 return rc;
629 }
630 RTMemFree(pwsz);
631 }
632 else
633 rc = VERR_NO_UTF16_MEMORY;
634 }
635 return rc;
636}
637
638
639RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
640{
641 /*
642 * Validate input.
643 */
644 Assert(VALID_PTR(pszString));
645 Assert(VALID_PTR(ppwsz));
646 Assert(!pcwc || VALID_PTR(pcwc));
647
648 /*
649 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
650 */
651 size_t cwcResult;
652 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
653 if (RT_SUCCESS(rc))
654 {
655 if (pcwc)
656 *pcwc = cwcResult;
657
658 /*
659 * Check buffer size / Allocate buffer.
660 */
661 bool fShouldFree;
662 PRTUTF16 pwszResult;
663 if (cwc > 0 && *ppwsz)
664 {
665 fShouldFree = false;
666 if (cwc <= cwcResult)
667 return VERR_BUFFER_OVERFLOW;
668 pwszResult = *ppwsz;
669 }
670 else
671 {
672 *ppwsz = NULL;
673 fShouldFree = true;
674 cwc = RT_MAX(cwcResult + 1, cwc);
675 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
676 }
677 if (pwszResult)
678 {
679 /*
680 * Encode the UTF-16 string.
681 */
682 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
683 if (RT_SUCCESS(rc))
684 {
685 *ppwsz = pwszResult;
686 return rc;
687 }
688 if (fShouldFree)
689 RTMemFree(pwszResult);
690 }
691 else
692 rc = VERR_NO_UTF16_MEMORY;
693 }
694 return rc;
695}
696
697
698RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
699{
700 size_t cwc;
701 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
702 return RT_SUCCESS(rc) ? cwc : 0;
703}
704
705
706RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
707{
708 size_t cwc;
709 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
710 if (pcwc)
711 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
712 return rc;
713}
714
715
716/**
717 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
718 * @returns rc
719 * @param ppsz The pointer to the the string position point.
720 * @param pCp Where to store RTUNICP_INVALID.
721 * @param rc The iprt error code.
722 */
723static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
724{
725 /*
726 * Try find a valid encoding.
727 */
728 (*ppsz)++; /** @todo code this! */
729 *pCp = RTUNICP_INVALID;
730 return rc;
731}
732
733
734RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
735{
736 RTUNICP Cp;
737 RTStrGetCpExInternal(&psz, &Cp);
738 return Cp;
739}
740
741
742RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
743{
744 const unsigned char *puch = (const unsigned char *)*ppsz;
745 const unsigned char uch = *puch;
746 RTUNICP uc;
747
748 /* ASCII ? */
749 if (!(uch & BIT(7)))
750 {
751 uc = uch;
752 puch++;
753 }
754 else if (uch & BIT(6))
755 {
756 /* figure the length and validate the first octet. */
757 unsigned cb;
758 if (!(uch & BIT(5)))
759 cb = 2;
760 else if (!(uch & BIT(4)))
761 cb = 3;
762 else if (!(uch & BIT(3)))
763 cb = 4;
764 else if (!(uch & BIT(2)))
765 cb = 5;
766 else if (!(uch & BIT(1)))
767 cb = 6;
768 else
769 {
770 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
771 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
772 }
773
774 /* validate the rest */
775 switch (cb)
776 {
777 case 6:
778 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
779 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
780 case 5:
781 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
782 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
783 case 4:
784 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
785 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
786 case 3:
787 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
788 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
789 case 2:
790 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
791 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
792 break;
793 }
794
795 /* get and validate the code point. */
796 switch (cb)
797 {
798 case 6:
799 uc = (puch[5] & 0x3f)
800 | ((RTUNICP)(puch[4] & 0x3f) << 6)
801 | ((RTUNICP)(puch[3] & 0x3f) << 12)
802 | ((RTUNICP)(puch[2] & 0x3f) << 18)
803 | ((RTUNICP)(puch[1] & 0x3f) << 24)
804 | ((RTUNICP)(uch & 0x01) << 30);
805 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
806 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
807 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
808 break;
809 case 5:
810 uc = (puch[4] & 0x3f)
811 | ((RTUNICP)(puch[3] & 0x3f) << 6)
812 | ((RTUNICP)(puch[2] & 0x3f) << 12)
813 | ((RTUNICP)(puch[1] & 0x3f) << 18)
814 | ((RTUNICP)(uch & 0x03) << 24);
815 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
816 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
817 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
818 break;
819 case 4:
820 uc = (puch[3] & 0x3f)
821 | ((RTUNICP)(puch[2] & 0x3f) << 6)
822 | ((RTUNICP)(puch[1] & 0x3f) << 12)
823 | ((RTUNICP)(uch & 0x07) << 18);
824 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
825 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
826 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
827 break;
828 case 3:
829 uc = (puch[2] & 0x3f)
830 | ((RTUNICP)(puch[1] & 0x3f) << 6)
831 | ((RTUNICP)(uch & 0x0f) << 12);
832 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
833 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
834 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
835 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
836 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
837 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
838 break;
839 case 2:
840 uc = (puch[1] & 0x3f)
841 | ((RTUNICP)(uch & 0x1f) << 6);
842 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
843 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
844 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
845 break;
846 default: /* impossible, but GCC is bitching. */
847 uc = RTUNICP_INVALID;
848 break;
849 }
850 puch += cb;
851 }
852 else
853 {
854 /* 6th bit is always set. */
855 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
856 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
857 }
858 *pCp = uc;
859 *ppsz = (const char *)puch;
860 return VINF_SUCCESS;
861}
862
863
864RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
865{
866 unsigned char *puch = (unsigned char *)psz;
867 if (uc < 0x80)
868 *puch++ = (unsigned char )uc;
869 else if (uc < 0x00000800)
870 {
871 *puch++ = 0xc0 | (uc >> 6);
872 *puch++ = 0x80 | (uc & 0x3f);
873 }
874 else if (uc < 0x00010000)
875 {
876 if ( uc < 0x0000d8000
877 || ( uc > 0x0000dfff
878 && uc < 0x0000fffe))
879 {
880 *puch++ = 0xe0 | (uc >> 12);
881 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
882 *puch++ = 0x80 | (uc & 0x3f);
883 }
884 else
885 {
886 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
887 *puch++ = 0x7f;
888 }
889 }
890 else if (uc < 0x00200000)
891 {
892 *puch++ = 0xf0 | (uc >> 18);
893 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
894 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
895 *puch++ = 0x80 | (uc & 0x3f);
896 }
897 else if (uc < 0x04000000)
898 {
899 *puch++ = 0xf1 | (uc >> 24);
900 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
901 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
902 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
903 *puch++ = 0x80 | (uc & 0x3f);
904 }
905 else if (uc <= 0x7fffffff)
906 {
907 *puch++ = 0xf3 | (uc >> 30);
908 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
909 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
910 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
911 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
912 *puch++ = 0x80 | (uc & 0x3f);
913 }
914 else
915 {
916 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
917 *puch++ = 0x7f;
918 }
919
920 return (char *)puch;
921}
922
923
924RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
925{
926 if (pszStart < psz)
927 {
928 /* simple char? */
929 const unsigned char *puch = (const unsigned char *)psz;
930 unsigned uch = *--puch;
931 if (!(uch & BIT(7)))
932 return (char *)puch;
933 RTStrAssertMsgReturn(!(uch & BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
934
935 /* two or more. */
936 uint32_t uMask = 0xffffffc0;
937 while ( (const unsigned char *)pszStart < puch
938 && !(uMask & 1))
939 {
940 unsigned uch = *--puch;
941 if ((uch & 0xc0) != 0x80)
942 {
943 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
944 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
945 (char *)pszStart);
946 return (char *)puch;
947 }
948 uMask >>= 1;
949 }
950 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
951 }
952 return (char *)pszStart;
953}
954
955
956/**
957 * Performs a case insensitive string compare between two UTF-8 strings.
958 *
959 * This is a simplified compare, as only the simplified lower/upper case folding
960 * specified by the unicode specs are used. It does not consider character pairs
961 * as they are used in some languages, just simple upper & lower case compares.
962 *
963 * @returns < 0 if the first string less than the second string.
964 * @returns 0 if the first string identical to the second string.
965 * @returns > 0 if the first string greater than the second string.
966 * @param psz1 First UTF-8 string.
967 * @param psz2 Second UTF-8 string.
968 */
969RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
970{
971 /** @todo implement proper UTF-8 case-insensitive string comparison. */
972#ifdef RT_OS_WINDOWS
973 return stricmp(psz1, psz2);
974#else /* !RT_OS_WINDOWS */
975 return strcasecmp(psz1, psz2);
976#endif /* !RT_OS_WINDOWS */
977}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette