VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 23594

Last change on this file since 23594 was 21791, checked in by vboxsync, 16 years ago

RTStrPutCpInternal: Fixed an irrelevant bug.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 53.4 KB
Line 
1/* $Id: utf-8.cpp 21791 2009-07-25 17:10:57Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31
32/*******************************************************************************
33* Header Files *
34*******************************************************************************/
35#include <iprt/string.h>
36#include "internal/iprt.h"
37
38#include <iprt/uni.h>
39#include <iprt/alloc.h>
40#include <iprt/assert.h>
41#include <iprt/err.h>
42#include "internal/string.h"
43
44
45
46/**
47 * Get get length in code points of a UTF-8 encoded string.
48 * The string is validated while doing this.
49 *
50 * @returns IPRT status code.
51 * @param psz Pointer to the UTF-8 string.
52 * @param cch The max length of the string. (btw cch = cb)
53 * Use RTSTR_MAX if all of the string is to be examined.
54 * @param pcuc Where to store the length in unicode code points.
55 * @param pcchActual Where to store the actual size of the UTF-8 string
56 * on success (cch = cb again). Optional.
57 */
58static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
59{
60 const unsigned char *puch = (const unsigned char *)psz;
61 size_t cCodePoints = 0;
62 while (cch > 0)
63 {
64 const unsigned char uch = *puch;
65 if (!uch)
66 break;
67 if (uch & RT_BIT(7))
68 {
69 /* figure sequence length and validate the first byte */
70 unsigned cb;
71 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
72 cb = 2;
73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
74 cb = 3;
75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
76 cb = 4;
77 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
78 cb = 5;
79 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
80 cb = 6;
81 else
82 {
83 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
84 return VERR_INVALID_UTF8_ENCODING;
85 }
86
87 /* check length */
88 if (cb > cch)
89 {
90 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
91 return VERR_INVALID_UTF8_ENCODING;
92 }
93
94 /* validate the rest */
95 switch (cb)
96 {
97 case 6:
98 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99 case 5:
100 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101 case 4:
102 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103 case 3:
104 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
105 case 2:
106 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
107 break;
108 }
109
110 /* validate the code point. */
111 RTUNICP uc;
112 switch (cb)
113 {
114 case 6:
115 uc = (puch[5] & 0x3f)
116 | ((RTUNICP)(puch[4] & 0x3f) << 6)
117 | ((RTUNICP)(puch[3] & 0x3f) << 12)
118 | ((RTUNICP)(puch[2] & 0x3f) << 18)
119 | ((RTUNICP)(puch[1] & 0x3f) << 24)
120 | ((RTUNICP)(uch & 0x01) << 30);
121 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
122 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
123 break;
124 case 5:
125 uc = (puch[4] & 0x3f)
126 | ((RTUNICP)(puch[3] & 0x3f) << 6)
127 | ((RTUNICP)(puch[2] & 0x3f) << 12)
128 | ((RTUNICP)(puch[1] & 0x3f) << 18)
129 | ((RTUNICP)(uch & 0x03) << 24);
130 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
131 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
132 break;
133 case 4:
134 uc = (puch[3] & 0x3f)
135 | ((RTUNICP)(puch[2] & 0x3f) << 6)
136 | ((RTUNICP)(puch[1] & 0x3f) << 12)
137 | ((RTUNICP)(uch & 0x07) << 18);
138 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
139 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
140 break;
141 case 3:
142 uc = (puch[2] & 0x3f)
143 | ((RTUNICP)(puch[1] & 0x3f) << 6)
144 | ((RTUNICP)(uch & 0x0f) << 12);
145 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
146 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
147 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
148 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
149 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
150 break;
151 case 2:
152 uc = (puch[1] & 0x3f)
153 | ((RTUNICP)(uch & 0x1f) << 6);
154 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
155 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
156 break;
157 }
158
159 /* advance */
160 cch -= cb;
161 puch += cb;
162 }
163 else
164 {
165 /* one ASCII byte */
166 puch++;
167 cch--;
168 }
169 cCodePoints++;
170 }
171
172 /* done */
173 *pcuc = cCodePoints;
174 if (pcchActual)
175 *pcchActual = puch - (unsigned char const *)psz;
176 return VINF_SUCCESS;
177}
178
179
180/**
181 * Decodes and UTF-8 string into an array of unicode code point.
182 *
183 * Since we know the input is valid, we do *not* perform encoding or length checks.
184 *
185 * @returns iprt status code.
186 * @param psz The UTF-8 string to recode. This is a valid encoding.
187 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
188 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
189 * @param paCps Where to store the code points array.
190 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
191 */
192static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
193{
194 int rc = VINF_SUCCESS;
195 const unsigned char *puch = (const unsigned char *)psz;
196 PRTUNICP pCp = paCps;
197 while (cch > 0)
198 {
199 /* read the next char and check for terminator. */
200 const unsigned char uch = *puch;
201 if (!uch)
202 break;
203
204 /* check for output overflow */
205 if (RT_UNLIKELY(cCps < 1))
206 {
207 rc = VERR_BUFFER_OVERFLOW;
208 break;
209 }
210 cCps--;
211
212 /* decode and recode the code point */
213 if (!(uch & RT_BIT(7)))
214 {
215 *pCp++ = uch;
216 puch++;
217 cch--;
218 }
219#ifdef RT_STRICT
220 else if (!(uch & RT_BIT(6)))
221 AssertMsgFailed(("Internal error!\n"));
222#endif
223 else if (!(uch & RT_BIT(5)))
224 {
225 *pCp++ = (puch[1] & 0x3f)
226 | ((uint16_t)(uch & 0x1f) << 6);
227 puch += 2;
228 cch -= 2;
229 }
230 else if (!(uch & RT_BIT(4)))
231 {
232 *pCp++ = (puch[2] & 0x3f)
233 | ((uint16_t)(puch[1] & 0x3f) << 6)
234 | ((uint16_t)(uch & 0x0f) << 12);
235 puch += 3;
236 cch -= 3;
237 }
238 else if (!(uch & RT_BIT(3)))
239 {
240 *pCp++ = (puch[3] & 0x3f)
241 | ((RTUNICP)(puch[2] & 0x3f) << 6)
242 | ((RTUNICP)(puch[1] & 0x3f) << 12)
243 | ((RTUNICP)(uch & 0x07) << 18);
244 puch += 4;
245 cch -= 4;
246 }
247 else if (!(uch & RT_BIT(2)))
248 {
249 *pCp++ = (puch[4] & 0x3f)
250 | ((RTUNICP)(puch[3] & 0x3f) << 6)
251 | ((RTUNICP)(puch[2] & 0x3f) << 12)
252 | ((RTUNICP)(puch[1] & 0x3f) << 18)
253 | ((RTUNICP)(uch & 0x03) << 24);
254 puch += 5;
255 cch -= 6;
256 }
257 else
258 {
259 Assert(!(uch & RT_BIT(1)));
260 *pCp++ = (puch[5] & 0x3f)
261 | ((RTUNICP)(puch[4] & 0x3f) << 6)
262 | ((RTUNICP)(puch[3] & 0x3f) << 12)
263 | ((RTUNICP)(puch[2] & 0x3f) << 18)
264 | ((RTUNICP)(puch[1] & 0x3f) << 24)
265 | ((RTUNICP)(uch & 0x01) << 30);
266 puch += 6;
267 cch -= 6;
268 }
269 }
270
271 /* done */
272 *pCp = 0;
273 return rc;
274}
275
276
277RTDECL(size_t) RTStrUniLen(const char *psz)
278{
279 size_t cCodePoints;
280 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
281 return RT_SUCCESS(rc) ? cCodePoints : 0;
282}
283RT_EXPORT_SYMBOL(RTStrUniLen);
284
285
286RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
287{
288 size_t cCodePoints;
289 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
290 if (pcCps)
291 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
292 return rc;
293}
294RT_EXPORT_SYMBOL(RTStrUniLenEx);
295
296
297RTDECL(int) RTStrValidateEncoding(const char *psz)
298{
299 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
300}
301RT_EXPORT_SYMBOL(RTStrValidateEncoding);
302
303
304RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
305{
306 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
307 AssertPtr(psz);
308
309 /*
310 * Use rtUtf8Length for the job.
311 */
312 size_t cchActual;
313 size_t cCpsIgnored;
314 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
315 if (RT_SUCCESS(rc))
316 {
317 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
318 && cchActual >= cch)
319 rc = VERR_BUFFER_OVERFLOW;
320 }
321 return rc;
322
323
324 return RTStrUniLenEx(psz, cch, &cCpsIgnored);
325}
326RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
327
328
329RTDECL(bool) RTStrIsValidEncoding(const char *psz)
330{
331 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
332 return RT_SUCCESS(rc);
333}
334RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
335
336
337RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
338{
339 /*
340 * Validate input.
341 */
342 Assert(VALID_PTR(pszString));
343 Assert(VALID_PTR(ppaCps));
344 *ppaCps = NULL;
345
346 /*
347 * Validate the UTF-8 input and count its code points.
348 */
349 size_t cCps;
350 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
351 if (RT_SUCCESS(rc))
352 {
353 /*
354 * Allocate buffer.
355 */
356 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
357 if (paCps)
358 {
359 /*
360 * Decode the string.
361 */
362 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
363 if (RT_SUCCESS(rc))
364 {
365 *ppaCps = paCps;
366 return rc;
367 }
368 RTMemFree(paCps);
369 }
370 else
371 rc = VERR_NO_CODE_POINT_MEMORY;
372 }
373 return rc;
374}
375RT_EXPORT_SYMBOL(RTStrToUni);
376
377
378RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
379{
380 /*
381 * Validate input.
382 */
383 Assert(VALID_PTR(pszString));
384 Assert(VALID_PTR(ppaCps));
385 Assert(!pcCps || VALID_PTR(pcCps));
386
387 /*
388 * Validate the UTF-8 input and count the code points.
389 */
390 size_t cCpsResult;
391 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
392 if (RT_SUCCESS(rc))
393 {
394 if (pcCps)
395 *pcCps = cCpsResult;
396
397 /*
398 * Check buffer size / Allocate buffer.
399 */
400 bool fShouldFree;
401 PRTUNICP paCpsResult;
402 if (cCps > 0 && *ppaCps)
403 {
404 fShouldFree = false;
405 if (cCps <= cCpsResult)
406 return VERR_BUFFER_OVERFLOW;
407 paCpsResult = *ppaCps;
408 }
409 else
410 {
411 *ppaCps = NULL;
412 fShouldFree = true;
413 cCps = RT_MAX(cCpsResult + 1, cCps);
414 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
415 }
416 if (paCpsResult)
417 {
418 /*
419 * Encode the UTF-16 string.
420 */
421 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
422 if (RT_SUCCESS(rc))
423 {
424 *ppaCps = paCpsResult;
425 return rc;
426 }
427 if (fShouldFree)
428 RTMemFree(paCpsResult);
429 }
430 else
431 rc = VERR_NO_CODE_POINT_MEMORY;
432 }
433 return rc;
434}
435RT_EXPORT_SYMBOL(RTStrToUniEx);
436
437
438/**
439 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
440 *
441 * @returns IPRT status code.
442 * @param psz Pointer to the UTF-8 string.
443 * @param cch The max length of the string. (btw cch = cb)
444 * Use RTSTR_MAX if all of the string is to be examined.s
445 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
446 */
447static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
448{
449 const unsigned char *puch = (const unsigned char *)psz;
450 size_t cwc = 0;
451 while (cch > 0)
452 {
453 const unsigned char uch = *puch;
454 if (!uch)
455 break;
456 if (!(uch & RT_BIT(7)))
457 {
458 /* one ASCII byte */
459 cwc++;
460 puch++;
461 cch--;
462 }
463 else
464 {
465 /* figure sequence length and validate the first byte */
466 unsigned cb;
467 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
468 cb = 2;
469 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
470 cb = 3;
471 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
472 cb = 4;
473 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
474 cb = 5;
475 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
476 cb = 6;
477 else
478 {
479 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
480 return VERR_INVALID_UTF8_ENCODING;
481 }
482
483 /* check length */
484 if (cb > cch)
485 {
486 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
487 return VERR_INVALID_UTF8_ENCODING;
488 }
489
490 /* validate the rest */
491 switch (cb)
492 {
493 case 6:
494 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
495 case 5:
496 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
497 case 4:
498 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
499 case 3:
500 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
501 case 2:
502 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
503 break;
504 }
505
506 /* validate the code point. */
507 RTUNICP uc;
508 switch (cb)
509 {
510 case 6:
511 uc = (puch[5] & 0x3f)
512 | ((RTUNICP)(puch[4] & 0x3f) << 6)
513 | ((RTUNICP)(puch[3] & 0x3f) << 12)
514 | ((RTUNICP)(puch[2] & 0x3f) << 18)
515 | ((RTUNICP)(puch[1] & 0x3f) << 24)
516 | ((RTUNICP)(uch & 0x01) << 30);
517 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
518 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
519 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
520 return VERR_CANT_RECODE_AS_UTF16;
521 case 5:
522 uc = (puch[4] & 0x3f)
523 | ((RTUNICP)(puch[3] & 0x3f) << 6)
524 | ((RTUNICP)(puch[2] & 0x3f) << 12)
525 | ((RTUNICP)(puch[1] & 0x3f) << 18)
526 | ((RTUNICP)(uch & 0x03) << 24);
527 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
528 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
529 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
530 return VERR_CANT_RECODE_AS_UTF16;
531 case 4:
532 uc = (puch[3] & 0x3f)
533 | ((RTUNICP)(puch[2] & 0x3f) << 6)
534 | ((RTUNICP)(puch[1] & 0x3f) << 12)
535 | ((RTUNICP)(uch & 0x07) << 18);
536 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
537 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
538 RTStrAssertMsgReturn(uc <= 0x0010ffff,
539 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
540 cwc++;
541 break;
542 case 3:
543 uc = (puch[2] & 0x3f)
544 | ((RTUNICP)(puch[1] & 0x3f) << 6)
545 | ((RTUNICP)(uch & 0x0f) << 12);
546 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
547 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
548 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
549 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
550 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
551 break;
552 case 2:
553 uc = (puch[1] & 0x3f)
554 | ((RTUNICP)(uch & 0x1f) << 6);
555 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
556 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
557 break;
558 }
559
560 /* advance */
561 cch -= cb;
562 puch += cb;
563 cwc++;
564 }
565 }
566
567 /* done */
568 *pcwc = cwc;
569 return VINF_SUCCESS;
570}
571
572
573/**
574 * Recodes a valid UTF-8 string as UTF-16.
575 *
576 * Since we know the input is valid, we do *not* perform encoding or length checks.
577 *
578 * @returns iprt status code.
579 * @param psz The UTF-8 string to recode. This is a valid encoding.
580 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
581 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
582 * @param pwsz Where to store the UTF-16 string.
583 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
584 */
585static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
586{
587 int rc = VINF_SUCCESS;
588 const unsigned char *puch = (const unsigned char *)psz;
589 PRTUTF16 pwc = pwsz;
590 while (cch > 0)
591 {
592 /* read the next char and check for terminator. */
593 const unsigned char uch = *puch;
594 if (!uch)
595 break;
596
597 /* check for output overflow */
598 if (RT_UNLIKELY(cwc < 1))
599 {
600 rc = VERR_BUFFER_OVERFLOW;
601 break;
602 }
603 cwc--;
604
605 /* decode and recode the code point */
606 if (!(uch & RT_BIT(7)))
607 {
608 *pwc++ = uch;
609 puch++;
610 cch--;
611 }
612 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
613 {
614 uint16_t uc = (puch[1] & 0x3f)
615 | ((uint16_t)(uch & 0x1f) << 6);
616 *pwc++ = uc;
617 puch += 2;
618 cch -= 2;
619 }
620 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
621 {
622 uint16_t uc = (puch[2] & 0x3f)
623 | ((uint16_t)(puch[1] & 0x3f) << 6)
624 | ((uint16_t)(uch & 0x0f) << 12);
625 *pwc++ = uc;
626 puch += 3;
627 cch -= 3;
628 }
629 else
630 {
631 /* generate surrugate pair */
632 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
633 RTUNICP uc = (puch[3] & 0x3f)
634 | ((RTUNICP)(puch[2] & 0x3f) << 6)
635 | ((RTUNICP)(puch[1] & 0x3f) << 12)
636 | ((RTUNICP)(uch & 0x07) << 18);
637 if (RT_UNLIKELY(cwc < 1))
638 {
639 rc = VERR_BUFFER_OVERFLOW;
640 break;
641 }
642 cwc--;
643
644 uc -= 0x10000;
645 *pwc++ = 0xd800 | (uc >> 10);
646 *pwc++ = 0xdc00 | (uc & 0x3ff);
647 puch += 4;
648 cch -= 4;
649 }
650 }
651
652 /* done */
653 *pwc = '\0';
654 return rc;
655}
656
657
658RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
659{
660 /*
661 * Validate input.
662 */
663 Assert(VALID_PTR(ppwszString));
664 Assert(VALID_PTR(pszString));
665 *ppwszString = NULL;
666
667 /*
668 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
669 */
670 size_t cwc;
671 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
672 if (RT_SUCCESS(rc))
673 {
674 /*
675 * Allocate buffer.
676 */
677 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
678 if (pwsz)
679 {
680 /*
681 * Encode the UTF-16 string.
682 */
683 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
684 if (RT_SUCCESS(rc))
685 {
686 *ppwszString = pwsz;
687 return rc;
688 }
689 RTMemFree(pwsz);
690 }
691 else
692 rc = VERR_NO_UTF16_MEMORY;
693 }
694 return rc;
695}
696RT_EXPORT_SYMBOL(RTStrToUtf16);
697
698
699RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
700{
701 /*
702 * Validate input.
703 */
704 Assert(VALID_PTR(pszString));
705 Assert(VALID_PTR(ppwsz));
706 Assert(!pcwc || VALID_PTR(pcwc));
707
708 /*
709 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
710 */
711 size_t cwcResult;
712 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
713 if (RT_SUCCESS(rc))
714 {
715 if (pcwc)
716 *pcwc = cwcResult;
717
718 /*
719 * Check buffer size / Allocate buffer.
720 */
721 bool fShouldFree;
722 PRTUTF16 pwszResult;
723 if (cwc > 0 && *ppwsz)
724 {
725 fShouldFree = false;
726 if (cwc <= cwcResult)
727 return VERR_BUFFER_OVERFLOW;
728 pwszResult = *ppwsz;
729 }
730 else
731 {
732 *ppwsz = NULL;
733 fShouldFree = true;
734 cwc = RT_MAX(cwcResult + 1, cwc);
735 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
736 }
737 if (pwszResult)
738 {
739 /*
740 * Encode the UTF-16 string.
741 */
742 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
743 if (RT_SUCCESS(rc))
744 {
745 *ppwsz = pwszResult;
746 return rc;
747 }
748 if (fShouldFree)
749 RTMemFree(pwszResult);
750 }
751 else
752 rc = VERR_NO_UTF16_MEMORY;
753 }
754 return rc;
755}
756RT_EXPORT_SYMBOL(RTStrToUtf16Ex);
757
758
759RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
760{
761 size_t cwc;
762 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
763 return RT_SUCCESS(rc) ? cwc : 0;
764}
765RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
766
767
768RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
769{
770 size_t cwc;
771 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
772 if (pcwc)
773 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
774 return rc;
775}
776RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
777
778
779/**
780 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
781 * @returns rc
782 * @param ppsz The pointer to the string position point.
783 * @param pCp Where to store RTUNICP_INVALID.
784 * @param rc The iprt error code.
785 */
786static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
787{
788 /*
789 * Try find a valid encoding.
790 */
791 (*ppsz)++; /** @todo code this! */
792 *pCp = RTUNICP_INVALID;
793 return rc;
794}
795
796
797RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
798{
799 RTUNICP Cp;
800 RTStrGetCpExInternal(&psz, &Cp);
801 return Cp;
802}
803RT_EXPORT_SYMBOL(RTStrGetCpInternal);
804
805
806RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
807{
808 const unsigned char *puch = (const unsigned char *)*ppsz;
809 const unsigned char uch = *puch;
810 RTUNICP uc;
811
812 /* ASCII ? */
813 if (!(uch & RT_BIT(7)))
814 {
815 uc = uch;
816 puch++;
817 }
818 else if (uch & RT_BIT(6))
819 {
820 /* figure the length and validate the first octet. */
821 unsigned cb;
822 if (!(uch & RT_BIT(5)))
823 cb = 2;
824 else if (!(uch & RT_BIT(4)))
825 cb = 3;
826 else if (!(uch & RT_BIT(3)))
827 cb = 4;
828 else if (!(uch & RT_BIT(2)))
829 cb = 5;
830 else if (!(uch & RT_BIT(1)))
831 cb = 6;
832 else
833 {
834 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
835 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
836 }
837
838 /* validate the rest */
839 switch (cb)
840 {
841 case 6:
842 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
843 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
844 case 5:
845 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
846 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
847 case 4:
848 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
849 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
850 case 3:
851 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
852 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
853 case 2:
854 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
855 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
856 break;
857 }
858
859 /* get and validate the code point. */
860 switch (cb)
861 {
862 case 6:
863 uc = (puch[5] & 0x3f)
864 | ((RTUNICP)(puch[4] & 0x3f) << 6)
865 | ((RTUNICP)(puch[3] & 0x3f) << 12)
866 | ((RTUNICP)(puch[2] & 0x3f) << 18)
867 | ((RTUNICP)(puch[1] & 0x3f) << 24)
868 | ((RTUNICP)(uch & 0x01) << 30);
869 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
870 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
871 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
872 break;
873 case 5:
874 uc = (puch[4] & 0x3f)
875 | ((RTUNICP)(puch[3] & 0x3f) << 6)
876 | ((RTUNICP)(puch[2] & 0x3f) << 12)
877 | ((RTUNICP)(puch[1] & 0x3f) << 18)
878 | ((RTUNICP)(uch & 0x03) << 24);
879 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
880 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
881 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
882 break;
883 case 4:
884 uc = (puch[3] & 0x3f)
885 | ((RTUNICP)(puch[2] & 0x3f) << 6)
886 | ((RTUNICP)(puch[1] & 0x3f) << 12)
887 | ((RTUNICP)(uch & 0x07) << 18);
888 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
889 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
890 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
891 break;
892 case 3:
893 uc = (puch[2] & 0x3f)
894 | ((RTUNICP)(puch[1] & 0x3f) << 6)
895 | ((RTUNICP)(uch & 0x0f) << 12);
896 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
897 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
898 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
899 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
900 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
901 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
902 break;
903 case 2:
904 uc = (puch[1] & 0x3f)
905 | ((RTUNICP)(uch & 0x1f) << 6);
906 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
907 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
908 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
909 break;
910 default: /* impossible, but GCC is bitching. */
911 uc = RTUNICP_INVALID;
912 break;
913 }
914 puch += cb;
915 }
916 else
917 {
918 /* 6th bit is always set. */
919 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
920 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
921 }
922 *pCp = uc;
923 *ppsz = (const char *)puch;
924 return VINF_SUCCESS;
925}
926RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
927
928
929/**
930 * Handle invalid encodings passed to RTStrGetCpNEx().
931 * @returns rc
932 * @param ppsz The pointer to the string position point.
933 * @param pcch Pointer to the string length.
934 * @param pCp Where to store RTUNICP_INVALID.
935 * @param rc The iprt error code.
936 */
937static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
938{
939 /*
940 * Try find a valid encoding.
941 */
942 (*ppsz)++; /** @todo code this! */
943 (*pcch)--;
944 *pCp = RTUNICP_INVALID;
945 return rc;
946}
947
948
949RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
950{
951 const unsigned char *puch = (const unsigned char *)*ppsz;
952 const unsigned char uch = *puch;
953 size_t cch = *pcch;
954 RTUNICP uc;
955
956 if (cch == 0)
957 {
958 *pCp = RTUNICP_INVALID;
959 return VERR_END_OF_STRING;
960 }
961
962 /* ASCII ? */
963 if (!(uch & RT_BIT(7)))
964 {
965 uc = uch;
966 puch++;
967 cch--;
968 }
969 else if (uch & RT_BIT(6))
970 {
971 /* figure the length and validate the first octet. */
972 unsigned cb;
973 if (!(uch & RT_BIT(5)))
974 cb = 2;
975 else if (!(uch & RT_BIT(4)))
976 cb = 3;
977 else if (!(uch & RT_BIT(3)))
978 cb = 4;
979 else if (!(uch & RT_BIT(2)))
980 cb = 5;
981 else if (!(uch & RT_BIT(1)))
982 cb = 6;
983 else
984 {
985 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
986 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
987 }
988
989 if (cb > cch)
990 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
991
992 /* validate the rest */
993 switch (cb)
994 {
995 case 6:
996 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
997 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
998 case 5:
999 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1000 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1001 case 4:
1002 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1003 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1004 case 3:
1005 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1006 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1007 case 2:
1008 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1009 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1010 break;
1011 }
1012
1013 /* get and validate the code point. */
1014 switch (cb)
1015 {
1016 case 6:
1017 uc = (puch[5] & 0x3f)
1018 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1019 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1020 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1021 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1022 | ((RTUNICP)(uch & 0x01) << 30);
1023 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1024 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1025 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1026 break;
1027 case 5:
1028 uc = (puch[4] & 0x3f)
1029 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1030 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1031 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1032 | ((RTUNICP)(uch & 0x03) << 24);
1033 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1034 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1035 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1036 break;
1037 case 4:
1038 uc = (puch[3] & 0x3f)
1039 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1040 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1041 | ((RTUNICP)(uch & 0x07) << 18);
1042 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1043 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1044 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1045 break;
1046 case 3:
1047 uc = (puch[2] & 0x3f)
1048 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1049 | ((RTUNICP)(uch & 0x0f) << 12);
1050 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1051 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1052 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1053 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1054 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1055 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1056 break;
1057 case 2:
1058 uc = (puch[1] & 0x3f)
1059 | ((RTUNICP)(uch & 0x1f) << 6);
1060 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1061 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1062 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1063 break;
1064 default: /* impossible, but GCC is bitching. */
1065 uc = RTUNICP_INVALID;
1066 break;
1067 }
1068 puch += cb;
1069 cch -= cb;
1070 }
1071 else
1072 {
1073 /* 6th bit is always set. */
1074 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1075 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1076 }
1077 *pCp = uc;
1078 *ppsz = (const char *)puch;
1079 (*pcch) = cch;
1080 return VINF_SUCCESS;
1081}
1082RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1083
1084
1085RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1086{
1087 unsigned char *puch = (unsigned char *)psz;
1088 if (uc < 0x80)
1089 *puch++ = (unsigned char )uc;
1090 else if (uc < 0x00000800)
1091 {
1092 *puch++ = 0xc0 | (uc >> 6);
1093 *puch++ = 0x80 | (uc & 0x3f);
1094 }
1095 else if (uc < 0x00010000)
1096 {
1097 if ( uc < 0x0000d8000
1098 || ( uc > 0x0000dfff
1099 && uc < 0x0000fffe))
1100 {
1101 *puch++ = 0xe0 | (uc >> 12);
1102 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1103 *puch++ = 0x80 | (uc & 0x3f);
1104 }
1105 else
1106 {
1107 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1108 *puch++ = 0x7f;
1109 }
1110 }
1111 else if (uc < 0x00200000)
1112 {
1113 *puch++ = 0xf0 | (uc >> 18);
1114 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1115 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1116 *puch++ = 0x80 | (uc & 0x3f);
1117 }
1118 else if (uc < 0x04000000)
1119 {
1120 *puch++ = 0xf8 | (uc >> 24);
1121 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1122 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1123 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1124 *puch++ = 0x80 | (uc & 0x3f);
1125 }
1126 else if (uc <= 0x7fffffff)
1127 {
1128 *puch++ = 0xfc | (uc >> 30);
1129 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1130 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1131 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1132 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1133 *puch++ = 0x80 | (uc & 0x3f);
1134 }
1135 else
1136 {
1137 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1138 *puch++ = 0x7f;
1139 }
1140
1141 return (char *)puch;
1142}
1143RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1144
1145
1146RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1147{
1148 if (pszStart < psz)
1149 {
1150 /* simple char? */
1151 const unsigned char *puch = (const unsigned char *)psz;
1152 unsigned uch = *--puch;
1153 if (!(uch & RT_BIT(7)))
1154 return (char *)puch;
1155 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1156
1157 /* two or more. */
1158 uint32_t uMask = 0xffffffc0;
1159 while ( (const unsigned char *)pszStart < puch
1160 && !(uMask & 1))
1161 {
1162 unsigned uch = *--puch;
1163 if ((uch & 0xc0) != 0x80)
1164 {
1165 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1166 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1167 (char *)pszStart);
1168 return (char *)puch;
1169 }
1170 uMask >>= 1;
1171 }
1172 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1173 }
1174 return (char *)pszStart;
1175}
1176RT_EXPORT_SYMBOL(RTStrPrevCp);
1177
1178
1179/**
1180 * Performs a case sensitive string compare between two UTF-8 strings.
1181 *
1182 * Encoding errors are ignored by the current implementation. So, the only
1183 * difference between this and the CRT strcmp function is the handling of
1184 * NULL arguments.
1185 *
1186 * @returns < 0 if the first string less than the second string.
1187 * @returns 0 if the first string identical to the second string.
1188 * @returns > 0 if the first string greater than the second string.
1189 * @param psz1 First UTF-8 string. Null is allowed.
1190 * @param psz2 Second UTF-8 string. Null is allowed.
1191 */
1192RTDECL(int) RTStrCmp(const char *psz1, const char *psz2)
1193{
1194 if (psz1 == psz2)
1195 return 0;
1196 if (!psz1)
1197 return -1;
1198 if (!psz2)
1199 return 1;
1200
1201 return strcmp(psz1, psz2);
1202}
1203RT_EXPORT_SYMBOL(RTStrCmp);
1204
1205
1206/**
1207 * Performs a case sensitive string compare between two UTF-8 strings, given
1208 * a maximum string length.
1209 *
1210 * Encoding errors are ignored by the current implementation. So, the only
1211 * difference between this and the CRT strncmp function is the handling of
1212 * NULL arguments.
1213 *
1214 * @returns < 0 if the first string less than the second string.
1215 * @returns 0 if the first string identical to the second string.
1216 * @returns > 0 if the first string greater than the second string.
1217 * @param psz1 First UTF-8 string. Null is allowed.
1218 * @param psz2 Second UTF-8 string. Null is allowed.
1219 * @param cchMax The maximum string length
1220 */
1221RTDECL(int) RTStrNCmp(const char *psz1, const char *psz2, size_t cchMax)
1222{
1223 if (psz1 == psz2)
1224 return 0;
1225 if (!psz1)
1226 return -1;
1227 if (!psz2)
1228 return 1;
1229
1230 return strncmp(psz1, psz2, cchMax);
1231}
1232RT_EXPORT_SYMBOL(RTStrNCmp);
1233
1234
1235/**
1236 * Performs a case insensitive string compare between two UTF-8 strings.
1237 *
1238 * This is a simplified compare, as only the simplified lower/upper case folding
1239 * specified by the unicode specs are used. It does not consider character pairs
1240 * as they are used in some languages, just simple upper & lower case compares.
1241 *
1242 * The result is the difference between the mismatching codepoints after they
1243 * both have been lower cased.
1244 *
1245 * If the string encoding is invalid the function will assert (strict builds)
1246 * and use RTStrCmp for the remainder of the string.
1247 *
1248 * @returns < 0 if the first string less than the second string.
1249 * @returns 0 if the first string identical to the second string.
1250 * @returns > 0 if the first string greater than the second string.
1251 * @param psz1 First UTF-8 string. Null is allowed.
1252 * @param psz2 Second UTF-8 string. Null is allowed.
1253 */
1254RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
1255{
1256 if (psz1 == psz2)
1257 return 0;
1258 if (!psz1)
1259 return -1;
1260 if (!psz2)
1261 return 1;
1262
1263 const char *pszStart1 = psz1;
1264 for (;;)
1265 {
1266 /* Get the codepoints */
1267 RTUNICP cp1;
1268 int rc = RTStrGetCpEx(&psz1, &cp1);
1269 if (RT_FAILURE(rc))
1270 {
1271 AssertRC(rc);
1272 psz1--;
1273 break;
1274 }
1275
1276 RTUNICP cp2;
1277 rc = RTStrGetCpEx(&psz2, &cp2);
1278 if (RT_FAILURE(rc))
1279 {
1280 AssertRC(rc);
1281 psz2--;
1282 psz1 = RTStrPrevCp(pszStart1, psz1);
1283 break;
1284 }
1285
1286 /* compare */
1287 int iDiff = cp1 - cp2;
1288 if (iDiff)
1289 {
1290 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1291 if (iDiff)
1292 {
1293 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1294 if (iDiff)
1295 return iDiff;
1296 }
1297 }
1298
1299 /* hit the terminator? */
1300 if (!cp1)
1301 return 0;
1302 }
1303
1304 /* Hit some bad encoding, continue in case insensitive mode. */
1305 return RTStrCmp(psz1, psz2);
1306}
1307RT_EXPORT_SYMBOL(RTStrICmp);
1308
1309
1310/**
1311 * Performs a case insensitive string compare between two UTF-8 strings, given a
1312 * maximum string length.
1313 *
1314 * This is a simplified compare, as only the simplified lower/upper case folding
1315 * specified by the unicode specs are used. It does not consider character pairs
1316 * as they are used in some languages, just simple upper & lower case compares.
1317 *
1318 * The result is the difference between the mismatching codepoints after they
1319 * both have been lower cased.
1320 *
1321 * If the string encoding is invalid the function will assert (strict builds)
1322 * and use RTStrCmp for the remainder of the string.
1323 *
1324 * @returns < 0 if the first string less than the second string.
1325 * @returns 0 if the first string identical to the second string.
1326 * @returns > 0 if the first string greater than the second string.
1327 * @param psz1 First UTF-8 string. Null is allowed.
1328 * @param psz2 Second UTF-8 string. Null is allowed.
1329 * @param cchMax Maximum string length
1330 */
1331RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
1332{
1333 if (cchMax == 0)
1334 return 0;
1335 if (psz1 == psz2)
1336 return 0;
1337 if (!psz1)
1338 return -1;
1339 if (!psz2)
1340 return 1;
1341
1342 for (;;)
1343 {
1344 /* Get the codepoints */
1345 RTUNICP cp1;
1346 size_t cchMax2 = cchMax;
1347 int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);
1348 if (RT_FAILURE(rc))
1349 {
1350 AssertRC(rc);
1351 psz1--;
1352 cchMax++;
1353 break;
1354 }
1355
1356 RTUNICP cp2;
1357 rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);
1358 if (RT_FAILURE(rc))
1359 {
1360 AssertRC(rc);
1361 psz2--;
1362 psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
1363 cchMax = cchMax2 + 1;
1364 break;
1365 }
1366
1367 /* compare */
1368 int iDiff = cp1 - cp2;
1369 if (iDiff)
1370 {
1371 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1372 if (iDiff)
1373 {
1374 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1375 if (iDiff)
1376 return iDiff;
1377 }
1378 }
1379
1380 /* hit the terminator? */
1381 if (!cp1 || cchMax == 0)
1382 return 0;
1383 }
1384
1385 /* Hit some bad encoding, continue in case insensitive mode. */
1386 return RTStrNCmp(psz1, psz2, cchMax);
1387}
1388RT_EXPORT_SYMBOL(RTStrNICmp);
1389
1390
1391RTDECL(char *) RTStrStr(const char *pszHaystack, const char *pszNeedle)
1392{
1393 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1394 if (!pszHaystack)
1395 return NULL;
1396 if (!pszNeedle)
1397 return NULL;
1398
1399 /* The rest is CRT. */
1400 return (char *)strstr(pszHaystack, pszNeedle);
1401}
1402RT_EXPORT_SYMBOL(RTStrStr);
1403
1404
1405RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
1406{
1407 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1408 if (!pszHaystack)
1409 return NULL;
1410 if (!pszNeedle)
1411 return NULL;
1412
1413 /* The empty string matches everything. */
1414 if (!*pszNeedle)
1415 return (char *)pszHaystack;
1416
1417 /*
1418 * The search strategy is to pick out the first char of the needle, fold it,
1419 * and match it against the haystack code point by code point. When encountering
1420 * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
1421 */
1422 const char * const pszNeedleStart = pszNeedle;
1423 RTUNICP Cp0;
1424 RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
1425 size_t const cchNeedle = strlen(pszNeedle);
1426 size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
1427 RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
1428 RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
1429 if ( Cp0Lower == Cp0Upper
1430 && Cp0Lower == Cp0)
1431 {
1432 /* Cp0 is not a case sensitive char. */
1433 for (;;)
1434 {
1435 RTUNICP Cp;
1436 RTStrGetCpEx(&pszHaystack, &Cp);
1437 if (!Cp)
1438 break;
1439 if ( Cp == Cp0
1440 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1441 return (char *)pszHaystack - cchNeedleCp0;
1442 }
1443 }
1444 else if ( Cp0Lower == Cp0
1445 || Cp0Upper != Cp0)
1446 {
1447 /* Cp0 is case sensitive */
1448 for (;;)
1449 {
1450 RTUNICP Cp;
1451 RTStrGetCpEx(&pszHaystack, &Cp);
1452 if (!Cp)
1453 break;
1454 if ( ( Cp == Cp0Upper
1455 || Cp == Cp0Lower)
1456 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1457 return (char *)pszHaystack - cchNeedleCp0;
1458 }
1459 }
1460 else
1461 {
1462 /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
1463 for (;;)
1464 {
1465 RTUNICP Cp;
1466 RTStrGetCpEx(&pszHaystack, &Cp);
1467 if (!Cp)
1468 break;
1469 if ( ( Cp == Cp0
1470 || Cp == Cp0Upper
1471 || Cp == Cp0Lower)
1472 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1473 return (char *)pszHaystack - cchNeedleCp0;
1474 }
1475 }
1476
1477
1478 return NULL;
1479}
1480RT_EXPORT_SYMBOL(RTStrIStr);
1481
1482
1483RTDECL(char *) RTStrToLower(char *psz)
1484{
1485 /*
1486 * Loop the code points in the string, converting them one by one.
1487 * ASSUMES that the code points for upper and lower case are encoded
1488 * with the exact same length.
1489 */
1490 /** @todo Handled bad encodings correctly+quietly, remove assumption,
1491 * optimize. */
1492 char *pszCur = psz;
1493 while (*pszCur)
1494 {
1495 RTUNICP cp = RTStrGetCp(pszCur);
1496 cp = RTUniCpToLower(cp);
1497 pszCur = RTStrPutCp(pszCur, cp);
1498 }
1499 return psz;
1500}
1501RT_EXPORT_SYMBOL(RTStrToLower);
1502
1503
1504RTDECL(char *) RTStrToUpper(char *psz)
1505{
1506 /*
1507 * Loop the code points in the string, converting them one by one.
1508 * ASSUMES that the code points for upper and lower case are encoded
1509 * with the exact same length.
1510 */
1511 /** @todo Handled bad encodings correctly+quietly, remove assumption,
1512 * optimize. */
1513 char *pszCur = psz;
1514 while(*pszCur)
1515 {
1516 RTUNICP cp = RTStrGetCp(pszCur);
1517 cp = RTUniCpToUpper(cp);
1518 pszCur = RTStrPutCp(pszCur, cp);
1519 }
1520 return psz;
1521}
1522RT_EXPORT_SYMBOL(RTStrToUpper);
1523
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette