VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 18570

Last change on this file since 18570 was 18570, checked in by vboxsync, 16 years ago

RTStrIStr: fixed inverted test.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 53.0 KB
Line 
1/* $Id: utf-8.cpp 18570 2009-03-31 13:07:44Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31
32/*******************************************************************************
33* Header Files *
34*******************************************************************************/
35#include <iprt/string.h>
36#include <iprt/uni.h>
37#include <iprt/alloc.h>
38#include <iprt/assert.h>
39#include <iprt/err.h>
40#include "internal/string.h"
41
42
43
44/**
45 * Get get length in code points of a UTF-8 encoded string.
46 * The string is validated while doing this.
47 *
48 * @returns IPRT status code.
49 * @param psz Pointer to the UTF-8 string.
50 * @param cch The max length of the string. (btw cch = cb)
51 * Use RTSTR_MAX if all of the string is to be examined.
52 * @param pcuc Where to store the length in unicode code points.
53 * @param pcchActual Where to store the actual size of the UTF-8 string
54 * on success (cch = cb again). Optional.
55 */
56static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
57{
58 const unsigned char *puch = (const unsigned char *)psz;
59 size_t cCodePoints = 0;
60 while (cch > 0)
61 {
62 const unsigned char uch = *puch;
63 if (!uch)
64 break;
65 if (uch & RT_BIT(7))
66 {
67 /* figure sequence length and validate the first byte */
68 unsigned cb;
69 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
70 cb = 2;
71 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
72 cb = 3;
73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
74 cb = 4;
75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
76 cb = 5;
77 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
78 cb = 6;
79 else
80 {
81 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
82 return VERR_INVALID_UTF8_ENCODING;
83 }
84
85 /* check length */
86 if (cb > cch)
87 {
88 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
89 return VERR_INVALID_UTF8_ENCODING;
90 }
91
92 /* validate the rest */
93 switch (cb)
94 {
95 case 6:
96 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97 case 5:
98 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99 case 4:
100 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101 case 3:
102 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103 case 2:
104 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
105 break;
106 }
107
108 /* validate the code point. */
109 RTUNICP uc;
110 switch (cb)
111 {
112 case 6:
113 uc = (puch[5] & 0x3f)
114 | ((RTUNICP)(puch[4] & 0x3f) << 6)
115 | ((RTUNICP)(puch[3] & 0x3f) << 12)
116 | ((RTUNICP)(puch[2] & 0x3f) << 18)
117 | ((RTUNICP)(puch[1] & 0x3f) << 24)
118 | ((RTUNICP)(uch & 0x01) << 30);
119 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
120 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
121 break;
122 case 5:
123 uc = (puch[4] & 0x3f)
124 | ((RTUNICP)(puch[3] & 0x3f) << 6)
125 | ((RTUNICP)(puch[2] & 0x3f) << 12)
126 | ((RTUNICP)(puch[1] & 0x3f) << 18)
127 | ((RTUNICP)(uch & 0x03) << 24);
128 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
129 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
130 break;
131 case 4:
132 uc = (puch[3] & 0x3f)
133 | ((RTUNICP)(puch[2] & 0x3f) << 6)
134 | ((RTUNICP)(puch[1] & 0x3f) << 12)
135 | ((RTUNICP)(uch & 0x07) << 18);
136 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
137 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
138 break;
139 case 3:
140 uc = (puch[2] & 0x3f)
141 | ((RTUNICP)(puch[1] & 0x3f) << 6)
142 | ((RTUNICP)(uch & 0x0f) << 12);
143 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
144 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
145 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
146 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
147 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
148 break;
149 case 2:
150 uc = (puch[1] & 0x3f)
151 | ((RTUNICP)(uch & 0x1f) << 6);
152 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
153 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
154 break;
155 }
156
157 /* advance */
158 cch -= cb;
159 puch += cb;
160 }
161 else
162 {
163 /* one ASCII byte */
164 puch++;
165 cch--;
166 }
167 cCodePoints++;
168 }
169
170 /* done */
171 *pcuc = cCodePoints;
172 if (pcchActual)
173 *pcchActual = puch - (unsigned char const *)psz;
174 return VINF_SUCCESS;
175}
176
177
178/**
179 * Decodes and UTF-8 string into an array of unicode code point.
180 *
181 * Since we know the input is valid, we do *not* perform encoding or length checks.
182 *
183 * @returns iprt status code.
184 * @param psz The UTF-8 string to recode. This is a valid encoding.
185 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
186 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
187 * @param paCps Where to store the code points array.
188 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
189 * @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
190 */
191static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t *pcCps)
192{
193 int rc = VINF_SUCCESS;
194 const unsigned char *puch = (const unsigned char *)psz;
195 const PRTUNICP pCpEnd = paCps + cCps;
196 PRTUNICP pCp = paCps;
197 Assert(pCpEnd >= pCp);
198 while (cch > 0)
199 {
200 /* read the next char and check for terminator. */
201 const unsigned char uch = *puch;
202 if (!uch)
203 break;
204
205 /* check for output overflow */
206 if (pCp >= pCpEnd)
207 {
208 rc = VERR_BUFFER_OVERFLOW;
209 break;
210 }
211
212 /* decode and recode the code point */
213 if (!(uch & RT_BIT(7)))
214 {
215 *pCp++ = uch;
216 puch++;
217 cch--;
218 }
219#ifdef RT_STRICT
220 else if (!(uch & RT_BIT(6)))
221 AssertMsgFailed(("Internal error!\n"));
222#endif
223 else if (!(uch & RT_BIT(5)))
224 {
225 *pCp++ = (puch[1] & 0x3f)
226 | ((uint16_t)(uch & 0x1f) << 6);
227 puch += 2;
228 cch -= 2;
229 }
230 else if (!(uch & RT_BIT(4)))
231 {
232 *pCp++ = (puch[2] & 0x3f)
233 | ((uint16_t)(puch[1] & 0x3f) << 6)
234 | ((uint16_t)(uch & 0x0f) << 12);
235 puch += 3;
236 cch -= 3;
237 }
238 else if (!(uch & RT_BIT(3)))
239 {
240 *pCp++ = (puch[3] & 0x3f)
241 | ((RTUNICP)(puch[2] & 0x3f) << 6)
242 | ((RTUNICP)(puch[1] & 0x3f) << 12)
243 | ((RTUNICP)(uch & 0x07) << 18);
244 puch += 4;
245 cch -= 4;
246 }
247 else if (!(uch & RT_BIT(2)))
248 {
249 *pCp++ = (puch[4] & 0x3f)
250 | ((RTUNICP)(puch[3] & 0x3f) << 6)
251 | ((RTUNICP)(puch[2] & 0x3f) << 12)
252 | ((RTUNICP)(puch[1] & 0x3f) << 18)
253 | ((RTUNICP)(uch & 0x03) << 24);
254 puch += 5;
255 cch -= 6;
256 }
257 else
258 {
259 Assert(!(uch & RT_BIT(1)));
260 *pCp++ = (puch[5] & 0x3f)
261 | ((RTUNICP)(puch[4] & 0x3f) << 6)
262 | ((RTUNICP)(puch[3] & 0x3f) << 12)
263 | ((RTUNICP)(puch[2] & 0x3f) << 18)
264 | ((RTUNICP)(puch[1] & 0x3f) << 24)
265 | ((RTUNICP)(uch & 0x01) << 30);
266 puch += 6;
267 cch -= 6;
268 }
269 }
270
271 /* done */
272 *pCp = 0;
273 *pcCps = pCp - paCps;
274 return rc;
275}
276
277
278RTDECL(size_t) RTStrUniLen(const char *psz)
279{
280 size_t cCodePoints;
281 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
282 return RT_SUCCESS(rc) ? cCodePoints : 0;
283}
284
285
286RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
287{
288 size_t cCodePoints;
289 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
290 if (pcCps)
291 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
292 return rc;
293}
294
295
296RTDECL(int) RTStrValidateEncoding(const char *psz)
297{
298 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
299}
300
301
302RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
303{
304 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
305 AssertPtr(psz);
306
307 /*
308 * Use rtUtf8Length for the job.
309 */
310 size_t cchActual;
311 size_t cCpsIgnored;
312 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
313 if (RT_SUCCESS(rc))
314 {
315 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
316 && cchActual >= cch)
317 rc = VERR_BUFFER_OVERFLOW;
318 }
319 return rc;
320
321
322 return RTStrUniLenEx(psz, cch, &cCpsIgnored);
323}
324
325
326RTDECL(bool) RTStrIsValidEncoding(const char *psz)
327{
328 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
329 return RT_SUCCESS(rc);
330}
331
332
333RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
334{
335 /*
336 * Validate input.
337 */
338 Assert(VALID_PTR(pszString));
339 Assert(VALID_PTR(ppaCps));
340 *ppaCps = NULL;
341
342 /*
343 * Validate the UTF-8 input and count its code points.
344 */
345 size_t cCps;
346 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
347 if (RT_SUCCESS(rc))
348 {
349 /*
350 * Allocate buffer.
351 */
352 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
353 if (paCps)
354 {
355 /*
356 * Decode the string.
357 */
358 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
359 if (RT_SUCCESS(rc))
360 {
361 *ppaCps = paCps;
362 return rc;
363 }
364 RTMemFree(paCps);
365 }
366 else
367 rc = VERR_NO_CODE_POINT_MEMORY;
368 }
369 return rc;
370}
371
372
373RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
374{
375 /*
376 * Validate input.
377 */
378 Assert(VALID_PTR(pszString));
379 Assert(VALID_PTR(ppaCps));
380 Assert(!pcCps || VALID_PTR(pcCps));
381
382 /*
383 * Validate the UTF-8 input and count the code points.
384 */
385 size_t cCpsResult;
386 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
387 if (RT_SUCCESS(rc))
388 {
389 if (pcCps)
390 *pcCps = cCpsResult;
391
392 /*
393 * Check buffer size / Allocate buffer.
394 */
395 bool fShouldFree;
396 PRTUNICP paCpsResult;
397 if (cCps > 0 && *ppaCps)
398 {
399 fShouldFree = false;
400 if (cCps <= cCpsResult)
401 return VERR_BUFFER_OVERFLOW;
402 paCpsResult = *ppaCps;
403 }
404 else
405 {
406 *ppaCps = NULL;
407 fShouldFree = true;
408 cCps = RT_MAX(cCpsResult + 1, cCps);
409 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
410 }
411 if (paCpsResult)
412 {
413 /*
414 * Encode the UTF-16 string.
415 */
416 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
417 if (RT_SUCCESS(rc))
418 {
419 *ppaCps = paCpsResult;
420 return rc;
421 }
422 if (fShouldFree)
423 RTMemFree(paCpsResult);
424 }
425 else
426 rc = VERR_NO_CODE_POINT_MEMORY;
427 }
428 return rc;
429}
430
431
432/**
433 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
434 *
435 * @returns IPRT status code.
436 * @param psz Pointer to the UTF-8 string.
437 * @param cch The max length of the string. (btw cch = cb)
438 * Use RTSTR_MAX if all of the string is to be examined.s
439 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
440 */
441static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
442{
443 const unsigned char *puch = (const unsigned char *)psz;
444 size_t cwc = 0;
445 while (cch > 0)
446 {
447 const unsigned char uch = *puch;
448 if (!uch)
449 break;
450 if (!(uch & RT_BIT(7)))
451 {
452 /* one ASCII byte */
453 cwc++;
454 puch++;
455 cch--;
456 }
457 else
458 {
459 /* figure sequence length and validate the first byte */
460 unsigned cb;
461 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
462 cb = 2;
463 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
464 cb = 3;
465 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
466 cb = 4;
467 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
468 cb = 5;
469 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
470 cb = 6;
471 else
472 {
473 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
474 return VERR_INVALID_UTF8_ENCODING;
475 }
476
477 /* check length */
478 if (cb > cch)
479 {
480 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
481 return VERR_INVALID_UTF8_ENCODING;
482 }
483
484 /* validate the rest */
485 switch (cb)
486 {
487 case 6:
488 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
489 case 5:
490 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
491 case 4:
492 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
493 case 3:
494 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
495 case 2:
496 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
497 break;
498 }
499
500 /* validate the code point. */
501 RTUNICP uc;
502 switch (cb)
503 {
504 case 6:
505 uc = (puch[5] & 0x3f)
506 | ((RTUNICP)(puch[4] & 0x3f) << 6)
507 | ((RTUNICP)(puch[3] & 0x3f) << 12)
508 | ((RTUNICP)(puch[2] & 0x3f) << 18)
509 | ((RTUNICP)(puch[1] & 0x3f) << 24)
510 | ((RTUNICP)(uch & 0x01) << 30);
511 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
512 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
513 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
514 return VERR_CANT_RECODE_AS_UTF16;
515 case 5:
516 uc = (puch[4] & 0x3f)
517 | ((RTUNICP)(puch[3] & 0x3f) << 6)
518 | ((RTUNICP)(puch[2] & 0x3f) << 12)
519 | ((RTUNICP)(puch[1] & 0x3f) << 18)
520 | ((RTUNICP)(uch & 0x03) << 24);
521 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
522 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
523 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
524 return VERR_CANT_RECODE_AS_UTF16;
525 case 4:
526 uc = (puch[3] & 0x3f)
527 | ((RTUNICP)(puch[2] & 0x3f) << 6)
528 | ((RTUNICP)(puch[1] & 0x3f) << 12)
529 | ((RTUNICP)(uch & 0x07) << 18);
530 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
531 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
532 RTStrAssertMsgReturn(uc <= 0x0010ffff,
533 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
534 cwc++;
535 break;
536 case 3:
537 uc = (puch[2] & 0x3f)
538 | ((RTUNICP)(puch[1] & 0x3f) << 6)
539 | ((RTUNICP)(uch & 0x0f) << 12);
540 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
541 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
542 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
543 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
544 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
545 break;
546 case 2:
547 uc = (puch[1] & 0x3f)
548 | ((RTUNICP)(uch & 0x1f) << 6);
549 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
550 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
551 break;
552 }
553
554 /* advance */
555 cch -= cb;
556 puch += cb;
557 cwc++;
558 }
559 }
560
561 /* done */
562 *pcwc = cwc;
563 return VINF_SUCCESS;
564}
565
566
567/**
568 * Recodes a valid UTF-8 string as UTF-16.
569 *
570 * Since we know the input is valid, we do *not* perform encoding or length checks.
571 *
572 * @returns iprt status code.
573 * @param psz The UTF-8 string to recode. This is a valid encoding.
574 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
575 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
576 * @param pwsz Where to store the UTF-16 string.
577 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
578 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
579 */
580static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc)
581{
582 int rc = VINF_SUCCESS;
583 const unsigned char *puch = (const unsigned char *)psz;
584 const PRTUTF16 pwszEnd = pwsz + cwc;
585 PRTUTF16 pwc = pwsz;
586 Assert(pwszEnd >= pwc);
587 while (cch > 0)
588 {
589 /* read the next char and check for terminator. */
590 const unsigned char uch = *puch;
591 if (!uch)
592 break;
593
594 /* check for output overflow */
595 if (pwc >= pwszEnd)
596 {
597 rc = VERR_BUFFER_OVERFLOW;
598 break;
599 }
600
601 /* decode and recode the code point */
602 if (!(uch & RT_BIT(7)))
603 {
604 *pwc++ = uch;
605 puch++;
606 cch--;
607 }
608 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
609 {
610 uint16_t uc = (puch[1] & 0x3f)
611 | ((uint16_t)(uch & 0x1f) << 6);
612 *pwc++ = uc;
613 puch += 2;
614 cch -= 2;
615 }
616 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
617 {
618 uint16_t uc = (puch[2] & 0x3f)
619 | ((uint16_t)(puch[1] & 0x3f) << 6)
620 | ((uint16_t)(uch & 0x0f) << 12);
621 *pwc++ = uc;
622 puch += 3;
623 cch -= 3;
624 }
625 else
626 {
627 /* generate surrugate pair */
628 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
629 RTUNICP uc = (puch[3] & 0x3f)
630 | ((RTUNICP)(puch[2] & 0x3f) << 6)
631 | ((RTUNICP)(puch[1] & 0x3f) << 12)
632 | ((RTUNICP)(uch & 0x07) << 18);
633 if (pwc + 1 >= pwszEnd)
634 {
635 rc = VERR_BUFFER_OVERFLOW;
636 break;
637 }
638 uc -= 0x10000;
639 *pwc++ = 0xd800 | (uc >> 10);
640 *pwc++ = 0xdc00 | (uc & 0x3ff);
641 puch += 4;
642 cch -= 4;
643 }
644 }
645
646 /* done */
647 *pwc = '\0';
648 *pcwc = pwc - pwsz;
649 return rc;
650}
651
652
653RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
654{
655 /*
656 * Validate input.
657 */
658 Assert(VALID_PTR(ppwszString));
659 Assert(VALID_PTR(pszString));
660 *ppwszString = NULL;
661
662 /*
663 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
664 */
665 size_t cwc;
666 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
667 if (RT_SUCCESS(rc))
668 {
669 /*
670 * Allocate buffer.
671 */
672 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
673 if (pwsz)
674 {
675 /*
676 * Encode the UTF-16 string.
677 */
678 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
679 if (RT_SUCCESS(rc))
680 {
681 *ppwszString = pwsz;
682 return rc;
683 }
684 RTMemFree(pwsz);
685 }
686 else
687 rc = VERR_NO_UTF16_MEMORY;
688 }
689 return rc;
690}
691
692
693RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
694{
695 /*
696 * Validate input.
697 */
698 Assert(VALID_PTR(pszString));
699 Assert(VALID_PTR(ppwsz));
700 Assert(!pcwc || VALID_PTR(pcwc));
701
702 /*
703 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
704 */
705 size_t cwcResult;
706 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
707 if (RT_SUCCESS(rc))
708 {
709 if (pcwc)
710 *pcwc = cwcResult;
711
712 /*
713 * Check buffer size / Allocate buffer.
714 */
715 bool fShouldFree;
716 PRTUTF16 pwszResult;
717 if (cwc > 0 && *ppwsz)
718 {
719 fShouldFree = false;
720 if (cwc <= cwcResult)
721 return VERR_BUFFER_OVERFLOW;
722 pwszResult = *ppwsz;
723 }
724 else
725 {
726 *ppwsz = NULL;
727 fShouldFree = true;
728 cwc = RT_MAX(cwcResult + 1, cwc);
729 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
730 }
731 if (pwszResult)
732 {
733 /*
734 * Encode the UTF-16 string.
735 */
736 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
737 if (RT_SUCCESS(rc))
738 {
739 *ppwsz = pwszResult;
740 return rc;
741 }
742 if (fShouldFree)
743 RTMemFree(pwszResult);
744 }
745 else
746 rc = VERR_NO_UTF16_MEMORY;
747 }
748 return rc;
749}
750
751
752RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
753{
754 size_t cwc;
755 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
756 return RT_SUCCESS(rc) ? cwc : 0;
757}
758
759
760RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
761{
762 size_t cwc;
763 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
764 if (pcwc)
765 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
766 return rc;
767}
768
769
770/**
771 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
772 * @returns rc
773 * @param ppsz The pointer to the string position point.
774 * @param pCp Where to store RTUNICP_INVALID.
775 * @param rc The iprt error code.
776 */
777static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
778{
779 /*
780 * Try find a valid encoding.
781 */
782 (*ppsz)++; /** @todo code this! */
783 *pCp = RTUNICP_INVALID;
784 return rc;
785}
786
787
788RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
789{
790 RTUNICP Cp;
791 RTStrGetCpExInternal(&psz, &Cp);
792 return Cp;
793}
794
795
796RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
797{
798 const unsigned char *puch = (const unsigned char *)*ppsz;
799 const unsigned char uch = *puch;
800 RTUNICP uc;
801
802 /* ASCII ? */
803 if (!(uch & RT_BIT(7)))
804 {
805 uc = uch;
806 puch++;
807 }
808 else if (uch & RT_BIT(6))
809 {
810 /* figure the length and validate the first octet. */
811 unsigned cb;
812 if (!(uch & RT_BIT(5)))
813 cb = 2;
814 else if (!(uch & RT_BIT(4)))
815 cb = 3;
816 else if (!(uch & RT_BIT(3)))
817 cb = 4;
818 else if (!(uch & RT_BIT(2)))
819 cb = 5;
820 else if (!(uch & RT_BIT(1)))
821 cb = 6;
822 else
823 {
824 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
825 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
826 }
827
828 /* validate the rest */
829 switch (cb)
830 {
831 case 6:
832 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
833 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
834 case 5:
835 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
836 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
837 case 4:
838 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
839 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
840 case 3:
841 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
842 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
843 case 2:
844 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
845 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
846 break;
847 }
848
849 /* get and validate the code point. */
850 switch (cb)
851 {
852 case 6:
853 uc = (puch[5] & 0x3f)
854 | ((RTUNICP)(puch[4] & 0x3f) << 6)
855 | ((RTUNICP)(puch[3] & 0x3f) << 12)
856 | ((RTUNICP)(puch[2] & 0x3f) << 18)
857 | ((RTUNICP)(puch[1] & 0x3f) << 24)
858 | ((RTUNICP)(uch & 0x01) << 30);
859 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
860 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
861 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
862 break;
863 case 5:
864 uc = (puch[4] & 0x3f)
865 | ((RTUNICP)(puch[3] & 0x3f) << 6)
866 | ((RTUNICP)(puch[2] & 0x3f) << 12)
867 | ((RTUNICP)(puch[1] & 0x3f) << 18)
868 | ((RTUNICP)(uch & 0x03) << 24);
869 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
870 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
871 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
872 break;
873 case 4:
874 uc = (puch[3] & 0x3f)
875 | ((RTUNICP)(puch[2] & 0x3f) << 6)
876 | ((RTUNICP)(puch[1] & 0x3f) << 12)
877 | ((RTUNICP)(uch & 0x07) << 18);
878 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
879 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
880 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
881 break;
882 case 3:
883 uc = (puch[2] & 0x3f)
884 | ((RTUNICP)(puch[1] & 0x3f) << 6)
885 | ((RTUNICP)(uch & 0x0f) << 12);
886 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
887 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
888 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
889 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
890 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
891 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
892 break;
893 case 2:
894 uc = (puch[1] & 0x3f)
895 | ((RTUNICP)(uch & 0x1f) << 6);
896 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
897 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
898 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
899 break;
900 default: /* impossible, but GCC is bitching. */
901 uc = RTUNICP_INVALID;
902 break;
903 }
904 puch += cb;
905 }
906 else
907 {
908 /* 6th bit is always set. */
909 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
910 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
911 }
912 *pCp = uc;
913 *ppsz = (const char *)puch;
914 return VINF_SUCCESS;
915}
916
917
918/**
919 * Handle invalid encodings passed to RTStrGetCpNEx().
920 * @returns rc
921 * @param ppsz The pointer to the string position point.
922 * @param pcch Pointer to the string length.
923 * @param pCp Where to store RTUNICP_INVALID.
924 * @param rc The iprt error code.
925 */
926static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
927{
928 /*
929 * Try find a valid encoding.
930 */
931 (*ppsz)++; /** @todo code this! */
932 (*pcch)--;
933 *pCp = RTUNICP_INVALID;
934 return rc;
935}
936
937
938RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
939{
940 const unsigned char *puch = (const unsigned char *)*ppsz;
941 const unsigned char uch = *puch;
942 size_t cch = *pcch;
943 RTUNICP uc;
944
945 if (cch == 0)
946 {
947 *pCp = RTUNICP_INVALID;
948 return VERR_END_OF_STRING;
949 }
950
951 /* ASCII ? */
952 if (!(uch & RT_BIT(7)))
953 {
954 uc = uch;
955 puch++;
956 cch--;
957 }
958 else if (uch & RT_BIT(6))
959 {
960 /* figure the length and validate the first octet. */
961 unsigned cb;
962 if (!(uch & RT_BIT(5)))
963 cb = 2;
964 else if (!(uch & RT_BIT(4)))
965 cb = 3;
966 else if (!(uch & RT_BIT(3)))
967 cb = 4;
968 else if (!(uch & RT_BIT(2)))
969 cb = 5;
970 else if (!(uch & RT_BIT(1)))
971 cb = 6;
972 else
973 {
974 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
975 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
976 }
977
978 if (cb > cch)
979 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
980
981 /* validate the rest */
982 switch (cb)
983 {
984 case 6:
985 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
986 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
987 case 5:
988 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
989 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
990 case 4:
991 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
992 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
993 case 3:
994 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
995 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
996 case 2:
997 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
998 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
999 break;
1000 }
1001
1002 /* get and validate the code point. */
1003 switch (cb)
1004 {
1005 case 6:
1006 uc = (puch[5] & 0x3f)
1007 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1008 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1009 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1010 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1011 | ((RTUNICP)(uch & 0x01) << 30);
1012 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1013 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1014 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1015 break;
1016 case 5:
1017 uc = (puch[4] & 0x3f)
1018 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1019 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1020 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1021 | ((RTUNICP)(uch & 0x03) << 24);
1022 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1023 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1024 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1025 break;
1026 case 4:
1027 uc = (puch[3] & 0x3f)
1028 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1029 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1030 | ((RTUNICP)(uch & 0x07) << 18);
1031 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1032 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1033 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1034 break;
1035 case 3:
1036 uc = (puch[2] & 0x3f)
1037 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1038 | ((RTUNICP)(uch & 0x0f) << 12);
1039 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1040 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1041 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1042 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1043 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1044 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1045 break;
1046 case 2:
1047 uc = (puch[1] & 0x3f)
1048 | ((RTUNICP)(uch & 0x1f) << 6);
1049 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1050 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1051 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1052 break;
1053 default: /* impossible, but GCC is bitching. */
1054 uc = RTUNICP_INVALID;
1055 break;
1056 }
1057 puch += cb;
1058 cch -= cb;
1059 }
1060 else
1061 {
1062 /* 6th bit is always set. */
1063 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1064 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1065 }
1066 *pCp = uc;
1067 *ppsz = (const char *)puch;
1068 (*pcch) = cch;
1069 return VINF_SUCCESS;
1070}
1071
1072
1073RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1074{
1075 unsigned char *puch = (unsigned char *)psz;
1076 if (uc < 0x80)
1077 *puch++ = (unsigned char )uc;
1078 else if (uc < 0x00000800)
1079 {
1080 *puch++ = 0xc0 | (uc >> 6);
1081 *puch++ = 0x80 | (uc & 0x3f);
1082 }
1083 else if (uc < 0x00010000)
1084 {
1085 if ( uc < 0x0000d8000
1086 || ( uc > 0x0000dfff
1087 && uc < 0x0000fffe))
1088 {
1089 *puch++ = 0xe0 | (uc >> 12);
1090 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1091 *puch++ = 0x80 | (uc & 0x3f);
1092 }
1093 else
1094 {
1095 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1096 *puch++ = 0x7f;
1097 }
1098 }
1099 else if (uc < 0x00200000)
1100 {
1101 *puch++ = 0xf0 | (uc >> 18);
1102 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1103 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1104 *puch++ = 0x80 | (uc & 0x3f);
1105 }
1106 else if (uc < 0x04000000)
1107 {
1108 *puch++ = 0xf1 | (uc >> 24);
1109 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1110 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1111 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1112 *puch++ = 0x80 | (uc & 0x3f);
1113 }
1114 else if (uc <= 0x7fffffff)
1115 {
1116 *puch++ = 0xf3 | (uc >> 30);
1117 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1118 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1119 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1120 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1121 *puch++ = 0x80 | (uc & 0x3f);
1122 }
1123 else
1124 {
1125 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1126 *puch++ = 0x7f;
1127 }
1128
1129 return (char *)puch;
1130}
1131
1132
1133RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1134{
1135 if (pszStart < psz)
1136 {
1137 /* simple char? */
1138 const unsigned char *puch = (const unsigned char *)psz;
1139 unsigned uch = *--puch;
1140 if (!(uch & RT_BIT(7)))
1141 return (char *)puch;
1142 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1143
1144 /* two or more. */
1145 uint32_t uMask = 0xffffffc0;
1146 while ( (const unsigned char *)pszStart < puch
1147 && !(uMask & 1))
1148 {
1149 unsigned uch = *--puch;
1150 if ((uch & 0xc0) != 0x80)
1151 {
1152 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1153 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1154 (char *)pszStart);
1155 return (char *)puch;
1156 }
1157 uMask >>= 1;
1158 }
1159 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1160 }
1161 return (char *)pszStart;
1162}
1163
1164
1165/**
1166 * Performs a case sensitive string compare between two UTF-8 strings.
1167 *
1168 * Encoding errors are ignored by the current implementation. So, the only
1169 * difference between this and the CRT strcmp function is the handling of
1170 * NULL arguments.
1171 *
1172 * @returns < 0 if the first string less than the second string.
1173 * @returns 0 if the first string identical to the second string.
1174 * @returns > 0 if the first string greater than the second string.
1175 * @param psz1 First UTF-8 string. Null is allowed.
1176 * @param psz2 Second UTF-8 string. Null is allowed.
1177 */
1178RTDECL(int) RTStrCmp(const char *psz1, const char *psz2)
1179{
1180 if (psz1 == psz2)
1181 return 0;
1182 if (!psz1)
1183 return -1;
1184 if (!psz2)
1185 return 1;
1186
1187 return strcmp(psz1, psz2);
1188}
1189
1190
1191/**
1192 * Performs a case sensitive string compare between two UTF-8 strings, given
1193 * a maximum string length.
1194 *
1195 * Encoding errors are ignored by the current implementation. So, the only
1196 * difference between this and the CRT strncmp function is the handling of
1197 * NULL arguments.
1198 *
1199 * @returns < 0 if the first string less than the second string.
1200 * @returns 0 if the first string identical to the second string.
1201 * @returns > 0 if the first string greater than the second string.
1202 * @param psz1 First UTF-8 string. Null is allowed.
1203 * @param psz2 Second UTF-8 string. Null is allowed.
1204 * @param cchMax The maximum string length
1205 */
1206RTDECL(int) RTStrNCmp(const char *psz1, const char *psz2, size_t cchMax)
1207{
1208 if (psz1 == psz2)
1209 return 0;
1210 if (!psz1)
1211 return -1;
1212 if (!psz2)
1213 return 1;
1214
1215 return strncmp(psz1, psz2, cchMax);
1216}
1217
1218
1219/**
1220 * Performs a case insensitive string compare between two UTF-8 strings.
1221 *
1222 * This is a simplified compare, as only the simplified lower/upper case folding
1223 * specified by the unicode specs are used. It does not consider character pairs
1224 * as they are used in some languages, just simple upper & lower case compares.
1225 *
1226 * The result is the difference between the mismatching codepoints after they
1227 * both have been lower cased.
1228 *
1229 * If the string encoding is invalid the function will assert (strict builds)
1230 * and use RTStrCmp for the remainder of the string.
1231 *
1232 * @returns < 0 if the first string less than the second string.
1233 * @returns 0 if the first string identical to the second string.
1234 * @returns > 0 if the first string greater than the second string.
1235 * @param psz1 First UTF-8 string. Null is allowed.
1236 * @param psz2 Second UTF-8 string. Null is allowed.
1237 */
1238RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
1239{
1240 if (psz1 == psz2)
1241 return 0;
1242 if (!psz1)
1243 return -1;
1244 if (!psz2)
1245 return 1;
1246
1247 const char *pszStart1 = psz1;
1248 for (;;)
1249 {
1250 /* Get the codepoints */
1251 RTUNICP cp1;
1252 int rc = RTStrGetCpEx(&psz1, &cp1);
1253 if (RT_FAILURE(rc))
1254 {
1255 AssertRC(rc);
1256 psz1--;
1257 break;
1258 }
1259
1260 RTUNICP cp2;
1261 rc = RTStrGetCpEx(&psz2, &cp2);
1262 if (RT_FAILURE(rc))
1263 {
1264 AssertRC(rc);
1265 psz2--;
1266 psz1 = RTStrPrevCp(pszStart1, psz1);
1267 break;
1268 }
1269
1270 /* compare */
1271 int iDiff = cp1 - cp2;
1272 if (iDiff)
1273 {
1274 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1275 if (iDiff)
1276 {
1277 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1278 if (iDiff)
1279 return iDiff;
1280 }
1281 }
1282
1283 /* hit the terminator? */
1284 if (!cp1)
1285 return 0;
1286 }
1287
1288 /* Hit some bad encoding, continue in case insensitive mode. */
1289 return RTStrCmp(psz1, psz2);
1290}
1291
1292
1293/**
1294 * Performs a case insensitive string compare between two UTF-8 strings, given a
1295 * maximum string length.
1296 *
1297 * This is a simplified compare, as only the simplified lower/upper case folding
1298 * specified by the unicode specs are used. It does not consider character pairs
1299 * as they are used in some languages, just simple upper & lower case compares.
1300 *
1301 * The result is the difference between the mismatching codepoints after they
1302 * both have been lower cased.
1303 *
1304 * If the string encoding is invalid the function will assert (strict builds)
1305 * and use RTStrCmp for the remainder of the string.
1306 *
1307 * @returns < 0 if the first string less than the second string.
1308 * @returns 0 if the first string identical to the second string.
1309 * @returns > 0 if the first string greater than the second string.
1310 * @param psz1 First UTF-8 string. Null is allowed.
1311 * @param psz2 Second UTF-8 string. Null is allowed.
1312 * @param cchMax Maximum string length
1313 */
1314RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
1315{
1316 if (cchMax == 0)
1317 return 0;
1318 if (psz1 == psz2)
1319 return 0;
1320 if (!psz1)
1321 return -1;
1322 if (!psz2)
1323 return 1;
1324
1325 for (;;)
1326 {
1327 /* Get the codepoints */
1328 RTUNICP cp1;
1329 size_t cchMax2 = cchMax;
1330 int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);
1331 if (RT_FAILURE(rc))
1332 {
1333 AssertRC(rc);
1334 psz1--;
1335 cchMax++;
1336 break;
1337 }
1338
1339 RTUNICP cp2;
1340 rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);
1341 if (RT_FAILURE(rc))
1342 {
1343 AssertRC(rc);
1344 psz2--;
1345 psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
1346 cchMax = cchMax2 + 1;
1347 break;
1348 }
1349
1350 /* compare */
1351 int iDiff = cp1 - cp2;
1352 if (iDiff)
1353 {
1354 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1355 if (iDiff)
1356 {
1357 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1358 if (iDiff)
1359 return iDiff;
1360 }
1361 }
1362
1363 /* hit the terminator? */
1364 if (!cp1 || cchMax == 0)
1365 return 0;
1366 }
1367
1368 /* Hit some bad encoding, continue in case insensitive mode. */
1369 return RTStrNCmp(psz1, psz2, cchMax);
1370}
1371
1372
1373RTDECL(char *) RTStrStr(const char *pszHaystack, const char *pszNeedle)
1374{
1375 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1376 if (!pszHaystack)
1377 return NULL;
1378 if (!pszNeedle)
1379 return NULL;
1380
1381 /* The rest is CRT. */
1382 return (char *)strstr(pszHaystack, pszNeedle);
1383}
1384
1385
1386RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
1387{
1388 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1389 if (!pszHaystack)
1390 return NULL;
1391 if (!pszNeedle)
1392 return NULL;
1393
1394 /* The empty string matches everything. */
1395 if (!*pszNeedle)
1396 return (char *)pszHaystack;
1397
1398 /*
1399 * The search strategy is to pick out the first char of the needle, fold it,
1400 * and match it against the haystack code point by code point. When encountering
1401 * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
1402 */
1403 const char * const pszNeedleStart = pszNeedle;
1404 RTUNICP Cp0;
1405 RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
1406 size_t const cchNeedle = strlen(pszNeedle);
1407 size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
1408 RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
1409 RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
1410 if ( Cp0Lower == Cp0Upper
1411 && Cp0Lower == Cp0)
1412 {
1413 /* Cp0 is not a case sensitive char. */
1414 for (;;)
1415 {
1416 RTUNICP Cp;
1417 RTStrGetCpEx(&pszHaystack, &Cp);
1418 if (!Cp)
1419 break;
1420 if ( Cp == Cp0
1421 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1422 return (char *)pszHaystack - cchNeedleCp0;
1423 }
1424 }
1425 else if ( Cp0Lower == Cp0
1426 || Cp0Upper != Cp0)
1427 {
1428 /* Cp0 is case sensitive */
1429 for (;;)
1430 {
1431 RTUNICP Cp;
1432 RTStrGetCpEx(&pszHaystack, &Cp);
1433 if (!Cp)
1434 break;
1435 if ( ( Cp == Cp0Upper
1436 || Cp == Cp0Lower)
1437 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1438 return (char *)pszHaystack - cchNeedleCp0;
1439 }
1440 }
1441 else
1442 {
1443 /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
1444 for (;;)
1445 {
1446 RTUNICP Cp;
1447 RTStrGetCpEx(&pszHaystack, &Cp);
1448 if (!Cp)
1449 break;
1450 if ( ( Cp == Cp0
1451 || Cp == Cp0Upper
1452 || Cp == Cp0Lower)
1453 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1454 return (char *)pszHaystack - cchNeedleCp0;
1455 }
1456 }
1457
1458
1459 return NULL;
1460}
1461
1462
1463RTDECL(char *) RTStrToLower(char *psz)
1464{
1465 /*
1466 * Loop the code points in the string, converting them one by one.
1467 * ASSUMES that the code points for upper and lower case are encoded
1468 * with the exact same length.
1469 */
1470 /** @todo Handled bad encodings correctly+quietly, remove assumption,
1471 * optimize. */
1472 char *pszCur = psz;
1473 while (*pszCur)
1474 {
1475 RTUNICP cp = RTStrGetCp(pszCur);
1476 cp = RTUniCpToLower(cp);
1477 pszCur = RTStrPutCp(pszCur, cp);
1478 }
1479 return psz;
1480}
1481
1482
1483RTDECL(char *) RTStrToUpper(char *psz)
1484{
1485 /*
1486 * Loop the code points in the string, converting them one by one.
1487 * ASSUMES that the code points for upper and lower case are encoded
1488 * with the exact same length.
1489 */
1490 /** @todo Handled bad encodings correctly+quietly, remove assumption,
1491 * optimize. */
1492 char *pszCur = psz;
1493 while(*pszCur)
1494 {
1495 RTUNICP cp = RTStrGetCp(pszCur);
1496 cp = RTUniCpToUpper(cp);
1497 pszCur = RTStrPutCp(pszCur, cp);
1498 }
1499 return psz;
1500}
1501
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette