VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 21337

Last change on this file since 21337 was 21337, checked in by vboxsync, 16 years ago

IPRT,HostDrv,AddDrv: Export public IPRT symbols for the linux kernel (pain).

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 53.8 KB
Line 
1/* $Id: utf-8.cpp 21337 2009-07-07 14:58:27Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31
32/*******************************************************************************
33* Header Files *
34*******************************************************************************/
35#include <iprt/string.h>
36#include "internal/iprt.h"
37
38#include <iprt/uni.h>
39#include <iprt/alloc.h>
40#include <iprt/assert.h>
41#include <iprt/err.h>
42#include "internal/string.h"
43
44
45
46/**
47 * Get get length in code points of a UTF-8 encoded string.
48 * The string is validated while doing this.
49 *
50 * @returns IPRT status code.
51 * @param psz Pointer to the UTF-8 string.
52 * @param cch The max length of the string. (btw cch = cb)
53 * Use RTSTR_MAX if all of the string is to be examined.
54 * @param pcuc Where to store the length in unicode code points.
55 * @param pcchActual Where to store the actual size of the UTF-8 string
56 * on success (cch = cb again). Optional.
57 */
58static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
59{
60 const unsigned char *puch = (const unsigned char *)psz;
61 size_t cCodePoints = 0;
62 while (cch > 0)
63 {
64 const unsigned char uch = *puch;
65 if (!uch)
66 break;
67 if (uch & RT_BIT(7))
68 {
69 /* figure sequence length and validate the first byte */
70 unsigned cb;
71 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
72 cb = 2;
73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
74 cb = 3;
75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
76 cb = 4;
77 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
78 cb = 5;
79 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
80 cb = 6;
81 else
82 {
83 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
84 return VERR_INVALID_UTF8_ENCODING;
85 }
86
87 /* check length */
88 if (cb > cch)
89 {
90 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
91 return VERR_INVALID_UTF8_ENCODING;
92 }
93
94 /* validate the rest */
95 switch (cb)
96 {
97 case 6:
98 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99 case 5:
100 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101 case 4:
102 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103 case 3:
104 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
105 case 2:
106 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
107 break;
108 }
109
110 /* validate the code point. */
111 RTUNICP uc;
112 switch (cb)
113 {
114 case 6:
115 uc = (puch[5] & 0x3f)
116 | ((RTUNICP)(puch[4] & 0x3f) << 6)
117 | ((RTUNICP)(puch[3] & 0x3f) << 12)
118 | ((RTUNICP)(puch[2] & 0x3f) << 18)
119 | ((RTUNICP)(puch[1] & 0x3f) << 24)
120 | ((RTUNICP)(uch & 0x01) << 30);
121 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
122 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
123 break;
124 case 5:
125 uc = (puch[4] & 0x3f)
126 | ((RTUNICP)(puch[3] & 0x3f) << 6)
127 | ((RTUNICP)(puch[2] & 0x3f) << 12)
128 | ((RTUNICP)(puch[1] & 0x3f) << 18)
129 | ((RTUNICP)(uch & 0x03) << 24);
130 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
131 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
132 break;
133 case 4:
134 uc = (puch[3] & 0x3f)
135 | ((RTUNICP)(puch[2] & 0x3f) << 6)
136 | ((RTUNICP)(puch[1] & 0x3f) << 12)
137 | ((RTUNICP)(uch & 0x07) << 18);
138 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
139 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
140 break;
141 case 3:
142 uc = (puch[2] & 0x3f)
143 | ((RTUNICP)(puch[1] & 0x3f) << 6)
144 | ((RTUNICP)(uch & 0x0f) << 12);
145 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
146 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
147 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
148 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
149 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
150 break;
151 case 2:
152 uc = (puch[1] & 0x3f)
153 | ((RTUNICP)(uch & 0x1f) << 6);
154 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
155 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
156 break;
157 }
158
159 /* advance */
160 cch -= cb;
161 puch += cb;
162 }
163 else
164 {
165 /* one ASCII byte */
166 puch++;
167 cch--;
168 }
169 cCodePoints++;
170 }
171
172 /* done */
173 *pcuc = cCodePoints;
174 if (pcchActual)
175 *pcchActual = puch - (unsigned char const *)psz;
176 return VINF_SUCCESS;
177}
178
179
180/**
181 * Decodes and UTF-8 string into an array of unicode code point.
182 *
183 * Since we know the input is valid, we do *not* perform encoding or length checks.
184 *
185 * @returns iprt status code.
186 * @param psz The UTF-8 string to recode. This is a valid encoding.
187 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
188 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
189 * @param paCps Where to store the code points array.
190 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
191 * @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
192 */
193static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t *pcCps)
194{
195 int rc = VINF_SUCCESS;
196 const unsigned char *puch = (const unsigned char *)psz;
197 const PRTUNICP pCpEnd = paCps + cCps;
198 PRTUNICP pCp = paCps;
199 Assert(pCpEnd >= pCp);
200 while (cch > 0)
201 {
202 /* read the next char and check for terminator. */
203 const unsigned char uch = *puch;
204 if (!uch)
205 break;
206
207 /* check for output overflow */
208 if (pCp >= pCpEnd)
209 {
210 rc = VERR_BUFFER_OVERFLOW;
211 break;
212 }
213
214 /* decode and recode the code point */
215 if (!(uch & RT_BIT(7)))
216 {
217 *pCp++ = uch;
218 puch++;
219 cch--;
220 }
221#ifdef RT_STRICT
222 else if (!(uch & RT_BIT(6)))
223 AssertMsgFailed(("Internal error!\n"));
224#endif
225 else if (!(uch & RT_BIT(5)))
226 {
227 *pCp++ = (puch[1] & 0x3f)
228 | ((uint16_t)(uch & 0x1f) << 6);
229 puch += 2;
230 cch -= 2;
231 }
232 else if (!(uch & RT_BIT(4)))
233 {
234 *pCp++ = (puch[2] & 0x3f)
235 | ((uint16_t)(puch[1] & 0x3f) << 6)
236 | ((uint16_t)(uch & 0x0f) << 12);
237 puch += 3;
238 cch -= 3;
239 }
240 else if (!(uch & RT_BIT(3)))
241 {
242 *pCp++ = (puch[3] & 0x3f)
243 | ((RTUNICP)(puch[2] & 0x3f) << 6)
244 | ((RTUNICP)(puch[1] & 0x3f) << 12)
245 | ((RTUNICP)(uch & 0x07) << 18);
246 puch += 4;
247 cch -= 4;
248 }
249 else if (!(uch & RT_BIT(2)))
250 {
251 *pCp++ = (puch[4] & 0x3f)
252 | ((RTUNICP)(puch[3] & 0x3f) << 6)
253 | ((RTUNICP)(puch[2] & 0x3f) << 12)
254 | ((RTUNICP)(puch[1] & 0x3f) << 18)
255 | ((RTUNICP)(uch & 0x03) << 24);
256 puch += 5;
257 cch -= 6;
258 }
259 else
260 {
261 Assert(!(uch & RT_BIT(1)));
262 *pCp++ = (puch[5] & 0x3f)
263 | ((RTUNICP)(puch[4] & 0x3f) << 6)
264 | ((RTUNICP)(puch[3] & 0x3f) << 12)
265 | ((RTUNICP)(puch[2] & 0x3f) << 18)
266 | ((RTUNICP)(puch[1] & 0x3f) << 24)
267 | ((RTUNICP)(uch & 0x01) << 30);
268 puch += 6;
269 cch -= 6;
270 }
271 }
272
273 /* done */
274 *pCp = 0;
275 *pcCps = pCp - paCps;
276 return rc;
277}
278
279
280RTDECL(size_t) RTStrUniLen(const char *psz)
281{
282 size_t cCodePoints;
283 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
284 return RT_SUCCESS(rc) ? cCodePoints : 0;
285}
286RT_EXPORT_SYMBOL(RTStrUniLen);
287
288
289RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
290{
291 size_t cCodePoints;
292 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
293 if (pcCps)
294 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
295 return rc;
296}
297RT_EXPORT_SYMBOL(RTStrUniLenEx);
298
299
300RTDECL(int) RTStrValidateEncoding(const char *psz)
301{
302 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
303}
304RT_EXPORT_SYMBOL(RTStrValidateEncoding);
305
306
307RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
308{
309 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
310 AssertPtr(psz);
311
312 /*
313 * Use rtUtf8Length for the job.
314 */
315 size_t cchActual;
316 size_t cCpsIgnored;
317 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
318 if (RT_SUCCESS(rc))
319 {
320 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
321 && cchActual >= cch)
322 rc = VERR_BUFFER_OVERFLOW;
323 }
324 return rc;
325
326
327 return RTStrUniLenEx(psz, cch, &cCpsIgnored);
328}
329RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
330
331
332RTDECL(bool) RTStrIsValidEncoding(const char *psz)
333{
334 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
335 return RT_SUCCESS(rc);
336}
337RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
338
339
340RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
341{
342 /*
343 * Validate input.
344 */
345 Assert(VALID_PTR(pszString));
346 Assert(VALID_PTR(ppaCps));
347 *ppaCps = NULL;
348
349 /*
350 * Validate the UTF-8 input and count its code points.
351 */
352 size_t cCps;
353 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
354 if (RT_SUCCESS(rc))
355 {
356 /*
357 * Allocate buffer.
358 */
359 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
360 if (paCps)
361 {
362 /*
363 * Decode the string.
364 */
365 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
366 if (RT_SUCCESS(rc))
367 {
368 *ppaCps = paCps;
369 return rc;
370 }
371 RTMemFree(paCps);
372 }
373 else
374 rc = VERR_NO_CODE_POINT_MEMORY;
375 }
376 return rc;
377}
378RT_EXPORT_SYMBOL(RTStrToUni);
379
380
381RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
382{
383 /*
384 * Validate input.
385 */
386 Assert(VALID_PTR(pszString));
387 Assert(VALID_PTR(ppaCps));
388 Assert(!pcCps || VALID_PTR(pcCps));
389
390 /*
391 * Validate the UTF-8 input and count the code points.
392 */
393 size_t cCpsResult;
394 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
395 if (RT_SUCCESS(rc))
396 {
397 if (pcCps)
398 *pcCps = cCpsResult;
399
400 /*
401 * Check buffer size / Allocate buffer.
402 */
403 bool fShouldFree;
404 PRTUNICP paCpsResult;
405 if (cCps > 0 && *ppaCps)
406 {
407 fShouldFree = false;
408 if (cCps <= cCpsResult)
409 return VERR_BUFFER_OVERFLOW;
410 paCpsResult = *ppaCps;
411 }
412 else
413 {
414 *ppaCps = NULL;
415 fShouldFree = true;
416 cCps = RT_MAX(cCpsResult + 1, cCps);
417 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
418 }
419 if (paCpsResult)
420 {
421 /*
422 * Encode the UTF-16 string.
423 */
424 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
425 if (RT_SUCCESS(rc))
426 {
427 *ppaCps = paCpsResult;
428 return rc;
429 }
430 if (fShouldFree)
431 RTMemFree(paCpsResult);
432 }
433 else
434 rc = VERR_NO_CODE_POINT_MEMORY;
435 }
436 return rc;
437}
438RT_EXPORT_SYMBOL(RTStrToUniEx);
439
440
441/**
442 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
443 *
444 * @returns IPRT status code.
445 * @param psz Pointer to the UTF-8 string.
446 * @param cch The max length of the string. (btw cch = cb)
447 * Use RTSTR_MAX if all of the string is to be examined.s
448 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
449 */
450static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
451{
452 const unsigned char *puch = (const unsigned char *)psz;
453 size_t cwc = 0;
454 while (cch > 0)
455 {
456 const unsigned char uch = *puch;
457 if (!uch)
458 break;
459 if (!(uch & RT_BIT(7)))
460 {
461 /* one ASCII byte */
462 cwc++;
463 puch++;
464 cch--;
465 }
466 else
467 {
468 /* figure sequence length and validate the first byte */
469 unsigned cb;
470 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
471 cb = 2;
472 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
473 cb = 3;
474 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
475 cb = 4;
476 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
477 cb = 5;
478 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
479 cb = 6;
480 else
481 {
482 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
483 return VERR_INVALID_UTF8_ENCODING;
484 }
485
486 /* check length */
487 if (cb > cch)
488 {
489 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
490 return VERR_INVALID_UTF8_ENCODING;
491 }
492
493 /* validate the rest */
494 switch (cb)
495 {
496 case 6:
497 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
498 case 5:
499 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
500 case 4:
501 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
502 case 3:
503 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
504 case 2:
505 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
506 break;
507 }
508
509 /* validate the code point. */
510 RTUNICP uc;
511 switch (cb)
512 {
513 case 6:
514 uc = (puch[5] & 0x3f)
515 | ((RTUNICP)(puch[4] & 0x3f) << 6)
516 | ((RTUNICP)(puch[3] & 0x3f) << 12)
517 | ((RTUNICP)(puch[2] & 0x3f) << 18)
518 | ((RTUNICP)(puch[1] & 0x3f) << 24)
519 | ((RTUNICP)(uch & 0x01) << 30);
520 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
521 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
522 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
523 return VERR_CANT_RECODE_AS_UTF16;
524 case 5:
525 uc = (puch[4] & 0x3f)
526 | ((RTUNICP)(puch[3] & 0x3f) << 6)
527 | ((RTUNICP)(puch[2] & 0x3f) << 12)
528 | ((RTUNICP)(puch[1] & 0x3f) << 18)
529 | ((RTUNICP)(uch & 0x03) << 24);
530 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
531 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
532 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
533 return VERR_CANT_RECODE_AS_UTF16;
534 case 4:
535 uc = (puch[3] & 0x3f)
536 | ((RTUNICP)(puch[2] & 0x3f) << 6)
537 | ((RTUNICP)(puch[1] & 0x3f) << 12)
538 | ((RTUNICP)(uch & 0x07) << 18);
539 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
540 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
541 RTStrAssertMsgReturn(uc <= 0x0010ffff,
542 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
543 cwc++;
544 break;
545 case 3:
546 uc = (puch[2] & 0x3f)
547 | ((RTUNICP)(puch[1] & 0x3f) << 6)
548 | ((RTUNICP)(uch & 0x0f) << 12);
549 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
550 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
551 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
552 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
553 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
554 break;
555 case 2:
556 uc = (puch[1] & 0x3f)
557 | ((RTUNICP)(uch & 0x1f) << 6);
558 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
559 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
560 break;
561 }
562
563 /* advance */
564 cch -= cb;
565 puch += cb;
566 cwc++;
567 }
568 }
569
570 /* done */
571 *pcwc = cwc;
572 return VINF_SUCCESS;
573}
574
575
576/**
577 * Recodes a valid UTF-8 string as UTF-16.
578 *
579 * Since we know the input is valid, we do *not* perform encoding or length checks.
580 *
581 * @returns iprt status code.
582 * @param psz The UTF-8 string to recode. This is a valid encoding.
583 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
584 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
585 * @param pwsz Where to store the UTF-16 string.
586 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
587 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
588 */
589static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc)
590{
591 int rc = VINF_SUCCESS;
592 const unsigned char *puch = (const unsigned char *)psz;
593 const PRTUTF16 pwszEnd = pwsz + cwc;
594 PRTUTF16 pwc = pwsz;
595 Assert(pwszEnd >= pwc);
596 while (cch > 0)
597 {
598 /* read the next char and check for terminator. */
599 const unsigned char uch = *puch;
600 if (!uch)
601 break;
602
603 /* check for output overflow */
604 if (pwc >= pwszEnd)
605 {
606 rc = VERR_BUFFER_OVERFLOW;
607 break;
608 }
609
610 /* decode and recode the code point */
611 if (!(uch & RT_BIT(7)))
612 {
613 *pwc++ = uch;
614 puch++;
615 cch--;
616 }
617 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
618 {
619 uint16_t uc = (puch[1] & 0x3f)
620 | ((uint16_t)(uch & 0x1f) << 6);
621 *pwc++ = uc;
622 puch += 2;
623 cch -= 2;
624 }
625 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
626 {
627 uint16_t uc = (puch[2] & 0x3f)
628 | ((uint16_t)(puch[1] & 0x3f) << 6)
629 | ((uint16_t)(uch & 0x0f) << 12);
630 *pwc++ = uc;
631 puch += 3;
632 cch -= 3;
633 }
634 else
635 {
636 /* generate surrugate pair */
637 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
638 RTUNICP uc = (puch[3] & 0x3f)
639 | ((RTUNICP)(puch[2] & 0x3f) << 6)
640 | ((RTUNICP)(puch[1] & 0x3f) << 12)
641 | ((RTUNICP)(uch & 0x07) << 18);
642 if (pwc + 1 >= pwszEnd)
643 {
644 rc = VERR_BUFFER_OVERFLOW;
645 break;
646 }
647 uc -= 0x10000;
648 *pwc++ = 0xd800 | (uc >> 10);
649 *pwc++ = 0xdc00 | (uc & 0x3ff);
650 puch += 4;
651 cch -= 4;
652 }
653 }
654
655 /* done */
656 *pwc = '\0';
657 *pcwc = pwc - pwsz;
658 return rc;
659}
660
661
662RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
663{
664 /*
665 * Validate input.
666 */
667 Assert(VALID_PTR(ppwszString));
668 Assert(VALID_PTR(pszString));
669 *ppwszString = NULL;
670
671 /*
672 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
673 */
674 size_t cwc;
675 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
676 if (RT_SUCCESS(rc))
677 {
678 /*
679 * Allocate buffer.
680 */
681 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
682 if (pwsz)
683 {
684 /*
685 * Encode the UTF-16 string.
686 */
687 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
688 if (RT_SUCCESS(rc))
689 {
690 *ppwszString = pwsz;
691 return rc;
692 }
693 RTMemFree(pwsz);
694 }
695 else
696 rc = VERR_NO_UTF16_MEMORY;
697 }
698 return rc;
699}
700RT_EXPORT_SYMBOL(RTStrToUtf16);
701
702
703RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
704{
705 /*
706 * Validate input.
707 */
708 Assert(VALID_PTR(pszString));
709 Assert(VALID_PTR(ppwsz));
710 Assert(!pcwc || VALID_PTR(pcwc));
711
712 /*
713 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
714 */
715 size_t cwcResult;
716 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
717 if (RT_SUCCESS(rc))
718 {
719 if (pcwc)
720 *pcwc = cwcResult;
721
722 /*
723 * Check buffer size / Allocate buffer.
724 */
725 bool fShouldFree;
726 PRTUTF16 pwszResult;
727 if (cwc > 0 && *ppwsz)
728 {
729 fShouldFree = false;
730 if (cwc <= cwcResult)
731 return VERR_BUFFER_OVERFLOW;
732 pwszResult = *ppwsz;
733 }
734 else
735 {
736 *ppwsz = NULL;
737 fShouldFree = true;
738 cwc = RT_MAX(cwcResult + 1, cwc);
739 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
740 }
741 if (pwszResult)
742 {
743 /*
744 * Encode the UTF-16 string.
745 */
746 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
747 if (RT_SUCCESS(rc))
748 {
749 *ppwsz = pwszResult;
750 return rc;
751 }
752 if (fShouldFree)
753 RTMemFree(pwszResult);
754 }
755 else
756 rc = VERR_NO_UTF16_MEMORY;
757 }
758 return rc;
759}
760RT_EXPORT_SYMBOL(RTStrToUtf16Ex);
761
762
763RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
764{
765 size_t cwc;
766 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
767 return RT_SUCCESS(rc) ? cwc : 0;
768}
769RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
770
771
772RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
773{
774 size_t cwc;
775 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
776 if (pcwc)
777 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
778 return rc;
779}
780RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
781
782
783/**
784 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
785 * @returns rc
786 * @param ppsz The pointer to the string position point.
787 * @param pCp Where to store RTUNICP_INVALID.
788 * @param rc The iprt error code.
789 */
790static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
791{
792 /*
793 * Try find a valid encoding.
794 */
795 (*ppsz)++; /** @todo code this! */
796 *pCp = RTUNICP_INVALID;
797 return rc;
798}
799
800
801RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
802{
803 RTUNICP Cp;
804 RTStrGetCpExInternal(&psz, &Cp);
805 return Cp;
806}
807RT_EXPORT_SYMBOL(RTStrGetCpInternal);
808
809
810RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
811{
812 const unsigned char *puch = (const unsigned char *)*ppsz;
813 const unsigned char uch = *puch;
814 RTUNICP uc;
815
816 /* ASCII ? */
817 if (!(uch & RT_BIT(7)))
818 {
819 uc = uch;
820 puch++;
821 }
822 else if (uch & RT_BIT(6))
823 {
824 /* figure the length and validate the first octet. */
825 unsigned cb;
826 if (!(uch & RT_BIT(5)))
827 cb = 2;
828 else if (!(uch & RT_BIT(4)))
829 cb = 3;
830 else if (!(uch & RT_BIT(3)))
831 cb = 4;
832 else if (!(uch & RT_BIT(2)))
833 cb = 5;
834 else if (!(uch & RT_BIT(1)))
835 cb = 6;
836 else
837 {
838 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
839 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
840 }
841
842 /* validate the rest */
843 switch (cb)
844 {
845 case 6:
846 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
847 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
848 case 5:
849 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
850 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
851 case 4:
852 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
853 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
854 case 3:
855 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
856 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
857 case 2:
858 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
859 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
860 break;
861 }
862
863 /* get and validate the code point. */
864 switch (cb)
865 {
866 case 6:
867 uc = (puch[5] & 0x3f)
868 | ((RTUNICP)(puch[4] & 0x3f) << 6)
869 | ((RTUNICP)(puch[3] & 0x3f) << 12)
870 | ((RTUNICP)(puch[2] & 0x3f) << 18)
871 | ((RTUNICP)(puch[1] & 0x3f) << 24)
872 | ((RTUNICP)(uch & 0x01) << 30);
873 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
874 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
875 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
876 break;
877 case 5:
878 uc = (puch[4] & 0x3f)
879 | ((RTUNICP)(puch[3] & 0x3f) << 6)
880 | ((RTUNICP)(puch[2] & 0x3f) << 12)
881 | ((RTUNICP)(puch[1] & 0x3f) << 18)
882 | ((RTUNICP)(uch & 0x03) << 24);
883 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
884 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
885 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
886 break;
887 case 4:
888 uc = (puch[3] & 0x3f)
889 | ((RTUNICP)(puch[2] & 0x3f) << 6)
890 | ((RTUNICP)(puch[1] & 0x3f) << 12)
891 | ((RTUNICP)(uch & 0x07) << 18);
892 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
893 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
894 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
895 break;
896 case 3:
897 uc = (puch[2] & 0x3f)
898 | ((RTUNICP)(puch[1] & 0x3f) << 6)
899 | ((RTUNICP)(uch & 0x0f) << 12);
900 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
901 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
902 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
903 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
904 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
905 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
906 break;
907 case 2:
908 uc = (puch[1] & 0x3f)
909 | ((RTUNICP)(uch & 0x1f) << 6);
910 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
911 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
912 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
913 break;
914 default: /* impossible, but GCC is bitching. */
915 uc = RTUNICP_INVALID;
916 break;
917 }
918 puch += cb;
919 }
920 else
921 {
922 /* 6th bit is always set. */
923 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
924 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
925 }
926 *pCp = uc;
927 *ppsz = (const char *)puch;
928 return VINF_SUCCESS;
929}
930RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
931
932
933/**
934 * Handle invalid encodings passed to RTStrGetCpNEx().
935 * @returns rc
936 * @param ppsz The pointer to the string position point.
937 * @param pcch Pointer to the string length.
938 * @param pCp Where to store RTUNICP_INVALID.
939 * @param rc The iprt error code.
940 */
941static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
942{
943 /*
944 * Try find a valid encoding.
945 */
946 (*ppsz)++; /** @todo code this! */
947 (*pcch)--;
948 *pCp = RTUNICP_INVALID;
949 return rc;
950}
951
952
953RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
954{
955 const unsigned char *puch = (const unsigned char *)*ppsz;
956 const unsigned char uch = *puch;
957 size_t cch = *pcch;
958 RTUNICP uc;
959
960 if (cch == 0)
961 {
962 *pCp = RTUNICP_INVALID;
963 return VERR_END_OF_STRING;
964 }
965
966 /* ASCII ? */
967 if (!(uch & RT_BIT(7)))
968 {
969 uc = uch;
970 puch++;
971 cch--;
972 }
973 else if (uch & RT_BIT(6))
974 {
975 /* figure the length and validate the first octet. */
976 unsigned cb;
977 if (!(uch & RT_BIT(5)))
978 cb = 2;
979 else if (!(uch & RT_BIT(4)))
980 cb = 3;
981 else if (!(uch & RT_BIT(3)))
982 cb = 4;
983 else if (!(uch & RT_BIT(2)))
984 cb = 5;
985 else if (!(uch & RT_BIT(1)))
986 cb = 6;
987 else
988 {
989 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
990 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
991 }
992
993 if (cb > cch)
994 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
995
996 /* validate the rest */
997 switch (cb)
998 {
999 case 6:
1000 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1001 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1002 case 5:
1003 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1004 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1005 case 4:
1006 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1007 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1008 case 3:
1009 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1010 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1011 case 2:
1012 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1013 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1014 break;
1015 }
1016
1017 /* get and validate the code point. */
1018 switch (cb)
1019 {
1020 case 6:
1021 uc = (puch[5] & 0x3f)
1022 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1023 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1024 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1025 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1026 | ((RTUNICP)(uch & 0x01) << 30);
1027 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1028 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1029 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1030 break;
1031 case 5:
1032 uc = (puch[4] & 0x3f)
1033 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1034 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1035 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1036 | ((RTUNICP)(uch & 0x03) << 24);
1037 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1038 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1039 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1040 break;
1041 case 4:
1042 uc = (puch[3] & 0x3f)
1043 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1044 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1045 | ((RTUNICP)(uch & 0x07) << 18);
1046 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1047 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1048 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1049 break;
1050 case 3:
1051 uc = (puch[2] & 0x3f)
1052 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1053 | ((RTUNICP)(uch & 0x0f) << 12);
1054 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1055 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1056 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1057 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1058 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1059 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1060 break;
1061 case 2:
1062 uc = (puch[1] & 0x3f)
1063 | ((RTUNICP)(uch & 0x1f) << 6);
1064 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1065 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1066 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1067 break;
1068 default: /* impossible, but GCC is bitching. */
1069 uc = RTUNICP_INVALID;
1070 break;
1071 }
1072 puch += cb;
1073 cch -= cb;
1074 }
1075 else
1076 {
1077 /* 6th bit is always set. */
1078 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1079 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1080 }
1081 *pCp = uc;
1082 *ppsz = (const char *)puch;
1083 (*pcch) = cch;
1084 return VINF_SUCCESS;
1085}
1086RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1087
1088
1089RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1090{
1091 unsigned char *puch = (unsigned char *)psz;
1092 if (uc < 0x80)
1093 *puch++ = (unsigned char )uc;
1094 else if (uc < 0x00000800)
1095 {
1096 *puch++ = 0xc0 | (uc >> 6);
1097 *puch++ = 0x80 | (uc & 0x3f);
1098 }
1099 else if (uc < 0x00010000)
1100 {
1101 if ( uc < 0x0000d8000
1102 || ( uc > 0x0000dfff
1103 && uc < 0x0000fffe))
1104 {
1105 *puch++ = 0xe0 | (uc >> 12);
1106 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1107 *puch++ = 0x80 | (uc & 0x3f);
1108 }
1109 else
1110 {
1111 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1112 *puch++ = 0x7f;
1113 }
1114 }
1115 else if (uc < 0x00200000)
1116 {
1117 *puch++ = 0xf0 | (uc >> 18);
1118 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1119 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1120 *puch++ = 0x80 | (uc & 0x3f);
1121 }
1122 else if (uc < 0x04000000)
1123 {
1124 *puch++ = 0xf1 | (uc >> 24);
1125 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1126 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1127 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1128 *puch++ = 0x80 | (uc & 0x3f);
1129 }
1130 else if (uc <= 0x7fffffff)
1131 {
1132 *puch++ = 0xf3 | (uc >> 30);
1133 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1134 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1135 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1136 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1137 *puch++ = 0x80 | (uc & 0x3f);
1138 }
1139 else
1140 {
1141 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1142 *puch++ = 0x7f;
1143 }
1144
1145 return (char *)puch;
1146}
1147RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1148
1149
1150RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1151{
1152 if (pszStart < psz)
1153 {
1154 /* simple char? */
1155 const unsigned char *puch = (const unsigned char *)psz;
1156 unsigned uch = *--puch;
1157 if (!(uch & RT_BIT(7)))
1158 return (char *)puch;
1159 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1160
1161 /* two or more. */
1162 uint32_t uMask = 0xffffffc0;
1163 while ( (const unsigned char *)pszStart < puch
1164 && !(uMask & 1))
1165 {
1166 unsigned uch = *--puch;
1167 if ((uch & 0xc0) != 0x80)
1168 {
1169 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1170 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1171 (char *)pszStart);
1172 return (char *)puch;
1173 }
1174 uMask >>= 1;
1175 }
1176 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1177 }
1178 return (char *)pszStart;
1179}
1180RT_EXPORT_SYMBOL(RTStrPrevCp);
1181
1182
1183/**
1184 * Performs a case sensitive string compare between two UTF-8 strings.
1185 *
1186 * Encoding errors are ignored by the current implementation. So, the only
1187 * difference between this and the CRT strcmp function is the handling of
1188 * NULL arguments.
1189 *
1190 * @returns < 0 if the first string less than the second string.
1191 * @returns 0 if the first string identical to the second string.
1192 * @returns > 0 if the first string greater than the second string.
1193 * @param psz1 First UTF-8 string. Null is allowed.
1194 * @param psz2 Second UTF-8 string. Null is allowed.
1195 */
1196RTDECL(int) RTStrCmp(const char *psz1, const char *psz2)
1197{
1198 if (psz1 == psz2)
1199 return 0;
1200 if (!psz1)
1201 return -1;
1202 if (!psz2)
1203 return 1;
1204
1205 return strcmp(psz1, psz2);
1206}
1207RT_EXPORT_SYMBOL(RTStrCmp);
1208
1209
1210/**
1211 * Performs a case sensitive string compare between two UTF-8 strings, given
1212 * a maximum string length.
1213 *
1214 * Encoding errors are ignored by the current implementation. So, the only
1215 * difference between this and the CRT strncmp function is the handling of
1216 * NULL arguments.
1217 *
1218 * @returns < 0 if the first string less than the second string.
1219 * @returns 0 if the first string identical to the second string.
1220 * @returns > 0 if the first string greater than the second string.
1221 * @param psz1 First UTF-8 string. Null is allowed.
1222 * @param psz2 Second UTF-8 string. Null is allowed.
1223 * @param cchMax The maximum string length
1224 */
1225RTDECL(int) RTStrNCmp(const char *psz1, const char *psz2, size_t cchMax)
1226{
1227 if (psz1 == psz2)
1228 return 0;
1229 if (!psz1)
1230 return -1;
1231 if (!psz2)
1232 return 1;
1233
1234 return strncmp(psz1, psz2, cchMax);
1235}
1236RT_EXPORT_SYMBOL(RTStrNCmp);
1237
1238
1239/**
1240 * Performs a case insensitive string compare between two UTF-8 strings.
1241 *
1242 * This is a simplified compare, as only the simplified lower/upper case folding
1243 * specified by the unicode specs are used. It does not consider character pairs
1244 * as they are used in some languages, just simple upper & lower case compares.
1245 *
1246 * The result is the difference between the mismatching codepoints after they
1247 * both have been lower cased.
1248 *
1249 * If the string encoding is invalid the function will assert (strict builds)
1250 * and use RTStrCmp for the remainder of the string.
1251 *
1252 * @returns < 0 if the first string less than the second string.
1253 * @returns 0 if the first string identical to the second string.
1254 * @returns > 0 if the first string greater than the second string.
1255 * @param psz1 First UTF-8 string. Null is allowed.
1256 * @param psz2 Second UTF-8 string. Null is allowed.
1257 */
1258RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
1259{
1260 if (psz1 == psz2)
1261 return 0;
1262 if (!psz1)
1263 return -1;
1264 if (!psz2)
1265 return 1;
1266
1267 const char *pszStart1 = psz1;
1268 for (;;)
1269 {
1270 /* Get the codepoints */
1271 RTUNICP cp1;
1272 int rc = RTStrGetCpEx(&psz1, &cp1);
1273 if (RT_FAILURE(rc))
1274 {
1275 AssertRC(rc);
1276 psz1--;
1277 break;
1278 }
1279
1280 RTUNICP cp2;
1281 rc = RTStrGetCpEx(&psz2, &cp2);
1282 if (RT_FAILURE(rc))
1283 {
1284 AssertRC(rc);
1285 psz2--;
1286 psz1 = RTStrPrevCp(pszStart1, psz1);
1287 break;
1288 }
1289
1290 /* compare */
1291 int iDiff = cp1 - cp2;
1292 if (iDiff)
1293 {
1294 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1295 if (iDiff)
1296 {
1297 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1298 if (iDiff)
1299 return iDiff;
1300 }
1301 }
1302
1303 /* hit the terminator? */
1304 if (!cp1)
1305 return 0;
1306 }
1307
1308 /* Hit some bad encoding, continue in case insensitive mode. */
1309 return RTStrCmp(psz1, psz2);
1310}
1311RT_EXPORT_SYMBOL(RTStrICmp);
1312
1313
1314/**
1315 * Performs a case insensitive string compare between two UTF-8 strings, given a
1316 * maximum string length.
1317 *
1318 * This is a simplified compare, as only the simplified lower/upper case folding
1319 * specified by the unicode specs are used. It does not consider character pairs
1320 * as they are used in some languages, just simple upper & lower case compares.
1321 *
1322 * The result is the difference between the mismatching codepoints after they
1323 * both have been lower cased.
1324 *
1325 * If the string encoding is invalid the function will assert (strict builds)
1326 * and use RTStrCmp for the remainder of the string.
1327 *
1328 * @returns < 0 if the first string less than the second string.
1329 * @returns 0 if the first string identical to the second string.
1330 * @returns > 0 if the first string greater than the second string.
1331 * @param psz1 First UTF-8 string. Null is allowed.
1332 * @param psz2 Second UTF-8 string. Null is allowed.
1333 * @param cchMax Maximum string length
1334 */
1335RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
1336{
1337 if (cchMax == 0)
1338 return 0;
1339 if (psz1 == psz2)
1340 return 0;
1341 if (!psz1)
1342 return -1;
1343 if (!psz2)
1344 return 1;
1345
1346 for (;;)
1347 {
1348 /* Get the codepoints */
1349 RTUNICP cp1;
1350 size_t cchMax2 = cchMax;
1351 int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);
1352 if (RT_FAILURE(rc))
1353 {
1354 AssertRC(rc);
1355 psz1--;
1356 cchMax++;
1357 break;
1358 }
1359
1360 RTUNICP cp2;
1361 rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);
1362 if (RT_FAILURE(rc))
1363 {
1364 AssertRC(rc);
1365 psz2--;
1366 psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
1367 cchMax = cchMax2 + 1;
1368 break;
1369 }
1370
1371 /* compare */
1372 int iDiff = cp1 - cp2;
1373 if (iDiff)
1374 {
1375 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1376 if (iDiff)
1377 {
1378 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1379 if (iDiff)
1380 return iDiff;
1381 }
1382 }
1383
1384 /* hit the terminator? */
1385 if (!cp1 || cchMax == 0)
1386 return 0;
1387 }
1388
1389 /* Hit some bad encoding, continue in case insensitive mode. */
1390 return RTStrNCmp(psz1, psz2, cchMax);
1391}
1392RT_EXPORT_SYMBOL(RTStrNICmp);
1393
1394
1395RTDECL(char *) RTStrStr(const char *pszHaystack, const char *pszNeedle)
1396{
1397 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1398 if (!pszHaystack)
1399 return NULL;
1400 if (!pszNeedle)
1401 return NULL;
1402
1403 /* The rest is CRT. */
1404 return (char *)strstr(pszHaystack, pszNeedle);
1405}
1406RT_EXPORT_SYMBOL(RTStrStr);
1407
1408
1409RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
1410{
1411 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1412 if (!pszHaystack)
1413 return NULL;
1414 if (!pszNeedle)
1415 return NULL;
1416
1417 /* The empty string matches everything. */
1418 if (!*pszNeedle)
1419 return (char *)pszHaystack;
1420
1421 /*
1422 * The search strategy is to pick out the first char of the needle, fold it,
1423 * and match it against the haystack code point by code point. When encountering
1424 * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
1425 */
1426 const char * const pszNeedleStart = pszNeedle;
1427 RTUNICP Cp0;
1428 RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
1429 size_t const cchNeedle = strlen(pszNeedle);
1430 size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
1431 RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
1432 RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
1433 if ( Cp0Lower == Cp0Upper
1434 && Cp0Lower == Cp0)
1435 {
1436 /* Cp0 is not a case sensitive char. */
1437 for (;;)
1438 {
1439 RTUNICP Cp;
1440 RTStrGetCpEx(&pszHaystack, &Cp);
1441 if (!Cp)
1442 break;
1443 if ( Cp == Cp0
1444 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1445 return (char *)pszHaystack - cchNeedleCp0;
1446 }
1447 }
1448 else if ( Cp0Lower == Cp0
1449 || Cp0Upper != Cp0)
1450 {
1451 /* Cp0 is case sensitive */
1452 for (;;)
1453 {
1454 RTUNICP Cp;
1455 RTStrGetCpEx(&pszHaystack, &Cp);
1456 if (!Cp)
1457 break;
1458 if ( ( Cp == Cp0Upper
1459 || Cp == Cp0Lower)
1460 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1461 return (char *)pszHaystack - cchNeedleCp0;
1462 }
1463 }
1464 else
1465 {
1466 /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
1467 for (;;)
1468 {
1469 RTUNICP Cp;
1470 RTStrGetCpEx(&pszHaystack, &Cp);
1471 if (!Cp)
1472 break;
1473 if ( ( Cp == Cp0
1474 || Cp == Cp0Upper
1475 || Cp == Cp0Lower)
1476 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1477 return (char *)pszHaystack - cchNeedleCp0;
1478 }
1479 }
1480
1481
1482 return NULL;
1483}
1484RT_EXPORT_SYMBOL(RTStrIStr);
1485
1486
1487RTDECL(char *) RTStrToLower(char *psz)
1488{
1489 /*
1490 * Loop the code points in the string, converting them one by one.
1491 * ASSUMES that the code points for upper and lower case are encoded
1492 * with the exact same length.
1493 */
1494 /** @todo Handled bad encodings correctly+quietly, remove assumption,
1495 * optimize. */
1496 char *pszCur = psz;
1497 while (*pszCur)
1498 {
1499 RTUNICP cp = RTStrGetCp(pszCur);
1500 cp = RTUniCpToLower(cp);
1501 pszCur = RTStrPutCp(pszCur, cp);
1502 }
1503 return psz;
1504}
1505RT_EXPORT_SYMBOL(RTStrToLower);
1506
1507
1508RTDECL(char *) RTStrToUpper(char *psz)
1509{
1510 /*
1511 * Loop the code points in the string, converting them one by one.
1512 * ASSUMES that the code points for upper and lower case are encoded
1513 * with the exact same length.
1514 */
1515 /** @todo Handled bad encodings correctly+quietly, remove assumption,
1516 * optimize. */
1517 char *pszCur = psz;
1518 while(*pszCur)
1519 {
1520 RTUNICP cp = RTStrGetCp(pszCur);
1521 cp = RTUniCpToUpper(cp);
1522 pszCur = RTStrPutCp(pszCur, cp);
1523 }
1524 return psz;
1525}
1526RT_EXPORT_SYMBOL(RTStrToUpper);
1527
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette