VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 56290

Last change on this file since 56290 was 56290, checked in by vboxsync, 10 years ago

IPRT: Updated (C) year.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 18.2 KB
Line 
1/* $Id: utf-16.cpp 56290 2015-06-09 14:01:31Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2015 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41/**
42 * Get get length in code points of an UTF-16 encoded string, validating the
43 * string while doing so.
44 *
45 * @returns IPRT status code.
46 * @param pwsz Pointer to the UTF-16 string.
47 * @param cwc The max length of the string in UTF-16 units. Use
48 * RTSTR_MAX if all of the string is to be examined.
49 * @param pcuc Where to store the length in unicode code points.
50 * @param pcwcActual Where to store the actual size of the UTF-16 string
51 * on success. Optional.
52 */
53static int rtUtf16Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcuc, size_t *pcwcActual)
54{
55 PCRTUTF16 pwszStart = pwsz;
56 size_t cCodePoints = 0;
57 while (cwc > 0)
58 {
59 RTUTF16 wc = *pwsz;
60 if (!wc)
61 break;
62 if (wc < 0xd800 || wc > 0xdfff)
63 {
64 cCodePoints++;
65 pwsz++;
66 cwc--;
67 }
68 /* Surrogate pair: */
69 else if (wc >= 0xdc00)
70 {
71 RTStrAssertMsgFailed(("Lone UTF-16 trail surrogate: %#x (%.*Rhxs)\n", wc, RT_MIN(cwc * 2, 10), pwsz));
72 return VERR_INVALID_UTF16_ENCODING;
73 }
74 else if (cwc < 2)
75 {
76 RTStrAssertMsgFailed(("Lone UTF-16 lead surrogate: %#x\n", wc));
77 return VERR_INVALID_UTF16_ENCODING;
78 }
79 else
80 {
81 RTUTF16 wcTrail = pwsz[1];
82 if (wcTrail < 0xdc00 || wcTrail > 0xdfff)
83 {
84 RTStrAssertMsgFailed(("Invalid UTF-16 trail surrogate: %#x (lead %#x)\n", wcTrail, wc));
85 return VERR_INVALID_UTF16_ENCODING;
86 }
87
88 cCodePoints++;
89 pwsz += 2;
90 cwc -= 2;
91 }
92 }
93
94 /* done */
95 *pcuc = cCodePoints;
96 if (pcwcActual)
97 *pcwcActual = pwsz - pwszStart;
98 return VINF_SUCCESS;
99}
100
101
102RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag)
103{
104 if (cb > sizeof(RTUTF16))
105 cb = RT_ALIGN_Z(cb, sizeof(RTUTF16));
106 else
107 cb = sizeof(RTUTF16);
108 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
109 if (pwsz)
110 *pwsz = '\0';
111 return pwsz;
112}
113RT_EXPORT_SYMBOL(RTUtf16AllocTag);
114
115
116RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
117{
118 if (pwszString)
119 RTMemTmpFree(pwszString);
120}
121RT_EXPORT_SYMBOL(RTUtf16Free);
122
123
124RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
125{
126 Assert(pwszString);
127 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
128 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
129 if (pwsz)
130 memcpy(pwsz, pwszString, cb);
131 return pwsz;
132}
133RT_EXPORT_SYMBOL(RTUtf16DupTag);
134
135
136RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
137{
138 Assert(pwszString);
139 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
140 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
141 if (pwsz)
142 {
143 memcpy(pwsz, pwszString, cb);
144 *ppwszString = pwsz;
145 return VINF_SUCCESS;
146 }
147 return VERR_NO_MEMORY;
148}
149RT_EXPORT_SYMBOL(RTUtf16DupExTag);
150
151
152RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
153{
154 if (!pwszString)
155 return 0;
156
157 PCRTUTF16 pwsz = pwszString;
158 while (*pwsz)
159 pwsz++;
160 return pwsz - pwszString;
161}
162RT_EXPORT_SYMBOL(RTUtf16Len);
163
164
165RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
166{
167 if (pwsz1 == pwsz2)
168 return 0;
169 if (!pwsz1)
170 return -1;
171 if (!pwsz2)
172 return 1;
173
174 for (;;)
175 {
176 register RTUTF16 wcs = *pwsz1;
177 register int iDiff = wcs - *pwsz2;
178 if (iDiff || !wcs)
179 return iDiff;
180 pwsz1++;
181 pwsz2++;
182 }
183}
184RT_EXPORT_SYMBOL(RTUtf16Cmp);
185
186
187RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz)
188{
189 return RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
190}
191RT_EXPORT_SYMBOL(RTUtf16ValidateEncoding);
192
193
194RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags)
195{
196 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
197 VERR_INVALID_PARAMETER);
198 AssertPtr(pwsz);
199
200 /*
201 * Use rtUtf16Length for the job.
202 */
203 size_t cwcActual = 0; /* Shut up cc1plus. */
204 size_t cCpsIgnored;
205 int rc = rtUtf16Length(pwsz, cwc, &cCpsIgnored, &cwcActual);
206 if (RT_SUCCESS(rc))
207 {
208 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
209 {
210 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
211 cwcActual++;
212 if (cwcActual == cwc)
213 rc = VINF_SUCCESS;
214 else if (cwcActual < cwc)
215 rc = VERR_BUFFER_UNDERFLOW;
216 else
217 rc = VERR_BUFFER_OVERFLOW;
218 }
219 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
220 && cwcActual >= cwc)
221 rc = VERR_BUFFER_OVERFLOW;
222 }
223 return rc;
224}
225RT_EXPORT_SYMBOL(RTUtf16ValidateEncodingEx);
226
227
228RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz)
229{
230 int rc = RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
231 return RT_SUCCESS(rc);
232}
233RT_EXPORT_SYMBOL(RTUtf16IsValidEncoding);
234
235
236RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidSet, char chReplacement)
237{
238 size_t cReplacements = 0;
239 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
240 /* Validate the encoding. */
241 for (;;)
242 {
243 RTUNICP Cp;
244 PCRTUNICP pCp;
245 PRTUTF16 pwszOld = pwsz;
246 if (RT_FAILURE(RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp)))
247 return -1;
248 if (!Cp)
249 break;
250 for (pCp = puszValidSet; *pCp; pCp += 2)
251 {
252 AssertReturn(*(pCp + 1), -1);
253 if (*pCp <= Cp && *(pCp + 1) >= Cp) /* No, I won't do * and ++. */
254 break;
255 }
256 if (!*pCp)
257 {
258 for (; pwszOld != pwsz; ++pwszOld)
259 *pwszOld = chReplacement;
260 ++cReplacements;
261 }
262 }
263 return cReplacements;
264}
265RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
266
267
268/**
269 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
270 *
271 * @returns iprt status code.
272 * @param pwsz The UTF-16 string.
273 * @param cwc The max length of the UTF-16 string to consider.
274 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
275 */
276static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
277{
278 int rc = VINF_SUCCESS;
279 size_t cch = 0;
280 while (cwc > 0)
281 {
282 RTUTF16 wc = *pwsz++; cwc--;
283 if (!wc)
284 break;
285 else if (wc < 0xd800 || wc > 0xdfff)
286 {
287 if (wc < 0x80)
288 cch++;
289 else if (wc < 0x800)
290 cch += 2;
291 else if (wc < 0xfffe)
292 cch += 3;
293 else
294 {
295 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
296 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
297 break;
298 }
299 }
300 else
301 {
302 if (wc >= 0xdc00)
303 {
304 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
305 rc = VERR_INVALID_UTF16_ENCODING;
306 break;
307 }
308 if (cwc <= 0)
309 {
310 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
311 rc = VERR_INVALID_UTF16_ENCODING;
312 break;
313 }
314 wc = *pwsz++; cwc--;
315 if (wc < 0xdc00 || wc > 0xdfff)
316 {
317 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
318 rc = VERR_INVALID_UTF16_ENCODING;
319 break;
320 }
321 cch += 4;
322 }
323 }
324
325
326 /* done */
327 *pcch = cch;
328 return rc;
329}
330
331
332/**
333 * Recodes an valid UTF-16 string as UTF-8.
334 *
335 * @returns iprt status code.
336 * @param pwsz The UTF-16 string.
337 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
338 * will stop when cwc or '\\0' is reached.
339 * @param psz Where to store the UTF-8 string.
340 * @param cch The size of the UTF-8 buffer, excluding the terminator.
341 * @param pcch Where to store the number of octets actually encoded.
342 */
343static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
344{
345 unsigned char *pwch = (unsigned char *)psz;
346 int rc = VINF_SUCCESS;
347 while (cwc > 0)
348 {
349 RTUTF16 wc = *pwsz++; cwc--;
350 if (!wc)
351 break;
352 else if (wc < 0xd800 || wc > 0xdfff)
353 {
354 if (wc < 0x80)
355 {
356 if (RT_UNLIKELY(cch < 1))
357 {
358 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
359 rc = VERR_BUFFER_OVERFLOW;
360 break;
361 }
362 cch--;
363 *pwch++ = (unsigned char)wc;
364 }
365 else if (wc < 0x800)
366 {
367 if (RT_UNLIKELY(cch < 2))
368 {
369 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
370 rc = VERR_BUFFER_OVERFLOW;
371 break;
372 }
373 cch -= 2;
374 *pwch++ = 0xc0 | (wc >> 6);
375 *pwch++ = 0x80 | (wc & 0x3f);
376 }
377 else if (wc < 0xfffe)
378 {
379 if (RT_UNLIKELY(cch < 3))
380 {
381 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
382 rc = VERR_BUFFER_OVERFLOW;
383 break;
384 }
385 cch -= 3;
386 *pwch++ = 0xe0 | (wc >> 12);
387 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
388 *pwch++ = 0x80 | (wc & 0x3f);
389 }
390 else
391 {
392 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
393 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
394 break;
395 }
396 }
397 else
398 {
399 if (wc >= 0xdc00)
400 {
401 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
402 rc = VERR_INVALID_UTF16_ENCODING;
403 break;
404 }
405 if (cwc <= 0)
406 {
407 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
408 rc = VERR_INVALID_UTF16_ENCODING;
409 break;
410 }
411 RTUTF16 wc2 = *pwsz++; cwc--;
412 if (wc2 < 0xdc00 || wc2 > 0xdfff)
413 {
414 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
415 rc = VERR_INVALID_UTF16_ENCODING;
416 break;
417 }
418 uint32_t CodePoint = 0x10000
419 + ( ((wc & 0x3ff) << 10)
420 | (wc2 & 0x3ff));
421 if (RT_UNLIKELY(cch < 4))
422 {
423 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
424 rc = VERR_BUFFER_OVERFLOW;
425 break;
426 }
427 cch -= 4;
428 *pwch++ = 0xf0 | (CodePoint >> 18);
429 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
430 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
431 *pwch++ = 0x80 | (CodePoint & 0x3f);
432 }
433 }
434
435 /* done */
436 *pwch = '\0';
437 *pcch = (char *)pwch - psz;
438 return rc;
439}
440
441
442
443RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
444{
445 /*
446 * Validate input.
447 */
448 Assert(VALID_PTR(ppszString));
449 Assert(VALID_PTR(pwszString));
450 *ppszString = NULL;
451
452 /*
453 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
454 */
455 size_t cch;
456 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
457 if (RT_SUCCESS(rc))
458 {
459 /*
460 * Allocate buffer and recode it.
461 */
462 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
463 if (pszResult)
464 {
465 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
466 if (RT_SUCCESS(rc))
467 {
468 *ppszString = pszResult;
469 return rc;
470 }
471
472 RTMemFree(pszResult);
473 }
474 else
475 rc = VERR_NO_STR_MEMORY;
476 }
477 return rc;
478}
479RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
480
481
482RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
483{
484 /*
485 * Validate input.
486 */
487 Assert(VALID_PTR(pwszString));
488 Assert(VALID_PTR(ppsz));
489 Assert(!pcch || VALID_PTR(pcch));
490
491 /*
492 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
493 */
494 size_t cchResult;
495 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
496 if (RT_SUCCESS(rc))
497 {
498 if (pcch)
499 *pcch = cchResult;
500
501 /*
502 * Check buffer size / Allocate buffer and recode it.
503 */
504 bool fShouldFree;
505 char *pszResult;
506 if (cch > 0 && *ppsz)
507 {
508 fShouldFree = false;
509 if (RT_UNLIKELY(cch <= cchResult))
510 return VERR_BUFFER_OVERFLOW;
511 pszResult = *ppsz;
512 }
513 else
514 {
515 *ppsz = NULL;
516 fShouldFree = true;
517 cch = RT_MAX(cch, cchResult + 1);
518 pszResult = (char *)RTStrAllocTag(cch, pszTag);
519 }
520 if (pszResult)
521 {
522 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
523 if (RT_SUCCESS(rc))
524 {
525 *ppsz = pszResult;
526 return rc;
527 }
528
529 if (fShouldFree)
530 RTStrFree(pszResult);
531 }
532 else
533 rc = VERR_NO_STR_MEMORY;
534 }
535 return rc;
536}
537RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
538
539
540RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
541{
542 size_t cch;
543 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
544 return RT_SUCCESS(rc) ? cch : 0;
545}
546RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
547
548
549RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
550{
551 size_t cch;
552 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
553 if (pcch)
554 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
555 return rc;
556}
557RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
558
559
560RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
561{
562 const RTUTF16 wc = *pwsz;
563
564 /* simple */
565 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
566 return wc;
567 if (wc < 0xfffe)
568 {
569 /* surrogate pair */
570 if (wc < 0xdc00)
571 {
572 const RTUTF16 wc2 = pwsz[1];
573 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
574 {
575 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
576 return uc;
577 }
578
579 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
580 }
581 else
582 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
583 }
584 else
585 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
586 return RTUNICP_INVALID;
587}
588RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
589
590
591RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
592{
593 const RTUTF16 wc = **ppwsz;
594
595 /* simple */
596 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
597 {
598 (*ppwsz)++;
599 *pCp = wc;
600 return VINF_SUCCESS;
601 }
602
603 int rc;
604 if (wc < 0xfffe)
605 {
606 /* surrogate pair */
607 if (wc < 0xdc00)
608 {
609 const RTUTF16 wc2 = (*ppwsz)[1];
610 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
611 {
612 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
613 *pCp = uc;
614 (*ppwsz) += 2;
615 return VINF_SUCCESS;
616 }
617
618 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
619 }
620 else
621 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
622 rc = VERR_INVALID_UTF16_ENCODING;
623 }
624 else
625 {
626 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
627 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
628 }
629 *pCp = RTUNICP_INVALID;
630 (*ppwsz)++;
631 return rc;
632}
633RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
634
635
636RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
637{
638 /* simple */
639 if ( CodePoint < 0xd800
640 || ( CodePoint > 0xdfff
641 && CodePoint < 0xfffe))
642 {
643 *pwsz++ = (RTUTF16)CodePoint;
644 return pwsz;
645 }
646
647 /* surrogate pair */
648 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
649 {
650 CodePoint -= 0x10000;
651 *pwsz++ = 0xd800 | (CodePoint >> 10);
652 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
653 return pwsz;
654 }
655
656 /* invalid code point. */
657 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
658 *pwsz++ = 0x7f;
659 return pwsz;
660}
661RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
662
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette