VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 62930

Last change on this file since 62930 was 62930, checked in by vboxsync, 8 years ago

RTUtf16PurgeEncoding: Optimized it a little, adding debug assertion for bad pairs.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 20.9 KB
Line 
1/* $Id: utf-16.cpp 62930 2016-08-03 16:11:43Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2016 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41/**
42 * Get get length in code points of an UTF-16 encoded string, validating the
43 * string while doing so.
44 *
45 * @returns IPRT status code.
46 * @param pwsz Pointer to the UTF-16 string.
47 * @param cwc The max length of the string in UTF-16 units. Use
48 * RTSTR_MAX if all of the string is to be examined.
49 * @param pcuc Where to store the length in unicode code points.
50 * @param pcwcActual Where to store the actual size of the UTF-16 string
51 * on success. Optional.
52 */
53static int rtUtf16Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcuc, size_t *pcwcActual)
54{
55 PCRTUTF16 pwszStart = pwsz;
56 size_t cCodePoints = 0;
57 while (cwc > 0)
58 {
59 RTUTF16 wc = *pwsz;
60 if (!wc)
61 break;
62 if (wc < 0xd800 || wc > 0xdfff)
63 {
64 cCodePoints++;
65 pwsz++;
66 cwc--;
67 }
68 /* Surrogate pair: */
69 else if (wc >= 0xdc00)
70 {
71 RTStrAssertMsgFailed(("Lone UTF-16 trail surrogate: %#x (%.*Rhxs)\n", wc, RT_MIN(cwc * 2, 10), pwsz));
72 return VERR_INVALID_UTF16_ENCODING;
73 }
74 else if (cwc < 2)
75 {
76 RTStrAssertMsgFailed(("Lone UTF-16 lead surrogate: %#x\n", wc));
77 return VERR_INVALID_UTF16_ENCODING;
78 }
79 else
80 {
81 RTUTF16 wcTrail = pwsz[1];
82 if (wcTrail < 0xdc00 || wcTrail > 0xdfff)
83 {
84 RTStrAssertMsgFailed(("Invalid UTF-16 trail surrogate: %#x (lead %#x)\n", wcTrail, wc));
85 return VERR_INVALID_UTF16_ENCODING;
86 }
87
88 cCodePoints++;
89 pwsz += 2;
90 cwc -= 2;
91 }
92 }
93
94 /* done */
95 *pcuc = cCodePoints;
96 if (pcwcActual)
97 *pcwcActual = pwsz - pwszStart;
98 return VINF_SUCCESS;
99}
100
101
102RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag)
103{
104 if (cb > sizeof(RTUTF16))
105 cb = RT_ALIGN_Z(cb, sizeof(RTUTF16));
106 else
107 cb = sizeof(RTUTF16);
108 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
109 if (pwsz)
110 *pwsz = '\0';
111 return pwsz;
112}
113RT_EXPORT_SYMBOL(RTUtf16AllocTag);
114
115
116RTDECL(int) RTUtf16ReallocTag(PRTUTF16 *ppwsz, size_t cbNew, const char *pszTag)
117{
118 PRTUTF16 pwszOld = *ppwsz;
119 cbNew = RT_ALIGN_Z(cbNew, sizeof(RTUTF16));
120 if (!cbNew)
121 {
122 RTMemFree(pwszOld);
123 *ppwsz = NULL;
124 }
125 else if (pwszOld)
126 {
127 PRTUTF16 pwszNew = (PRTUTF16)RTMemReallocTag(pwszOld, cbNew, pszTag);
128 if (!pwszNew)
129 return VERR_NO_STR_MEMORY;
130 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
131 *ppwsz = pwszNew;
132 }
133 else
134 {
135 PRTUTF16 pwszNew = (PRTUTF16)RTMemAllocTag(cbNew, pszTag);
136 if (!pwszNew)
137 return VERR_NO_UTF16_MEMORY;
138 pwszNew[0] = '\0';
139 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
140 *ppwsz = pwszNew;
141 }
142 return VINF_SUCCESS;
143}
144RT_EXPORT_SYMBOL(RTUtf16ReallocTag);
145
146
147RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
148{
149 if (pwszString)
150 RTMemTmpFree(pwszString);
151}
152RT_EXPORT_SYMBOL(RTUtf16Free);
153
154
155RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
156{
157 Assert(pwszString);
158 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
159 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
160 if (pwsz)
161 memcpy(pwsz, pwszString, cb);
162 return pwsz;
163}
164RT_EXPORT_SYMBOL(RTUtf16DupTag);
165
166
167RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
168{
169 Assert(pwszString);
170 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
171 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
172 if (pwsz)
173 {
174 memcpy(pwsz, pwszString, cb);
175 *ppwszString = pwsz;
176 return VINF_SUCCESS;
177 }
178 return VERR_NO_MEMORY;
179}
180RT_EXPORT_SYMBOL(RTUtf16DupExTag);
181
182
183RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
184{
185 if (!pwszString)
186 return 0;
187
188 PCRTUTF16 pwsz = pwszString;
189 while (*pwsz)
190 pwsz++;
191 return pwsz - pwszString;
192}
193RT_EXPORT_SYMBOL(RTUtf16Len);
194
195
196RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
197{
198 if (pwsz1 == pwsz2)
199 return 0;
200 if (!pwsz1)
201 return -1;
202 if (!pwsz2)
203 return 1;
204
205 for (;;)
206 {
207 register RTUTF16 wcs = *pwsz1;
208 register int iDiff = wcs - *pwsz2;
209 if (iDiff || !wcs)
210 return iDiff;
211 pwsz1++;
212 pwsz2++;
213 }
214}
215RT_EXPORT_SYMBOL(RTUtf16Cmp);
216
217
218RTDECL(int) RTUtf16CmpUtf8(PCRTUTF16 pwsz1, const char *psz2)
219{
220 /*
221 * NULL and empty strings are all the same.
222 */
223 if (!pwsz1)
224 return !psz2 || !*psz2 ? 0 : -1;
225 if (!psz2)
226 return !*pwsz1 ? 0 : 1;
227
228 /*
229 * Compare with a UTF-8 string by enumerating them char by char.
230 */
231 for (;;)
232 {
233 RTUNICP uc1;
234 int rc = RTUtf16GetCpEx(&pwsz1, &uc1);
235 AssertRCReturn(rc, 1);
236
237 RTUNICP uc2;
238 rc = RTStrGetCpEx(&psz2, &uc2);
239 AssertRCReturn(rc, -1);
240 if (uc1 == uc2)
241 {
242 if (uc1)
243 continue;
244 return 0;
245 }
246 return uc1 < uc2 ? -1 : 1;
247 }
248}
249RT_EXPORT_SYMBOL(RTUtf16CmpUtf8);
250
251
252RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz)
253{
254 return RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
255}
256RT_EXPORT_SYMBOL(RTUtf16ValidateEncoding);
257
258
259RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags)
260{
261 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
262 VERR_INVALID_PARAMETER);
263 AssertPtr(pwsz);
264
265 /*
266 * Use rtUtf16Length for the job.
267 */
268 size_t cwcActual = 0; /* Shut up cc1plus. */
269 size_t cCpsIgnored;
270 int rc = rtUtf16Length(pwsz, cwc, &cCpsIgnored, &cwcActual);
271 if (RT_SUCCESS(rc))
272 {
273 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
274 {
275 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
276 cwcActual++;
277 if (cwcActual == cwc)
278 rc = VINF_SUCCESS;
279 else if (cwcActual < cwc)
280 rc = VERR_BUFFER_UNDERFLOW;
281 else
282 rc = VERR_BUFFER_OVERFLOW;
283 }
284 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
285 && cwcActual >= cwc)
286 rc = VERR_BUFFER_OVERFLOW;
287 }
288 return rc;
289}
290RT_EXPORT_SYMBOL(RTUtf16ValidateEncodingEx);
291
292
293RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz)
294{
295 int rc = RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
296 return RT_SUCCESS(rc);
297}
298RT_EXPORT_SYMBOL(RTUtf16IsValidEncoding);
299
300
301/**
302 * Helper for RTUtf16PurgeComplementSet.
303 *
304 * @returns true if @a Cp is valid, false if not.
305 * @param Cp The code point to validate.
306 * @param puszValidPairs Pair of valid code point sets.
307 * @param cValidPairs Number of pairs.
308 */
309DECLINLINE(bool) rtUtf16PurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
310{
311 while (cValidPairs-- > 0)
312 {
313 if ( Cp >= puszValidPairs[0]
314 && Cp <= puszValidPairs[1])
315 return true;
316 puszValidPairs += 2;
317 }
318 return false;
319}
320
321
322RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidPairs, char chReplacement)
323{
324 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
325
326 /*
327 * Calc valid pairs and check that we've got an even number.
328 */
329 uint32_t cValidPairs = 0;
330 while (puszValidPairs[cValidPairs * 2])
331 {
332 AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
333 AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
334 ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
335 cValidPairs++;
336 }
337
338 /*
339 * Do the replacing.
340 */
341 ssize_t cReplacements = 0;
342 for (;;)
343 {
344 PRTUTF16 pwszCur = pwsz;
345 RTUNICP Cp;
346 int rc = RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp);
347 if (RT_SUCCESS(rc))
348 {
349 if (Cp)
350 {
351 if (!rtUtf16PurgeIsInSet(Cp, puszValidPairs, cValidPairs))
352 {
353 for (; pwszCur != pwsz; ++pwszCur)
354 *pwszCur = chReplacement;
355 ++cReplacements;
356 }
357 }
358 else
359 break;
360 }
361 else
362 return -1;
363 }
364 return cReplacements;
365}
366RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
367
368
369/**
370 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
371 *
372 * @returns iprt status code.
373 * @param pwsz The UTF-16 string.
374 * @param cwc The max length of the UTF-16 string to consider.
375 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
376 */
377static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
378{
379 int rc = VINF_SUCCESS;
380 size_t cch = 0;
381 while (cwc > 0)
382 {
383 RTUTF16 wc = *pwsz++; cwc--;
384 if (!wc)
385 break;
386 else if (wc < 0xd800 || wc > 0xdfff)
387 {
388 if (wc < 0x80)
389 cch++;
390 else if (wc < 0x800)
391 cch += 2;
392 else if (wc < 0xfffe)
393 cch += 3;
394 else
395 {
396 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
397 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
398 break;
399 }
400 }
401 else
402 {
403 if (wc >= 0xdc00)
404 {
405 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
406 rc = VERR_INVALID_UTF16_ENCODING;
407 break;
408 }
409 if (cwc <= 0)
410 {
411 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
412 rc = VERR_INVALID_UTF16_ENCODING;
413 break;
414 }
415 wc = *pwsz++; cwc--;
416 if (wc < 0xdc00 || wc > 0xdfff)
417 {
418 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
419 rc = VERR_INVALID_UTF16_ENCODING;
420 break;
421 }
422 cch += 4;
423 }
424 }
425
426
427 /* done */
428 *pcch = cch;
429 return rc;
430}
431
432
433/**
434 * Recodes an valid UTF-16 string as UTF-8.
435 *
436 * @returns iprt status code.
437 * @param pwsz The UTF-16 string.
438 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
439 * will stop when cwc or '\\0' is reached.
440 * @param psz Where to store the UTF-8 string.
441 * @param cch The size of the UTF-8 buffer, excluding the terminator.
442 * @param pcch Where to store the number of octets actually encoded.
443 */
444static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
445{
446 unsigned char *pwch = (unsigned char *)psz;
447 int rc = VINF_SUCCESS;
448 while (cwc > 0)
449 {
450 RTUTF16 wc = *pwsz++; cwc--;
451 if (!wc)
452 break;
453 else if (wc < 0xd800 || wc > 0xdfff)
454 {
455 if (wc < 0x80)
456 {
457 if (RT_UNLIKELY(cch < 1))
458 {
459 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
460 rc = VERR_BUFFER_OVERFLOW;
461 break;
462 }
463 cch--;
464 *pwch++ = (unsigned char)wc;
465 }
466 else if (wc < 0x800)
467 {
468 if (RT_UNLIKELY(cch < 2))
469 {
470 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
471 rc = VERR_BUFFER_OVERFLOW;
472 break;
473 }
474 cch -= 2;
475 *pwch++ = 0xc0 | (wc >> 6);
476 *pwch++ = 0x80 | (wc & 0x3f);
477 }
478 else if (wc < 0xfffe)
479 {
480 if (RT_UNLIKELY(cch < 3))
481 {
482 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
483 rc = VERR_BUFFER_OVERFLOW;
484 break;
485 }
486 cch -= 3;
487 *pwch++ = 0xe0 | (wc >> 12);
488 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
489 *pwch++ = 0x80 | (wc & 0x3f);
490 }
491 else
492 {
493 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
494 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
495 break;
496 }
497 }
498 else
499 {
500 if (wc >= 0xdc00)
501 {
502 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
503 rc = VERR_INVALID_UTF16_ENCODING;
504 break;
505 }
506 if (cwc <= 0)
507 {
508 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
509 rc = VERR_INVALID_UTF16_ENCODING;
510 break;
511 }
512 RTUTF16 wc2 = *pwsz++; cwc--;
513 if (wc2 < 0xdc00 || wc2 > 0xdfff)
514 {
515 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
516 rc = VERR_INVALID_UTF16_ENCODING;
517 break;
518 }
519 uint32_t CodePoint = 0x10000
520 + ( ((wc & 0x3ff) << 10)
521 | (wc2 & 0x3ff));
522 if (RT_UNLIKELY(cch < 4))
523 {
524 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
525 rc = VERR_BUFFER_OVERFLOW;
526 break;
527 }
528 cch -= 4;
529 *pwch++ = 0xf0 | (CodePoint >> 18);
530 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
531 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
532 *pwch++ = 0x80 | (CodePoint & 0x3f);
533 }
534 }
535
536 /* done */
537 *pwch = '\0';
538 *pcch = (char *)pwch - psz;
539 return rc;
540}
541
542
543
544RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
545{
546 /*
547 * Validate input.
548 */
549 Assert(VALID_PTR(ppszString));
550 Assert(VALID_PTR(pwszString));
551 *ppszString = NULL;
552
553 /*
554 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
555 */
556 size_t cch;
557 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
558 if (RT_SUCCESS(rc))
559 {
560 /*
561 * Allocate buffer and recode it.
562 */
563 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
564 if (pszResult)
565 {
566 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
567 if (RT_SUCCESS(rc))
568 {
569 *ppszString = pszResult;
570 return rc;
571 }
572
573 RTMemFree(pszResult);
574 }
575 else
576 rc = VERR_NO_STR_MEMORY;
577 }
578 return rc;
579}
580RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
581
582
583RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
584{
585 /*
586 * Validate input.
587 */
588 Assert(VALID_PTR(pwszString));
589 Assert(VALID_PTR(ppsz));
590 Assert(!pcch || VALID_PTR(pcch));
591
592 /*
593 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
594 */
595 size_t cchResult;
596 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
597 if (RT_SUCCESS(rc))
598 {
599 if (pcch)
600 *pcch = cchResult;
601
602 /*
603 * Check buffer size / Allocate buffer and recode it.
604 */
605 bool fShouldFree;
606 char *pszResult;
607 if (cch > 0 && *ppsz)
608 {
609 fShouldFree = false;
610 if (RT_UNLIKELY(cch <= cchResult))
611 return VERR_BUFFER_OVERFLOW;
612 pszResult = *ppsz;
613 }
614 else
615 {
616 *ppsz = NULL;
617 fShouldFree = true;
618 cch = RT_MAX(cch, cchResult + 1);
619 pszResult = (char *)RTStrAllocTag(cch, pszTag);
620 }
621 if (pszResult)
622 {
623 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
624 if (RT_SUCCESS(rc))
625 {
626 *ppsz = pszResult;
627 return rc;
628 }
629
630 if (fShouldFree)
631 RTStrFree(pszResult);
632 }
633 else
634 rc = VERR_NO_STR_MEMORY;
635 }
636 return rc;
637}
638RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
639
640
641RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
642{
643 size_t cch;
644 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
645 return RT_SUCCESS(rc) ? cch : 0;
646}
647RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
648
649
650RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
651{
652 size_t cch;
653 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
654 if (pcch)
655 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
656 return rc;
657}
658RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
659
660
661RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
662{
663 const RTUTF16 wc = *pwsz;
664
665 /* simple */
666 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
667 return wc;
668 if (wc < 0xfffe)
669 {
670 /* surrogate pair */
671 if (wc < 0xdc00)
672 {
673 const RTUTF16 wc2 = pwsz[1];
674 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
675 {
676 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
677 return uc;
678 }
679
680 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
681 }
682 else
683 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
684 }
685 else
686 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
687 return RTUNICP_INVALID;
688}
689RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
690
691
692RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
693{
694 const RTUTF16 wc = **ppwsz;
695
696 /* simple */
697 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
698 {
699 (*ppwsz)++;
700 *pCp = wc;
701 return VINF_SUCCESS;
702 }
703
704 int rc;
705 if (wc < 0xfffe)
706 {
707 /* surrogate pair */
708 if (wc < 0xdc00)
709 {
710 const RTUTF16 wc2 = (*ppwsz)[1];
711 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
712 {
713 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
714 *pCp = uc;
715 (*ppwsz) += 2;
716 return VINF_SUCCESS;
717 }
718
719 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
720 }
721 else
722 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
723 rc = VERR_INVALID_UTF16_ENCODING;
724 }
725 else
726 {
727 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
728 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
729 }
730 *pCp = RTUNICP_INVALID;
731 (*ppwsz)++;
732 return rc;
733}
734RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
735
736
737RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
738{
739 /* simple */
740 if ( CodePoint < 0xd800
741 || ( CodePoint > 0xdfff
742 && CodePoint < 0xfffe))
743 {
744 *pwsz++ = (RTUTF16)CodePoint;
745 return pwsz;
746 }
747
748 /* surrogate pair */
749 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
750 {
751 CodePoint -= 0x10000;
752 *pwsz++ = 0xd800 | (CodePoint >> 10);
753 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
754 return pwsz;
755 }
756
757 /* invalid code point. */
758 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
759 *pwsz++ = 0x7f;
760 return pwsz;
761}
762RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
763
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette