VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 66731

Last change on this file since 66731 was 66731, checked in by vboxsync, 8 years ago

iprt/string.h,iprt/utf16.h: Added some big endian UTF-16 related functions/features.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 22.0 KB
Line 
1/* $Id: utf-16.cpp 66731 2017-05-01 23:21:06Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2016 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/asm.h>
36#include <iprt/mem.h>
37#include <iprt/assert.h>
38#include <iprt/err.h>
39#include "internal/string.h"
40
41
42/**
43 * Get get length in code points of an UTF-16 encoded string, validating the
44 * string while doing so.
45 *
46 * @returns IPRT status code.
47 * @param pwsz Pointer to the UTF-16 string.
48 * @param cwc The max length of the string in UTF-16 units. Use
49 * RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcwcActual Where to store the actual size of the UTF-16 string
52 * on success. Optional.
53 */
54static int rtUtf16Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcuc, size_t *pcwcActual)
55{
56 PCRTUTF16 pwszStart = pwsz;
57 size_t cCodePoints = 0;
58 while (cwc > 0)
59 {
60 RTUTF16 wc = *pwsz;
61 if (!wc)
62 break;
63 if (wc < 0xd800 || wc > 0xdfff)
64 {
65 cCodePoints++;
66 pwsz++;
67 cwc--;
68 }
69 /* Surrogate pair: */
70 else if (wc >= 0xdc00)
71 {
72 RTStrAssertMsgFailed(("Lone UTF-16 trail surrogate: %#x (%.*Rhxs)\n", wc, RT_MIN(cwc * 2, 10), pwsz));
73 return VERR_INVALID_UTF16_ENCODING;
74 }
75 else if (cwc < 2)
76 {
77 RTStrAssertMsgFailed(("Lone UTF-16 lead surrogate: %#x\n", wc));
78 return VERR_INVALID_UTF16_ENCODING;
79 }
80 else
81 {
82 RTUTF16 wcTrail = pwsz[1];
83 if (wcTrail < 0xdc00 || wcTrail > 0xdfff)
84 {
85 RTStrAssertMsgFailed(("Invalid UTF-16 trail surrogate: %#x (lead %#x)\n", wcTrail, wc));
86 return VERR_INVALID_UTF16_ENCODING;
87 }
88
89 cCodePoints++;
90 pwsz += 2;
91 cwc -= 2;
92 }
93 }
94
95 /* done */
96 *pcuc = cCodePoints;
97 if (pcwcActual)
98 *pcwcActual = pwsz - pwszStart;
99 return VINF_SUCCESS;
100}
101
102
103RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag)
104{
105 if (cb > sizeof(RTUTF16))
106 cb = RT_ALIGN_Z(cb, sizeof(RTUTF16));
107 else
108 cb = sizeof(RTUTF16);
109 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
110 if (pwsz)
111 *pwsz = '\0';
112 return pwsz;
113}
114RT_EXPORT_SYMBOL(RTUtf16AllocTag);
115
116
117RTDECL(int) RTUtf16ReallocTag(PRTUTF16 *ppwsz, size_t cbNew, const char *pszTag)
118{
119 PRTUTF16 pwszOld = *ppwsz;
120 cbNew = RT_ALIGN_Z(cbNew, sizeof(RTUTF16));
121 if (!cbNew)
122 {
123 RTMemFree(pwszOld);
124 *ppwsz = NULL;
125 }
126 else if (pwszOld)
127 {
128 PRTUTF16 pwszNew = (PRTUTF16)RTMemReallocTag(pwszOld, cbNew, pszTag);
129 if (!pwszNew)
130 return VERR_NO_STR_MEMORY;
131 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
132 *ppwsz = pwszNew;
133 }
134 else
135 {
136 PRTUTF16 pwszNew = (PRTUTF16)RTMemAllocTag(cbNew, pszTag);
137 if (!pwszNew)
138 return VERR_NO_UTF16_MEMORY;
139 pwszNew[0] = '\0';
140 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
141 *ppwsz = pwszNew;
142 }
143 return VINF_SUCCESS;
144}
145RT_EXPORT_SYMBOL(RTUtf16ReallocTag);
146
147
148RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
149{
150 if (pwszString)
151 RTMemTmpFree(pwszString);
152}
153RT_EXPORT_SYMBOL(RTUtf16Free);
154
155
156RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
157{
158 Assert(pwszString);
159 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
160 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
161 if (pwsz)
162 memcpy(pwsz, pwszString, cb);
163 return pwsz;
164}
165RT_EXPORT_SYMBOL(RTUtf16DupTag);
166
167
168RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
169{
170 Assert(pwszString);
171 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
172 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
173 if (pwsz)
174 {
175 memcpy(pwsz, pwszString, cb);
176 *ppwszString = pwsz;
177 return VINF_SUCCESS;
178 }
179 return VERR_NO_MEMORY;
180}
181RT_EXPORT_SYMBOL(RTUtf16DupExTag);
182
183
184RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
185{
186 if (!pwszString)
187 return 0;
188
189 PCRTUTF16 pwsz = pwszString;
190 while (*pwsz)
191 pwsz++;
192 return pwsz - pwszString;
193}
194RT_EXPORT_SYMBOL(RTUtf16Len);
195
196
197RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
198{
199 if (pwsz1 == pwsz2)
200 return 0;
201 if (!pwsz1)
202 return -1;
203 if (!pwsz2)
204 return 1;
205
206 for (;;)
207 {
208 register RTUTF16 wcs = *pwsz1;
209 register int iDiff = wcs - *pwsz2;
210 if (iDiff || !wcs)
211 return iDiff;
212 pwsz1++;
213 pwsz2++;
214 }
215}
216RT_EXPORT_SYMBOL(RTUtf16Cmp);
217
218
219RTDECL(int) RTUtf16CmpUtf8(PCRTUTF16 pwsz1, const char *psz2)
220{
221 /*
222 * NULL and empty strings are all the same.
223 */
224 if (!pwsz1)
225 return !psz2 || !*psz2 ? 0 : -1;
226 if (!psz2)
227 return !*pwsz1 ? 0 : 1;
228
229 /*
230 * Compare with a UTF-8 string by enumerating them char by char.
231 */
232 for (;;)
233 {
234 RTUNICP uc1;
235 int rc = RTUtf16GetCpEx(&pwsz1, &uc1);
236 AssertRCReturn(rc, 1);
237
238 RTUNICP uc2;
239 rc = RTStrGetCpEx(&psz2, &uc2);
240 AssertRCReturn(rc, -1);
241 if (uc1 == uc2)
242 {
243 if (uc1)
244 continue;
245 return 0;
246 }
247 return uc1 < uc2 ? -1 : 1;
248 }
249}
250RT_EXPORT_SYMBOL(RTUtf16CmpUtf8);
251
252
253RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz)
254{
255 return RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
256}
257RT_EXPORT_SYMBOL(RTUtf16ValidateEncoding);
258
259
260RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags)
261{
262 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
263 VERR_INVALID_PARAMETER);
264 AssertPtr(pwsz);
265
266 /*
267 * Use rtUtf16Length for the job.
268 */
269 size_t cwcActual = 0; /* Shut up cc1plus. */
270 size_t cCpsIgnored;
271 int rc = rtUtf16Length(pwsz, cwc, &cCpsIgnored, &cwcActual);
272 if (RT_SUCCESS(rc))
273 {
274 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
275 {
276 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
277 cwcActual++;
278 if (cwcActual == cwc)
279 rc = VINF_SUCCESS;
280 else if (cwcActual < cwc)
281 rc = VERR_BUFFER_UNDERFLOW;
282 else
283 rc = VERR_BUFFER_OVERFLOW;
284 }
285 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
286 && cwcActual >= cwc)
287 rc = VERR_BUFFER_OVERFLOW;
288 }
289 return rc;
290}
291RT_EXPORT_SYMBOL(RTUtf16ValidateEncodingEx);
292
293
294RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz)
295{
296 int rc = RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
297 return RT_SUCCESS(rc);
298}
299RT_EXPORT_SYMBOL(RTUtf16IsValidEncoding);
300
301
302/**
303 * Helper for RTUtf16PurgeComplementSet.
304 *
305 * @returns true if @a Cp is valid, false if not.
306 * @param Cp The code point to validate.
307 * @param puszValidPairs Pair of valid code point sets.
308 * @param cValidPairs Number of pairs.
309 */
310DECLINLINE(bool) rtUtf16PurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
311{
312 while (cValidPairs-- > 0)
313 {
314 if ( Cp >= puszValidPairs[0]
315 && Cp <= puszValidPairs[1])
316 return true;
317 puszValidPairs += 2;
318 }
319 return false;
320}
321
322
323RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidPairs, char chReplacement)
324{
325 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
326
327 /*
328 * Calc valid pairs and check that we've got an even number.
329 */
330 uint32_t cValidPairs = 0;
331 while (puszValidPairs[cValidPairs * 2])
332 {
333 AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
334 AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
335 ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
336 cValidPairs++;
337 }
338
339 /*
340 * Do the replacing.
341 */
342 ssize_t cReplacements = 0;
343 for (;;)
344 {
345 PRTUTF16 pwszCur = pwsz;
346 RTUNICP Cp;
347 int rc = RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp);
348 if (RT_SUCCESS(rc))
349 {
350 if (Cp)
351 {
352 if (!rtUtf16PurgeIsInSet(Cp, puszValidPairs, cValidPairs))
353 {
354 for (; pwszCur != pwsz; ++pwszCur)
355 *pwszCur = chReplacement;
356 ++cReplacements;
357 }
358 }
359 else
360 break;
361 }
362 else
363 return -1;
364 }
365 return cReplacements;
366}
367RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
368
369
370/**
371 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
372 *
373 * @returns iprt status code.
374 * @param pwsz The UTF-16 string.
375 * @param cwc The max length of the UTF-16 string to consider.
376 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
377 */
378static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
379{
380 int rc = VINF_SUCCESS;
381 size_t cch = 0;
382 while (cwc > 0)
383 {
384 RTUTF16 wc = *pwsz++; cwc--;
385 if (!wc)
386 break;
387 else if (wc < 0xd800 || wc > 0xdfff)
388 {
389 if (wc < 0x80)
390 cch++;
391 else if (wc < 0x800)
392 cch += 2;
393 else if (wc < 0xfffe)
394 cch += 3;
395 else
396 {
397 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
398 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
399 break;
400 }
401 }
402 else
403 {
404 if (wc >= 0xdc00)
405 {
406 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
407 rc = VERR_INVALID_UTF16_ENCODING;
408 break;
409 }
410 if (cwc <= 0)
411 {
412 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
413 rc = VERR_INVALID_UTF16_ENCODING;
414 break;
415 }
416 wc = *pwsz++; cwc--;
417 if (wc < 0xdc00 || wc > 0xdfff)
418 {
419 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
420 rc = VERR_INVALID_UTF16_ENCODING;
421 break;
422 }
423 cch += 4;
424 }
425 }
426
427
428 /* done */
429 *pcch = cch;
430 return rc;
431}
432
433
434/**
435 * Recodes an valid UTF-16 string as UTF-8.
436 *
437 * @returns iprt status code.
438 * @param pwsz The UTF-16 string.
439 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
440 * will stop when cwc or '\\0' is reached.
441 * @param psz Where to store the UTF-8 string.
442 * @param cch The size of the UTF-8 buffer, excluding the terminator.
443 * @param pcch Where to store the number of octets actually encoded.
444 */
445static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
446{
447 unsigned char *pwch = (unsigned char *)psz;
448 int rc = VINF_SUCCESS;
449 while (cwc > 0)
450 {
451 RTUTF16 wc = *pwsz++; cwc--;
452 if (!wc)
453 break;
454 else if (wc < 0xd800 || wc > 0xdfff)
455 {
456 if (wc < 0x80)
457 {
458 if (RT_UNLIKELY(cch < 1))
459 {
460 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
461 rc = VERR_BUFFER_OVERFLOW;
462 break;
463 }
464 cch--;
465 *pwch++ = (unsigned char)wc;
466 }
467 else if (wc < 0x800)
468 {
469 if (RT_UNLIKELY(cch < 2))
470 {
471 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
472 rc = VERR_BUFFER_OVERFLOW;
473 break;
474 }
475 cch -= 2;
476 *pwch++ = 0xc0 | (wc >> 6);
477 *pwch++ = 0x80 | (wc & 0x3f);
478 }
479 else if (wc < 0xfffe)
480 {
481 if (RT_UNLIKELY(cch < 3))
482 {
483 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
484 rc = VERR_BUFFER_OVERFLOW;
485 break;
486 }
487 cch -= 3;
488 *pwch++ = 0xe0 | (wc >> 12);
489 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
490 *pwch++ = 0x80 | (wc & 0x3f);
491 }
492 else
493 {
494 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
495 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
496 break;
497 }
498 }
499 else
500 {
501 if (wc >= 0xdc00)
502 {
503 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
504 rc = VERR_INVALID_UTF16_ENCODING;
505 break;
506 }
507 if (cwc <= 0)
508 {
509 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
510 rc = VERR_INVALID_UTF16_ENCODING;
511 break;
512 }
513 RTUTF16 wc2 = *pwsz++; cwc--;
514 if (wc2 < 0xdc00 || wc2 > 0xdfff)
515 {
516 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
517 rc = VERR_INVALID_UTF16_ENCODING;
518 break;
519 }
520 uint32_t CodePoint = 0x10000
521 + ( ((wc & 0x3ff) << 10)
522 | (wc2 & 0x3ff));
523 if (RT_UNLIKELY(cch < 4))
524 {
525 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
526 rc = VERR_BUFFER_OVERFLOW;
527 break;
528 }
529 cch -= 4;
530 *pwch++ = 0xf0 | (CodePoint >> 18);
531 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
532 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
533 *pwch++ = 0x80 | (CodePoint & 0x3f);
534 }
535 }
536
537 /* done */
538 *pwch = '\0';
539 *pcch = (char *)pwch - psz;
540 return rc;
541}
542
543
544
545RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
546{
547 /*
548 * Validate input.
549 */
550 Assert(VALID_PTR(ppszString));
551 Assert(VALID_PTR(pwszString));
552 *ppszString = NULL;
553
554 /*
555 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
556 */
557 size_t cch;
558 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
559 if (RT_SUCCESS(rc))
560 {
561 /*
562 * Allocate buffer and recode it.
563 */
564 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
565 if (pszResult)
566 {
567 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
568 if (RT_SUCCESS(rc))
569 {
570 *ppszString = pszResult;
571 return rc;
572 }
573
574 RTMemFree(pszResult);
575 }
576 else
577 rc = VERR_NO_STR_MEMORY;
578 }
579 return rc;
580}
581RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
582
583
584RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
585{
586 /*
587 * Validate input.
588 */
589 Assert(VALID_PTR(pwszString));
590 Assert(VALID_PTR(ppsz));
591 Assert(!pcch || VALID_PTR(pcch));
592
593 /*
594 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
595 */
596 size_t cchResult;
597 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
598 if (RT_SUCCESS(rc))
599 {
600 if (pcch)
601 *pcch = cchResult;
602
603 /*
604 * Check buffer size / Allocate buffer and recode it.
605 */
606 bool fShouldFree;
607 char *pszResult;
608 if (cch > 0 && *ppsz)
609 {
610 fShouldFree = false;
611 if (RT_UNLIKELY(cch <= cchResult))
612 return VERR_BUFFER_OVERFLOW;
613 pszResult = *ppsz;
614 }
615 else
616 {
617 *ppsz = NULL;
618 fShouldFree = true;
619 cch = RT_MAX(cch, cchResult + 1);
620 pszResult = (char *)RTStrAllocTag(cch, pszTag);
621 }
622 if (pszResult)
623 {
624 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
625 if (RT_SUCCESS(rc))
626 {
627 *ppsz = pszResult;
628 return rc;
629 }
630
631 if (fShouldFree)
632 RTStrFree(pszResult);
633 }
634 else
635 rc = VERR_NO_STR_MEMORY;
636 }
637 return rc;
638}
639RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
640
641
642RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
643{
644 size_t cch;
645 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
646 return RT_SUCCESS(rc) ? cch : 0;
647}
648RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
649
650
651RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
652{
653 size_t cch;
654 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
655 if (pcch)
656 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
657 return rc;
658}
659RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
660
661
662RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
663{
664 const RTUTF16 wc = *pwsz;
665
666 /* simple */
667 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
668 return wc;
669 if (wc < 0xfffe)
670 {
671 /* surrogate pair */
672 if (wc < 0xdc00)
673 {
674 const RTUTF16 wc2 = pwsz[1];
675 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
676 {
677 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
678 return uc;
679 }
680
681 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
682 }
683 else
684 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
685 }
686 else
687 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
688 return RTUNICP_INVALID;
689}
690RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
691
692
693RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
694{
695 const RTUTF16 wc = **ppwsz;
696
697 /* simple */
698 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
699 {
700 (*ppwsz)++;
701 *pCp = wc;
702 return VINF_SUCCESS;
703 }
704
705 int rc;
706 if (wc < 0xfffe)
707 {
708 /* surrogate pair */
709 if (wc < 0xdc00)
710 {
711 const RTUTF16 wc2 = (*ppwsz)[1];
712 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
713 {
714 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
715 *pCp = uc;
716 (*ppwsz) += 2;
717 return VINF_SUCCESS;
718 }
719
720 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
721 }
722 else
723 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
724 rc = VERR_INVALID_UTF16_ENCODING;
725 }
726 else
727 {
728 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
729 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
730 }
731 *pCp = RTUNICP_INVALID;
732 (*ppwsz)++;
733 return rc;
734}
735RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
736
737
738RTDECL(int) RTUtf16BigGetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
739{
740 const RTUTF16 wc = RT_BE2H_U16(**ppwsz);
741
742 /* simple */
743 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
744 {
745 (*ppwsz)++;
746 *pCp = wc;
747 return VINF_SUCCESS;
748 }
749
750 int rc;
751 if (wc < 0xfffe)
752 {
753 /* surrogate pair */
754 if (wc < 0xdc00)
755 {
756 const RTUTF16 wc2 = RT_BE2H_U16((*ppwsz)[1]);
757 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
758 {
759 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
760 *pCp = uc;
761 (*ppwsz) += 2;
762 return VINF_SUCCESS;
763 }
764
765 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
766 }
767 else
768 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
769 rc = VERR_INVALID_UTF16_ENCODING;
770 }
771 else
772 {
773 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
774 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
775 }
776 *pCp = RTUNICP_INVALID;
777 (*ppwsz)++;
778 return rc;
779}
780RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
781
782
783RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
784{
785 /* simple */
786 if ( CodePoint < 0xd800
787 || ( CodePoint > 0xdfff
788 && CodePoint < 0xfffe))
789 {
790 *pwsz++ = (RTUTF16)CodePoint;
791 return pwsz;
792 }
793
794 /* surrogate pair */
795 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
796 {
797 CodePoint -= 0x10000;
798 *pwsz++ = 0xd800 | (CodePoint >> 10);
799 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
800 return pwsz;
801 }
802
803 /* invalid code point. */
804 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
805 *pwsz++ = 0x7f;
806 return pwsz;
807}
808RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
809
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette