VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 44300

Last change on this file since 44300 was 40938, checked in by vboxsync, 13 years ago

runtime: backed out r77481,r77482,r77483,r77484,r77485

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 30.8 KB
Line 
1/* $Id: utf-16.cpp 40938 2012-04-16 11:58:26Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
43{
44 if (pwszString)
45 RTMemTmpFree(pwszString);
46}
47RT_EXPORT_SYMBOL(RTUtf16Free);
48
49
50RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
51{
52 Assert(pwszString);
53 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
54 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
55 if (pwsz)
56 memcpy(pwsz, pwszString, cb);
57 return pwsz;
58}
59RT_EXPORT_SYMBOL(RTUtf16DupTag);
60
61
62RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
63{
64 Assert(pwszString);
65 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
66 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
67 if (pwsz)
68 {
69 memcpy(pwsz, pwszString, cb);
70 *ppwszString = pwsz;
71 return VINF_SUCCESS;
72 }
73 return VERR_NO_MEMORY;
74}
75RT_EXPORT_SYMBOL(RTUtf16DupExTag);
76
77
78RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
79{
80 if (!pwszString)
81 return 0;
82
83 PCRTUTF16 pwsz = pwszString;
84 while (*pwsz)
85 pwsz++;
86 return pwsz - pwszString;
87}
88RT_EXPORT_SYMBOL(RTUtf16Len);
89
90
91RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
92{
93 if (pwsz1 == pwsz2)
94 return 0;
95 if (!pwsz1)
96 return -1;
97 if (!pwsz2)
98 return 1;
99
100 for (;;)
101 {
102 register RTUTF16 wcs = *pwsz1;
103 register int iDiff = wcs - *pwsz2;
104 if (iDiff || !wcs)
105 return iDiff;
106 pwsz1++;
107 pwsz2++;
108 }
109}
110RT_EXPORT_SYMBOL(RTUtf16Cmp);
111
112
113RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
114{
115 if (pwsz1 == pwsz2)
116 return 0;
117 if (!pwsz1)
118 return -1;
119 if (!pwsz2)
120 return 1;
121
122 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
123 for (;;)
124 {
125 register RTUTF16 wc1 = *pwsz1;
126 register RTUTF16 wc2 = *pwsz2;
127 register int iDiff = wc1 - wc2;
128 if (iDiff)
129 {
130 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
131 if ( wc1 < 0xd800
132 || wc2 < 0xd800
133 || wc1 > 0xdfff
134 || wc2 > 0xdfff)
135 {
136 /* simple UCS-2 char */
137 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
138 if (iDiff)
139 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
140 }
141 else
142 {
143 /* a damned pair */
144 RTUNICP uc1;
145 RTUNICP uc2;
146 if (wc1 >= 0xdc00)
147 {
148 if (pwsz1Start == pwsz1)
149 return iDiff;
150 uc1 = pwsz1[-1];
151 if (uc1 < 0xd800 || uc1 >= 0xdc00)
152 return iDiff;
153 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
154 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
155 }
156 else
157 {
158 uc1 = *++pwsz1;
159 if (uc1 < 0xdc00 || uc1 >= 0xe000)
160 return iDiff;
161 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
162 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
163 }
164 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
165 if (iDiff)
166 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
167 }
168 if (iDiff)
169 return iDiff;
170 }
171 if (!wc1)
172 return 0;
173 pwsz1++;
174 pwsz2++;
175 }
176}
177RT_EXPORT_SYMBOL(RTUtf16ICmp);
178
179
180RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
181{
182 PRTUTF16 pwc = pwsz;
183 for (;;)
184 {
185 RTUTF16 wc = *pwc;
186 if (!wc)
187 break;
188 if (wc < 0xd800 || wc >= 0xdc00)
189 {
190 RTUNICP ucFolded = RTUniCpToLower(wc);
191 if (ucFolded < 0x10000)
192 *pwc++ = RTUniCpToLower(wc);
193 }
194 else
195 {
196 /* surrogate */
197 RTUTF16 wc2 = pwc[1];
198 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
199 {
200 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
201 RTUNICP ucFolded = RTUniCpToLower(uc);
202 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
203 {
204 uc -= 0x10000;
205 *pwc++ = 0xd800 | (uc >> 10);
206 *pwc++ = 0xdc00 | (uc & 0x3ff);
207 }
208 }
209 else /* invalid encoding. */
210 pwc++;
211 }
212 }
213 return pwsz;
214}
215RT_EXPORT_SYMBOL(RTUtf16ToLower);
216
217
218RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
219{
220 PRTUTF16 pwc = pwsz;
221 for (;;)
222 {
223 RTUTF16 wc = *pwc;
224 if (!wc)
225 break;
226 if (wc < 0xd800 || wc >= 0xdc00)
227 *pwc++ = RTUniCpToUpper(wc);
228 else
229 {
230 /* surrogate */
231 RTUTF16 wc2 = pwc[1];
232 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
233 {
234 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
235 RTUNICP ucFolded = RTUniCpToUpper(uc);
236 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
237 {
238 uc -= 0x10000;
239 *pwc++ = 0xd800 | (uc >> 10);
240 *pwc++ = 0xdc00 | (uc & 0x3ff);
241 }
242 }
243 else /* invalid encoding. */
244 pwc++;
245 }
246 }
247 return pwsz;
248}
249RT_EXPORT_SYMBOL(RTUtf16ToUpper);
250
251
252RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidSet, char chReplacement)
253{
254 size_t cReplacements = 0;
255 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
256 /* Validate the encoding. */
257 for (;;)
258 {
259 RTUNICP Cp;
260 PCRTUNICP pCp;
261 PRTUTF16 pwszOld = pwsz;
262 if (RT_FAILURE(RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp)))
263 return -1;
264 if (!Cp)
265 break;
266 for (pCp = puszValidSet; *pCp; pCp += 2)
267 {
268 AssertReturn(*(pCp + 1), -1);
269 if (*pCp <= Cp && *(pCp + 1) >= Cp) /* No, I won't do * and ++. */
270 break;
271 }
272 if (!*pCp)
273 {
274 for (; pwszOld != pwsz; ++pwszOld)
275 *pwszOld = chReplacement;
276 ++cReplacements;
277 }
278 }
279 return cReplacements;
280}
281RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
282
283
284/**
285 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
286 *
287 * @returns iprt status code.
288 * @param pwsz The UTF-16 string.
289 * @param cwc The max length of the UTF-16 string to consider.
290 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
291 */
292static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
293{
294 int rc = VINF_SUCCESS;
295 size_t cch = 0;
296 while (cwc > 0)
297 {
298 RTUTF16 wc = *pwsz++; cwc--;
299 if (!wc)
300 break;
301 else if (wc < 0xd800 || wc > 0xdfff)
302 {
303 if (wc < 0x80)
304 cch++;
305 else if (wc < 0x800)
306 cch += 2;
307 else if (wc < 0xfffe)
308 cch += 3;
309 else
310 {
311 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
312 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
313 break;
314 }
315 }
316 else
317 {
318 if (wc >= 0xdc00)
319 {
320 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
321 rc = VERR_INVALID_UTF16_ENCODING;
322 break;
323 }
324 if (cwc <= 0)
325 {
326 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
327 rc = VERR_INVALID_UTF16_ENCODING;
328 break;
329 }
330 wc = *pwsz++; cwc--;
331 if (wc < 0xdc00 || wc > 0xdfff)
332 {
333 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
334 rc = VERR_INVALID_UTF16_ENCODING;
335 break;
336 }
337 cch += 4;
338 }
339 }
340
341
342 /* done */
343 *pcch = cch;
344 return rc;
345}
346
347
348/**
349 * Recodes an valid UTF-16 string as UTF-8.
350 *
351 * @returns iprt status code.
352 * @param pwsz The UTF-16 string.
353 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
354 * will stop when cwc or '\\0' is reached.
355 * @param psz Where to store the UTF-8 string.
356 * @param cch The size of the UTF-8 buffer, excluding the terminator.
357 * @param pcch Where to store the number of octets actually encoded.
358 */
359static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
360{
361 unsigned char *pwch = (unsigned char *)psz;
362 int rc = VINF_SUCCESS;
363 while (cwc > 0)
364 {
365 RTUTF16 wc = *pwsz++; cwc--;
366 if (!wc)
367 break;
368 else if (wc < 0xd800 || wc > 0xdfff)
369 {
370 if (wc < 0x80)
371 {
372 if (RT_UNLIKELY(cch < 1))
373 {
374 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
375 rc = VERR_BUFFER_OVERFLOW;
376 break;
377 }
378 cch--;
379 *pwch++ = (unsigned char)wc;
380 }
381 else if (wc < 0x800)
382 {
383 if (RT_UNLIKELY(cch < 2))
384 {
385 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
386 rc = VERR_BUFFER_OVERFLOW;
387 break;
388 }
389 cch -= 2;
390 *pwch++ = 0xc0 | (wc >> 6);
391 *pwch++ = 0x80 | (wc & 0x3f);
392 }
393 else if (wc < 0xfffe)
394 {
395 if (RT_UNLIKELY(cch < 3))
396 {
397 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
398 rc = VERR_BUFFER_OVERFLOW;
399 break;
400 }
401 cch -= 3;
402 *pwch++ = 0xe0 | (wc >> 12);
403 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
404 *pwch++ = 0x80 | (wc & 0x3f);
405 }
406 else
407 {
408 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
409 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
410 break;
411 }
412 }
413 else
414 {
415 if (wc >= 0xdc00)
416 {
417 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
418 rc = VERR_INVALID_UTF16_ENCODING;
419 break;
420 }
421 if (cwc <= 0)
422 {
423 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
424 rc = VERR_INVALID_UTF16_ENCODING;
425 break;
426 }
427 RTUTF16 wc2 = *pwsz++; cwc--;
428 if (wc2 < 0xdc00 || wc2 > 0xdfff)
429 {
430 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
431 rc = VERR_INVALID_UTF16_ENCODING;
432 break;
433 }
434 uint32_t CodePoint = 0x10000
435 + ( ((wc & 0x3ff) << 10)
436 | (wc2 & 0x3ff));
437 if (RT_UNLIKELY(cch < 4))
438 {
439 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
440 rc = VERR_BUFFER_OVERFLOW;
441 break;
442 }
443 cch -= 4;
444 *pwch++ = 0xf0 | (CodePoint >> 18);
445 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
446 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
447 *pwch++ = 0x80 | (CodePoint & 0x3f);
448 }
449 }
450
451 /* done */
452 *pwch = '\0';
453 *pcch = (char *)pwch - psz;
454 return rc;
455}
456
457
458
459RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
460{
461 /*
462 * Validate input.
463 */
464 Assert(VALID_PTR(ppszString));
465 Assert(VALID_PTR(pwszString));
466 *ppszString = NULL;
467
468 /*
469 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
470 */
471 size_t cch;
472 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
473 if (RT_SUCCESS(rc))
474 {
475 /*
476 * Allocate buffer and recode it.
477 */
478 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
479 if (pszResult)
480 {
481 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
482 if (RT_SUCCESS(rc))
483 {
484 *ppszString = pszResult;
485 return rc;
486 }
487
488 RTMemFree(pszResult);
489 }
490 else
491 rc = VERR_NO_STR_MEMORY;
492 }
493 return rc;
494}
495RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
496
497
498RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
499{
500 /*
501 * Validate input.
502 */
503 Assert(VALID_PTR(pwszString));
504 Assert(VALID_PTR(ppsz));
505 Assert(!pcch || VALID_PTR(pcch));
506
507 /*
508 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
509 */
510 size_t cchResult;
511 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
512 if (RT_SUCCESS(rc))
513 {
514 if (pcch)
515 *pcch = cchResult;
516
517 /*
518 * Check buffer size / Allocate buffer and recode it.
519 */
520 bool fShouldFree;
521 char *pszResult;
522 if (cch > 0 && *ppsz)
523 {
524 fShouldFree = false;
525 if (RT_UNLIKELY(cch <= cchResult))
526 return VERR_BUFFER_OVERFLOW;
527 pszResult = *ppsz;
528 }
529 else
530 {
531 *ppsz = NULL;
532 fShouldFree = true;
533 cch = RT_MAX(cch, cchResult + 1);
534 pszResult = (char *)RTStrAllocTag(cch, pszTag);
535 }
536 if (pszResult)
537 {
538 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
539 if (RT_SUCCESS(rc))
540 {
541 *ppsz = pszResult;
542 return rc;
543 }
544
545 if (fShouldFree)
546 RTStrFree(pszResult);
547 }
548 else
549 rc = VERR_NO_STR_MEMORY;
550 }
551 return rc;
552}
553RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
554
555
556RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
557{
558 size_t cch;
559 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
560 return RT_SUCCESS(rc) ? cch : 0;
561}
562RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
563
564
565RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
566{
567 size_t cch;
568 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
569 if (pcch)
570 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
571 return rc;
572}
573RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
574
575
576RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
577{
578 const RTUTF16 wc = *pwsz;
579
580 /* simple */
581 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
582 return wc;
583 if (wc < 0xfffe)
584 {
585 /* surrogate pair */
586 if (wc < 0xdc00)
587 {
588 const RTUTF16 wc2 = pwsz[1];
589 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
590 {
591 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
592 return uc;
593 }
594
595 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
596 }
597 else
598 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
599 }
600 else
601 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
602 return RTUNICP_INVALID;
603}
604RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
605
606
607RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
608{
609 const RTUTF16 wc = **ppwsz;
610
611 /* simple */
612 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
613 {
614 (*ppwsz)++;
615 *pCp = wc;
616 return VINF_SUCCESS;
617 }
618
619 int rc;
620 if (wc < 0xfffe)
621 {
622 /* surrogate pair */
623 if (wc < 0xdc00)
624 {
625 const RTUTF16 wc2 = (*ppwsz)[1];
626 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
627 {
628 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
629 *pCp = uc;
630 (*ppwsz) += 2;
631 return VINF_SUCCESS;
632 }
633
634 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
635 }
636 else
637 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
638 rc = VERR_INVALID_UTF16_ENCODING;
639 }
640 else
641 {
642 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
643 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
644 }
645 *pCp = RTUNICP_INVALID;
646 (*ppwsz)++;
647 return rc;
648}
649RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
650
651
652RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
653{
654 /* simple */
655 if ( CodePoint < 0xd800
656 || ( CodePoint > 0xdfff
657 && CodePoint < 0xfffe))
658 {
659 *pwsz++ = (RTUTF16)CodePoint;
660 return pwsz;
661 }
662
663 /* surrogate pair */
664 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
665 {
666 CodePoint -= 0x10000;
667 *pwsz++ = 0xd800 | (CodePoint >> 10);
668 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
669 return pwsz;
670 }
671
672 /* invalid code point. */
673 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
674 *pwsz++ = 0x7f;
675 return pwsz;
676}
677RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
678
679
680/**
681 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding.
682 *
683 * @returns iprt status code.
684 * @param pwsz The UTF-16 string.
685 * @param cwc The max length of the UTF-16 string to consider.
686 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
687 */
688static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
689{
690 int rc = VINF_SUCCESS;
691 size_t cch = 0;
692 while (cwc > 0)
693 {
694 RTUTF16 wc = *pwsz++; cwc--;
695 if (!wc)
696 break;
697 else if (RT_LIKELY(wc < 0x100))
698 ++cch;
699 else
700 {
701 if (wc < 0xd800 || wc > 0xdfff)
702 {
703 if (wc >= 0xfffe)
704 {
705 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
706 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
707 break;
708 }
709 }
710 else
711 {
712 if (wc >= 0xdc00)
713 {
714 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
715 rc = VERR_INVALID_UTF16_ENCODING;
716 break;
717 }
718 if (cwc <= 0)
719 {
720 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
721 rc = VERR_INVALID_UTF16_ENCODING;
722 break;
723 }
724 wc = *pwsz++; cwc--;
725 if (wc < 0xdc00 || wc > 0xdfff)
726 {
727 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
728 rc = VERR_INVALID_UTF16_ENCODING;
729 break;
730 }
731 }
732
733 rc = VERR_NO_TRANSLATION;
734 break;
735 }
736 }
737
738 /* done */
739 *pcch = cch;
740 return rc;
741}
742
743
744/**
745 * Recodes an valid UTF-16 string as Latin1.
746 *
747 * @returns iprt status code.
748 * @param pwsz The UTF-16 string.
749 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
750 * will stop when cwc or '\\0' is reached.
751 * @param psz Where to store the Latin1 string.
752 * @param cch The size of the Latin1 buffer, excluding the terminator.
753 */
754static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch)
755{
756 unsigned char *pch = (unsigned char *)psz;
757 int rc = VINF_SUCCESS;
758 while (cwc > 0)
759 {
760 RTUTF16 wc = *pwsz++; cwc--;
761 if (!wc)
762 break;
763 if (RT_LIKELY(wc < 0x100))
764 {
765 if (RT_UNLIKELY(cch < 1))
766 {
767 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
768 rc = VERR_BUFFER_OVERFLOW;
769 break;
770 }
771 cch--;
772 *pch++ = (unsigned char)wc;
773 }
774 else
775 {
776 if (wc < 0xd800 || wc > 0xdfff)
777 {
778 if (wc >= 0xfffe)
779 {
780 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
781 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
782 break;
783 }
784 }
785 else
786 {
787 if (wc >= 0xdc00)
788 {
789 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
790 rc = VERR_INVALID_UTF16_ENCODING;
791 break;
792 }
793 if (cwc <= 0)
794 {
795 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
796 rc = VERR_INVALID_UTF16_ENCODING;
797 break;
798 }
799 RTUTF16 wc2 = *pwsz++; cwc--;
800 if (wc2 < 0xdc00 || wc2 > 0xdfff)
801 {
802 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
803 rc = VERR_INVALID_UTF16_ENCODING;
804 break;
805 }
806 }
807
808 rc = VERR_NO_TRANSLATION;
809 break;
810 }
811 }
812
813 /* done */
814 *pch = '\0';
815 return rc;
816}
817
818
819RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
820{
821 /*
822 * Validate input.
823 */
824 Assert(VALID_PTR(ppszString));
825 Assert(VALID_PTR(pwszString));
826 *ppszString = NULL;
827
828 /*
829 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
830 */
831 size_t cch;
832 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch);
833 if (RT_SUCCESS(rc))
834 {
835 /*
836 * Allocate buffer and recode it.
837 */
838 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
839 if (pszResult)
840 {
841 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch);
842 if (RT_SUCCESS(rc))
843 {
844 *ppszString = pszResult;
845 return rc;
846 }
847
848 RTMemFree(pszResult);
849 }
850 else
851 rc = VERR_NO_STR_MEMORY;
852 }
853 return rc;
854}
855RT_EXPORT_SYMBOL(RTUtf16ToLatin1Tag);
856
857
858RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
859{
860 /*
861 * Validate input.
862 */
863 AssertPtr(pwszString);
864 AssertPtr(ppsz);
865 AssertPtrNull(pcch);
866
867 /*
868 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
869 */
870 size_t cchResult;
871 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult);
872 if (RT_SUCCESS(rc))
873 {
874 if (pcch)
875 *pcch = cchResult;
876
877 /*
878 * Check buffer size / Allocate buffer and recode it.
879 */
880 bool fShouldFree;
881 char *pszResult;
882 if (cch > 0 && *ppsz)
883 {
884 fShouldFree = false;
885 if (cch <= cchResult)
886 return VERR_BUFFER_OVERFLOW;
887 pszResult = *ppsz;
888 }
889 else
890 {
891 *ppsz = NULL;
892 fShouldFree = true;
893 cch = RT_MAX(cch, cchResult + 1);
894 pszResult = (char *)RTMemAllocTag(cch, pszTag);
895 }
896 if (pszResult)
897 {
898 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1);
899 if (RT_SUCCESS(rc))
900 {
901 *ppsz = pszResult;
902 return rc;
903 }
904
905 if (fShouldFree)
906 RTMemFree(pszResult);
907 }
908 else
909 rc = VERR_NO_STR_MEMORY;
910 }
911 return rc;
912}
913RT_EXPORT_SYMBOL(RTUtf16ToLatin1ExTag);
914
915
916RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz)
917{
918 size_t cch;
919 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch);
920 return RT_SUCCESS(rc) ? cch : 0;
921}
922RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len);
923
924
925RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
926{
927 size_t cch;
928 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch);
929 if (pcch)
930 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
931 return rc;
932}
933RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx);
934
935
936/**
937 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the
938 * original length, but the function saves us nasty comments to that effect
939 * all over the place.
940 *
941 * @returns IPRT status code.
942 * @param psz Pointer to the Latin1 string.
943 * @param cch The max length of the string. (btw cch = cb)
944 * Use RTSTR_MAX if all of the string is to be examined.s
945 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
946 */
947static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
948{
949 *pcwc = RTStrNLen(psz, cch);
950 return VINF_SUCCESS;
951}
952
953
954/**
955 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
956 * sixteen bits, as Unicode is a superset of Latin1.
957 *
958 * Since we know the input is valid, we do *not* perform length checks.
959 *
960 * @returns iprt status code.
961 * @param psz The Latin1 string to recode.
962 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
963 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
964 * @param pwsz Where to store the UTF-16 string.
965 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
966 */
967static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
968{
969 int rc = VINF_SUCCESS;
970 const unsigned char *puch = (const unsigned char *)psz;
971 PRTUTF16 pwc = pwsz;
972 while (cch-- > 0)
973 {
974 /* read the next char and check for terminator. */
975 const unsigned char uch = *puch;
976 if (!uch)
977 break;
978
979 /* check for output overflow */
980 if (RT_UNLIKELY(cwc < 1))
981 {
982 rc = VERR_BUFFER_OVERFLOW;
983 break;
984 }
985
986 /* expand the code point */
987 *pwc++ = uch;
988 cwc--;
989 puch++;
990 }
991
992 /* done */
993 *pwc = '\0';
994 return rc;
995}
996
997
998RTDECL(int) RTLatin1ToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
999{
1000 /*
1001 * Validate input.
1002 */
1003 Assert(VALID_PTR(ppwszString));
1004 Assert(VALID_PTR(pszString));
1005 *ppwszString = NULL;
1006
1007 /*
1008 * Validate the input and calculate the length of the UTF-16 string.
1009 */
1010 size_t cwc;
1011 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
1012 if (RT_SUCCESS(rc))
1013 {
1014 /*
1015 * Allocate buffer.
1016 */
1017 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1018 if (pwsz)
1019 {
1020 /*
1021 * Encode the UTF-16 string.
1022 */
1023 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1024 if (RT_SUCCESS(rc))
1025 {
1026 *ppwszString = pwsz;
1027 return rc;
1028 }
1029 RTMemFree(pwsz);
1030 }
1031 else
1032 rc = VERR_NO_UTF16_MEMORY;
1033 }
1034 return rc;
1035}
1036RT_EXPORT_SYMBOL(RTLatin1ToUtf16Tag);
1037
1038
1039RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszString, size_t cchString,
1040 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1041{
1042 /*
1043 * Validate input.
1044 */
1045 Assert(VALID_PTR(pszString));
1046 Assert(VALID_PTR(ppwsz));
1047 Assert(!pcwc || VALID_PTR(pcwc));
1048
1049 /*
1050 * Validate the input and calculate the length of the UTF-16 string.
1051 */
1052 size_t cwcResult;
1053 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult);
1054 if (RT_SUCCESS(rc))
1055 {
1056 if (pcwc)
1057 *pcwc = cwcResult;
1058
1059 /*
1060 * Check buffer size / Allocate buffer.
1061 */
1062 bool fShouldFree;
1063 PRTUTF16 pwszResult;
1064 if (cwc > 0 && *ppwsz)
1065 {
1066 fShouldFree = false;
1067 if (cwc <= cwcResult)
1068 return VERR_BUFFER_OVERFLOW;
1069 pwszResult = *ppwsz;
1070 }
1071 else
1072 {
1073 *ppwsz = NULL;
1074 fShouldFree = true;
1075 cwc = RT_MAX(cwcResult + 1, cwc);
1076 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1077 }
1078 if (pwszResult)
1079 {
1080 /*
1081 * Encode the UTF-16 string.
1082 */
1083 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1084 if (RT_SUCCESS(rc))
1085 {
1086 *ppwsz = pwszResult;
1087 return rc;
1088 }
1089 if (fShouldFree)
1090 RTMemFree(pwszResult);
1091 }
1092 else
1093 rc = VERR_NO_UTF16_MEMORY;
1094 }
1095 return rc;
1096}
1097RT_EXPORT_SYMBOL(RTLatin1ToUtf16ExTag);
1098
1099
1100RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz)
1101{
1102 size_t cwc;
1103 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc);
1104 return RT_SUCCESS(rc) ? cwc : 0;
1105}
1106RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len);
1107
1108
1109RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1110{
1111 size_t cwc;
1112 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc);
1113 if (pcwc)
1114 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1115 return rc;
1116}
1117RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette