VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 34079

Last change on this file since 34079 was 31157, checked in by vboxsync, 14 years ago

iprt,++: Tag allocation in all builds with a string, defaulting to FILE.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 29.9 KB
Line 
1/* $Id: utf-16.cpp 31157 2010-07-28 03:15:35Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
43{
44 if (pwszString)
45 RTMemTmpFree(pwszString);
46}
47RT_EXPORT_SYMBOL(RTUtf16Free);
48
49
50RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
51{
52 Assert(pwszString);
53 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
54 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
55 if (pwsz)
56 memcpy(pwsz, pwszString, cb);
57 return pwsz;
58}
59RT_EXPORT_SYMBOL(RTUtf16DupTag);
60
61
62RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
63{
64 Assert(pwszString);
65 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
66 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
67 if (pwsz)
68 {
69 memcpy(pwsz, pwszString, cb);
70 *ppwszString = pwsz;
71 return VINF_SUCCESS;
72 }
73 return VERR_NO_MEMORY;
74}
75RT_EXPORT_SYMBOL(RTUtf16DupExTag);
76
77
78RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
79{
80 if (!pwszString)
81 return 0;
82
83 PCRTUTF16 pwsz = pwszString;
84 while (*pwsz)
85 pwsz++;
86 return pwsz - pwszString;
87}
88RT_EXPORT_SYMBOL(RTUtf16Len);
89
90
91RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
92{
93 if (pwsz1 == pwsz2)
94 return 0;
95 if (!pwsz1)
96 return -1;
97 if (!pwsz2)
98 return 1;
99
100 for (;;)
101 {
102 register RTUTF16 wcs = *pwsz1;
103 register int iDiff = wcs - *pwsz2;
104 if (iDiff || !wcs)
105 return iDiff;
106 pwsz1++;
107 pwsz2++;
108 }
109}
110RT_EXPORT_SYMBOL(RTUtf16Cmp);
111
112
113RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
114{
115 if (pwsz1 == pwsz2)
116 return 0;
117 if (!pwsz1)
118 return -1;
119 if (!pwsz2)
120 return 1;
121
122 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
123 for (;;)
124 {
125 register RTUTF16 wc1 = *pwsz1;
126 register RTUTF16 wc2 = *pwsz2;
127 register int iDiff = wc1 - wc2;
128 if (iDiff)
129 {
130 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
131 if ( wc1 < 0xd800
132 || wc2 < 0xd800
133 || wc1 > 0xdfff
134 || wc2 > 0xdfff)
135 {
136 /* simple UCS-2 char */
137 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
138 if (iDiff)
139 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
140 }
141 else
142 {
143 /* a damned pair */
144 RTUNICP uc1;
145 RTUNICP uc2;
146 if (wc1 >= 0xdc00)
147 {
148 if (pwsz1Start == pwsz1)
149 return iDiff;
150 uc1 = pwsz1[-1];
151 if (uc1 < 0xd800 || uc1 >= 0xdc00)
152 return iDiff;
153 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
154 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
155 }
156 else
157 {
158 uc1 = *++pwsz1;
159 if (uc1 < 0xdc00 || uc1 >= 0xe000)
160 return iDiff;
161 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
162 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
163 }
164 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
165 if (iDiff)
166 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
167 }
168 if (iDiff)
169 return iDiff;
170 }
171 if (!wc1)
172 return 0;
173 pwsz1++;
174 pwsz2++;
175 }
176}
177RT_EXPORT_SYMBOL(RTUtf16ICmp);
178
179
180RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
181{
182 PRTUTF16 pwc = pwsz;
183 for (;;)
184 {
185 RTUTF16 wc = *pwc;
186 if (!wc)
187 break;
188 if (wc < 0xd800 || wc >= 0xdc00)
189 {
190 RTUNICP ucFolded = RTUniCpToLower(wc);
191 if (ucFolded < 0x10000)
192 *pwc++ = RTUniCpToLower(wc);
193 }
194 else
195 {
196 /* surrogate */
197 RTUTF16 wc2 = pwc[1];
198 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
199 {
200 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
201 RTUNICP ucFolded = RTUniCpToLower(uc);
202 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
203 {
204 uc -= 0x10000;
205 *pwc++ = 0xd800 | (uc >> 10);
206 *pwc++ = 0xdc00 | (uc & 0x3ff);
207 }
208 }
209 else /* invalid encoding. */
210 pwc++;
211 }
212 }
213 return pwsz;
214}
215RT_EXPORT_SYMBOL(RTUtf16ToLower);
216
217
218RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
219{
220 PRTUTF16 pwc = pwsz;
221 for (;;)
222 {
223 RTUTF16 wc = *pwc;
224 if (!wc)
225 break;
226 if (wc < 0xd800 || wc >= 0xdc00)
227 *pwc++ = RTUniCpToUpper(wc);
228 else
229 {
230 /* surrogate */
231 RTUTF16 wc2 = pwc[1];
232 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
233 {
234 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
235 RTUNICP ucFolded = RTUniCpToUpper(uc);
236 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
237 {
238 uc -= 0x10000;
239 *pwc++ = 0xd800 | (uc >> 10);
240 *pwc++ = 0xdc00 | (uc & 0x3ff);
241 }
242 }
243 else /* invalid encoding. */
244 pwc++;
245 }
246 }
247 return pwsz;
248}
249RT_EXPORT_SYMBOL(RTUtf16ToUpper);
250
251
252/**
253 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
254 *
255 * @returns iprt status code.
256 * @param pwsz The UTF-16 string.
257 * @param cwc The max length of the UTF-16 string to consider.
258 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
259 */
260static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
261{
262 int rc = VINF_SUCCESS;
263 size_t cch = 0;
264 while (cwc > 0)
265 {
266 RTUTF16 wc = *pwsz++; cwc--;
267 if (!wc)
268 break;
269 else if (wc < 0xd800 || wc > 0xdfff)
270 {
271 if (wc < 0x80)
272 cch++;
273 else if (wc < 0x800)
274 cch += 2;
275 else if (wc < 0xfffe)
276 cch += 3;
277 else
278 {
279 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
280 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
281 break;
282 }
283 }
284 else
285 {
286 if (wc >= 0xdc00)
287 {
288 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
289 rc = VERR_INVALID_UTF16_ENCODING;
290 break;
291 }
292 if (cwc <= 0)
293 {
294 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
295 rc = VERR_INVALID_UTF16_ENCODING;
296 break;
297 }
298 wc = *pwsz++; cwc--;
299 if (wc < 0xdc00 || wc > 0xdfff)
300 {
301 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
302 rc = VERR_INVALID_UTF16_ENCODING;
303 break;
304 }
305 cch += 4;
306 }
307 }
308
309
310 /* done */
311 *pcch = cch;
312 return rc;
313}
314
315
316/**
317 * Recodes an valid UTF-16 string as UTF-8.
318 *
319 * @returns iprt status code.
320 * @param pwsz The UTF-16 string.
321 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
322 * will stop when cwc or '\\0' is reached.
323 * @param psz Where to store the UTF-8 string.
324 * @param cch The size of the UTF-8 buffer, excluding the terminator.
325 * @param pcch Where to store the number of octets actually encoded.
326 */
327static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
328{
329 unsigned char *pwch = (unsigned char *)psz;
330 int rc = VINF_SUCCESS;
331 while (cwc > 0)
332 {
333 RTUTF16 wc = *pwsz++; cwc--;
334 if (!wc)
335 break;
336 else if (wc < 0xd800 || wc > 0xdfff)
337 {
338 if (wc < 0x80)
339 {
340 if (RT_UNLIKELY(cch < 1))
341 {
342 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
343 rc = VERR_BUFFER_OVERFLOW;
344 break;
345 }
346 cch--;
347 *pwch++ = (unsigned char)wc;
348 }
349 else if (wc < 0x800)
350 {
351 if (RT_UNLIKELY(cch < 2))
352 {
353 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
354 rc = VERR_BUFFER_OVERFLOW;
355 break;
356 }
357 cch -= 2;
358 *pwch++ = 0xc0 | (wc >> 6);
359 *pwch++ = 0x80 | (wc & 0x3f);
360 }
361 else if (wc < 0xfffe)
362 {
363 if (RT_UNLIKELY(cch < 3))
364 {
365 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
366 rc = VERR_BUFFER_OVERFLOW;
367 break;
368 }
369 cch -= 3;
370 *pwch++ = 0xe0 | (wc >> 12);
371 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
372 *pwch++ = 0x80 | (wc & 0x3f);
373 }
374 else
375 {
376 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
377 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
378 break;
379 }
380 }
381 else
382 {
383 if (wc >= 0xdc00)
384 {
385 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
386 rc = VERR_INVALID_UTF16_ENCODING;
387 break;
388 }
389 if (cwc <= 0)
390 {
391 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
392 rc = VERR_INVALID_UTF16_ENCODING;
393 break;
394 }
395 RTUTF16 wc2 = *pwsz++; cwc--;
396 if (wc2 < 0xdc00 || wc2 > 0xdfff)
397 {
398 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
399 rc = VERR_INVALID_UTF16_ENCODING;
400 break;
401 }
402 uint32_t CodePoint = 0x10000
403 + ( ((wc & 0x3ff) << 10)
404 | (wc2 & 0x3ff));
405 if (RT_UNLIKELY(cch < 4))
406 {
407 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
408 rc = VERR_BUFFER_OVERFLOW;
409 break;
410 }
411 cch -= 4;
412 *pwch++ = 0xf0 | (CodePoint >> 18);
413 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
414 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
415 *pwch++ = 0x80 | (CodePoint & 0x3f);
416 }
417 }
418
419 /* done */
420 *pwch = '\0';
421 *pcch = (char *)pwch - psz;
422 return rc;
423}
424
425
426
427RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
428{
429 /*
430 * Validate input.
431 */
432 Assert(VALID_PTR(ppszString));
433 Assert(VALID_PTR(pwszString));
434 *ppszString = NULL;
435
436 /*
437 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
438 */
439 size_t cch;
440 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
441 if (RT_SUCCESS(rc))
442 {
443 /*
444 * Allocate buffer and recode it.
445 */
446 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
447 if (pszResult)
448 {
449 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
450 if (RT_SUCCESS(rc))
451 {
452 *ppszString = pszResult;
453 return rc;
454 }
455
456 RTMemFree(pszResult);
457 }
458 else
459 rc = VERR_NO_STR_MEMORY;
460 }
461 return rc;
462}
463RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
464
465
466RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
467{
468 /*
469 * Validate input.
470 */
471 Assert(VALID_PTR(pwszString));
472 Assert(VALID_PTR(ppsz));
473 Assert(!pcch || VALID_PTR(pcch));
474
475 /*
476 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
477 */
478 size_t cchResult;
479 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
480 if (RT_SUCCESS(rc))
481 {
482 if (pcch)
483 *pcch = cchResult;
484
485 /*
486 * Check buffer size / Allocate buffer and recode it.
487 */
488 bool fShouldFree;
489 char *pszResult;
490 if (cch > 0 && *ppsz)
491 {
492 fShouldFree = false;
493 if (RT_UNLIKELY(cch <= cchResult))
494 return VERR_BUFFER_OVERFLOW;
495 pszResult = *ppsz;
496 }
497 else
498 {
499 *ppsz = NULL;
500 fShouldFree = true;
501 cch = RT_MAX(cch, cchResult + 1);
502 pszResult = (char *)RTStrAllocTag(cch, pszTag);
503 }
504 if (pszResult)
505 {
506 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
507 if (RT_SUCCESS(rc))
508 {
509 *ppsz = pszResult;
510 return rc;
511 }
512
513 if (fShouldFree)
514 RTStrFree(pszResult);
515 }
516 else
517 rc = VERR_NO_STR_MEMORY;
518 }
519 return rc;
520}
521RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
522
523
524RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
525{
526 size_t cch;
527 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
528 return RT_SUCCESS(rc) ? cch : 0;
529}
530RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
531
532
533RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
534{
535 size_t cch;
536 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
537 if (pcch)
538 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
539 return rc;
540}
541RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
542
543
544RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
545{
546 const RTUTF16 wc = *pwsz;
547
548 /* simple */
549 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
550 return wc;
551 if (wc < 0xfffe)
552 {
553 /* surrogate pair */
554 if (wc < 0xdc00)
555 {
556 const RTUTF16 wc2 = pwsz[1];
557 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
558 {
559 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
560 return uc;
561 }
562
563 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
564 }
565 else
566 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
567 }
568 else
569 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
570 return RTUNICP_INVALID;
571}
572RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
573
574
575RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
576{
577 const RTUTF16 wc = **ppwsz;
578
579 /* simple */
580 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
581 {
582 (*ppwsz)++;
583 *pCp = wc;
584 return VINF_SUCCESS;
585 }
586
587 int rc;
588 if (wc < 0xfffe)
589 {
590 /* surrogate pair */
591 if (wc < 0xdc00)
592 {
593 const RTUTF16 wc2 = (*ppwsz)[1];
594 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
595 {
596 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
597 *pCp = uc;
598 (*ppwsz) += 2;
599 return VINF_SUCCESS;
600 }
601
602 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
603 }
604 else
605 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
606 rc = VERR_INVALID_UTF16_ENCODING;
607 }
608 else
609 {
610 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
611 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
612 }
613 *pCp = RTUNICP_INVALID;
614 (*ppwsz)++;
615 return rc;
616}
617RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
618
619
620RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
621{
622 /* simple */
623 if ( CodePoint < 0xd800
624 || ( CodePoint > 0xdfff
625 && CodePoint < 0xfffe))
626 {
627 *pwsz++ = (RTUTF16)CodePoint;
628 return pwsz;
629 }
630
631 /* surrogate pair */
632 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
633 {
634 CodePoint -= 0x10000;
635 *pwsz++ = 0xd800 | (CodePoint >> 10);
636 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
637 return pwsz;
638 }
639
640 /* invalid code point. */
641 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
642 *pwsz++ = 0x7f;
643 return pwsz;
644}
645RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
646
647
648/**
649 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding.
650 *
651 * @returns iprt status code.
652 * @param pwsz The UTF-16 string.
653 * @param cwc The max length of the UTF-16 string to consider.
654 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
655 */
656static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
657{
658 int rc = VINF_SUCCESS;
659 size_t cch = 0;
660 while (cwc > 0)
661 {
662 RTUTF16 wc = *pwsz++; cwc--;
663 if (!wc)
664 break;
665 else if (RT_LIKELY(wc < 0x100))
666 ++cch;
667 else
668 {
669 if (wc < 0xd800 || wc > 0xdfff)
670 {
671 if (wc >= 0xfffe)
672 {
673 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
674 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
675 break;
676 }
677 }
678 else
679 {
680 if (wc >= 0xdc00)
681 {
682 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
683 rc = VERR_INVALID_UTF16_ENCODING;
684 break;
685 }
686 if (cwc <= 0)
687 {
688 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
689 rc = VERR_INVALID_UTF16_ENCODING;
690 break;
691 }
692 wc = *pwsz++; cwc--;
693 if (wc < 0xdc00 || wc > 0xdfff)
694 {
695 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
696 rc = VERR_INVALID_UTF16_ENCODING;
697 break;
698 }
699 }
700
701 rc = VERR_NO_TRANSLATION;
702 break;
703 }
704 }
705
706 /* done */
707 *pcch = cch;
708 return rc;
709}
710
711
712/**
713 * Recodes an valid UTF-16 string as Latin1.
714 *
715 * @returns iprt status code.
716 * @param pwsz The UTF-16 string.
717 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
718 * will stop when cwc or '\\0' is reached.
719 * @param psz Where to store the Latin1 string.
720 * @param cch The size of the Latin1 buffer, excluding the terminator.
721 */
722static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch)
723{
724 unsigned char *pch = (unsigned char *)psz;
725 int rc = VINF_SUCCESS;
726 while (cwc > 0)
727 {
728 RTUTF16 wc = *pwsz++; cwc--;
729 if (!wc)
730 break;
731 if (RT_LIKELY(wc < 0x100))
732 {
733 if (RT_UNLIKELY(cch < 1))
734 {
735 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
736 rc = VERR_BUFFER_OVERFLOW;
737 break;
738 }
739 cch--;
740 *pch++ = (unsigned char)wc;
741 }
742 else
743 {
744 if (wc < 0xd800 || wc > 0xdfff)
745 {
746 if (wc >= 0xfffe)
747 {
748 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
749 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
750 break;
751 }
752 }
753 else
754 {
755 if (wc >= 0xdc00)
756 {
757 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
758 rc = VERR_INVALID_UTF16_ENCODING;
759 break;
760 }
761 if (cwc <= 0)
762 {
763 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
764 rc = VERR_INVALID_UTF16_ENCODING;
765 break;
766 }
767 RTUTF16 wc2 = *pwsz++; cwc--;
768 if (wc2 < 0xdc00 || wc2 > 0xdfff)
769 {
770 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
771 rc = VERR_INVALID_UTF16_ENCODING;
772 break;
773 }
774 }
775
776 rc = VERR_NO_TRANSLATION;
777 break;
778 }
779 }
780
781 /* done */
782 *pch = '\0';
783 return rc;
784}
785
786
787RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
788{
789 /*
790 * Validate input.
791 */
792 Assert(VALID_PTR(ppszString));
793 Assert(VALID_PTR(pwszString));
794 *ppszString = NULL;
795
796 /*
797 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
798 */
799 size_t cch;
800 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch);
801 if (RT_SUCCESS(rc))
802 {
803 /*
804 * Allocate buffer and recode it.
805 */
806 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
807 if (pszResult)
808 {
809 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch);
810 if (RT_SUCCESS(rc))
811 {
812 *ppszString = pszResult;
813 return rc;
814 }
815
816 RTMemFree(pszResult);
817 }
818 else
819 rc = VERR_NO_STR_MEMORY;
820 }
821 return rc;
822}
823RT_EXPORT_SYMBOL(RTUtf16ToLatin1Tag);
824
825
826RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
827{
828 /*
829 * Validate input.
830 */
831 AssertPtr(pwszString);
832 AssertPtr(ppsz);
833 AssertPtrNull(pcch);
834
835 /*
836 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
837 */
838 size_t cchResult;
839 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult);
840 if (RT_SUCCESS(rc))
841 {
842 if (pcch)
843 *pcch = cchResult;
844
845 /*
846 * Check buffer size / Allocate buffer and recode it.
847 */
848 bool fShouldFree;
849 char *pszResult;
850 if (cch > 0 && *ppsz)
851 {
852 fShouldFree = false;
853 if (cch <= cchResult)
854 return VERR_BUFFER_OVERFLOW;
855 pszResult = *ppsz;
856 }
857 else
858 {
859 *ppsz = NULL;
860 fShouldFree = true;
861 cch = RT_MAX(cch, cchResult + 1);
862 pszResult = (char *)RTMemAllocTag(cch, pszTag);
863 }
864 if (pszResult)
865 {
866 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1);
867 if (RT_SUCCESS(rc))
868 {
869 *ppsz = pszResult;
870 return rc;
871 }
872
873 if (fShouldFree)
874 RTMemFree(pszResult);
875 }
876 else
877 rc = VERR_NO_STR_MEMORY;
878 }
879 return rc;
880}
881RT_EXPORT_SYMBOL(RTUtf16ToLatin1ExTag);
882
883
884RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz)
885{
886 size_t cch;
887 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch);
888 return RT_SUCCESS(rc) ? cch : 0;
889}
890RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len);
891
892
893RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
894{
895 size_t cch;
896 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch);
897 if (pcch)
898 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
899 return rc;
900}
901RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx);
902
903
904/**
905 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the
906 * original length, but the function saves us nasty comments to that effect
907 * all over the place.
908 *
909 * @returns IPRT status code.
910 * @param psz Pointer to the Latin1 string.
911 * @param cch The max length of the string. (btw cch = cb)
912 * Use RTSTR_MAX if all of the string is to be examined.s
913 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
914 */
915static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
916{
917 *pcwc = RTStrNLen(psz, cch);
918 return VINF_SUCCESS;
919}
920
921
922/**
923 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
924 * sixteen bits, as Unicode is a superset of Latin1.
925 *
926 * Since we know the input is valid, we do *not* perform length checks.
927 *
928 * @returns iprt status code.
929 * @param psz The Latin1 string to recode.
930 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
931 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
932 * @param pwsz Where to store the UTF-16 string.
933 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
934 */
935static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
936{
937 int rc = VINF_SUCCESS;
938 const unsigned char *puch = (const unsigned char *)psz;
939 PRTUTF16 pwc = pwsz;
940 while (cch-- > 0)
941 {
942 /* read the next char and check for terminator. */
943 const unsigned char uch = *puch;
944 if (!uch)
945 break;
946
947 /* check for output overflow */
948 if (RT_UNLIKELY(cwc < 1))
949 {
950 rc = VERR_BUFFER_OVERFLOW;
951 break;
952 }
953
954 /* expand the code point */
955 *pwc++ = uch;
956 cwc--;
957 puch++;
958 }
959
960 /* done */
961 *pwc = '\0';
962 return rc;
963}
964
965
966RTDECL(int) RTLatin1ToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
967{
968 /*
969 * Validate input.
970 */
971 Assert(VALID_PTR(ppwszString));
972 Assert(VALID_PTR(pszString));
973 *ppwszString = NULL;
974
975 /*
976 * Validate the input and calculate the length of the UTF-16 string.
977 */
978 size_t cwc;
979 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
980 if (RT_SUCCESS(rc))
981 {
982 /*
983 * Allocate buffer.
984 */
985 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
986 if (pwsz)
987 {
988 /*
989 * Encode the UTF-16 string.
990 */
991 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
992 if (RT_SUCCESS(rc))
993 {
994 *ppwszString = pwsz;
995 return rc;
996 }
997 RTMemFree(pwsz);
998 }
999 else
1000 rc = VERR_NO_UTF16_MEMORY;
1001 }
1002 return rc;
1003}
1004RT_EXPORT_SYMBOL(RTLatin1ToUtf16Tag);
1005
1006
1007RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszString, size_t cchString,
1008 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1009{
1010 /*
1011 * Validate input.
1012 */
1013 Assert(VALID_PTR(pszString));
1014 Assert(VALID_PTR(ppwsz));
1015 Assert(!pcwc || VALID_PTR(pcwc));
1016
1017 /*
1018 * Validate the input and calculate the length of the UTF-16 string.
1019 */
1020 size_t cwcResult;
1021 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult);
1022 if (RT_SUCCESS(rc))
1023 {
1024 if (pcwc)
1025 *pcwc = cwcResult;
1026
1027 /*
1028 * Check buffer size / Allocate buffer.
1029 */
1030 bool fShouldFree;
1031 PRTUTF16 pwszResult;
1032 if (cwc > 0 && *ppwsz)
1033 {
1034 fShouldFree = false;
1035 if (cwc <= cwcResult)
1036 return VERR_BUFFER_OVERFLOW;
1037 pwszResult = *ppwsz;
1038 }
1039 else
1040 {
1041 *ppwsz = NULL;
1042 fShouldFree = true;
1043 cwc = RT_MAX(cwcResult + 1, cwc);
1044 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1045 }
1046 if (pwszResult)
1047 {
1048 /*
1049 * Encode the UTF-16 string.
1050 */
1051 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1052 if (RT_SUCCESS(rc))
1053 {
1054 *ppwsz = pwszResult;
1055 return rc;
1056 }
1057 if (fShouldFree)
1058 RTMemFree(pwszResult);
1059 }
1060 else
1061 rc = VERR_NO_UTF16_MEMORY;
1062 }
1063 return rc;
1064}
1065RT_EXPORT_SYMBOL(RTLatin1ToUtf16ExTag);
1066
1067
1068RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz)
1069{
1070 size_t cwc;
1071 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc);
1072 return RT_SUCCESS(rc) ? cwc : 0;
1073}
1074RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len);
1075
1076
1077RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1078{
1079 size_t cwc;
1080 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc);
1081 if (pcwc)
1082 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1083 return rc;
1084}
1085RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette