VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 40091

Last change on this file since 40091 was 40091, checked in by vboxsync, 13 years ago

Runtime/strings: add Utf-8 and Utf-16 sanitising to a white list of characters. Do not validate the string encoding in advance.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 30.7 KB
Line 
1/* $Id: utf-16.cpp 40091 2012-02-13 10:14:00Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
43{
44 if (pwszString)
45 RTMemTmpFree(pwszString);
46}
47RT_EXPORT_SYMBOL(RTUtf16Free);
48
49
50RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
51{
52 Assert(pwszString);
53 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
54 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
55 if (pwsz)
56 memcpy(pwsz, pwszString, cb);
57 return pwsz;
58}
59RT_EXPORT_SYMBOL(RTUtf16DupTag);
60
61
62RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
63{
64 Assert(pwszString);
65 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
66 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
67 if (pwsz)
68 {
69 memcpy(pwsz, pwszString, cb);
70 *ppwszString = pwsz;
71 return VINF_SUCCESS;
72 }
73 return VERR_NO_MEMORY;
74}
75RT_EXPORT_SYMBOL(RTUtf16DupExTag);
76
77
78RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
79{
80 if (!pwszString)
81 return 0;
82
83 PCRTUTF16 pwsz = pwszString;
84 while (*pwsz)
85 pwsz++;
86 return pwsz - pwszString;
87}
88RT_EXPORT_SYMBOL(RTUtf16Len);
89
90
91RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
92{
93 if (pwsz1 == pwsz2)
94 return 0;
95 if (!pwsz1)
96 return -1;
97 if (!pwsz2)
98 return 1;
99
100 for (;;)
101 {
102 register RTUTF16 wcs = *pwsz1;
103 register int iDiff = wcs - *pwsz2;
104 if (iDiff || !wcs)
105 return iDiff;
106 pwsz1++;
107 pwsz2++;
108 }
109}
110RT_EXPORT_SYMBOL(RTUtf16Cmp);
111
112
113RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
114{
115 if (pwsz1 == pwsz2)
116 return 0;
117 if (!pwsz1)
118 return -1;
119 if (!pwsz2)
120 return 1;
121
122 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
123 for (;;)
124 {
125 register RTUTF16 wc1 = *pwsz1;
126 register RTUTF16 wc2 = *pwsz2;
127 register int iDiff = wc1 - wc2;
128 if (iDiff)
129 {
130 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
131 if ( wc1 < 0xd800
132 || wc2 < 0xd800
133 || wc1 > 0xdfff
134 || wc2 > 0xdfff)
135 {
136 /* simple UCS-2 char */
137 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
138 if (iDiff)
139 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
140 }
141 else
142 {
143 /* a damned pair */
144 RTUNICP uc1;
145 RTUNICP uc2;
146 if (wc1 >= 0xdc00)
147 {
148 if (pwsz1Start == pwsz1)
149 return iDiff;
150 uc1 = pwsz1[-1];
151 if (uc1 < 0xd800 || uc1 >= 0xdc00)
152 return iDiff;
153 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
154 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
155 }
156 else
157 {
158 uc1 = *++pwsz1;
159 if (uc1 < 0xdc00 || uc1 >= 0xe000)
160 return iDiff;
161 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
162 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
163 }
164 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
165 if (iDiff)
166 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
167 }
168 if (iDiff)
169 return iDiff;
170 }
171 if (!wc1)
172 return 0;
173 pwsz1++;
174 pwsz2++;
175 }
176}
177RT_EXPORT_SYMBOL(RTUtf16ICmp);
178
179
180RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
181{
182 PRTUTF16 pwc = pwsz;
183 for (;;)
184 {
185 RTUTF16 wc = *pwc;
186 if (!wc)
187 break;
188 if (wc < 0xd800 || wc >= 0xdc00)
189 {
190 RTUNICP ucFolded = RTUniCpToLower(wc);
191 if (ucFolded < 0x10000)
192 *pwc++ = RTUniCpToLower(wc);
193 }
194 else
195 {
196 /* surrogate */
197 RTUTF16 wc2 = pwc[1];
198 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
199 {
200 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
201 RTUNICP ucFolded = RTUniCpToLower(uc);
202 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
203 {
204 uc -= 0x10000;
205 *pwc++ = 0xd800 | (uc >> 10);
206 *pwc++ = 0xdc00 | (uc & 0x3ff);
207 }
208 }
209 else /* invalid encoding. */
210 pwc++;
211 }
212 }
213 return pwsz;
214}
215RT_EXPORT_SYMBOL(RTUtf16ToLower);
216
217
218RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
219{
220 PRTUTF16 pwc = pwsz;
221 for (;;)
222 {
223 RTUTF16 wc = *pwc;
224 if (!wc)
225 break;
226 if (wc < 0xd800 || wc >= 0xdc00)
227 *pwc++ = RTUniCpToUpper(wc);
228 else
229 {
230 /* surrogate */
231 RTUTF16 wc2 = pwc[1];
232 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
233 {
234 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
235 RTUNICP ucFolded = RTUniCpToUpper(uc);
236 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
237 {
238 uc -= 0x10000;
239 *pwc++ = 0xd800 | (uc >> 10);
240 *pwc++ = 0xdc00 | (uc & 0x3ff);
241 }
242 }
243 else /* invalid encoding. */
244 pwc++;
245 }
246 }
247 return pwsz;
248}
249RT_EXPORT_SYMBOL(RTUtf16ToUpper);
250
251
252RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidSet, char chReplacement)
253{
254 size_t cReplacements = 0;
255 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
256 /* Validate the encoding. */
257 for (;;)
258 {
259 RTUNICP Cp;
260 PCRTUNICP pCp;
261 PRTUTF16 pwszOld = pwsz;
262 if (RT_FAILURE(RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp)))
263 return -1;
264 if (!Cp)
265 break;
266 for (pCp = puszValidSet; ; ++pCp)
267 if (!*pCp || *pCp == Cp)
268 break;
269 if (!*pCp)
270 {
271 for (; pwszOld != pwsz; ++pwszOld)
272 *pwszOld = chReplacement;
273 ++cReplacements;
274 }
275 }
276 return cReplacements;
277}
278RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
279
280
281/**
282 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
283 *
284 * @returns iprt status code.
285 * @param pwsz The UTF-16 string.
286 * @param cwc The max length of the UTF-16 string to consider.
287 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
288 */
289static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
290{
291 int rc = VINF_SUCCESS;
292 size_t cch = 0;
293 while (cwc > 0)
294 {
295 RTUTF16 wc = *pwsz++; cwc--;
296 if (!wc)
297 break;
298 else if (wc < 0xd800 || wc > 0xdfff)
299 {
300 if (wc < 0x80)
301 cch++;
302 else if (wc < 0x800)
303 cch += 2;
304 else if (wc < 0xfffe)
305 cch += 3;
306 else
307 {
308 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
309 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
310 break;
311 }
312 }
313 else
314 {
315 if (wc >= 0xdc00)
316 {
317 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
318 rc = VERR_INVALID_UTF16_ENCODING;
319 break;
320 }
321 if (cwc <= 0)
322 {
323 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
324 rc = VERR_INVALID_UTF16_ENCODING;
325 break;
326 }
327 wc = *pwsz++; cwc--;
328 if (wc < 0xdc00 || wc > 0xdfff)
329 {
330 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
331 rc = VERR_INVALID_UTF16_ENCODING;
332 break;
333 }
334 cch += 4;
335 }
336 }
337
338
339 /* done */
340 *pcch = cch;
341 return rc;
342}
343
344
345/**
346 * Recodes an valid UTF-16 string as UTF-8.
347 *
348 * @returns iprt status code.
349 * @param pwsz The UTF-16 string.
350 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
351 * will stop when cwc or '\\0' is reached.
352 * @param psz Where to store the UTF-8 string.
353 * @param cch The size of the UTF-8 buffer, excluding the terminator.
354 * @param pcch Where to store the number of octets actually encoded.
355 */
356static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
357{
358 unsigned char *pwch = (unsigned char *)psz;
359 int rc = VINF_SUCCESS;
360 while (cwc > 0)
361 {
362 RTUTF16 wc = *pwsz++; cwc--;
363 if (!wc)
364 break;
365 else if (wc < 0xd800 || wc > 0xdfff)
366 {
367 if (wc < 0x80)
368 {
369 if (RT_UNLIKELY(cch < 1))
370 {
371 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
372 rc = VERR_BUFFER_OVERFLOW;
373 break;
374 }
375 cch--;
376 *pwch++ = (unsigned char)wc;
377 }
378 else if (wc < 0x800)
379 {
380 if (RT_UNLIKELY(cch < 2))
381 {
382 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
383 rc = VERR_BUFFER_OVERFLOW;
384 break;
385 }
386 cch -= 2;
387 *pwch++ = 0xc0 | (wc >> 6);
388 *pwch++ = 0x80 | (wc & 0x3f);
389 }
390 else if (wc < 0xfffe)
391 {
392 if (RT_UNLIKELY(cch < 3))
393 {
394 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
395 rc = VERR_BUFFER_OVERFLOW;
396 break;
397 }
398 cch -= 3;
399 *pwch++ = 0xe0 | (wc >> 12);
400 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
401 *pwch++ = 0x80 | (wc & 0x3f);
402 }
403 else
404 {
405 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
406 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
407 break;
408 }
409 }
410 else
411 {
412 if (wc >= 0xdc00)
413 {
414 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
415 rc = VERR_INVALID_UTF16_ENCODING;
416 break;
417 }
418 if (cwc <= 0)
419 {
420 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
421 rc = VERR_INVALID_UTF16_ENCODING;
422 break;
423 }
424 RTUTF16 wc2 = *pwsz++; cwc--;
425 if (wc2 < 0xdc00 || wc2 > 0xdfff)
426 {
427 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
428 rc = VERR_INVALID_UTF16_ENCODING;
429 break;
430 }
431 uint32_t CodePoint = 0x10000
432 + ( ((wc & 0x3ff) << 10)
433 | (wc2 & 0x3ff));
434 if (RT_UNLIKELY(cch < 4))
435 {
436 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
437 rc = VERR_BUFFER_OVERFLOW;
438 break;
439 }
440 cch -= 4;
441 *pwch++ = 0xf0 | (CodePoint >> 18);
442 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
443 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
444 *pwch++ = 0x80 | (CodePoint & 0x3f);
445 }
446 }
447
448 /* done */
449 *pwch = '\0';
450 *pcch = (char *)pwch - psz;
451 return rc;
452}
453
454
455
456RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
457{
458 /*
459 * Validate input.
460 */
461 Assert(VALID_PTR(ppszString));
462 Assert(VALID_PTR(pwszString));
463 *ppszString = NULL;
464
465 /*
466 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
467 */
468 size_t cch;
469 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
470 if (RT_SUCCESS(rc))
471 {
472 /*
473 * Allocate buffer and recode it.
474 */
475 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
476 if (pszResult)
477 {
478 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
479 if (RT_SUCCESS(rc))
480 {
481 *ppszString = pszResult;
482 return rc;
483 }
484
485 RTMemFree(pszResult);
486 }
487 else
488 rc = VERR_NO_STR_MEMORY;
489 }
490 return rc;
491}
492RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
493
494
495RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
496{
497 /*
498 * Validate input.
499 */
500 Assert(VALID_PTR(pwszString));
501 Assert(VALID_PTR(ppsz));
502 Assert(!pcch || VALID_PTR(pcch));
503
504 /*
505 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
506 */
507 size_t cchResult;
508 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
509 if (RT_SUCCESS(rc))
510 {
511 if (pcch)
512 *pcch = cchResult;
513
514 /*
515 * Check buffer size / Allocate buffer and recode it.
516 */
517 bool fShouldFree;
518 char *pszResult;
519 if (cch > 0 && *ppsz)
520 {
521 fShouldFree = false;
522 if (RT_UNLIKELY(cch <= cchResult))
523 return VERR_BUFFER_OVERFLOW;
524 pszResult = *ppsz;
525 }
526 else
527 {
528 *ppsz = NULL;
529 fShouldFree = true;
530 cch = RT_MAX(cch, cchResult + 1);
531 pszResult = (char *)RTStrAllocTag(cch, pszTag);
532 }
533 if (pszResult)
534 {
535 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
536 if (RT_SUCCESS(rc))
537 {
538 *ppsz = pszResult;
539 return rc;
540 }
541
542 if (fShouldFree)
543 RTStrFree(pszResult);
544 }
545 else
546 rc = VERR_NO_STR_MEMORY;
547 }
548 return rc;
549}
550RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
551
552
553RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
554{
555 size_t cch;
556 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
557 return RT_SUCCESS(rc) ? cch : 0;
558}
559RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
560
561
562RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
563{
564 size_t cch;
565 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
566 if (pcch)
567 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
568 return rc;
569}
570RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
571
572
573RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
574{
575 const RTUTF16 wc = *pwsz;
576
577 /* simple */
578 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
579 return wc;
580 if (wc < 0xfffe)
581 {
582 /* surrogate pair */
583 if (wc < 0xdc00)
584 {
585 const RTUTF16 wc2 = pwsz[1];
586 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
587 {
588 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
589 return uc;
590 }
591
592 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
593 }
594 else
595 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
596 }
597 else
598 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
599 return RTUNICP_INVALID;
600}
601RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
602
603
604RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
605{
606 const RTUTF16 wc = **ppwsz;
607
608 /* simple */
609 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
610 {
611 (*ppwsz)++;
612 *pCp = wc;
613 return VINF_SUCCESS;
614 }
615
616 int rc;
617 if (wc < 0xfffe)
618 {
619 /* surrogate pair */
620 if (wc < 0xdc00)
621 {
622 const RTUTF16 wc2 = (*ppwsz)[1];
623 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
624 {
625 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
626 *pCp = uc;
627 (*ppwsz) += 2;
628 return VINF_SUCCESS;
629 }
630
631 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
632 }
633 else
634 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
635 rc = VERR_INVALID_UTF16_ENCODING;
636 }
637 else
638 {
639 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
640 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
641 }
642 *pCp = RTUNICP_INVALID;
643 (*ppwsz)++;
644 return rc;
645}
646RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
647
648
649RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
650{
651 /* simple */
652 if ( CodePoint < 0xd800
653 || ( CodePoint > 0xdfff
654 && CodePoint < 0xfffe))
655 {
656 *pwsz++ = (RTUTF16)CodePoint;
657 return pwsz;
658 }
659
660 /* surrogate pair */
661 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
662 {
663 CodePoint -= 0x10000;
664 *pwsz++ = 0xd800 | (CodePoint >> 10);
665 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
666 return pwsz;
667 }
668
669 /* invalid code point. */
670 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
671 *pwsz++ = 0x7f;
672 return pwsz;
673}
674RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
675
676
677/**
678 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding.
679 *
680 * @returns iprt status code.
681 * @param pwsz The UTF-16 string.
682 * @param cwc The max length of the UTF-16 string to consider.
683 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
684 */
685static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
686{
687 int rc = VINF_SUCCESS;
688 size_t cch = 0;
689 while (cwc > 0)
690 {
691 RTUTF16 wc = *pwsz++; cwc--;
692 if (!wc)
693 break;
694 else if (RT_LIKELY(wc < 0x100))
695 ++cch;
696 else
697 {
698 if (wc < 0xd800 || wc > 0xdfff)
699 {
700 if (wc >= 0xfffe)
701 {
702 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
703 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
704 break;
705 }
706 }
707 else
708 {
709 if (wc >= 0xdc00)
710 {
711 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
712 rc = VERR_INVALID_UTF16_ENCODING;
713 break;
714 }
715 if (cwc <= 0)
716 {
717 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
718 rc = VERR_INVALID_UTF16_ENCODING;
719 break;
720 }
721 wc = *pwsz++; cwc--;
722 if (wc < 0xdc00 || wc > 0xdfff)
723 {
724 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
725 rc = VERR_INVALID_UTF16_ENCODING;
726 break;
727 }
728 }
729
730 rc = VERR_NO_TRANSLATION;
731 break;
732 }
733 }
734
735 /* done */
736 *pcch = cch;
737 return rc;
738}
739
740
741/**
742 * Recodes an valid UTF-16 string as Latin1.
743 *
744 * @returns iprt status code.
745 * @param pwsz The UTF-16 string.
746 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
747 * will stop when cwc or '\\0' is reached.
748 * @param psz Where to store the Latin1 string.
749 * @param cch The size of the Latin1 buffer, excluding the terminator.
750 */
751static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch)
752{
753 unsigned char *pch = (unsigned char *)psz;
754 int rc = VINF_SUCCESS;
755 while (cwc > 0)
756 {
757 RTUTF16 wc = *pwsz++; cwc--;
758 if (!wc)
759 break;
760 if (RT_LIKELY(wc < 0x100))
761 {
762 if (RT_UNLIKELY(cch < 1))
763 {
764 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
765 rc = VERR_BUFFER_OVERFLOW;
766 break;
767 }
768 cch--;
769 *pch++ = (unsigned char)wc;
770 }
771 else
772 {
773 if (wc < 0xd800 || wc > 0xdfff)
774 {
775 if (wc >= 0xfffe)
776 {
777 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
778 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
779 break;
780 }
781 }
782 else
783 {
784 if (wc >= 0xdc00)
785 {
786 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
787 rc = VERR_INVALID_UTF16_ENCODING;
788 break;
789 }
790 if (cwc <= 0)
791 {
792 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
793 rc = VERR_INVALID_UTF16_ENCODING;
794 break;
795 }
796 RTUTF16 wc2 = *pwsz++; cwc--;
797 if (wc2 < 0xdc00 || wc2 > 0xdfff)
798 {
799 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
800 rc = VERR_INVALID_UTF16_ENCODING;
801 break;
802 }
803 }
804
805 rc = VERR_NO_TRANSLATION;
806 break;
807 }
808 }
809
810 /* done */
811 *pch = '\0';
812 return rc;
813}
814
815
816RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
817{
818 /*
819 * Validate input.
820 */
821 Assert(VALID_PTR(ppszString));
822 Assert(VALID_PTR(pwszString));
823 *ppszString = NULL;
824
825 /*
826 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
827 */
828 size_t cch;
829 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch);
830 if (RT_SUCCESS(rc))
831 {
832 /*
833 * Allocate buffer and recode it.
834 */
835 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
836 if (pszResult)
837 {
838 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch);
839 if (RT_SUCCESS(rc))
840 {
841 *ppszString = pszResult;
842 return rc;
843 }
844
845 RTMemFree(pszResult);
846 }
847 else
848 rc = VERR_NO_STR_MEMORY;
849 }
850 return rc;
851}
852RT_EXPORT_SYMBOL(RTUtf16ToLatin1Tag);
853
854
855RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
856{
857 /*
858 * Validate input.
859 */
860 AssertPtr(pwszString);
861 AssertPtr(ppsz);
862 AssertPtrNull(pcch);
863
864 /*
865 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
866 */
867 size_t cchResult;
868 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult);
869 if (RT_SUCCESS(rc))
870 {
871 if (pcch)
872 *pcch = cchResult;
873
874 /*
875 * Check buffer size / Allocate buffer and recode it.
876 */
877 bool fShouldFree;
878 char *pszResult;
879 if (cch > 0 && *ppsz)
880 {
881 fShouldFree = false;
882 if (cch <= cchResult)
883 return VERR_BUFFER_OVERFLOW;
884 pszResult = *ppsz;
885 }
886 else
887 {
888 *ppsz = NULL;
889 fShouldFree = true;
890 cch = RT_MAX(cch, cchResult + 1);
891 pszResult = (char *)RTMemAllocTag(cch, pszTag);
892 }
893 if (pszResult)
894 {
895 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1);
896 if (RT_SUCCESS(rc))
897 {
898 *ppsz = pszResult;
899 return rc;
900 }
901
902 if (fShouldFree)
903 RTMemFree(pszResult);
904 }
905 else
906 rc = VERR_NO_STR_MEMORY;
907 }
908 return rc;
909}
910RT_EXPORT_SYMBOL(RTUtf16ToLatin1ExTag);
911
912
913RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz)
914{
915 size_t cch;
916 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch);
917 return RT_SUCCESS(rc) ? cch : 0;
918}
919RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len);
920
921
922RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
923{
924 size_t cch;
925 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch);
926 if (pcch)
927 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
928 return rc;
929}
930RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx);
931
932
933/**
934 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the
935 * original length, but the function saves us nasty comments to that effect
936 * all over the place.
937 *
938 * @returns IPRT status code.
939 * @param psz Pointer to the Latin1 string.
940 * @param cch The max length of the string. (btw cch = cb)
941 * Use RTSTR_MAX if all of the string is to be examined.s
942 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
943 */
944static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
945{
946 *pcwc = RTStrNLen(psz, cch);
947 return VINF_SUCCESS;
948}
949
950
951/**
952 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
953 * sixteen bits, as Unicode is a superset of Latin1.
954 *
955 * Since we know the input is valid, we do *not* perform length checks.
956 *
957 * @returns iprt status code.
958 * @param psz The Latin1 string to recode.
959 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
960 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
961 * @param pwsz Where to store the UTF-16 string.
962 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
963 */
964static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
965{
966 int rc = VINF_SUCCESS;
967 const unsigned char *puch = (const unsigned char *)psz;
968 PRTUTF16 pwc = pwsz;
969 while (cch-- > 0)
970 {
971 /* read the next char and check for terminator. */
972 const unsigned char uch = *puch;
973 if (!uch)
974 break;
975
976 /* check for output overflow */
977 if (RT_UNLIKELY(cwc < 1))
978 {
979 rc = VERR_BUFFER_OVERFLOW;
980 break;
981 }
982
983 /* expand the code point */
984 *pwc++ = uch;
985 cwc--;
986 puch++;
987 }
988
989 /* done */
990 *pwc = '\0';
991 return rc;
992}
993
994
995RTDECL(int) RTLatin1ToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
996{
997 /*
998 * Validate input.
999 */
1000 Assert(VALID_PTR(ppwszString));
1001 Assert(VALID_PTR(pszString));
1002 *ppwszString = NULL;
1003
1004 /*
1005 * Validate the input and calculate the length of the UTF-16 string.
1006 */
1007 size_t cwc;
1008 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
1009 if (RT_SUCCESS(rc))
1010 {
1011 /*
1012 * Allocate buffer.
1013 */
1014 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1015 if (pwsz)
1016 {
1017 /*
1018 * Encode the UTF-16 string.
1019 */
1020 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1021 if (RT_SUCCESS(rc))
1022 {
1023 *ppwszString = pwsz;
1024 return rc;
1025 }
1026 RTMemFree(pwsz);
1027 }
1028 else
1029 rc = VERR_NO_UTF16_MEMORY;
1030 }
1031 return rc;
1032}
1033RT_EXPORT_SYMBOL(RTLatin1ToUtf16Tag);
1034
1035
1036RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszString, size_t cchString,
1037 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1038{
1039 /*
1040 * Validate input.
1041 */
1042 Assert(VALID_PTR(pszString));
1043 Assert(VALID_PTR(ppwsz));
1044 Assert(!pcwc || VALID_PTR(pcwc));
1045
1046 /*
1047 * Validate the input and calculate the length of the UTF-16 string.
1048 */
1049 size_t cwcResult;
1050 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult);
1051 if (RT_SUCCESS(rc))
1052 {
1053 if (pcwc)
1054 *pcwc = cwcResult;
1055
1056 /*
1057 * Check buffer size / Allocate buffer.
1058 */
1059 bool fShouldFree;
1060 PRTUTF16 pwszResult;
1061 if (cwc > 0 && *ppwsz)
1062 {
1063 fShouldFree = false;
1064 if (cwc <= cwcResult)
1065 return VERR_BUFFER_OVERFLOW;
1066 pwszResult = *ppwsz;
1067 }
1068 else
1069 {
1070 *ppwsz = NULL;
1071 fShouldFree = true;
1072 cwc = RT_MAX(cwcResult + 1, cwc);
1073 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1074 }
1075 if (pwszResult)
1076 {
1077 /*
1078 * Encode the UTF-16 string.
1079 */
1080 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1081 if (RT_SUCCESS(rc))
1082 {
1083 *ppwsz = pwszResult;
1084 return rc;
1085 }
1086 if (fShouldFree)
1087 RTMemFree(pwszResult);
1088 }
1089 else
1090 rc = VERR_NO_UTF16_MEMORY;
1091 }
1092 return rc;
1093}
1094RT_EXPORT_SYMBOL(RTLatin1ToUtf16ExTag);
1095
1096
1097RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz)
1098{
1099 size_t cwc;
1100 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc);
1101 return RT_SUCCESS(rc) ? cwc : 0;
1102}
1103RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len);
1104
1105
1106RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1107{
1108 size_t cwc;
1109 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc);
1110 if (pcwc)
1111 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1112 return rc;
1113}
1114RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette