VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 16823

Last change on this file since 16823 was 8245, checked in by vboxsync, 17 years ago

rebranding: IPRT files again.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 17.4 KB
Line 
1/* $Id: utf-16.cpp 8245 2008-04-21 17:24:28Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31
32/*******************************************************************************
33* Header Files *
34*******************************************************************************/
35#include <iprt/string.h>
36#include <iprt/uni.h>
37#include <iprt/alloc.h>
38#include <iprt/assert.h>
39#include <iprt/err.h>
40#include "internal/string.h"
41
42
43
44RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
45{
46 if (pwszString)
47 RTMemTmpFree(pwszString);
48}
49
50
51RTDECL(PRTUTF16) RTUtf16Dup(PCRTUTF16 pwszString)
52{
53 Assert(pwszString);
54 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
55 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb);
56 if (pwsz)
57 memcpy(pwsz, pwszString, cb);
58 return pwsz;
59}
60
61
62RTDECL(int) RTUtf16DupEx(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra)
63{
64 Assert(pwszString);
65 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
66 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb + cwcExtra * sizeof(RTUTF16));
67 if (pwsz)
68 {
69 memcpy(pwsz, pwszString, cb);
70 *ppwszString = pwsz;
71 return VINF_SUCCESS;
72 }
73 return VERR_NO_MEMORY;
74}
75
76
77RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
78{
79 if (!pwszString)
80 return 0;
81
82 PCRTUTF16 pwsz = pwszString;
83 while (*pwsz)
84 pwsz++;
85 return pwsz - pwszString;
86}
87
88
89RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
90{
91 if (pwsz1 == pwsz2)
92 return 0;
93 if (!pwsz1)
94 return -1;
95 if (!pwsz2)
96 return 1;
97
98 for (;;)
99 {
100 register RTUTF16 wcs = *pwsz1;
101 register int iDiff = wcs - *pwsz2;
102 if (iDiff || !wcs)
103 return iDiff;
104 pwsz1++;
105 pwsz2++;
106 }
107}
108
109
110RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
111{
112 if (pwsz1 == pwsz2)
113 return 0;
114 if (!pwsz1)
115 return -1;
116 if (!pwsz2)
117 return 1;
118
119 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
120 for (;;)
121 {
122 register RTUTF16 wc1 = *pwsz1;
123 register RTUTF16 wc2 = *pwsz2;
124 register int iDiff = wc1 - wc2;
125 if (iDiff)
126 {
127 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
128 if ( wc1 < 0xd800
129 || wc2 < 0xd800
130 || wc1 > 0xdfff
131 || wc2 > 0xdfff)
132 {
133 /* simple UCS-2 char */
134 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
135 if (iDiff)
136 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
137 }
138 else
139 {
140 /* a damned pair */
141 RTUNICP uc1;
142 RTUNICP uc2;
143 if (wc1 >= 0xdc00)
144 {
145 if (pwsz1Start == pwsz1)
146 return iDiff;
147 uc1 = pwsz1[-1];
148 if (uc1 < 0xd800 || uc1 >= 0xdc00)
149 return iDiff;
150 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
151 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
152 }
153 else
154 {
155 uc1 = *++pwsz1;
156 if (uc1 < 0xdc00 || uc1 >= 0xe000)
157 return iDiff;
158 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
159 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
160 }
161 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
162 if (iDiff)
163 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
164 }
165 if (iDiff)
166 return iDiff;
167 }
168 if (!wc1)
169 return 0;
170 pwsz1++;
171 pwsz2++;
172 }
173}
174
175
176RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
177{
178 PRTUTF16 pwc = pwsz;
179 for (;;)
180 {
181 RTUTF16 wc = *pwc;
182 if (!wc)
183 break;
184 if (wc < 0xd800 || wc >= 0xdc00)
185 {
186 RTUNICP ucFolded = RTUniCpToLower(wc);
187 if (ucFolded < 0x10000)
188 *pwc++ = RTUniCpToLower(wc);
189 }
190 else
191 {
192 /* surrogate */
193 RTUTF16 wc2 = pwc[1];
194 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
195 {
196 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
197 RTUNICP ucFolded = RTUniCpToLower(uc);
198 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
199 {
200 uc -= 0x10000;
201 *pwc++ = 0xd800 | (uc >> 10);
202 *pwc++ = 0xdc00 | (uc & 0x3ff);
203 }
204 }
205 else /* invalid encoding. */
206 pwc++;
207 }
208 }
209 return pwsz;
210}
211
212
213RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
214{
215 PRTUTF16 pwc = pwsz;
216 for (;;)
217 {
218 RTUTF16 wc = *pwc;
219 if (!wc)
220 break;
221 if (wc < 0xd800 || wc >= 0xdc00)
222 *pwc++ = RTUniCpToUpper(wc);
223 else
224 {
225 /* surrogate */
226 RTUTF16 wc2 = pwc[1];
227 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
228 {
229 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
230 RTUNICP ucFolded = RTUniCpToUpper(uc);
231 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
232 {
233 uc -= 0x10000;
234 *pwc++ = 0xd800 | (uc >> 10);
235 *pwc++ = 0xdc00 | (uc & 0x3ff);
236 }
237 }
238 else /* invalid encoding. */
239 pwc++;
240 }
241 }
242 return pwsz;
243}
244
245
246/**
247 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
248 *
249 * @returns iprt status code.
250 * @param pwsz The UTF-16 string.
251 * @param cwc The max length of the UTF-16 string to consider.
252 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
253 */
254static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
255{
256 int rc = VINF_SUCCESS;
257 size_t cch = 0;
258 while (cwc > 0)
259 {
260 RTUTF16 wc = *pwsz++; cwc--;
261 if (!wc)
262 break;
263 else if (wc < 0xd800 || wc > 0xdfff)
264 {
265 if (wc < 0x80)
266 cch++;
267 else if (wc < 0x800)
268 cch += 2;
269 else if (wc < 0xfffe)
270 cch += 3;
271 else
272 {
273 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
274 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
275 break;
276 }
277 }
278 else
279 {
280 if (wc >= 0xdc00)
281 {
282 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
283 rc = VERR_INVALID_UTF16_ENCODING;
284 break;
285 }
286 if (cwc <= 0)
287 {
288 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
289 rc = VERR_INVALID_UTF16_ENCODING;
290 break;
291 }
292 wc = *pwsz++; cwc--;
293 if (wc < 0xdc00 || wc > 0xdfff)
294 {
295 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
296 rc = VERR_INVALID_UTF16_ENCODING;
297 break;
298 }
299 cch += 4;
300 }
301 }
302
303
304 /* done */
305 *pcch = cch;
306 return rc;
307}
308
309
310/**
311 * Recodes an valid UTF-16 string as UTF-8.
312 *
313 * @returns iprt status code.
314 * @param pwsz The UTF-16 string.
315 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
316 * will stop when cwc or '\\0' is reached.
317 * @param psz Where to store the UTF-8 string.
318 * @param cch The size of the UTF-8 buffer, excluding the terminator.
319 * @param pcch Where to store the number of octets actually encoded.
320 */
321static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
322{
323 unsigned char *pwch = (unsigned char *)psz;
324 int rc = VINF_SUCCESS;
325 while (cwc > 0)
326 {
327 RTUTF16 wc = *pwsz++; cwc--;
328 if (!wc)
329 break;
330 else if (wc < 0xd800 || wc > 0xdfff)
331 {
332 if (wc < 0x80)
333 {
334 if (cch < 1)
335 {
336 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
337 rc = VERR_BUFFER_OVERFLOW;
338 break;
339 }
340 cch--;
341 *pwch++ = (unsigned char)wc;
342 }
343 else if (wc < 0x800)
344 {
345 if (cch < 2)
346 {
347 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
348 rc = VERR_BUFFER_OVERFLOW;
349 break;
350 }
351 cch -= 2;
352 *pwch++ = 0xc0 | (wc >> 6);
353 *pwch++ = 0x80 | (wc & 0x3f);
354 }
355 else if (wc < 0xfffe)
356 {
357 if (cch < 3)
358 {
359 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
360 rc = VERR_BUFFER_OVERFLOW;
361 break;
362 }
363 cch -= 3;
364 *pwch++ = 0xe0 | (wc >> 12);
365 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
366 *pwch++ = 0x80 | (wc & 0x3f);
367 }
368 else
369 {
370 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
371 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
372 break;
373 }
374 }
375 else
376 {
377 if (wc >= 0xdc00)
378 {
379 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
380 rc = VERR_INVALID_UTF16_ENCODING;
381 break;
382 }
383 if (cwc <= 0)
384 {
385 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
386 rc = VERR_INVALID_UTF16_ENCODING;
387 break;
388 }
389 RTUTF16 wc2 = *pwsz++; cwc--;
390 if (wc2 < 0xdc00 || wc2 > 0xdfff)
391 {
392 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
393 rc = VERR_INVALID_UTF16_ENCODING;
394 break;
395 }
396 uint32_t CodePoint = 0x10000
397 + ( ((wc & 0x3ff) << 10)
398 | (wc2 & 0x3ff));
399 if (cch < 4)
400 {
401 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
402 rc = VERR_BUFFER_OVERFLOW;
403 break;
404 }
405 cch -= 4;
406 *pwch++ = 0xf0 | (CodePoint >> 18);
407 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
408 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
409 *pwch++ = 0x80 | (CodePoint & 0x3f);
410 }
411 }
412
413 /* done */
414 *pwch = '\0';
415 *pcch = (char *)pwch - psz;
416 return rc;
417}
418
419
420
421RTDECL(int) RTUtf16ToUtf8(PCRTUTF16 pwszString, char **ppszString)
422{
423 /*
424 * Validate input.
425 */
426 Assert(VALID_PTR(ppszString));
427 Assert(VALID_PTR(pwszString));
428 *ppszString = NULL;
429
430 /*
431 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
432 */
433 size_t cch;
434 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
435 if (RT_SUCCESS(rc))
436 {
437 /*
438 * Allocate buffer and recode it.
439 */
440 char *pszResult = (char *)RTMemAlloc(cch + 1);
441 if (pszResult)
442 {
443 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
444 if (RT_SUCCESS(rc))
445 {
446 *ppszString = pszResult;
447 return rc;
448 }
449
450 RTMemFree(pszResult);
451 }
452 else
453 rc = VERR_NO_STR_MEMORY;
454 }
455 return rc;
456}
457
458
459RTDECL(int) RTUtf16ToUtf8Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
460{
461 /*
462 * Validate input.
463 */
464 Assert(VALID_PTR(pwszString));
465 Assert(VALID_PTR(ppsz));
466 Assert(!pcch || VALID_PTR(pcch));
467
468 /*
469 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
470 */
471 size_t cchResult;
472 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
473 if (RT_SUCCESS(rc))
474 {
475 if (pcch)
476 *pcch = cchResult;
477
478 /*
479 * Check buffer size / Allocate buffer and recode it.
480 */
481 bool fShouldFree;
482 char *pszResult;
483 if (cch > 0 && *ppsz)
484 {
485 fShouldFree = false;
486 if (cch <= cchResult)
487 return VERR_BUFFER_OVERFLOW;
488 pszResult = *ppsz;
489 }
490 else
491 {
492 *ppsz = NULL;
493 fShouldFree = true;
494 cch = RT_MAX(cch, cchResult + 1);
495 pszResult = (char *)RTMemAlloc(cch);
496 }
497 if (pszResult)
498 {
499 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
500 if (RT_SUCCESS(rc))
501 {
502 *ppsz = pszResult;
503 return rc;
504 }
505
506 if (fShouldFree)
507 RTMemFree(pszResult);
508 }
509 else
510 rc = VERR_NO_STR_MEMORY;
511 }
512 return rc;
513}
514
515
516RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
517{
518 size_t cch;
519 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
520 return RT_SUCCESS(rc) ? cch : 0;
521}
522
523
524RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
525{
526 size_t cch;
527 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
528 if (pcch)
529 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
530 return rc;
531}
532
533
534RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
535{
536 const RTUTF16 wc = *pwsz;
537
538 /* simple */
539 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
540 return wc;
541 if (wc < 0xfffe)
542 {
543 /* surrogate pair */
544 if (wc < 0xdc00)
545 {
546 const RTUTF16 wc2 = pwsz[1];
547 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
548 {
549 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
550 return uc;
551 }
552
553 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
554 }
555 else
556 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
557 }
558 else
559 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
560 return RTUNICP_INVALID;
561}
562
563
564RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
565{
566 const RTUTF16 wc = **ppwsz;
567
568 /* simple */
569 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
570 {
571 (*ppwsz)++;
572 *pCp = wc;
573 return VINF_SUCCESS;
574 }
575
576 int rc;
577 if (wc < 0xfffe)
578 {
579 /* surrogate pair */
580 if (wc < 0xdc00)
581 {
582 const RTUTF16 wc2 = (*ppwsz)[1];
583 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
584 {
585 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
586 *pCp = uc;
587 (*ppwsz) += 2;
588 return VINF_SUCCESS;
589 }
590
591 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
592 }
593 else
594 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
595 rc = VERR_INVALID_UTF16_ENCODING;
596 }
597 else
598 {
599 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
600 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
601 }
602 *pCp = RTUNICP_INVALID;
603 (*ppwsz)++;
604 return rc;
605}
606
607
608RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
609{
610 /* simple */
611 if ( CodePoint < 0xd800
612 || ( CodePoint > 0xdfff
613 && CodePoint < 0xfffe))
614 {
615 *pwsz++ = (RTUTF16)CodePoint;
616 return pwsz;
617 }
618
619 /* surrogate pair */
620 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
621 {
622 CodePoint -= 0x10000;
623 *pwsz++ = 0xd800 | (CodePoint >> 10);
624 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
625 return pwsz;
626 }
627
628 /* invalid code point. */
629 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
630 *pwsz++ = 0x7f;
631 return pwsz;
632}
633
634
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette