VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 7389

Last change on this file since 7389 was 6041, checked in by vboxsync, 17 years ago

Added RTUtf16CalcUtf8Len and RTUtf16CalcUtf8LenEx.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 17.2 KB
Line 
1/* $Id: utf-16.cpp 6041 2007-12-10 19:11:19Z vboxsync $ */
2/** @file
3 * innotek Portable Runtime - UTF-16
4 */
5
6/*
7 * Copyright (C) 2006-2007 innotek GmbH
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include <iprt/uni.h>
33#include <iprt/alloc.h>
34#include <iprt/assert.h>
35#include <iprt/err.h>
36#include "internal/string.h"
37
38
39
40RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
41{
42 if (pwszString)
43 RTMemTmpFree(pwszString);
44}
45
46
47RTDECL(PRTUTF16) RTUtf16Dup(PCRTUTF16 pwszString)
48{
49 Assert(pwszString);
50 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
51 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb);
52 if (pwsz)
53 memcpy(pwsz, pwszString, cb);
54 return pwsz;
55}
56
57
58RTDECL(int) RTUtf16DupEx(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra)
59{
60 Assert(pwszString);
61 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
62 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb + cwcExtra * sizeof(RTUTF16));
63 if (pwsz)
64 {
65 memcpy(pwsz, pwszString, cb);
66 *ppwszString = pwsz;
67 return VINF_SUCCESS;
68 }
69 return VERR_NO_MEMORY;
70}
71
72
73RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
74{
75 if (!pwszString)
76 return 0;
77
78 PCRTUTF16 pwsz = pwszString;
79 while (*pwsz)
80 pwsz++;
81 return pwsz - pwszString;
82}
83
84
85RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
86{
87 if (pwsz1 == pwsz2)
88 return 0;
89 if (!pwsz1)
90 return -1;
91 if (!pwsz2)
92 return 1;
93
94 for (;;)
95 {
96 register RTUTF16 wcs = *pwsz1;
97 register int iDiff = wcs - *pwsz2;
98 if (iDiff || !wcs)
99 return iDiff;
100 pwsz1++;
101 pwsz2++;
102 }
103}
104
105
106RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
107{
108 if (pwsz1 == pwsz2)
109 return 0;
110 if (!pwsz1)
111 return -1;
112 if (!pwsz2)
113 return 1;
114
115 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
116 for (;;)
117 {
118 register RTUTF16 wc1 = *pwsz1;
119 register RTUTF16 wc2 = *pwsz2;
120 register int iDiff = wc1 - wc2;
121 if (iDiff)
122 {
123 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
124 if ( wc1 < 0xd800
125 || wc2 < 0xd800
126 || wc1 > 0xdfff
127 || wc2 > 0xdfff)
128 {
129 /* simple UCS-2 char */
130 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
131 if (iDiff)
132 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
133 }
134 else
135 {
136 /* a damned pair */
137 RTUNICP uc1;
138 RTUNICP uc2;
139 if (wc1 >= 0xdc00)
140 {
141 if (pwsz1Start == pwsz1)
142 return iDiff;
143 uc1 = pwsz1[-1];
144 if (uc1 < 0xd800 || uc1 >= 0xdc00)
145 return iDiff;
146 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
147 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
148 }
149 else
150 {
151 uc1 = *++pwsz1;
152 if (uc1 < 0xdc00 || uc1 >= 0xe000)
153 return iDiff;
154 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
155 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
156 }
157 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
158 if (iDiff)
159 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
160 }
161 if (iDiff)
162 return iDiff;
163 }
164 if (!wc1)
165 return 0;
166 pwsz1++;
167 pwsz2++;
168 }
169}
170
171
172RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
173{
174 PRTUTF16 pwc = pwsz;
175 for (;;)
176 {
177 RTUTF16 wc = *pwc;
178 if (!wc)
179 break;
180 if (wc < 0xd800 || wc >= 0xdc00)
181 {
182 RTUNICP ucFolded = RTUniCpToLower(wc);
183 if (ucFolded < 0x10000)
184 *pwc++ = RTUniCpToLower(wc);
185 }
186 else
187 {
188 /* surrogate */
189 RTUTF16 wc2 = pwc[1];
190 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
191 {
192 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
193 RTUNICP ucFolded = RTUniCpToLower(uc);
194 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
195 {
196 uc -= 0x10000;
197 *pwc++ = 0xd800 | (uc >> 10);
198 *pwc++ = 0xdc00 | (uc & 0x3ff);
199 }
200 }
201 else /* invalid encoding. */
202 pwc++;
203 }
204 }
205 return pwsz;
206}
207
208
209RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
210{
211 PRTUTF16 pwc = pwsz;
212 for (;;)
213 {
214 RTUTF16 wc = *pwc;
215 if (!wc)
216 break;
217 if (wc < 0xd800 || wc >= 0xdc00)
218 *pwc++ = RTUniCpToUpper(wc);
219 else
220 {
221 /* surrogate */
222 RTUTF16 wc2 = pwc[1];
223 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
224 {
225 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
226 RTUNICP ucFolded = RTUniCpToUpper(uc);
227 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
228 {
229 uc -= 0x10000;
230 *pwc++ = 0xd800 | (uc >> 10);
231 *pwc++ = 0xdc00 | (uc & 0x3ff);
232 }
233 }
234 else /* invalid encoding. */
235 pwc++;
236 }
237 }
238 return pwsz;
239}
240
241
242/**
243 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
244 *
245 * @returns iprt status code.
246 * @param pwsz The UTF-16 string.
247 * @param cwc The max length of the UTF-16 string to consider.
248 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
249 */
250static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
251{
252 int rc = VINF_SUCCESS;
253 size_t cch = 0;
254 while (cwc > 0)
255 {
256 RTUTF16 wc = *pwsz++; cwc--;
257 if (!wc)
258 break;
259 else if (wc < 0xd800 || wc > 0xdfff)
260 {
261 if (wc < 0x80)
262 cch++;
263 else if (wc < 0x800)
264 cch += 2;
265 else if (wc < 0xfffe)
266 cch += 3;
267 else
268 {
269 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
270 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
271 break;
272 }
273 }
274 else
275 {
276 if (wc >= 0xdc00)
277 {
278 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
279 rc = VERR_INVALID_UTF16_ENCODING;
280 break;
281 }
282 if (cwc <= 0)
283 {
284 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
285 rc = VERR_INVALID_UTF16_ENCODING;
286 break;
287 }
288 wc = *pwsz++; cwc--;
289 if (wc < 0xdc00 || wc > 0xdfff)
290 {
291 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
292 rc = VERR_INVALID_UTF16_ENCODING;
293 break;
294 }
295 cch += 4;
296 }
297 }
298
299
300 /* done */
301 *pcch = cch;
302 return rc;
303}
304
305
306/**
307 * Recodes an valid UTF-16 string as UTF-8.
308 *
309 * @returns iprt status code.
310 * @param pwsz The UTF-16 string.
311 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
312 * will stop when cwc or '\\0' is reached.
313 * @param psz Where to store the UTF-8 string.
314 * @param cch The size of the UTF-8 buffer, excluding the terminator.
315 * @param pcch Where to store the number of octets actually encoded.
316 */
317static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
318{
319 unsigned char *pwch = (unsigned char *)psz;
320 int rc = VINF_SUCCESS;
321 while (cwc > 0)
322 {
323 RTUTF16 wc = *pwsz++; cwc--;
324 if (!wc)
325 break;
326 else if (wc < 0xd800 || wc > 0xdfff)
327 {
328 if (wc < 0x80)
329 {
330 if (cch < 1)
331 {
332 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
333 rc = VERR_BUFFER_OVERFLOW;
334 break;
335 }
336 cch--;
337 *pwch++ = (unsigned char)wc;
338 }
339 else if (wc < 0x800)
340 {
341 if (cch < 2)
342 {
343 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
344 rc = VERR_BUFFER_OVERFLOW;
345 break;
346 }
347 cch -= 2;
348 *pwch++ = 0xc0 | (wc >> 6);
349 *pwch++ = 0x80 | (wc & 0x3f);
350 }
351 else if (wc < 0xfffe)
352 {
353 if (cch < 3)
354 {
355 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
356 rc = VERR_BUFFER_OVERFLOW;
357 break;
358 }
359 cch -= 3;
360 *pwch++ = 0xe0 | (wc >> 12);
361 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
362 *pwch++ = 0x80 | (wc & 0x3f);
363 }
364 else
365 {
366 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
367 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
368 break;
369 }
370 }
371 else
372 {
373 if (wc >= 0xdc00)
374 {
375 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
376 rc = VERR_INVALID_UTF16_ENCODING;
377 break;
378 }
379 if (cwc <= 0)
380 {
381 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
382 rc = VERR_INVALID_UTF16_ENCODING;
383 break;
384 }
385 RTUTF16 wc2 = *pwsz++; cwc--;
386 if (wc2 < 0xdc00 || wc2 > 0xdfff)
387 {
388 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
389 rc = VERR_INVALID_UTF16_ENCODING;
390 break;
391 }
392 uint32_t CodePoint = 0x10000
393 + ( ((wc & 0x3ff) << 10)
394 | (wc2 & 0x3ff));
395 if (cch < 4)
396 {
397 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
398 rc = VERR_BUFFER_OVERFLOW;
399 break;
400 }
401 cch -= 4;
402 *pwch++ = 0xf0 | (CodePoint >> 18);
403 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
404 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
405 *pwch++ = 0x80 | (CodePoint & 0x3f);
406 }
407 }
408
409 /* done */
410 *pwch = '\0';
411 *pcch = (char *)pwch - psz;
412 return rc;
413}
414
415
416
417RTDECL(int) RTUtf16ToUtf8(PCRTUTF16 pwszString, char **ppszString)
418{
419 /*
420 * Validate input.
421 */
422 Assert(VALID_PTR(ppszString));
423 Assert(VALID_PTR(pwszString));
424 *ppszString = NULL;
425
426 /*
427 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
428 */
429 size_t cch;
430 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
431 if (RT_SUCCESS(rc))
432 {
433 /*
434 * Allocate buffer and recode it.
435 */
436 char *pszResult = (char *)RTMemAlloc(cch + 1);
437 if (pszResult)
438 {
439 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
440 if (RT_SUCCESS(rc))
441 {
442 *ppszString = pszResult;
443 return rc;
444 }
445
446 RTMemFree(pszResult);
447 }
448 else
449 rc = VERR_NO_STR_MEMORY;
450 }
451 return rc;
452}
453
454
455RTDECL(int) RTUtf16ToUtf8Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
456{
457 /*
458 * Validate input.
459 */
460 Assert(VALID_PTR(pwszString));
461 Assert(VALID_PTR(ppsz));
462 Assert(!pcch || VALID_PTR(pcch));
463
464 /*
465 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
466 */
467 size_t cchResult;
468 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
469 if (RT_SUCCESS(rc))
470 {
471 if (pcch)
472 *pcch = cchResult;
473
474 /*
475 * Check buffer size / Allocate buffer and recode it.
476 */
477 bool fShouldFree;
478 char *pszResult;
479 if (cch > 0 && *ppsz)
480 {
481 fShouldFree = false;
482 if (cch <= cchResult)
483 return VERR_BUFFER_OVERFLOW;
484 pszResult = *ppsz;
485 }
486 else
487 {
488 *ppsz = NULL;
489 fShouldFree = true;
490 cch = RT_MAX(cch, cchResult + 1);
491 pszResult = (char *)RTMemAlloc(cch);
492 }
493 if (pszResult)
494 {
495 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
496 if (RT_SUCCESS(rc))
497 {
498 *ppsz = pszResult;
499 return rc;
500 }
501
502 if (fShouldFree)
503 RTMemFree(pszResult);
504 }
505 else
506 rc = VERR_NO_STR_MEMORY;
507 }
508 return rc;
509}
510
511
512RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
513{
514 size_t cch;
515 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
516 return RT_SUCCESS(rc) ? cch : 0;
517}
518
519
520RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
521{
522 size_t cch;
523 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
524 if (pcch)
525 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
526 return rc;
527}
528
529
530RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
531{
532 const RTUTF16 wc = *pwsz;
533
534 /* simple */
535 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
536 return wc;
537 if (wc < 0xfffe)
538 {
539 /* surrogate pair */
540 if (wc < 0xdc00)
541 {
542 const RTUTF16 wc2 = pwsz[1];
543 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
544 {
545 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
546 return uc;
547 }
548
549 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
550 }
551 else
552 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
553 }
554 else
555 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
556 return RTUNICP_INVALID;
557}
558
559
560RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
561{
562 const RTUTF16 wc = **ppwsz;
563
564 /* simple */
565 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
566 {
567 (*ppwsz)++;
568 *pCp = wc;
569 return VINF_SUCCESS;
570 }
571
572 int rc;
573 if (wc < 0xfffe)
574 {
575 /* surrogate pair */
576 if (wc < 0xdc00)
577 {
578 const RTUTF16 wc2 = (*ppwsz)[1];
579 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
580 {
581 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
582 *pCp = uc;
583 (*ppwsz) += 2;
584 return VINF_SUCCESS;
585 }
586
587 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
588 }
589 else
590 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
591 rc = VERR_INVALID_UTF16_ENCODING;
592 }
593 else
594 {
595 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
596 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
597 }
598 *pCp = RTUNICP_INVALID;
599 (*ppwsz)++;
600 return rc;
601}
602
603
604RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
605{
606 /* simple */
607 if ( CodePoint < 0xd800
608 || ( CodePoint > 0xdfff
609 && CodePoint < 0xfffe))
610 {
611 *pwsz++ = (RTUTF16)CodePoint;
612 return pwsz;
613 }
614
615 /* surrogate pair */
616 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
617 {
618 CodePoint -= 0x10000;
619 *pwsz++ = 0xd800 | (CodePoint >> 10);
620 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
621 return pwsz;
622 }
623
624 /* invalid code point. */
625 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
626 *pwsz++ = 0x7f;
627 return pwsz;
628}
629
630
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette