VirtualBox

source: vbox/trunk/src/VBox/Runtime/r3/posix/utf8-posix.cpp@ 87007

Last change on this file since 87007 was 82968, checked in by vboxsync, 5 years ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 18.0 KB
Line 
1/* $Id: utf8-posix.cpp 82968 2020-02-04 10:35:17Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 helpers, POSIX.
4 */
5
6/*
7 * Copyright (C) 2006-2020 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/alloc.h>
35#include <iprt/assert.h>
36#include <iprt/err.h>
37#include <iprt/string.h>
38
39#include <errno.h>
40#include <locale.h>
41
42/* iconv prototype changed with 165+ (thanks to PSARC/2010/160 Bugster 7037400) */
43#if defined(RT_OS_SOLARIS)
44# if !defined(_XPG6)
45# define IPRT_XPG6_TMP_DEF
46# define _XPG6
47# endif
48# if defined(__USE_LEGACY_PROTOTYPES__)
49# define IPRT_LEGACY_PROTO_TMP_DEF
50# undef __USE_LEGACY_PROTOTYPES__
51# endif
52#endif /* RT_OS_SOLARIS */
53
54# include <iconv.h>
55
56#if defined(RT_OS_SOLARIS)
57# if defined(IPRT_XPG6_TMP_DEF)
58# undef _XPG6
59# undef IPRT_XPG6_TMP_DEF
60# endif
61# if defined(IPRT_LEGACY_PROTO_TMP_DEF)
62# define __USE_LEGACY_PROTOTYPES__
63# undef IPRT_LEGACY_PROTO_TMP_DEF
64# endif
65#endif /* RT_OS_SOLARIS */
66
67#include <wctype.h>
68
69#include <langinfo.h>
70
71#include "internal/alignmentchecks.h"
72#include "internal/string.h"
73#ifdef RT_WITH_ICONV_CACHE
74# include "internal/thread.h"
75AssertCompile(sizeof(iconv_t) <= sizeof(void *));
76#endif
77
78
79/* There are different opinions about the constness of the input buffer. */
80#if defined(RT_OS_LINUX) || defined(RT_OS_HAIKU) || defined(RT_OS_SOLARIS) \
81 || (defined(RT_OS_DARWIN) && defined(_DARWIN_FEATURE_UNIX_CONFORMANCE))
82# define NON_CONST_ICONV_INPUT
83#endif
84#ifdef RT_OS_FREEBSD
85# include <sys/param.h>
86# if __FreeBSD_version >= 1002000 /* Changed around 10.2.2 (https://svnweb.freebsd.org/base?view=revision&revision=281550) */
87# define NON_CONST_ICONV_INPUT
88# else
89# error __FreeBSD_version__
90# endif
91#endif
92#ifdef RT_OS_NETBSD
93/* iconv constness was changed on 2019-10-24, shortly after 9.99.17 */
94# include <sys/param.h>
95# if __NetBSD_Prereq__(9,99,18)
96# define NON_CONST_ICONV_INPUT
97# endif
98#endif
99
100
101/**
102 * Gets the codeset of the current locale (LC_CTYPE).
103 *
104 * @returns Pointer to read-only string with the codeset name.
105 */
106DECLHIDDEN(const char *) rtStrGetLocaleCodeset(void)
107{
108 return nl_langinfo(CODESET);
109}
110
111
112#ifdef RT_WITH_ICONV_CACHE
113
114/**
115 * Initializes the iconv handle cache associated with a thread.
116 *
117 * @param pThread The thread in question.
118 */
119DECLHIDDEN(void) rtStrIconvCacheInit(PRTTHREADINT pThread)
120{
121 for (size_t i = 0; i < RT_ELEMENTS(pThread->ahIconvs); i++)
122 pThread->ahIconvs[i] = (iconv_t)-1;
123}
124
125/**
126 * Destroys the iconv handle cache associated with a thread.
127 *
128 * @param pThread The thread in question.
129 */
130DECLHIDDEN(void) rtStrIconvCacheDestroy(PRTTHREADINT pThread)
131{
132 for (size_t i = 0; i < RT_ELEMENTS(pThread->ahIconvs); i++)
133 {
134 iconv_t hIconv = (iconv_t)pThread->ahIconvs[i];
135 pThread->ahIconvs[i] = (iconv_t)-1;
136 if (hIconv != (iconv_t)-1)
137 iconv_close(hIconv);
138 }
139}
140
141
142/**
143 * Converts a string from one charset to another.
144 *
145 * @returns iprt status code.
146 * @param pvInput Pointer to intput string.
147 * @param cbInput Size (in bytes) of input string. Excludes any terminators.
148 * @param pszInputCS Codeset of the input string.
149 * @param ppvOutput Pointer to pointer to output buffer if cbOutput > 0.
150 * If cbOutput is 0 this is where the pointer to the allocated
151 * buffer is stored.
152 * @param cbOutput Size of the passed in buffer.
153 * @param pszOutputCS Codeset of the input string.
154 * @param cFactor Input vs. output size factor.
155 * @param phIconv Pointer to the cache entry.
156 */
157static int rtstrConvertCached(const void *pvInput, size_t cbInput, const char *pszInputCS,
158 void **ppvOutput, size_t cbOutput, const char *pszOutputCS,
159 unsigned cFactor, iconv_t *phIconv)
160{
161 /*
162 * Allocate buffer
163 */
164 bool fUcs2Term;
165 void *pvOutput;
166 size_t cbOutput2;
167 if (!cbOutput)
168 {
169 cbOutput2 = cbInput * cFactor;
170 pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
171 if (!pvOutput)
172 return VERR_NO_TMP_MEMORY;
173 fUcs2Term = true;
174 }
175 else
176 {
177 pvOutput = *ppvOutput;
178 fUcs2Term = !strcmp(pszOutputCS, "UCS-2")
179 || !strcmp(pszOutputCS, "UTF-16")
180 || !strcmp(pszOutputCS, "ucs-2")
181 || !strcmp(pszOutputCS, "utf-16");
182 cbOutput2 = cbOutput - (fUcs2Term ? sizeof(RTUTF16) : 1);
183 if (cbOutput2 > cbOutput)
184 return VERR_BUFFER_OVERFLOW;
185 }
186
187 /*
188 * Use a loop here to retry with bigger buffers.
189 */
190 for (unsigned cTries = 10; cTries > 0; cTries--)
191 {
192 /*
193 * Create conversion object if necessary.
194 */
195 iconv_t hIconv = (iconv_t)*phIconv;
196 if (hIconv == (iconv_t)-1)
197 {
198#if defined(RT_OS_SOLARIS) || defined(RT_OS_NETBSD)
199 /* Some systems don't grok empty codeset strings, so help them find the current codeset. */
200 if (!*pszInputCS)
201 pszInputCS = rtStrGetLocaleCodeset();
202 if (!*pszOutputCS)
203 pszOutputCS = rtStrGetLocaleCodeset();
204#endif
205 IPRT_ALIGNMENT_CHECKS_DISABLE(); /* glibc causes trouble */
206 *phIconv = hIconv = iconv_open(pszOutputCS, pszInputCS);
207 IPRT_ALIGNMENT_CHECKS_ENABLE();
208 }
209 if (hIconv != (iconv_t)-1)
210 {
211 /*
212 * Do the conversion.
213 */
214 size_t cbInLeft = cbInput;
215 size_t cbOutLeft = cbOutput2;
216 const void *pvInputLeft = pvInput;
217 void *pvOutputLeft = pvOutput;
218 size_t cchNonRev;
219#ifdef NON_CONST_ICONV_INPUT
220 cchNonRev = iconv(hIconv, (char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
221#else
222 cchNonRev = iconv(hIconv, (const char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
223#endif
224 if (cchNonRev != (size_t)-1)
225 {
226 if (!cbInLeft)
227 {
228 /*
229 * We're done, just add the terminator and return.
230 * (Two terminators to support UCS-2 output, too.)
231 */
232 ((char *)pvOutputLeft)[0] = '\0';
233 if (fUcs2Term)
234 ((char *)pvOutputLeft)[1] = '\0';
235 *ppvOutput = pvOutput;
236 if (cchNonRev == 0)
237 return VINF_SUCCESS;
238 return VWRN_NO_TRANSLATION;
239 }
240 errno = E2BIG;
241 }
242
243 /*
244 * If we failed because of output buffer space we'll
245 * increase the output buffer size and retry.
246 */
247 if (errno == E2BIG)
248 {
249 if (!cbOutput)
250 {
251 RTMemTmpFree(pvOutput);
252 cbOutput2 *= 2;
253 pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
254 if (!pvOutput)
255 return VERR_NO_TMP_MEMORY;
256 continue;
257 }
258 return VERR_BUFFER_OVERFLOW;
259 }
260
261 /*
262 * Close the handle on all other errors to make sure we won't carry
263 * any bad state with us.
264 */
265 *phIconv = (iconv_t)-1;
266 iconv_close(hIconv);
267 }
268 break;
269 }
270
271 /* failure */
272 if (!cbOutput)
273 RTMemTmpFree(pvOutput);
274 return VERR_NO_TRANSLATION;
275}
276
277#endif /* RT_WITH_ICONV_CACHE */
278
279/**
280 * Converts a string from one charset to another without using the handle cache.
281 *
282 * @returns IPRT status code.
283 *
284 * @param pvInput Pointer to intput string.
285 * @param cbInput Size (in bytes) of input string. Excludes any terminators.
286 * @param pszInputCS Codeset of the input string.
287 * @param ppvOutput Pointer to pointer to output buffer if cbOutput > 0.
288 * If cbOutput is 0 this is where the pointer to the allocated
289 * buffer is stored.
290 * @param cbOutput Size of the passed in buffer.
291 * @param pszOutputCS Codeset of the input string.
292 * @param cFactor Input vs. output size factor.
293 */
294static int rtStrConvertUncached(const void *pvInput, size_t cbInput, const char *pszInputCS,
295 void **ppvOutput, size_t cbOutput, const char *pszOutputCS,
296 unsigned cFactor)
297{
298 /*
299 * Allocate buffer
300 */
301 bool fUcs2Term;
302 void *pvOutput;
303 size_t cbOutput2;
304 if (!cbOutput)
305 {
306 cbOutput2 = cbInput * cFactor;
307 pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
308 if (!pvOutput)
309 return VERR_NO_TMP_MEMORY;
310 fUcs2Term = true;
311 }
312 else
313 {
314 pvOutput = *ppvOutput;
315 fUcs2Term = !strcmp(pszOutputCS, "UCS-2");
316 cbOutput2 = cbOutput - (fUcs2Term ? sizeof(RTUTF16) : 1);
317 if (cbOutput2 > cbOutput)
318 return VERR_BUFFER_OVERFLOW;
319 }
320
321 /*
322 * Use a loop here to retry with bigger buffers.
323 */
324 for (unsigned cTries = 10; cTries > 0; cTries--)
325 {
326 /*
327 * Create conversion object.
328 */
329#if defined(RT_OS_SOLARIS) || defined(RT_OS_NETBSD)
330 /* Some systems don't grok empty codeset strings, so help them find the current codeset. */
331 if (!*pszInputCS)
332 pszInputCS = rtStrGetLocaleCodeset();
333 if (!*pszOutputCS)
334 pszOutputCS = rtStrGetLocaleCodeset();
335#endif
336 IPRT_ALIGNMENT_CHECKS_DISABLE(); /* glibc causes trouble */
337 iconv_t icHandle = iconv_open(pszOutputCS, pszInputCS);
338 IPRT_ALIGNMENT_CHECKS_ENABLE();
339 if (icHandle != (iconv_t)-1)
340 {
341 /*
342 * Do the conversion.
343 */
344 size_t cbInLeft = cbInput;
345 size_t cbOutLeft = cbOutput2;
346 const void *pvInputLeft = pvInput;
347 void *pvOutputLeft = pvOutput;
348 size_t cchNonRev;
349#ifdef NON_CONST_ICONV_INPUT
350 cchNonRev = iconv(icHandle, (char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
351#else
352 cchNonRev = iconv(icHandle, (const char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
353#endif
354 if (cchNonRev != (size_t)-1)
355 {
356 if (!cbInLeft)
357 {
358 /*
359 * We're done, just add the terminator and return.
360 * (Two terminators to support UCS-2 output, too.)
361 */
362 iconv_close(icHandle);
363 ((char *)pvOutputLeft)[0] = '\0';
364 if (fUcs2Term)
365 ((char *)pvOutputLeft)[1] = '\0';
366 *ppvOutput = pvOutput;
367 if (cchNonRev == 0)
368 return VINF_SUCCESS;
369 return VWRN_NO_TRANSLATION;
370 }
371 errno = E2BIG;
372 }
373 iconv_close(icHandle);
374
375 /*
376 * If we failed because of output buffer space we'll
377 * increase the output buffer size and retry.
378 */
379 if (errno == E2BIG)
380 {
381 if (!cbOutput)
382 {
383 RTMemTmpFree(pvOutput);
384 cbOutput2 *= 2;
385 pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
386 if (!pvOutput)
387 return VERR_NO_TMP_MEMORY;
388 continue;
389 }
390 return VERR_BUFFER_OVERFLOW;
391 }
392 }
393 break;
394 }
395
396 /* failure */
397 if (!cbOutput)
398 RTMemTmpFree(pvOutput);
399 return VERR_NO_TRANSLATION;
400}
401
402
403/**
404 * Wrapper that selects rtStrConvertCached or rtStrConvertUncached.
405 *
406 * @returns IPRT status code.
407 *
408 * @param pszInput Pointer to intput string.
409 * @param cchInput Size (in bytes) of input string. Excludes any
410 * terminators.
411 * @param pszInputCS Codeset of the input string.
412 * @param ppszOutput Pointer to pointer to output buffer if cbOutput > 0.
413 * If cbOutput is 0 this is where the pointer to the
414 * allocated buffer is stored.
415 * @param cbOutput Size of the passed in buffer.
416 * @param pszOutputCS Codeset of the input string.
417 * @param cFactor Input vs. output size factor.
418 * @param enmCacheIdx The iconv cache index.
419 */
420DECLINLINE(int) rtStrConvertWrapper(const char *pchInput, size_t cchInput, const char *pszInputCS,
421 char **ppszOutput, size_t cbOutput, const char *pszOutputCS,
422 unsigned cFactor, RTSTRICONV enmCacheIdx)
423{
424#ifdef RT_WITH_ICONV_CACHE
425 RTTHREAD hSelf = RTThreadSelf();
426 if (hSelf != NIL_RTTHREAD)
427 {
428 PRTTHREADINT pThread = rtThreadGet(hSelf);
429 if (pThread)
430 {
431 if ((pThread->fIntFlags & (RTTHREADINT_FLAGS_ALIEN | RTTHREADINT_FLAGS_MAIN)) != RTTHREADINT_FLAGS_ALIEN)
432 {
433 int rc = rtstrConvertCached(pchInput, cchInput, pszInputCS,
434 (void **)ppszOutput, cbOutput, pszOutputCS,
435 cFactor, (iconv_t *)&pThread->ahIconvs[enmCacheIdx]);
436 rtThreadRelease(pThread);
437 return rc;
438 }
439 rtThreadRelease(pThread);
440 }
441 }
442#endif
443 return rtStrConvertUncached(pchInput, cchInput, pszInputCS,
444 (void **)ppszOutput, cbOutput, pszOutputCS,
445 cFactor);
446}
447
448
449/**
450 * Internal API for use by the path conversion code.
451 *
452 * @returns IPRT status code.
453 *
454 * @param pszInput Pointer to intput string.
455 * @param cchInput Size (in bytes) of input string. Excludes any
456 * terminators.
457 * @param pszInputCS Codeset of the input string.
458 * @param ppszOutput Pointer to pointer to output buffer if cbOutput > 0.
459 * If cbOutput is 0 this is where the pointer to the
460 * allocated buffer is stored.
461 * @param cbOutput Size of the passed in buffer.
462 * @param pszOutputCS Codeset of the input string.
463 * @param cFactor Input vs. output size factor.
464 * @param enmCacheIdx The iconv cache index.
465 */
466DECLHIDDEN(int) rtStrConvert(const char *pchInput, size_t cchInput, const char *pszInputCS,
467 char **ppszOutput, size_t cbOutput, const char *pszOutputCS,
468 unsigned cFactor, RTSTRICONV enmCacheIdx)
469{
470 Assert(enmCacheIdx >= 0 && enmCacheIdx < RTSTRICONV_END);
471 return rtStrConvertWrapper(pchInput, cchInput, pszInputCS,
472 ppszOutput, cbOutput, pszOutputCS,
473 cFactor, enmCacheIdx);
474}
475
476
477RTR3DECL(int) RTStrUtf8ToCurrentCPTag(char **ppszString, const char *pszString, const char *pszTag)
478{
479 Assert(ppszString);
480 Assert(pszString);
481 *ppszString = NULL;
482
483 /*
484 * Assume result string length is not longer than UTF-8 string.
485 */
486 size_t cch = strlen(pszString);
487 if (cch <= 0)
488 {
489 /* zero length string passed. */
490 *ppszString = (char *)RTMemTmpAllocZTag(sizeof(char), pszTag);
491 if (*ppszString)
492 return VINF_SUCCESS;
493 return VERR_NO_TMP_MEMORY;
494 }
495 return rtStrConvertWrapper(pszString, cch, "UTF-8", ppszString, 0, "", 1, RTSTRICONV_UTF8_TO_LOCALE);
496}
497
498
499RTR3DECL(int) RTStrUtf8ToCurrentCPExTag(char **ppszString, const char *pszString, size_t cchString, const char *pszTag)
500{
501 Assert(ppszString);
502 Assert(pszString);
503 *ppszString = NULL;
504
505 /*
506 * Assume result string length is not longer than UTF-8 string.
507 */
508 cchString = RTStrNLen(pszString, cchString);
509 if (cchString < 1)
510 {
511 /* zero length string passed. */
512 *ppszString = (char *)RTMemTmpAllocZTag(sizeof(char), pszTag);
513 if (*ppszString)
514 return VINF_SUCCESS;
515 return VERR_NO_TMP_MEMORY;
516 }
517 return rtStrConvertWrapper(pszString, cchString, "UTF-8", ppszString, 0, "", 1, RTSTRICONV_UTF8_TO_LOCALE);
518}
519
520
521RTR3DECL(int) RTStrCurrentCPToUtf8Tag(char **ppszString, const char *pszString, const char *pszTag)
522{
523 Assert(ppszString);
524 Assert(pszString);
525 *ppszString = NULL;
526
527 /*
528 * Attempt with UTF-8 length of 2x the native length.
529 */
530 size_t cch = strlen(pszString);
531 if (cch <= 0)
532 {
533 /* zero length string passed. */
534 *ppszString = (char *)RTMemTmpAllocZTag(sizeof(char), pszTag);
535 if (*ppszString)
536 return VINF_SUCCESS;
537 return VERR_NO_TMP_MEMORY;
538 }
539 return rtStrConvertWrapper(pszString, cch, "", ppszString, 0, "UTF-8", 2, RTSTRICONV_LOCALE_TO_UTF8);
540}
541
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette