VirtualBox

source: vbox/trunk/src/libs/xpcom18a4/xpcom/io/nsNativeCharsetUtils.cpp@ 101978

Last change on this file since 101978 was 101978, checked in by vboxsync, 15 months ago

libs/xpcom/xpcom/io: Convert some code from using PRLock to IPRT's RTSEMFASTMUTEX locks, bugref:10545

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 37.5 KB
Line 
1/* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
13 *
14 * The Original Code is Mozilla.
15 *
16 * The Initial Developer of the Original Code is
17 * Netscape Communications Corporation.
18 * Portions created by the Initial Developer are Copyright (C) 2002
19 * the Initial Developer. All Rights Reserved.
20 *
21 * Contributor(s):
22 * Darin Fisher <[email protected]>
23 * Brian Stell <[email protected]>
24 * Frank Tang <[email protected]>
25 * Brendan Eich <[email protected]>
26 * Sergei Dolgov <[email protected]>
27 *
28 * Alternatively, the contents of this file may be used under the terms of
29 * either the GNU General Public License Version 2 or later (the "GPL"), or
30 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
31 * in which case the provisions of the GPL or the LGPL are applicable instead
32 * of those above. If you wish to allow use of your version of this file only
33 * under the terms of either the GPL or the LGPL, and not to allow others to
34 * use your version of this file under the terms of the MPL, indicate your
35 * decision by deleting the provisions above and replace them with the notice
36 * and other provisions required by the GPL or the LGPL. If you do not delete
37 * the provisions above, a recipient may use your version of this file under
38 * the terms of any one of the MPL, the GPL or the LGPL.
39 *
40 * ***** END LICENSE BLOCK ***** */
41
42#include "xpcom-private.h"
43
44//-----------------------------------------------------------------------------
45// XP_UNIX
46//-----------------------------------------------------------------------------
47#if defined(XP_UNIX)
48
49#include <stdlib.h> // mbtowc, wctomb
50#include <locale.h> // setlocale
51#include "nscore.h"
52#include "nsAString.h"
53#include "nsReadableUtils.h"
54
55#include <iprt/assert.h>
56#include <iprt/errcore.h>
57#include <iprt/semaphore.h>
58
59//
60// choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
61// but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
62// or not (see bug 206811 and
63// news://news.mozilla.org:119/[email protected]). we now use
64// iconv for all platforms where nl_types.h and langinfo.h are present
65// along with iconv.
66//
67#if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
68#define USE_ICONV 1
69#else
70#define USE_STDCONV 1
71#endif
72
73static void
74isolatin1_to_utf16(const char **input, PRUint32 *inputLeft, PRUnichar **output, PRUint32 *outputLeft)
75{
76 while (*inputLeft && *outputLeft) {
77 **output = (unsigned char) **input;
78 (*input)++;
79 (*inputLeft)--;
80 (*output)++;
81 (*outputLeft)--;
82 }
83}
84
85static void
86utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output, PRUint32 *outputLeft)
87{
88 while (*inputLeft && *outputLeft) {
89 **output = (unsigned char) **input;
90 (*input)++;
91 (*inputLeft)--;
92 (*output)++;
93 (*outputLeft)--;
94 }
95}
96
97//-----------------------------------------------------------------------------
98// conversion using iconv
99//-----------------------------------------------------------------------------
100#if defined(USE_ICONV)
101#include <nl_types.h> // CODESET
102#include <langinfo.h> // nl_langinfo
103#include <iconv.h> // iconv_open, iconv, iconv_close
104#include <errno.h>
105
106#if defined(HAVE_ICONV_WITH_CONST_INPUT)
107#define ICONV_INPUT(x) (x)
108#else
109#define ICONV_INPUT(x) ((char **)x)
110#endif
111
112// solaris definitely needs this, but we'll enable it by default
113// just in case... but we know for sure that iconv(3) in glibc
114// doesn't need this.
115#if !defined(__GLIBC__)
116#define ENABLE_UTF8_FALLBACK_SUPPORT
117#endif
118
119#define INVALID_ICONV_T ((iconv_t) -1)
120
121static inline size_t
122xp_iconv(iconv_t converter,
123 const char **input,
124 size_t *inputLeft,
125 char **output,
126 size_t *outputLeft)
127{
128 size_t res, outputAvail = outputLeft ? *outputLeft : 0;
129 res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
130 if (res == (size_t) -1) {
131 // on some platforms (e.g., linux) iconv will fail with
132 // E2BIG if it cannot convert _all_ of its input. it'll
133 // still adjust all of the in/out params correctly, so we
134 // can ignore this error. the assumption is that we will
135 // be called again to complete the conversion.
136 if ((errno == E2BIG) && (*outputLeft < outputAvail))
137 res = 0;
138 }
139 return res;
140}
141
142static inline void
143xp_iconv_reset(iconv_t converter)
144{
145 // NOTE: the man pages on Solaris claim that you can pass NULL
146 // for all parameter to reset the converter, but beware the
147 // evil Solaris crash if you go down this route >:-)
148
149 const char *zero_char_in_ptr = NULL;
150 char *zero_char_out_ptr = NULL;
151 size_t zero_size_in = 0,
152 zero_size_out = 0;
153
154 xp_iconv(converter, &zero_char_in_ptr,
155 &zero_size_in,
156 &zero_char_out_ptr,
157 &zero_size_out);
158}
159
160static inline iconv_t
161xp_iconv_open(const char **to_list, const char **from_list)
162{
163 iconv_t res;
164 const char **from_name;
165 const char **to_name;
166
167 // try all possible combinations to locate a converter.
168 to_name = to_list;
169 while (*to_name) {
170 if (**to_name) {
171 from_name = from_list;
172 while (*from_name) {
173 if (**from_name) {
174 res = iconv_open(*to_name, *from_name);
175 if (res != INVALID_ICONV_T)
176 return res;
177 }
178 from_name++;
179 }
180 }
181 to_name++;
182 }
183
184 return INVALID_ICONV_T;
185}
186
187/*
188 * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
189 * have to use UTF-16 with iconv(3) on platforms where it's supported.
190 * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
191 * and implementations of iconv(3). On Tru64, it also depends on the environment
192 * variable. To avoid the trouble arising from byte-swapping
193 * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
194 * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
195 * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
196 * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
197 * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
198 * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
199 * can be done other than adding a note in the release notes. (bug 206811)
200 */
// NULL-terminated list of candidate iconv names for UTF-16, in the order
// xp_iconv_open tries them: the spellings with an explicit byte order that
// matches the build (selected by IS_LITTLE_ENDIAN) come first, followed by
// the unqualified UTF-16 / UCS-2 spellings.
201static const char *UTF_16_NAMES[] = {
202#if defined(IS_LITTLE_ENDIAN)
203 "UTF-16LE",
204#if defined(__GLIBC__)
205 "UNICODELITTLE",
206#endif
207 "UCS-2LE",
208#else
209 "UTF-16BE",
210#if defined(__GLIBC__)
211 "UNICODEBIG",
212#endif
213 "UCS-2BE",
214#endif
215 "UTF-16",
216 "UCS-2",
217 "UCS2",
218 "UCS_2",
219 "ucs-2",
220 "ucs2",
221 "ucs_2",
222 NULL
223};
224
225#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// NULL-terminated list of spellings under which an iconv implementation may
// know UTF-8.  Only compiled in when ENABLE_UTF8_FALLBACK_SUPPORT is defined.
226static const char *UTF_8_NAMES[] = {
227 "UTF-8",
228 "UTF8",
229 "UTF_8",
230 "utf-8",
231 "utf8",
232 "utf_8",
233 NULL
234};
235#endif
236
// NULL-terminated list of spellings for ISO-8859-1.  With glibc only the
// canonical name is listed; other platforms get the additional variants.
237static const char *ISO_8859_1_NAMES[] = {
238 "ISO-8859-1",
239#if !defined(__GLIBC__)
240 "ISO8859-1",
241 "ISO88591",
242 "ISO_8859_1",
243 "ISO8859_1",
244 "iso-8859-1",
245 "iso8859-1",
246 "iso88591",
247 "iso_8859_1",
248 "iso8859_1",
249#endif
250 NULL
251};
252
253class nsNativeCharsetConverter
254{
255public:
256 nsNativeCharsetConverter();
257 ~nsNativeCharsetConverter();
258
259 nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
260 PRUnichar **output, PRUint32 *outputLeft);
261 nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
262 char **output, PRUint32 *outputLeft);
263
264 static void GlobalInit();
265 static void GlobalShutdown();
266
267private:
268 static iconv_t gNativeToUnicode;
269 static iconv_t gUnicodeToNative;
270#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
271 static iconv_t gNativeToUTF8;
272 static iconv_t gUTF8ToNative;
273 static iconv_t gUnicodeToUTF8;
274 static iconv_t gUTF8ToUnicode;
275#endif
276 static RTSEMFASTMUTEX gLock;
277 static PRBool gInitialized;
278
279 static void LazyInit();
280
281 static void Lock() { if (gLock != NILRTSEMFASTMUTEX) RTSemFastMutexRequest(gLock); }
282 static void Unlock() { if (gLock != NILRTSEMFASTMUTEX) RTSemFastMutexRelease(gLock); }
283};
284
285iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
286iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
287#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
288iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
289iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
290iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
291iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
292#endif
293RTSEMFASTMUTEX nsNativeCharsetConverter::gLock = NIL_RTSEMFASTMUTEX;
294PRBool nsNativeCharsetConverter::gInitialized = PR_FALSE;
295
296void
297nsNativeCharsetConverter::LazyInit()
298{
299 const char *blank_list[] = { "", NULL };
300 const char **native_charset_list = blank_list;
301 const char *native_charset = nl_langinfo(CODESET);
302 if (native_charset == nsnull) {
303 NS_ERROR("native charset is unknown");
304 // fallback to ISO-8859-1
305 native_charset_list = ISO_8859_1_NAMES;
306 }
307 else
308 native_charset_list[0] = native_charset;
309
310 gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
311 gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
312
313#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
314 if (gNativeToUnicode == INVALID_ICONV_T) {
315 gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
316 gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
317 NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
318 NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
319 }
320 if (gUnicodeToNative == INVALID_ICONV_T) {
321 gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
322 gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
323 NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
324 NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
325 }
326#else
327 NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
328 NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
329#endif
330
331 /*
332 * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
333 * prepend a byte order mark unicode character (BOM, u+FEFF) during
334 * the first use of the iconv converter. The same is the case of
335 * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
336 * However, we use 'UTF-16LE/BE' in both cases, instead so that we
337 * should be safe. But just in case...
338 *
339 * This dummy conversion gets rid of the BOMs and fixes bug 153562.
340 */
341 char dummy_input[1] = { ' ' };
342 char dummy_output[4];
343
344 if (gNativeToUnicode != INVALID_ICONV_T) {
345 const char *input = dummy_input;
346 size_t input_left = sizeof(dummy_input);
347 char *output = dummy_output;
348 size_t output_left = sizeof(dummy_output);
349
350 xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
351 }
352#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
353 if (gUTF8ToUnicode != INVALID_ICONV_T) {
354 const char *input = dummy_input;
355 size_t input_left = sizeof(dummy_input);
356 char *output = dummy_output;
357 size_t output_left = sizeof(dummy_output);
358
359 xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
360 }
361#endif
362
363 gInitialized = PR_TRUE;
364}
365
366void
367nsNativeCharsetConverter::GlobalInit()
368{
369 int vrc = RTSemFastMutexCreate(&gLock);
370 NS_ASSERTION(RT_SUCCESS(vrc), "lock creation failed");
371}
372
373void
374nsNativeCharsetConverter::GlobalShutdown()
375{
376 if (gLock != NIL_RTSEMFASTMUTEX) {
377 RTSemFastMutexDestroy(gLock);
378 gLock = NIL_RTSEMFASTMUTEX;
379 }
380
381 if (gNativeToUnicode != INVALID_ICONV_T) {
382 iconv_close(gNativeToUnicode);
383 gNativeToUnicode = INVALID_ICONV_T;
384 }
385
386 if (gUnicodeToNative != INVALID_ICONV_T) {
387 iconv_close(gUnicodeToNative);
388 gUnicodeToNative = INVALID_ICONV_T;
389 }
390
391#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
392 if (gNativeToUTF8 != INVALID_ICONV_T) {
393 iconv_close(gNativeToUTF8);
394 gNativeToUTF8 = INVALID_ICONV_T;
395 }
396 if (gUTF8ToNative != INVALID_ICONV_T) {
397 iconv_close(gUTF8ToNative);
398 gUTF8ToNative = INVALID_ICONV_T;
399 }
400 if (gUnicodeToUTF8 != INVALID_ICONV_T) {
401 iconv_close(gUnicodeToUTF8);
402 gUnicodeToUTF8 = INVALID_ICONV_T;
403 }
404 if (gUTF8ToUnicode != INVALID_ICONV_T) {
405 iconv_close(gUTF8ToUnicode);
406 gUTF8ToUnicode = INVALID_ICONV_T;
407 }
408#endif
409
410 gInitialized = PR_FALSE;
411}
412
413nsNativeCharsetConverter::nsNativeCharsetConverter()
414{
415 Lock();
416 if (!gInitialized)
417 LazyInit();
418}
419
420nsNativeCharsetConverter::~nsNativeCharsetConverter()
421{
422 // reset converters for next time
423 if (gNativeToUnicode != INVALID_ICONV_T)
424 xp_iconv_reset(gNativeToUnicode);
425 if (gUnicodeToNative != INVALID_ICONV_T)
426 xp_iconv_reset(gUnicodeToNative);
427#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
428 if (gNativeToUTF8 != INVALID_ICONV_T)
429 xp_iconv_reset(gNativeToUTF8);
430 if (gUTF8ToNative != INVALID_ICONV_T)
431 xp_iconv_reset(gUTF8ToNative);
432 if (gUnicodeToUTF8 != INVALID_ICONV_T)
433 xp_iconv_reset(gUnicodeToUTF8);
434 if (gUTF8ToUnicode != INVALID_ICONV_T)
435 xp_iconv_reset(gUTF8ToUnicode);
436#endif
437 Unlock();
438}
439
440nsresult
441nsNativeCharsetConverter::NativeToUnicode(const char **input,
442 PRUint32 *inputLeft,
443 PRUnichar **output,
444 PRUint32 *outputLeft)
445{
446 size_t res = 0;
447 size_t inLeft = (size_t) *inputLeft;
448 size_t outLeft = (size_t) *outputLeft * 2;
449
450 if (gNativeToUnicode != INVALID_ICONV_T) {
451
452 res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
453
454 *inputLeft = inLeft;
455 *outputLeft = outLeft / 2;
456 if (res != (size_t) -1)
457 return NS_OK;
458
459 NS_WARNING("conversion from native to utf-16 failed");
460
461 // reset converter
462 xp_iconv_reset(gNativeToUnicode);
463 }
464#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
465 else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
466 (gUTF8ToUnicode != INVALID_ICONV_T)) {
467 // convert first to UTF8, then from UTF8 to UCS2
468 const char *in = *input;
469
470 char ubuf[1024];
471
472 // we assume we're always called with enough space in |output|,
473 // so convert many chars at a time...
474 while (inLeft) {
475 char *p = ubuf;
476 size_t n = sizeof(ubuf);
477 res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
478 if (res == (size_t) -1) {
479 NS_ERROR("conversion from native to utf-8 failed");
480 break;
481 }
482 NS_ASSERTION(outLeft > 0, "bad assumption");
483 p = ubuf;
484 n = sizeof(ubuf) - n;
485 res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
486 if (res == (size_t) -1) {
487 NS_ERROR("conversion from utf-8 to utf-16 failed");
488 break;
489 }
490 }
491
492 (*input) += (*inputLeft - inLeft);
493 *inputLeft = inLeft;
494 *outputLeft = outLeft / 2;
495
496 if (res != (size_t) -1)
497 return NS_OK;
498
499 // reset converters
500 xp_iconv_reset(gNativeToUTF8);
501 xp_iconv_reset(gUTF8ToUnicode);
502 }
503#endif
504
505 // fallback: zero-pad and hope for the best
506 // XXX This is lame and we have to do better.
507 isolatin1_to_utf16(input, inputLeft, output, outputLeft);
508
509 return NS_OK;
510}
511
512nsresult
513nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
514 PRUint32 *inputLeft,
515 char **output,
516 PRUint32 *outputLeft)
517{
518 size_t res = 0;
519 size_t inLeft = (size_t) *inputLeft * 2;
520 size_t outLeft = (size_t) *outputLeft;
521
522 if (gUnicodeToNative != INVALID_ICONV_T) {
523 res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
524
525 if (res != (size_t) -1) {
526 *inputLeft = inLeft / 2;
527 *outputLeft = outLeft;
528 return NS_OK;
529 }
530
531 NS_ERROR("iconv failed");
532
533 // reset converter
534 xp_iconv_reset(gUnicodeToNative);
535 }
536#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
537 else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
538 (gUTF8ToNative != INVALID_ICONV_T)) {
539 const char *in = (const char *) *input;
540
541 char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
542
543 // convert one uchar at a time...
544 while (inLeft && outLeft) {
545 char *p = ubuf;
546 size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
547 res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
548 if (res == (size_t) -1) {
549 NS_ERROR("conversion from utf-16 to utf-8 failed");
550 break;
551 }
552 p = ubuf;
553 n = sizeof(ubuf) - n;
554 res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
555 if (res == (size_t) -1) {
556 if (errno == E2BIG) {
557 // not enough room for last uchar... back up and return.
558 in -= sizeof(PRUnichar);
559 res = 0;
560 }
561 else
562 NS_ERROR("conversion from utf-8 to native failed");
563 break;
564 }
565 inLeft -= sizeof(PRUnichar);
566 }
567
568 if (res != (size_t) -1) {
569 (*input) += (*inputLeft - inLeft/2);
570 *inputLeft = inLeft/2;
571 *outputLeft = outLeft;
572 return NS_OK;
573 }
574
575 // reset converters
576 xp_iconv_reset(gUnicodeToUTF8);
577 xp_iconv_reset(gUTF8ToNative);
578 }
579#endif
580
581 // fallback: truncate and hope for the best
582 utf16_to_isolatin1(input, inputLeft, output, outputLeft);
583
584 return NS_OK;
585}
586
587#endif // USE_ICONV
588
589//-----------------------------------------------------------------------------
590// conversion using mb[r]towc/wc[r]tomb
591//-----------------------------------------------------------------------------
592#if defined(USE_STDCONV)
593#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
594#include <wchar.h> // mbrtowc, wcrtomb
595#endif
596
597class nsNativeCharsetConverter
598{
599public:
600 nsNativeCharsetConverter();
601
602 nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
603 PRUnichar **output, PRUint32 *outputLeft);
604 nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
605 char **output, PRUint32 *outputLeft);
606
607 static void GlobalInit();
608 static void GlobalShutdown() { }
609
610private:
611 static PRBool gWCharIsUnicode;
612
613#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
614 mbstate_t ps;
615#endif
616};
617
618PRBool nsNativeCharsetConverter::gWCharIsUnicode = PR_FALSE;
619
620nsNativeCharsetConverter::nsNativeCharsetConverter()
621{
622#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
623 memset(&ps, 0, sizeof(ps));
624#endif
625}
626
627void
628nsNativeCharsetConverter::GlobalInit()
629{
630 // verify that wchar_t for the current locale is actually unicode.
631 // if it is not, then we should avoid calling mbtowc/wctomb and
632 // just fallback on zero-pad/truncation conversion.
633 //
634 // this test cannot be done at build time because the encoding of
635 // wchar_t may depend on the runtime locale. sad, but true!!
636 //
637 // so, if wchar_t is unicode then converting an ASCII character
638 // to wchar_t should not change its numeric value. we'll just
639 // check what happens with the ASCII 'a' character.
640 //
641 // this test is not perfect... obviously, it could yield false
642 // positives, but then at least ASCII text would be converted
643 // properly (or maybe just the 'a' character) -- oh well :(
644
645 char a = 'a';
646 unsigned int w = 0;
647
648 int res = mbtowc((wchar_t *) &w, &a, 1);
649
650 gWCharIsUnicode = (res != -1 && w == 'a');
651
652#ifdef DEBUG
653 if (!gWCharIsUnicode)
654 NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
655#endif
656}
657
658nsresult
659nsNativeCharsetConverter::NativeToUnicode(const char **input,
660 PRUint32 *inputLeft,
661 PRUnichar **output,
662 PRUint32 *outputLeft)
663{
664 if (gWCharIsUnicode) {
665 int incr;
666
667 // cannot use wchar_t here since it may have been redefined (e.g.,
668 // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
669 unsigned int tmp = 0;
670 while (*inputLeft && *outputLeft) {
671#ifdef HAVE_MBRTOWC
672 incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
673#else
674 // XXX is this thread-safe?
675 incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
676#endif
677 if (incr < 0) {
678 NS_WARNING("mbtowc failed: possible charset mismatch");
679 // zero-pad and hope for the best
680 tmp = (unsigned char) **input;
681 incr = 1;
682 }
683 **output = (PRUnichar) tmp;
684 (*input) += incr;
685 (*inputLeft) -= incr;
686 (*output)++;
687 (*outputLeft)--;
688 }
689 }
690 else {
691 // wchar_t isn't unicode, so the best we can do is treat the
692 // input as if it is isolatin1 :(
693 isolatin1_to_utf16(input, inputLeft, output, outputLeft);
694 }
695
696 return NS_OK;
697}
698
699nsresult
700nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
701 PRUint32 *inputLeft,
702 char **output,
703 PRUint32 *outputLeft)
704{
705 if (gWCharIsUnicode) {
706 int incr;
707
708 while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
709#ifdef HAVE_WCRTOMB
710 incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
711#else
712 // XXX is this thread-safe?
713 incr = (int) wctomb(*output, (wchar_t) **input);
714#endif
715 if (incr < 0) {
716 NS_WARNING("mbtowc failed: possible charset mismatch");
717 **output = (unsigned char) **input; // truncate
718 incr = 1;
719 }
720 // most likely we're dead anyways if this assertion should fire
721 NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
722 (*output) += incr;
723 (*outputLeft) -= incr;
724 (*input)++;
725 (*inputLeft)--;
726 }
727 }
728 else {
729 // wchar_t isn't unicode, so the best we can do is treat the
730 // input as if it is isolatin1 :(
731 utf16_to_isolatin1(input, inputLeft, output, outputLeft);
732 }
733
734 return NS_OK;
735}
736
737#endif // USE_STDCONV
738
739//-----------------------------------------------------------------------------
740// API implementation
741//-----------------------------------------------------------------------------
742
743NS_COM nsresult
744NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
745{
746 output.Truncate();
747
748 PRUint32 inputLen = input.Length();
749
750 nsACString::const_iterator iter;
751 input.BeginReading(iter);
752
753 //
754 // OPTIMIZATION: preallocate space for largest possible result; convert
755 // directly into the result buffer to avoid intermediate buffer copy.
756 //
757 // this will generally result in a larger allocation, but that seems
758 // better than an extra buffer copy.
759 //
760 output.SetLength(inputLen);
761 nsAString::iterator out_iter;
762 output.BeginWriting(out_iter);
763
764 PRUnichar *result = out_iter.get();
765 PRUint32 resultLeft = inputLen;
766
767 const char *buf = iter.get();
768 PRUint32 bufLeft = inputLen;
769
770 nsNativeCharsetConverter conv;
771 nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
772 if (NS_SUCCEEDED(rv)) {
773 NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
774 output.SetLength(inputLen - resultLeft);
775 }
776 return rv;
777}
778
779NS_COM nsresult
780NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
781{
782 output.Truncate();
783
784 nsAString::const_iterator iter, end;
785 input.BeginReading(iter);
786 input.EndReading(end);
787
788 // cannot easily avoid intermediate buffer copy.
789 char temp[4096];
790
791 nsNativeCharsetConverter conv;
792
793 const PRUnichar *buf = iter.get();
794 PRUint32 bufLeft = Distance(iter, end);
795 while (bufLeft) {
796 char *p = temp;
797 PRUint32 tempLeft = sizeof(temp);
798
799 nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
800 if (NS_FAILED(rv)) return rv;
801
802 if (tempLeft < sizeof(temp))
803 output.Append(temp, sizeof(temp) - tempLeft);
804 }
805 return NS_OK;
806}
807
808void
809NS_StartupNativeCharsetUtils()
810{
811 //
812 // need to initialize the locale or else charset conversion will fail.
813 // better not delay this in case some other component alters the locale
814 // settings.
815 //
816 // XXX we assume that we are called early enough that we should
817 // always be the first to care about the locale's charset.
818 //
819 setlocale(LC_CTYPE, "");
820
821 nsNativeCharsetConverter::GlobalInit();
822}
823
824void
825NS_ShutdownNativeCharsetUtils()
826{
827 nsNativeCharsetConverter::GlobalShutdown();
828}
829
830//-----------------------------------------------------------------------------
831// XP_BEOS
832//-----------------------------------------------------------------------------
833#elif defined(XP_BEOS)
834
835#include "nsAString.h"
836#include "nsReadableUtils.h"
837#include "nsString.h"
838
839NS_COM nsresult
840NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
841{
842 CopyUTF8toUTF16(input, output);
843 return NS_OK;
844}
845
846NS_COM nsresult
847NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
848{
849 CopyUTF16toUTF8(input, output);
850 return NS_OK;
851}
852
// BeOS variant: no global state to set up.
void
NS_StartupNativeCharsetUtils()
{
    // intentionally empty
}

// BeOS variant: no global state to tear down.
void
NS_ShutdownNativeCharsetUtils()
{
    // intentionally empty
}
862
863//-----------------------------------------------------------------------------
864// XP_WIN
865//-----------------------------------------------------------------------------
866#elif defined(XP_WIN)
867
868#include <windows.h>
869#include "nsAString.h"
870
871NS_COM nsresult
872NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
873{
874 PRUint32 inputLen = input.Length();
875
876 nsACString::const_iterator iter;
877 input.BeginReading(iter);
878
879 const char *buf = iter.get();
880
881 // determine length of result
882 PRUint32 resultLen = 0;
883 int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, NULL, 0);
884 if (n > 0)
885 resultLen += n;
886
887 // allocate sufficient space
888 output.SetLength(resultLen);
889 if (resultLen > 0) {
890 nsAString::iterator out_iter;
891 output.BeginWriting(out_iter);
892
893 PRUnichar *result = out_iter.get();
894
895 ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, result, resultLen);
896 }
897 return NS_OK;
898}
899
900NS_COM nsresult
901NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
902{
903 PRUint32 inputLen = input.Length();
904
905 nsAString::const_iterator iter;
906 input.BeginReading(iter);
907
908 const PRUnichar *buf = iter.get();
909
910 // determine length of result
911 PRUint32 resultLen = 0;
912
913 int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, NULL, 0, NULL, NULL);
914 if (n > 0)
915 resultLen += n;
916
917 // allocate sufficient space
918 output.SetLength(resultLen);
919 if (resultLen > 0) {
920 nsACString::iterator out_iter;
921 output.BeginWriting(out_iter);
922
923 // default "defaultChar" is '?', which is an illegal character on windows
924 // file system. That will cause file uncreatable. Change it to '_'
925 const char defaultChar = '_';
926
927 char *result = out_iter.get();
928
929 ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
930 &defaultChar, NULL);
931 }
932 return NS_OK;
933}
934
// Windows variant: no global state to set up.
void
NS_StartupNativeCharsetUtils()
{
    // intentionally empty
}

// Windows variant: no global state to tear down.
void
NS_ShutdownNativeCharsetUtils()
{
    // intentionally empty
}
944
945//-----------------------------------------------------------------------------
946// XP_OS2
947//-----------------------------------------------------------------------------
948#elif defined(XP_OS2)
949
950#define INCL_DOS
951#include <os2.h>
952#include <uconv.h>
953#include "nsAString.h"
954#include <ulserrno.h>
955#include "nsNativeCharsetUtils.h"
956
957static UconvObject UnicodeConverter = NULL;
958
959NS_COM nsresult
960NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
961{
962 PRUint32 inputLen = input.Length();
963
964 nsACString::const_iterator iter;
965 input.BeginReading(iter);
966 const char *inputStr = iter.get();
967
968 // determine length of result
969 PRUint32 resultLen = inputLen;
970 output.SetLength(resultLen);
971
972 nsAString::iterator out_iter;
973 output.BeginWriting(out_iter);
974 UniChar *result = (UniChar*)out_iter.get();
975
976 size_t cSubs = 0;
977 size_t resultLeft = resultLen;
978
979 if (!UnicodeConverter)
980 NS_StartupNativeCharsetUtils();
981
982 int unirc = ::UniUconvToUcs(UnicodeConverter, (void**)&inputStr, &inputLen,
983 &result, &resultLeft, &cSubs);
984
985 NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
986
987 if (unirc != ULS_SUCCESS) {
988 output.Truncate();
989 return NS_ERROR_FAILURE;
990 }
991
992 // Need to update string length to reflect how many bytes were actually
993 // written.
994 output.Truncate(resultLen - resultLeft);
995 return NS_OK;
996}
997
998NS_COM nsresult
999NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1000{
1001 size_t inputLen = input.Length();
1002
1003 nsAString::const_iterator iter;
1004 input.BeginReading(iter);
1005 UniChar* inputStr = (UniChar*) NS_CONST_CAST(PRUnichar*, iter.get());
1006
1007 // maximum length of unicode string of length x converted to native
1008 // codepage is x*2
1009 size_t resultLen = inputLen * 2;
1010 output.SetLength(resultLen);
1011
1012 nsACString::iterator out_iter;
1013 output.BeginWriting(out_iter);
1014 char *result = out_iter.get();
1015
1016 size_t cSubs = 0;
1017 size_t resultLeft = resultLen;
1018
1019 if (!UnicodeConverter)
1020 NS_StartupNativeCharsetUtils();
1021
1022 int unirc = ::UniUconvFromUcs(UnicodeConverter, &inputStr, &inputLen,
1023 (void**)&result, &resultLeft, &cSubs);
1024
1025 NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1026
1027 if (unirc != ULS_SUCCESS) {
1028 output.Truncate();
1029 return NS_ERROR_FAILURE;
1030 }
1031
1032 // Need to update string length to reflect how many bytes were actually
1033 // written.
1034 output.Truncate(resultLen - resultLeft);
1035 return NS_OK;
1036}
1037
1038void
1039NS_StartupNativeCharsetUtils()
1040{
1041 ULONG ulLength;
1042 ULONG ulCodePage;
1043 DosQueryCp(sizeof(ULONG), &ulCodePage, &ulLength);
1044
1045 UniChar codepage[20];
1046 int unirc = ::UniMapCpToUcsCp(ulCodePage, codepage, 20);
1047 if (unirc == ULS_SUCCESS) {
1048 unirc = ::UniCreateUconvObject(codepage, &UnicodeConverter);
1049 if (unirc == ULS_SUCCESS) {
1050 uconv_attribute_t attr;
1051 ::UniQueryUconvObject(UnicodeConverter, &attr, sizeof(uconv_attribute_t),
1052 NULL, NULL, NULL);
1053 attr.options = UCONV_OPTION_SUBSTITUTE_BOTH;
1054 attr.subchar_len=1;
1055 attr.subchar[0]='_';
1056 ::UniSetUconvObject(UnicodeConverter, &attr);
1057 }
1058 }
1059}
1060
1061void
1062NS_ShutdownNativeCharsetUtils()
1063{
1064 ::UniFreeUconvObject(UnicodeConverter);
1065}
1066
1067//-----------------------------------------------------------------------------
1068// XP_MAC
1069//-----------------------------------------------------------------------------
1070#elif defined(XP_MAC)
1071
1072#include <UnicodeConverter.h>
1073#include <TextCommon.h>
1074#include <Script.h>
1075#include <MacErrors.h>
1076#include "nsAString.h"
1077
1078class nsFSStringConversionMac {
1079public:
1080 static nsresult UCSToFS(const nsAString& aIn, nsACString& aOut);
1081 static nsresult FSToUCS(const nsACString& ain, nsAString& aOut);
1082
1083 static void CleanUp();
1084
1085private:
1086 static TextEncoding GetSystemEncoding();
1087 static nsresult PrepareEncoder();
1088 static nsresult PrepareDecoder();
1089
1090 static UnicodeToTextInfo sEncoderInfo;
1091 static TextToUnicodeInfo sDecoderInfo;
1092};
1093
1094UnicodeToTextInfo nsFSStringConversionMac::sEncoderInfo = nsnull;
1095TextToUnicodeInfo nsFSStringConversionMac::sDecoderInfo = nsnull;
1096
1097nsresult nsFSStringConversionMac::UCSToFS(const nsAString& aIn, nsACString& aOut)
1098{
1099 nsresult rv = PrepareEncoder();
1100 if (NS_FAILED(rv)) return rv;
1101
1102 OSStatus err = noErr;
1103 char stackBuffer[512];
1104
1105 aOut.Truncate();
1106
1107 // for each chunk of |aIn|...
1108 nsReadingIterator<PRUnichar> iter;
1109 aIn.BeginReading(iter);
1110
1111 PRUint32 fragmentLength = PRUint32(iter.size_forward());
1112 UInt32 bytesLeft = fragmentLength * sizeof(UniChar);
1113
1114 do {
1115 UInt32 bytesRead = 0, bytesWritten = 0;
1116 err = ::ConvertFromUnicodeToText(sEncoderInfo,
1117 bytesLeft,
1118 (const UniChar*)iter.get(),
1119 kUnicodeUseFallbacksMask | kUnicodeLooseMappingsMask,
1120 0, nsnull, nsnull, nsnull,
1121 sizeof(stackBuffer),
1122 &bytesRead,
1123 &bytesWritten,
1124 stackBuffer);
1125 if (err == kTECUsedFallbacksStatus)
1126 err = noErr;
1127 else if (err == kTECOutputBufferFullStatus) {
1128 bytesLeft -= bytesRead;
1129 iter.advance(bytesRead / sizeof(UniChar));
1130 }
1131 aOut.Append(stackBuffer, bytesWritten);
1132 }
1133 while (err == kTECOutputBufferFullStatus);
1134
1135 return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1136}
1137
1138nsresult nsFSStringConversionMac::FSToUCS(const nsACString& aIn, nsAString& aOut)
1139{
1140 nsresult rv = PrepareDecoder();
1141 if (NS_FAILED(rv)) return rv;
1142
1143 OSStatus err = noErr;
1144 UniChar stackBuffer[512];
1145
1146 aOut.Truncate(0);
1147
1148 // for each chunk of |aIn|...
1149 nsReadingIterator<char> iter;
1150 aIn.BeginReading(iter);
1151
1152 PRUint32 fragmentLength = PRUint32(iter.size_forward());
1153 UInt32 bytesLeft = fragmentLength;
1154
1155 do {
1156 UInt32 bytesRead = 0, bytesWritten = 0;
1157 err = ::ConvertFromTextToUnicode(sDecoderInfo,
1158 bytesLeft,
1159 iter.get(),
1160 kUnicodeUseFallbacksMask | kUnicodeLooseMappingsMask,
1161 0, nsnull, nsnull, nsnull,
1162 sizeof(stackBuffer),
1163 &bytesRead,
1164 &bytesWritten,
1165 stackBuffer);
1166 if (err == kTECUsedFallbacksStatus)
1167 err = noErr;
1168 else if (err == kTECOutputBufferFullStatus) {
1169 bytesLeft -= bytesRead;
1170 iter.advance(bytesRead);
1171 }
1172 aOut.Append((PRUnichar *)stackBuffer, bytesWritten / sizeof(PRUnichar));
1173 }
1174 while (err == kTECOutputBufferFullStatus);
1175
1176 return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1177}
1178
1179void nsFSStringConversionMac::CleanUp()
1180{
1181 if (sDecoderInfo) {
1182 ::DisposeTextToUnicodeInfo(&sDecoderInfo);
1183 sDecoderInfo = nsnull;
1184 }
1185 if (sEncoderInfo) {
1186 ::DisposeUnicodeToTextInfo(&sEncoderInfo);
1187 sEncoderInfo = nsnull;
1188 }
1189}
1190
1191TextEncoding nsFSStringConversionMac::GetSystemEncoding()
1192{
1193 OSStatus err;
1194 TextEncoding theEncoding;
1195
1196 err = ::UpgradeScriptInfoToTextEncoding(smSystemScript, kTextLanguageDontCare,
1197 kTextRegionDontCare, NULL, &theEncoding);
1198
1199 if (err != noErr)
1200 theEncoding = kTextEncodingMacRoman;
1201
1202 return theEncoding;
1203}
1204
1205nsresult nsFSStringConversionMac::PrepareEncoder()
1206{
1207 nsresult rv = NS_OK;
1208 if (!sEncoderInfo) {
1209 OSStatus err;
1210 err = ::CreateUnicodeToTextInfoByEncoding(GetSystemEncoding(), &sEncoderInfo);
1211 if (err)
1212 rv = NS_ERROR_FAILURE;
1213 }
1214 return rv;
1215}
1216
1217nsresult nsFSStringConversionMac::PrepareDecoder()
1218{
1219 nsresult rv = NS_OK;
1220 if (!sDecoderInfo) {
1221 OSStatus err;
1222 err = ::CreateTextToUnicodeInfoByEncoding(GetSystemEncoding(), &sDecoderInfo);
1223 if (err)
1224 rv = NS_ERROR_FAILURE;
1225 }
1226 return rv;
1227}
1228
1229NS_COM nsresult
1230NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1231{
1232 return nsFSStringConversionMac::FSToUCS(input, output);
1233}
1234
1235NS_COM nsresult
1236NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1237{
1238 return nsFSStringConversionMac::UCSToFS(input, output);
1239}
1240
// Nothing to do at startup on XP_MAC.
void
NS_StartupNativeCharsetUtils()
{
    // Intentionally empty.
}
1245
1246void
1247NS_ShutdownNativeCharsetUtils()
1248{
1249 nsFSStringConversionMac::CleanUp();
1250}
1251
1252//-----------------------------------------------------------------------------
1253// default : truncate/zeropad
1254//-----------------------------------------------------------------------------
1255#else
1256
1257#include "nsReadableUtils.h"
1258
1259NS_COM nsresult
1260NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1261{
1262 CopyASCIItoUCS2(input, output);
1263 return NS_OK;
1264}
1265
1266NS_COM nsresult
1267NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1268{
1269 CopyUCS2toASCII(input, output);
1270 return NS_OK;
1271}
1272
// The fallback implementation keeps no state, so startup does nothing.
void
NS_StartupNativeCharsetUtils()
{
    // Intentionally empty.
}
1277
// The fallback implementation keeps no state, so shutdown does nothing.
void
NS_ShutdownNativeCharsetUtils()
{
    // Intentionally empty.
}
1282
1283#endif
1284
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette