VirtualBox

source: vbox/trunk/src/libs/xpcom18a4/xpcom/io/nsNativeCharsetUtils.cpp@ 49747

Last change on this file since 49747 was 46043, checked in by vboxsync, 12 years ago

src/libs/xpcom18a4: remove L4.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 37.3 KB
Line 
1/* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
13 *
14 * The Original Code is Mozilla.
15 *
16 * The Initial Developer of the Original Code is
17 * Netscape Communications Corporation.
18 * Portions created by the Initial Developer are Copyright (C) 2002
19 * the Initial Developer. All Rights Reserved.
20 *
21 * Contributor(s):
22 * Darin Fisher <[email protected]>
23 * Brian Stell <[email protected]>
24 * Frank Tang <[email protected]>
25 * Brendan Eich <[email protected]>
26 * Sergei Dolgov <[email protected]>
27 *
28 * Alternatively, the contents of this file may be used under the terms of
29 * either the GNU General Public License Version 2 or later (the "GPL"), or
30 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
31 * in which case the provisions of the GPL or the LGPL are applicable instead
32 * of those above. If you wish to allow use of your version of this file only
33 * under the terms of either the GPL or the LGPL, and not to allow others to
34 * use your version of this file under the terms of the MPL, indicate your
35 * decision by deleting the provisions above and replace them with the notice
36 * and other provisions required by the GPL or the LGPL. If you do not delete
37 * the provisions above, a recipient may use your version of this file under
38 * the terms of any one of the MPL, the GPL or the LGPL.
39 *
40 * ***** END LICENSE BLOCK ***** */
41
42#include "xpcom-private.h"
43
44//-----------------------------------------------------------------------------
45// XP_UNIX
46//-----------------------------------------------------------------------------
47#if defined(XP_UNIX)
48
49#include <stdlib.h> // mbtowc, wctomb
50#include <locale.h> // setlocale
51#include "nscore.h"
52#include "prlock.h"
53#include "nsAString.h"
54#include "nsReadableUtils.h"
55
56//
57// choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
58// but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
59// or not (see bug 206811 and
60// news://news.mozilla.org:119/[email protected]). we now use
61// iconv for all platforms where nl_types.h and langinfo.h are present
62// along with iconv.
63//
64#if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
65#define USE_ICONV 1
66#else
67#define USE_STDCONV 1
68#endif
69
70static void
71isolatin1_to_utf16(const char **input, PRUint32 *inputLeft, PRUnichar **output, PRUint32 *outputLeft)
72{
73 while (*inputLeft && *outputLeft) {
74 **output = (unsigned char) **input;
75 (*input)++;
76 (*inputLeft)--;
77 (*output)++;
78 (*outputLeft)--;
79 }
80}
81
82static void
83utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output, PRUint32 *outputLeft)
84{
85 while (*inputLeft && *outputLeft) {
86 **output = (unsigned char) **input;
87 (*input)++;
88 (*inputLeft)--;
89 (*output)++;
90 (*outputLeft)--;
91 }
92}
93
94//-----------------------------------------------------------------------------
95// conversion using iconv
96//-----------------------------------------------------------------------------
97#if defined(USE_ICONV)
98#include <nl_types.h> // CODESET
99#include <langinfo.h> // nl_langinfo
100#include <iconv.h> // iconv_open, iconv, iconv_close
101#include <errno.h>
102
103#if defined(HAVE_ICONV_WITH_CONST_INPUT)
104#define ICONV_INPUT(x) (x)
105#else
106#define ICONV_INPUT(x) ((char **)x)
107#endif
108
109// solaris definitely needs this, but we'll enable it by default
110// just in case... but we know for sure that iconv(3) in glibc
111// doesn't need this.
112#if !defined(__GLIBC__)
113#define ENABLE_UTF8_FALLBACK_SUPPORT
114#endif
115
116#define INVALID_ICONV_T ((iconv_t) -1)
117
118static inline size_t
119xp_iconv(iconv_t converter,
120 const char **input,
121 size_t *inputLeft,
122 char **output,
123 size_t *outputLeft)
124{
125 size_t res, outputAvail = outputLeft ? *outputLeft : 0;
126 res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
127 if (res == (size_t) -1) {
128 // on some platforms (e.g., linux) iconv will fail with
129 // E2BIG if it cannot convert _all_ of its input. it'll
130 // still adjust all of the in/out params correctly, so we
131 // can ignore this error. the assumption is that we will
132 // be called again to complete the conversion.
133 if ((errno == E2BIG) && (*outputLeft < outputAvail))
134 res = 0;
135 }
136 return res;
137}
138
139static inline void
140xp_iconv_reset(iconv_t converter)
141{
142 // NOTE: the man pages on Solaris claim that you can pass NULL
143 // for all parameter to reset the converter, but beware the
144 // evil Solaris crash if you go down this route >:-)
145
146 const char *zero_char_in_ptr = NULL;
147 char *zero_char_out_ptr = NULL;
148 size_t zero_size_in = 0,
149 zero_size_out = 0;
150
151 xp_iconv(converter, &zero_char_in_ptr,
152 &zero_size_in,
153 &zero_char_out_ptr,
154 &zero_size_out);
155}
156
157static inline iconv_t
158xp_iconv_open(const char **to_list, const char **from_list)
159{
160 iconv_t res;
161 const char **from_name;
162 const char **to_name;
163
164 // try all possible combinations to locate a converter.
165 to_name = to_list;
166 while (*to_name) {
167 if (**to_name) {
168 from_name = from_list;
169 while (*from_name) {
170 if (**from_name) {
171 res = iconv_open(*to_name, *from_name);
172 if (res != INVALID_ICONV_T)
173 return res;
174 }
175 from_name++;
176 }
177 }
178 to_name++;
179 }
180
181 return INVALID_ICONV_T;
182}
183
184/*
185 * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
186 * have to use UTF-16 with iconv(3) on platforms where it's supported.
187 * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
188 * and implementations of iconv(3). On Tru64, it also depends on the environment
189 * variable. To avoid the trouble arising from byte-swapping
190 * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
191 * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
192 * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
193 * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
194 * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
195 * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
196 * can be done other than adding a note in the release notes. (bug 206811)
197 */
// NULL-terminated list of charset names tried, in order, when asking iconv
// for a UTF-16 converter. Names with an explicit byte order matching the
// build's endianness come first; the unqualified UTF-16/UCS-2 spellings are
// the last resort.
198static const char *UTF_16_NAMES[] = {
199#if defined(IS_LITTLE_ENDIAN)
200 "UTF-16LE",
201#if defined(__GLIBC__)
202 "UNICODELITTLE",
203#endif
204 "UCS-2LE",
205#else
206 "UTF-16BE",
207#if defined(__GLIBC__)
208 "UNICODEBIG",
209#endif
210 "UCS-2BE",
211#endif
212 "UTF-16",
213 "UCS-2",
214 "UCS2",
215 "UCS_2",
216 "ucs-2",
217 "ucs2",
218 "ucs_2",
219 NULL
220};
221
222#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// NULL-terminated list of spellings tried, in order, when asking iconv for
// a UTF-8 converter. Only compiled when the two-step UTF-8 fallback is
// enabled.
223static const char *UTF_8_NAMES[] = {
224 "UTF-8",
225 "UTF8",
226 "UTF_8",
227 "utf-8",
228 "utf8",
229 "utf_8",
230 NULL
231};
232#endif
233
// NULL-terminated list of spellings tried, in order, when asking iconv for
// an ISO-8859-1 converter. With glibc only the canonical name is listed;
// other platforms get the alternative spellings as well.
234static const char *ISO_8859_1_NAMES[] = {
235 "ISO-8859-1",
236#if !defined(__GLIBC__)
237 "ISO8859-1",
238 "ISO88591",
239 "ISO_8859_1",
240 "ISO8859_1",
241 "iso-8859-1",
242 "iso8859-1",
243 "iso88591",
244 "iso_8859_1",
245 "iso8859_1",
246#endif
247 NULL
248};
249
// iconv-based converter between the locale's native charset and UTF-16.
//
// All state is static: the iconv descriptors are shared by every instance.
// The constructor takes gLock and the destructor releases it, so an
// instance acts as a scoped lock around use of the shared descriptors.
250class nsNativeCharsetConverter
251{
252public:
253 nsNativeCharsetConverter();
254 ~nsNativeCharsetConverter();
255
// Both conversion methods advance *input / *output and decrement the
// *Left counters (counted in elements of the respective buffer).
256 nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
257 PRUnichar **output, PRUint32 *outputLeft);
258 nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
259 char **output, PRUint32 *outputLeft);
260
261 static void GlobalInit();
262 static void GlobalShutdown();
263
264private:
// direct converters; INVALID_ICONV_T when not open
265 static iconv_t gNativeToUnicode;
266 static iconv_t gUnicodeToNative;
267#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// two-step converters going through UTF-8, opened only when the
// corresponding direct converter could not be opened
268 static iconv_t gNativeToUTF8;
269 static iconv_t gUTF8ToNative;
270 static iconv_t gUnicodeToUTF8;
271 static iconv_t gUTF8ToUnicode;
272#endif
273 static PRLock *gLock;
274 static PRBool gInitialized;
275
276 static void LazyInit();
277
// locking is a no-op while gLock is null
278 static void Lock() { if (gLock) PR_Lock(gLock); }
279 static void Unlock() { if (gLock) PR_Unlock(gLock); }
280};
281
// static member definitions: every converter starts out closed
282iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
283iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
284#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
285iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
286iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
287iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
288iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
289#endif
290PRLock *nsNativeCharsetConverter::gLock = nsnull;
291PRBool nsNativeCharsetConverter::gInitialized = PR_FALSE;
292
293void
294nsNativeCharsetConverter::LazyInit()
295{
296 const char *blank_list[] = { "", NULL };
297 const char **native_charset_list = blank_list;
298 const char *native_charset = nl_langinfo(CODESET);
299 if (native_charset == nsnull) {
300 NS_ERROR("native charset is unknown");
301 // fallback to ISO-8859-1
302 native_charset_list = ISO_8859_1_NAMES;
303 }
304 else
305 native_charset_list[0] = native_charset;
306
307 gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
308 gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
309
310#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
311 if (gNativeToUnicode == INVALID_ICONV_T) {
312 gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
313 gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
314 NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
315 NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
316 }
317 if (gUnicodeToNative == INVALID_ICONV_T) {
318 gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
319 gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
320 NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
321 NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
322 }
323#else
324 NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
325 NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
326#endif
327
328 /*
329 * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
330 * prepend a byte order mark unicode character (BOM, u+FEFF) during
331 * the first use of the iconv converter. The same is the case of
332 * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
333 * However, we use 'UTF-16LE/BE' in both cases, instead so that we
334 * should be safe. But just in case...
335 *
336 * This dummy conversion gets rid of the BOMs and fixes bug 153562.
337 */
338 char dummy_input[1] = { ' ' };
339 char dummy_output[4];
340
341 if (gNativeToUnicode != INVALID_ICONV_T) {
342 const char *input = dummy_input;
343 size_t input_left = sizeof(dummy_input);
344 char *output = dummy_output;
345 size_t output_left = sizeof(dummy_output);
346
347 xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
348 }
349#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
350 if (gUTF8ToUnicode != INVALID_ICONV_T) {
351 const char *input = dummy_input;
352 size_t input_left = sizeof(dummy_input);
353 char *output = dummy_output;
354 size_t output_left = sizeof(dummy_output);
355
356 xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
357 }
358#endif
359
360 gInitialized = PR_TRUE;
361}
362
// Creates the lock guarding the shared converters. The converters
// themselves are not opened here; that happens in LazyInit.
363void
364nsNativeCharsetConverter::GlobalInit()
365{
366 gLock = PR_NewLock();
367 NS_ASSERTION(gLock, "lock creation failed");
368}
369
370void
371nsNativeCharsetConverter::GlobalShutdown()
372{
373 if (gLock) {
374 PR_DestroyLock(gLock);
375 gLock = nsnull;
376 }
377
378 if (gNativeToUnicode != INVALID_ICONV_T) {
379 iconv_close(gNativeToUnicode);
380 gNativeToUnicode = INVALID_ICONV_T;
381 }
382
383 if (gUnicodeToNative != INVALID_ICONV_T) {
384 iconv_close(gUnicodeToNative);
385 gUnicodeToNative = INVALID_ICONV_T;
386 }
387
388#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
389 if (gNativeToUTF8 != INVALID_ICONV_T) {
390 iconv_close(gNativeToUTF8);
391 gNativeToUTF8 = INVALID_ICONV_T;
392 }
393 if (gUTF8ToNative != INVALID_ICONV_T) {
394 iconv_close(gUTF8ToNative);
395 gUTF8ToNative = INVALID_ICONV_T;
396 }
397 if (gUnicodeToUTF8 != INVALID_ICONV_T) {
398 iconv_close(gUnicodeToUTF8);
399 gUnicodeToUTF8 = INVALID_ICONV_T;
400 }
401 if (gUTF8ToUnicode != INVALID_ICONV_T) {
402 iconv_close(gUTF8ToUnicode);
403 gUTF8ToUnicode = INVALID_ICONV_T;
404 }
405#endif
406
407 gInitialized = PR_FALSE;
408}
409
// Takes the global lock (released again by the destructor) and opens the
// shared converters on first use.
410nsNativeCharsetConverter::nsNativeCharsetConverter()
411{
412 Lock();
413 if (!gInitialized)
414 LazyInit();
415}
416
417nsNativeCharsetConverter::~nsNativeCharsetConverter()
418{
419 // reset converters for next time
420 if (gNativeToUnicode != INVALID_ICONV_T)
421 xp_iconv_reset(gNativeToUnicode);
422 if (gUnicodeToNative != INVALID_ICONV_T)
423 xp_iconv_reset(gUnicodeToNative);
424#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
425 if (gNativeToUTF8 != INVALID_ICONV_T)
426 xp_iconv_reset(gNativeToUTF8);
427 if (gUTF8ToNative != INVALID_ICONV_T)
428 xp_iconv_reset(gUTF8ToNative);
429 if (gUnicodeToUTF8 != INVALID_ICONV_T)
430 xp_iconv_reset(gUnicodeToUTF8);
431 if (gUTF8ToUnicode != INVALID_ICONV_T)
432 xp_iconv_reset(gUTF8ToUnicode);
433#endif
434 Unlock();
435}
436
437nsresult
438nsNativeCharsetConverter::NativeToUnicode(const char **input,
439 PRUint32 *inputLeft,
440 PRUnichar **output,
441 PRUint32 *outputLeft)
442{
443 size_t res = 0;
444 size_t inLeft = (size_t) *inputLeft;
445 size_t outLeft = (size_t) *outputLeft * 2;
446
447 if (gNativeToUnicode != INVALID_ICONV_T) {
448
449 res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
450
451 *inputLeft = inLeft;
452 *outputLeft = outLeft / 2;
453 if (res != (size_t) -1)
454 return NS_OK;
455
456 NS_WARNING("conversion from native to utf-16 failed");
457
458 // reset converter
459 xp_iconv_reset(gNativeToUnicode);
460 }
461#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
462 else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
463 (gUTF8ToUnicode != INVALID_ICONV_T)) {
464 // convert first to UTF8, then from UTF8 to UCS2
465 const char *in = *input;
466
467 char ubuf[1024];
468
469 // we assume we're always called with enough space in |output|,
470 // so convert many chars at a time...
471 while (inLeft) {
472 char *p = ubuf;
473 size_t n = sizeof(ubuf);
474 res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
475 if (res == (size_t) -1) {
476 NS_ERROR("conversion from native to utf-8 failed");
477 break;
478 }
479 NS_ASSERTION(outLeft > 0, "bad assumption");
480 p = ubuf;
481 n = sizeof(ubuf) - n;
482 res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
483 if (res == (size_t) -1) {
484 NS_ERROR("conversion from utf-8 to utf-16 failed");
485 break;
486 }
487 }
488
489 (*input) += (*inputLeft - inLeft);
490 *inputLeft = inLeft;
491 *outputLeft = outLeft / 2;
492
493 if (res != (size_t) -1)
494 return NS_OK;
495
496 // reset converters
497 xp_iconv_reset(gNativeToUTF8);
498 xp_iconv_reset(gUTF8ToUnicode);
499 }
500#endif
501
502 // fallback: zero-pad and hope for the best
503 // XXX This is lame and we have to do better.
504 isolatin1_to_utf16(input, inputLeft, output, outputLeft);
505
506 return NS_OK;
507}
508
509nsresult
510nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
511 PRUint32 *inputLeft,
512 char **output,
513 PRUint32 *outputLeft)
514{
515 size_t res = 0;
516 size_t inLeft = (size_t) *inputLeft * 2;
517 size_t outLeft = (size_t) *outputLeft;
518
519 if (gUnicodeToNative != INVALID_ICONV_T) {
520 res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
521
522 if (res != (size_t) -1) {
523 *inputLeft = inLeft / 2;
524 *outputLeft = outLeft;
525 return NS_OK;
526 }
527
528 NS_ERROR("iconv failed");
529
530 // reset converter
531 xp_iconv_reset(gUnicodeToNative);
532 }
533#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
534 else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
535 (gUTF8ToNative != INVALID_ICONV_T)) {
536 const char *in = (const char *) *input;
537
538 char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
539
540 // convert one uchar at a time...
541 while (inLeft && outLeft) {
542 char *p = ubuf;
543 size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
544 res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
545 if (res == (size_t) -1) {
546 NS_ERROR("conversion from utf-16 to utf-8 failed");
547 break;
548 }
549 p = ubuf;
550 n = sizeof(ubuf) - n;
551 res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
552 if (res == (size_t) -1) {
553 if (errno == E2BIG) {
554 // not enough room for last uchar... back up and return.
555 in -= sizeof(PRUnichar);
556 res = 0;
557 }
558 else
559 NS_ERROR("conversion from utf-8 to native failed");
560 break;
561 }
562 inLeft -= sizeof(PRUnichar);
563 }
564
565 if (res != (size_t) -1) {
566 (*input) += (*inputLeft - inLeft/2);
567 *inputLeft = inLeft/2;
568 *outputLeft = outLeft;
569 return NS_OK;
570 }
571
572 // reset converters
573 xp_iconv_reset(gUnicodeToUTF8);
574 xp_iconv_reset(gUTF8ToNative);
575 }
576#endif
577
578 // fallback: truncate and hope for the best
579 utf16_to_isolatin1(input, inputLeft, output, outputLeft);
580
581 return NS_OK;
582}
583
584#endif // USE_ICONV
585
586//-----------------------------------------------------------------------------
587// conversion using mb[r]towc/wc[r]tomb
588//-----------------------------------------------------------------------------
589#if defined(USE_STDCONV)
590#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
591#include <wchar.h> // mbrtowc, wcrtomb
592#endif
593
// Converter between the locale's native charset and UTF-16 built on the C
// library's mb[r]towc / wc[r]tomb. Used when USE_STDCONV is defined.
594class nsNativeCharsetConverter
595{
596public:
597 nsNativeCharsetConverter();
598
// Both conversion methods advance *input / *output and decrement the
// *Left counters (counted in elements of the respective buffer).
599 nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
600 PRUnichar **output, PRUint32 *outputLeft);
601 nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
602 char **output, PRUint32 *outputLeft);
603
604 static void GlobalInit();
605 static void GlobalShutdown() { }
606
607private:
// result of the runtime probe in GlobalInit; when false the conversion
// methods fall back to zero-padding / truncation
608 static PRBool gWCharIsUnicode;
609
610#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
// per-instance conversion state for the restartable functions
611 mbstate_t ps;
612#endif
613};
614
615PRBool nsNativeCharsetConverter::gWCharIsUnicode = PR_FALSE;
616
// Zero-fills the multibyte conversion state (when the restartable
// conversion functions are available).
617nsNativeCharsetConverter::nsNativeCharsetConverter()
618{
619#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
620 memset(&ps, 0, sizeof(ps));
621#endif
622}
623
624void
625nsNativeCharsetConverter::GlobalInit()
626{
627 // verify that wchar_t for the current locale is actually unicode.
628 // if it is not, then we should avoid calling mbtowc/wctomb and
629 // just fallback on zero-pad/truncation conversion.
630 //
631 // this test cannot be done at build time because the encoding of
632 // wchar_t may depend on the runtime locale. sad, but true!!
633 //
634 // so, if wchar_t is unicode then converting an ASCII character
635 // to wchar_t should not change its numeric value. we'll just
636 // check what happens with the ASCII 'a' character.
637 //
638 // this test is not perfect... obviously, it could yield false
639 // positives, but then at least ASCII text would be converted
640 // properly (or maybe just the 'a' character) -- oh well :(
641
642 char a = 'a';
643 unsigned int w = 0;
644
645 int res = mbtowc((wchar_t *) &w, &a, 1);
646
647 gWCharIsUnicode = (res != -1 && w == 'a');
648
649#ifdef DEBUG
650 if (!gWCharIsUnicode)
651 NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
652#endif
653}
654
655nsresult
656nsNativeCharsetConverter::NativeToUnicode(const char **input,
657 PRUint32 *inputLeft,
658 PRUnichar **output,
659 PRUint32 *outputLeft)
660{
661 if (gWCharIsUnicode) {
662 int incr;
663
664 // cannot use wchar_t here since it may have been redefined (e.g.,
665 // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
666 unsigned int tmp = 0;
667 while (*inputLeft && *outputLeft) {
668#ifdef HAVE_MBRTOWC
669 incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
670#else
671 // XXX is this thread-safe?
672 incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
673#endif
674 if (incr < 0) {
675 NS_WARNING("mbtowc failed: possible charset mismatch");
676 // zero-pad and hope for the best
677 tmp = (unsigned char) **input;
678 incr = 1;
679 }
680 **output = (PRUnichar) tmp;
681 (*input) += incr;
682 (*inputLeft) -= incr;
683 (*output)++;
684 (*outputLeft)--;
685 }
686 }
687 else {
688 // wchar_t isn't unicode, so the best we can do is treat the
689 // input as if it is isolatin1 :(
690 isolatin1_to_utf16(input, inputLeft, output, outputLeft);
691 }
692
693 return NS_OK;
694}
695
696nsresult
697nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
698 PRUint32 *inputLeft,
699 char **output,
700 PRUint32 *outputLeft)
701{
702 if (gWCharIsUnicode) {
703 int incr;
704
705 while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
706#ifdef HAVE_WCRTOMB
707 incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
708#else
709 // XXX is this thread-safe?
710 incr = (int) wctomb(*output, (wchar_t) **input);
711#endif
712 if (incr < 0) {
713 NS_WARNING("mbtowc failed: possible charset mismatch");
714 **output = (unsigned char) **input; // truncate
715 incr = 1;
716 }
717 // most likely we're dead anyways if this assertion should fire
718 NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
719 (*output) += incr;
720 (*outputLeft) -= incr;
721 (*input)++;
722 (*inputLeft)--;
723 }
724 }
725 else {
726 // wchar_t isn't unicode, so the best we can do is treat the
727 // input as if it is isolatin1 :(
728 utf16_to_isolatin1(input, inputLeft, output, outputLeft);
729 }
730
731 return NS_OK;
732}
733
734#endif // USE_STDCONV
735
736//-----------------------------------------------------------------------------
737// API implementation
738//-----------------------------------------------------------------------------
739
740NS_COM nsresult
741NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
742{
743 output.Truncate();
744
745 PRUint32 inputLen = input.Length();
746
747 nsACString::const_iterator iter;
748 input.BeginReading(iter);
749
750 //
751 // OPTIMIZATION: preallocate space for largest possible result; convert
752 // directly into the result buffer to avoid intermediate buffer copy.
753 //
754 // this will generally result in a larger allocation, but that seems
755 // better than an extra buffer copy.
756 //
757 output.SetLength(inputLen);
758 nsAString::iterator out_iter;
759 output.BeginWriting(out_iter);
760
761 PRUnichar *result = out_iter.get();
762 PRUint32 resultLeft = inputLen;
763
764 const char *buf = iter.get();
765 PRUint32 bufLeft = inputLen;
766
767 nsNativeCharsetConverter conv;
768 nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
769 if (NS_SUCCEEDED(rv)) {
770 NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
771 output.SetLength(inputLen - resultLeft);
772 }
773 return rv;
774}
775
776NS_COM nsresult
777NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
778{
779 output.Truncate();
780
781 nsAString::const_iterator iter, end;
782 input.BeginReading(iter);
783 input.EndReading(end);
784
785 // cannot easily avoid intermediate buffer copy.
786 char temp[4096];
787
788 nsNativeCharsetConverter conv;
789
790 const PRUnichar *buf = iter.get();
791 PRUint32 bufLeft = Distance(iter, end);
792 while (bufLeft) {
793 char *p = temp;
794 PRUint32 tempLeft = sizeof(temp);
795
796 nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
797 if (NS_FAILED(rv)) return rv;
798
799 if (tempLeft < sizeof(temp))
800 output.Append(temp, sizeof(temp) - tempLeft);
801 }
802 return NS_OK;
803}
804
// Startup hook: selects the LC_CTYPE locale from the environment, then
// runs the converter's global initialization.
805void
806NS_StartupNativeCharsetUtils()
807{
808 //
809 // need to initialize the locale or else charset conversion will fail.
810 // better not delay this in case some other component alters the locale
811 // settings.
812 //
813 // XXX we assume that we are called early enough that we should
814 // always be the first to care about the locale's charset.
815 //
816 setlocale(LC_CTYPE, "");
817
818 nsNativeCharsetConverter::GlobalInit();
819}
820
// Shutdown hook: forwards to the converter's global shutdown.
821void
822NS_ShutdownNativeCharsetUtils()
823{
824 nsNativeCharsetConverter::GlobalShutdown();
825}
826
827//-----------------------------------------------------------------------------
828// XP_BEOS
829//-----------------------------------------------------------------------------
830#elif defined(XP_BEOS)
831
832#include "nsAString.h"
833#include "nsReadableUtils.h"
834#include "nsString.h"
835
// BeOS: delegates to CopyUTF8toUTF16, i.e. the native string is handled as
// UTF-8.  Always returns NS_OK.
836NS_COM nsresult
837NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
838{
839 CopyUTF8toUTF16(input, output);
840 return NS_OK;
841}
842
// BeOS: delegates to CopyUTF16toUTF8, i.e. the native string is produced
// as UTF-8.  Always returns NS_OK.
843NS_COM nsresult
844NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
845{
846 CopyUTF16toUTF8(input, output);
847 return NS_OK;
848}
849
// BeOS: nothing to set up.
850void
851NS_StartupNativeCharsetUtils()
852{
853}
854
// BeOS: nothing to tear down.
855void
856NS_ShutdownNativeCharsetUtils()
857{
858}
859
860//-----------------------------------------------------------------------------
861// XP_WIN
862//-----------------------------------------------------------------------------
863#elif defined(XP_WIN)
864
865#include <windows.h>
866#include "nsAString.h"
867
868NS_COM nsresult
869NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
870{
871 PRUint32 inputLen = input.Length();
872
873 nsACString::const_iterator iter;
874 input.BeginReading(iter);
875
876 const char *buf = iter.get();
877
878 // determine length of result
879 PRUint32 resultLen = 0;
880 int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, NULL, 0);
881 if (n > 0)
882 resultLen += n;
883
884 // allocate sufficient space
885 output.SetLength(resultLen);
886 if (resultLen > 0) {
887 nsAString::iterator out_iter;
888 output.BeginWriting(out_iter);
889
890 PRUnichar *result = out_iter.get();
891
892 ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, result, resultLen);
893 }
894 return NS_OK;
895}
896
897NS_COM nsresult
898NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
899{
900 PRUint32 inputLen = input.Length();
901
902 nsAString::const_iterator iter;
903 input.BeginReading(iter);
904
905 const PRUnichar *buf = iter.get();
906
907 // determine length of result
908 PRUint32 resultLen = 0;
909
910 int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, NULL, 0, NULL, NULL);
911 if (n > 0)
912 resultLen += n;
913
914 // allocate sufficient space
915 output.SetLength(resultLen);
916 if (resultLen > 0) {
917 nsACString::iterator out_iter;
918 output.BeginWriting(out_iter);
919
920 // default "defaultChar" is '?', which is an illegal character on windows
921 // file system. That will cause file uncreatable. Change it to '_'
922 const char defaultChar = '_';
923
924 char *result = out_iter.get();
925
926 ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
927 &defaultChar, NULL);
928 }
929 return NS_OK;
930}
931
// Windows: nothing to set up.
932void
933NS_StartupNativeCharsetUtils()
934{
935}
936
// Windows: nothing to tear down.
937void
938NS_ShutdownNativeCharsetUtils()
939{
940}
941
942//-----------------------------------------------------------------------------
943// XP_OS2
944//-----------------------------------------------------------------------------
945#elif defined(XP_OS2)
946
947#define INCL_DOS
948#include <os2.h>
949#include <uconv.h>
950#include "nsAString.h"
951#include <ulserrno.h>
952#include "nsNativeCharsetUtils.h"
953
954static UconvObject UnicodeConverter = NULL;
955
956NS_COM nsresult
957NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
958{
959 PRUint32 inputLen = input.Length();
960
961 nsACString::const_iterator iter;
962 input.BeginReading(iter);
963 const char *inputStr = iter.get();
964
965 // determine length of result
966 PRUint32 resultLen = inputLen;
967 output.SetLength(resultLen);
968
969 nsAString::iterator out_iter;
970 output.BeginWriting(out_iter);
971 UniChar *result = (UniChar*)out_iter.get();
972
973 size_t cSubs = 0;
974 size_t resultLeft = resultLen;
975
976 if (!UnicodeConverter)
977 NS_StartupNativeCharsetUtils();
978
979 int unirc = ::UniUconvToUcs(UnicodeConverter, (void**)&inputStr, &inputLen,
980 &result, &resultLeft, &cSubs);
981
982 NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
983
984 if (unirc != ULS_SUCCESS) {
985 output.Truncate();
986 return NS_ERROR_FAILURE;
987 }
988
989 // Need to update string length to reflect how many bytes were actually
990 // written.
991 output.Truncate(resultLen - resultLeft);
992 return NS_OK;
993}
994
995NS_COM nsresult
996NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
997{
998 size_t inputLen = input.Length();
999
1000 nsAString::const_iterator iter;
1001 input.BeginReading(iter);
1002 UniChar* inputStr = (UniChar*) NS_CONST_CAST(PRUnichar*, iter.get());
1003
1004 // maximum length of unicode string of length x converted to native
1005 // codepage is x*2
1006 size_t resultLen = inputLen * 2;
1007 output.SetLength(resultLen);
1008
1009 nsACString::iterator out_iter;
1010 output.BeginWriting(out_iter);
1011 char *result = out_iter.get();
1012
1013 size_t cSubs = 0;
1014 size_t resultLeft = resultLen;
1015
1016 if (!UnicodeConverter)
1017 NS_StartupNativeCharsetUtils();
1018
1019 int unirc = ::UniUconvFromUcs(UnicodeConverter, &inputStr, &inputLen,
1020 (void**)&result, &resultLeft, &cSubs);
1021
1022 NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1023
1024 if (unirc != ULS_SUCCESS) {
1025 output.Truncate();
1026 return NS_ERROR_FAILURE;
1027 }
1028
1029 // Need to update string length to reflect how many bytes were actually
1030 // written.
1031 output.Truncate(resultLen - resultLeft);
1032 return NS_OK;
1033}
1034
1035void
1036NS_StartupNativeCharsetUtils()
1037{
1038 ULONG ulLength;
1039 ULONG ulCodePage;
1040 DosQueryCp(sizeof(ULONG), &ulCodePage, &ulLength);
1041
1042 UniChar codepage[20];
1043 int unirc = ::UniMapCpToUcsCp(ulCodePage, codepage, 20);
1044 if (unirc == ULS_SUCCESS) {
1045 unirc = ::UniCreateUconvObject(codepage, &UnicodeConverter);
1046 if (unirc == ULS_SUCCESS) {
1047 uconv_attribute_t attr;
1048 ::UniQueryUconvObject(UnicodeConverter, &attr, sizeof(uconv_attribute_t),
1049 NULL, NULL, NULL);
1050 attr.options = UCONV_OPTION_SUBSTITUTE_BOTH;
1051 attr.subchar_len=1;
1052 attr.subchar[0]='_';
1053 ::UniSetUconvObject(UnicodeConverter, &attr);
1054 }
1055 }
1056}
1057
1058void
1059NS_ShutdownNativeCharsetUtils()
1060{
1061 ::UniFreeUconvObject(UnicodeConverter);
1062}
1063
1064//-----------------------------------------------------------------------------
1065// XP_MAC
1066//-----------------------------------------------------------------------------
1067#elif defined(XP_MAC)
1068
1069#include <UnicodeConverter.h>
1070#include <TextCommon.h>
1071#include <Script.h>
1072#include <MacErrors.h>
1073#include "nsAString.h"
1074
// Classic Mac OS: static helpers converting between UTF-16 strings and the
// encoding used for file-system names, built on the Text Encoding
// Converter objects held in sEncoderInfo / sDecoderInfo.
1075class nsFSStringConversionMac {
1076public:
1077 static nsresult UCSToFS(const nsAString& aIn, nsACString& aOut);
1078 static nsresult FSToUCS(const nsACString& ain, nsAString& aOut);
1079
1080 static void CleanUp();
1081
1082private:
1083 static TextEncoding GetSystemEncoding();
1084 static nsresult PrepareEncoder();
1085 static nsresult PrepareDecoder();
1086
// converter objects; null until prepared (presumably by PrepareEncoder /
// PrepareDecoder; their bodies are not visible here)
1087 static UnicodeToTextInfo sEncoderInfo;
1088 static TextToUnicodeInfo sDecoderInfo;
1089};
1090
1091UnicodeToTextInfo nsFSStringConversionMac::sEncoderInfo = nsnull;
1092TextToUnicodeInfo nsFSStringConversionMac::sDecoderInfo = nsnull;
1093
1094nsresult nsFSStringConversionMac::UCSToFS(const nsAString& aIn, nsACString& aOut)
1095{
1096 nsresult rv = PrepareEncoder();
1097 if (NS_FAILED(rv)) return rv;
1098
1099 OSStatus err = noErr;
1100 char stackBuffer[512];
1101
1102 aOut.Truncate();
1103
1104 // for each chunk of |aIn|...
1105 nsReadingIterator<PRUnichar> iter;
1106 aIn.BeginReading(iter);
1107
1108 PRUint32 fragmentLength = PRUint32(iter.size_forward());
1109 UInt32 bytesLeft = fragmentLength * sizeof(UniChar);
1110
1111 do {
1112 UInt32 bytesRead = 0, bytesWritten = 0;
1113 err = ::ConvertFromUnicodeToText(sEncoderInfo,
1114 bytesLeft,
1115 (const UniChar*)iter.get(),
1116 kUnicodeUseFallbacksMask | kUnicodeLooseMappingsMask,
1117 0, nsnull, nsnull, nsnull,
1118 sizeof(stackBuffer),
1119 &bytesRead,
1120 &bytesWritten,
1121 stackBuffer);
1122 if (err == kTECUsedFallbacksStatus)
1123 err = noErr;
1124 else if (err == kTECOutputBufferFullStatus) {
1125 bytesLeft -= bytesRead;
1126 iter.advance(bytesRead / sizeof(UniChar));
1127 }
1128 aOut.Append(stackBuffer, bytesWritten);
1129 }
1130 while (err == kTECOutputBufferFullStatus);
1131
1132 return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1133}
1134
1135nsresult nsFSStringConversionMac::FSToUCS(const nsACString& aIn, nsAString& aOut)
1136{
1137 nsresult rv = PrepareDecoder();
1138 if (NS_FAILED(rv)) return rv;
1139
1140 OSStatus err = noErr;
1141 UniChar stackBuffer[512];
1142
1143 aOut.Truncate(0);
1144
1145 // for each chunk of |aIn|...
1146 nsReadingIterator<char> iter;
1147 aIn.BeginReading(iter);
1148
1149 PRUint32 fragmentLength = PRUint32(iter.size_forward());
1150 UInt32 bytesLeft = fragmentLength;
1151
1152 do {
1153 UInt32 bytesRead = 0, bytesWritten = 0;
1154 err = ::ConvertFromTextToUnicode(sDecoderInfo,
1155 bytesLeft,
1156 iter.get(),
1157 kUnicodeUseFallbacksMask | kUnicodeLooseMappingsMask,
1158 0, nsnull, nsnull, nsnull,
1159 sizeof(stackBuffer),
1160 &bytesRead,
1161 &bytesWritten,
1162 stackBuffer);
1163 if (err == kTECUsedFallbacksStatus)
1164 err = noErr;
1165 else if (err == kTECOutputBufferFullStatus) {
1166 bytesLeft -= bytesRead;
1167 iter.advance(bytesRead);
1168 }
1169 aOut.Append((PRUnichar *)stackBuffer, bytesWritten / sizeof(PRUnichar));
1170 }
1171 while (err == kTECOutputBufferFullStatus);
1172
1173 return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1174}
1175
1176void nsFSStringConversionMac::CleanUp()
1177{
1178 if (sDecoderInfo) {
1179 ::DisposeTextToUnicodeInfo(&sDecoderInfo);
1180 sDecoderInfo = nsnull;
1181 }
1182 if (sEncoderInfo) {
1183 ::DisposeUnicodeToTextInfo(&sEncoderInfo);
1184 sEncoderInfo = nsnull;
1185 }
1186}
1187
1188TextEncoding nsFSStringConversionMac::GetSystemEncoding()
1189{
1190 OSStatus err;
1191 TextEncoding theEncoding;
1192
1193 err = ::UpgradeScriptInfoToTextEncoding(smSystemScript, kTextLanguageDontCare,
1194 kTextRegionDontCare, NULL, &theEncoding);
1195
1196 if (err != noErr)
1197 theEncoding = kTextEncodingMacRoman;
1198
1199 return theEncoding;
1200}
1201
1202nsresult nsFSStringConversionMac::PrepareEncoder()
1203{
1204 nsresult rv = NS_OK;
1205 if (!sEncoderInfo) {
1206 OSStatus err;
1207 err = ::CreateUnicodeToTextInfoByEncoding(GetSystemEncoding(), &sEncoderInfo);
1208 if (err)
1209 rv = NS_ERROR_FAILURE;
1210 }
1211 return rv;
1212}
1213
1214nsresult nsFSStringConversionMac::PrepareDecoder()
1215{
1216 nsresult rv = NS_OK;
1217 if (!sDecoderInfo) {
1218 OSStatus err;
1219 err = ::CreateTextToUnicodeInfoByEncoding(GetSystemEncoding(), &sDecoderInfo);
1220 if (err)
1221 rv = NS_ERROR_FAILURE;
1222 }
1223 return rv;
1224}
1225
1226NS_COM nsresult
1227NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1228{
1229 return nsFSStringConversionMac::FSToUCS(input, output);
1230}
1231
1232NS_COM nsresult
1233NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1234{
1235 return nsFSStringConversionMac::UCSToFS(input, output);
1236}
1237
/**
 * XP_MAC implementation: no work is done at startup.  The converters are
 * created on demand by nsFSStringConversionMac when a conversion is requested.
 */
void
NS_StartupNativeCharsetUtils()
{
    // Intentionally empty.
}
1242
1243void
1244NS_ShutdownNativeCharsetUtils()
1245{
1246 nsFSStringConversionMac::CleanUp();
1247}
1248
1249//-----------------------------------------------------------------------------
1250// default : truncate/zeropad
1251//-----------------------------------------------------------------------------
1252#else
1253
1254#include "nsReadableUtils.h"
1255
1256NS_COM nsresult
1257NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1258{
1259 CopyASCIItoUCS2(input, output);
1260 return NS_OK;
1261}
1262
1263NS_COM nsresult
1264NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1265{
1266 CopyUCS2toASCII(input, output);
1267 return NS_OK;
1268}
1269
/**
 * Fallback implementation: the copy routines of this section keep no state,
 * so there is nothing to initialise.
 */
void
NS_StartupNativeCharsetUtils()
{
    // Intentionally empty.
}
1274
/**
 * Fallback implementation: the copy routines of this section keep no state,
 * so there is nothing to release.
 */
void
NS_ShutdownNativeCharsetUtils()
{
    // Intentionally empty.
}
1279
1280#endif
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette