nsNativeCharsetUtils.cpp@ 6542

Last change on this file since 6542 was 1, checked in by vboxsync, 55 years ago
import
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 37.5 KB

Line
1	/* *** BEGIN LICENSE BLOCK ***
2	* Version: MPL 1.1/GPL 2.0/LGPL 2.1
3	*
4	* The contents of this file are subject to the Mozilla Public License Version
5	* 1.1 (the "License"); you may not use this file except in compliance with
6	* the License. You may obtain a copy of the License at
7	* http://www.mozilla.org/MPL/
8	*
9	* Software distributed under the License is distributed on an "AS IS" basis,
10	* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11	* for the specific language governing rights and limitations under the
12	* License.
13	*
14	* The Original Code is Mozilla.
15	*
16	* The Initial Developer of the Original Code is
17	* Netscape Communications Corporation.
18	* Portions created by the Initial Developer are Copyright (C) 2002
19	* the Initial Developer. All Rights Reserved.
20	*
21	* Contributor(s):
22	* Darin Fisher <[email protected]>
23	* Brian Stell <[email protected]>
24	* Frank Tang <[email protected]>
25	* Brendan Eich <[email protected]>
26	* Sergei Dolgov <[email protected]>
27	*
28	* Alternatively, the contents of this file may be used under the terms of
29	* either the GNU General Public License Version 2 or later (the "GPL"), or
30	* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
31	* in which case the provisions of the GPL or the LGPL are applicable instead
32	* of those above. If you wish to allow use of your version of this file only
33	* under the terms of either the GPL or the LGPL, and not to allow others to
34	* use your version of this file under the terms of the MPL, indicate your
35	* decision by deleting the provisions above and replace them with the notice
36	* and other provisions required by the GPL or the LGPL. If you do not delete
37	* the provisions above, a recipient may use your version of this file under
38	* the terms of any one of the MPL, the GPL or the LGPL.
39	*
40	* *** END LICENSE BLOCK *** */
41
42	#include "xpcom-private.h"
43
44	//-----------------------------------------------------------------------------
45	// XP_UNIX
46	//-----------------------------------------------------------------------------
47	#if defined(XP_UNIX)
48
49	#include <stdlib.h> // mbtowc, wctomb
50	#include <locale.h> // setlocale
51	#include "nscore.h"
52	#include "prlock.h"
53	#include "nsAString.h"
54	#include "nsReadableUtils.h"
55
56	//
57	// choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
58	// but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
59	// or not (see bug 206811 and
60	// news://news.mozilla.org:119/[email protected]). we now use
// iconv for all platforms where nl_types.h and langinfo.h (nl_langinfo) are present
62	// along with iconv.
63	//
64	#if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
65	#define USE_ICONV 1
66	#else
67	#define USE_STDCONV 1
68	#endif
69
70	static void
71	isolatin1_to_utf16(const char *input, PRUint32 inputLeft, PRUnichar *output, PRUint32 outputLeft)
72	{
73	while (inputLeft && outputLeft) {
74	output = (unsigned char) input;
75	(*input)++;
76	(*inputLeft)--;
77	(*output)++;
78	(*outputLeft)--;
79	}
80	}
81
82	static void
83	utf16_to_isolatin1(const PRUnichar *input, PRUint32 inputLeft, char *output, PRUint32 outputLeft)
84	{
85	while (inputLeft && outputLeft) {
86	output = (unsigned char) input;
87	(*input)++;
88	(*inputLeft)--;
89	(*output)++;
90	(*outputLeft)--;
91	}
92	}
93
94	//-----------------------------------------------------------------------------
95	// conversion using iconv
96	//-----------------------------------------------------------------------------
97	#if defined(USE_ICONV)
98	#include <nl_types.h> // CODESET
99	#include <langinfo.h> // nl_langinfo
100	#include <iconv.h> // iconv_open, iconv, iconv_close
101	#include <errno.h>
102
103	#if defined(HAVE_ICONV_WITH_CONST_INPUT)
104	#define ICONV_INPUT(x) (x)
105	#else
106	#define ICONV_INPUT(x) ((char **)x)
107	#endif
108
109	// solaris definitely needs this, but we'll enable it by default
110	// just in case... but we know for sure that iconv(3) in glibc
111	// doesn't need this.
112	#if !defined(__GLIBC__)
113	#define ENABLE_UTF8_FALLBACK_SUPPORT
114	#endif
115
116	#define INVALID_ICONV_T ((iconv_t) -1)
117
118	static inline size_t
119	xp_iconv(iconv_t converter,
120	const char **input,
121	size_t *inputLeft,
122	char **output,
123	size_t *outputLeft)
124	{
125	size_t res, outputAvail = outputLeft ? *outputLeft : 0;
126	res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
127	if (res == (size_t) -1) {
128	// on some platforms (e.g., linux) iconv will fail with
129	// E2BIG if it cannot convert _all_ of its input. it'll
130	// still adjust all of the in/out params correctly, so we
131	// can ignore this error. the assumption is that we will
132	// be called again to complete the conversion.
133	if ((errno == E2BIG) && (*outputLeft < outputAvail))
134	res = 0;
135	}
136	return res;
137	}
138
139	static inline void
140	xp_iconv_reset(iconv_t converter)
141	{
142	// NOTE: the man pages on Solaris claim that you can pass NULL
143	// for all parameter to reset the converter, but beware the
144	// evil Solaris crash if you go down this route >:-)
145
146	const char *zero_char_in_ptr = NULL;
147	char *zero_char_out_ptr = NULL;
148	size_t zero_size_in = 0,
149	zero_size_out = 0;
150
151	xp_iconv(converter, &zero_char_in_ptr,
152	&zero_size_in,
153	&zero_char_out_ptr,
154	&zero_size_out);
155	}
156
157	static inline iconv_t
158	xp_iconv_open(const char to_list, const char from_list)
159	{
160	iconv_t res;
161	const char **from_name;
162	const char **to_name;
163
164	// try all possible combinations to locate a converter.
165	to_name = to_list;
166	while (*to_name) {
167	if (**to_name) {
168	from_name = from_list;
169	while (*from_name) {
170	if (**from_name) {
171	res = iconv_open(to_name, from_name);
172	if (res != INVALID_ICONV_T)
173	return res;
174	}
175	from_name++;
176	}
177	}
178	to_name++;
179	}
180
181	return INVALID_ICONV_T;
182	}
183
184	/*
185	* PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
186	* have to use UTF-16 with iconv(3) on platforms where it's supported.
187	* However, the way UTF-16 and UCS-2 are interpreted varies across platforms
188	* and implementations of iconv(3). On Tru64, it also depends on the environment
189	* variable. To avoid the trouble arising from byte-swapping
190	* (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
191	* back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
192	* on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
193	* which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
194	* and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
195	* variable ICONV_BYTEORDER is set to 'big-endian', about which not much
196	* can be done other than adding a note in the release notes. (bug 206811)
197	*/
// NULL-terminated list of the names under which an iconv implementation may
// know UTF-16 / UCS-2, in the order they are tried. The explicit-endianness
// names matching the build's byte order come first; the generic names are
// only a fallback.
static const char *UTF_16_NAMES[] = {
#if defined(IS_LITTLE_ENDIAN)
    "UTF-16LE",
#if defined(__GLIBC__)
    "UNICODELITTLE",
#endif
    "UCS-2LE",
#else
    "UTF-16BE",
#if defined(__GLIBC__)
    "UNICODEBIG",
#endif
    "UCS-2BE",
#endif
    "UTF-16",
    "UCS-2",
    "UCS2",
    "UCS_2",
    "ucs-2",
    "ucs2",
    "ucs_2",
    NULL
};
221
222	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// NULL-terminated list of the spellings tried for the UTF-8 charset when
// opening the UTF-8 fallback converters.
static const char *UTF_8_NAMES[] = {
    "UTF-8",
    "UTF8",
    "UTF_8",
    "utf-8",
    "utf8",
    "utf_8",
    NULL
};
232	#endif
233
// NULL-terminated list of the spellings tried for ISO-8859-1, which is used
// as the native charset when nl_langinfo(CODESET) yields nothing. Under
// glibc only the canonical name is listed.
static const char *ISO_8859_1_NAMES[] = {
    "ISO-8859-1",
#if !defined(__GLIBC__)
    "ISO8859-1",
    "ISO88591",
    "ISO_8859_1",
    "ISO8859_1",
    "iso-8859-1",
    "iso8859-1",
    "iso88591",
    "iso_8859_1",
    "iso8859_1",
#endif
    NULL
};
249
250	class nsNativeCharsetConverter
251	{
252	public:
253	nsNativeCharsetConverter();
254	~nsNativeCharsetConverter();
255
256	nsresult NativeToUnicode(const char *input , PRUint32 inputLeft,
257	PRUnichar *output, PRUint32 outputLeft);
258	nsresult UnicodeToNative(const PRUnichar *input , PRUint32 inputLeft,
259	char *output, PRUint32 outputLeft);
260
261	static void GlobalInit();
262	static void GlobalShutdown();
263
264	private:
265	static iconv_t gNativeToUnicode;
266	static iconv_t gUnicodeToNative;
267	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
268	static iconv_t gNativeToUTF8;
269	static iconv_t gUTF8ToNative;
270	static iconv_t gUnicodeToUTF8;
271	static iconv_t gUTF8ToUnicode;
272	#endif
273	static PRLock *gLock;
274	static PRBool gInitialized;
275
276	static void LazyInit();
277
278	static void Lock() { if (gLock) PR_Lock(gLock); }
279	static void Unlock() { if (gLock) PR_Unlock(gLock); }
280	};
281
282	iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
283	iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
284	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
285	iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
286	iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
287	iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
288	iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
289	#endif
290	PRLock *nsNativeCharsetConverter::gLock = nsnull;
291	PRBool nsNativeCharsetConverter::gInitialized = PR_FALSE;
292
293	void
294	nsNativeCharsetConverter::LazyInit()
295	{
296	const char *blank_list[] = { "", NULL };
297	const char **native_charset_list = blank_list;
298	const char *native_charset = nl_langinfo(CODESET);
299	if (native_charset == nsnull) {
300	NS_ERROR("native charset is unknown");
301	// fallback to ISO-8859-1
302	native_charset_list = ISO_8859_1_NAMES;
303	}
304	else
305	native_charset_list[0] = native_charset;
306
307	gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
308	gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
309
310	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
311	if (gNativeToUnicode == INVALID_ICONV_T) {
312	gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
313	gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
314	NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
315	NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
316	}
317	if (gUnicodeToNative == INVALID_ICONV_T) {
318	gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
319	gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
320	NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
321	NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
322	}
323	#else
324	NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
325	NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
326	#endif
327
328	/*
329	* On Solaris 8 (and newer?), the iconv modules converting to UCS-2
330	* prepend a byte order mark unicode character (BOM, u+FEFF) during
331	* the first use of the iconv converter. The same is the case of
332	* glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
333	* However, we use 'UTF-16LE/BE' in both cases, instead so that we
334	* should be safe. But just in case...
335	*
336	* This dummy conversion gets rid of the BOMs and fixes bug 153562.
337	*/
338	char dummy_input[1] = { ' ' };
339	char dummy_output[4];
340
341	if (gNativeToUnicode != INVALID_ICONV_T) {
342	const char *input = dummy_input;
343	size_t input_left = sizeof(dummy_input);
344	char *output = dummy_output;
345	size_t output_left = sizeof(dummy_output);
346
347	xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
348	}
349	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
350	if (gUTF8ToUnicode != INVALID_ICONV_T) {
351	const char *input = dummy_input;
352	size_t input_left = sizeof(dummy_input);
353	char *output = dummy_output;
354	size_t output_left = sizeof(dummy_output);
355
356	xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
357	}
358	#endif
359
360	gInitialized = PR_TRUE;
361	}
362
363	void
364	nsNativeCharsetConverter::GlobalInit()
365	{
366	gLock = PR_NewLock();
367	NS_ASSERTION(gLock, "lock creation failed");
368	}
369
370	void
371	nsNativeCharsetConverter::GlobalShutdown()
372	{
373	if (gLock) {
374	PR_DestroyLock(gLock);
375	gLock = nsnull;
376	}
377
378	if (gNativeToUnicode != INVALID_ICONV_T) {
379	iconv_close(gNativeToUnicode);
380	gNativeToUnicode = INVALID_ICONV_T;
381	}
382
383	if (gUnicodeToNative != INVALID_ICONV_T) {
384	iconv_close(gUnicodeToNative);
385	gUnicodeToNative = INVALID_ICONV_T;
386	}
387
388	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
389	if (gNativeToUTF8 != INVALID_ICONV_T) {
390	iconv_close(gNativeToUTF8);
391	gNativeToUTF8 = INVALID_ICONV_T;
392	}
393	if (gUTF8ToNative != INVALID_ICONV_T) {
394	iconv_close(gUTF8ToNative);
395	gUTF8ToNative = INVALID_ICONV_T;
396	}
397	if (gUnicodeToUTF8 != INVALID_ICONV_T) {
398	iconv_close(gUnicodeToUTF8);
399	gUnicodeToUTF8 = INVALID_ICONV_T;
400	}
401	if (gUTF8ToUnicode != INVALID_ICONV_T) {
402	iconv_close(gUTF8ToUnicode);
403	gUTF8ToUnicode = INVALID_ICONV_T;
404	}
405	#endif
406
407	gInitialized = PR_FALSE;
408	}
409
410	nsNativeCharsetConverter::nsNativeCharsetConverter()
411	{
412	Lock();
413	if (!gInitialized)
414	LazyInit();
415	}
416
417	nsNativeCharsetConverter::~nsNativeCharsetConverter()
418	{
419	// reset converters for next time
420	if (gNativeToUnicode != INVALID_ICONV_T)
421	xp_iconv_reset(gNativeToUnicode);
422	if (gUnicodeToNative != INVALID_ICONV_T)
423	xp_iconv_reset(gUnicodeToNative);
424	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
425	if (gNativeToUTF8 != INVALID_ICONV_T)
426	xp_iconv_reset(gNativeToUTF8);
427	if (gUTF8ToNative != INVALID_ICONV_T)
428	xp_iconv_reset(gUTF8ToNative);
429	if (gUnicodeToUTF8 != INVALID_ICONV_T)
430	xp_iconv_reset(gUnicodeToUTF8);
431	if (gUTF8ToUnicode != INVALID_ICONV_T)
432	xp_iconv_reset(gUTF8ToUnicode);
433	#endif
434	Unlock();
435	}
436
437	nsresult
438	nsNativeCharsetConverter::NativeToUnicode(const char **input,
439	PRUint32 *inputLeft,
440	PRUnichar **output,
441	PRUint32 *outputLeft)
442	{
443	size_t res = 0;
444	size_t inLeft = (size_t) *inputLeft;
445	size_t outLeft = (size_t) outputLeft 2;
446
447	if (gNativeToUnicode != INVALID_ICONV_T) {
448
449	res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
450
451	*inputLeft = inLeft;
452	*outputLeft = outLeft / 2;
453	if (res != (size_t) -1)
454	return NS_OK;
455
456	NS_WARNING("conversion from native to utf-16 failed");
457
458	// reset converter
459	xp_iconv_reset(gNativeToUnicode);
460	}
461	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
462	else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
463	(gUTF8ToUnicode != INVALID_ICONV_T)) {
464	// convert first to UTF8, then from UTF8 to UCS2
465	const char in = input;
466
467	char ubuf[1024];
468
469	// we assume we're always called with enough space in \|output\|,
470	// so convert many chars at a time...
471	while (inLeft) {
472	char *p = ubuf;
473	size_t n = sizeof(ubuf);
474	res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
475	if (res == (size_t) -1) {
476	NS_ERROR("conversion from native to utf-8 failed");
477	break;
478	}
479	NS_ASSERTION(outLeft > 0, "bad assumption");
480	p = ubuf;
481	n = sizeof(ubuf) - n;
482	res = xp_iconv(gUTF8ToUnicode, (const char ) &p, &n, (char ) output, &outLeft);
483	if (res == (size_t) -1) {
484	NS_ERROR("conversion from utf-8 to utf-16 failed");
485	break;
486	}
487	}
488
489	(input) += (inputLeft - inLeft);
490	*inputLeft = inLeft;
491	*outputLeft = outLeft / 2;
492
493	if (res != (size_t) -1)
494	return NS_OK;
495
496	// reset converters
497	xp_iconv_reset(gNativeToUTF8);
498	xp_iconv_reset(gUTF8ToUnicode);
499	}
500	#endif
501
502	// fallback: zero-pad and hope for the best
503	// XXX This is lame and we have to do better.
504	isolatin1_to_utf16(input, inputLeft, output, outputLeft);
505
506	return NS_OK;
507	}
508
509	nsresult
510	nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
511	PRUint32 *inputLeft,
512	char **output,
513	PRUint32 *outputLeft)
514	{
515	size_t res = 0;
516	size_t inLeft = (size_t) inputLeft 2;
517	size_t outLeft = (size_t) *outputLeft;
518
519	if (gUnicodeToNative != INVALID_ICONV_T) {
520	res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
521
522	if (res != (size_t) -1) {
523	*inputLeft = inLeft / 2;
524	*outputLeft = outLeft;
525	return NS_OK;
526	}
527
528	NS_ERROR("iconv failed");
529
530	// reset converter
531	xp_iconv_reset(gUnicodeToNative);
532	}
533	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
534	else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
535	(gUTF8ToNative != INVALID_ICONV_T)) {
536	const char in = (const char ) *input;
537
538	char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
539
540	// convert one uchar at a time...
541	while (inLeft && outLeft) {
542	char *p = ubuf;
543	size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
544	res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
545	if (res == (size_t) -1) {
546	NS_ERROR("conversion from utf-16 to utf-8 failed");
547	break;
548	}
549	p = ubuf;
550	n = sizeof(ubuf) - n;
551	res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
552	if (res == (size_t) -1) {
553	if (errno == E2BIG) {
554	// not enough room for last uchar... back up and return.
555	in -= sizeof(PRUnichar);
556	res = 0;
557	}
558	else
559	NS_ERROR("conversion from utf-8 to native failed");
560	break;
561	}
562	inLeft -= sizeof(PRUnichar);
563	}
564
565	if (res != (size_t) -1) {
566	(input) += (inputLeft - inLeft/2);
567	*inputLeft = inLeft/2;
568	*outputLeft = outLeft;
569	return NS_OK;
570	}
571
572	// reset converters
573	xp_iconv_reset(gUnicodeToUTF8);
574	xp_iconv_reset(gUTF8ToNative);
575	}
576	#endif
577
578	// fallback: truncate and hope for the best
579	utf16_to_isolatin1(input, inputLeft, output, outputLeft);
580
581	return NS_OK;
582	}
583
584	#endif // USE_ICONV
585
586	//-----------------------------------------------------------------------------
587	// conversion using mb[r]towc/wc[r]tomb
588	//-----------------------------------------------------------------------------
589	#if defined(USE_STDCONV)
// <wchar.h> is only needed when at least one restartable conversion
// function is available.
#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
#include <wchar.h> // mbrtowc, wcrtomb
#endif
593
594	class nsNativeCharsetConverter
595	{
596	public:
597	nsNativeCharsetConverter();
598
599	nsresult NativeToUnicode(const char *input , PRUint32 inputLeft,
600	PRUnichar *output, PRUint32 outputLeft);
601	nsresult UnicodeToNative(const PRUnichar *input , PRUint32 inputLeft,
602	char *output, PRUint32 outputLeft);
603
604	static void GlobalInit();
605	static void GlobalShutdown() { }
606
607	private:
608	static PRBool gWCharIsUnicode;
609
610	#if defined(HAVE_WCRTOMB) \|\| defined(HAVE_MBRTOWC)
611	mbstate_t ps;
612	#endif
613	};
614
615	PRBool nsNativeCharsetConverter::gWCharIsUnicode = PR_FALSE;
616
617	nsNativeCharsetConverter::nsNativeCharsetConverter()
618	{
619	#if defined(HAVE_WCRTOMB) \|\| defined(HAVE_MBRTOWC)
620	memset(&ps, 0, sizeof(ps));
621	#endif
622	}
623
624	void
625	nsNativeCharsetConverter::GlobalInit()
626	{
627	// verify that wchar_t for the current locale is actually unicode.
628	// if it is not, then we should avoid calling mbtowc/wctomb and
629	// just fallback on zero-pad/truncation conversion.
630	//
631	// this test cannot be done at build time because the encoding of
632	// wchar_t may depend on the runtime locale. sad, but true!!
633	//
634	// so, if wchar_t is unicode then converting an ASCII character
635	// to wchar_t should not change its numeric value. we'll just
636	// check what happens with the ASCII 'a' character.
637	//
638	// this test is not perfect... obviously, it could yield false
639	// positives, but then at least ASCII text would be converted
640	// properly (or maybe just the 'a' character) -- oh well :(
641
642	char a = 'a';
643	unsigned int w = 0;
644
645	#ifndef L4ENV
646	int res = mbtowc((wchar_t *) &w, &a, 1);
647
648	gWCharIsUnicode = (res != -1 && w == 'a');
649	#else
650	gWCharIsUnicode = 0;
651	#endif
652
653	#ifdef DEBUG
654	if (!gWCharIsUnicode)
655	NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
656	#endif
657	}
658
659	nsresult
660	nsNativeCharsetConverter::NativeToUnicode(const char **input,
661	PRUint32 *inputLeft,
662	PRUnichar **output,
663	PRUint32 *outputLeft)
664	{
665	if (gWCharIsUnicode) {
666	#ifndef L4ENV
667	/* We don't have any wchar support built into uclibc just now */
668	int incr;
669
670	// cannot use wchar_t here since it may have been redefined (e.g.,
671	// via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
672	unsigned int tmp = 0;
673	while (inputLeft && outputLeft) {
674	#ifdef HAVE_MBRTOWC
675	incr = (int) mbrtowc((wchar_t ) &tmp, input, *inputLeft, &ps);
676	#else
677	// XXX is this thread-safe?
678	incr = (int) mbtowc((wchar_t ) &tmp, input, *inputLeft);
679	#endif
680	if (incr < 0) {
681	NS_WARNING("mbtowc failed: possible charset mismatch");
682	// zero-pad and hope for the best
683	tmp = (unsigned char) **input;
684	incr = 1;
685	}
686	**output = (PRUnichar) tmp;
687	(*input) += incr;
688	(*inputLeft) -= incr;
689	(*output)++;
690	(*outputLeft)--;
691	}
692	#endif /* not defined L4ENV */
693	}
694	else {
695	// wchar_t isn't unicode, so the best we can do is treat the
696	// input as if it is isolatin1 :(
697	isolatin1_to_utf16(input, inputLeft, output, outputLeft);
698	}
699
700	return NS_OK;
701	}
702
703	nsresult
704	nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
705	PRUint32 *inputLeft,
706	char **output,
707	PRUint32 *outputLeft)
708	{
709	if (gWCharIsUnicode) {
710	#ifndef L4ENV
711	/* We don't have any wchar support built into uclibc just now */
712	int incr;
713
714	while (inputLeft && outputLeft >= MB_CUR_MAX) {
715	#ifdef HAVE_WCRTOMB
716	incr = (int) wcrtomb(output, (wchar_t) *input, &ps);
717	#else
718	// XXX is this thread-safe?
719	incr = (int) wctomb(output, (wchar_t) *input);
720	#endif
721	if (incr < 0) {
722	NS_WARNING("mbtowc failed: possible charset mismatch");
723	output = (unsigned char) input; // truncate
724	incr = 1;
725	}
726	// most likely we're dead anyways if this assertion should fire
727	NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
728	(*output) += incr;
729	(*outputLeft) -= incr;
730	(*input)++;
731	(*inputLeft)--;
732	}
733	#endif /* not defined L4ENV */
734	}
735	else {
736	// wchar_t isn't unicode, so the best we can do is treat the
737	// input as if it is isolatin1 :(
738	utf16_to_isolatin1(input, inputLeft, output, outputLeft);
739	}
740
741	return NS_OK;
742	}
743
744	#endif // USE_STDCONV
745
746	//-----------------------------------------------------------------------------
747	// API implementation
748	//-----------------------------------------------------------------------------
749
750	NS_COM nsresult
751	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
752	{
753	output.Truncate();
754
755	PRUint32 inputLen = input.Length();
756
757	nsACString::const_iterator iter;
758	input.BeginReading(iter);
759
760	//
761	// OPTIMIZATION: preallocate space for largest possible result; convert
762	// directly into the result buffer to avoid intermediate buffer copy.
763	//
764	// this will generally result in a larger allocation, but that seems
765	// better than an extra buffer copy.
766	//
767	output.SetLength(inputLen);
768	nsAString::iterator out_iter;
769	output.BeginWriting(out_iter);
770
771	PRUnichar *result = out_iter.get();
772	PRUint32 resultLeft = inputLen;
773
774	const char *buf = iter.get();
775	PRUint32 bufLeft = inputLen;
776
777	nsNativeCharsetConverter conv;
778	nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
779	if (NS_SUCCEEDED(rv)) {
780	NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
781	output.SetLength(inputLen - resultLeft);
782	}
783	return rv;
784	}
785
786	NS_COM nsresult
787	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
788	{
789	output.Truncate();
790
791	nsAString::const_iterator iter, end;
792	input.BeginReading(iter);
793	input.EndReading(end);
794
795	// cannot easily avoid intermediate buffer copy.
796	char temp[4096];
797
798	nsNativeCharsetConverter conv;
799
800	const PRUnichar *buf = iter.get();
801	PRUint32 bufLeft = Distance(iter, end);
802	while (bufLeft) {
803	char *p = temp;
804	PRUint32 tempLeft = sizeof(temp);
805
806	nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
807	if (NS_FAILED(rv)) return rv;
808
809	if (tempLeft < sizeof(temp))
810	output.Append(temp, sizeof(temp) - tempLeft);
811	}
812	return NS_OK;
813	}
814
815	void
816	NS_StartupNativeCharsetUtils()
817	{
818	//
819	// need to initialize the locale or else charset conversion will fail.
820	// better not delay this in case some other component alters the locale
821	// settings.
822	//
823	// XXX we assume that we are called early enough that we should
824	// always be the first to care about the locale's charset.
825	//
826	setlocale(LC_CTYPE, "");
827
828	nsNativeCharsetConverter::GlobalInit();
829	}
830
831	void
832	NS_ShutdownNativeCharsetUtils()
833	{
834	nsNativeCharsetConverter::GlobalShutdown();
835	}
836
837	//-----------------------------------------------------------------------------
838	// XP_BEOS
839	//-----------------------------------------------------------------------------
840	#elif defined(XP_BEOS)
841
842	#include "nsAString.h"
843	#include "nsReadableUtils.h"
844	#include "nsString.h"
845
846	NS_COM nsresult
847	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
848	{
849	CopyUTF8toUTF16(input, output);
850	return NS_OK;
851	}
852
853	NS_COM nsresult
854	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
855	{
856	CopyUTF16toUTF8(input, output);
857	return NS_OK;
858	}
859
// BeOS: the conversion functions keep no global state, so there is nothing
// to set up...
void
NS_StartupNativeCharsetUtils()
{
}

// ...and nothing to tear down.
void
NS_ShutdownNativeCharsetUtils()
{
}
869
870	//-----------------------------------------------------------------------------
871	// XP_WIN
872	//-----------------------------------------------------------------------------
873	#elif defined(XP_WIN)
874
875	#include <windows.h>
876	#include "nsAString.h"
877
878	NS_COM nsresult
879	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
880	{
881	PRUint32 inputLen = input.Length();
882
883	nsACString::const_iterator iter;
884	input.BeginReading(iter);
885
886	const char *buf = iter.get();
887
888	// determine length of result
889	PRUint32 resultLen = 0;
890	int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, NULL, 0);
891	if (n > 0)
892	resultLen += n;
893
894	// allocate sufficient space
895	output.SetLength(resultLen);
896	if (resultLen > 0) {
897	nsAString::iterator out_iter;
898	output.BeginWriting(out_iter);
899
900	PRUnichar *result = out_iter.get();
901
902	::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, result, resultLen);
903	}
904	return NS_OK;
905	}
906
907	NS_COM nsresult
908	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
909	{
910	PRUint32 inputLen = input.Length();
911
912	nsAString::const_iterator iter;
913	input.BeginReading(iter);
914
915	const PRUnichar *buf = iter.get();
916
917	// determine length of result
918	PRUint32 resultLen = 0;
919
920	int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, NULL, 0, NULL, NULL);
921	if (n > 0)
922	resultLen += n;
923
924	// allocate sufficient space
925	output.SetLength(resultLen);
926	if (resultLen > 0) {
927	nsACString::iterator out_iter;
928	output.BeginWriting(out_iter);
929
930	// default "defaultChar" is '?', which is an illegal character on windows
931	// file system. That will cause file uncreatable. Change it to '_'
932	const char defaultChar = '_';
933
934	char *result = out_iter.get();
935
936	::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
937	&defaultChar, NULL);
938	}
939	return NS_OK;
940	}
941
// Windows: the conversion functions keep no global state, so there is
// nothing to set up...
void
NS_StartupNativeCharsetUtils()
{
}

// ...and nothing to tear down.
void
NS_ShutdownNativeCharsetUtils()
{
}
951
952	//-----------------------------------------------------------------------------
953	// XP_OS2
954	//-----------------------------------------------------------------------------
955	#elif defined(XP_OS2)
956
957	#define INCL_DOS
958	#include <os2.h>
959	#include <uconv.h>
960	#include "nsAString.h"
961	#include <ulserrno.h>
962	#include "nsNativeCharsetUtils.h"
963
964	static UconvObject UnicodeConverter = NULL;
965
966	NS_COM nsresult
967	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
968	{
969	PRUint32 inputLen = input.Length();
970
971	nsACString::const_iterator iter;
972	input.BeginReading(iter);
973	const char *inputStr = iter.get();
974
975	// determine length of result
976	PRUint32 resultLen = inputLen;
977	output.SetLength(resultLen);
978
979	nsAString::iterator out_iter;
980	output.BeginWriting(out_iter);
981	UniChar result = (UniChar)out_iter.get();
982
983	size_t cSubs = 0;
984	size_t resultLeft = resultLen;
985
986	if (!UnicodeConverter)
987	NS_StartupNativeCharsetUtils();
988
989	int unirc = ::UniUconvToUcs(UnicodeConverter, (void**)&inputStr, &inputLen,
990	&result, &resultLeft, &cSubs);
991
992	NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
993
994	if (unirc != ULS_SUCCESS) {
995	output.Truncate();
996	return NS_ERROR_FAILURE;
997	}
998
999	// Need to update string length to reflect how many bytes were actually
1000	// written.
1001	output.Truncate(resultLen - resultLeft);
1002	return NS_OK;
1003	}
1004
1005	NS_COM nsresult
1006	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1007	{
1008	size_t inputLen = input.Length();
1009
1010	nsAString::const_iterator iter;
1011	input.BeginReading(iter);
1012	UniChar* inputStr = (UniChar) NS_CONST_CAST(PRUnichar, iter.get());
1013
1014	// maximum length of unicode string of length x converted to native
1015	// codepage is x*2
1016	size_t resultLen = inputLen * 2;
1017	output.SetLength(resultLen);
1018
1019	nsACString::iterator out_iter;
1020	output.BeginWriting(out_iter);
1021	char *result = out_iter.get();
1022
1023	size_t cSubs = 0;
1024	size_t resultLeft = resultLen;
1025
1026	if (!UnicodeConverter)
1027	NS_StartupNativeCharsetUtils();
1028
1029	int unirc = ::UniUconvFromUcs(UnicodeConverter, &inputStr, &inputLen,
1030	(void**)&result, &resultLeft, &cSubs);
1031
1032	NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1033
1034	if (unirc != ULS_SUCCESS) {
1035	output.Truncate();
1036	return NS_ERROR_FAILURE;
1037	}
1038
1039	// Need to update string length to reflect how many bytes were actually
1040	// written.
1041	output.Truncate(resultLen - resultLeft);
1042	return NS_OK;
1043	}
1044
1045	void
1046	NS_StartupNativeCharsetUtils()
1047	{
1048	ULONG ulLength;
1049	ULONG ulCodePage;
1050	DosQueryCp(sizeof(ULONG), &ulCodePage, &ulLength);
1051
1052	UniChar codepage[20];
1053	int unirc = ::UniMapCpToUcsCp(ulCodePage, codepage, 20);
1054	if (unirc == ULS_SUCCESS) {
1055	unirc = ::UniCreateUconvObject(codepage, &UnicodeConverter);
1056	if (unirc == ULS_SUCCESS) {
1057	uconv_attribute_t attr;
1058	::UniQueryUconvObject(UnicodeConverter, &attr, sizeof(uconv_attribute_t),
1059	NULL, NULL, NULL);
1060	attr.options = UCONV_OPTION_SUBSTITUTE_BOTH;
1061	attr.subchar_len=1;
1062	attr.subchar[0]='_';
1063	::UniSetUconvObject(UnicodeConverter, &attr);
1064	}
1065	}
1066	}
1067
1068	void
1069	NS_ShutdownNativeCharsetUtils()
1070	{
1071	::UniFreeUconvObject(UnicodeConverter);
1072	}
1073
1074	//-----------------------------------------------------------------------------
1075	// XP_MAC
1076	//-----------------------------------------------------------------------------
1077	#elif defined(XP_MAC)
1078
1079	#include <UnicodeConverter.h>
1080	#include <TextCommon.h>
1081	#include <Script.h>
1082	#include <MacErrors.h>
1083	#include "nsAString.h"
1084
1085	class nsFSStringConversionMac {
1086	public:
1087	static nsresult UCSToFS(const nsAString& aIn, nsACString& aOut);
1088	static nsresult FSToUCS(const nsACString& ain, nsAString& aOut);
1089
1090	static void CleanUp();
1091
1092	private:
1093	static TextEncoding GetSystemEncoding();
1094	static nsresult PrepareEncoder();
1095	static nsresult PrepareDecoder();
1096
1097	static UnicodeToTextInfo sEncoderInfo;
1098	static TextToUnicodeInfo sDecoderInfo;
1099	};
1100
1101	UnicodeToTextInfo nsFSStringConversionMac::sEncoderInfo = nsnull;
1102	TextToUnicodeInfo nsFSStringConversionMac::sDecoderInfo = nsnull;
1103
1104	nsresult nsFSStringConversionMac::UCSToFS(const nsAString& aIn, nsACString& aOut)
1105	{
1106	nsresult rv = PrepareEncoder();
1107	if (NS_FAILED(rv)) return rv;
1108
1109	OSStatus err = noErr;
1110	char stackBuffer[512];
1111
1112	aOut.Truncate();
1113
1114	// for each chunk of \|aIn\|...
1115	nsReadingIterator<PRUnichar> iter;
1116	aIn.BeginReading(iter);
1117
1118	PRUint32 fragmentLength = PRUint32(iter.size_forward());
1119	UInt32 bytesLeft = fragmentLength * sizeof(UniChar);
1120
1121	do {
1122	UInt32 bytesRead = 0, bytesWritten = 0;
1123	err = ::ConvertFromUnicodeToText(sEncoderInfo,
1124	bytesLeft,
1125	(const UniChar*)iter.get(),
1126	kUnicodeUseFallbacksMask \| kUnicodeLooseMappingsMask,
1127	0, nsnull, nsnull, nsnull,
1128	sizeof(stackBuffer),
1129	&bytesRead,
1130	&bytesWritten,
1131	stackBuffer);
1132	if (err == kTECUsedFallbacksStatus)
1133	err = noErr;
1134	else if (err == kTECOutputBufferFullStatus) {
1135	bytesLeft -= bytesRead;
1136	iter.advance(bytesRead / sizeof(UniChar));
1137	}
1138	aOut.Append(stackBuffer, bytesWritten);
1139	}
1140	while (err == kTECOutputBufferFullStatus);
1141
1142	return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1143	}
1144
1145	nsresult nsFSStringConversionMac::FSToUCS(const nsACString& aIn, nsAString& aOut)
1146	{
1147	nsresult rv = PrepareDecoder();
1148	if (NS_FAILED(rv)) return rv;
1149
1150	OSStatus err = noErr;
1151	UniChar stackBuffer[512];
1152
1153	aOut.Truncate(0);
1154
1155	// for each chunk of \|aIn\|...
1156	nsReadingIterator<char> iter;
1157	aIn.BeginReading(iter);
1158
1159	PRUint32 fragmentLength = PRUint32(iter.size_forward());
1160	UInt32 bytesLeft = fragmentLength;
1161
1162	do {
1163	UInt32 bytesRead = 0, bytesWritten = 0;
1164	err = ::ConvertFromTextToUnicode(sDecoderInfo,
1165	bytesLeft,
1166	iter.get(),
1167	kUnicodeUseFallbacksMask \| kUnicodeLooseMappingsMask,
1168	0, nsnull, nsnull, nsnull,
1169	sizeof(stackBuffer),
1170	&bytesRead,
1171	&bytesWritten,
1172	stackBuffer);
1173	if (err == kTECUsedFallbacksStatus)
1174	err = noErr;
1175	else if (err == kTECOutputBufferFullStatus) {
1176	bytesLeft -= bytesRead;
1177	iter.advance(bytesRead);
1178	}
1179	aOut.Append((PRUnichar *)stackBuffer, bytesWritten / sizeof(PRUnichar));
1180	}
1181	while (err == kTECOutputBufferFullStatus);
1182
1183	return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1184	}
1185
1186	void nsFSStringConversionMac::CleanUp()
1187	{
1188	if (sDecoderInfo) {
1189	::DisposeTextToUnicodeInfo(&sDecoderInfo);
1190	sDecoderInfo = nsnull;
1191	}
1192	if (sEncoderInfo) {
1193	::DisposeUnicodeToTextInfo(&sEncoderInfo);
1194	sEncoderInfo = nsnull;
1195	}
1196	}
1197
1198	TextEncoding nsFSStringConversionMac::GetSystemEncoding()
1199	{
1200	OSStatus err;
1201	TextEncoding theEncoding;
1202
1203	err = ::UpgradeScriptInfoToTextEncoding(smSystemScript, kTextLanguageDontCare,
1204	kTextRegionDontCare, NULL, &theEncoding);
1205
1206	if (err != noErr)
1207	theEncoding = kTextEncodingMacRoman;
1208
1209	return theEncoding;
1210	}
1211
1212	nsresult nsFSStringConversionMac::PrepareEncoder()
1213	{
1214	nsresult rv = NS_OK;
1215	if (!sEncoderInfo) {
1216	OSStatus err;
1217	err = ::CreateUnicodeToTextInfoByEncoding(GetSystemEncoding(), &sEncoderInfo);
1218	if (err)
1219	rv = NS_ERROR_FAILURE;
1220	}
1221	return rv;
1222	}
1223
1224	nsresult nsFSStringConversionMac::PrepareDecoder()
1225	{
1226	nsresult rv = NS_OK;
1227	if (!sDecoderInfo) {
1228	OSStatus err;
1229	err = ::CreateTextToUnicodeInfoByEncoding(GetSystemEncoding(), &sDecoderInfo);
1230	if (err)
1231	rv = NS_ERROR_FAILURE;
1232	}
1233	return rv;
1234	}
1235
1236	NS_COM nsresult
1237	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1238	{
1239	return nsFSStringConversionMac::FSToUCS(input, output);
1240	}
1241
1242	NS_COM nsresult
1243	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1244	{
1245	return nsFSStringConversionMac::UCSToFS(input, output);
1246	}
1247
/**
 * XP_MAC implementation: intentionally empty. The converters are created
 * on demand by the conversion routines themselves.
 */
void
NS_StartupNativeCharsetUtils()
{
    // nothing to set up
}
1252
1253	void
1254	NS_ShutdownNativeCharsetUtils()
1255	{
1256	nsFSStringConversionMac::CleanUp();
1257	}
1258
1259	//-----------------------------------------------------------------------------
1260	// default : truncate/zeropad
1261	//-----------------------------------------------------------------------------
1262	#else
1263
1264	#include "nsReadableUtils.h"
1265
1266	NS_COM nsresult
1267	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1268	{
1269	CopyASCIItoUCS2(input, output);
1270	return NS_OK;
1271	}
1272
1273	NS_COM nsresult
1274	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1275	{
1276	CopyUCS2toASCII(input, output);
1277	return NS_OK;
1278	}
1279
/**
 * Fallback implementation: intentionally empty, the fallback conversion
 * functions keep no state.
 */
void
NS_StartupNativeCharsetUtils()
{
    // nothing to set up
}
1284
/**
 * Fallback implementation: intentionally empty, the fallback conversion
 * functions keep no state.
 */
void
NS_ShutdownNativeCharsetUtils()
{
    // nothing to tear down
}
1289
1290	#endif

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/xpcom18a4/xpcom/io/nsNativeCharsetUtils.cpp@ 6542

Download in other formats: