nsNativeCharsetUtils.cpp@ 101978

Last change on this file since 101978 was 101978, checked in by vboxsync, 15 months ago
libs/xpcom/xpcom/io: Convert some code from using PRLock to IPRT's RTSEMFASTMUTEX locks, bugref:10545
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 37.5 KB

Line
1	/* *** BEGIN LICENSE BLOCK ***
2	* Version: MPL 1.1/GPL 2.0/LGPL 2.1
3	*
4	* The contents of this file are subject to the Mozilla Public License Version
5	* 1.1 (the "License"); you may not use this file except in compliance with
6	* the License. You may obtain a copy of the License at
7	* http://www.mozilla.org/MPL/
8	*
9	* Software distributed under the License is distributed on an "AS IS" basis,
10	* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11	* for the specific language governing rights and limitations under the
12	* License.
13	*
14	* The Original Code is Mozilla.
15	*
16	* The Initial Developer of the Original Code is
17	* Netscape Communications Corporation.
18	* Portions created by the Initial Developer are Copyright (C) 2002
19	* the Initial Developer. All Rights Reserved.
20	*
21	* Contributor(s):
22	* Darin Fisher <[email protected]>
23	* Brian Stell <[email protected]>
24	* Frank Tang <[email protected]>
25	* Brendan Eich <[email protected]>
26	* Sergei Dolgov <[email protected]>
27	*
28	* Alternatively, the contents of this file may be used under the terms of
29	* either the GNU General Public License Version 2 or later (the "GPL"), or
30	* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
31	* in which case the provisions of the GPL or the LGPL are applicable instead
32	* of those above. If you wish to allow use of your version of this file only
33	* under the terms of either the GPL or the LGPL, and not to allow others to
34	* use your version of this file under the terms of the MPL, indicate your
35	* decision by deleting the provisions above and replace them with the notice
36	* and other provisions required by the GPL or the LGPL. If you do not delete
37	* the provisions above, a recipient may use your version of this file under
38	* the terms of any one of the MPL, the GPL or the LGPL.
39	*
40	* *** END LICENSE BLOCK *** */
41
42	#include "xpcom-private.h"
43
44	//-----------------------------------------------------------------------------
45	// XP_UNIX
46	//-----------------------------------------------------------------------------
47	#if defined(XP_UNIX)
48
49	#include <stdlib.h> // mbtowc, wctomb
50	#include <locale.h> // setlocale
51	#include "nscore.h"
52	#include "nsAString.h"
53	#include "nsReadableUtils.h"
54
55	#include <iprt/assert.h>
56	#include <iprt/errcore.h>
57	#include <iprt/semaphore.h>
58
59	//
60	// choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
61	// but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
62	// or not (see bug 206811 and
63	// news://news.mozilla.org:119/[email protected]). we now use
64	// iconv for all platforms where nltypes.h and nllanginfo.h are present
65	// along with iconv.
66	//
67	#if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
68	#define USE_ICONV 1
69	#else
70	#define USE_STDCONV 1
71	#endif
72
73	static void
74	isolatin1_to_utf16(const char *input, PRUint32 inputLeft, PRUnichar *output, PRUint32 outputLeft)
75	{
76	while (inputLeft && outputLeft) {
77	output = (unsigned char) input;
78	(*input)++;
79	(*inputLeft)--;
80	(*output)++;
81	(*outputLeft)--;
82	}
83	}
84
85	static void
86	utf16_to_isolatin1(const PRUnichar *input, PRUint32 inputLeft, char *output, PRUint32 outputLeft)
87	{
88	while (inputLeft && outputLeft) {
89	output = (unsigned char) input;
90	(*input)++;
91	(*inputLeft)--;
92	(*output)++;
93	(*outputLeft)--;
94	}
95	}
96
97	//-----------------------------------------------------------------------------
98	// conversion using iconv
99	//-----------------------------------------------------------------------------
100	#if defined(USE_ICONV)
101	#include <nl_types.h> // CODESET
102	#include <langinfo.h> // nl_langinfo
103	#include <iconv.h> // iconv_open, iconv, iconv_close
104	#include <errno.h>
105
106	#if defined(HAVE_ICONV_WITH_CONST_INPUT)
107	#define ICONV_INPUT(x) (x)
108	#else
109	#define ICONV_INPUT(x) ((char **)x)
110	#endif
111
112	// solaris definitely needs this, but we'll enable it by default
113	// just in case... but we know for sure that iconv(3) in glibc
114	// doesn't need this.
115	#if !defined(__GLIBC__)
116	#define ENABLE_UTF8_FALLBACK_SUPPORT
117	#endif
118
119	#define INVALID_ICONV_T ((iconv_t) -1)
120
121	static inline size_t
122	xp_iconv(iconv_t converter,
123	const char **input,
124	size_t *inputLeft,
125	char **output,
126	size_t *outputLeft)
127	{
128	size_t res, outputAvail = outputLeft ? *outputLeft : 0;
129	res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
130	if (res == (size_t) -1) {
131	// on some platforms (e.g., linux) iconv will fail with
132	// E2BIG if it cannot convert _all_ of its input. it'll
133	// still adjust all of the in/out params correctly, so we
134	// can ignore this error. the assumption is that we will
135	// be called again to complete the conversion.
136	if ((errno == E2BIG) && (*outputLeft < outputAvail))
137	res = 0;
138	}
139	return res;
140	}
141
142	static inline void
143	xp_iconv_reset(iconv_t converter)
144	{
145	// NOTE: the man pages on Solaris claim that you can pass NULL
146	// for all parameter to reset the converter, but beware the
147	// evil Solaris crash if you go down this route >:-)
148
149	const char *zero_char_in_ptr = NULL;
150	char *zero_char_out_ptr = NULL;
151	size_t zero_size_in = 0,
152	zero_size_out = 0;
153
154	xp_iconv(converter, &zero_char_in_ptr,
155	&zero_size_in,
156	&zero_char_out_ptr,
157	&zero_size_out);
158	}
159
160	static inline iconv_t
161	xp_iconv_open(const char to_list, const char from_list)
162	{
163	iconv_t res;
164	const char **from_name;
165	const char **to_name;
166
167	// try all possible combinations to locate a converter.
168	to_name = to_list;
169	while (*to_name) {
170	if (**to_name) {
171	from_name = from_list;
172	while (*from_name) {
173	if (**from_name) {
174	res = iconv_open(to_name, from_name);
175	if (res != INVALID_ICONV_T)
176	return res;
177	}
178	from_name++;
179	}
180	}
181	to_name++;
182	}
183
184	return INVALID_ICONV_T;
185	}
186
187	/*
188	* PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
189	* have to use UTF-16 with iconv(3) on platforms where it's supported.
190	* However, the way UTF-16 and UCS-2 are interpreted varies across platforms
191	* and implementations of iconv(3). On Tru64, it also depends on the environment
192	* variable. To avoid the trouble arising from byte-swapping
193	* (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
194	* back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
195	* on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
196	* which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
197	* and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
198	* variable ICONV_BYTEORDER is set to 'big-endian', about which not much
199	* can be done other than adding a note in the release notes. (bug 206811)
200	*/
201	static const char *UTF_16_NAMES[] = {
202	#if defined(IS_LITTLE_ENDIAN)
203	"UTF-16LE",
204	#if defined(__GLIBC__)
205	"UNICODELITTLE",
206	#endif
207	"UCS-2LE",
208	#else
209	"UTF-16BE",
210	#if defined(__GLIBC__)
211	"UNICODEBIG",
212	#endif
213	"UCS-2BE",
214	#endif
215	"UTF-16",
216	"UCS-2",
217	"UCS2",
218	"UCS_2",
219	"ucs-2",
220	"ucs2",
221	"ucs_2",
222	NULL
223	};
224
225	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
226	static const char *UTF_8_NAMES[] = {
227	"UTF-8",
228	"UTF8",
229	"UTF_8",
230	"utf-8",
231	"utf8",
232	"utf_8",
233	NULL
234	};
235	#endif
236
237	static const char *ISO_8859_1_NAMES[] = {
238	"ISO-8859-1",
239	#if !defined(__GLIBC__)
240	"ISO8859-1",
241	"ISO88591",
242	"ISO_8859_1",
243	"ISO8859_1",
244	"iso-8859-1",
245	"iso8859-1",
246	"iso88591",
247	"iso_8859_1",
248	"iso8859_1",
249	#endif
250	NULL
251	};
252
253	class nsNativeCharsetConverter
254	{
255	public:
256	nsNativeCharsetConverter();
257	~nsNativeCharsetConverter();
258
259	nsresult NativeToUnicode(const char *input , PRUint32 inputLeft,
260	PRUnichar *output, PRUint32 outputLeft);
261	nsresult UnicodeToNative(const PRUnichar *input , PRUint32 inputLeft,
262	char *output, PRUint32 outputLeft);
263
264	static void GlobalInit();
265	static void GlobalShutdown();
266
267	private:
268	static iconv_t gNativeToUnicode;
269	static iconv_t gUnicodeToNative;
270	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
271	static iconv_t gNativeToUTF8;
272	static iconv_t gUTF8ToNative;
273	static iconv_t gUnicodeToUTF8;
274	static iconv_t gUTF8ToUnicode;
275	#endif
276	static RTSEMFASTMUTEX gLock;
277	static PRBool gInitialized;
278
279	static void LazyInit();
280
281	static void Lock() { if (gLock != NILRTSEMFASTMUTEX) RTSemFastMutexRequest(gLock); }
282	static void Unlock() { if (gLock != NILRTSEMFASTMUTEX) RTSemFastMutexRelease(gLock); }
283	};
284
285	iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
286	iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
287	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
288	iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
289	iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
290	iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
291	iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
292	#endif
293	RTSEMFASTMUTEX nsNativeCharsetConverter::gLock = NIL_RTSEMFASTMUTEX;
294	PRBool nsNativeCharsetConverter::gInitialized = PR_FALSE;
295
296	void
297	nsNativeCharsetConverter::LazyInit()
298	{
299	const char *blank_list[] = { "", NULL };
300	const char **native_charset_list = blank_list;
301	const char *native_charset = nl_langinfo(CODESET);
302	if (native_charset == nsnull) {
303	NS_ERROR("native charset is unknown");
304	// fallback to ISO-8859-1
305	native_charset_list = ISO_8859_1_NAMES;
306	}
307	else
308	native_charset_list[0] = native_charset;
309
310	gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
311	gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
312
313	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
314	if (gNativeToUnicode == INVALID_ICONV_T) {
315	gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
316	gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
317	NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
318	NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
319	}
320	if (gUnicodeToNative == INVALID_ICONV_T) {
321	gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
322	gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
323	NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
324	NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
325	}
326	#else
327	NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
328	NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
329	#endif
330
331	/*
332	* On Solaris 8 (and newer?), the iconv modules converting to UCS-2
333	* prepend a byte order mark unicode character (BOM, u+FEFF) during
334	* the first use of the iconv converter. The same is the case of
335	* glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
336	* However, we use 'UTF-16LE/BE' in both cases, instead so that we
337	* should be safe. But just in case...
338	*
339	* This dummy conversion gets rid of the BOMs and fixes bug 153562.
340	*/
341	char dummy_input[1] = { ' ' };
342	char dummy_output[4];
343
344	if (gNativeToUnicode != INVALID_ICONV_T) {
345	const char *input = dummy_input;
346	size_t input_left = sizeof(dummy_input);
347	char *output = dummy_output;
348	size_t output_left = sizeof(dummy_output);
349
350	xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
351	}
352	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
353	if (gUTF8ToUnicode != INVALID_ICONV_T) {
354	const char *input = dummy_input;
355	size_t input_left = sizeof(dummy_input);
356	char *output = dummy_output;
357	size_t output_left = sizeof(dummy_output);
358
359	xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
360	}
361	#endif
362
363	gInitialized = PR_TRUE;
364	}
365
366	void
367	nsNativeCharsetConverter::GlobalInit()
368	{
369	int vrc = RTSemFastMutexCreate(&gLock);
370	NS_ASSERTION(RT_SUCCESS(vrc), "lock creation failed");
371	}
372
373	void
374	nsNativeCharsetConverter::GlobalShutdown()
375	{
376	if (gLock != NIL_RTSEMFASTMUTEX) {
377	RTSemFastMutexDestroy(gLock);
378	gLock = NIL_RTSEMFASTMUTEX;
379	}
380
381	if (gNativeToUnicode != INVALID_ICONV_T) {
382	iconv_close(gNativeToUnicode);
383	gNativeToUnicode = INVALID_ICONV_T;
384	}
385
386	if (gUnicodeToNative != INVALID_ICONV_T) {
387	iconv_close(gUnicodeToNative);
388	gUnicodeToNative = INVALID_ICONV_T;
389	}
390
391	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
392	if (gNativeToUTF8 != INVALID_ICONV_T) {
393	iconv_close(gNativeToUTF8);
394	gNativeToUTF8 = INVALID_ICONV_T;
395	}
396	if (gUTF8ToNative != INVALID_ICONV_T) {
397	iconv_close(gUTF8ToNative);
398	gUTF8ToNative = INVALID_ICONV_T;
399	}
400	if (gUnicodeToUTF8 != INVALID_ICONV_T) {
401	iconv_close(gUnicodeToUTF8);
402	gUnicodeToUTF8 = INVALID_ICONV_T;
403	}
404	if (gUTF8ToUnicode != INVALID_ICONV_T) {
405	iconv_close(gUTF8ToUnicode);
406	gUTF8ToUnicode = INVALID_ICONV_T;
407	}
408	#endif
409
410	gInitialized = PR_FALSE;
411	}
412
413	nsNativeCharsetConverter::nsNativeCharsetConverter()
414	{
415	Lock();
416	if (!gInitialized)
417	LazyInit();
418	}
419
420	nsNativeCharsetConverter::~nsNativeCharsetConverter()
421	{
422	// reset converters for next time
423	if (gNativeToUnicode != INVALID_ICONV_T)
424	xp_iconv_reset(gNativeToUnicode);
425	if (gUnicodeToNative != INVALID_ICONV_T)
426	xp_iconv_reset(gUnicodeToNative);
427	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
428	if (gNativeToUTF8 != INVALID_ICONV_T)
429	xp_iconv_reset(gNativeToUTF8);
430	if (gUTF8ToNative != INVALID_ICONV_T)
431	xp_iconv_reset(gUTF8ToNative);
432	if (gUnicodeToUTF8 != INVALID_ICONV_T)
433	xp_iconv_reset(gUnicodeToUTF8);
434	if (gUTF8ToUnicode != INVALID_ICONV_T)
435	xp_iconv_reset(gUTF8ToUnicode);
436	#endif
437	Unlock();
438	}
439
440	nsresult
441	nsNativeCharsetConverter::NativeToUnicode(const char **input,
442	PRUint32 *inputLeft,
443	PRUnichar **output,
444	PRUint32 *outputLeft)
445	{
446	size_t res = 0;
447	size_t inLeft = (size_t) *inputLeft;
448	size_t outLeft = (size_t) outputLeft 2;
449
450	if (gNativeToUnicode != INVALID_ICONV_T) {
451
452	res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
453
454	*inputLeft = inLeft;
455	*outputLeft = outLeft / 2;
456	if (res != (size_t) -1)
457	return NS_OK;
458
459	NS_WARNING("conversion from native to utf-16 failed");
460
461	// reset converter
462	xp_iconv_reset(gNativeToUnicode);
463	}
464	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
465	else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
466	(gUTF8ToUnicode != INVALID_ICONV_T)) {
467	// convert first to UTF8, then from UTF8 to UCS2
468	const char in = input;
469
470	char ubuf[1024];
471
472	// we assume we're always called with enough space in \|output\|,
473	// so convert many chars at a time...
474	while (inLeft) {
475	char *p = ubuf;
476	size_t n = sizeof(ubuf);
477	res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
478	if (res == (size_t) -1) {
479	NS_ERROR("conversion from native to utf-8 failed");
480	break;
481	}
482	NS_ASSERTION(outLeft > 0, "bad assumption");
483	p = ubuf;
484	n = sizeof(ubuf) - n;
485	res = xp_iconv(gUTF8ToUnicode, (const char ) &p, &n, (char ) output, &outLeft);
486	if (res == (size_t) -1) {
487	NS_ERROR("conversion from utf-8 to utf-16 failed");
488	break;
489	}
490	}
491
492	(input) += (inputLeft - inLeft);
493	*inputLeft = inLeft;
494	*outputLeft = outLeft / 2;
495
496	if (res != (size_t) -1)
497	return NS_OK;
498
499	// reset converters
500	xp_iconv_reset(gNativeToUTF8);
501	xp_iconv_reset(gUTF8ToUnicode);
502	}
503	#endif
504
505	// fallback: zero-pad and hope for the best
506	// XXX This is lame and we have to do better.
507	isolatin1_to_utf16(input, inputLeft, output, outputLeft);
508
509	return NS_OK;
510	}
511
512	nsresult
513	nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
514	PRUint32 *inputLeft,
515	char **output,
516	PRUint32 *outputLeft)
517	{
518	size_t res = 0;
519	size_t inLeft = (size_t) inputLeft 2;
520	size_t outLeft = (size_t) *outputLeft;
521
522	if (gUnicodeToNative != INVALID_ICONV_T) {
523	res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
524
525	if (res != (size_t) -1) {
526	*inputLeft = inLeft / 2;
527	*outputLeft = outLeft;
528	return NS_OK;
529	}
530
531	NS_ERROR("iconv failed");
532
533	// reset converter
534	xp_iconv_reset(gUnicodeToNative);
535	}
536	#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
537	else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
538	(gUTF8ToNative != INVALID_ICONV_T)) {
539	const char in = (const char ) *input;
540
541	char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
542
543	// convert one uchar at a time...
544	while (inLeft && outLeft) {
545	char *p = ubuf;
546	size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
547	res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
548	if (res == (size_t) -1) {
549	NS_ERROR("conversion from utf-16 to utf-8 failed");
550	break;
551	}
552	p = ubuf;
553	n = sizeof(ubuf) - n;
554	res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
555	if (res == (size_t) -1) {
556	if (errno == E2BIG) {
557	// not enough room for last uchar... back up and return.
558	in -= sizeof(PRUnichar);
559	res = 0;
560	}
561	else
562	NS_ERROR("conversion from utf-8 to native failed");
563	break;
564	}
565	inLeft -= sizeof(PRUnichar);
566	}
567
568	if (res != (size_t) -1) {
569	(input) += (inputLeft - inLeft/2);
570	*inputLeft = inLeft/2;
571	*outputLeft = outLeft;
572	return NS_OK;
573	}
574
575	// reset converters
576	xp_iconv_reset(gUnicodeToUTF8);
577	xp_iconv_reset(gUTF8ToNative);
578	}
579	#endif
580
581	// fallback: truncate and hope for the best
582	utf16_to_isolatin1(input, inputLeft, output, outputLeft);
583
584	return NS_OK;
585	}
586
587	#endif // USE_ICONV
588
589	//-----------------------------------------------------------------------------
590	// conversion using mb[r]towc/wc[r]tomb
591	//-----------------------------------------------------------------------------
592	#if defined(USE_STDCONV)
// <wchar.h> is only needed for the restartable variants
#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
#include <wchar.h>  // mbrtowc, wcrtomb
#endif
596
597	class nsNativeCharsetConverter
598	{
599	public:
600	nsNativeCharsetConverter();
601
602	nsresult NativeToUnicode(const char *input , PRUint32 inputLeft,
603	PRUnichar *output, PRUint32 outputLeft);
604	nsresult UnicodeToNative(const PRUnichar *input , PRUint32 inputLeft,
605	char *output, PRUint32 outputLeft);
606
607	static void GlobalInit();
608	static void GlobalShutdown() { }
609
610	private:
611	static PRBool gWCharIsUnicode;
612
613	#if defined(HAVE_WCRTOMB) \|\| defined(HAVE_MBRTOWC)
614	mbstate_t ps;
615	#endif
616	};
617
618	PRBool nsNativeCharsetConverter::gWCharIsUnicode = PR_FALSE;
619
620	nsNativeCharsetConverter::nsNativeCharsetConverter()
621	{
622	#if defined(HAVE_WCRTOMB) \|\| defined(HAVE_MBRTOWC)
623	memset(&ps, 0, sizeof(ps));
624	#endif
625	}
626
627	void
628	nsNativeCharsetConverter::GlobalInit()
629	{
630	// verify that wchar_t for the current locale is actually unicode.
631	// if it is not, then we should avoid calling mbtowc/wctomb and
632	// just fallback on zero-pad/truncation conversion.
633	//
634	// this test cannot be done at build time because the encoding of
635	// wchar_t may depend on the runtime locale. sad, but true!!
636	//
637	// so, if wchar_t is unicode then converting an ASCII character
638	// to wchar_t should not change its numeric value. we'll just
639	// check what happens with the ASCII 'a' character.
640	//
641	// this test is not perfect... obviously, it could yield false
642	// positives, but then at least ASCII text would be converted
643	// properly (or maybe just the 'a' character) -- oh well :(
644
645	char a = 'a';
646	unsigned int w = 0;
647
648	int res = mbtowc((wchar_t *) &w, &a, 1);
649
650	gWCharIsUnicode = (res != -1 && w == 'a');
651
652	#ifdef DEBUG
653	if (!gWCharIsUnicode)
654	NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
655	#endif
656	}
657
658	nsresult
659	nsNativeCharsetConverter::NativeToUnicode(const char **input,
660	PRUint32 *inputLeft,
661	PRUnichar **output,
662	PRUint32 *outputLeft)
663	{
664	if (gWCharIsUnicode) {
665	int incr;
666
667	// cannot use wchar_t here since it may have been redefined (e.g.,
668	// via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
669	unsigned int tmp = 0;
670	while (inputLeft && outputLeft) {
671	#ifdef HAVE_MBRTOWC
672	incr = (int) mbrtowc((wchar_t ) &tmp, input, *inputLeft, &ps);
673	#else
674	// XXX is this thread-safe?
675	incr = (int) mbtowc((wchar_t ) &tmp, input, *inputLeft);
676	#endif
677	if (incr < 0) {
678	NS_WARNING("mbtowc failed: possible charset mismatch");
679	// zero-pad and hope for the best
680	tmp = (unsigned char) **input;
681	incr = 1;
682	}
683	**output = (PRUnichar) tmp;
684	(*input) += incr;
685	(*inputLeft) -= incr;
686	(*output)++;
687	(*outputLeft)--;
688	}
689	}
690	else {
691	// wchar_t isn't unicode, so the best we can do is treat the
692	// input as if it is isolatin1 :(
693	isolatin1_to_utf16(input, inputLeft, output, outputLeft);
694	}
695
696	return NS_OK;
697	}
698
699	nsresult
700	nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
701	PRUint32 *inputLeft,
702	char **output,
703	PRUint32 *outputLeft)
704	{
705	if (gWCharIsUnicode) {
706	int incr;
707
708	while (inputLeft && outputLeft >= MB_CUR_MAX) {
709	#ifdef HAVE_WCRTOMB
710	incr = (int) wcrtomb(output, (wchar_t) *input, &ps);
711	#else
712	// XXX is this thread-safe?
713	incr = (int) wctomb(output, (wchar_t) *input);
714	#endif
715	if (incr < 0) {
716	NS_WARNING("mbtowc failed: possible charset mismatch");
717	output = (unsigned char) input; // truncate
718	incr = 1;
719	}
720	// most likely we're dead anyways if this assertion should fire
721	NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
722	(*output) += incr;
723	(*outputLeft) -= incr;
724	(*input)++;
725	(*inputLeft)--;
726	}
727	}
728	else {
729	// wchar_t isn't unicode, so the best we can do is treat the
730	// input as if it is isolatin1 :(
731	utf16_to_isolatin1(input, inputLeft, output, outputLeft);
732	}
733
734	return NS_OK;
735	}
736
737	#endif // USE_STDCONV
738
739	//-----------------------------------------------------------------------------
740	// API implementation
741	//-----------------------------------------------------------------------------
742
743	NS_COM nsresult
744	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
745	{
746	output.Truncate();
747
748	PRUint32 inputLen = input.Length();
749
750	nsACString::const_iterator iter;
751	input.BeginReading(iter);
752
753	//
754	// OPTIMIZATION: preallocate space for largest possible result; convert
755	// directly into the result buffer to avoid intermediate buffer copy.
756	//
757	// this will generally result in a larger allocation, but that seems
758	// better than an extra buffer copy.
759	//
760	output.SetLength(inputLen);
761	nsAString::iterator out_iter;
762	output.BeginWriting(out_iter);
763
764	PRUnichar *result = out_iter.get();
765	PRUint32 resultLeft = inputLen;
766
767	const char *buf = iter.get();
768	PRUint32 bufLeft = inputLen;
769
770	nsNativeCharsetConverter conv;
771	nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
772	if (NS_SUCCEEDED(rv)) {
773	NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
774	output.SetLength(inputLen - resultLeft);
775	}
776	return rv;
777	}
778
779	NS_COM nsresult
780	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
781	{
782	output.Truncate();
783
784	nsAString::const_iterator iter, end;
785	input.BeginReading(iter);
786	input.EndReading(end);
787
788	// cannot easily avoid intermediate buffer copy.
789	char temp[4096];
790
791	nsNativeCharsetConverter conv;
792
793	const PRUnichar *buf = iter.get();
794	PRUint32 bufLeft = Distance(iter, end);
795	while (bufLeft) {
796	char *p = temp;
797	PRUint32 tempLeft = sizeof(temp);
798
799	nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
800	if (NS_FAILED(rv)) return rv;
801
802	if (tempLeft < sizeof(temp))
803	output.Append(temp, sizeof(temp) - tempLeft);
804	}
805	return NS_OK;
806	}
807
808	void
809	NS_StartupNativeCharsetUtils()
810	{
811	//
812	// need to initialize the locale or else charset conversion will fail.
813	// better not delay this in case some other component alters the locale
814	// settings.
815	//
816	// XXX we assume that we are called early enough that we should
817	// always be the first to care about the locale's charset.
818	//
819	setlocale(LC_CTYPE, "");
820
821	nsNativeCharsetConverter::GlobalInit();
822	}
823
824	void
825	NS_ShutdownNativeCharsetUtils()
826	{
827	nsNativeCharsetConverter::GlobalShutdown();
828	}
829
830	//-----------------------------------------------------------------------------
831	// XP_BEOS
832	//-----------------------------------------------------------------------------
833	#elif defined(XP_BEOS)
834
835	#include "nsAString.h"
836	#include "nsReadableUtils.h"
837	#include "nsString.h"
838
839	NS_COM nsresult
840	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
841	{
842	CopyUTF8toUTF16(input, output);
843	return NS_OK;
844	}
845
846	NS_COM nsresult
847	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
848	{
849	CopyUTF16toUTF8(input, output);
850	return NS_OK;
851	}
852
void
NS_StartupNativeCharsetUtils()
{ /* no setup is performed in the BeOS variant */ }
857
void
NS_ShutdownNativeCharsetUtils()
{ /* no teardown is performed in the BeOS variant */ }
862
863	//-----------------------------------------------------------------------------
864	// XP_WIN
865	//-----------------------------------------------------------------------------
866	#elif defined(XP_WIN)
867
868	#include <windows.h>
869	#include "nsAString.h"
870
871	NS_COM nsresult
872	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
873	{
874	PRUint32 inputLen = input.Length();
875
876	nsACString::const_iterator iter;
877	input.BeginReading(iter);
878
879	const char *buf = iter.get();
880
881	// determine length of result
882	PRUint32 resultLen = 0;
883	int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, NULL, 0);
884	if (n > 0)
885	resultLen += n;
886
887	// allocate sufficient space
888	output.SetLength(resultLen);
889	if (resultLen > 0) {
890	nsAString::iterator out_iter;
891	output.BeginWriting(out_iter);
892
893	PRUnichar *result = out_iter.get();
894
895	::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, result, resultLen);
896	}
897	return NS_OK;
898	}
899
900	NS_COM nsresult
901	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
902	{
903	PRUint32 inputLen = input.Length();
904
905	nsAString::const_iterator iter;
906	input.BeginReading(iter);
907
908	const PRUnichar *buf = iter.get();
909
910	// determine length of result
911	PRUint32 resultLen = 0;
912
913	int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, NULL, 0, NULL, NULL);
914	if (n > 0)
915	resultLen += n;
916
917	// allocate sufficient space
918	output.SetLength(resultLen);
919	if (resultLen > 0) {
920	nsACString::iterator out_iter;
921	output.BeginWriting(out_iter);
922
923	// default "defaultChar" is '?', which is an illegal character on windows
924	// file system. That will cause file uncreatable. Change it to '_'
925	const char defaultChar = '_';
926
927	char *result = out_iter.get();
928
929	::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
930	&defaultChar, NULL);
931	}
932	return NS_OK;
933	}
934
void
NS_StartupNativeCharsetUtils()
{ /* no setup is performed in the Windows variant */ }
939
void
NS_ShutdownNativeCharsetUtils()
{ /* no teardown is performed in the Windows variant */ }
944
945	//-----------------------------------------------------------------------------
946	// XP_OS2
947	//-----------------------------------------------------------------------------
948	#elif defined(XP_OS2)
949
950	#define INCL_DOS
951	#include <os2.h>
952	#include <uconv.h>
953	#include "nsAString.h"
954	#include <ulserrno.h>
955	#include "nsNativeCharsetUtils.h"
956
957	static UconvObject UnicodeConverter = NULL;
958
959	NS_COM nsresult
960	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
961	{
962	PRUint32 inputLen = input.Length();
963
964	nsACString::const_iterator iter;
965	input.BeginReading(iter);
966	const char *inputStr = iter.get();
967
968	// determine length of result
969	PRUint32 resultLen = inputLen;
970	output.SetLength(resultLen);
971
972	nsAString::iterator out_iter;
973	output.BeginWriting(out_iter);
974	UniChar result = (UniChar)out_iter.get();
975
976	size_t cSubs = 0;
977	size_t resultLeft = resultLen;
978
979	if (!UnicodeConverter)
980	NS_StartupNativeCharsetUtils();
981
982	int unirc = ::UniUconvToUcs(UnicodeConverter, (void**)&inputStr, &inputLen,
983	&result, &resultLeft, &cSubs);
984
985	NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
986
987	if (unirc != ULS_SUCCESS) {
988	output.Truncate();
989	return NS_ERROR_FAILURE;
990	}
991
992	// Need to update string length to reflect how many bytes were actually
993	// written.
994	output.Truncate(resultLen - resultLeft);
995	return NS_OK;
996	}
997
998	NS_COM nsresult
999	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1000	{
1001	size_t inputLen = input.Length();
1002
1003	nsAString::const_iterator iter;
1004	input.BeginReading(iter);
1005	UniChar* inputStr = (UniChar) NS_CONST_CAST(PRUnichar, iter.get());
1006
1007	// maximum length of unicode string of length x converted to native
1008	// codepage is x*2
1009	size_t resultLen = inputLen * 2;
1010	output.SetLength(resultLen);
1011
1012	nsACString::iterator out_iter;
1013	output.BeginWriting(out_iter);
1014	char *result = out_iter.get();
1015
1016	size_t cSubs = 0;
1017	size_t resultLeft = resultLen;
1018
1019	if (!UnicodeConverter)
1020	NS_StartupNativeCharsetUtils();
1021
1022	int unirc = ::UniUconvFromUcs(UnicodeConverter, &inputStr, &inputLen,
1023	(void**)&result, &resultLeft, &cSubs);
1024
1025	NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1026
1027	if (unirc != ULS_SUCCESS) {
1028	output.Truncate();
1029	return NS_ERROR_FAILURE;
1030	}
1031
1032	// Need to update string length to reflect how many bytes were actually
1033	// written.
1034	output.Truncate(resultLen - resultLeft);
1035	return NS_OK;
1036	}
1037
1038	void
1039	NS_StartupNativeCharsetUtils()
1040	{
1041	ULONG ulLength;
1042	ULONG ulCodePage;
1043	DosQueryCp(sizeof(ULONG), &ulCodePage, &ulLength);
1044
1045	UniChar codepage[20];
1046	int unirc = ::UniMapCpToUcsCp(ulCodePage, codepage, 20);
1047	if (unirc == ULS_SUCCESS) {
1048	unirc = ::UniCreateUconvObject(codepage, &UnicodeConverter);
1049	if (unirc == ULS_SUCCESS) {
1050	uconv_attribute_t attr;
1051	::UniQueryUconvObject(UnicodeConverter, &attr, sizeof(uconv_attribute_t),
1052	NULL, NULL, NULL);
1053	attr.options = UCONV_OPTION_SUBSTITUTE_BOTH;
1054	attr.subchar_len=1;
1055	attr.subchar[0]='_';
1056	::UniSetUconvObject(UnicodeConverter, &attr);
1057	}
1058	}
1059	}
1060
1061	void
1062	NS_ShutdownNativeCharsetUtils()
1063	{
1064	::UniFreeUconvObject(UnicodeConverter);
1065	}
1066
1067	//-----------------------------------------------------------------------------
1068	// XP_MAC
1069	//-----------------------------------------------------------------------------
1070	#elif defined(XP_MAC)
1071
1072	#include <UnicodeConverter.h>
1073	#include <TextCommon.h>
1074	#include <Script.h>
1075	#include <MacErrors.h>
1076	#include "nsAString.h"
1077
1078	class nsFSStringConversionMac {
1079	public:
1080	static nsresult UCSToFS(const nsAString& aIn, nsACString& aOut);
1081	static nsresult FSToUCS(const nsACString& ain, nsAString& aOut);
1082
1083	static void CleanUp();
1084
1085	private:
1086	static TextEncoding GetSystemEncoding();
1087	static nsresult PrepareEncoder();
1088	static nsresult PrepareDecoder();
1089
1090	static UnicodeToTextInfo sEncoderInfo;
1091	static TextToUnicodeInfo sDecoderInfo;
1092	};
1093
1094	UnicodeToTextInfo nsFSStringConversionMac::sEncoderInfo = nsnull;
1095	TextToUnicodeInfo nsFSStringConversionMac::sDecoderInfo = nsnull;
1096
1097	nsresult nsFSStringConversionMac::UCSToFS(const nsAString& aIn, nsACString& aOut)
1098	{
1099	nsresult rv = PrepareEncoder();
1100	if (NS_FAILED(rv)) return rv;
1101
1102	OSStatus err = noErr;
1103	char stackBuffer[512];
1104
1105	aOut.Truncate();
1106
1107	// for each chunk of \|aIn\|...
1108	nsReadingIterator<PRUnichar> iter;
1109	aIn.BeginReading(iter);
1110
1111	PRUint32 fragmentLength = PRUint32(iter.size_forward());
1112	UInt32 bytesLeft = fragmentLength * sizeof(UniChar);
1113
1114	do {
1115	UInt32 bytesRead = 0, bytesWritten = 0;
1116	err = ::ConvertFromUnicodeToText(sEncoderInfo,
1117	bytesLeft,
1118	(const UniChar*)iter.get(),
1119	kUnicodeUseFallbacksMask \| kUnicodeLooseMappingsMask,
1120	0, nsnull, nsnull, nsnull,
1121	sizeof(stackBuffer),
1122	&bytesRead,
1123	&bytesWritten,
1124	stackBuffer);
1125	if (err == kTECUsedFallbacksStatus)
1126	err = noErr;
1127	else if (err == kTECOutputBufferFullStatus) {
1128	bytesLeft -= bytesRead;
1129	iter.advance(bytesRead / sizeof(UniChar));
1130	}
1131	aOut.Append(stackBuffer, bytesWritten);
1132	}
1133	while (err == kTECOutputBufferFullStatus);
1134
1135	return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1136	}
1137
1138	nsresult nsFSStringConversionMac::FSToUCS(const nsACString& aIn, nsAString& aOut)
1139	{
1140	nsresult rv = PrepareDecoder();
1141	if (NS_FAILED(rv)) return rv;
1142
1143	OSStatus err = noErr;
1144	UniChar stackBuffer[512];
1145
1146	aOut.Truncate(0);
1147
1148	// for each chunk of \|aIn\|...
1149	nsReadingIterator<char> iter;
1150	aIn.BeginReading(iter);
1151
1152	PRUint32 fragmentLength = PRUint32(iter.size_forward());
1153	UInt32 bytesLeft = fragmentLength;
1154
1155	do {
1156	UInt32 bytesRead = 0, bytesWritten = 0;
1157	err = ::ConvertFromTextToUnicode(sDecoderInfo,
1158	bytesLeft,
1159	iter.get(),
1160	kUnicodeUseFallbacksMask \| kUnicodeLooseMappingsMask,
1161	0, nsnull, nsnull, nsnull,
1162	sizeof(stackBuffer),
1163	&bytesRead,
1164	&bytesWritten,
1165	stackBuffer);
1166	if (err == kTECUsedFallbacksStatus)
1167	err = noErr;
1168	else if (err == kTECOutputBufferFullStatus) {
1169	bytesLeft -= bytesRead;
1170	iter.advance(bytesRead);
1171	}
1172	aOut.Append((PRUnichar *)stackBuffer, bytesWritten / sizeof(PRUnichar));
1173	}
1174	while (err == kTECOutputBufferFullStatus);
1175
1176	return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1177	}
1178
1179	void nsFSStringConversionMac::CleanUp()
1180	{
1181	if (sDecoderInfo) {
1182	::DisposeTextToUnicodeInfo(&sDecoderInfo);
1183	sDecoderInfo = nsnull;
1184	}
1185	if (sEncoderInfo) {
1186	::DisposeUnicodeToTextInfo(&sEncoderInfo);
1187	sEncoderInfo = nsnull;
1188	}
1189	}
1190
1191	TextEncoding nsFSStringConversionMac::GetSystemEncoding()
1192	{
1193	OSStatus err;
1194	TextEncoding theEncoding;
1195
1196	err = ::UpgradeScriptInfoToTextEncoding(smSystemScript, kTextLanguageDontCare,
1197	kTextRegionDontCare, NULL, &theEncoding);
1198
1199	if (err != noErr)
1200	theEncoding = kTextEncodingMacRoman;
1201
1202	return theEncoding;
1203	}
1204
1205	nsresult nsFSStringConversionMac::PrepareEncoder()
1206	{
1207	nsresult rv = NS_OK;
1208	if (!sEncoderInfo) {
1209	OSStatus err;
1210	err = ::CreateUnicodeToTextInfoByEncoding(GetSystemEncoding(), &sEncoderInfo);
1211	if (err)
1212	rv = NS_ERROR_FAILURE;
1213	}
1214	return rv;
1215	}
1216
1217	nsresult nsFSStringConversionMac::PrepareDecoder()
1218	{
1219	nsresult rv = NS_OK;
1220	if (!sDecoderInfo) {
1221	OSStatus err;
1222	err = ::CreateTextToUnicodeInfoByEncoding(GetSystemEncoding(), &sDecoderInfo);
1223	if (err)
1224	rv = NS_ERROR_FAILURE;
1225	}
1226	return rv;
1227	}
1228
1229	NS_COM nsresult
1230	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1231	{
1232	return nsFSStringConversionMac::FSToUCS(input, output);
1233	}
1234
1235	NS_COM nsresult
1236	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1237	{
1238	return nsFSStringConversionMac::UCSToFS(input, output);
1239	}
1240
// Mac implementation: intentionally empty.  The converters are created on
// demand by nsFSStringConversionMac::PrepareEncoder()/PrepareDecoder().
void
NS_StartupNativeCharsetUtils()
{
    // nothing to do
}
1245
1246	void
1247	NS_ShutdownNativeCharsetUtils()
1248	{
1249	nsFSStringConversionMac::CleanUp();
1250	}
1251
1252	//-----------------------------------------------------------------------------
1253	// default : truncate/zeropad
1254	//-----------------------------------------------------------------------------
1255	#else
1256
1257	#include "nsReadableUtils.h"
1258
1259	NS_COM nsresult
1260	NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1261	{
1262	CopyASCIItoUCS2(input, output);
1263	return NS_OK;
1264	}
1265
1266	NS_COM nsresult
1267	NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1268	{
1269	CopyUCS2toASCII(input, output);
1270	return NS_OK;
1271	}
1272
// Fallback implementation: there is no converter state to set up.
void
NS_StartupNativeCharsetUtils()
{
    // nothing to do
}
1277
// Fallback implementation: there is no converter state to tear down.
void
NS_ShutdownNativeCharsetUtils()
{
    // nothing to do
}
1282
1283	#endif
1284

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/xpcom18a4/xpcom/io/nsNativeCharsetUtils.cpp@ 101978

Download in other formats: