nsUTF8Utils.h@ 85855

Last change on this file since 85855 was 1, checked in by vboxsync, 55 years ago
import
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 14.5 KB

Line
1	/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2	/* *** BEGIN LICENSE BLOCK ***
3	* Version: MPL 1.1/GPL 2.0/LGPL 2.1
4	*
5	* The contents of this file are subject to the Mozilla Public License Version
6	* 1.1 (the "License"); you may not use this file except in compliance with
7	* the License. You may obtain a copy of the License at
8	* http://www.mozilla.org/MPL/
9	*
10	* Software distributed under the License is distributed on an "AS IS" basis,
11	* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12	* for the specific language governing rights and limitations under the
13	* License.
14	*
15	* The Original Code is mozilla.org code.
16	*
17	* The Initial Developer of the Original Code is
18	* Netscape Communications Corporation.
19	* Portions created by the Initial Developer are Copyright (C) 2001
20	* the Initial Developer. All Rights Reserved.
21	*
22	* Contributor(s):
23	* Peter Annema <[email protected]> (original author)
24	*
25	* Alternatively, the contents of this file may be used under the terms of
26	* either of the GNU General Public License Version 2 or later (the "GPL"),
27	* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28	* in which case the provisions of the GPL or the LGPL are applicable instead
29	* of those above. If you wish to allow use of your version of this file only
30	* under the terms of either the GPL or the LGPL, and not to allow others to
31	* use your version of this file under the terms of the MPL, indicate your
32	* decision by deleting the provisions above and replace them with the notice
33	* and other provisions required by the GPL or the LGPL. If you do not delete
34	* the provisions above, a recipient may use your version of this file under
35	* the terms of any one of the MPL, the GPL or the LGPL.
36	*
37	* *** END LICENSE BLOCK *** */
38
39	#ifndef nsUTF8Utils_h_
40	#define nsUTF8Utils_h_
41
42	class UTF8traits
43	{
44	public:
45	static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
46	static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
47	static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; }
48	static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; }
49	static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; }
50	static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; }
51	static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
52	};
53
// First code point outside the Basic Multilingual Plane; values at or
// above this need a surrogate pair in UTF-16.
#define PLANE1_BASE           0x10000
// U+FFFD, written in place of input that cannot be converted.
#define UCS2_REPLACEMENT_CHAR 0xFFFD

// Ask GCC-compatible compilers to always inline the annotated function;
// expands to nothing elsewhere.
#if defined(__GNUC__)
#  define NS_ALWAYS_INLINE __attribute__((always_inline))
#else
#  define NS_ALWAYS_INLINE
#endif
62
63	/**
64	* A character sink (see \|copy_string\| in nsAlgorithm.h) for converting
65	* UTF-8 to UTF-16
66	*/
67	class ConvertUTF8toUTF16
68	{
69	public:
70	typedef nsACString::char_type value_type;
71	typedef nsAString::char_type buffer_type;
72
73	ConvertUTF8toUTF16( buffer_type* aBuffer )
74	: mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
75
76	size_t Length() const { return mBuffer - mStart; }
77
78	PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
79	{
80	if ( mErrorEncountered )
81	return N;
82
83	// algorithm assumes utf8 units won't
84	// be spread across fragments
85	const value_type* p = start;
86	const value_type* end = start + N;
87	buffer_type* out = mBuffer;
88	for ( ; p != end /* && p /; )
89	{
90	char c = *p++;
91
92	if ( UTF8traits::isASCII(c) )
93	{
94	*out++ = buffer_type(c);
95	continue;
96	}
97
98	PRUint32 ucs4;
99	PRUint32 minUcs4;
100	PRInt32 state = 0;
101
102	if ( UTF8traits::is2byte(c) )
103	{
104	ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
105	state = 1;
106	minUcs4 = 0x00000080;
107	}
108	else if ( UTF8traits::is3byte(c) )
109	{
110	ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
111	state = 2;
112	minUcs4 = 0x00000800;
113	}
114	else if ( UTF8traits::is4byte(c) )
115	{
116	ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
117	state = 3;
118	minUcs4 = 0x00010000;
119	}
120	else if ( UTF8traits::is5byte(c) )
121	{
122	ucs4 = (PRUint32(c) << 24) & 0x03000000L;
123	state = 4;
124	minUcs4 = 0x00200000;
125	}
126	else if ( UTF8traits::is6byte(c) )
127	{
128	ucs4 = (PRUint32(c) << 30) & 0x40000000L;
129	state = 5;
130	minUcs4 = 0x04000000;
131	}
132	else
133	{
134	NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
135	mErrorEncountered = PR_TRUE;
136	mBuffer = out;
137	return N;
138	}
139
140	while ( state-- )
141	{
142	c = *p++;
143
144	if ( UTF8traits::isInSeq(c) )
145	{
146	PRInt32 shift = state * 6;
147	ucs4 \|= (PRUint32(c) & 0x3F) << shift;
148	}
149	else
150	{
151	NS_ERROR("not a UTF8 string");
152	mErrorEncountered = PR_TRUE;
153	mBuffer = out;
154	return N;
155	}
156	}
157
158	if ( ucs4 < minUcs4 )
159	{
160	// Overlong sequence
161	*out++ = UCS2_REPLACEMENT_CHAR;
162	}
163	else if ( ucs4 <= 0xD7FF )
164	{
165	*out++ = ucs4;
166	}
167	else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
168	{
169	// Surrogates
170	*out++ = UCS2_REPLACEMENT_CHAR;
171	}
172	else if ( ucs4 == 0xFFFE \|\| ucs4 == 0xFFFF )
173	{
174	// Prohibited characters
175	*out++ = UCS2_REPLACEMENT_CHAR;
176	}
177	else if ( ucs4 >= PLANE1_BASE )
178	{
179	if ( ucs4 >= 0x00110000 )
180	*out++ = UCS2_REPLACEMENT_CHAR;
181	else {
182	// surrogate, see unicode specification 3.7 for following math.
183	ucs4 -= PLANE1_BASE;
184	*out++ = (PRUnichar)(ucs4 >> 10) \| 0xd800u;
185	*out++ = (PRUnichar)(ucs4 & 0x3ff) \| 0xdc00u;
186	}
187	}
188	else
189	{
190	*out++ = ucs4;
191	}
192	}
193	mBuffer = out;
194	return p - start;
195	}
196
197	void write_terminator()
198	{
199	*mBuffer = buffer_type(0);
200	}
201
202	private:
203	buffer_type* const mStart;
204	buffer_type* mBuffer;
205	PRBool mErrorEncountered;
206	};
207
208	/**
209	* A character sink (see \|copy_string\| in nsAlgorithm.h) for computing
210	* the length of the UTF-16 string equivalent to a UTF-8 string.
211	*/
212	class CalculateUTF8Length
213	{
214	public:
215	typedef nsACString::char_type value_type;
216
217	CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
218
219	size_t Length() const { return mLength; }
220
221	PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
222	{
223	// ignore any further requests
224	if ( mErrorEncountered )
225	return N;
226
227	// algorithm assumes utf8 units won't
228	// be spread across fragments
229	const value_type* p = start;
230	const value_type* end = start + N;
231	for ( ; p < end /* && p /; ++mLength )
232	{
233	if ( UTF8traits::isASCII(*p) )
234	p += 1;
235	else if ( UTF8traits::is2byte(*p) )
236	p += 2;
237	else if ( UTF8traits::is3byte(*p) )
238	p += 3;
239	else if ( UTF8traits::is4byte(*p) ) {
240	p += 4;
241	// Because a UTF-8 sequence of 4 bytes represents a codepoint
242	// greater than 0xFFFF, it will become a surrogate pair in the
243	// UTF-16 string, so add 1 more to mLength.
244	// This doesn't happen with is5byte and is6byte because they
245	// are illegal UTF-8 sequences (greater than 0x10FFFF) so get
246	// converted to a single replacement character.
247	//
248	// XXX: if the 4-byte sequence is an illegal non-shortest form,
249	// it also gets converted to a replacement character, so
250	// mLength will be off by one in this case.
251	++mLength;
252	}
253	else if ( UTF8traits::is5byte(*p) )
254	p += 5;
255	else if ( UTF8traits::is6byte(*p) )
256	p += 6;
257	else
258	{
259	break;
260	}
261	}
262	if ( p != end )
263	{
264	NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
265	mErrorEncountered = PR_TRUE;
266	mLength = 0;
267	return N;
268	}
269	return p - start;
270	}
271
272	private:
273	size_t mLength;
274	PRBool mErrorEncountered;
275	};
276
277	/**
278	* A character sink (see \|copy_string\| in nsAlgorithm.h) for converting
279	* UTF-16 to UTF-8.
280	*/
281	class ConvertUTF16toUTF8
282	{
283	public:
284	typedef nsAString::char_type value_type;
285	typedef nsACString::char_type buffer_type;
286
287	// The error handling here is more lenient than that in
288	// \|ConvertUTF8toUTF16\|, but it's that way for backwards
289	// compatibility.
290
291	ConvertUTF16toUTF8( buffer_type* aBuffer )
292	: mStart(aBuffer), mBuffer(aBuffer) {}
293
294	size_t Size() const { return mBuffer - mStart; }
295
296	PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
297	{
298	buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
299
300	for (const value_type p = start, end = start + N; p < end; ++p )
301	{
302	value_type c = *p;
303	if (! (c & 0xFF80)) // U+0000 - U+007F
304	{
305	*out++ = (char)c;
306	}
307	else if (! (c & 0xF800)) // U+0100 - U+07FF
308	{
309	*out++ = 0xC0 \| (char)(c >> 6);
310	*out++ = 0x80 \| (char)(0x003F & c);
311	}
312	else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
313	{
314	*out++ = 0xE0 \| (char)(c >> 12);
315	*out++ = 0x80 \| (char)(0x003F & (c >> 6));
316	*out++ = 0x80 \| (char)(0x003F & c );
317	}
318	else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
319	{
320	// D800- DBFF - High Surrogate
321	// N = (H- D800) *400 + 10000 + ...
322	PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
323
324	++p;
325	if (p == end)
326	{
327	NS_ERROR("Surrogate pair split between fragments");
328	mBuffer = out;
329	return N;
330	}
331	c = *p;
332
333	if (0xDC00 == (0xFC00 & c))
334	{
335	// DC00- DFFF - Low Surrogate
336	// N += ( L - DC00 )
337	ucs4 \|= (0x03FF & c);
338
339	// 0001 0000-001F FFFF
340	*out++ = 0xF0 \| (char)(ucs4 >> 18);
341	*out++ = 0x80 \| (char)(0x003F & (ucs4 >> 12));
342	*out++ = 0x80 \| (char)(0x003F & (ucs4 >> 6));
343	*out++ = 0x80 \| (char)(0x003F & ucs4);
344	}
345	else
346	{
347	NS_ERROR("got a High Surrogate but no low surrogate");
348	// output nothing.
349	}
350	}
351	else // U+DC00 - U+DFFF
352	{
353	// DC00- DFFF - Low Surrogate
354	NS_ERROR("got a low Surrogate but no high surrogate");
355	// output nothing.
356	}
357	}
358
359	mBuffer = out;
360	return N;
361	}
362
363	void write_terminator()
364	{
365	*mBuffer = buffer_type(0);
366	}
367
368	private:
369	buffer_type* const mStart;
370	buffer_type* mBuffer;
371	};
372
373	/**
374	* A character sink (see \|copy_string\| in nsAlgorithm.h) for computing
375	* the number of bytes a UTF-16 would occupy in UTF-8.
376	*/
377	class CalculateUTF8Size
378	{
379	public:
380	typedef nsAString::char_type value_type;
381
382	CalculateUTF8Size()
383	: mSize(0) { }
384
385	size_t Size() const { return mSize; }
386
387	PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
388	{
389	// Assume UCS2 surrogate pairs won't be spread across fragments.
390	for (const value_type p = start, end = start + N; p < end; ++p )
391	{
392	value_type c = *p;
393	if (! (c & 0xFF80)) // U+0000 - U+007F
394	mSize += 1;
395	else if (! (c & 0xF800)) // U+0100 - U+07FF
396	mSize += 2;
397	else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
398	mSize += 3;
399	else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
400	{
401	++p;
402	if (p == end)
403	{
404	NS_ERROR("Surrogate pair split between fragments");
405	return N;
406	}
407	c = *p;
408
409	if (0xDC00 == (0xFC00 & c))
410	mSize += 4;
411	else
412	NS_ERROR("got a high Surrogate but no low surrogate");
413	}
414	else // U+DC00 - U+DFFF
415	NS_ERROR("got a low Surrogate but no high surrogate");
416	}
417
418	return N;
419	}
420
421	private:
422	size_t mSize;
423	};
424
425	/**
426	* A character sink that performs a \|reinterpret_cast\| style conversion
427	* between character types.
428	*/
429	template <class FromCharT, class ToCharT>
430	class LossyConvertEncoding
431	{
432	public:
433	typedef FromCharT value_type;
434
435	typedef FromCharT input_type;
436	typedef ToCharT output_type;
437
438	typedef typename nsCharTraits<FromCharT>::unsigned_char_type unsigned_input_type;
439
440	public:
441	LossyConvertEncoding( output_type* aDestination ) : mDestination(aDestination) { }
442
443	PRUint32
444	write( const input_type* aSource, PRUint32 aSourceLength )
445	{
446	const input_type* done_writing = aSource + aSourceLength;
447	while ( aSource < done_writing )
448	mDestination++ = (output_type)(unsigned_input_type)(aSource++); // use old-style cast to mimic old \|ns[C]String\| behavior
449	return aSourceLength;
450	}
451
452	void
453	write_terminator()
454	{
455	*mDestination = output_type(0);
456	}
457
458	private:
459	output_type* mDestination;
460	};
461
462	#endif /* !defined(nsUTF8Utils_h_) */

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h@ 85855

Download in other formats: