1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
---|
2 | /* ***** BEGIN LICENSE BLOCK *****
|
---|
3 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
---|
4 | *
|
---|
5 | * The contents of this file are subject to the Mozilla Public License Version
|
---|
6 | * 1.1 (the "License"); you may not use this file except in compliance with
|
---|
7 | * the License. You may obtain a copy of the License at
|
---|
8 | * http://www.mozilla.org/MPL/
|
---|
9 | *
|
---|
10 | * Software distributed under the License is distributed on an "AS IS" basis,
|
---|
11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
---|
12 | * for the specific language governing rights and limitations under the
|
---|
13 | * License.
|
---|
14 | *
|
---|
15 | * The Original Code is mozilla.org code.
|
---|
16 | *
|
---|
17 | * The Initial Developer of the Original Code is
|
---|
18 | * Netscape Communications Corporation.
|
---|
19 | * Portions created by the Initial Developer are Copyright (C) 2001
|
---|
20 | * the Initial Developer. All Rights Reserved.
|
---|
21 | *
|
---|
22 | * Contributor(s):
|
---|
23 | * Peter Annema <[email protected]> (original author)
|
---|
24 | *
|
---|
25 | * Alternatively, the contents of this file may be used under the terms of
|
---|
26 | * either of the GNU General Public License Version 2 or later (the "GPL"),
|
---|
27 | * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
---|
28 | * in which case the provisions of the GPL or the LGPL are applicable instead
|
---|
29 | * of those above. If you wish to allow use of your version of this file only
|
---|
30 | * under the terms of either the GPL or the LGPL, and not to allow others to
|
---|
31 | * use your version of this file under the terms of the MPL, indicate your
|
---|
32 | * decision by deleting the provisions above and replace them with the notice
|
---|
33 | * and other provisions required by the GPL or the LGPL. If you do not delete
|
---|
34 | * the provisions above, a recipient may use your version of this file under
|
---|
35 | * the terms of any one of the MPL, the GPL or the LGPL.
|
---|
36 | *
|
---|
37 | * ***** END LICENSE BLOCK ***** */
|
---|
38 |
|
---|
39 | #ifndef nsUTF8Utils_h_
|
---|
40 | #define nsUTF8Utils_h_
|
---|
41 |
|
---|
42 | class UTF8traits
|
---|
43 | {
|
---|
44 | public:
|
---|
45 | static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
|
---|
46 | static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
|
---|
47 | static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; }
|
---|
48 | static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; }
|
---|
49 | static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; }
|
---|
50 | static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; }
|
---|
51 | static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
|
---|
52 | };
|
---|
53 |
|
---|
// First code point above the 16-bit range; code points at or above this
// value are written to UTF-16 as a surrogate pair.
#define PLANE1_BASE 0x00010000
// Code unit written to UTF-16 output in place of input that cannot be
// represented (overlong forms, surrogates, out-of-range values, ...).
#define UCS2_REPLACEMENT_CHAR 0xfffd

// NS_ALWAYS_INLINE expands to GCC's always_inline attribute when building
// with a GNU-compatible compiler, and to nothing everywhere else.
#if defined(__GNUC__)
#define NS_ALWAYS_INLINE __attribute__((always_inline))
#else
#define NS_ALWAYS_INLINE
#endif
62 |
|
---|
63 | /**
|
---|
64 | * A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
---|
65 | * UTF-8 to UTF-16
|
---|
66 | */
|
---|
67 | class ConvertUTF8toUTF16
|
---|
68 | {
|
---|
69 | public:
|
---|
70 | typedef nsACString::char_type value_type;
|
---|
71 | typedef nsAString::char_type buffer_type;
|
---|
72 |
|
---|
73 | ConvertUTF8toUTF16( buffer_type* aBuffer )
|
---|
74 | : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
|
---|
75 |
|
---|
76 | size_t Length() const { return mBuffer - mStart; }
|
---|
77 |
|
---|
78 | PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
|
---|
79 | {
|
---|
80 | if ( mErrorEncountered )
|
---|
81 | return N;
|
---|
82 |
|
---|
83 | // algorithm assumes utf8 units won't
|
---|
84 | // be spread across fragments
|
---|
85 | const value_type* p = start;
|
---|
86 | const value_type* end = start + N;
|
---|
87 | buffer_type* out = mBuffer;
|
---|
88 | for ( ; p != end /* && *p */; )
|
---|
89 | {
|
---|
90 | char c = *p++;
|
---|
91 |
|
---|
92 | if ( UTF8traits::isASCII(c) )
|
---|
93 | {
|
---|
94 | *out++ = buffer_type(c);
|
---|
95 | continue;
|
---|
96 | }
|
---|
97 |
|
---|
98 | PRUint32 ucs4;
|
---|
99 | PRUint32 minUcs4;
|
---|
100 | PRInt32 state = 0;
|
---|
101 |
|
---|
102 | if ( UTF8traits::is2byte(c) )
|
---|
103 | {
|
---|
104 | ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
|
---|
105 | state = 1;
|
---|
106 | minUcs4 = 0x00000080;
|
---|
107 | }
|
---|
108 | else if ( UTF8traits::is3byte(c) )
|
---|
109 | {
|
---|
110 | ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
|
---|
111 | state = 2;
|
---|
112 | minUcs4 = 0x00000800;
|
---|
113 | }
|
---|
114 | else if ( UTF8traits::is4byte(c) )
|
---|
115 | {
|
---|
116 | ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
|
---|
117 | state = 3;
|
---|
118 | minUcs4 = 0x00010000;
|
---|
119 | }
|
---|
120 | else if ( UTF8traits::is5byte(c) )
|
---|
121 | {
|
---|
122 | ucs4 = (PRUint32(c) << 24) & 0x03000000L;
|
---|
123 | state = 4;
|
---|
124 | minUcs4 = 0x00200000;
|
---|
125 | }
|
---|
126 | else if ( UTF8traits::is6byte(c) )
|
---|
127 | {
|
---|
128 | ucs4 = (PRUint32(c) << 30) & 0x40000000L;
|
---|
129 | state = 5;
|
---|
130 | minUcs4 = 0x04000000;
|
---|
131 | }
|
---|
132 | else
|
---|
133 | {
|
---|
134 | NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
|
---|
135 | mErrorEncountered = PR_TRUE;
|
---|
136 | mBuffer = out;
|
---|
137 | return N;
|
---|
138 | }
|
---|
139 |
|
---|
140 | while ( state-- )
|
---|
141 | {
|
---|
142 | c = *p++;
|
---|
143 |
|
---|
144 | if ( UTF8traits::isInSeq(c) )
|
---|
145 | {
|
---|
146 | PRInt32 shift = state * 6;
|
---|
147 | ucs4 |= (PRUint32(c) & 0x3F) << shift;
|
---|
148 | }
|
---|
149 | else
|
---|
150 | {
|
---|
151 | NS_ERROR("not a UTF8 string");
|
---|
152 | mErrorEncountered = PR_TRUE;
|
---|
153 | mBuffer = out;
|
---|
154 | return N;
|
---|
155 | }
|
---|
156 | }
|
---|
157 |
|
---|
158 | if ( ucs4 < minUcs4 )
|
---|
159 | {
|
---|
160 | // Overlong sequence
|
---|
161 | *out++ = UCS2_REPLACEMENT_CHAR;
|
---|
162 | }
|
---|
163 | else if ( ucs4 <= 0xD7FF )
|
---|
164 | {
|
---|
165 | *out++ = ucs4;
|
---|
166 | }
|
---|
167 | else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
|
---|
168 | {
|
---|
169 | // Surrogates
|
---|
170 | *out++ = UCS2_REPLACEMENT_CHAR;
|
---|
171 | }
|
---|
172 | else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
|
---|
173 | {
|
---|
174 | // Prohibited characters
|
---|
175 | *out++ = UCS2_REPLACEMENT_CHAR;
|
---|
176 | }
|
---|
177 | else if ( ucs4 >= PLANE1_BASE )
|
---|
178 | {
|
---|
179 | if ( ucs4 >= 0x00110000 )
|
---|
180 | *out++ = UCS2_REPLACEMENT_CHAR;
|
---|
181 | else {
|
---|
182 | // surrogate, see unicode specification 3.7 for following math.
|
---|
183 | ucs4 -= PLANE1_BASE;
|
---|
184 | *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
|
---|
185 | *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
|
---|
186 | }
|
---|
187 | }
|
---|
188 | else
|
---|
189 | {
|
---|
190 | *out++ = ucs4;
|
---|
191 | }
|
---|
192 | }
|
---|
193 | mBuffer = out;
|
---|
194 | return p - start;
|
---|
195 | }
|
---|
196 |
|
---|
197 | void write_terminator()
|
---|
198 | {
|
---|
199 | *mBuffer = buffer_type(0);
|
---|
200 | }
|
---|
201 |
|
---|
202 | private:
|
---|
203 | buffer_type* const mStart;
|
---|
204 | buffer_type* mBuffer;
|
---|
205 | PRBool mErrorEncountered;
|
---|
206 | };
|
---|
207 |
|
---|
208 | /**
|
---|
209 | * A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
---|
210 | * the length of the UTF-16 string equivalent to a UTF-8 string.
|
---|
211 | */
|
---|
212 | class CalculateUTF8Length
|
---|
213 | {
|
---|
214 | public:
|
---|
215 | typedef nsACString::char_type value_type;
|
---|
216 |
|
---|
217 | CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
|
---|
218 |
|
---|
219 | size_t Length() const { return mLength; }
|
---|
220 |
|
---|
221 | PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
|
---|
222 | {
|
---|
223 | // ignore any further requests
|
---|
224 | if ( mErrorEncountered )
|
---|
225 | return N;
|
---|
226 |
|
---|
227 | // algorithm assumes utf8 units won't
|
---|
228 | // be spread across fragments
|
---|
229 | const value_type* p = start;
|
---|
230 | const value_type* end = start + N;
|
---|
231 | for ( ; p < end /* && *p */; ++mLength )
|
---|
232 | {
|
---|
233 | if ( UTF8traits::isASCII(*p) )
|
---|
234 | p += 1;
|
---|
235 | else if ( UTF8traits::is2byte(*p) )
|
---|
236 | p += 2;
|
---|
237 | else if ( UTF8traits::is3byte(*p) )
|
---|
238 | p += 3;
|
---|
239 | else if ( UTF8traits::is4byte(*p) ) {
|
---|
240 | p += 4;
|
---|
241 | // Because a UTF-8 sequence of 4 bytes represents a codepoint
|
---|
242 | // greater than 0xFFFF, it will become a surrogate pair in the
|
---|
243 | // UTF-16 string, so add 1 more to mLength.
|
---|
244 | // This doesn't happen with is5byte and is6byte because they
|
---|
245 | // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
|
---|
246 | // converted to a single replacement character.
|
---|
247 | //
|
---|
248 | // XXX: if the 4-byte sequence is an illegal non-shortest form,
|
---|
249 | // it also gets converted to a replacement character, so
|
---|
250 | // mLength will be off by one in this case.
|
---|
251 | ++mLength;
|
---|
252 | }
|
---|
253 | else if ( UTF8traits::is5byte(*p) )
|
---|
254 | p += 5;
|
---|
255 | else if ( UTF8traits::is6byte(*p) )
|
---|
256 | p += 6;
|
---|
257 | else
|
---|
258 | {
|
---|
259 | break;
|
---|
260 | }
|
---|
261 | }
|
---|
262 | if ( p != end )
|
---|
263 | {
|
---|
264 | NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
|
---|
265 | mErrorEncountered = PR_TRUE;
|
---|
266 | mLength = 0;
|
---|
267 | return N;
|
---|
268 | }
|
---|
269 | return p - start;
|
---|
270 | }
|
---|
271 |
|
---|
272 | private:
|
---|
273 | size_t mLength;
|
---|
274 | PRBool mErrorEncountered;
|
---|
275 | };
|
---|
276 |
|
---|
277 | /**
|
---|
278 | * A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
---|
279 | * UTF-16 to UTF-8.
|
---|
280 | */
|
---|
281 | class ConvertUTF16toUTF8
|
---|
282 | {
|
---|
283 | public:
|
---|
284 | typedef nsAString::char_type value_type;
|
---|
285 | typedef nsACString::char_type buffer_type;
|
---|
286 |
|
---|
287 | // The error handling here is more lenient than that in
|
---|
288 | // |ConvertUTF8toUTF16|, but it's that way for backwards
|
---|
289 | // compatibility.
|
---|
290 |
|
---|
291 | ConvertUTF16toUTF8( buffer_type* aBuffer )
|
---|
292 | : mStart(aBuffer), mBuffer(aBuffer) {}
|
---|
293 |
|
---|
294 | size_t Size() const { return mBuffer - mStart; }
|
---|
295 |
|
---|
296 | PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
|
---|
297 | {
|
---|
298 | buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
|
---|
299 |
|
---|
300 | for (const value_type *p = start, *end = start + N; p < end; ++p )
|
---|
301 | {
|
---|
302 | value_type c = *p;
|
---|
303 | if (! (c & 0xFF80)) // U+0000 - U+007F
|
---|
304 | {
|
---|
305 | *out++ = (char)c;
|
---|
306 | }
|
---|
307 | else if (! (c & 0xF800)) // U+0100 - U+07FF
|
---|
308 | {
|
---|
309 | *out++ = 0xC0 | (char)(c >> 6);
|
---|
310 | *out++ = 0x80 | (char)(0x003F & c);
|
---|
311 | }
|
---|
312 | else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
|
---|
313 | {
|
---|
314 | *out++ = 0xE0 | (char)(c >> 12);
|
---|
315 | *out++ = 0x80 | (char)(0x003F & (c >> 6));
|
---|
316 | *out++ = 0x80 | (char)(0x003F & c );
|
---|
317 | }
|
---|
318 | else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
---|
319 | {
|
---|
320 | // D800- DBFF - High Surrogate
|
---|
321 | // N = (H- D800) *400 + 10000 + ...
|
---|
322 | PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
|
---|
323 |
|
---|
324 | ++p;
|
---|
325 | if (p == end)
|
---|
326 | {
|
---|
327 | NS_ERROR("Surrogate pair split between fragments");
|
---|
328 | mBuffer = out;
|
---|
329 | return N;
|
---|
330 | }
|
---|
331 | c = *p;
|
---|
332 |
|
---|
333 | if (0xDC00 == (0xFC00 & c))
|
---|
334 | {
|
---|
335 | // DC00- DFFF - Low Surrogate
|
---|
336 | // N += ( L - DC00 )
|
---|
337 | ucs4 |= (0x03FF & c);
|
---|
338 |
|
---|
339 | // 0001 0000-001F FFFF
|
---|
340 | *out++ = 0xF0 | (char)(ucs4 >> 18);
|
---|
341 | *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
|
---|
342 | *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
|
---|
343 | *out++ = 0x80 | (char)(0x003F & ucs4);
|
---|
344 | }
|
---|
345 | else
|
---|
346 | {
|
---|
347 | NS_ERROR("got a High Surrogate but no low surrogate");
|
---|
348 | // output nothing.
|
---|
349 | }
|
---|
350 | }
|
---|
351 | else // U+DC00 - U+DFFF
|
---|
352 | {
|
---|
353 | // DC00- DFFF - Low Surrogate
|
---|
354 | NS_ERROR("got a low Surrogate but no high surrogate");
|
---|
355 | // output nothing.
|
---|
356 | }
|
---|
357 | }
|
---|
358 |
|
---|
359 | mBuffer = out;
|
---|
360 | return N;
|
---|
361 | }
|
---|
362 |
|
---|
363 | void write_terminator()
|
---|
364 | {
|
---|
365 | *mBuffer = buffer_type(0);
|
---|
366 | }
|
---|
367 |
|
---|
368 | private:
|
---|
369 | buffer_type* const mStart;
|
---|
370 | buffer_type* mBuffer;
|
---|
371 | };
|
---|
372 |
|
---|
373 | /**
|
---|
374 | * A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
---|
375 | * the number of bytes a UTF-16 would occupy in UTF-8.
|
---|
376 | */
|
---|
377 | class CalculateUTF8Size
|
---|
378 | {
|
---|
379 | public:
|
---|
380 | typedef nsAString::char_type value_type;
|
---|
381 |
|
---|
382 | CalculateUTF8Size()
|
---|
383 | : mSize(0) { }
|
---|
384 |
|
---|
385 | size_t Size() const { return mSize; }
|
---|
386 |
|
---|
387 | PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
|
---|
388 | {
|
---|
389 | // Assume UCS2 surrogate pairs won't be spread across fragments.
|
---|
390 | for (const value_type *p = start, *end = start + N; p < end; ++p )
|
---|
391 | {
|
---|
392 | value_type c = *p;
|
---|
393 | if (! (c & 0xFF80)) // U+0000 - U+007F
|
---|
394 | mSize += 1;
|
---|
395 | else if (! (c & 0xF800)) // U+0100 - U+07FF
|
---|
396 | mSize += 2;
|
---|
397 | else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
|
---|
398 | mSize += 3;
|
---|
399 | else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
---|
400 | {
|
---|
401 | ++p;
|
---|
402 | if (p == end)
|
---|
403 | {
|
---|
404 | NS_ERROR("Surrogate pair split between fragments");
|
---|
405 | return N;
|
---|
406 | }
|
---|
407 | c = *p;
|
---|
408 |
|
---|
409 | if (0xDC00 == (0xFC00 & c))
|
---|
410 | mSize += 4;
|
---|
411 | else
|
---|
412 | NS_ERROR("got a high Surrogate but no low surrogate");
|
---|
413 | }
|
---|
414 | else // U+DC00 - U+DFFF
|
---|
415 | NS_ERROR("got a low Surrogate but no high surrogate");
|
---|
416 | }
|
---|
417 |
|
---|
418 | return N;
|
---|
419 | }
|
---|
420 |
|
---|
421 | private:
|
---|
422 | size_t mSize;
|
---|
423 | };
|
---|
424 |
|
---|
425 | /**
|
---|
426 | * A character sink that performs a |reinterpret_cast| style conversion
|
---|
427 | * between character types.
|
---|
428 | */
|
---|
429 | template <class FromCharT, class ToCharT>
|
---|
430 | class LossyConvertEncoding
|
---|
431 | {
|
---|
432 | public:
|
---|
433 | typedef FromCharT value_type;
|
---|
434 |
|
---|
435 | typedef FromCharT input_type;
|
---|
436 | typedef ToCharT output_type;
|
---|
437 |
|
---|
438 | typedef typename nsCharTraits<FromCharT>::unsigned_char_type unsigned_input_type;
|
---|
439 |
|
---|
440 | public:
|
---|
441 | LossyConvertEncoding( output_type* aDestination ) : mDestination(aDestination) { }
|
---|
442 |
|
---|
443 | PRUint32
|
---|
444 | write( const input_type* aSource, PRUint32 aSourceLength )
|
---|
445 | {
|
---|
446 | const input_type* done_writing = aSource + aSourceLength;
|
---|
447 | while ( aSource < done_writing )
|
---|
448 | *mDestination++ = (output_type)(unsigned_input_type)(*aSource++); // use old-style cast to mimic old |ns[C]String| behavior
|
---|
449 | return aSourceLength;
|
---|
450 | }
|
---|
451 |
|
---|
452 | void
|
---|
453 | write_terminator()
|
---|
454 | {
|
---|
455 | *mDestination = output_type(0);
|
---|
456 | }
|
---|
457 |
|
---|
458 | private:
|
---|
459 | output_type* mDestination;
|
---|
460 | };
|
---|
461 |
|
---|
462 | #endif /* !defined(nsUTF8Utils_h_) */
|
---|