VirtualBox

source: vbox/trunk/src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h@ 85855

Last change on this file since 85855 was 1, checked in by vboxsync, 55 years ago

import

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
Line 
1/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2/* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 *
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
14 *
15 * The Original Code is mozilla.org code.
16 *
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 2001
20 * the Initial Developer. All Rights Reserved.
21 *
22 * Contributor(s):
23 * Peter Annema <[email protected]> (original author)
24 *
25 * Alternatively, the contents of this file may be used under the terms of
26 * either of the GNU General Public License Version 2 or later (the "GPL"),
27 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
36 *
37 * ***** END LICENSE BLOCK ***** */
38
39#ifndef nsUTF8Utils_h_
40#define nsUTF8Utils_h_
41
42class UTF8traits
43 {
44 public:
45 static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
46 static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
47 static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; }
48 static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; }
49 static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; }
50 static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; }
51 static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
52 };
53
// Lowest code point that needs a UTF-16 surrogate pair.
#define PLANE1_BASE 0x00010000
// Code unit written in place of input that cannot be represented.
#define UCS2_REPLACEMENT_CHAR 0xfffd

// Function attribute that forces inlining on GCC-compatible compilers;
// expands to nothing elsewhere.
#if defined(__GNUC__)
#define NS_ALWAYS_INLINE __attribute__((always_inline))
#else
#define NS_ALWAYS_INLINE
#endif
62
63/**
64 * A character sink (see |copy_string| in nsAlgorithm.h) for converting
65 * UTF-8 to UTF-16
66 */
67class ConvertUTF8toUTF16
68 {
69 public:
70 typedef nsACString::char_type value_type;
71 typedef nsAString::char_type buffer_type;
72
73 ConvertUTF8toUTF16( buffer_type* aBuffer )
74 : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
75
76 size_t Length() const { return mBuffer - mStart; }
77
78 PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
79 {
80 if ( mErrorEncountered )
81 return N;
82
83 // algorithm assumes utf8 units won't
84 // be spread across fragments
85 const value_type* p = start;
86 const value_type* end = start + N;
87 buffer_type* out = mBuffer;
88 for ( ; p != end /* && *p */; )
89 {
90 char c = *p++;
91
92 if ( UTF8traits::isASCII(c) )
93 {
94 *out++ = buffer_type(c);
95 continue;
96 }
97
98 PRUint32 ucs4;
99 PRUint32 minUcs4;
100 PRInt32 state = 0;
101
102 if ( UTF8traits::is2byte(c) )
103 {
104 ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
105 state = 1;
106 minUcs4 = 0x00000080;
107 }
108 else if ( UTF8traits::is3byte(c) )
109 {
110 ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
111 state = 2;
112 minUcs4 = 0x00000800;
113 }
114 else if ( UTF8traits::is4byte(c) )
115 {
116 ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
117 state = 3;
118 minUcs4 = 0x00010000;
119 }
120 else if ( UTF8traits::is5byte(c) )
121 {
122 ucs4 = (PRUint32(c) << 24) & 0x03000000L;
123 state = 4;
124 minUcs4 = 0x00200000;
125 }
126 else if ( UTF8traits::is6byte(c) )
127 {
128 ucs4 = (PRUint32(c) << 30) & 0x40000000L;
129 state = 5;
130 minUcs4 = 0x04000000;
131 }
132 else
133 {
134 NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
135 mErrorEncountered = PR_TRUE;
136 mBuffer = out;
137 return N;
138 }
139
140 while ( state-- )
141 {
142 c = *p++;
143
144 if ( UTF8traits::isInSeq(c) )
145 {
146 PRInt32 shift = state * 6;
147 ucs4 |= (PRUint32(c) & 0x3F) << shift;
148 }
149 else
150 {
151 NS_ERROR("not a UTF8 string");
152 mErrorEncountered = PR_TRUE;
153 mBuffer = out;
154 return N;
155 }
156 }
157
158 if ( ucs4 < minUcs4 )
159 {
160 // Overlong sequence
161 *out++ = UCS2_REPLACEMENT_CHAR;
162 }
163 else if ( ucs4 <= 0xD7FF )
164 {
165 *out++ = ucs4;
166 }
167 else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
168 {
169 // Surrogates
170 *out++ = UCS2_REPLACEMENT_CHAR;
171 }
172 else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
173 {
174 // Prohibited characters
175 *out++ = UCS2_REPLACEMENT_CHAR;
176 }
177 else if ( ucs4 >= PLANE1_BASE )
178 {
179 if ( ucs4 >= 0x00110000 )
180 *out++ = UCS2_REPLACEMENT_CHAR;
181 else {
182 // surrogate, see unicode specification 3.7 for following math.
183 ucs4 -= PLANE1_BASE;
184 *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
185 *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
186 }
187 }
188 else
189 {
190 *out++ = ucs4;
191 }
192 }
193 mBuffer = out;
194 return p - start;
195 }
196
197 void write_terminator()
198 {
199 *mBuffer = buffer_type(0);
200 }
201
202 private:
203 buffer_type* const mStart;
204 buffer_type* mBuffer;
205 PRBool mErrorEncountered;
206 };
207
208/**
209 * A character sink (see |copy_string| in nsAlgorithm.h) for computing
210 * the length of the UTF-16 string equivalent to a UTF-8 string.
211 */
212class CalculateUTF8Length
213 {
214 public:
215 typedef nsACString::char_type value_type;
216
217 CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
218
219 size_t Length() const { return mLength; }
220
221 PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
222 {
223 // ignore any further requests
224 if ( mErrorEncountered )
225 return N;
226
227 // algorithm assumes utf8 units won't
228 // be spread across fragments
229 const value_type* p = start;
230 const value_type* end = start + N;
231 for ( ; p < end /* && *p */; ++mLength )
232 {
233 if ( UTF8traits::isASCII(*p) )
234 p += 1;
235 else if ( UTF8traits::is2byte(*p) )
236 p += 2;
237 else if ( UTF8traits::is3byte(*p) )
238 p += 3;
239 else if ( UTF8traits::is4byte(*p) ) {
240 p += 4;
241 // Because a UTF-8 sequence of 4 bytes represents a codepoint
242 // greater than 0xFFFF, it will become a surrogate pair in the
243 // UTF-16 string, so add 1 more to mLength.
244 // This doesn't happen with is5byte and is6byte because they
245 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
246 // converted to a single replacement character.
247 //
248 // XXX: if the 4-byte sequence is an illegal non-shortest form,
249 // it also gets converted to a replacement character, so
250 // mLength will be off by one in this case.
251 ++mLength;
252 }
253 else if ( UTF8traits::is5byte(*p) )
254 p += 5;
255 else if ( UTF8traits::is6byte(*p) )
256 p += 6;
257 else
258 {
259 break;
260 }
261 }
262 if ( p != end )
263 {
264 NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
265 mErrorEncountered = PR_TRUE;
266 mLength = 0;
267 return N;
268 }
269 return p - start;
270 }
271
272 private:
273 size_t mLength;
274 PRBool mErrorEncountered;
275 };
276
277/**
278 * A character sink (see |copy_string| in nsAlgorithm.h) for converting
279 * UTF-16 to UTF-8.
280 */
281class ConvertUTF16toUTF8
282 {
283 public:
284 typedef nsAString::char_type value_type;
285 typedef nsACString::char_type buffer_type;
286
287 // The error handling here is more lenient than that in
288 // |ConvertUTF8toUTF16|, but it's that way for backwards
289 // compatibility.
290
291 ConvertUTF16toUTF8( buffer_type* aBuffer )
292 : mStart(aBuffer), mBuffer(aBuffer) {}
293
294 size_t Size() const { return mBuffer - mStart; }
295
296 PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
297 {
298 buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
299
300 for (const value_type *p = start, *end = start + N; p < end; ++p )
301 {
302 value_type c = *p;
303 if (! (c & 0xFF80)) // U+0000 - U+007F
304 {
305 *out++ = (char)c;
306 }
307 else if (! (c & 0xF800)) // U+0100 - U+07FF
308 {
309 *out++ = 0xC0 | (char)(c >> 6);
310 *out++ = 0x80 | (char)(0x003F & c);
311 }
312 else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
313 {
314 *out++ = 0xE0 | (char)(c >> 12);
315 *out++ = 0x80 | (char)(0x003F & (c >> 6));
316 *out++ = 0x80 | (char)(0x003F & c );
317 }
318 else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
319 {
320 // D800- DBFF - High Surrogate
321 // N = (H- D800) *400 + 10000 + ...
322 PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
323
324 ++p;
325 if (p == end)
326 {
327 NS_ERROR("Surrogate pair split between fragments");
328 mBuffer = out;
329 return N;
330 }
331 c = *p;
332
333 if (0xDC00 == (0xFC00 & c))
334 {
335 // DC00- DFFF - Low Surrogate
336 // N += ( L - DC00 )
337 ucs4 |= (0x03FF & c);
338
339 // 0001 0000-001F FFFF
340 *out++ = 0xF0 | (char)(ucs4 >> 18);
341 *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
342 *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
343 *out++ = 0x80 | (char)(0x003F & ucs4);
344 }
345 else
346 {
347 NS_ERROR("got a High Surrogate but no low surrogate");
348 // output nothing.
349 }
350 }
351 else // U+DC00 - U+DFFF
352 {
353 // DC00- DFFF - Low Surrogate
354 NS_ERROR("got a low Surrogate but no high surrogate");
355 // output nothing.
356 }
357 }
358
359 mBuffer = out;
360 return N;
361 }
362
363 void write_terminator()
364 {
365 *mBuffer = buffer_type(0);
366 }
367
368 private:
369 buffer_type* const mStart;
370 buffer_type* mBuffer;
371 };
372
373/**
374 * A character sink (see |copy_string| in nsAlgorithm.h) for computing
375 * the number of bytes a UTF-16 would occupy in UTF-8.
376 */
377class CalculateUTF8Size
378 {
379 public:
380 typedef nsAString::char_type value_type;
381
382 CalculateUTF8Size()
383 : mSize(0) { }
384
385 size_t Size() const { return mSize; }
386
387 PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
388 {
389 // Assume UCS2 surrogate pairs won't be spread across fragments.
390 for (const value_type *p = start, *end = start + N; p < end; ++p )
391 {
392 value_type c = *p;
393 if (! (c & 0xFF80)) // U+0000 - U+007F
394 mSize += 1;
395 else if (! (c & 0xF800)) // U+0100 - U+07FF
396 mSize += 2;
397 else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
398 mSize += 3;
399 else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
400 {
401 ++p;
402 if (p == end)
403 {
404 NS_ERROR("Surrogate pair split between fragments");
405 return N;
406 }
407 c = *p;
408
409 if (0xDC00 == (0xFC00 & c))
410 mSize += 4;
411 else
412 NS_ERROR("got a high Surrogate but no low surrogate");
413 }
414 else // U+DC00 - U+DFFF
415 NS_ERROR("got a low Surrogate but no high surrogate");
416 }
417
418 return N;
419 }
420
421 private:
422 size_t mSize;
423 };
424
425/**
426 * A character sink that performs a |reinterpret_cast| style conversion
427 * between character types.
428 */
429template <class FromCharT, class ToCharT>
430class LossyConvertEncoding
431 {
432 public:
433 typedef FromCharT value_type;
434
435 typedef FromCharT input_type;
436 typedef ToCharT output_type;
437
438 typedef typename nsCharTraits<FromCharT>::unsigned_char_type unsigned_input_type;
439
440 public:
441 LossyConvertEncoding( output_type* aDestination ) : mDestination(aDestination) { }
442
443 PRUint32
444 write( const input_type* aSource, PRUint32 aSourceLength )
445 {
446 const input_type* done_writing = aSource + aSourceLength;
447 while ( aSource < done_writing )
448 *mDestination++ = (output_type)(unsigned_input_type)(*aSource++); // use old-style cast to mimic old |ns[C]String| behavior
449 return aSourceLength;
450 }
451
452 void
453 write_terminator()
454 {
455 *mDestination = output_type(0);
456 }
457
458 private:
459 output_type* mDestination;
460 };
461
462#endif /* !defined(nsUTF8Utils_h_) */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette