utf8.c@ 22496

Last change on this file since 22496 was 22496, checked in by vboxsync, 15 years ago
crOpenGL: update wine to 1.1.27 and better fix for depthstencil surface refcounting
Property svn:eol-style set to `native`
File size: 10.5 KB

Line
1	/*
2	* UTF-8 support routines
3	*
4	* Copyright 2000 Alexandre Julliard
5	*
6	* This library is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Lesser General Public
8	* License as published by the Free Software Foundation; either
9	* version 2.1 of the License, or (at your option) any later version.
10	*
11	* This library is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Lesser General Public License for more details.
15	*
16	* You should have received a copy of the GNU Lesser General Public
17	* License along with this library; if not, write to the Free Software
18	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19	*/
20
21	/*
22	* Sun LGPL Disclaimer: For the avoidance of doubt, except that if any license choice
23	* other than GPL or LGPL is available it will apply instead, Sun elects to use only
24	* the Lesser General Public License version 2.1 (LGPLv2) at this time for any software where
25	* a choice of LGPL license versions is made available with the language indicating
26	* that LGPLv2 or any later version may be used, or where a choice of which version
27	* of the LGPL is applied is otherwise unspecified.
28	*/
29
30	#include "config.h"
31	#include "wine/port.h"
32
33	#include <string.h>
34
35	#include "wine/unicode.h"
36
37	extern WCHAR compose( const WCHAR *str );
38
/*
 * Number of continuation bytes that follow a given lead byte.
 * The table only covers bytes above 0x7f and is indexed with (byte - 0x80).
 * A value of 0 marks a byte that cannot start a multi-byte sequence:
 * 0x80-0xbf, 0xc0-0xc1 and 0xf5-0xff.
 * Stored as unsigned char, like utf8_mask, so the element type does not
 * depend on the platform's signedness of plain char.
 */
static const unsigned char utf8_length[128] =
{
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
    0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
    3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0  /* 0xf0-0xff */
};

/* mask selecting the payload bits of the first byte, indexed by the number of following bytes */
static const unsigned char utf8_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };

/* smallest Unicode value allowed for a sequence, indexed by the number of following bytes;
 * decoded values below this limit are rejected by the decoder */
static const unsigned int utf8_minval[4] = { 0x0, 0x80, 0x800, 0x10000 };
57
58
59	/* get the next char value taking surrogates into account */
60	static inline unsigned int get_surrogate_value( const WCHAR *src, unsigned int srclen )
61	{
62	if (src[0] >= 0xd800 && src[0] <= 0xdfff) /* surrogate pair */
63	{
64	if (src[0] > 0xdbff \|\| /* invalid high surrogate */
65	srclen <= 1 \|\| /* missing low surrogate */
66	src[1] < 0xdc00 \|\| src[1] > 0xdfff) /* invalid low surrogate */
67	return 0;
68	return 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
69	}
70	return src[0];
71	}
72
73	/* query necessary dst length for src string */
74	static inline int get_length_wcs_utf8( int flags, const WCHAR *src, unsigned int srclen )
75	{
76	int len;
77	unsigned int val;
78
79	for (len = 0; srclen; srclen--, src++)
80	{
81	if (src < 0x80) / 0x00-0x7f: 1 byte */
82	{
83	len++;
84	continue;
85	}
86	if (src < 0x800) / 0x80-0x7ff: 2 bytes */
87	{
88	len += 2;
89	continue;
90	}
91	if (!(val = get_surrogate_value( src, srclen )))
92	{
93	if (flags & WC_ERR_INVALID_CHARS) return -2;
94	continue;
95	}
96	if (val < 0x10000) /* 0x800-0xffff: 3 bytes */
97	len += 3;
98	else /* 0x10000-0x10ffff: 4 bytes */
99	{
100	len += 4;
101	src++;
102	srclen--;
103	}
104	}
105	return len;
106	}
107
108	/* wide char to UTF-8 string conversion */
109	/* return -1 on dst buffer overflow, -2 on invalid input char */
110	int wine_utf8_wcstombs( int flags, const WCHAR src, int srclen, char dst, int dstlen )
111	{
112	int len;
113
114	if (!dstlen) return get_length_wcs_utf8( flags, src, srclen );
115
116	for (len = dstlen; srclen; srclen--, src++)
117	{
118	WCHAR ch = *src;
119	unsigned int val;
120
121	if (ch < 0x80) /* 0x00-0x7f: 1 byte */
122	{
123	if (!len--) return -1; /* overflow */
124	*dst++ = ch;
125	continue;
126	}
127
128	if (ch < 0x800) /* 0x80-0x7ff: 2 bytes */
129	{
130	if ((len -= 2) < 0) return -1; /* overflow */
131	dst[1] = 0x80 \| (ch & 0x3f);
132	ch >>= 6;
133	dst[0] = 0xc0 \| ch;
134	dst += 2;
135	continue;
136	}
137
138	if (!(val = get_surrogate_value( src, srclen )))
139	{
140	if (flags & WC_ERR_INVALID_CHARS) return -2;
141	continue;
142	}
143
144	if (val < 0x10000) /* 0x800-0xffff: 3 bytes */
145	{
146	if ((len -= 3) < 0) return -1; /* overflow */
147	dst[2] = 0x80 \| (val & 0x3f);
148	val >>= 6;
149	dst[1] = 0x80 \| (val & 0x3f);
150	val >>= 6;
151	dst[0] = 0xe0 \| val;
152	dst += 3;
153	}
154	else /* 0x10000-0x10ffff: 4 bytes */
155	{
156	if ((len -= 4) < 0) return -1; /* overflow */
157	dst[3] = 0x80 \| (val & 0x3f);
158	val >>= 6;
159	dst[2] = 0x80 \| (val & 0x3f);
160	val >>= 6;
161	dst[1] = 0x80 \| (val & 0x3f);
162	val >>= 6;
163	dst[0] = 0xf0 \| val;
164	dst += 4;
165	src++;
166	srclen--;
167	}
168	}
169	return dstlen - len;
170	}
171
172	/* helper for the various utf8 mbstowcs functions */
173	static inline unsigned int decode_utf8_char( unsigned char ch, const char *str, const char strend )
174	{
175	unsigned int len = utf8_length[ch-0x80];
176	unsigned int res = ch & utf8_mask[len];
177	const char end = str + len;
178
179	if (end > strend) return ~0;
180	switch(len)
181	{
182	case 3:
183	if ((ch = end[-3] ^ 0x80) >= 0x40) break;
184	res = (res << 6) \| ch;
185	(*str)++;
186	case 2:
187	if ((ch = end[-2] ^ 0x80) >= 0x40) break;
188	res = (res << 6) \| ch;
189	(*str)++;
190	case 1:
191	if ((ch = end[-1] ^ 0x80) >= 0x40) break;
192	res = (res << 6) \| ch;
193	(*str)++;
194	if (res < utf8_minval[len]) break;
195	return res;
196	}
197	return ~0;
198	}
199
200	/* query necessary dst length for src string with composition */
201	static inline int get_length_mbs_utf8_compose( int flags, const char *src, int srclen )
202	{
203	int ret = 0;
204	unsigned int res;
205	WCHAR composed[2];
206	const char *srcend = src + srclen;
207
208	composed[0] = 0;
209	while (src < srcend)
210	{
211	unsigned char ch = *src++;
212	if (ch < 0x80) /* special fast case for 7-bit ASCII */
213	{
214	composed[0] = ch;
215	ret++;
216	continue;
217	}
218	if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff)
219	{
220	if (composed[0])
221	{
222	composed[1] = res;
223	if ((composed[0] = compose( composed ))) continue;
224	}
225	composed[0] = res;
226	ret++;
227	}
228	else if (res <= 0x10ffff)
229	{
230	ret += 2;
231	composed[0] = 0; /* no composition for surrogates */
232	}
233	else if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
234	/* otherwise ignore it */
235	}
236	return ret;
237	}
238
239	/* UTF-8 to wide char string conversion with composition */
240	/* return -1 on dst buffer overflow, -2 on invalid input char */
241	static int utf8_mbstowcs_compose( int flags, const char src, int srclen, WCHAR dst, int dstlen )
242	{
243	unsigned int res;
244	const char *srcend = src + srclen;
245	WCHAR composed[2];
246	WCHAR *dstend = dst + dstlen;
247
248	if (!dstlen) return get_length_mbs_utf8_compose( flags, src, srclen );
249
250	composed[0] = 0;
251	while (src < srcend)
252	{
253	unsigned char ch = *src++;
254	if (ch < 0x80) /* special fast case for 7-bit ASCII */
255	{
256	if (dst >= dstend) return -1; /* overflow */
257	*dst++ = composed[0] = ch;
258	continue;
259	}
260	if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff)
261	{
262	if (composed[0])
263	{
264	composed[1] = res;
265	if ((composed[0] = compose( composed )))
266	{
267	dst[-1] = composed[0];
268	continue;
269	}
270	}
271	if (dst >= dstend) return -1; /* overflow */
272	*dst++ = composed[0] = res;
273	}
274	else if (res <= 0x10ffff) /* we need surrogates */
275	{
276	if (dst >= dstend - 1) return -1; /* overflow */
277	res -= 0x10000;
278	*dst++ = 0xd800 \| (res >> 10);
279	*dst++ = 0xdc00 \| (res & 0x3ff);
280	composed[0] = 0; /* no composition for surrogates */
281	}
282	else if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
283	/* otherwise ignore it */
284	}
285	return dstlen - (dstend - dst);
286	}
287
288	/* query necessary dst length for src string */
289	static inline int get_length_mbs_utf8( int flags, const char *src, int srclen )
290	{
291	int ret = 0;
292	unsigned int res;
293	const char *srcend = src + srclen;
294
295	while (src < srcend)
296	{
297	unsigned char ch = *src++;
298	if (ch < 0x80) /* special fast case for 7-bit ASCII */
299	{
300	ret++;
301	continue;
302	}
303	if ((res = decode_utf8_char( ch, &src, srcend )) <= 0x10ffff)
304	{
305	if (res > 0xffff) ret++;
306	ret++;
307	}
308	else if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
309	/* otherwise ignore it */
310	}
311	return ret;
312	}
313
314	/* UTF-8 to wide char string conversion */
315	/* return -1 on dst buffer overflow, -2 on invalid input char */
316	int wine_utf8_mbstowcs( int flags, const char src, int srclen, WCHAR dst, int dstlen )
317	{
318	unsigned int res;
319	const char *srcend = src + srclen;
320	WCHAR *dstend = dst + dstlen;
321
322	if (flags & MB_COMPOSITE) return utf8_mbstowcs_compose( flags, src, srclen, dst, dstlen );
323
324	if (!dstlen) return get_length_mbs_utf8( flags, src, srclen );
325
326	while ((dst < dstend) && (src < srcend))
327	{
328	unsigned char ch = *src++;
329	if (ch < 0x80) /* special fast case for 7-bit ASCII */
330	{
331	*dst++ = ch;
332	continue;
333	}
334	if ((res = decode_utf8_char( ch, &src, srcend )) <= 0xffff)
335	{
336	*dst++ = res;
337	}
338	else if (res <= 0x10ffff) /* we need surrogates */
339	{
340	if (dst == dstend - 1) return -1; /* overflow */
341	res -= 0x10000;
342	*dst++ = 0xd800 \| (res >> 10);
343	*dst++ = 0xdc00 \| (res & 0x3ff);
344	}
345	else if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
346	/* otherwise ignore it */
347	}
348	if (src < srcend) return -1; /* overflow */
349	return dstlen - (dstend - dst);
350	}

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Additions/WINNT/Graphics/Wine/libWine/utf8.c@ 22496

Download in other formats: