utf-8.cpp@ 21337

Last change on this file since 21337 was 21337, checked in by vboxsync, 16 years ago
IPRT,HostDrv,AddDrv: Export public IPRT symbols for the linux kernel (pain).
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 53.8 KB

Line
1	/* $Id: utf-8.cpp 21337 2009-07-07 14:58:27Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2007 Sun Microsystems, Inc.
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*
26	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27	* Clara, CA 95054 USA or visit http://www.sun.com if you need
28	* additional information or have any questions.
29	*/
30
31
32	/*******************************************************************************
33	* Header Files *
34	*******************************************************************************/
35	#include <iprt/string.h>
36	#include "internal/iprt.h"
37
38	#include <iprt/uni.h>
39	#include <iprt/alloc.h>
40	#include <iprt/assert.h>
41	#include <iprt/err.h>
42	#include "internal/string.h"
43
44
45
46	/**
47	* Get get length in code points of a UTF-8 encoded string.
48	* The string is validated while doing this.
49	*
50	* @returns IPRT status code.
51	* @param psz Pointer to the UTF-8 string.
52	* @param cch The max length of the string. (btw cch = cb)
53	* Use RTSTR_MAX if all of the string is to be examined.
54	* @param pcuc Where to store the length in unicode code points.
55	* @param pcchActual Where to store the actual size of the UTF-8 string
56	* on success (cch = cb again). Optional.
57	*/
58	static int rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
59	{
60	const unsigned char puch = (const unsigned char )psz;
61	size_t cCodePoints = 0;
62	while (cch > 0)
63	{
64	const unsigned char uch = *puch;
65	if (!uch)
66	break;
67	if (uch & RT_BIT(7))
68	{
69	/* figure sequence length and validate the first byte */
70	unsigned cb;
71	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
72	cb = 2;
73	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
74	cb = 3;
75	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
76	cb = 4;
77	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
78	cb = 5;
79	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
80	cb = 6;
81	else
82	{
83	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
84	return VERR_INVALID_UTF8_ENCODING;
85	}
86
87	/* check length */
88	if (cb > cch)
89	{
90	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
91	return VERR_INVALID_UTF8_ENCODING;
92	}
93
94	/* validate the rest */
95	switch (cb)
96	{
97	case 6:
98	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99	case 5:
100	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101	case 4:
102	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103	case 3:
104	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
105	case 2:
106	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
107	break;
108	}
109
110	/* validate the code point. */
111	RTUNICP uc;
112	switch (cb)
113	{
114	case 6:
115	uc = (puch[5] & 0x3f)
116	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
117	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
118	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
119	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
120	\| ((RTUNICP)(uch & 0x01) << 30);
121	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
122	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
123	break;
124	case 5:
125	uc = (puch[4] & 0x3f)
126	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
127	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
128	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
129	\| ((RTUNICP)(uch & 0x03) << 24);
130	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
131	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
132	break;
133	case 4:
134	uc = (puch[3] & 0x3f)
135	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
136	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
137	\| ((RTUNICP)(uch & 0x07) << 18);
138	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
139	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
140	break;
141	case 3:
142	uc = (puch[2] & 0x3f)
143	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
144	\| ((RTUNICP)(uch & 0x0f) << 12);
145	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
146	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
147	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
148	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
149	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
150	break;
151	case 2:
152	uc = (puch[1] & 0x3f)
153	\| ((RTUNICP)(uch & 0x1f) << 6);
154	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
155	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
156	break;
157	}
158
159	/* advance */
160	cch -= cb;
161	puch += cb;
162	}
163	else
164	{
165	/* one ASCII byte */
166	puch++;
167	cch--;
168	}
169	cCodePoints++;
170	}
171
172	/* done */
173	*pcuc = cCodePoints;
174	if (pcchActual)
175	pcchActual = puch - (unsigned char const )psz;
176	return VINF_SUCCESS;
177	}
178
179
180	/**
181	* Decodes and UTF-8 string into an array of unicode code point.
182	*
183	* Since we know the input is valid, we do not perform encoding or length checks.
184	*
185	* @returns iprt status code.
186	* @param psz The UTF-8 string to recode. This is a valid encoding.
187	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
188	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
189	* @param paCps Where to store the code points array.
190	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
191	* @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
192	*/
193	static int rtUtf8Decode(const char psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t pcCps)
194	{
195	int rc = VINF_SUCCESS;
196	const unsigned char puch = (const unsigned char )psz;
197	const PRTUNICP pCpEnd = paCps + cCps;
198	PRTUNICP pCp = paCps;
199	Assert(pCpEnd >= pCp);
200	while (cch > 0)
201	{
202	/* read the next char and check for terminator. */
203	const unsigned char uch = *puch;
204	if (!uch)
205	break;
206
207	/* check for output overflow */
208	if (pCp >= pCpEnd)
209	{
210	rc = VERR_BUFFER_OVERFLOW;
211	break;
212	}
213
214	/* decode and recode the code point */
215	if (!(uch & RT_BIT(7)))
216	{
217	*pCp++ = uch;
218	puch++;
219	cch--;
220	}
221	#ifdef RT_STRICT
222	else if (!(uch & RT_BIT(6)))
223	AssertMsgFailed(("Internal error!\n"));
224	#endif
225	else if (!(uch & RT_BIT(5)))
226	{
227	*pCp++ = (puch[1] & 0x3f)
228	\| ((uint16_t)(uch & 0x1f) << 6);
229	puch += 2;
230	cch -= 2;
231	}
232	else if (!(uch & RT_BIT(4)))
233	{
234	*pCp++ = (puch[2] & 0x3f)
235	\| ((uint16_t)(puch[1] & 0x3f) << 6)
236	\| ((uint16_t)(uch & 0x0f) << 12);
237	puch += 3;
238	cch -= 3;
239	}
240	else if (!(uch & RT_BIT(3)))
241	{
242	*pCp++ = (puch[3] & 0x3f)
243	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
244	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
245	\| ((RTUNICP)(uch & 0x07) << 18);
246	puch += 4;
247	cch -= 4;
248	}
249	else if (!(uch & RT_BIT(2)))
250	{
251	*pCp++ = (puch[4] & 0x3f)
252	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
253	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
254	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
255	\| ((RTUNICP)(uch & 0x03) << 24);
256	puch += 5;
257	cch -= 6;
258	}
259	else
260	{
261	Assert(!(uch & RT_BIT(1)));
262	*pCp++ = (puch[5] & 0x3f)
263	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
264	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
265	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
266	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
267	\| ((RTUNICP)(uch & 0x01) << 30);
268	puch += 6;
269	cch -= 6;
270	}
271	}
272
273	/* done */
274	*pCp = 0;
275	*pcCps = pCp - paCps;
276	return rc;
277	}
278
279
280	RTDECL(size_t) RTStrUniLen(const char *psz)
281	{
282	size_t cCodePoints;
283	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
284	return RT_SUCCESS(rc) ? cCodePoints : 0;
285	}
286	RT_EXPORT_SYMBOL(RTStrUniLen);
287
288
289	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
290	{
291	size_t cCodePoints;
292	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
293	if (pcCps)
294	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
295	return rc;
296	}
297	RT_EXPORT_SYMBOL(RTStrUniLenEx);
298
299
300	RTDECL(int) RTStrValidateEncoding(const char *psz)
301	{
302	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
303	}
304	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
305
306
307	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
308	{
309	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
310	AssertPtr(psz);
311
312	/*
313	* Use rtUtf8Length for the job.
314	*/
315	size_t cchActual;
316	size_t cCpsIgnored;
317	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
318	if (RT_SUCCESS(rc))
319	{
320	if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
321	&& cchActual >= cch)
322	rc = VERR_BUFFER_OVERFLOW;
323	}
324	return rc;
325
326
327	return RTStrUniLenEx(psz, cch, &cCpsIgnored);
328	}
329	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
330
331
332	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
333	{
334	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
335	return RT_SUCCESS(rc);
336	}
337	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
338
339
340	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
341	{
342	/*
343	* Validate input.
344	*/
345	Assert(VALID_PTR(pszString));
346	Assert(VALID_PTR(ppaCps));
347	*ppaCps = NULL;
348
349	/*
350	* Validate the UTF-8 input and count its code points.
351	*/
352	size_t cCps;
353	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
354	if (RT_SUCCESS(rc))
355	{
356	/*
357	* Allocate buffer.
358	*/
359	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
360	if (paCps)
361	{
362	/*
363	* Decode the string.
364	*/
365	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
366	if (RT_SUCCESS(rc))
367	{
368	*ppaCps = paCps;
369	return rc;
370	}
371	RTMemFree(paCps);
372	}
373	else
374	rc = VERR_NO_CODE_POINT_MEMORY;
375	}
376	return rc;
377	}
378	RT_EXPORT_SYMBOL(RTStrToUni);
379
380
381	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
382	{
383	/*
384	* Validate input.
385	*/
386	Assert(VALID_PTR(pszString));
387	Assert(VALID_PTR(ppaCps));
388	Assert(!pcCps \|\| VALID_PTR(pcCps));
389
390	/*
391	* Validate the UTF-8 input and count the code points.
392	*/
393	size_t cCpsResult;
394	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
395	if (RT_SUCCESS(rc))
396	{
397	if (pcCps)
398	*pcCps = cCpsResult;
399
400	/*
401	* Check buffer size / Allocate buffer.
402	*/
403	bool fShouldFree;
404	PRTUNICP paCpsResult;
405	if (cCps > 0 && *ppaCps)
406	{
407	fShouldFree = false;
408	if (cCps <= cCpsResult)
409	return VERR_BUFFER_OVERFLOW;
410	paCpsResult = *ppaCps;
411	}
412	else
413	{
414	*ppaCps = NULL;
415	fShouldFree = true;
416	cCps = RT_MAX(cCpsResult + 1, cCps);
417	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
418	}
419	if (paCpsResult)
420	{
421	/*
422	* Encode the UTF-16 string.
423	*/
424	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
425	if (RT_SUCCESS(rc))
426	{
427	*ppaCps = paCpsResult;
428	return rc;
429	}
430	if (fShouldFree)
431	RTMemFree(paCpsResult);
432	}
433	else
434	rc = VERR_NO_CODE_POINT_MEMORY;
435	}
436	return rc;
437	}
438	RT_EXPORT_SYMBOL(RTStrToUniEx);
439
440
441	/**
442	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
443	*
444	* @returns IPRT status code.
445	* @param psz Pointer to the UTF-8 string.
446	* @param cch The max length of the string. (btw cch = cb)
447	* Use RTSTR_MAX if all of the string is to be examined.s
448	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
449	*/
450	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
451	{
452	const unsigned char puch = (const unsigned char )psz;
453	size_t cwc = 0;
454	while (cch > 0)
455	{
456	const unsigned char uch = *puch;
457	if (!uch)
458	break;
459	if (!(uch & RT_BIT(7)))
460	{
461	/* one ASCII byte */
462	cwc++;
463	puch++;
464	cch--;
465	}
466	else
467	{
468	/* figure sequence length and validate the first byte */
469	unsigned cb;
470	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
471	cb = 2;
472	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
473	cb = 3;
474	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
475	cb = 4;
476	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
477	cb = 5;
478	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
479	cb = 6;
480	else
481	{
482	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
483	return VERR_INVALID_UTF8_ENCODING;
484	}
485
486	/* check length */
487	if (cb > cch)
488	{
489	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
490	return VERR_INVALID_UTF8_ENCODING;
491	}
492
493	/* validate the rest */
494	switch (cb)
495	{
496	case 6:
497	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
498	case 5:
499	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
500	case 4:
501	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
502	case 3:
503	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
504	case 2:
505	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
506	break;
507	}
508
509	/* validate the code point. */
510	RTUNICP uc;
511	switch (cb)
512	{
513	case 6:
514	uc = (puch[5] & 0x3f)
515	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
516	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
517	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
518	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
519	\| ((RTUNICP)(uch & 0x01) << 30);
520	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
521	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
522	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
523	return VERR_CANT_RECODE_AS_UTF16;
524	case 5:
525	uc = (puch[4] & 0x3f)
526	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
527	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
528	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
529	\| ((RTUNICP)(uch & 0x03) << 24);
530	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
531	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
532	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
533	return VERR_CANT_RECODE_AS_UTF16;
534	case 4:
535	uc = (puch[3] & 0x3f)
536	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
537	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
538	\| ((RTUNICP)(uch & 0x07) << 18);
539	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
540	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
541	RTStrAssertMsgReturn(uc <= 0x0010ffff,
542	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
543	cwc++;
544	break;
545	case 3:
546	uc = (puch[2] & 0x3f)
547	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
548	\| ((RTUNICP)(uch & 0x0f) << 12);
549	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
550	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
551	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
552	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
553	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
554	break;
555	case 2:
556	uc = (puch[1] & 0x3f)
557	\| ((RTUNICP)(uch & 0x1f) << 6);
558	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
559	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
560	break;
561	}
562
563	/* advance */
564	cch -= cb;
565	puch += cb;
566	cwc++;
567	}
568	}
569
570	/* done */
571	*pcwc = cwc;
572	return VINF_SUCCESS;
573	}
574
575
576	/**
577	* Recodes a valid UTF-8 string as UTF-16.
578	*
579	* Since we know the input is valid, we do not perform encoding or length checks.
580	*
581	* @returns iprt status code.
582	* @param psz The UTF-8 string to recode. This is a valid encoding.
583	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
584	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
585	* @param pwsz Where to store the UTF-16 string.
586	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
587	* @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
588	*/
589	static int rtUtf8RecodeAsUtf16(const char psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t pcwc)
590	{
591	int rc = VINF_SUCCESS;
592	const unsigned char puch = (const unsigned char )psz;
593	const PRTUTF16 pwszEnd = pwsz + cwc;
594	PRTUTF16 pwc = pwsz;
595	Assert(pwszEnd >= pwc);
596	while (cch > 0)
597	{
598	/* read the next char and check for terminator. */
599	const unsigned char uch = *puch;
600	if (!uch)
601	break;
602
603	/* check for output overflow */
604	if (pwc >= pwszEnd)
605	{
606	rc = VERR_BUFFER_OVERFLOW;
607	break;
608	}
609
610	/* decode and recode the code point */
611	if (!(uch & RT_BIT(7)))
612	{
613	*pwc++ = uch;
614	puch++;
615	cch--;
616	}
617	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
618	{
619	uint16_t uc = (puch[1] & 0x3f)
620	\| ((uint16_t)(uch & 0x1f) << 6);
621	*pwc++ = uc;
622	puch += 2;
623	cch -= 2;
624	}
625	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
626	{
627	uint16_t uc = (puch[2] & 0x3f)
628	\| ((uint16_t)(puch[1] & 0x3f) << 6)
629	\| ((uint16_t)(uch & 0x0f) << 12);
630	*pwc++ = uc;
631	puch += 3;
632	cch -= 3;
633	}
634	else
635	{
636	/* generate surrugate pair */
637	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
638	RTUNICP uc = (puch[3] & 0x3f)
639	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
640	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
641	\| ((RTUNICP)(uch & 0x07) << 18);
642	if (pwc + 1 >= pwszEnd)
643	{
644	rc = VERR_BUFFER_OVERFLOW;
645	break;
646	}
647	uc -= 0x10000;
648	*pwc++ = 0xd800 \| (uc >> 10);
649	*pwc++ = 0xdc00 \| (uc & 0x3ff);
650	puch += 4;
651	cch -= 4;
652	}
653	}
654
655	/* done */
656	*pwc = '\0';
657	*pcwc = pwc - pwsz;
658	return rc;
659	}
660
661
662	RTDECL(int) RTStrToUtf16(const char pszString, PRTUTF16 ppwszString)
663	{
664	/*
665	* Validate input.
666	*/
667	Assert(VALID_PTR(ppwszString));
668	Assert(VALID_PTR(pszString));
669	*ppwszString = NULL;
670
671	/*
672	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
673	*/
674	size_t cwc;
675	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
676	if (RT_SUCCESS(rc))
677	{
678	/*
679	* Allocate buffer.
680	*/
681	PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
682	if (pwsz)
683	{
684	/*
685	* Encode the UTF-16 string.
686	*/
687	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
688	if (RT_SUCCESS(rc))
689	{
690	*ppwszString = pwsz;
691	return rc;
692	}
693	RTMemFree(pwsz);
694	}
695	else
696	rc = VERR_NO_UTF16_MEMORY;
697	}
698	return rc;
699	}
700	RT_EXPORT_SYMBOL(RTStrToUtf16);
701
702
703	RTDECL(int) RTStrToUtf16Ex(const char pszString, size_t cchString, PRTUTF16 ppwsz, size_t cwc, size_t *pcwc)
704	{
705	/*
706	* Validate input.
707	*/
708	Assert(VALID_PTR(pszString));
709	Assert(VALID_PTR(ppwsz));
710	Assert(!pcwc \|\| VALID_PTR(pcwc));
711
712	/*
713	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
714	*/
715	size_t cwcResult;
716	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
717	if (RT_SUCCESS(rc))
718	{
719	if (pcwc)
720	*pcwc = cwcResult;
721
722	/*
723	* Check buffer size / Allocate buffer.
724	*/
725	bool fShouldFree;
726	PRTUTF16 pwszResult;
727	if (cwc > 0 && *ppwsz)
728	{
729	fShouldFree = false;
730	if (cwc <= cwcResult)
731	return VERR_BUFFER_OVERFLOW;
732	pwszResult = *ppwsz;
733	}
734	else
735	{
736	*ppwsz = NULL;
737	fShouldFree = true;
738	cwc = RT_MAX(cwcResult + 1, cwc);
739	pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
740	}
741	if (pwszResult)
742	{
743	/*
744	* Encode the UTF-16 string.
745	*/
746	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
747	if (RT_SUCCESS(rc))
748	{
749	*ppwsz = pwszResult;
750	return rc;
751	}
752	if (fShouldFree)
753	RTMemFree(pwszResult);
754	}
755	else
756	rc = VERR_NO_UTF16_MEMORY;
757	}
758	return rc;
759	}
760	RT_EXPORT_SYMBOL(RTStrToUtf16Ex);
761
762
763	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
764	{
765	size_t cwc;
766	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
767	return RT_SUCCESS(rc) ? cwc : 0;
768	}
769	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
770
771
772	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
773	{
774	size_t cwc;
775	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
776	if (pcwc)
777	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
778	return rc;
779	}
780	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
781
782
783	/**
784	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
785	* @returns rc
786	* @param ppsz The pointer to the string position point.
787	* @param pCp Where to store RTUNICP_INVALID.
788	* @param rc The iprt error code.
789	*/
790	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
791	{
792	/*
793	* Try find a valid encoding.
794	*/
795	(ppsz)++; /* @todo code this! */
796	*pCp = RTUNICP_INVALID;
797	return rc;
798	}
799
800
801	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
802	{
803	RTUNICP Cp;
804	RTStrGetCpExInternal(&psz, &Cp);
805	return Cp;
806	}
807	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
808
809
810	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
811	{
812	const unsigned char puch = (const unsigned char )*ppsz;
813	const unsigned char uch = *puch;
814	RTUNICP uc;
815
816	/* ASCII ? */
817	if (!(uch & RT_BIT(7)))
818	{
819	uc = uch;
820	puch++;
821	}
822	else if (uch & RT_BIT(6))
823	{
824	/* figure the length and validate the first octet. */
825	unsigned cb;
826	if (!(uch & RT_BIT(5)))
827	cb = 2;
828	else if (!(uch & RT_BIT(4)))
829	cb = 3;
830	else if (!(uch & RT_BIT(3)))
831	cb = 4;
832	else if (!(uch & RT_BIT(2)))
833	cb = 5;
834	else if (!(uch & RT_BIT(1)))
835	cb = 6;
836	else
837	{
838	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
839	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
840	}
841
842	/* validate the rest */
843	switch (cb)
844	{
845	case 6:
846	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
847	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
848	case 5:
849	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
850	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
851	case 4:
852	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
853	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
854	case 3:
855	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
856	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
857	case 2:
858	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
859	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
860	break;
861	}
862
863	/* get and validate the code point. */
864	switch (cb)
865	{
866	case 6:
867	uc = (puch[5] & 0x3f)
868	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
869	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
870	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
871	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
872	\| ((RTUNICP)(uch & 0x01) << 30);
873	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
874	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
875	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
876	break;
877	case 5:
878	uc = (puch[4] & 0x3f)
879	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
880	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
881	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
882	\| ((RTUNICP)(uch & 0x03) << 24);
883	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
884	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
885	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
886	break;
887	case 4:
888	uc = (puch[3] & 0x3f)
889	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
890	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
891	\| ((RTUNICP)(uch & 0x07) << 18);
892	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
893	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
894	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
895	break;
896	case 3:
897	uc = (puch[2] & 0x3f)
898	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
899	\| ((RTUNICP)(uch & 0x0f) << 12);
900	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
901	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
902	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
903	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
904	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
905	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
906	break;
907	case 2:
908	uc = (puch[1] & 0x3f)
909	\| ((RTUNICP)(uch & 0x1f) << 6);
910	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
911	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
912	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
913	break;
914	default: /* impossible, but GCC is bitching. */
915	uc = RTUNICP_INVALID;
916	break;
917	}
918	puch += cb;
919	}
920	else
921	{
922	/* 6th bit is always set. */
923	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
924	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
925	}
926	*pCp = uc;
927	ppsz = (const char )puch;
928	return VINF_SUCCESS;
929	}
930	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
931
932
933	/**
934	* Handle invalid encodings passed to RTStrGetCpNEx().
935	* @returns rc
936	* @param ppsz The pointer to the string position point.
937	* @param pcch Pointer to the string length.
938	* @param pCp Where to store RTUNICP_INVALID.
939	* @param rc The iprt error code.
940	*/
941	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
942	{
943	/*
944	* Try find a valid encoding.
945	*/
946	(ppsz)++; /* @todo code this! */
947	(*pcch)--;
948	*pCp = RTUNICP_INVALID;
949	return rc;
950	}
951
952
953	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
954	{
955	const unsigned char puch = (const unsigned char )*ppsz;
956	const unsigned char uch = *puch;
957	size_t cch = *pcch;
958	RTUNICP uc;
959
960	if (cch == 0)
961	{
962	*pCp = RTUNICP_INVALID;
963	return VERR_END_OF_STRING;
964	}
965
966	/* ASCII ? */
967	if (!(uch & RT_BIT(7)))
968	{
969	uc = uch;
970	puch++;
971	cch--;
972	}
973	else if (uch & RT_BIT(6))
974	{
975	/* figure the length and validate the first octet. */
976	unsigned cb;
977	if (!(uch & RT_BIT(5)))
978	cb = 2;
979	else if (!(uch & RT_BIT(4)))
980	cb = 3;
981	else if (!(uch & RT_BIT(3)))
982	cb = 4;
983	else if (!(uch & RT_BIT(2)))
984	cb = 5;
985	else if (!(uch & RT_BIT(1)))
986	cb = 6;
987	else
988	{
989	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
990	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
991	}
992
993	if (cb > cch)
994	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
995
996	/* validate the rest */
997	switch (cb)
998	{
999	case 6:
1000	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1001	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1002	case 5:
1003	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1004	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1005	case 4:
1006	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1007	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1008	case 3:
1009	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1010	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1011	case 2:
1012	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1013	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1014	break;
1015	}
1016
1017	/* get and validate the code point. */
1018	switch (cb)
1019	{
1020	case 6:
1021	uc = (puch[5] & 0x3f)
1022	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1023	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1024	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1025	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1026	\| ((RTUNICP)(uch & 0x01) << 30);
1027	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1028	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1029	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1030	break;
1031	case 5:
1032	uc = (puch[4] & 0x3f)
1033	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1034	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1035	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1036	\| ((RTUNICP)(uch & 0x03) << 24);
1037	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1038	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1039	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1040	break;
1041	case 4:
1042	uc = (puch[3] & 0x3f)
1043	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1044	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1045	\| ((RTUNICP)(uch & 0x07) << 18);
1046	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1047	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1048	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1049	break;
1050	case 3:
1051	uc = (puch[2] & 0x3f)
1052	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1053	\| ((RTUNICP)(uch & 0x0f) << 12);
1054	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1055	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1056	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1057	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1058	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1059	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1060	break;
1061	case 2:
1062	uc = (puch[1] & 0x3f)
1063	\| ((RTUNICP)(uch & 0x1f) << 6);
1064	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1065	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1066	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1067	break;
1068	default: /* impossible, but GCC is bitching. */
1069	uc = RTUNICP_INVALID;
1070	break;
1071	}
1072	puch += cb;
1073	cch -= cb;
1074	}
1075	else
1076	{
1077	/* 6th bit is always set. */
1078	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1079	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1080	}
1081	*pCp = uc;
1082	ppsz = (const char )puch;
1083	(*pcch) = cch;
1084	return VINF_SUCCESS;
1085	}
1086	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1087
1088
1089	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1090	{
1091	unsigned char puch = (unsigned char )psz;
1092	if (uc < 0x80)
1093	*puch++ = (unsigned char )uc;
1094	else if (uc < 0x00000800)
1095	{
1096	*puch++ = 0xc0 \| (uc >> 6);
1097	*puch++ = 0x80 \| (uc & 0x3f);
1098	}
1099	else if (uc < 0x00010000)
1100	{
1101	if ( uc < 0x0000d8000
1102	\|\| ( uc > 0x0000dfff
1103	&& uc < 0x0000fffe))
1104	{
1105	*puch++ = 0xe0 \| (uc >> 12);
1106	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1107	*puch++ = 0x80 \| (uc & 0x3f);
1108	}
1109	else
1110	{
1111	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1112	*puch++ = 0x7f;
1113	}
1114	}
1115	else if (uc < 0x00200000)
1116	{
1117	*puch++ = 0xf0 \| (uc >> 18);
1118	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1119	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1120	*puch++ = 0x80 \| (uc & 0x3f);
1121	}
1122	else if (uc < 0x04000000)
1123	{
1124	*puch++ = 0xf1 \| (uc >> 24);
1125	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1126	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1127	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1128	*puch++ = 0x80 \| (uc & 0x3f);
1129	}
1130	else if (uc <= 0x7fffffff)
1131	{
1132	*puch++ = 0xf3 \| (uc >> 30);
1133	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1134	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1135	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1136	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1137	*puch++ = 0x80 \| (uc & 0x3f);
1138	}
1139	else
1140	{
1141	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1142	*puch++ = 0x7f;
1143	}
1144
1145	return (char *)puch;
1146	}
1147	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1148
1149
1150	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1151	{
1152	if (pszStart < psz)
1153	{
1154	/* simple char? */
1155	const unsigned char puch = (const unsigned char )psz;
1156	unsigned uch = *--puch;
1157	if (!(uch & RT_BIT(7)))
1158	return (char *)puch;
1159	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1160
1161	/* two or more. */
1162	uint32_t uMask = 0xffffffc0;
1163	while ( (const unsigned char *)pszStart < puch
1164	&& !(uMask & 1))
1165	{
1166	unsigned uch = *--puch;
1167	if ((uch & 0xc0) != 0x80)
1168	{
1169	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1170	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1171	(char *)pszStart);
1172	return (char *)puch;
1173	}
1174	uMask >>= 1;
1175	}
1176	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1177	}
1178	return (char *)pszStart;
1179	}
1180	RT_EXPORT_SYMBOL(RTStrPrevCp);
1181
1182
1183	/**
1184	* Performs a case sensitive string compare between two UTF-8 strings.
1185	*
1186	* Encoding errors are ignored by the current implementation. So, the only
1187	* difference between this and the CRT strcmp function is the handling of
1188	* NULL arguments.
1189	*
1190	* @returns < 0 if the first string less than the second string.
1191	* @returns 0 if the first string identical to the second string.
1192	* @returns > 0 if the first string greater than the second string.
1193	* @param psz1 First UTF-8 string. Null is allowed.
1194	* @param psz2 Second UTF-8 string. Null is allowed.
1195	*/
1196	RTDECL(int) RTStrCmp(const char psz1, const char psz2)
1197	{
1198	if (psz1 == psz2)
1199	return 0;
1200	if (!psz1)
1201	return -1;
1202	if (!psz2)
1203	return 1;
1204
1205	return strcmp(psz1, psz2);
1206	}
1207	RT_EXPORT_SYMBOL(RTStrCmp);
1208
1209
1210	/**
1211	* Performs a case sensitive string compare between two UTF-8 strings, given
1212	* a maximum string length.
1213	*
1214	* Encoding errors are ignored by the current implementation. So, the only
1215	* difference between this and the CRT strncmp function is the handling of
1216	* NULL arguments.
1217	*
1218	* @returns < 0 if the first string less than the second string.
1219	* @returns 0 if the first string identical to the second string.
1220	* @returns > 0 if the first string greater than the second string.
1221	* @param psz1 First UTF-8 string. Null is allowed.
1222	* @param psz2 Second UTF-8 string. Null is allowed.
1223	* @param cchMax The maximum string length
1224	*/
1225	RTDECL(int) RTStrNCmp(const char psz1, const char psz2, size_t cchMax)
1226	{
1227	if (psz1 == psz2)
1228	return 0;
1229	if (!psz1)
1230	return -1;
1231	if (!psz2)
1232	return 1;
1233
1234	return strncmp(psz1, psz2, cchMax);
1235	}
1236	RT_EXPORT_SYMBOL(RTStrNCmp);
1237
1238
1239	/**
1240	* Performs a case insensitive string compare between two UTF-8 strings.
1241	*
1242	* This is a simplified compare, as only the simplified lower/upper case folding
1243	* specified by the unicode specs are used. It does not consider character pairs
1244	* as they are used in some languages, just simple upper & lower case compares.
1245	*
1246	* The result is the difference between the mismatching codepoints after they
1247	* both have been lower cased.
1248	*
1249	* If the string encoding is invalid the function will assert (strict builds)
1250	* and use RTStrCmp for the remainder of the string.
1251	*
1252	* @returns < 0 if the first string less than the second string.
1253	* @returns 0 if the first string identical to the second string.
1254	* @returns > 0 if the first string greater than the second string.
1255	* @param psz1 First UTF-8 string. Null is allowed.
1256	* @param psz2 Second UTF-8 string. Null is allowed.
1257	*/
1258	RTDECL(int) RTStrICmp(const char psz1, const char psz2)
1259	{
1260	if (psz1 == psz2)
1261	return 0;
1262	if (!psz1)
1263	return -1;
1264	if (!psz2)
1265	return 1;
1266
1267	const char *pszStart1 = psz1;
1268	for (;;)
1269	{
1270	/* Get the codepoints */
1271	RTUNICP cp1;
1272	int rc = RTStrGetCpEx(&psz1, &cp1);
1273	if (RT_FAILURE(rc))
1274	{
1275	AssertRC(rc);
1276	psz1--;
1277	break;
1278	}
1279
1280	RTUNICP cp2;
1281	rc = RTStrGetCpEx(&psz2, &cp2);
1282	if (RT_FAILURE(rc))
1283	{
1284	AssertRC(rc);
1285	psz2--;
1286	psz1 = RTStrPrevCp(pszStart1, psz1);
1287	break;
1288	}
1289
1290	/* compare */
1291	int iDiff = cp1 - cp2;
1292	if (iDiff)
1293	{
1294	iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1295	if (iDiff)
1296	{
1297	iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1298	if (iDiff)
1299	return iDiff;
1300	}
1301	}
1302
1303	/* hit the terminator? */
1304	if (!cp1)
1305	return 0;
1306	}
1307
1308	/* Hit some bad encoding, continue in case insensitive mode. */
1309	return RTStrCmp(psz1, psz2);
1310	}
1311	RT_EXPORT_SYMBOL(RTStrICmp);
1312
1313
1314	/**
1315	* Performs a case insensitive string compare between two UTF-8 strings, given a
1316	* maximum string length.
1317	*
1318	* This is a simplified compare, as only the simplified lower/upper case folding
1319	* specified by the unicode specs are used. It does not consider character pairs
1320	* as they are used in some languages, just simple upper & lower case compares.
1321	*
1322	* The result is the difference between the mismatching codepoints after they
1323	* both have been lower cased.
1324	*
1325	* If the string encoding is invalid the function will assert (strict builds)
1326	* and use RTStrCmp for the remainder of the string.
1327	*
1328	* @returns < 0 if the first string less than the second string.
1329	* @returns 0 if the first string identical to the second string.
1330	* @returns > 0 if the first string greater than the second string.
1331	* @param psz1 First UTF-8 string. Null is allowed.
1332	* @param psz2 Second UTF-8 string. Null is allowed.
1333	* @param cchMax Maximum string length
1334	*/
1335	RTDECL(int) RTStrNICmp(const char psz1, const char psz2, size_t cchMax)
1336	{
1337	if (cchMax == 0)
1338	return 0;
1339	if (psz1 == psz2)
1340	return 0;
1341	if (!psz1)
1342	return -1;
1343	if (!psz2)
1344	return 1;
1345
1346	for (;;)
1347	{
1348	/* Get the codepoints */
1349	RTUNICP cp1;
1350	size_t cchMax2 = cchMax;
1351	int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);
1352	if (RT_FAILURE(rc))
1353	{
1354	AssertRC(rc);
1355	psz1--;
1356	cchMax++;
1357	break;
1358	}
1359
1360	RTUNICP cp2;
1361	rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);
1362	if (RT_FAILURE(rc))
1363	{
1364	AssertRC(rc);
1365	psz2--;
1366	psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
1367	cchMax = cchMax2 + 1;
1368	break;
1369	}
1370
1371	/* compare */
1372	int iDiff = cp1 - cp2;
1373	if (iDiff)
1374	{
1375	iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1376	if (iDiff)
1377	{
1378	iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1379	if (iDiff)
1380	return iDiff;
1381	}
1382	}
1383
1384	/* hit the terminator? */
1385	if (!cp1 \|\| cchMax == 0)
1386	return 0;
1387	}
1388
1389	/* Hit some bad encoding, continue in case insensitive mode. */
1390	return RTStrNCmp(psz1, psz2, cchMax);
1391	}
1392	RT_EXPORT_SYMBOL(RTStrNICmp);
1393
1394
1395	RTDECL(char ) RTStrStr(const char pszHaystack, const char *pszNeedle)
1396	{
1397	/* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1398	if (!pszHaystack)
1399	return NULL;
1400	if (!pszNeedle)
1401	return NULL;
1402
1403	/* The rest is CRT. */
1404	return (char *)strstr(pszHaystack, pszNeedle);
1405	}
1406	RT_EXPORT_SYMBOL(RTStrStr);
1407
1408
1409	RTDECL(char ) RTStrIStr(const char pszHaystack, const char *pszNeedle)
1410	{
1411	/* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1412	if (!pszHaystack)
1413	return NULL;
1414	if (!pszNeedle)
1415	return NULL;
1416
1417	/* The empty string matches everything. */
1418	if (!*pszNeedle)
1419	return (char *)pszHaystack;
1420
1421	/*
1422	* The search strategy is to pick out the first char of the needle, fold it,
1423	* and match it against the haystack code point by code point. When encountering
1424	* a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
1425	*/
1426	const char * const pszNeedleStart = pszNeedle;
1427	RTUNICP Cp0;
1428	RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
1429	size_t const cchNeedle = strlen(pszNeedle);
1430	size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
1431	RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
1432	RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
1433	if ( Cp0Lower == Cp0Upper
1434	&& Cp0Lower == Cp0)
1435	{
1436	/* Cp0 is not a case sensitive char. */
1437	for (;;)
1438	{
1439	RTUNICP Cp;
1440	RTStrGetCpEx(&pszHaystack, &Cp);
1441	if (!Cp)
1442	break;
1443	if ( Cp == Cp0
1444	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1445	return (char *)pszHaystack - cchNeedleCp0;
1446	}
1447	}
1448	else if ( Cp0Lower == Cp0
1449	\|\| Cp0Upper != Cp0)
1450	{
1451	/* Cp0 is case sensitive */
1452	for (;;)
1453	{
1454	RTUNICP Cp;
1455	RTStrGetCpEx(&pszHaystack, &Cp);
1456	if (!Cp)
1457	break;
1458	if ( ( Cp == Cp0Upper
1459	\|\| Cp == Cp0Lower)
1460	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1461	return (char *)pszHaystack - cchNeedleCp0;
1462	}
1463	}
1464	else
1465	{
1466	/* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
1467	for (;;)
1468	{
1469	RTUNICP Cp;
1470	RTStrGetCpEx(&pszHaystack, &Cp);
1471	if (!Cp)
1472	break;
1473	if ( ( Cp == Cp0
1474	\|\| Cp == Cp0Upper
1475	\|\| Cp == Cp0Lower)
1476	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1477	return (char *)pszHaystack - cchNeedleCp0;
1478	}
1479	}
1480
1481
1482	return NULL;
1483	}
1484	RT_EXPORT_SYMBOL(RTStrIStr);
1485
1486
1487	RTDECL(char ) RTStrToLower(char psz)
1488	{
1489	/*
1490	* Loop the code points in the string, converting them one by one.
1491	* ASSUMES that the code points for upper and lower case are encoded
1492	* with the exact same length.
1493	*/
1494	/** @todo Handled bad encodings correctly+quietly, remove assumption,
1495	* optimize. */
1496	char *pszCur = psz;
1497	while (*pszCur)
1498	{
1499	RTUNICP cp = RTStrGetCp(pszCur);
1500	cp = RTUniCpToLower(cp);
1501	pszCur = RTStrPutCp(pszCur, cp);
1502	}
1503	return psz;
1504	}
1505	RT_EXPORT_SYMBOL(RTStrToLower);
1506
1507
1508	RTDECL(char ) RTStrToUpper(char psz)
1509	{
1510	/*
1511	* Loop the code points in the string, converting them one by one.
1512	* ASSUMES that the code points for upper and lower case are encoded
1513	* with the exact same length.
1514	*/
1515	/** @todo Handled bad encodings correctly+quietly, remove assumption,
1516	* optimize. */
1517	char *pszCur = psz;
1518	while(*pszCur)
1519	{
1520	RTUNICP cp = RTStrGetCp(pszCur);
1521	cp = RTUniCpToUpper(cp);
1522	pszCur = RTStrPutCp(pszCur, cp);
1523	}
1524	return psz;
1525	}
1526	RT_EXPORT_SYMBOL(RTStrToUpper);
1527

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 21337

Download in other formats: