utf-8.cpp@ 23594

Last change on this file since 23594 was 21791, checked in by vboxsync, 16 years ago
RTStrPutCpInternal: Fixed an irrelevant bug.
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 53.4 KB

Line
1	/* $Id: utf-8.cpp 21791 2009-07-25 17:10:57Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2007 Sun Microsystems, Inc.
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*
26	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27	* Clara, CA 95054 USA or visit http://www.sun.com if you need
28	* additional information or have any questions.
29	*/
30
31
32	/*******************************************************************************
33	* Header Files *
34	*******************************************************************************/
35	#include <iprt/string.h>
36	#include "internal/iprt.h"
37
38	#include <iprt/uni.h>
39	#include <iprt/alloc.h>
40	#include <iprt/assert.h>
41	#include <iprt/err.h>
42	#include "internal/string.h"
43
44
45
46	/**
47	* Get get length in code points of a UTF-8 encoded string.
48	* The string is validated while doing this.
49	*
50	* @returns IPRT status code.
51	* @param psz Pointer to the UTF-8 string.
52	* @param cch The max length of the string. (btw cch = cb)
53	* Use RTSTR_MAX if all of the string is to be examined.
54	* @param pcuc Where to store the length in unicode code points.
55	* @param pcchActual Where to store the actual size of the UTF-8 string
56	* on success (cch = cb again). Optional.
57	*/
58	static int rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
59	{
60	const unsigned char puch = (const unsigned char )psz;
61	size_t cCodePoints = 0;
62	while (cch > 0)
63	{
64	const unsigned char uch = *puch;
65	if (!uch)
66	break;
67	if (uch & RT_BIT(7))
68	{
69	/* figure sequence length and validate the first byte */
70	unsigned cb;
71	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
72	cb = 2;
73	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
74	cb = 3;
75	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
76	cb = 4;
77	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
78	cb = 5;
79	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
80	cb = 6;
81	else
82	{
83	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
84	return VERR_INVALID_UTF8_ENCODING;
85	}
86
87	/* check length */
88	if (cb > cch)
89	{
90	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
91	return VERR_INVALID_UTF8_ENCODING;
92	}
93
94	/* validate the rest */
95	switch (cb)
96	{
97	case 6:
98	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99	case 5:
100	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101	case 4:
102	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103	case 3:
104	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
105	case 2:
106	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
107	break;
108	}
109
110	/* validate the code point. */
111	RTUNICP uc;
112	switch (cb)
113	{
114	case 6:
115	uc = (puch[5] & 0x3f)
116	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
117	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
118	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
119	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
120	\| ((RTUNICP)(uch & 0x01) << 30);
121	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
122	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
123	break;
124	case 5:
125	uc = (puch[4] & 0x3f)
126	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
127	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
128	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
129	\| ((RTUNICP)(uch & 0x03) << 24);
130	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
131	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
132	break;
133	case 4:
134	uc = (puch[3] & 0x3f)
135	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
136	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
137	\| ((RTUNICP)(uch & 0x07) << 18);
138	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
139	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
140	break;
141	case 3:
142	uc = (puch[2] & 0x3f)
143	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
144	\| ((RTUNICP)(uch & 0x0f) << 12);
145	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
146	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
147	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
148	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
149	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
150	break;
151	case 2:
152	uc = (puch[1] & 0x3f)
153	\| ((RTUNICP)(uch & 0x1f) << 6);
154	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
155	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
156	break;
157	}
158
159	/* advance */
160	cch -= cb;
161	puch += cb;
162	}
163	else
164	{
165	/* one ASCII byte */
166	puch++;
167	cch--;
168	}
169	cCodePoints++;
170	}
171
172	/* done */
173	*pcuc = cCodePoints;
174	if (pcchActual)
175	pcchActual = puch - (unsigned char const )psz;
176	return VINF_SUCCESS;
177	}
178
179
180	/**
181	* Decodes and UTF-8 string into an array of unicode code point.
182	*
183	* Since we know the input is valid, we do not perform encoding or length checks.
184	*
185	* @returns iprt status code.
186	* @param psz The UTF-8 string to recode. This is a valid encoding.
187	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
188	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
189	* @param paCps Where to store the code points array.
190	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
191	*/
192	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
193	{
194	int rc = VINF_SUCCESS;
195	const unsigned char puch = (const unsigned char )psz;
196	PRTUNICP pCp = paCps;
197	while (cch > 0)
198	{
199	/* read the next char and check for terminator. */
200	const unsigned char uch = *puch;
201	if (!uch)
202	break;
203
204	/* check for output overflow */
205	if (RT_UNLIKELY(cCps < 1))
206	{
207	rc = VERR_BUFFER_OVERFLOW;
208	break;
209	}
210	cCps--;
211
212	/* decode and recode the code point */
213	if (!(uch & RT_BIT(7)))
214	{
215	*pCp++ = uch;
216	puch++;
217	cch--;
218	}
219	#ifdef RT_STRICT
220	else if (!(uch & RT_BIT(6)))
221	AssertMsgFailed(("Internal error!\n"));
222	#endif
223	else if (!(uch & RT_BIT(5)))
224	{
225	*pCp++ = (puch[1] & 0x3f)
226	\| ((uint16_t)(uch & 0x1f) << 6);
227	puch += 2;
228	cch -= 2;
229	}
230	else if (!(uch & RT_BIT(4)))
231	{
232	*pCp++ = (puch[2] & 0x3f)
233	\| ((uint16_t)(puch[1] & 0x3f) << 6)
234	\| ((uint16_t)(uch & 0x0f) << 12);
235	puch += 3;
236	cch -= 3;
237	}
238	else if (!(uch & RT_BIT(3)))
239	{
240	*pCp++ = (puch[3] & 0x3f)
241	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
242	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
243	\| ((RTUNICP)(uch & 0x07) << 18);
244	puch += 4;
245	cch -= 4;
246	}
247	else if (!(uch & RT_BIT(2)))
248	{
249	*pCp++ = (puch[4] & 0x3f)
250	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
251	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
252	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
253	\| ((RTUNICP)(uch & 0x03) << 24);
254	puch += 5;
255	cch -= 6;
256	}
257	else
258	{
259	Assert(!(uch & RT_BIT(1)));
260	*pCp++ = (puch[5] & 0x3f)
261	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
262	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
263	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
264	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
265	\| ((RTUNICP)(uch & 0x01) << 30);
266	puch += 6;
267	cch -= 6;
268	}
269	}
270
271	/* done */
272	*pCp = 0;
273	return rc;
274	}
275
276
277	RTDECL(size_t) RTStrUniLen(const char *psz)
278	{
279	size_t cCodePoints;
280	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
281	return RT_SUCCESS(rc) ? cCodePoints : 0;
282	}
283	RT_EXPORT_SYMBOL(RTStrUniLen);
284
285
286	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
287	{
288	size_t cCodePoints;
289	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
290	if (pcCps)
291	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
292	return rc;
293	}
294	RT_EXPORT_SYMBOL(RTStrUniLenEx);
295
296
297	RTDECL(int) RTStrValidateEncoding(const char *psz)
298	{
299	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
300	}
301	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
302
303
304	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
305	{
306	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
307	AssertPtr(psz);
308
309	/*
310	* Use rtUtf8Length for the job.
311	*/
312	size_t cchActual;
313	size_t cCpsIgnored;
314	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
315	if (RT_SUCCESS(rc))
316	{
317	if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
318	&& cchActual >= cch)
319	rc = VERR_BUFFER_OVERFLOW;
320	}
321	return rc;
322
323
324	return RTStrUniLenEx(psz, cch, &cCpsIgnored);
325	}
326	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
327
328
329	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
330	{
331	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
332	return RT_SUCCESS(rc);
333	}
334	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
335
336
337	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
338	{
339	/*
340	* Validate input.
341	*/
342	Assert(VALID_PTR(pszString));
343	Assert(VALID_PTR(ppaCps));
344	*ppaCps = NULL;
345
346	/*
347	* Validate the UTF-8 input and count its code points.
348	*/
349	size_t cCps;
350	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
351	if (RT_SUCCESS(rc))
352	{
353	/*
354	* Allocate buffer.
355	*/
356	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
357	if (paCps)
358	{
359	/*
360	* Decode the string.
361	*/
362	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
363	if (RT_SUCCESS(rc))
364	{
365	*ppaCps = paCps;
366	return rc;
367	}
368	RTMemFree(paCps);
369	}
370	else
371	rc = VERR_NO_CODE_POINT_MEMORY;
372	}
373	return rc;
374	}
375	RT_EXPORT_SYMBOL(RTStrToUni);
376
377
378	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
379	{
380	/*
381	* Validate input.
382	*/
383	Assert(VALID_PTR(pszString));
384	Assert(VALID_PTR(ppaCps));
385	Assert(!pcCps \|\| VALID_PTR(pcCps));
386
387	/*
388	* Validate the UTF-8 input and count the code points.
389	*/
390	size_t cCpsResult;
391	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
392	if (RT_SUCCESS(rc))
393	{
394	if (pcCps)
395	*pcCps = cCpsResult;
396
397	/*
398	* Check buffer size / Allocate buffer.
399	*/
400	bool fShouldFree;
401	PRTUNICP paCpsResult;
402	if (cCps > 0 && *ppaCps)
403	{
404	fShouldFree = false;
405	if (cCps <= cCpsResult)
406	return VERR_BUFFER_OVERFLOW;
407	paCpsResult = *ppaCps;
408	}
409	else
410	{
411	*ppaCps = NULL;
412	fShouldFree = true;
413	cCps = RT_MAX(cCpsResult + 1, cCps);
414	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
415	}
416	if (paCpsResult)
417	{
418	/*
419	* Encode the UTF-16 string.
420	*/
421	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
422	if (RT_SUCCESS(rc))
423	{
424	*ppaCps = paCpsResult;
425	return rc;
426	}
427	if (fShouldFree)
428	RTMemFree(paCpsResult);
429	}
430	else
431	rc = VERR_NO_CODE_POINT_MEMORY;
432	}
433	return rc;
434	}
435	RT_EXPORT_SYMBOL(RTStrToUniEx);
436
437
438	/**
439	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
440	*
441	* @returns IPRT status code.
442	* @param psz Pointer to the UTF-8 string.
443	* @param cch The max length of the string. (btw cch = cb)
444	* Use RTSTR_MAX if all of the string is to be examined.s
445	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
446	*/
447	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
448	{
449	const unsigned char puch = (const unsigned char )psz;
450	size_t cwc = 0;
451	while (cch > 0)
452	{
453	const unsigned char uch = *puch;
454	if (!uch)
455	break;
456	if (!(uch & RT_BIT(7)))
457	{
458	/* one ASCII byte */
459	cwc++;
460	puch++;
461	cch--;
462	}
463	else
464	{
465	/* figure sequence length and validate the first byte */
466	unsigned cb;
467	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
468	cb = 2;
469	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
470	cb = 3;
471	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
472	cb = 4;
473	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
474	cb = 5;
475	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
476	cb = 6;
477	else
478	{
479	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
480	return VERR_INVALID_UTF8_ENCODING;
481	}
482
483	/* check length */
484	if (cb > cch)
485	{
486	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
487	return VERR_INVALID_UTF8_ENCODING;
488	}
489
490	/* validate the rest */
491	switch (cb)
492	{
493	case 6:
494	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
495	case 5:
496	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
497	case 4:
498	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
499	case 3:
500	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
501	case 2:
502	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
503	break;
504	}
505
506	/* validate the code point. */
507	RTUNICP uc;
508	switch (cb)
509	{
510	case 6:
511	uc = (puch[5] & 0x3f)
512	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
513	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
514	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
515	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
516	\| ((RTUNICP)(uch & 0x01) << 30);
517	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
518	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
519	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
520	return VERR_CANT_RECODE_AS_UTF16;
521	case 5:
522	uc = (puch[4] & 0x3f)
523	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
524	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
525	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
526	\| ((RTUNICP)(uch & 0x03) << 24);
527	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
528	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
529	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
530	return VERR_CANT_RECODE_AS_UTF16;
531	case 4:
532	uc = (puch[3] & 0x3f)
533	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
534	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
535	\| ((RTUNICP)(uch & 0x07) << 18);
536	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
537	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
538	RTStrAssertMsgReturn(uc <= 0x0010ffff,
539	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
540	cwc++;
541	break;
542	case 3:
543	uc = (puch[2] & 0x3f)
544	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
545	\| ((RTUNICP)(uch & 0x0f) << 12);
546	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
547	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
548	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
549	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
550	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
551	break;
552	case 2:
553	uc = (puch[1] & 0x3f)
554	\| ((RTUNICP)(uch & 0x1f) << 6);
555	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
556	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
557	break;
558	}
559
560	/* advance */
561	cch -= cb;
562	puch += cb;
563	cwc++;
564	}
565	}
566
567	/* done */
568	*pcwc = cwc;
569	return VINF_SUCCESS;
570	}
571
572
573	/**
574	* Recodes a valid UTF-8 string as UTF-16.
575	*
576	* Since we know the input is valid, we do not perform encoding or length checks.
577	*
578	* @returns iprt status code.
579	* @param psz The UTF-8 string to recode. This is a valid encoding.
580	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
581	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
582	* @param pwsz Where to store the UTF-16 string.
583	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
584	*/
585	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
586	{
587	int rc = VINF_SUCCESS;
588	const unsigned char puch = (const unsigned char )psz;
589	PRTUTF16 pwc = pwsz;
590	while (cch > 0)
591	{
592	/* read the next char and check for terminator. */
593	const unsigned char uch = *puch;
594	if (!uch)
595	break;
596
597	/* check for output overflow */
598	if (RT_UNLIKELY(cwc < 1))
599	{
600	rc = VERR_BUFFER_OVERFLOW;
601	break;
602	}
603	cwc--;
604
605	/* decode and recode the code point */
606	if (!(uch & RT_BIT(7)))
607	{
608	*pwc++ = uch;
609	puch++;
610	cch--;
611	}
612	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
613	{
614	uint16_t uc = (puch[1] & 0x3f)
615	\| ((uint16_t)(uch & 0x1f) << 6);
616	*pwc++ = uc;
617	puch += 2;
618	cch -= 2;
619	}
620	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
621	{
622	uint16_t uc = (puch[2] & 0x3f)
623	\| ((uint16_t)(puch[1] & 0x3f) << 6)
624	\| ((uint16_t)(uch & 0x0f) << 12);
625	*pwc++ = uc;
626	puch += 3;
627	cch -= 3;
628	}
629	else
630	{
631	/* generate surrugate pair */
632	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
633	RTUNICP uc = (puch[3] & 0x3f)
634	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
635	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
636	\| ((RTUNICP)(uch & 0x07) << 18);
637	if (RT_UNLIKELY(cwc < 1))
638	{
639	rc = VERR_BUFFER_OVERFLOW;
640	break;
641	}
642	cwc--;
643
644	uc -= 0x10000;
645	*pwc++ = 0xd800 \| (uc >> 10);
646	*pwc++ = 0xdc00 \| (uc & 0x3ff);
647	puch += 4;
648	cch -= 4;
649	}
650	}
651
652	/* done */
653	*pwc = '\0';
654	return rc;
655	}
656
657
658	RTDECL(int) RTStrToUtf16(const char pszString, PRTUTF16 ppwszString)
659	{
660	/*
661	* Validate input.
662	*/
663	Assert(VALID_PTR(ppwszString));
664	Assert(VALID_PTR(pszString));
665	*ppwszString = NULL;
666
667	/*
668	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
669	*/
670	size_t cwc;
671	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
672	if (RT_SUCCESS(rc))
673	{
674	/*
675	* Allocate buffer.
676	*/
677	PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
678	if (pwsz)
679	{
680	/*
681	* Encode the UTF-16 string.
682	*/
683	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
684	if (RT_SUCCESS(rc))
685	{
686	*ppwszString = pwsz;
687	return rc;
688	}
689	RTMemFree(pwsz);
690	}
691	else
692	rc = VERR_NO_UTF16_MEMORY;
693	}
694	return rc;
695	}
696	RT_EXPORT_SYMBOL(RTStrToUtf16);
697
698
699	RTDECL(int) RTStrToUtf16Ex(const char pszString, size_t cchString, PRTUTF16 ppwsz, size_t cwc, size_t *pcwc)
700	{
701	/*
702	* Validate input.
703	*/
704	Assert(VALID_PTR(pszString));
705	Assert(VALID_PTR(ppwsz));
706	Assert(!pcwc \|\| VALID_PTR(pcwc));
707
708	/*
709	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
710	*/
711	size_t cwcResult;
712	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
713	if (RT_SUCCESS(rc))
714	{
715	if (pcwc)
716	*pcwc = cwcResult;
717
718	/*
719	* Check buffer size / Allocate buffer.
720	*/
721	bool fShouldFree;
722	PRTUTF16 pwszResult;
723	if (cwc > 0 && *ppwsz)
724	{
725	fShouldFree = false;
726	if (cwc <= cwcResult)
727	return VERR_BUFFER_OVERFLOW;
728	pwszResult = *ppwsz;
729	}
730	else
731	{
732	*ppwsz = NULL;
733	fShouldFree = true;
734	cwc = RT_MAX(cwcResult + 1, cwc);
735	pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
736	}
737	if (pwszResult)
738	{
739	/*
740	* Encode the UTF-16 string.
741	*/
742	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
743	if (RT_SUCCESS(rc))
744	{
745	*ppwsz = pwszResult;
746	return rc;
747	}
748	if (fShouldFree)
749	RTMemFree(pwszResult);
750	}
751	else
752	rc = VERR_NO_UTF16_MEMORY;
753	}
754	return rc;
755	}
756	RT_EXPORT_SYMBOL(RTStrToUtf16Ex);
757
758
759	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
760	{
761	size_t cwc;
762	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
763	return RT_SUCCESS(rc) ? cwc : 0;
764	}
765	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
766
767
768	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
769	{
770	size_t cwc;
771	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
772	if (pcwc)
773	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
774	return rc;
775	}
776	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
777
778
779	/**
780	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
781	* @returns rc
782	* @param ppsz The pointer to the string position point.
783	* @param pCp Where to store RTUNICP_INVALID.
784	* @param rc The iprt error code.
785	*/
786	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
787	{
788	/*
789	* Try find a valid encoding.
790	*/
791	(ppsz)++; /* @todo code this! */
792	*pCp = RTUNICP_INVALID;
793	return rc;
794	}
795
796
797	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
798	{
799	RTUNICP Cp;
800	RTStrGetCpExInternal(&psz, &Cp);
801	return Cp;
802	}
803	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
804
805
806	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
807	{
808	const unsigned char puch = (const unsigned char )*ppsz;
809	const unsigned char uch = *puch;
810	RTUNICP uc;
811
812	/* ASCII ? */
813	if (!(uch & RT_BIT(7)))
814	{
815	uc = uch;
816	puch++;
817	}
818	else if (uch & RT_BIT(6))
819	{
820	/* figure the length and validate the first octet. */
821	unsigned cb;
822	if (!(uch & RT_BIT(5)))
823	cb = 2;
824	else if (!(uch & RT_BIT(4)))
825	cb = 3;
826	else if (!(uch & RT_BIT(3)))
827	cb = 4;
828	else if (!(uch & RT_BIT(2)))
829	cb = 5;
830	else if (!(uch & RT_BIT(1)))
831	cb = 6;
832	else
833	{
834	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
835	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
836	}
837
838	/* validate the rest */
839	switch (cb)
840	{
841	case 6:
842	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
843	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
844	case 5:
845	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
846	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
847	case 4:
848	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
849	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
850	case 3:
851	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
852	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
853	case 2:
854	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
855	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
856	break;
857	}
858
859	/* get and validate the code point. */
860	switch (cb)
861	{
862	case 6:
863	uc = (puch[5] & 0x3f)
864	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
865	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
866	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
867	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
868	\| ((RTUNICP)(uch & 0x01) << 30);
869	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
870	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
871	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
872	break;
873	case 5:
874	uc = (puch[4] & 0x3f)
875	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
876	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
877	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
878	\| ((RTUNICP)(uch & 0x03) << 24);
879	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
880	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
881	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
882	break;
883	case 4:
884	uc = (puch[3] & 0x3f)
885	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
886	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
887	\| ((RTUNICP)(uch & 0x07) << 18);
888	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
889	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
890	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
891	break;
892	case 3:
893	uc = (puch[2] & 0x3f)
894	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
895	\| ((RTUNICP)(uch & 0x0f) << 12);
896	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
897	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
898	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
899	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
900	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
901	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
902	break;
903	case 2:
904	uc = (puch[1] & 0x3f)
905	\| ((RTUNICP)(uch & 0x1f) << 6);
906	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
907	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
908	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
909	break;
910	default: /* impossible, but GCC is bitching. */
911	uc = RTUNICP_INVALID;
912	break;
913	}
914	puch += cb;
915	}
916	else
917	{
918	/* 6th bit is always set. */
919	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
920	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
921	}
922	*pCp = uc;
923	ppsz = (const char )puch;
924	return VINF_SUCCESS;
925	}
926	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
927
928
929	/**
930	* Handle invalid encodings passed to RTStrGetCpNEx().
931	* @returns rc
932	* @param ppsz The pointer to the string position point.
933	* @param pcch Pointer to the string length.
934	* @param pCp Where to store RTUNICP_INVALID.
935	* @param rc The iprt error code.
936	*/
937	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
938	{
939	/*
940	* Try find a valid encoding.
941	*/
942	(ppsz)++; /* @todo code this! */
943	(*pcch)--;
944	*pCp = RTUNICP_INVALID;
945	return rc;
946	}
947
948
949	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
950	{
951	const unsigned char puch = (const unsigned char )*ppsz;
952	const unsigned char uch = *puch;
953	size_t cch = *pcch;
954	RTUNICP uc;
955
956	if (cch == 0)
957	{
958	*pCp = RTUNICP_INVALID;
959	return VERR_END_OF_STRING;
960	}
961
962	/* ASCII ? */
963	if (!(uch & RT_BIT(7)))
964	{
965	uc = uch;
966	puch++;
967	cch--;
968	}
969	else if (uch & RT_BIT(6))
970	{
971	/* figure the length and validate the first octet. */
972	unsigned cb;
973	if (!(uch & RT_BIT(5)))
974	cb = 2;
975	else if (!(uch & RT_BIT(4)))
976	cb = 3;
977	else if (!(uch & RT_BIT(3)))
978	cb = 4;
979	else if (!(uch & RT_BIT(2)))
980	cb = 5;
981	else if (!(uch & RT_BIT(1)))
982	cb = 6;
983	else
984	{
985	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
986	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
987	}
988
989	if (cb > cch)
990	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
991
992	/* validate the rest */
993	switch (cb)
994	{
995	case 6:
996	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
997	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
998	case 5:
999	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1000	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1001	case 4:
1002	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1003	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1004	case 3:
1005	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1006	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1007	case 2:
1008	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1009	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1010	break;
1011	}
1012
1013	/* get and validate the code point. */
1014	switch (cb)
1015	{
1016	case 6:
1017	uc = (puch[5] & 0x3f)
1018	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1019	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1020	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1021	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1022	\| ((RTUNICP)(uch & 0x01) << 30);
1023	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1024	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1025	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1026	break;
1027	case 5:
1028	uc = (puch[4] & 0x3f)
1029	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1030	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1031	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1032	\| ((RTUNICP)(uch & 0x03) << 24);
1033	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1034	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1035	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1036	break;
1037	case 4:
1038	uc = (puch[3] & 0x3f)
1039	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1040	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1041	\| ((RTUNICP)(uch & 0x07) << 18);
1042	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1043	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1044	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1045	break;
1046	case 3:
1047	uc = (puch[2] & 0x3f)
1048	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1049	\| ((RTUNICP)(uch & 0x0f) << 12);
1050	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1051	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1052	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1053	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1054	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1055	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1056	break;
1057	case 2:
1058	uc = (puch[1] & 0x3f)
1059	\| ((RTUNICP)(uch & 0x1f) << 6);
1060	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1061	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1062	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1063	break;
1064	default: /* impossible, but GCC is bitching. */
1065	uc = RTUNICP_INVALID;
1066	break;
1067	}
1068	puch += cb;
1069	cch -= cb;
1070	}
1071	else
1072	{
1073	/* 6th bit is always set. */
1074	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1075	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1076	}
1077	*pCp = uc;
1078	ppsz = (const char )puch;
1079	(*pcch) = cch;
1080	return VINF_SUCCESS;
1081	}
1082	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1083
1084
1085	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1086	{
1087	unsigned char puch = (unsigned char )psz;
1088	if (uc < 0x80)
1089	*puch++ = (unsigned char )uc;
1090	else if (uc < 0x00000800)
1091	{
1092	*puch++ = 0xc0 \| (uc >> 6);
1093	*puch++ = 0x80 \| (uc & 0x3f);
1094	}
1095	else if (uc < 0x00010000)
1096	{
1097	if ( uc < 0x0000d8000
1098	\|\| ( uc > 0x0000dfff
1099	&& uc < 0x0000fffe))
1100	{
1101	*puch++ = 0xe0 \| (uc >> 12);
1102	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1103	*puch++ = 0x80 \| (uc & 0x3f);
1104	}
1105	else
1106	{
1107	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1108	*puch++ = 0x7f;
1109	}
1110	}
1111	else if (uc < 0x00200000)
1112	{
1113	*puch++ = 0xf0 \| (uc >> 18);
1114	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1115	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1116	*puch++ = 0x80 \| (uc & 0x3f);
1117	}
1118	else if (uc < 0x04000000)
1119	{
1120	*puch++ = 0xf8 \| (uc >> 24);
1121	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1122	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1123	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1124	*puch++ = 0x80 \| (uc & 0x3f);
1125	}
1126	else if (uc <= 0x7fffffff)
1127	{
1128	*puch++ = 0xfc \| (uc >> 30);
1129	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1130	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1131	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1132	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1133	*puch++ = 0x80 \| (uc & 0x3f);
1134	}
1135	else
1136	{
1137	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1138	*puch++ = 0x7f;
1139	}
1140
1141	return (char *)puch;
1142	}
1143	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1144
1145
1146	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1147	{
1148	if (pszStart < psz)
1149	{
1150	/* simple char? */
1151	const unsigned char puch = (const unsigned char )psz;
1152	unsigned uch = *--puch;
1153	if (!(uch & RT_BIT(7)))
1154	return (char *)puch;
1155	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1156
1157	/* two or more. */
1158	uint32_t uMask = 0xffffffc0;
1159	while ( (const unsigned char *)pszStart < puch
1160	&& !(uMask & 1))
1161	{
1162	unsigned uch = *--puch;
1163	if ((uch & 0xc0) != 0x80)
1164	{
1165	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1166	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1167	(char *)pszStart);
1168	return (char *)puch;
1169	}
1170	uMask >>= 1;
1171	}
1172	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1173	}
1174	return (char *)pszStart;
1175	}
1176	RT_EXPORT_SYMBOL(RTStrPrevCp);
1177
1178
1179	/**
1180	* Performs a case sensitive string compare between two UTF-8 strings.
1181	*
1182	* Encoding errors are ignored by the current implementation. So, the only
1183	* difference between this and the CRT strcmp function is the handling of
1184	* NULL arguments.
1185	*
1186	* @returns < 0 if the first string less than the second string.
1187	* @returns 0 if the first string identical to the second string.
1188	* @returns > 0 if the first string greater than the second string.
1189	* @param psz1 First UTF-8 string. Null is allowed.
1190	* @param psz2 Second UTF-8 string. Null is allowed.
1191	*/
1192	RTDECL(int) RTStrCmp(const char psz1, const char psz2)
1193	{
1194	if (psz1 == psz2)
1195	return 0;
1196	if (!psz1)
1197	return -1;
1198	if (!psz2)
1199	return 1;
1200
1201	return strcmp(psz1, psz2);
1202	}
1203	RT_EXPORT_SYMBOL(RTStrCmp);
1204
1205
1206	/**
1207	* Performs a case sensitive string compare between two UTF-8 strings, given
1208	* a maximum string length.
1209	*
1210	* Encoding errors are ignored by the current implementation. So, the only
1211	* difference between this and the CRT strncmp function is the handling of
1212	* NULL arguments.
1213	*
1214	* @returns < 0 if the first string less than the second string.
1215	* @returns 0 if the first string identical to the second string.
1216	* @returns > 0 if the first string greater than the second string.
1217	* @param psz1 First UTF-8 string. Null is allowed.
1218	* @param psz2 Second UTF-8 string. Null is allowed.
1219	* @param cchMax The maximum string length
1220	*/
1221	RTDECL(int) RTStrNCmp(const char psz1, const char psz2, size_t cchMax)
1222	{
1223	if (psz1 == psz2)
1224	return 0;
1225	if (!psz1)
1226	return -1;
1227	if (!psz2)
1228	return 1;
1229
1230	return strncmp(psz1, psz2, cchMax);
1231	}
1232	RT_EXPORT_SYMBOL(RTStrNCmp);
1233
1234
1235	/**
1236	* Performs a case insensitive string compare between two UTF-8 strings.
1237	*
1238	* This is a simplified compare, as only the simplified lower/upper case folding
1239	* specified by the unicode specs are used. It does not consider character pairs
1240	* as they are used in some languages, just simple upper & lower case compares.
1241	*
1242	* The result is the difference between the mismatching codepoints after they
1243	* both have been lower cased.
1244	*
1245	* If the string encoding is invalid the function will assert (strict builds)
1246	* and use RTStrCmp for the remainder of the string.
1247	*
1248	* @returns < 0 if the first string less than the second string.
1249	* @returns 0 if the first string identical to the second string.
1250	* @returns > 0 if the first string greater than the second string.
1251	* @param psz1 First UTF-8 string. Null is allowed.
1252	* @param psz2 Second UTF-8 string. Null is allowed.
1253	*/
1254	RTDECL(int) RTStrICmp(const char psz1, const char psz2)
1255	{
1256	if (psz1 == psz2)
1257	return 0;
1258	if (!psz1)
1259	return -1;
1260	if (!psz2)
1261	return 1;
1262
1263	const char *pszStart1 = psz1;
1264	for (;;)
1265	{
1266	/* Get the codepoints */
1267	RTUNICP cp1;
1268	int rc = RTStrGetCpEx(&psz1, &cp1);
1269	if (RT_FAILURE(rc))
1270	{
1271	AssertRC(rc);
1272	psz1--;
1273	break;
1274	}
1275
1276	RTUNICP cp2;
1277	rc = RTStrGetCpEx(&psz2, &cp2);
1278	if (RT_FAILURE(rc))
1279	{
1280	AssertRC(rc);
1281	psz2--;
1282	psz1 = RTStrPrevCp(pszStart1, psz1);
1283	break;
1284	}
1285
1286	/* compare */
1287	int iDiff = cp1 - cp2;
1288	if (iDiff)
1289	{
1290	iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1291	if (iDiff)
1292	{
1293	iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1294	if (iDiff)
1295	return iDiff;
1296	}
1297	}
1298
1299	/* hit the terminator? */
1300	if (!cp1)
1301	return 0;
1302	}
1303
1304	/* Hit some bad encoding, continue in case insensitive mode. */
1305	return RTStrCmp(psz1, psz2);
1306	}
1307	RT_EXPORT_SYMBOL(RTStrICmp);
1308
1309
1310	/**
1311	* Performs a case insensitive string compare between two UTF-8 strings, given a
1312	* maximum string length.
1313	*
1314	* This is a simplified compare, as only the simplified lower/upper case folding
1315	* specified by the unicode specs are used. It does not consider character pairs
1316	* as they are used in some languages, just simple upper & lower case compares.
1317	*
1318	* The result is the difference between the mismatching codepoints after they
1319	* both have been lower cased.
1320	*
1321	* If the string encoding is invalid the function will assert (strict builds)
1322	* and use RTStrCmp for the remainder of the string.
1323	*
1324	* @returns < 0 if the first string less than the second string.
1325	* @returns 0 if the first string identical to the second string.
1326	* @returns > 0 if the first string greater than the second string.
1327	* @param psz1 First UTF-8 string. Null is allowed.
1328	* @param psz2 Second UTF-8 string. Null is allowed.
1329	* @param cchMax Maximum string length
1330	*/
1331	RTDECL(int) RTStrNICmp(const char psz1, const char psz2, size_t cchMax)
1332	{
1333	if (cchMax == 0)
1334	return 0;
1335	if (psz1 == psz2)
1336	return 0;
1337	if (!psz1)
1338	return -1;
1339	if (!psz2)
1340	return 1;
1341
1342	for (;;)
1343	{
1344	/* Get the codepoints */
1345	RTUNICP cp1;
1346	size_t cchMax2 = cchMax;
1347	int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);
1348	if (RT_FAILURE(rc))
1349	{
1350	AssertRC(rc);
1351	psz1--;
1352	cchMax++;
1353	break;
1354	}
1355
1356	RTUNICP cp2;
1357	rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);
1358	if (RT_FAILURE(rc))
1359	{
1360	AssertRC(rc);
1361	psz2--;
1362	psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
1363	cchMax = cchMax2 + 1;
1364	break;
1365	}
1366
1367	/* compare */
1368	int iDiff = cp1 - cp2;
1369	if (iDiff)
1370	{
1371	iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1372	if (iDiff)
1373	{
1374	iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1375	if (iDiff)
1376	return iDiff;
1377	}
1378	}
1379
1380	/* hit the terminator? */
1381	if (!cp1 \|\| cchMax == 0)
1382	return 0;
1383	}
1384
1385	/* Hit some bad encoding, continue in case insensitive mode. */
1386	return RTStrNCmp(psz1, psz2, cchMax);
1387	}
1388	RT_EXPORT_SYMBOL(RTStrNICmp);
1389
1390
1391	RTDECL(char ) RTStrStr(const char pszHaystack, const char *pszNeedle)
1392	{
1393	/* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1394	if (!pszHaystack)
1395	return NULL;
1396	if (!pszNeedle)
1397	return NULL;
1398
1399	/* The rest is CRT. */
1400	return (char *)strstr(pszHaystack, pszNeedle);
1401	}
1402	RT_EXPORT_SYMBOL(RTStrStr);
1403
1404
1405	RTDECL(char ) RTStrIStr(const char pszHaystack, const char *pszNeedle)
1406	{
1407	/* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1408	if (!pszHaystack)
1409	return NULL;
1410	if (!pszNeedle)
1411	return NULL;
1412
1413	/* The empty string matches everything. */
1414	if (!*pszNeedle)
1415	return (char *)pszHaystack;
1416
1417	/*
1418	* The search strategy is to pick out the first char of the needle, fold it,
1419	* and match it against the haystack code point by code point. When encountering
1420	* a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
1421	*/
1422	const char * const pszNeedleStart = pszNeedle;
1423	RTUNICP Cp0;
1424	RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
1425	size_t const cchNeedle = strlen(pszNeedle);
1426	size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
1427	RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
1428	RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
1429	if ( Cp0Lower == Cp0Upper
1430	&& Cp0Lower == Cp0)
1431	{
1432	/* Cp0 is not a case sensitive char. */
1433	for (;;)
1434	{
1435	RTUNICP Cp;
1436	RTStrGetCpEx(&pszHaystack, &Cp);
1437	if (!Cp)
1438	break;
1439	if ( Cp == Cp0
1440	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1441	return (char *)pszHaystack - cchNeedleCp0;
1442	}
1443	}
1444	else if ( Cp0Lower == Cp0
1445	\|\| Cp0Upper != Cp0)
1446	{
1447	/* Cp0 is case sensitive */
1448	for (;;)
1449	{
1450	RTUNICP Cp;
1451	RTStrGetCpEx(&pszHaystack, &Cp);
1452	if (!Cp)
1453	break;
1454	if ( ( Cp == Cp0Upper
1455	\|\| Cp == Cp0Lower)
1456	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1457	return (char *)pszHaystack - cchNeedleCp0;
1458	}
1459	}
1460	else
1461	{
1462	/* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
1463	for (;;)
1464	{
1465	RTUNICP Cp;
1466	RTStrGetCpEx(&pszHaystack, &Cp);
1467	if (!Cp)
1468	break;
1469	if ( ( Cp == Cp0
1470	\|\| Cp == Cp0Upper
1471	\|\| Cp == Cp0Lower)
1472	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1473	return (char *)pszHaystack - cchNeedleCp0;
1474	}
1475	}
1476
1477
1478	return NULL;
1479	}
1480	RT_EXPORT_SYMBOL(RTStrIStr);
1481
1482
1483	RTDECL(char ) RTStrToLower(char psz)
1484	{
1485	/*
1486	* Loop the code points in the string, converting them one by one.
1487	* ASSUMES that the code points for upper and lower case are encoded
1488	* with the exact same length.
1489	*/
1490	/** @todo Handled bad encodings correctly+quietly, remove assumption,
1491	* optimize. */
1492	char *pszCur = psz;
1493	while (*pszCur)
1494	{
1495	RTUNICP cp = RTStrGetCp(pszCur);
1496	cp = RTUniCpToLower(cp);
1497	pszCur = RTStrPutCp(pszCur, cp);
1498	}
1499	return psz;
1500	}
1501	RT_EXPORT_SYMBOL(RTStrToLower);
1502
1503
1504	RTDECL(char ) RTStrToUpper(char psz)
1505	{
1506	/*
1507	* Loop the code points in the string, converting them one by one.
1508	* ASSUMES that the code points for upper and lower case are encoded
1509	* with the exact same length.
1510	*/
1511	/** @todo Handled bad encodings correctly+quietly, remove assumption,
1512	* optimize. */
1513	char *pszCur = psz;
1514	while(*pszCur)
1515	{
1516	RTUNICP cp = RTStrGetCp(pszCur);
1517	cp = RTUniCpToUpper(cp);
1518	pszCur = RTStrPutCp(pszCur, cp);
1519	}
1520	return psz;
1521	}
1522	RT_EXPORT_SYMBOL(RTStrToUpper);
1523

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 23594

Download in other formats: