utf-8.cpp@ 18570

Last change on this file since 18570 was 18570, checked in by vboxsync, 16 years ago
RTStrIStr: fixed inverted test.
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 53.0 KB

Line
1	/* $Id: utf-8.cpp 18570 2009-03-31 13:07:44Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2007 Sun Microsystems, Inc.
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*
26	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27	* Clara, CA 95054 USA or visit http://www.sun.com if you need
28	* additional information or have any questions.
29	*/
30
31
32	/*******************************************************************************
33	* Header Files *
34	*******************************************************************************/
35	#include <iprt/string.h>
36	#include <iprt/uni.h>
37	#include <iprt/alloc.h>
38	#include <iprt/assert.h>
39	#include <iprt/err.h>
40	#include "internal/string.h"
41
42
43
44	/**
45	* Get get length in code points of a UTF-8 encoded string.
46	* The string is validated while doing this.
47	*
48	* @returns IPRT status code.
49	* @param psz Pointer to the UTF-8 string.
50	* @param cch The max length of the string. (btw cch = cb)
51	* Use RTSTR_MAX if all of the string is to be examined.
52	* @param pcuc Where to store the length in unicode code points.
53	* @param pcchActual Where to store the actual size of the UTF-8 string
54	* on success (cch = cb again). Optional.
55	*/
56	static int rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
57	{
58	const unsigned char puch = (const unsigned char )psz;
59	size_t cCodePoints = 0;
60	while (cch > 0)
61	{
62	const unsigned char uch = *puch;
63	if (!uch)
64	break;
65	if (uch & RT_BIT(7))
66	{
67	/* figure sequence length and validate the first byte */
68	unsigned cb;
69	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
70	cb = 2;
71	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
72	cb = 3;
73	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
74	cb = 4;
75	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
76	cb = 5;
77	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
78	cb = 6;
79	else
80	{
81	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
82	return VERR_INVALID_UTF8_ENCODING;
83	}
84
85	/* check length */
86	if (cb > cch)
87	{
88	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
89	return VERR_INVALID_UTF8_ENCODING;
90	}
91
92	/* validate the rest */
93	switch (cb)
94	{
95	case 6:
96	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97	case 5:
98	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99	case 4:
100	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101	case 3:
102	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103	case 2:
104	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
105	break;
106	}
107
108	/* validate the code point. */
109	RTUNICP uc;
110	switch (cb)
111	{
112	case 6:
113	uc = (puch[5] & 0x3f)
114	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
115	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
116	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
117	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
118	\| ((RTUNICP)(uch & 0x01) << 30);
119	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
120	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
121	break;
122	case 5:
123	uc = (puch[4] & 0x3f)
124	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
125	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
126	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
127	\| ((RTUNICP)(uch & 0x03) << 24);
128	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
129	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
130	break;
131	case 4:
132	uc = (puch[3] & 0x3f)
133	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
134	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
135	\| ((RTUNICP)(uch & 0x07) << 18);
136	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
137	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
138	break;
139	case 3:
140	uc = (puch[2] & 0x3f)
141	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
142	\| ((RTUNICP)(uch & 0x0f) << 12);
143	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
144	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
145	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
146	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
147	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
148	break;
149	case 2:
150	uc = (puch[1] & 0x3f)
151	\| ((RTUNICP)(uch & 0x1f) << 6);
152	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
153	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
154	break;
155	}
156
157	/* advance */
158	cch -= cb;
159	puch += cb;
160	}
161	else
162	{
163	/* one ASCII byte */
164	puch++;
165	cch--;
166	}
167	cCodePoints++;
168	}
169
170	/* done */
171	*pcuc = cCodePoints;
172	if (pcchActual)
173	pcchActual = puch - (unsigned char const )psz;
174	return VINF_SUCCESS;
175	}
176
177
178	/**
179	* Decodes and UTF-8 string into an array of unicode code point.
180	*
181	* Since we know the input is valid, we do not perform encoding or length checks.
182	*
183	* @returns iprt status code.
184	* @param psz The UTF-8 string to recode. This is a valid encoding.
185	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
186	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
187	* @param paCps Where to store the code points array.
188	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
189	* @param pcCps Where to store the actual number of decoded code points. This excludes the terminator.
190	*/
191	static int rtUtf8Decode(const char psz, size_t cch, PRTUNICP paCps, size_t cCps, size_t pcCps)
192	{
193	int rc = VINF_SUCCESS;
194	const unsigned char puch = (const unsigned char )psz;
195	const PRTUNICP pCpEnd = paCps + cCps;
196	PRTUNICP pCp = paCps;
197	Assert(pCpEnd >= pCp);
198	while (cch > 0)
199	{
200	/* read the next char and check for terminator. */
201	const unsigned char uch = *puch;
202	if (!uch)
203	break;
204
205	/* check for output overflow */
206	if (pCp >= pCpEnd)
207	{
208	rc = VERR_BUFFER_OVERFLOW;
209	break;
210	}
211
212	/* decode and recode the code point */
213	if (!(uch & RT_BIT(7)))
214	{
215	*pCp++ = uch;
216	puch++;
217	cch--;
218	}
219	#ifdef RT_STRICT
220	else if (!(uch & RT_BIT(6)))
221	AssertMsgFailed(("Internal error!\n"));
222	#endif
223	else if (!(uch & RT_BIT(5)))
224	{
225	*pCp++ = (puch[1] & 0x3f)
226	\| ((uint16_t)(uch & 0x1f) << 6);
227	puch += 2;
228	cch -= 2;
229	}
230	else if (!(uch & RT_BIT(4)))
231	{
232	*pCp++ = (puch[2] & 0x3f)
233	\| ((uint16_t)(puch[1] & 0x3f) << 6)
234	\| ((uint16_t)(uch & 0x0f) << 12);
235	puch += 3;
236	cch -= 3;
237	}
238	else if (!(uch & RT_BIT(3)))
239	{
240	*pCp++ = (puch[3] & 0x3f)
241	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
242	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
243	\| ((RTUNICP)(uch & 0x07) << 18);
244	puch += 4;
245	cch -= 4;
246	}
247	else if (!(uch & RT_BIT(2)))
248	{
249	*pCp++ = (puch[4] & 0x3f)
250	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
251	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
252	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
253	\| ((RTUNICP)(uch & 0x03) << 24);
254	puch += 5;
255	cch -= 6;
256	}
257	else
258	{
259	Assert(!(uch & RT_BIT(1)));
260	*pCp++ = (puch[5] & 0x3f)
261	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
262	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
263	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
264	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
265	\| ((RTUNICP)(uch & 0x01) << 30);
266	puch += 6;
267	cch -= 6;
268	}
269	}
270
271	/* done */
272	*pCp = 0;
273	*pcCps = pCp - paCps;
274	return rc;
275	}
276
277
278	RTDECL(size_t) RTStrUniLen(const char *psz)
279	{
280	size_t cCodePoints;
281	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
282	return RT_SUCCESS(rc) ? cCodePoints : 0;
283	}
284
285
286	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
287	{
288	size_t cCodePoints;
289	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
290	if (pcCps)
291	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
292	return rc;
293	}
294
295
296	RTDECL(int) RTStrValidateEncoding(const char *psz)
297	{
298	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
299	}
300
301
302	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
303	{
304	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
305	AssertPtr(psz);
306
307	/*
308	* Use rtUtf8Length for the job.
309	*/
310	size_t cchActual;
311	size_t cCpsIgnored;
312	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
313	if (RT_SUCCESS(rc))
314	{
315	if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
316	&& cchActual >= cch)
317	rc = VERR_BUFFER_OVERFLOW;
318	}
319	return rc;
320
321
322	return RTStrUniLenEx(psz, cch, &cCpsIgnored);
323	}
324
325
326	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
327	{
328	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
329	return RT_SUCCESS(rc);
330	}
331
332
333	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
334	{
335	/*
336	* Validate input.
337	*/
338	Assert(VALID_PTR(pszString));
339	Assert(VALID_PTR(ppaCps));
340	*ppaCps = NULL;
341
342	/*
343	* Validate the UTF-8 input and count its code points.
344	*/
345	size_t cCps;
346	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
347	if (RT_SUCCESS(rc))
348	{
349	/*
350	* Allocate buffer.
351	*/
352	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
353	if (paCps)
354	{
355	/*
356	* Decode the string.
357	*/
358	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps, &cCps);
359	if (RT_SUCCESS(rc))
360	{
361	*ppaCps = paCps;
362	return rc;
363	}
364	RTMemFree(paCps);
365	}
366	else
367	rc = VERR_NO_CODE_POINT_MEMORY;
368	}
369	return rc;
370	}
371
372
373	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
374	{
375	/*
376	* Validate input.
377	*/
378	Assert(VALID_PTR(pszString));
379	Assert(VALID_PTR(ppaCps));
380	Assert(!pcCps \|\| VALID_PTR(pcCps));
381
382	/*
383	* Validate the UTF-8 input and count the code points.
384	*/
385	size_t cCpsResult;
386	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
387	if (RT_SUCCESS(rc))
388	{
389	if (pcCps)
390	*pcCps = cCpsResult;
391
392	/*
393	* Check buffer size / Allocate buffer.
394	*/
395	bool fShouldFree;
396	PRTUNICP paCpsResult;
397	if (cCps > 0 && *ppaCps)
398	{
399	fShouldFree = false;
400	if (cCps <= cCpsResult)
401	return VERR_BUFFER_OVERFLOW;
402	paCpsResult = *ppaCps;
403	}
404	else
405	{
406	*ppaCps = NULL;
407	fShouldFree = true;
408	cCps = RT_MAX(cCpsResult + 1, cCps);
409	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
410	}
411	if (paCpsResult)
412	{
413	/*
414	* Encode the UTF-16 string.
415	*/
416	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1, &cCpsResult);
417	if (RT_SUCCESS(rc))
418	{
419	*ppaCps = paCpsResult;
420	return rc;
421	}
422	if (fShouldFree)
423	RTMemFree(paCpsResult);
424	}
425	else
426	rc = VERR_NO_CODE_POINT_MEMORY;
427	}
428	return rc;
429	}
430
431
432	/**
433	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
434	*
435	* @returns IPRT status code.
436	* @param psz Pointer to the UTF-8 string.
437	* @param cch The max length of the string. (btw cch = cb)
438	* Use RTSTR_MAX if all of the string is to be examined.s
439	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
440	*/
441	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
442	{
443	const unsigned char puch = (const unsigned char )psz;
444	size_t cwc = 0;
445	while (cch > 0)
446	{
447	const unsigned char uch = *puch;
448	if (!uch)
449	break;
450	if (!(uch & RT_BIT(7)))
451	{
452	/* one ASCII byte */
453	cwc++;
454	puch++;
455	cch--;
456	}
457	else
458	{
459	/* figure sequence length and validate the first byte */
460	unsigned cb;
461	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
462	cb = 2;
463	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
464	cb = 3;
465	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
466	cb = 4;
467	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
468	cb = 5;
469	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
470	cb = 6;
471	else
472	{
473	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
474	return VERR_INVALID_UTF8_ENCODING;
475	}
476
477	/* check length */
478	if (cb > cch)
479	{
480	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
481	return VERR_INVALID_UTF8_ENCODING;
482	}
483
484	/* validate the rest */
485	switch (cb)
486	{
487	case 6:
488	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
489	case 5:
490	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
491	case 4:
492	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
493	case 3:
494	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
495	case 2:
496	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
497	break;
498	}
499
500	/* validate the code point. */
501	RTUNICP uc;
502	switch (cb)
503	{
504	case 6:
505	uc = (puch[5] & 0x3f)
506	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
507	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
508	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
509	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
510	\| ((RTUNICP)(uch & 0x01) << 30);
511	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
512	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
513	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
514	return VERR_CANT_RECODE_AS_UTF16;
515	case 5:
516	uc = (puch[4] & 0x3f)
517	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
518	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
519	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
520	\| ((RTUNICP)(uch & 0x03) << 24);
521	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
522	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
523	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
524	return VERR_CANT_RECODE_AS_UTF16;
525	case 4:
526	uc = (puch[3] & 0x3f)
527	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
528	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
529	\| ((RTUNICP)(uch & 0x07) << 18);
530	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
531	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
532	RTStrAssertMsgReturn(uc <= 0x0010ffff,
533	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
534	cwc++;
535	break;
536	case 3:
537	uc = (puch[2] & 0x3f)
538	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
539	\| ((RTUNICP)(uch & 0x0f) << 12);
540	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
541	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
542	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
543	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
544	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
545	break;
546	case 2:
547	uc = (puch[1] & 0x3f)
548	\| ((RTUNICP)(uch & 0x1f) << 6);
549	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
550	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
551	break;
552	}
553
554	/* advance */
555	cch -= cb;
556	puch += cb;
557	cwc++;
558	}
559	}
560
561	/* done */
562	*pcwc = cwc;
563	return VINF_SUCCESS;
564	}
565
566
567	/**
568	* Recodes a valid UTF-8 string as UTF-16.
569	*
570	* Since we know the input is valid, we do not perform encoding or length checks.
571	*
572	* @returns iprt status code.
573	* @param psz The UTF-8 string to recode. This is a valid encoding.
574	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
575	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
576	* @param pwsz Where to store the UTF-16 string.
577	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
578	* @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
579	*/
580	static int rtUtf8RecodeAsUtf16(const char psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t pcwc)
581	{
582	int rc = VINF_SUCCESS;
583	const unsigned char puch = (const unsigned char )psz;
584	const PRTUTF16 pwszEnd = pwsz + cwc;
585	PRTUTF16 pwc = pwsz;
586	Assert(pwszEnd >= pwc);
587	while (cch > 0)
588	{
589	/* read the next char and check for terminator. */
590	const unsigned char uch = *puch;
591	if (!uch)
592	break;
593
594	/* check for output overflow */
595	if (pwc >= pwszEnd)
596	{
597	rc = VERR_BUFFER_OVERFLOW;
598	break;
599	}
600
601	/* decode and recode the code point */
602	if (!(uch & RT_BIT(7)))
603	{
604	*pwc++ = uch;
605	puch++;
606	cch--;
607	}
608	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
609	{
610	uint16_t uc = (puch[1] & 0x3f)
611	\| ((uint16_t)(uch & 0x1f) << 6);
612	*pwc++ = uc;
613	puch += 2;
614	cch -= 2;
615	}
616	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
617	{
618	uint16_t uc = (puch[2] & 0x3f)
619	\| ((uint16_t)(puch[1] & 0x3f) << 6)
620	\| ((uint16_t)(uch & 0x0f) << 12);
621	*pwc++ = uc;
622	puch += 3;
623	cch -= 3;
624	}
625	else
626	{
627	/* generate surrugate pair */
628	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
629	RTUNICP uc = (puch[3] & 0x3f)
630	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
631	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
632	\| ((RTUNICP)(uch & 0x07) << 18);
633	if (pwc + 1 >= pwszEnd)
634	{
635	rc = VERR_BUFFER_OVERFLOW;
636	break;
637	}
638	uc -= 0x10000;
639	*pwc++ = 0xd800 \| (uc >> 10);
640	*pwc++ = 0xdc00 \| (uc & 0x3ff);
641	puch += 4;
642	cch -= 4;
643	}
644	}
645
646	/* done */
647	*pwc = '\0';
648	*pcwc = pwc - pwsz;
649	return rc;
650	}
651
652
653	RTDECL(int) RTStrToUtf16(const char pszString, PRTUTF16 ppwszString)
654	{
655	/*
656	* Validate input.
657	*/
658	Assert(VALID_PTR(ppwszString));
659	Assert(VALID_PTR(pszString));
660	*ppwszString = NULL;
661
662	/*
663	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
664	*/
665	size_t cwc;
666	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
667	if (RT_SUCCESS(rc))
668	{
669	/*
670	* Allocate buffer.
671	*/
672	PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
673	if (pwsz)
674	{
675	/*
676	* Encode the UTF-16 string.
677	*/
678	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
679	if (RT_SUCCESS(rc))
680	{
681	*ppwszString = pwsz;
682	return rc;
683	}
684	RTMemFree(pwsz);
685	}
686	else
687	rc = VERR_NO_UTF16_MEMORY;
688	}
689	return rc;
690	}
691
692
693	RTDECL(int) RTStrToUtf16Ex(const char pszString, size_t cchString, PRTUTF16 ppwsz, size_t cwc, size_t *pcwc)
694	{
695	/*
696	* Validate input.
697	*/
698	Assert(VALID_PTR(pszString));
699	Assert(VALID_PTR(ppwsz));
700	Assert(!pcwc \|\| VALID_PTR(pcwc));
701
702	/*
703	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
704	*/
705	size_t cwcResult;
706	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
707	if (RT_SUCCESS(rc))
708	{
709	if (pcwc)
710	*pcwc = cwcResult;
711
712	/*
713	* Check buffer size / Allocate buffer.
714	*/
715	bool fShouldFree;
716	PRTUTF16 pwszResult;
717	if (cwc > 0 && *ppwsz)
718	{
719	fShouldFree = false;
720	if (cwc <= cwcResult)
721	return VERR_BUFFER_OVERFLOW;
722	pwszResult = *ppwsz;
723	}
724	else
725	{
726	*ppwsz = NULL;
727	fShouldFree = true;
728	cwc = RT_MAX(cwcResult + 1, cwc);
729	pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
730	}
731	if (pwszResult)
732	{
733	/*
734	* Encode the UTF-16 string.
735	*/
736	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
737	if (RT_SUCCESS(rc))
738	{
739	*ppwsz = pwszResult;
740	return rc;
741	}
742	if (fShouldFree)
743	RTMemFree(pwszResult);
744	}
745	else
746	rc = VERR_NO_UTF16_MEMORY;
747	}
748	return rc;
749	}
750
751
752	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
753	{
754	size_t cwc;
755	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
756	return RT_SUCCESS(rc) ? cwc : 0;
757	}
758
759
760	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
761	{
762	size_t cwc;
763	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
764	if (pcwc)
765	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
766	return rc;
767	}
768
769
770	/**
771	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
772	* @returns rc
773	* @param ppsz The pointer to the string position point.
774	* @param pCp Where to store RTUNICP_INVALID.
775	* @param rc The iprt error code.
776	*/
777	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
778	{
779	/*
780	* Try find a valid encoding.
781	*/
782	(ppsz)++; /* @todo code this! */
783	*pCp = RTUNICP_INVALID;
784	return rc;
785	}
786
787
788	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
789	{
790	RTUNICP Cp;
791	RTStrGetCpExInternal(&psz, &Cp);
792	return Cp;
793	}
794
795
796	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
797	{
798	const unsigned char puch = (const unsigned char )*ppsz;
799	const unsigned char uch = *puch;
800	RTUNICP uc;
801
802	/* ASCII ? */
803	if (!(uch & RT_BIT(7)))
804	{
805	uc = uch;
806	puch++;
807	}
808	else if (uch & RT_BIT(6))
809	{
810	/* figure the length and validate the first octet. */
811	unsigned cb;
812	if (!(uch & RT_BIT(5)))
813	cb = 2;
814	else if (!(uch & RT_BIT(4)))
815	cb = 3;
816	else if (!(uch & RT_BIT(3)))
817	cb = 4;
818	else if (!(uch & RT_BIT(2)))
819	cb = 5;
820	else if (!(uch & RT_BIT(1)))
821	cb = 6;
822	else
823	{
824	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
825	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
826	}
827
828	/* validate the rest */
829	switch (cb)
830	{
831	case 6:
832	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
833	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
834	case 5:
835	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
836	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
837	case 4:
838	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
839	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
840	case 3:
841	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
842	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
843	case 2:
844	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
845	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
846	break;
847	}
848
849	/* get and validate the code point. */
850	switch (cb)
851	{
852	case 6:
853	uc = (puch[5] & 0x3f)
854	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
855	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
856	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
857	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
858	\| ((RTUNICP)(uch & 0x01) << 30);
859	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
860	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
861	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
862	break;
863	case 5:
864	uc = (puch[4] & 0x3f)
865	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
866	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
867	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
868	\| ((RTUNICP)(uch & 0x03) << 24);
869	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
870	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
871	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
872	break;
873	case 4:
874	uc = (puch[3] & 0x3f)
875	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
876	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
877	\| ((RTUNICP)(uch & 0x07) << 18);
878	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
879	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
880	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
881	break;
882	case 3:
883	uc = (puch[2] & 0x3f)
884	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
885	\| ((RTUNICP)(uch & 0x0f) << 12);
886	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
887	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
888	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
889	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
890	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
891	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
892	break;
893	case 2:
894	uc = (puch[1] & 0x3f)
895	\| ((RTUNICP)(uch & 0x1f) << 6);
896	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
897	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
898	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
899	break;
900	default: /* impossible, but GCC is bitching. */
901	uc = RTUNICP_INVALID;
902	break;
903	}
904	puch += cb;
905	}
906	else
907	{
908	/* 6th bit is always set. */
909	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
910	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
911	}
912	*pCp = uc;
913	ppsz = (const char )puch;
914	return VINF_SUCCESS;
915	}
916
917
918	/**
919	* Handle invalid encodings passed to RTStrGetCpNEx().
920	* @returns rc
921	* @param ppsz The pointer to the string position point.
922	* @param pcch Pointer to the string length.
923	* @param pCp Where to store RTUNICP_INVALID.
924	* @param rc The iprt error code.
925	*/
926	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
927	{
928	/*
929	* Try find a valid encoding.
930	*/
931	(ppsz)++; /* @todo code this! */
932	(*pcch)--;
933	*pCp = RTUNICP_INVALID;
934	return rc;
935	}
936
937
938	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
939	{
940	const unsigned char puch = (const unsigned char )*ppsz;
941	const unsigned char uch = *puch;
942	size_t cch = *pcch;
943	RTUNICP uc;
944
945	if (cch == 0)
946	{
947	*pCp = RTUNICP_INVALID;
948	return VERR_END_OF_STRING;
949	}
950
951	/* ASCII ? */
952	if (!(uch & RT_BIT(7)))
953	{
954	uc = uch;
955	puch++;
956	cch--;
957	}
958	else if (uch & RT_BIT(6))
959	{
960	/* figure the length and validate the first octet. */
961	unsigned cb;
962	if (!(uch & RT_BIT(5)))
963	cb = 2;
964	else if (!(uch & RT_BIT(4)))
965	cb = 3;
966	else if (!(uch & RT_BIT(3)))
967	cb = 4;
968	else if (!(uch & RT_BIT(2)))
969	cb = 5;
970	else if (!(uch & RT_BIT(1)))
971	cb = 6;
972	else
973	{
974	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
975	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
976	}
977
978	if (cb > cch)
979	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
980
981	/* validate the rest */
982	switch (cb)
983	{
984	case 6:
985	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
986	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
987	case 5:
988	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
989	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
990	case 4:
991	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
992	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
993	case 3:
994	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
995	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
996	case 2:
997	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
998	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
999	break;
1000	}
1001
1002	/* get and validate the code point. */
1003	switch (cb)
1004	{
1005	case 6:
1006	uc = (puch[5] & 0x3f)
1007	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1008	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1009	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1010	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1011	\| ((RTUNICP)(uch & 0x01) << 30);
1012	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1013	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1014	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1015	break;
1016	case 5:
1017	uc = (puch[4] & 0x3f)
1018	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1019	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1020	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1021	\| ((RTUNICP)(uch & 0x03) << 24);
1022	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1023	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1024	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1025	break;
1026	case 4:
1027	uc = (puch[3] & 0x3f)
1028	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1029	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1030	\| ((RTUNICP)(uch & 0x07) << 18);
1031	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1032	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1033	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1034	break;
1035	case 3:
1036	uc = (puch[2] & 0x3f)
1037	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1038	\| ((RTUNICP)(uch & 0x0f) << 12);
1039	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1040	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1041	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1042	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1043	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1044	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1045	break;
1046	case 2:
1047	uc = (puch[1] & 0x3f)
1048	\| ((RTUNICP)(uch & 0x1f) << 6);
1049	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1050	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1051	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1052	break;
1053	default: /* impossible, but GCC is bitching. */
1054	uc = RTUNICP_INVALID;
1055	break;
1056	}
1057	puch += cb;
1058	cch -= cb;
1059	}
1060	else
1061	{
1062	/* 6th bit is always set. */
1063	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1064	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1065	}
1066	*pCp = uc;
1067	ppsz = (const char )puch;
1068	(*pcch) = cch;
1069	return VINF_SUCCESS;
1070	}
1071
1072
1073	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1074	{
1075	unsigned char puch = (unsigned char )psz;
1076	if (uc < 0x80)
1077	*puch++ = (unsigned char )uc;
1078	else if (uc < 0x00000800)
1079	{
1080	*puch++ = 0xc0 \| (uc >> 6);
1081	*puch++ = 0x80 \| (uc & 0x3f);
1082	}
1083	else if (uc < 0x00010000)
1084	{
1085	if ( uc < 0x0000d8000
1086	\|\| ( uc > 0x0000dfff
1087	&& uc < 0x0000fffe))
1088	{
1089	*puch++ = 0xe0 \| (uc >> 12);
1090	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1091	*puch++ = 0x80 \| (uc & 0x3f);
1092	}
1093	else
1094	{
1095	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1096	*puch++ = 0x7f;
1097	}
1098	}
1099	else if (uc < 0x00200000)
1100	{
1101	*puch++ = 0xf0 \| (uc >> 18);
1102	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1103	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1104	*puch++ = 0x80 \| (uc & 0x3f);
1105	}
1106	else if (uc < 0x04000000)
1107	{
1108	*puch++ = 0xf1 \| (uc >> 24);
1109	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1110	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1111	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1112	*puch++ = 0x80 \| (uc & 0x3f);
1113	}
1114	else if (uc <= 0x7fffffff)
1115	{
1116	*puch++ = 0xf3 \| (uc >> 30);
1117	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1118	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1119	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1120	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1121	*puch++ = 0x80 \| (uc & 0x3f);
1122	}
1123	else
1124	{
1125	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1126	*puch++ = 0x7f;
1127	}
1128
1129	return (char *)puch;
1130	}
1131
1132
1133	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1134	{
1135	if (pszStart < psz)
1136	{
1137	/* simple char? */
1138	const unsigned char puch = (const unsigned char )psz;
1139	unsigned uch = *--puch;
1140	if (!(uch & RT_BIT(7)))
1141	return (char *)puch;
1142	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1143
1144	/* two or more. */
1145	uint32_t uMask = 0xffffffc0;
1146	while ( (const unsigned char *)pszStart < puch
1147	&& !(uMask & 1))
1148	{
1149	unsigned uch = *--puch;
1150	if ((uch & 0xc0) != 0x80)
1151	{
1152	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1153	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1154	(char *)pszStart);
1155	return (char *)puch;
1156	}
1157	uMask >>= 1;
1158	}
1159	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1160	}
1161	return (char *)pszStart;
1162	}
1163
1164
1165	/**
1166	* Performs a case sensitive string compare between two UTF-8 strings.
1167	*
1168	* Encoding errors are ignored by the current implementation. So, the only
1169	* difference between this and the CRT strcmp function is the handling of
1170	* NULL arguments.
1171	*
1172	* @returns < 0 if the first string less than the second string.
1173	* @returns 0 if the first string identical to the second string.
1174	* @returns > 0 if the first string greater than the second string.
1175	* @param psz1 First UTF-8 string. Null is allowed.
1176	* @param psz2 Second UTF-8 string. Null is allowed.
1177	*/
1178	RTDECL(int) RTStrCmp(const char psz1, const char psz2)
1179	{
1180	if (psz1 == psz2)
1181	return 0;
1182	if (!psz1)
1183	return -1;
1184	if (!psz2)
1185	return 1;
1186
1187	return strcmp(psz1, psz2);
1188	}
1189
1190
1191	/**
1192	* Performs a case sensitive string compare between two UTF-8 strings, given
1193	* a maximum string length.
1194	*
1195	* Encoding errors are ignored by the current implementation. So, the only
1196	* difference between this and the CRT strncmp function is the handling of
1197	* NULL arguments.
1198	*
1199	* @returns < 0 if the first string less than the second string.
1200	* @returns 0 if the first string identical to the second string.
1201	* @returns > 0 if the first string greater than the second string.
1202	* @param psz1 First UTF-8 string. Null is allowed.
1203	* @param psz2 Second UTF-8 string. Null is allowed.
1204	* @param cchMax The maximum string length
1205	*/
1206	RTDECL(int) RTStrNCmp(const char psz1, const char psz2, size_t cchMax)
1207	{
1208	if (psz1 == psz2)
1209	return 0;
1210	if (!psz1)
1211	return -1;
1212	if (!psz2)
1213	return 1;
1214
1215	return strncmp(psz1, psz2, cchMax);
1216	}
1217
1218
1219	/**
1220	* Performs a case insensitive string compare between two UTF-8 strings.
1221	*
1222	* This is a simplified compare, as only the simplified lower/upper case folding
1223	* specified by the unicode specs are used. It does not consider character pairs
1224	* as they are used in some languages, just simple upper & lower case compares.
1225	*
1226	* The result is the difference between the mismatching codepoints after they
1227	* both have been lower cased.
1228	*
1229	* If the string encoding is invalid the function will assert (strict builds)
1230	* and use RTStrCmp for the remainder of the string.
1231	*
1232	* @returns < 0 if the first string less than the second string.
1233	* @returns 0 if the first string identical to the second string.
1234	* @returns > 0 if the first string greater than the second string.
1235	* @param psz1 First UTF-8 string. Null is allowed.
1236	* @param psz2 Second UTF-8 string. Null is allowed.
1237	*/
1238	RTDECL(int) RTStrICmp(const char psz1, const char psz2)
1239	{
1240	if (psz1 == psz2)
1241	return 0;
1242	if (!psz1)
1243	return -1;
1244	if (!psz2)
1245	return 1;
1246
1247	const char *pszStart1 = psz1;
1248	for (;;)
1249	{
1250	/* Get the codepoints */
1251	RTUNICP cp1;
1252	int rc = RTStrGetCpEx(&psz1, &cp1);
1253	if (RT_FAILURE(rc))
1254	{
1255	AssertRC(rc);
1256	psz1--;
1257	break;
1258	}
1259
1260	RTUNICP cp2;
1261	rc = RTStrGetCpEx(&psz2, &cp2);
1262	if (RT_FAILURE(rc))
1263	{
1264	AssertRC(rc);
1265	psz2--;
1266	psz1 = RTStrPrevCp(pszStart1, psz1);
1267	break;
1268	}
1269
1270	/* compare */
1271	int iDiff = cp1 - cp2;
1272	if (iDiff)
1273	{
1274	iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1275	if (iDiff)
1276	{
1277	iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1278	if (iDiff)
1279	return iDiff;
1280	}
1281	}
1282
1283	/* hit the terminator? */
1284	if (!cp1)
1285	return 0;
1286	}
1287
1288	/* Hit some bad encoding, continue in case insensitive mode. */
1289	return RTStrCmp(psz1, psz2);
1290	}
1291
1292
1293	/**
1294	* Performs a case insensitive string compare between two UTF-8 strings, given a
1295	* maximum string length.
1296	*
1297	* This is a simplified compare, as only the simplified lower/upper case folding
1298	* specified by the unicode specs are used. It does not consider character pairs
1299	* as they are used in some languages, just simple upper & lower case compares.
1300	*
1301	* The result is the difference between the mismatching codepoints after they
1302	* both have been lower cased.
1303	*
1304	* If the string encoding is invalid the function will assert (strict builds)
1305	* and use RTStrCmp for the remainder of the string.
1306	*
1307	* @returns < 0 if the first string less than the second string.
1308	* @returns 0 if the first string identical to the second string.
1309	* @returns > 0 if the first string greater than the second string.
1310	* @param psz1 First UTF-8 string. Null is allowed.
1311	* @param psz2 Second UTF-8 string. Null is allowed.
1312	* @param cchMax Maximum string length
1313	*/
1314	RTDECL(int) RTStrNICmp(const char psz1, const char psz2, size_t cchMax)
1315	{
1316	if (cchMax == 0)
1317	return 0;
1318	if (psz1 == psz2)
1319	return 0;
1320	if (!psz1)
1321	return -1;
1322	if (!psz2)
1323	return 1;
1324
1325	for (;;)
1326	{
1327	/* Get the codepoints */
1328	RTUNICP cp1;
1329	size_t cchMax2 = cchMax;
1330	int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);
1331	if (RT_FAILURE(rc))
1332	{
1333	AssertRC(rc);
1334	psz1--;
1335	cchMax++;
1336	break;
1337	}
1338
1339	RTUNICP cp2;
1340	rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);
1341	if (RT_FAILURE(rc))
1342	{
1343	AssertRC(rc);
1344	psz2--;
1345	psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
1346	cchMax = cchMax2 + 1;
1347	break;
1348	}
1349
1350	/* compare */
1351	int iDiff = cp1 - cp2;
1352	if (iDiff)
1353	{
1354	iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
1355	if (iDiff)
1356	{
1357	iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
1358	if (iDiff)
1359	return iDiff;
1360	}
1361	}
1362
1363	/* hit the terminator? */
1364	if (!cp1 \|\| cchMax == 0)
1365	return 0;
1366	}
1367
1368	/* Hit some bad encoding, continue in case insensitive mode. */
1369	return RTStrNCmp(psz1, psz2, cchMax);
1370	}
1371
1372
1373	RTDECL(char ) RTStrStr(const char pszHaystack, const char *pszNeedle)
1374	{
1375	/* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1376	if (!pszHaystack)
1377	return NULL;
1378	if (!pszNeedle)
1379	return NULL;
1380
1381	/* The rest is CRT. */
1382	return (char *)strstr(pszHaystack, pszNeedle);
1383	}
1384
1385
1386	RTDECL(char ) RTStrIStr(const char pszHaystack, const char *pszNeedle)
1387	{
1388	/* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
1389	if (!pszHaystack)
1390	return NULL;
1391	if (!pszNeedle)
1392	return NULL;
1393
1394	/* The empty string matches everything. */
1395	if (!*pszNeedle)
1396	return (char *)pszHaystack;
1397
1398	/*
1399	* The search strategy is to pick out the first char of the needle, fold it,
1400	* and match it against the haystack code point by code point. When encountering
1401	* a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
1402	*/
1403	const char * const pszNeedleStart = pszNeedle;
1404	RTUNICP Cp0;
1405	RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
1406	size_t const cchNeedle = strlen(pszNeedle);
1407	size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
1408	RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
1409	RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
1410	if ( Cp0Lower == Cp0Upper
1411	&& Cp0Lower == Cp0)
1412	{
1413	/* Cp0 is not a case sensitive char. */
1414	for (;;)
1415	{
1416	RTUNICP Cp;
1417	RTStrGetCpEx(&pszHaystack, &Cp);
1418	if (!Cp)
1419	break;
1420	if ( Cp == Cp0
1421	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1422	return (char *)pszHaystack - cchNeedleCp0;
1423	}
1424	}
1425	else if ( Cp0Lower == Cp0
1426	\|\| Cp0Upper != Cp0)
1427	{
1428	/* Cp0 is case sensitive */
1429	for (;;)
1430	{
1431	RTUNICP Cp;
1432	RTStrGetCpEx(&pszHaystack, &Cp);
1433	if (!Cp)
1434	break;
1435	if ( ( Cp == Cp0Upper
1436	\|\| Cp == Cp0Lower)
1437	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1438	return (char *)pszHaystack - cchNeedleCp0;
1439	}
1440	}
1441	else
1442	{
1443	/* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
1444	for (;;)
1445	{
1446	RTUNICP Cp;
1447	RTStrGetCpEx(&pszHaystack, &Cp);
1448	if (!Cp)
1449	break;
1450	if ( ( Cp == Cp0
1451	\|\| Cp == Cp0Upper
1452	\|\| Cp == Cp0Lower)
1453	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
1454	return (char *)pszHaystack - cchNeedleCp0;
1455	}
1456	}
1457
1458
1459	return NULL;
1460	}
1461
1462
1463	RTDECL(char ) RTStrToLower(char psz)
1464	{
1465	/*
1466	* Loop the code points in the string, converting them one by one.
1467	* ASSUMES that the code points for upper and lower case are encoded
1468	* with the exact same length.
1469	*/
1470	/** @todo Handled bad encodings correctly+quietly, remove assumption,
1471	* optimize. */
1472	char *pszCur = psz;
1473	while (*pszCur)
1474	{
1475	RTUNICP cp = RTStrGetCp(pszCur);
1476	cp = RTUniCpToLower(cp);
1477	pszCur = RTStrPutCp(pszCur, cp);
1478	}
1479	return psz;
1480	}
1481
1482
1483	RTDECL(char ) RTStrToUpper(char psz)
1484	{
1485	/*
1486	* Loop the code points in the string, converting them one by one.
1487	* ASSUMES that the code points for upper and lower case are encoded
1488	* with the exact same length.
1489	*/
1490	/** @todo Handled bad encodings correctly+quietly, remove assumption,
1491	* optimize. */
1492	char *pszCur = psz;
1493	while(*pszCur)
1494	{
1495	RTUNICP cp = RTStrGetCp(pszCur);
1496	cp = RTUniCpToUpper(cp);
1497	pszCur = RTStrPutCp(pszCur, cp);
1498	}
1499	return psz;
1500	}
1501

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 18570

Download in other formats: