utf-8.cpp@ 49329

Last change on this file since 49329 was 48935, checked in by vboxsync, 11 years ago
Runtime: Whitespace and svn:keyword cleanups by scm.
Property svn:eol-style set to `native` Property svn:keywords set to `Id Revision`
File size: 54.5 KB

Line
1	/* $Id: utf-8.cpp 48935 2013-10-07 21:19:37Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2012 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*******************************************************************************
29	* Header Files *
30	*******************************************************************************/
31	#include <iprt/string.h>
32	#include "internal/iprt.h"
33
34	#include <iprt/uni.h>
35	#include <iprt/alloc.h>
36	#include <iprt/assert.h>
37	#include <iprt/err.h>
38	#include "internal/string.h"
39
40
41
42	/**
43	* Get get length in code points of a UTF-8 encoded string.
44	* The string is validated while doing this.
45	*
46	* @returns IPRT status code.
47	* @param psz Pointer to the UTF-8 string.
48	* @param cch The max length of the string. (btw cch = cb)
49	* Use RTSTR_MAX if all of the string is to be examined.
50	* @param pcuc Where to store the length in unicode code points.
51	* @param pcchActual Where to store the actual size of the UTF-8 string
52	* on success (cch = cb again). Optional.
53	*/
54	DECLHIDDEN(int) rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
55	{
56	const unsigned char puch = (const unsigned char )psz;
57	size_t cCodePoints = 0;
58	while (cch > 0)
59	{
60	const unsigned char uch = *puch;
61	if (!uch)
62	break;
63	if (uch & RT_BIT(7))
64	{
65	/* figure sequence length and validate the first byte */
66	/** @todo RT_USE_RTC_3629 */
67	unsigned cb;
68	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
69	cb = 2;
70	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
71	cb = 3;
72	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
73	cb = 4;
74	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
75	cb = 5;
76	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
77	cb = 6;
78	else
79	{
80	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81	return VERR_INVALID_UTF8_ENCODING;
82	}
83
84	/* check length */
85	if (cb > cch)
86	{
87	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88	return VERR_INVALID_UTF8_ENCODING;
89	}
90
91	/* validate the rest */
92	switch (cb)
93	{
94	case 6:
95	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96	case 5:
97	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98	case 4:
99	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100	case 3:
101	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102	case 2:
103	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104	break;
105	}
106
107	/* validate the code point. */
108	RTUNICP uc;
109	switch (cb)
110	{
111	case 6:
112	uc = (puch[5] & 0x3f)
113	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
114	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
115	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
116	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
117	\| ((RTUNICP)(uch & 0x01) << 30);
118	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120	break;
121	case 5:
122	uc = (puch[4] & 0x3f)
123	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
124	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
125	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
126	\| ((RTUNICP)(uch & 0x03) << 24);
127	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129	break;
130	case 4:
131	uc = (puch[3] & 0x3f)
132	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
133	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
134	\| ((RTUNICP)(uch & 0x07) << 18);
135	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137	break;
138	case 3:
139	uc = (puch[2] & 0x3f)
140	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
141	\| ((RTUNICP)(uch & 0x0f) << 12);
142	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
146	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147	break;
148	case 2:
149	uc = (puch[1] & 0x3f)
150	\| ((RTUNICP)(uch & 0x1f) << 6);
151	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153	break;
154	}
155
156	/* advance */
157	cch -= cb;
158	puch += cb;
159	}
160	else
161	{
162	/* one ASCII byte */
163	puch++;
164	cch--;
165	}
166	cCodePoints++;
167	}
168
169	/* done */
170	*pcuc = cCodePoints;
171	if (pcchActual)
172	pcchActual = puch - (unsigned char const )psz;
173	return VINF_SUCCESS;
174	}
175
176
177	/**
178	* Decodes and UTF-8 string into an array of unicode code point.
179	*
180	* Since we know the input is valid, we do not perform encoding or length checks.
181	*
182	* @returns iprt status code.
183	* @param psz The UTF-8 string to recode. This is a valid encoding.
184	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186	* @param paCps Where to store the code points array.
187	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188	*/
189	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190	{
191	int rc = VINF_SUCCESS;
192	const unsigned char puch = (const unsigned char )psz;
193	PRTUNICP pCp = paCps;
194	while (cch > 0)
195	{
196	/* read the next char and check for terminator. */
197	const unsigned char uch = *puch;
198	if (!uch)
199	break;
200
201	/* check for output overflow */
202	if (RT_UNLIKELY(cCps < 1))
203	{
204	rc = VERR_BUFFER_OVERFLOW;
205	break;
206	}
207	cCps--;
208
209	/* decode and recode the code point */
210	if (!(uch & RT_BIT(7)))
211	{
212	*pCp++ = uch;
213	puch++;
214	cch--;
215	}
216	#ifdef RT_STRICT
217	else if (!(uch & RT_BIT(6)))
218	AssertMsgFailed(("Internal error!\n"));
219	#endif
220	else if (!(uch & RT_BIT(5)))
221	{
222	*pCp++ = (puch[1] & 0x3f)
223	\| ((uint16_t)(uch & 0x1f) << 6);
224	puch += 2;
225	cch -= 2;
226	}
227	else if (!(uch & RT_BIT(4)))
228	{
229	*pCp++ = (puch[2] & 0x3f)
230	\| ((uint16_t)(puch[1] & 0x3f) << 6)
231	\| ((uint16_t)(uch & 0x0f) << 12);
232	puch += 3;
233	cch -= 3;
234	}
235	else if (!(uch & RT_BIT(3)))
236	{
237	*pCp++ = (puch[3] & 0x3f)
238	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
239	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
240	\| ((RTUNICP)(uch & 0x07) << 18);
241	puch += 4;
242	cch -= 4;
243	}
244	else if (!(uch & RT_BIT(2)))
245	{
246	*pCp++ = (puch[4] & 0x3f)
247	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
248	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
249	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
250	\| ((RTUNICP)(uch & 0x03) << 24);
251	puch += 5;
252	cch -= 6;
253	}
254	else
255	{
256	Assert(!(uch & RT_BIT(1)));
257	*pCp++ = (puch[5] & 0x3f)
258	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
259	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
260	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
261	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
262	\| ((RTUNICP)(uch & 0x01) << 30);
263	puch += 6;
264	cch -= 6;
265	}
266	}
267
268	/* done */
269	*pCp = 0;
270	return rc;
271	}
272
273
274	RTDECL(size_t) RTStrUniLen(const char *psz)
275	{
276	size_t cCodePoints;
277	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278	return RT_SUCCESS(rc) ? cCodePoints : 0;
279	}
280	RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
284	{
285	size_t cCodePoints;
286	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287	if (pcCps)
288	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289	return rc;
290	}
291	RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294	RTDECL(int) RTStrValidateEncoding(const char *psz)
295	{
296	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297	}
298	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302	{
303	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
304	AssertPtr(psz);
305
306	/*
307	* Use rtUtf8Length for the job.
308	*/
309	size_t cchActual;
310	size_t cCpsIgnored;
311	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
312	if (RT_SUCCESS(rc))
313	{
314	if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
315	&& cchActual >= cch)
316	rc = VERR_BUFFER_OVERFLOW;
317	}
318	return rc;
319	}
320	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
321
322
323	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
324	{
325	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
326	return RT_SUCCESS(rc);
327	}
328	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
329
330
331	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
332	{
333	size_t cErrors = 0;
334	for (;;)
335	{
336	RTUNICP Cp;
337	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
338	if (RT_SUCCESS(rc))
339	{
340	if (!Cp)
341	break;
342	}
343	else
344	{
345	psz[-1] = '?';
346	cErrors++;
347	}
348	}
349	return cErrors;
350	}
351	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
352
353
354	RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidSet, char chReplacement)
355	{
356	size_t cReplacements = 0;
357	AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
358	for (;;)
359	{
360	RTUNICP Cp;
361	PCRTUNICP pCp;
362	char *pszOld = psz;
363	if (RT_FAILURE(RTStrGetCpEx((const char **)&psz, &Cp)))
364	return -1;
365	if (!Cp)
366	break;
367	for (pCp = puszValidSet; *pCp; pCp += 2)
368	{
369	AssertReturn(*(pCp + 1), -1);
370	if (pCp <= Cp && (pCp + 1) >= Cp) /* No, I won't do * and ++. */
371	break;
372	}
373	if (!*pCp)
374	{
375	for (; pszOld != psz; ++pszOld)
376	*pszOld = chReplacement;
377	++cReplacements;
378	}
379	}
380	return cReplacements;
381	}
382	RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
383
384
385	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
386	{
387	/*
388	* Validate input.
389	*/
390	Assert(VALID_PTR(pszString));
391	Assert(VALID_PTR(ppaCps));
392	*ppaCps = NULL;
393
394	/*
395	* Validate the UTF-8 input and count its code points.
396	*/
397	size_t cCps;
398	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
399	if (RT_SUCCESS(rc))
400	{
401	/*
402	* Allocate buffer.
403	*/
404	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
405	if (paCps)
406	{
407	/*
408	* Decode the string.
409	*/
410	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
411	if (RT_SUCCESS(rc))
412	{
413	*ppaCps = paCps;
414	return rc;
415	}
416	RTMemFree(paCps);
417	}
418	else
419	rc = VERR_NO_CODE_POINT_MEMORY;
420	}
421	return rc;
422	}
423	RT_EXPORT_SYMBOL(RTStrToUni);
424
425
426	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
427	{
428	/*
429	* Validate input.
430	*/
431	Assert(VALID_PTR(pszString));
432	Assert(VALID_PTR(ppaCps));
433	Assert(!pcCps \|\| VALID_PTR(pcCps));
434
435	/*
436	* Validate the UTF-8 input and count the code points.
437	*/
438	size_t cCpsResult;
439	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
440	if (RT_SUCCESS(rc))
441	{
442	if (pcCps)
443	*pcCps = cCpsResult;
444
445	/*
446	* Check buffer size / Allocate buffer.
447	*/
448	bool fShouldFree;
449	PRTUNICP paCpsResult;
450	if (cCps > 0 && *ppaCps)
451	{
452	fShouldFree = false;
453	if (cCps <= cCpsResult)
454	return VERR_BUFFER_OVERFLOW;
455	paCpsResult = *ppaCps;
456	}
457	else
458	{
459	*ppaCps = NULL;
460	fShouldFree = true;
461	cCps = RT_MAX(cCpsResult + 1, cCps);
462	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
463	}
464	if (paCpsResult)
465	{
466	/*
467	* Encode the UTF-16 string.
468	*/
469	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
470	if (RT_SUCCESS(rc))
471	{
472	*ppaCps = paCpsResult;
473	return rc;
474	}
475	if (fShouldFree)
476	RTMemFree(paCpsResult);
477	}
478	else
479	rc = VERR_NO_CODE_POINT_MEMORY;
480	}
481	return rc;
482	}
483	RT_EXPORT_SYMBOL(RTStrToUniEx);
484
485
486	/**
487	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
488	*
489	* @returns IPRT status code.
490	* @param psz Pointer to the UTF-8 string.
491	* @param cch The max length of the string. (btw cch = cb)
492	* Use RTSTR_MAX if all of the string is to be examined.
493	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
494	*/
495	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
496	{
497	const unsigned char puch = (const unsigned char )psz;
498	size_t cwc = 0;
499	while (cch > 0)
500	{
501	const unsigned char uch = *puch;
502	if (!uch)
503	break;
504	if (!(uch & RT_BIT(7)))
505	{
506	/* one ASCII byte */
507	cwc++;
508	puch++;
509	cch--;
510	}
511	else
512	{
513	/* figure sequence length and validate the first byte */
514	unsigned cb;
515	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
516	cb = 2;
517	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
518	cb = 3;
519	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
520	cb = 4;
521	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
522	cb = 5;
523	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
524	cb = 6;
525	else
526	{
527	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
528	return VERR_INVALID_UTF8_ENCODING;
529	}
530
531	/* check length */
532	if (cb > cch)
533	{
534	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
535	return VERR_INVALID_UTF8_ENCODING;
536	}
537
538	/* validate the rest */
539	switch (cb)
540	{
541	case 6:
542	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
543	case 5:
544	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
545	case 4:
546	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
547	case 3:
548	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
549	case 2:
550	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
551	break;
552	}
553
554	/* validate the code point. */
555	RTUNICP uc;
556	switch (cb)
557	{
558	case 6:
559	uc = (puch[5] & 0x3f)
560	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
561	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
562	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
563	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
564	\| ((RTUNICP)(uch & 0x01) << 30);
565	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
566	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
567	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
568	return VERR_CANT_RECODE_AS_UTF16;
569	case 5:
570	uc = (puch[4] & 0x3f)
571	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
572	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
573	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
574	\| ((RTUNICP)(uch & 0x03) << 24);
575	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
576	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
577	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
578	return VERR_CANT_RECODE_AS_UTF16;
579	case 4:
580	uc = (puch[3] & 0x3f)
581	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
582	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
583	\| ((RTUNICP)(uch & 0x07) << 18);
584	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
585	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
586	RTStrAssertMsgReturn(uc <= 0x0010ffff,
587	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
588	cwc++;
589	break;
590	case 3:
591	uc = (puch[2] & 0x3f)
592	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
593	\| ((RTUNICP)(uch & 0x0f) << 12);
594	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
595	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
596	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
597	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
598	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
599	break;
600	case 2:
601	uc = (puch[1] & 0x3f)
602	\| ((RTUNICP)(uch & 0x1f) << 6);
603	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
604	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
605	break;
606	}
607
608	/* advance */
609	cch -= cb;
610	puch += cb;
611	cwc++;
612	}
613	}
614
615	/* done */
616	*pcwc = cwc;
617	return VINF_SUCCESS;
618	}
619
620
621	/**
622	* Recodes a valid UTF-8 string as UTF-16.
623	*
624	* Since we know the input is valid, we do not perform encoding or length checks.
625	*
626	* @returns iprt status code.
627	* @param psz The UTF-8 string to recode. This is a valid encoding.
628	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
629	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
630	* @param pwsz Where to store the UTF-16 string.
631	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
632	*/
633	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
634	{
635	int rc = VINF_SUCCESS;
636	const unsigned char puch = (const unsigned char )psz;
637	PRTUTF16 pwc = pwsz;
638	while (cch > 0)
639	{
640	/* read the next char and check for terminator. */
641	const unsigned char uch = *puch;
642	if (!uch)
643	break;
644
645	/* check for output overflow */
646	if (RT_UNLIKELY(cwc < 1))
647	{
648	rc = VERR_BUFFER_OVERFLOW;
649	break;
650	}
651	cwc--;
652
653	/* decode and recode the code point */
654	if (!(uch & RT_BIT(7)))
655	{
656	*pwc++ = uch;
657	puch++;
658	cch--;
659	}
660	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
661	{
662	uint16_t uc = (puch[1] & 0x3f)
663	\| ((uint16_t)(uch & 0x1f) << 6);
664	*pwc++ = uc;
665	puch += 2;
666	cch -= 2;
667	}
668	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
669	{
670	uint16_t uc = (puch[2] & 0x3f)
671	\| ((uint16_t)(puch[1] & 0x3f) << 6)
672	\| ((uint16_t)(uch & 0x0f) << 12);
673	*pwc++ = uc;
674	puch += 3;
675	cch -= 3;
676	}
677	else
678	{
679	/* generate surrogate pair */
680	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
681	RTUNICP uc = (puch[3] & 0x3f)
682	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
683	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
684	\| ((RTUNICP)(uch & 0x07) << 18);
685	if (RT_UNLIKELY(cwc < 1))
686	{
687	rc = VERR_BUFFER_OVERFLOW;
688	break;
689	}
690	cwc--;
691
692	uc -= 0x10000;
693	*pwc++ = 0xd800 \| (uc >> 10);
694	*pwc++ = 0xdc00 \| (uc & 0x3ff);
695	puch += 4;
696	cch -= 4;
697	}
698	}
699
700	/* done */
701	*pwc = '\0';
702	return rc;
703	}
704
705
706	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
707	{
708	/*
709	* Validate input.
710	*/
711	Assert(VALID_PTR(ppwszString));
712	Assert(VALID_PTR(pszString));
713	*ppwszString = NULL;
714
715	/*
716	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
717	*/
718	size_t cwc;
719	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
720	if (RT_SUCCESS(rc))
721	{
722	/*
723	* Allocate buffer.
724	*/
725	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
726	if (pwsz)
727	{
728	/*
729	* Encode the UTF-16 string.
730	*/
731	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
732	if (RT_SUCCESS(rc))
733	{
734	*ppwszString = pwsz;
735	return rc;
736	}
737	RTMemFree(pwsz);
738	}
739	else
740	rc = VERR_NO_UTF16_MEMORY;
741	}
742	return rc;
743	}
744	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
745
746
747	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
748	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
749	{
750	/*
751	* Validate input.
752	*/
753	Assert(VALID_PTR(pszString));
754	Assert(VALID_PTR(ppwsz));
755	Assert(!pcwc \|\| VALID_PTR(pcwc));
756
757	/*
758	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
759	*/
760	size_t cwcResult;
761	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
762	if (RT_SUCCESS(rc))
763	{
764	if (pcwc)
765	*pcwc = cwcResult;
766
767	/*
768	* Check buffer size / Allocate buffer.
769	*/
770	bool fShouldFree;
771	PRTUTF16 pwszResult;
772	if (cwc > 0 && *ppwsz)
773	{
774	fShouldFree = false;
775	if (cwc <= cwcResult)
776	return VERR_BUFFER_OVERFLOW;
777	pwszResult = *ppwsz;
778	}
779	else
780	{
781	*ppwsz = NULL;
782	fShouldFree = true;
783	cwc = RT_MAX(cwcResult + 1, cwc);
784	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
785	}
786	if (pwszResult)
787	{
788	/*
789	* Encode the UTF-16 string.
790	*/
791	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
792	if (RT_SUCCESS(rc))
793	{
794	*ppwsz = pwszResult;
795	return rc;
796	}
797	if (fShouldFree)
798	RTMemFree(pwszResult);
799	}
800	else
801	rc = VERR_NO_UTF16_MEMORY;
802	}
803	return rc;
804	}
805	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
806
807
808	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
809	{
810	size_t cwc;
811	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
812	return RT_SUCCESS(rc) ? cwc : 0;
813	}
814	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
815
816
817	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
818	{
819	size_t cwc;
820	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
821	if (pcwc)
822	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
823	return rc;
824	}
825	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
826
827
828	/**
829	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
830	*
831	* @returns iprt status code.
832	* @param psz The Latin-1 string.
833	* @param cchIn The max length of the Latin-1 string to consider.
834	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
835	*/
836	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
837	{
838	size_t cch = 0;
839	for (;;)
840	{
841	RTUNICP Cp;
842	int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
843	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
844	break;
845	if (RT_FAILURE(rc))
846	return rc;
847	cch += RTStrCpSize(Cp); /* cannot fail */
848	}
849
850	/* done */
851	*pcch = cch;
852	return VINF_SUCCESS;
853	}
854
855
856	/**
857	* Recodes a Latin-1 string as UTF-8.
858	*
859	* @returns iprt status code.
860	* @param psz The Latin-1 string.
861	* @param cchIn The number of characters to process from psz. The recoding
862	* will stop when cch or '\\0' is reached.
863	* @param psz Where to store the UTF-8 string.
864	* @param cch The size of the UTF-8 buffer, excluding the terminator.
865	*/
866	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch)
867	{
868	int rc;
869	for (;;)
870	{
871	RTUNICP Cp;
872	size_t cchCp;
873	rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
874	if (Cp == 0 \|\| RT_FAILURE(rc))
875	break;
876	cchCp = RTStrCpSize(Cp);
877	if (RT_UNLIKELY(cch < cchCp))
878	{
879	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
880	rc = VERR_BUFFER_OVERFLOW;
881	break;
882	}
883	cch -= cchCp;
884	psz = RTStrPutCp(psz, Cp);
885	}
886
887	/* done */
888	if (rc == VERR_END_OF_STRING)
889	rc = VINF_SUCCESS;
890	*psz = '\0';
891	return rc;
892	}
893
894
895
896	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
897	{
898	/*
899	* Validate input.
900	*/
901	Assert(VALID_PTR(ppszString));
902	Assert(VALID_PTR(pszString));
903	*ppszString = NULL;
904
905	/*
906	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
907	*/
908	size_t cch;
909	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
910	if (RT_SUCCESS(rc))
911	{
912	/*
913	* Allocate buffer and recode it.
914	*/
915	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
916	if (pszResult)
917	{
918	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
919	if (RT_SUCCESS(rc))
920	{
921	*ppszString = pszResult;
922	return rc;
923	}
924
925	RTMemFree(pszResult);
926	}
927	else
928	rc = VERR_NO_STR_MEMORY;
929	}
930	return rc;
931	}
932	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
933
934
935	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
936	{
937	/*
938	* Validate input.
939	*/
940	Assert(VALID_PTR(pszString));
941	Assert(VALID_PTR(ppsz));
942	Assert(!pcch \|\| VALID_PTR(pcch));
943
944	/*
945	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
946	*/
947	size_t cchResult;
948	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
949	if (RT_SUCCESS(rc))
950	{
951	if (pcch)
952	*pcch = cchResult;
953
954	/*
955	* Check buffer size / Allocate buffer and recode it.
956	*/
957	bool fShouldFree;
958	char *pszResult;
959	if (cch > 0 && *ppsz)
960	{
961	fShouldFree = false;
962	if (RT_UNLIKELY(cch <= cchResult))
963	return VERR_BUFFER_OVERFLOW;
964	pszResult = *ppsz;
965	}
966	else
967	{
968	*ppsz = NULL;
969	fShouldFree = true;
970	cch = RT_MAX(cch, cchResult + 1);
971	pszResult = (char *)RTStrAllocTag(cch, pszTag);
972	}
973	if (pszResult)
974	{
975	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
976	if (RT_SUCCESS(rc))
977	{
978	*ppsz = pszResult;
979	return rc;
980	}
981
982	if (fShouldFree)
983	RTStrFree(pszResult);
984	}
985	else
986	rc = VERR_NO_STR_MEMORY;
987	}
988	return rc;
989	}
990	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
991
992
993	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
994	{
995	size_t cch;
996	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
997	return RT_SUCCESS(rc) ? cch : 0;
998	}
999	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1000
1001
1002	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
1003	{
1004	size_t cch;
1005	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1006	if (pcch)
1007	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1008	return rc;
1009	}
1010	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1011
1012
1013	/**
1014	* Calculates the Latin-1 length of a string, validating the encoding while
1015	* doing so.
1016	*
1017	* @returns IPRT status code.
1018	* @param psz Pointer to the UTF-8 string.
1019	* @param cchIn The max length of the string. (btw cch = cb)
1020	* Use RTSTR_MAX if all of the string is to be examined.
1021	* @param pcch Where to store the length of the Latin-1 string in bytes.
1022	*/
1023	static int rtUtf8CalcLatin1Length(const char psz, size_t cchIn, size_t pcch)
1024	{
1025	size_t cch = 0;
1026	for (;;)
1027	{
1028	RTUNICP Cp;
1029	size_t cchCp;
1030	int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1031	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1032	break;
1033	if (RT_FAILURE(rc))
1034	return rc;
1035	cchCp = RTLatin1CpSize(Cp);
1036	if (cchCp == 0)
1037	return VERR_NO_TRANSLATION;
1038	cch += cchCp;
1039	}
1040
1041	/* done */
1042	*pcch = cch;
1043	return VINF_SUCCESS;
1044	}
1045
1046
1047	/**
1048	* Recodes a valid UTF-8 string as Latin-1.
1049	*
1050	* Since we know the input is valid, we do not perform encoding or length checks.
1051	*
1052	* @returns iprt status code.
1053	* @param pszIn The UTF-8 string to recode. This is a valid encoding.
1054	* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1055	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1056	* @param psz Where to store the Latin-1 string.
1057	* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1058	*/
1059	static int rtUtf8RecodeAsLatin1(const char pszIn, size_t cchIn, char psz, size_t cch)
1060	{
1061	int rc;
1062	for (;;)
1063	{
1064	RTUNICP Cp;
1065	size_t cchCp;
1066	rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1067	if (Cp == 0 \|\| RT_FAILURE(rc))
1068	break;
1069	cchCp = RTLatin1CpSize(Cp);
1070	if (RT_UNLIKELY(cch < cchCp))
1071	{
1072	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1073	rc = VERR_BUFFER_OVERFLOW;
1074	break;
1075	}
1076	cch -= cchCp;
1077	psz = RTLatin1PutCp(psz, Cp);
1078	}
1079
1080	/* done */
1081	if (rc == VERR_END_OF_STRING)
1082	rc = VINF_SUCCESS;
1083	*psz = '\0';
1084	return rc;
1085	}
1086
1087
1088
1089	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1090	{
1091	/*
1092	* Validate input.
1093	*/
1094	Assert(VALID_PTR(ppszString));
1095	Assert(VALID_PTR(pszString));
1096	*ppszString = NULL;
1097
1098	/*
1099	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1100	*/
1101	size_t cch;
1102	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1103	if (RT_SUCCESS(rc))
1104	{
1105	/*
1106	* Allocate buffer.
1107	*/
1108	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1109	if (psz)
1110	{
1111	/*
1112	* Encode the UTF-16 string.
1113	*/
1114	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1115	if (RT_SUCCESS(rc))
1116	{
1117	*ppszString = psz;
1118	return rc;
1119	}
1120	RTMemFree(psz);
1121	}
1122	else
1123	rc = VERR_NO_STR_MEMORY;
1124	}
1125	return rc;
1126	}
1127	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1128
1129
1130	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1131	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1132	{
1133	/*
1134	* Validate input.
1135	*/
1136	Assert(VALID_PTR(pszString));
1137	Assert(VALID_PTR(ppsz));
1138	Assert(!pcch \|\| VALID_PTR(pcch));
1139
1140	/*
1141	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1142	*/
1143	size_t cchResult;
1144	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1145	if (RT_SUCCESS(rc))
1146	{
1147	if (pcch)
1148	*pcch = cchResult;
1149
1150	/*
1151	* Check buffer size / Allocate buffer.
1152	*/
1153	bool fShouldFree;
1154	char *pszResult;
1155	if (cch > 0 && *ppsz)
1156	{
1157	fShouldFree = false;
1158	if (cch <= cchResult)
1159	return VERR_BUFFER_OVERFLOW;
1160	pszResult = *ppsz;
1161	}
1162	else
1163	{
1164	*ppsz = NULL;
1165	fShouldFree = true;
1166	cch = RT_MAX(cchResult + 1, cch);
1167	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1168	}
1169	if (pszResult)
1170	{
1171	/*
1172	* Encode the Latin-1 string.
1173	*/
1174	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1175	if (RT_SUCCESS(rc))
1176	{
1177	*ppsz = pszResult;
1178	return rc;
1179	}
1180	if (fShouldFree)
1181	RTMemFree(pszResult);
1182	}
1183	else
1184	rc = VERR_NO_STR_MEMORY;
1185	}
1186	return rc;
1187	}
1188	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1189
1190
1191	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1192	{
1193	size_t cch;
1194	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1195	return RT_SUCCESS(rc) ? cch : 0;
1196	}
1197	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1198
1199
1200	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1201	{
1202	size_t cch;
1203	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1204	if (pcch)
1205	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1206	return rc;
1207	}
1208	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1209
1210
1211	/**
1212	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1213	* @returns rc
1214	* @param ppsz The pointer to the string position point.
1215	* @param pCp Where to store RTUNICP_INVALID.
1216	* @param rc The iprt error code.
1217	*/
1218	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1219	{
1220	/*
1221	* Try find a valid encoding.
1222	*/
1223	(ppsz)++; /* @todo code this! */
1224	*pCp = RTUNICP_INVALID;
1225	return rc;
1226	}
1227
1228
1229	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1230	{
1231	RTUNICP Cp;
1232	RTStrGetCpExInternal(&psz, &Cp);
1233	return Cp;
1234	}
1235	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1236
1237
1238	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1239	{
1240	const unsigned char puch = (const unsigned char )*ppsz;
1241	const unsigned char uch = *puch;
1242	RTUNICP uc;
1243
1244	/* ASCII ? */
1245	if (!(uch & RT_BIT(7)))
1246	{
1247	uc = uch;
1248	puch++;
1249	}
1250	else if (uch & RT_BIT(6))
1251	{
1252	/* figure the length and validate the first octet. */
1253	/** @todo RT_USE_RTC_3629 */
1254	unsigned cb;
1255	if (!(uch & RT_BIT(5)))
1256	cb = 2;
1257	else if (!(uch & RT_BIT(4)))
1258	cb = 3;
1259	else if (!(uch & RT_BIT(3)))
1260	cb = 4;
1261	else if (!(uch & RT_BIT(2)))
1262	cb = 5;
1263	else if (!(uch & RT_BIT(1)))
1264	cb = 6;
1265	else
1266	{
1267	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1268	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1269	}
1270
1271	/* validate the rest */
1272	switch (cb)
1273	{
1274	case 6:
1275	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1276	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1277	case 5:
1278	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1279	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1280	case 4:
1281	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1282	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1283	case 3:
1284	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1285	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1286	case 2:
1287	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1288	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1289	break;
1290	}
1291
1292	/* get and validate the code point. */
1293	switch (cb)
1294	{
1295	case 6:
1296	uc = (puch[5] & 0x3f)
1297	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1298	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1299	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1300	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1301	\| ((RTUNICP)(uch & 0x01) << 30);
1302	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1303	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1304	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1305	break;
1306	case 5:
1307	uc = (puch[4] & 0x3f)
1308	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1309	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1310	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1311	\| ((RTUNICP)(uch & 0x03) << 24);
1312	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1313	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1314	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1315	break;
1316	case 4:
1317	uc = (puch[3] & 0x3f)
1318	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1319	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1320	\| ((RTUNICP)(uch & 0x07) << 18);
1321	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1322	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1323	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1324	break;
1325	case 3:
1326	uc = (puch[2] & 0x3f)
1327	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1328	\| ((RTUNICP)(uch & 0x0f) << 12);
1329	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1330	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1331	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1332	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1333	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1334	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1335	break;
1336	case 2:
1337	uc = (puch[1] & 0x3f)
1338	\| ((RTUNICP)(uch & 0x1f) << 6);
1339	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1340	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1341	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1342	break;
1343	default: /* impossible, but GCC is bitching. */
1344	uc = RTUNICP_INVALID;
1345	break;
1346	}
1347	puch += cb;
1348	}
1349	else
1350	{
1351	/* 6th bit is always set. */
1352	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1353	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1354	}
1355	*pCp = uc;
1356	ppsz = (const char )puch;
1357	return VINF_SUCCESS;
1358	}
1359	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1360
1361
1362	/**
1363	* Handle invalid encodings passed to RTStrGetCpNEx().
1364	* @returns rc
1365	* @param ppsz The pointer to the string position point.
1366	* @param pcch Pointer to the string length.
1367	* @param pCp Where to store RTUNICP_INVALID.
1368	* @param rc The iprt error code.
1369	*/
1370	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1371	{
1372	/*
1373	* Try find a valid encoding.
1374	*/
1375	(ppsz)++; /* @todo code this! */
1376	(*pcch)--;
1377	*pCp = RTUNICP_INVALID;
1378	return rc;
1379	}
1380
1381
1382	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1383	{
1384	const unsigned char puch = (const unsigned char )*ppsz;
1385	const unsigned char uch = *puch;
1386	size_t cch = *pcch;
1387	RTUNICP uc;
1388
1389	if (cch == 0)
1390	{
1391	*pCp = RTUNICP_INVALID;
1392	return VERR_END_OF_STRING;
1393	}
1394
1395	/* ASCII ? */
1396	if (!(uch & RT_BIT(7)))
1397	{
1398	uc = uch;
1399	puch++;
1400	cch--;
1401	}
1402	else if (uch & RT_BIT(6))
1403	{
1404	/* figure the length and validate the first octet. */
1405	/** @todo RT_USE_RTC_3629 */
1406	unsigned cb;
1407	if (!(uch & RT_BIT(5)))
1408	cb = 2;
1409	else if (!(uch & RT_BIT(4)))
1410	cb = 3;
1411	else if (!(uch & RT_BIT(3)))
1412	cb = 4;
1413	else if (!(uch & RT_BIT(2)))
1414	cb = 5;
1415	else if (!(uch & RT_BIT(1)))
1416	cb = 6;
1417	else
1418	{
1419	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1420	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1421	}
1422
1423	if (cb > cch)
1424	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1425
1426	/* validate the rest */
1427	switch (cb)
1428	{
1429	case 6:
1430	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1431	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1432	case 5:
1433	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1434	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1435	case 4:
1436	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1437	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1438	case 3:
1439	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1440	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1441	case 2:
1442	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1443	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1444	break;
1445	}
1446
1447	/* get and validate the code point. */
1448	switch (cb)
1449	{
1450	case 6:
1451	uc = (puch[5] & 0x3f)
1452	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1453	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1454	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1455	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1456	\| ((RTUNICP)(uch & 0x01) << 30);
1457	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1458	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1459	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1460	break;
1461	case 5:
1462	uc = (puch[4] & 0x3f)
1463	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1464	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1465	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1466	\| ((RTUNICP)(uch & 0x03) << 24);
1467	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1468	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1469	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1470	break;
1471	case 4:
1472	uc = (puch[3] & 0x3f)
1473	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1474	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1475	\| ((RTUNICP)(uch & 0x07) << 18);
1476	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1477	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1478	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1479	break;
1480	case 3:
1481	uc = (puch[2] & 0x3f)
1482	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1483	\| ((RTUNICP)(uch & 0x0f) << 12);
1484	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1485	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1486	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1487	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1488	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1489	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1490	break;
1491	case 2:
1492	uc = (puch[1] & 0x3f)
1493	\| ((RTUNICP)(uch & 0x1f) << 6);
1494	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1495	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1496	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1497	break;
1498	default: /* impossible, but GCC is bitching. */
1499	uc = RTUNICP_INVALID;
1500	break;
1501	}
1502	puch += cb;
1503	cch -= cb;
1504	}
1505	else
1506	{
1507	/* 6th bit is always set. */
1508	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1509	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1510	}
1511	*pCp = uc;
1512	ppsz = (const char )puch;
1513	(*pcch) = cch;
1514	return VINF_SUCCESS;
1515	}
1516	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1517
1518
1519	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1520	{
1521	unsigned char puch = (unsigned char )psz;
1522	if (uc < 0x80)
1523	*puch++ = (unsigned char )uc;
1524	else if (uc < 0x00000800)
1525	{
1526	*puch++ = 0xc0 \| (uc >> 6);
1527	*puch++ = 0x80 \| (uc & 0x3f);
1528	}
1529	else if (uc < 0x00010000)
1530	{
1531	/** @todo RT_USE_RTC_3629 */
1532	if ( uc < 0x0000d8000
1533	\|\| ( uc > 0x0000dfff
1534	&& uc < 0x0000fffe))
1535	{
1536	*puch++ = 0xe0 \| (uc >> 12);
1537	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1538	*puch++ = 0x80 \| (uc & 0x3f);
1539	}
1540	else
1541	{
1542	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1543	*puch++ = 0x7f;
1544	}
1545	}
1546	/** @todo RT_USE_RTC_3629 */
1547	else if (uc < 0x00200000)
1548	{
1549	*puch++ = 0xf0 \| (uc >> 18);
1550	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1551	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1552	*puch++ = 0x80 \| (uc & 0x3f);
1553	}
1554	else if (uc < 0x04000000)
1555	{
1556	*puch++ = 0xf8 \| (uc >> 24);
1557	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1558	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1559	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1560	*puch++ = 0x80 \| (uc & 0x3f);
1561	}
1562	else if (uc <= 0x7fffffff)
1563	{
1564	*puch++ = 0xfc \| (uc >> 30);
1565	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1566	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1567	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1568	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1569	*puch++ = 0x80 \| (uc & 0x3f);
1570	}
1571	else
1572	{
1573	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1574	*puch++ = 0x7f;
1575	}
1576
1577	return (char *)puch;
1578	}
1579	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1580
1581
1582	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1583	{
1584	if (pszStart < psz)
1585	{
1586	/* simple char? */
1587	const unsigned char puch = (const unsigned char )psz;
1588	unsigned uch = *--puch;
1589	if (!(uch & RT_BIT(7)))
1590	return (char *)puch;
1591	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1592
1593	/* two or more. */
1594	uint32_t uMask = 0xffffffc0;
1595	while ( (const unsigned char *)pszStart < puch
1596	&& !(uMask & 1))
1597	{
1598	uch = *--puch;
1599	if ((uch & 0xc0) != 0x80)
1600	{
1601	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1602	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1603	(char *)pszStart);
1604	return (char *)puch;
1605	}
1606	uMask >>= 1;
1607	}
1608	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1609	}
1610	return (char *)pszStart;
1611	}
1612	RT_EXPORT_SYMBOL(RTStrPrevCp);
1613

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 49329

Download in other formats: