utf-8.cpp@ 64797

Last change on this file since 64797 was 64633, checked in by vboxsync, 8 years ago
utf-8.cpp: Duplicate rtUtf8CalcUtf16Length so we can optimize the common case of RTSTR_MAX. Also rearranged some string terminator checks to try improve our mojo.
Property svn:eol-style set to `native` Property svn:keywords set to `Id Revision`
File size: 64.4 KB

Line
1	/* $Id: utf-8.cpp 64633 2016-11-10 15:03:17Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2016 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*********************************************************************************************************************************
29	* Header Files *
30	*********************************************************************************************************************************/
31	#include <iprt/string.h>
32	#include "internal/iprt.h"
33
34	#include <iprt/uni.h>
35	#include <iprt/alloc.h>
36	#include <iprt/assert.h>
37	#include <iprt/err.h>
38	#include "internal/string.h"
39
40
41
42	/**
43	* Gets the length in code points of a UTF-8 encoded string.
44	* The string is validated while doing this.
	*
	* Scanning stops after cch bytes or at the first zero byte, whichever
	* comes first. Each sequence has its lead byte, its continuation bytes and
	* its decoded value checked; a value below the minimum for its sequence
	* length (an overlong encoding) is rejected.
45	*
46	* @returns IPRT status code.
	* VINF_SUCCESS if everything examined was valid.
	* VERR_INVALID_UTF8_ENCODING for a bad lead byte or a sequence that is
	* longer than the remaining cch; the same status is passed to
	* RTStrAssertMsgReturn for bad continuation bytes and out-of-range values.
	* VERR_CODE_POINT_ENDIAN_INDICATOR is passed for 0xfffe / 0xffff and
	* VERR_CODE_POINT_SURROGATE for 0xd800..0xdfff (3-byte sequences).
47	* @param psz Pointer to the UTF-8 string.
48	* @param cch The max length of the string. (btw cch = cb)
49	* Use RTSTR_MAX if all of the string is to be examined.
50	* @param pcuc Where to store the length in unicode code points.
	* Only written on success.
51	* @param pcchActual Where to store the actual size of the UTF-8 string
52	* on success (cch = cb again). Optional.
53	*/
54	DECLHIDDEN(int) rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
55	{
56	const unsigned char puch = (const unsigned char )psz;
57	size_t cCodePoints = 0;
58	while (cch > 0)
59	{
60	const unsigned char uch = *puch;
	/* A zero byte terminates the string even when cch has not run out. */
61	if (!uch)
62	break;
63	if (uch & RT_BIT(7))
64	{
65	/* Figure the sequence length (2..6 bytes) from the lead byte pattern and reject invalid lead bytes. */
66	/** @todo RT_USE_RTC_3629 */
67	unsigned cb;
68	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
69	cb = 2;
70	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
71	cb = 3;
72	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
73	cb = 4;
74	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
75	cb = 5;
76	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
77	cb = 6;
78	else
79	{
80	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81	return VERR_INVALID_UTF8_ENCODING;
82	}
83
84	/* The whole sequence must fit within the remaining input. */
85	if (cb > cch)
86	{
87	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88	return VERR_INVALID_UTF8_ENCODING;
89	}
90
91	/* Every trailing byte must have the 10xxxxxx form; the cases fall through on purpose, last byte first. */
92	switch (cb)
93	{
94	case 6:
95	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* fall thru */
96	case 5:
97	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* fall thru */
98	case 4:
99	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* fall thru */
100	case 3:
101	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* fall thru */
102	case 2:
103	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104	break;
105	}
106
107	/* Decode the value and check it lies in the range that requires exactly cb bytes. */
108	RTUNICP uc;
109	switch (cb)
110	{
111	case 6:
112	uc = (puch[5] & 0x3f)
113	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
114	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
115	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
116	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
117	\| ((RTUNICP)(uch & 0x01) << 30);
118	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120	break;
121	case 5:
122	uc = (puch[4] & 0x3f)
123	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
124	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
125	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
126	\| ((RTUNICP)(uch & 0x03) << 24);
127	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129	break;
130	case 4:
131	uc = (puch[3] & 0x3f)
132	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
133	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
134	\| ((RTUNICP)(uch & 0x07) << 18);
135	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137	break;
138	case 3:
139	uc = (puch[2] & 0x3f)
140	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
141	\| ((RTUNICP)(uch & 0x0f) << 12);
	/* 0xfffe and 0xffff get their own status code; surrogates are rejected separately below. */
142	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
146	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147	break;
148	case 2:
149	uc = (puch[1] & 0x3f)
150	\| ((RTUNICP)(uch & 0x1f) << 6);
151	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153	break;
154	}
155
156	/* advance past the whole sequence */
157	cch -= cb;
158	puch += cb;
159	}
160	else
161	{
162	/* one ASCII byte */
163	puch++;
164	cch--;
165	}
	/* One code point per sequence, whatever its byte length. */
166	cCodePoints++;
167	}
168
169	/* done - report the code point count and, if requested, the number of bytes consumed */
170	*pcuc = cCodePoints;
171	if (pcchActual)
172	pcchActual = puch - (unsigned char const )psz;
173	return VINF_SUCCESS;
174	}
175
176
177	/**
178	* Decodes and UTF-8 string into an array of unicode code point.
179	*
180	* Since we know the input is valid, we do not perform encoding or length checks.
181	*
182	* @returns iprt status code.
183	* @param psz The UTF-8 string to recode. This is a valid encoding.
184	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186	* @param paCps Where to store the code points array.
187	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188	*/
189	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190	{
191	int rc = VINF_SUCCESS;
192	const unsigned char puch = (const unsigned char )psz;
193	PRTUNICP pCp = paCps;
194	while (cch > 0)
195	{
196	/* read the next char and check for terminator. */
197	const unsigned char uch = *puch;
198	if (uch)
199	{ /* we only break once, so consider this the likely branch. */ }
200	else
201	break;
202
203	/* check for output overflow */
204	if (RT_LIKELY(cCps >= 1))
205	{ /* likely */ }
206	else
207	{
208	rc = VERR_BUFFER_OVERFLOW;
209	break;
210	}
211	cCps--;
212
213	/* decode and recode the code point */
214	if (!(uch & RT_BIT(7)))
215	{
216	*pCp++ = uch;
217	puch++;
218	cch--;
219	}
220	#ifdef RT_STRICT
221	else if (!(uch & RT_BIT(6)))
222	AssertMsgFailed(("Internal error!\n"));
223	#endif
224	else if (!(uch & RT_BIT(5)))
225	{
226	*pCp++ = (puch[1] & 0x3f)
227	\| ((uint16_t)(uch & 0x1f) << 6);
228	puch += 2;
229	cch -= 2;
230	}
231	else if (!(uch & RT_BIT(4)))
232	{
233	*pCp++ = (puch[2] & 0x3f)
234	\| ((uint16_t)(puch[1] & 0x3f) << 6)
235	\| ((uint16_t)(uch & 0x0f) << 12);
236	puch += 3;
237	cch -= 3;
238	}
239	else if (!(uch & RT_BIT(3)))
240	{
241	*pCp++ = (puch[3] & 0x3f)
242	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
243	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
244	\| ((RTUNICP)(uch & 0x07) << 18);
245	puch += 4;
246	cch -= 4;
247	}
248	else if (!(uch & RT_BIT(2)))
249	{
250	*pCp++ = (puch[4] & 0x3f)
251	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
252	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
253	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
254	\| ((RTUNICP)(uch & 0x03) << 24);
255	puch += 5;
256	cch -= 6;
257	}
258	else
259	{
260	Assert(!(uch & RT_BIT(1)));
261	*pCp++ = (puch[5] & 0x3f)
262	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
263	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
264	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
265	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
266	\| ((RTUNICP)(uch & 0x01) << 30);
267	puch += 6;
268	cch -= 6;
269	}
270	}
271
272	/* done */
273	*pCp = 0;
274	return rc;
275	}
276
277
278	RTDECL(size_t) RTStrUniLen(const char *psz)
279	{
280	size_t cCodePoints;
281	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
282	return RT_SUCCESS(rc) ? cCodePoints : 0;
283	}
284	RT_EXPORT_SYMBOL(RTStrUniLen);
285
286
287	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
288	{
289	size_t cCodePoints;
290	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
291	if (pcCps)
292	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
293	return rc;
294	}
295	RT_EXPORT_SYMBOL(RTStrUniLenEx);
296
297
298	RTDECL(int) RTStrValidateEncoding(const char *psz)
299	{
300	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
301	}
302	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
303
304
305	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
306	{
307	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED \| RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
308	VERR_INVALID_PARAMETER);
309	AssertPtr(psz);
310
311	/*
312	* Use rtUtf8Length for the job.
313	*/
314	size_t cchActual;
315	size_t cCpsIgnored;
316	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
317	if (RT_SUCCESS(rc))
318	{
319	if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
320	{
321	if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
322	cchActual++;
323	if (cchActual == cch)
324	rc = VINF_SUCCESS;
325	else if (cchActual < cch)
326	rc = VERR_BUFFER_UNDERFLOW;
327	else
328	rc = VERR_BUFFER_OVERFLOW;
329	}
330	else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
331	&& cchActual >= cch)
332	rc = VERR_BUFFER_OVERFLOW;
333	}
334	return rc;
335	}
336	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
337
338
339	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
340	{
341	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
342	return RT_SUCCESS(rc);
343	}
344	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
345
346
347	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
348	{
349	size_t cErrors = 0;
350	for (;;)
351	{
352	RTUNICP Cp;
353	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
354	if (RT_SUCCESS(rc))
355	{
356	if (!Cp)
357	break;
358	}
359	else
360	{
361	psz[-1] = '?';
362	cErrors++;
363	}
364	}
365	return cErrors;
366	}
367	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
368
369
370	/**
371	* Helper for RTStrPurgeComplementSet.
372	*
373	* @returns true if @a Cp is valid, false if not.
374	* @param Cp The code point to validate.
375	* @param puszValidPairs Pair of valid code point sets.
376	* @param cValidPairs Number of pairs.
377	*/
378	DECLINLINE(bool) rtStrPurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
379	{
380	while (cValidPairs-- > 0)
381	{
382	if ( Cp >= puszValidPairs[0]
383	&& Cp <= puszValidPairs[1])
384	return true;
385	puszValidPairs += 2;
386	}
387	return false;
388	}
389
390
391	RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidPairs, char chReplacement)
392	{
393	AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
394
395	/*
396	* Calc valid pairs and check that we've got an even number.
397	*/
398	uint32_t cValidPairs = 0;
399	while (puszValidPairs[cValidPairs * 2])
400	{
401	AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
402	AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
403	("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
404	cValidPairs++;
405	}
406
407	/*
408	* Do the replacing.
409	*/
410	ssize_t cReplacements = 0;
411	for (;;)
412	{
413	char *pszCur = psz;
414	RTUNICP Cp;
415	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
416	if (RT_SUCCESS(rc))
417	{
418	if (Cp)
419	{
420	if (!rtStrPurgeIsInSet(Cp, puszValidPairs, cValidPairs))
421	{
422	for (; pszCur != psz; ++pszCur)
423	*pszCur = chReplacement;
424	++cReplacements;
425	}
426	}
427	else
428	break;
429	}
430	else
431	return -1;
432	}
433	return cReplacements;
434	}
435	RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
436
437
438	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
439	{
440	/*
441	* Validate input.
442	*/
443	Assert(VALID_PTR(pszString));
444	Assert(VALID_PTR(ppaCps));
445	*ppaCps = NULL;
446
447	/*
448	* Validate the UTF-8 input and count its code points.
449	*/
450	size_t cCps;
451	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
452	if (RT_SUCCESS(rc))
453	{
454	/*
455	* Allocate buffer.
456	*/
457	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
458	if (paCps)
459	{
460	/*
461	* Decode the string.
462	*/
463	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
464	if (RT_SUCCESS(rc))
465	{
466	*ppaCps = paCps;
467	return rc;
468	}
469	RTMemFree(paCps);
470	}
471	else
472	rc = VERR_NO_CODE_POINT_MEMORY;
473	}
474	return rc;
475	}
476	RT_EXPORT_SYMBOL(RTStrToUni);
477
478
479	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
480	{
481	/*
482	* Validate input.
483	*/
484	Assert(VALID_PTR(pszString));
485	Assert(VALID_PTR(ppaCps));
486	Assert(!pcCps \|\| VALID_PTR(pcCps));
487
488	/*
489	* Validate the UTF-8 input and count the code points.
490	*/
491	size_t cCpsResult;
492	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
493	if (RT_SUCCESS(rc))
494	{
495	if (pcCps)
496	*pcCps = cCpsResult;
497
498	/*
499	* Check buffer size / Allocate buffer.
500	*/
501	bool fShouldFree;
502	PRTUNICP paCpsResult;
503	if (cCps > 0 && *ppaCps)
504	{
505	fShouldFree = false;
506	if (cCps <= cCpsResult)
507	return VERR_BUFFER_OVERFLOW;
508	paCpsResult = *ppaCps;
509	}
510	else
511	{
512	*ppaCps = NULL;
513	fShouldFree = true;
514	cCps = RT_MAX(cCpsResult + 1, cCps);
515	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
516	}
517	if (paCpsResult)
518	{
519	/*
520	* Encode the UTF-16 string.
521	*/
522	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
523	if (RT_SUCCESS(rc))
524	{
525	*ppaCps = paCpsResult;
526	return rc;
527	}
528	if (fShouldFree)
529	RTMemFree(paCpsResult);
530	}
531	else
532	rc = VERR_NO_CODE_POINT_MEMORY;
533	}
534	return rc;
535	}
536	RT_EXPORT_SYMBOL(RTStrToUniEx);
537
538
539	/**
540	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
	*
	* This is the length-restricted variant: scanning stops after cch bytes
	* or at the first zero byte. A 4-byte sequence counts as two RTUTF16 units
	* (a surrogate pair), every other accepted sequence as one.
541	*
542	* @returns IPRT status code.
	* VERR_INVALID_UTF8_ENCODING for a bad lead byte or a sequence longer than
	* the remaining cch; VERR_CANT_RECODE_AS_UTF16 for 5- and 6-byte sequences.
	* The remaining checks hand their status to RTStrAssertMsgReturn.
543	* @param psz Pointer to the UTF-8 string.
544	* @param cch The max length of the string. (btw cch = cb)
545	* @param pcwc Where to store the length of the UTF-16 string as a number
546	* of RTUTF16 characters.
547	* @sa rtUtf8CalcUtf16Length
548	*/
549	static int rtUtf8CalcUtf16LengthN(const char psz, size_t cch, size_t pcwc)
550	{
551	const unsigned char puch = (const unsigned char )psz;
552	size_t cwc = 0;
553	while (cch > 0)
554	{
555	const unsigned char uch = *puch;
556	if (!(uch & RT_BIT(7)))
557	{
558	/* one ASCII byte, or the zero terminator which ends the scan */
559	if (uch)
560	{
561	cwc++;
562	puch++;
563	cch--;
564	}
565	else
566	break;
567	}
568	else
569	{
570	/*
571	* Multibyte sequence is more complicated when we have length
572	* restrictions on the input.
573	*/
574	/* figure sequence length and validate the first byte */
575	unsigned cb;
576	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
577	cb = 2;
578	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
579	cb = 3;
580	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
581	cb = 4;
582	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
583	cb = 5;
584	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
585	cb = 6;
586	else
587	{
588	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
589	return VERR_INVALID_UTF8_ENCODING;
590	}
591
592	/* The whole sequence must fit within the remaining input. */
593	if (cb > cch)
594	{
595	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
596	return VERR_INVALID_UTF8_ENCODING;
597	}
598
599	/* Every trailing byte must have the 10xxxxxx form; the cases fall through on purpose. */
600	switch (cb)
601	{
602	case 6:
603	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* fall thru */
604	case 5:
605	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* fall thru */
606	case 4:
607	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* fall thru */
608	case 3:
609	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* fall thru */
610	case 2:
611	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
612	break;
613	}
614
615	/* Decode the value, check its range, and decide whether UTF-16 can represent it. */
616	RTUNICP uc;
617	switch (cb)
618	{
619	case 6:
620	uc = (puch[5] & 0x3f)
621	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
622	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
623	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
624	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
625	\| ((RTUNICP)(uch & 0x01) << 30);
626	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
627	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* Even a well-formed 6-byte value is rejected here. */
628	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
629	return VERR_CANT_RECODE_AS_UTF16;
630	case 5:
631	uc = (puch[4] & 0x3f)
632	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
633	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
634	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
635	\| ((RTUNICP)(uch & 0x03) << 24);
636	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
637	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* Even a well-formed 5-byte value is rejected here. */
638	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
639	return VERR_CANT_RECODE_AS_UTF16;
640	case 4:
641	uc = (puch[3] & 0x3f)
642	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
643	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
644	\| ((RTUNICP)(uch & 0x07) << 18);
645	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
646	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
647	RTStrAssertMsgReturn(uc <= 0x0010ffff,
648	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
	/* Extra unit for the surrogate pair; the common increment below adds the other one. */
649	cwc++;
650	break;
651	case 3:
652	uc = (puch[2] & 0x3f)
653	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
654	\| ((RTUNICP)(uch & 0x0f) << 12);
655	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
656	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
657	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
658	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
659	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
660	break;
661	case 2:
662	uc = (puch[1] & 0x3f)
663	\| ((RTUNICP)(uch & 0x1f) << 6);
664	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
665	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
666	break;
667	}
668
669	/* advance past the sequence and count its (first) UTF-16 unit */
670	cch -= cb;
671	puch += cb;
672	cwc++;
673	}
674	}
675
676	/* done */
677	*pcwc = cwc;
678	return VINF_SUCCESS;
679	}
680
681
682	/**
683	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
	*
	* This is the variant without a length restriction: it scans until the
	* first zero byte. A 4-byte sequence counts as two RTUTF16 units, every
	* other accepted sequence as one.
684	*
685	* @returns IPRT status code.
	* VERR_INVALID_UTF8_ENCODING for a bad lead byte; VERR_CANT_RECODE_AS_UTF16
	* for 5- and 6-byte sequences. The remaining checks hand their status to
	* RTStrAssertMsgReturn.
686	* @param psz Pointer to the UTF-8 string.
687	* @param pcwc Where to store the length of the UTF-16 string as a number
688	* of RTUTF16 characters.
689	* @sa rtUtf8CalcUtf16LengthN
690	*/
691	static int rtUtf8CalcUtf16Length(const char psz, size_t pcwc)
692	{
693	const unsigned char puch = (const unsigned char )psz;
694	size_t cwc = 0;
695	for (;;)
696	{
697	const unsigned char uch = *puch;
698	if (!(uch & RT_BIT(7)))
699	{
700	/* one ASCII byte, or the zero terminator which ends the scan */
701	if (uch)
702	{
703	cwc++;
704	puch++;
705	}
706	else
707	break;
708	}
709	else
710	{
711	/*
712	* Figure sequence length, implicitly validate the first byte.
713	* Then validate the additional bytes.
714	* Finally validate the code point.
715	*/
716	unsigned cb;
717	RTUNICP uc;
	/* NOTE(review): the assertion message arguments below read cb before it is
	   assigned and name a cch that is not declared in this function. That is
	   only harmless if RTStrAssertMsgReturn / RTStrAssertMsgFailed discard
	   their message argument in this build - confirm against their definitions. */
718	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
719	{
720	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
721	uc = (puch[1] & 0x3f)
722	\| ((RTUNICP)(uch & 0x1f) << 6);
723	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
724	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
725	cb = 2;
726	}
727	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
728	{
729	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
730	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
731	uc = (puch[2] & 0x3f)
732	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
733	\| ((RTUNICP)(uch & 0x0f) << 12);
734	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
735	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
736	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
737	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
738	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
739	cb = 3;
740	}
741	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
742	{
743	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
744	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
745	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
746	uc = (puch[3] & 0x3f)
747	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
748	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
749	\| ((RTUNICP)(uch & 0x07) << 18);
750	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
751	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
752	RTStrAssertMsgReturn(uc <= 0x0010ffff,
753	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
	/* Extra unit for the surrogate pair; the common increment below adds the other one. */
754	cwc++;
755	cb = 4;
756	}
757	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
758	{
759	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
760	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
761	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
762	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
763	uc = (puch[4] & 0x3f)
764	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
765	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
766	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
767	\| ((RTUNICP)(uch & 0x03) << 24);
768	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
769	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* Even a well-formed 5-byte value is rejected here. */
770	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
771	return VERR_CANT_RECODE_AS_UTF16;
772	//cb = 5;
773	}
774	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
775	{
776	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
777	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
778	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
779	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
780	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
781	uc = (puch[5] & 0x3f)
782	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
783	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
784	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
785	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
786	\| ((RTUNICP)(uch & 0x01) << 30);
787	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
788	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
	/* Even a well-formed 6-byte value is rejected here. */
789	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
790	return VERR_CANT_RECODE_AS_UTF16;
791	//cb = 6;
792	}
793	else
794	{
795	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
796	return VERR_INVALID_UTF8_ENCODING;
797	}
798
799	/* advance past the sequence and count its (first) UTF-16 unit */
800	puch += cb;
801	cwc++;
802	}
803	}
804
805	/* done */
806	*pcwc = cwc;
807	return VINF_SUCCESS;
808	}
809
810
811
812	/**
813	* Recodes a valid UTF-8 string as UTF-16.
814	*
815	* Since we know the input is valid, we do not perform encoding or length checks.
816	*
817	* @returns iprt status code.
818	* @param psz The UTF-8 string to recode. This is a valid encoding.
819	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
820	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
821	* @param pwsz Where to store the UTF-16 string.
822	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
823	*/
824	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
825	{
826	int rc = VINF_SUCCESS;
827	const unsigned char puch = (const unsigned char )psz;
828	PRTUTF16 pwc = pwsz;
829	while (cch > 0)
830	{
831	/* read the next char and check for terminator. */
832	const unsigned char uch = *puch;
833	if (uch)
834	{ /* we only break once, so consider this the likely branch. */ }
835	else
836	break;
837
838	/* check for output overflow */
839	if (RT_LIKELY(cwc >= 1))
840	{ /* likely */ }
841	else
842	{
843	rc = VERR_BUFFER_OVERFLOW;
844	break;
845	}
846	cwc--;
847
848	/* decode and recode the code point */
849	if (!(uch & RT_BIT(7)))
850	{
851	*pwc++ = uch;
852	puch++;
853	cch--;
854	}
855	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
856	{
857	uint16_t uc = (puch[1] & 0x3f)
858	\| ((uint16_t)(uch & 0x1f) << 6);
859	*pwc++ = uc;
860	puch += 2;
861	cch -= 2;
862	}
863	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
864	{
865	uint16_t uc = (puch[2] & 0x3f)
866	\| ((uint16_t)(puch[1] & 0x3f) << 6)
867	\| ((uint16_t)(uch & 0x0f) << 12);
868	*pwc++ = uc;
869	puch += 3;
870	cch -= 3;
871	}
872	else
873	{
874	/* generate surrogate pair */
875	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
876	RTUNICP uc = (puch[3] & 0x3f)
877	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
878	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
879	\| ((RTUNICP)(uch & 0x07) << 18);
880	if (RT_UNLIKELY(cwc < 1))
881	{
882	rc = VERR_BUFFER_OVERFLOW;
883	break;
884	}
885	cwc--;
886
887	uc -= 0x10000;
888	*pwc++ = 0xd800 \| (uc >> 10);
889	*pwc++ = 0xdc00 \| (uc & 0x3ff);
890	puch += 4;
891	cch -= 4;
892	}
893	}
894
895	/* done */
896	*pwc = '\0';
897	return rc;
898	}
899
900
901	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
902	{
903	/*
904	* Validate input.
905	*/
906	Assert(VALID_PTR(ppwszString));
907	Assert(VALID_PTR(pszString));
908	*ppwszString = NULL;
909
910	/*
911	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
912	*/
913	size_t cwc;
914	int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
915	if (RT_SUCCESS(rc))
916	{
917	/*
918	* Allocate buffer.
919	*/
920	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
921	if (pwsz)
922	{
923	/*
924	* Encode the UTF-16 string.
925	*/
926	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
927	if (RT_SUCCESS(rc))
928	{
929	*ppwszString = pwsz;
930	return rc;
931	}
932	RTMemFree(pwsz);
933	}
934	else
935	rc = VERR_NO_UTF16_MEMORY;
936	}
937	return rc;
938	}
939	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
940
941
942	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
943	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
944	{
945	/*
946	* Validate input.
947	*/
948	Assert(VALID_PTR(pszString));
949	Assert(VALID_PTR(ppwsz));
950	Assert(!pcwc \|\| VALID_PTR(pcwc));
951
952	/*
953	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
954	*/
955	size_t cwcResult;
956	int rc;
957	if (cchString != RTSTR_MAX)
958	rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
959	else
960	rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
961	if (RT_SUCCESS(rc))
962	{
963	if (pcwc)
964	*pcwc = cwcResult;
965
966	/*
967	* Check buffer size / Allocate buffer.
968	*/
969	bool fShouldFree;
970	PRTUTF16 pwszResult;
971	if (cwc > 0 && *ppwsz)
972	{
973	fShouldFree = false;
974	if (cwc <= cwcResult)
975	return VERR_BUFFER_OVERFLOW;
976	pwszResult = *ppwsz;
977	}
978	else
979	{
980	*ppwsz = NULL;
981	fShouldFree = true;
982	cwc = RT_MAX(cwcResult + 1, cwc);
983	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
984	}
985	if (pwszResult)
986	{
987	/*
988	* Encode the UTF-16 string.
989	*/
990	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
991	if (RT_SUCCESS(rc))
992	{
993	*ppwsz = pwszResult;
994	return rc;
995	}
996	if (fShouldFree)
997	RTMemFree(pwszResult);
998	}
999	else
1000	rc = VERR_NO_UTF16_MEMORY;
1001	}
1002	return rc;
1003	}
1004	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
1005
1006
1007	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
1008	{
1009	size_t cwc;
1010	int rc = rtUtf8CalcUtf16Length(psz, &cwc);
1011	return RT_SUCCESS(rc) ? cwc : 0;
1012	}
1013	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
1014
1015
1016	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
1017	{
1018	size_t cwc;
1019	int rc;
1020	if (cch != RTSTR_MAX)
1021	rc = rtUtf8CalcUtf16LengthN(psz, cch, &cwc);
1022	else
1023	rc = rtUtf8CalcUtf16Length(psz, &cwc);
1024	if (pcwc)
1025	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1026	return rc;
1027	}
1028	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
1029
1030
1031	/**
1032	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
1033	*
1034	* @returns iprt status code.
1035	* @param psz The Latin-1 string.
1036	* @param cchIn The max length of the Latin-1 string to consider.
1037	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
1038	*/
1039	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
1040	{
1041	size_t cch = 0;
1042	for (;;)
1043	{
1044	RTUNICP Cp;
1045	int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
1046	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1047	break;
1048	if (RT_FAILURE(rc))
1049	return rc;
1050	cch += RTStrCpSize(Cp); /* cannot fail */
1051	}
1052
1053	/* done */
1054	*pcch = cch;
1055	return VINF_SUCCESS;
1056	}
1057
1058
1059	/**
1060	* Recodes a Latin-1 string as UTF-8.
1061	*
1062	* @returns iprt status code.
1063	* @param pszIn The Latin-1 string.
1064	* @param cchIn The number of characters to process from psz. The recoding
1065	* will stop when cch or '\\0' is reached.
1066	* @param psz Where to store the UTF-8 string.
1067	* @param cch The size of the UTF-8 buffer, excluding the terminator.
1068	*/
1069	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch)
1070	{
1071	int rc;
1072	for (;;)
1073	{
1074	RTUNICP Cp;
1075	size_t cchCp;
1076	rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
1077	if (Cp == 0 \|\| RT_FAILURE(rc))
1078	break;
1079	cchCp = RTStrCpSize(Cp);
1080	if (RT_UNLIKELY(cch < cchCp))
1081	{
1082	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1083	rc = VERR_BUFFER_OVERFLOW;
1084	break;
1085	}
1086	cch -= cchCp;
1087	psz = RTStrPutCp(psz, Cp);
1088	}
1089
1090	/* done */
1091	if (rc == VERR_END_OF_STRING)
1092	rc = VINF_SUCCESS;
1093	*psz = '\0';
1094	return rc;
1095	}
1096
1097
1098
1099	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
1100	{
1101	/*
1102	* Validate input.
1103	*/
1104	Assert(VALID_PTR(ppszString));
1105	Assert(VALID_PTR(pszString));
1106	*ppszString = NULL;
1107
1108	/*
1109	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
1110	*/
1111	size_t cch;
1112	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
1113	if (RT_SUCCESS(rc))
1114	{
1115	/*
1116	* Allocate buffer and recode it.
1117	*/
1118	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
1119	if (pszResult)
1120	{
1121	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
1122	if (RT_SUCCESS(rc))
1123	{
1124	*ppszString = pszResult;
1125	return rc;
1126	}
1127
1128	RTMemFree(pszResult);
1129	}
1130	else
1131	rc = VERR_NO_STR_MEMORY;
1132	}
1133	return rc;
1134	}
1135	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
1136
1137
1138	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
1139	{
1140	/*
1141	* Validate input.
1142	*/
1143	Assert(VALID_PTR(pszString));
1144	Assert(VALID_PTR(ppsz));
1145	Assert(!pcch \|\| VALID_PTR(pcch));
1146
1147	/*
1148	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
1149	*/
1150	size_t cchResult;
1151	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
1152	if (RT_SUCCESS(rc))
1153	{
1154	if (pcch)
1155	*pcch = cchResult;
1156
1157	/*
1158	* Check buffer size / Allocate buffer and recode it.
1159	*/
1160	bool fShouldFree;
1161	char *pszResult;
1162	if (cch > 0 && *ppsz)
1163	{
1164	fShouldFree = false;
1165	if (RT_UNLIKELY(cch <= cchResult))
1166	return VERR_BUFFER_OVERFLOW;
1167	pszResult = *ppsz;
1168	}
1169	else
1170	{
1171	*ppsz = NULL;
1172	fShouldFree = true;
1173	cch = RT_MAX(cch, cchResult + 1);
1174	pszResult = (char *)RTStrAllocTag(cch, pszTag);
1175	}
1176	if (pszResult)
1177	{
1178	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
1179	if (RT_SUCCESS(rc))
1180	{
1181	*ppsz = pszResult;
1182	return rc;
1183	}
1184
1185	if (fShouldFree)
1186	RTStrFree(pszResult);
1187	}
1188	else
1189	rc = VERR_NO_STR_MEMORY;
1190	}
1191	return rc;
1192	}
1193	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1194
1195
1196	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1197	{
1198	size_t cch;
1199	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1200	return RT_SUCCESS(rc) ? cch : 0;
1201	}
1202	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1203
1204
1205	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
1206	{
1207	size_t cch;
1208	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1209	if (pcch)
1210	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1211	return rc;
1212	}
1213	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1214
1215
1216	/**
1217	* Calculates the Latin-1 length of a string, validating the encoding while
1218	* doing so.
1219	*
1220	* @returns IPRT status code.
1221	* @param psz Pointer to the UTF-8 string.
1222	* @param cchIn The max length of the string. (btw cch = cb)
1223	* Use RTSTR_MAX if all of the string is to be examined.
1224	* @param pcch Where to store the length of the Latin-1 string in bytes.
1225	*/
1226	static int rtUtf8CalcLatin1Length(const char psz, size_t cchIn, size_t pcch)
1227	{
1228	size_t cch = 0;
1229	for (;;)
1230	{
1231	RTUNICP Cp;
1232	size_t cchCp;
1233	int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1234	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1235	break;
1236	if (RT_FAILURE(rc))
1237	return rc;
1238	cchCp = RTLatin1CpSize(Cp);
1239	if (cchCp == 0)
1240	return VERR_NO_TRANSLATION;
1241	cch += cchCp;
1242	}
1243
1244	/* done */
1245	*pcch = cch;
1246	return VINF_SUCCESS;
1247	}
1248
1249
1250	/**
1251	* Recodes a valid UTF-8 string as Latin-1.
1252	*
1253	* Since we know the input is valid, we do not perform encoding or length checks.
1254	*
1255	* @returns iprt status code.
1256	* @param pszIn The UTF-8 string to recode. This is a valid encoding.
1257	* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1258	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1259	* @param psz Where to store the Latin-1 string.
1260	* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1261	*/
1262	static int rtUtf8RecodeAsLatin1(const char pszIn, size_t cchIn, char psz, size_t cch)
1263	{
1264	int rc;
1265	for (;;)
1266	{
1267	RTUNICP Cp;
1268	size_t cchCp;
1269	rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1270	if (Cp == 0 \|\| RT_FAILURE(rc))
1271	break;
1272	cchCp = RTLatin1CpSize(Cp);
1273	if (RT_UNLIKELY(cch < cchCp))
1274	{
1275	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1276	rc = VERR_BUFFER_OVERFLOW;
1277	break;
1278	}
1279	cch -= cchCp;
1280	psz = RTLatin1PutCp(psz, Cp);
1281	}
1282
1283	/* done */
1284	if (rc == VERR_END_OF_STRING)
1285	rc = VINF_SUCCESS;
1286	*psz = '\0';
1287	return rc;
1288	}
1289
1290
1291
1292	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1293	{
1294	/*
1295	* Validate input.
1296	*/
1297	Assert(VALID_PTR(ppszString));
1298	Assert(VALID_PTR(pszString));
1299	*ppszString = NULL;
1300
1301	/*
1302	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1303	*/
1304	size_t cch;
1305	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1306	if (RT_SUCCESS(rc))
1307	{
1308	/*
1309	* Allocate buffer.
1310	*/
1311	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1312	if (psz)
1313	{
1314	/*
1315	* Encode the UTF-16 string.
1316	*/
1317	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1318	if (RT_SUCCESS(rc))
1319	{
1320	*ppszString = psz;
1321	return rc;
1322	}
1323	RTMemFree(psz);
1324	}
1325	else
1326	rc = VERR_NO_STR_MEMORY;
1327	}
1328	return rc;
1329	}
1330	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1331
1332
1333	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1334	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1335	{
1336	/*
1337	* Validate input.
1338	*/
1339	Assert(VALID_PTR(pszString));
1340	Assert(VALID_PTR(ppsz));
1341	Assert(!pcch \|\| VALID_PTR(pcch));
1342
1343	/*
1344	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1345	*/
1346	size_t cchResult;
1347	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1348	if (RT_SUCCESS(rc))
1349	{
1350	if (pcch)
1351	*pcch = cchResult;
1352
1353	/*
1354	* Check buffer size / Allocate buffer.
1355	*/
1356	bool fShouldFree;
1357	char *pszResult;
1358	if (cch > 0 && *ppsz)
1359	{
1360	fShouldFree = false;
1361	if (cch <= cchResult)
1362	return VERR_BUFFER_OVERFLOW;
1363	pszResult = *ppsz;
1364	}
1365	else
1366	{
1367	*ppsz = NULL;
1368	fShouldFree = true;
1369	cch = RT_MAX(cchResult + 1, cch);
1370	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1371	}
1372	if (pszResult)
1373	{
1374	/*
1375	* Encode the Latin-1 string.
1376	*/
1377	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1378	if (RT_SUCCESS(rc))
1379	{
1380	*ppsz = pszResult;
1381	return rc;
1382	}
1383	if (fShouldFree)
1384	RTMemFree(pszResult);
1385	}
1386	else
1387	rc = VERR_NO_STR_MEMORY;
1388	}
1389	return rc;
1390	}
1391	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1392
1393
1394	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1395	{
1396	size_t cch;
1397	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1398	return RT_SUCCESS(rc) ? cch : 0;
1399	}
1400	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1401
1402
1403	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1404	{
1405	size_t cch;
1406	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1407	if (pcch)
1408	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1409	return rc;
1410	}
1411	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1412
1413
1414	/**
1415	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1416	* @returns rc
1417	* @param ppsz The pointer to the string position point.
1418	* @param pCp Where to store RTUNICP_INVALID.
1419	* @param rc The iprt error code.
1420	*/
1421	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1422	{
1423	/*
1424	* Try find a valid encoding.
1425	*/
1426	(ppsz)++; /* @todo code this! */
1427	*pCp = RTUNICP_INVALID;
1428	return rc;
1429	}
1430
1431
1432	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1433	{
1434	RTUNICP Cp;
1435	RTStrGetCpExInternal(&psz, &Cp);
1436	return Cp;
1437	}
1438	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1439
1440
1441	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1442	{
1443	const unsigned char puch = (const unsigned char )*ppsz;
1444	const unsigned char uch = *puch;
1445	RTUNICP uc;
1446
1447	/* ASCII ? */
1448	if (!(uch & RT_BIT(7)))
1449	{
1450	uc = uch;
1451	puch++;
1452	}
1453	else if (uch & RT_BIT(6))
1454	{
1455	/* figure the length and validate the first octet. */
1456	/** @todo RT_USE_RTC_3629 */
1457	unsigned cb;
1458	if (!(uch & RT_BIT(5)))
1459	cb = 2;
1460	else if (!(uch & RT_BIT(4)))
1461	cb = 3;
1462	else if (!(uch & RT_BIT(3)))
1463	cb = 4;
1464	else if (!(uch & RT_BIT(2)))
1465	cb = 5;
1466	else if (!(uch & RT_BIT(1)))
1467	cb = 6;
1468	else
1469	{
1470	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1471	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1472	}
1473
1474	/* validate the rest */
1475	switch (cb)
1476	{
1477	case 6:
1478	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1479	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1480	case 5:
1481	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1482	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1483	case 4:
1484	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1485	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1486	case 3:
1487	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1488	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1489	case 2:
1490	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1491	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1492	break;
1493	}
1494
1495	/* get and validate the code point. */
1496	switch (cb)
1497	{
1498	case 6:
1499	uc = (puch[5] & 0x3f)
1500	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1501	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1502	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1503	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1504	\| ((RTUNICP)(uch & 0x01) << 30);
1505	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1506	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1507	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1508	break;
1509	case 5:
1510	uc = (puch[4] & 0x3f)
1511	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1512	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1513	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1514	\| ((RTUNICP)(uch & 0x03) << 24);
1515	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1516	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1517	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1518	break;
1519	case 4:
1520	uc = (puch[3] & 0x3f)
1521	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1522	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1523	\| ((RTUNICP)(uch & 0x07) << 18);
1524	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1525	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1526	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1527	break;
1528	case 3:
1529	uc = (puch[2] & 0x3f)
1530	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1531	\| ((RTUNICP)(uch & 0x0f) << 12);
1532	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1533	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1534	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1535	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1536	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1537	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1538	break;
1539	case 2:
1540	uc = (puch[1] & 0x3f)
1541	\| ((RTUNICP)(uch & 0x1f) << 6);
1542	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1543	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1544	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1545	break;
1546	default: /* impossible, but GCC is bitching. */
1547	uc = RTUNICP_INVALID;
1548	break;
1549	}
1550	puch += cb;
1551	}
1552	else
1553	{
1554	/* 6th bit is always set. */
1555	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1556	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1557	}
1558	*pCp = uc;
1559	ppsz = (const char )puch;
1560	return VINF_SUCCESS;
1561	}
1562	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1563
1564
1565	/**
1566	* Handle invalid encodings passed to RTStrGetCpNEx().
1567	* @returns rc
1568	* @param ppsz The pointer to the string position point.
1569	* @param pcch Pointer to the string length.
1570	* @param pCp Where to store RTUNICP_INVALID.
1571	* @param rc The iprt error code.
1572	*/
1573	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1574	{
1575	/*
1576	* Try find a valid encoding.
1577	*/
1578	(ppsz)++; /* @todo code this! */
1579	(*pcch)--;
1580	*pCp = RTUNICP_INVALID;
1581	return rc;
1582	}
1583
1584
1585	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1586	{
1587	const unsigned char puch = (const unsigned char )*ppsz;
1588	const unsigned char uch = *puch;
1589	size_t cch = *pcch;
1590	RTUNICP uc;
1591
1592	if (cch == 0)
1593	{
1594	*pCp = RTUNICP_INVALID;
1595	return VERR_END_OF_STRING;
1596	}
1597
1598	/* ASCII ? */
1599	if (!(uch & RT_BIT(7)))
1600	{
1601	uc = uch;
1602	puch++;
1603	cch--;
1604	}
1605	else if (uch & RT_BIT(6))
1606	{
1607	/* figure the length and validate the first octet. */
1608	/** @todo RT_USE_RTC_3629 */
1609	unsigned cb;
1610	if (!(uch & RT_BIT(5)))
1611	cb = 2;
1612	else if (!(uch & RT_BIT(4)))
1613	cb = 3;
1614	else if (!(uch & RT_BIT(3)))
1615	cb = 4;
1616	else if (!(uch & RT_BIT(2)))
1617	cb = 5;
1618	else if (!(uch & RT_BIT(1)))
1619	cb = 6;
1620	else
1621	{
1622	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1623	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1624	}
1625
1626	if (cb > cch)
1627	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1628
1629	/* validate the rest */
1630	switch (cb)
1631	{
1632	case 6:
1633	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1634	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1635	case 5:
1636	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1637	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1638	case 4:
1639	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1640	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1641	case 3:
1642	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1643	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1644	case 2:
1645	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1646	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1647	break;
1648	}
1649
1650	/* get and validate the code point. */
1651	switch (cb)
1652	{
1653	case 6:
1654	uc = (puch[5] & 0x3f)
1655	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1656	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1657	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1658	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1659	\| ((RTUNICP)(uch & 0x01) << 30);
1660	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1661	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1662	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1663	break;
1664	case 5:
1665	uc = (puch[4] & 0x3f)
1666	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1667	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1668	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1669	\| ((RTUNICP)(uch & 0x03) << 24);
1670	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1671	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1672	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1673	break;
1674	case 4:
1675	uc = (puch[3] & 0x3f)
1676	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1677	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1678	\| ((RTUNICP)(uch & 0x07) << 18);
1679	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1680	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1681	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1682	break;
1683	case 3:
1684	uc = (puch[2] & 0x3f)
1685	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1686	\| ((RTUNICP)(uch & 0x0f) << 12);
1687	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1688	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1689	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1690	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1691	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1692	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1693	break;
1694	case 2:
1695	uc = (puch[1] & 0x3f)
1696	\| ((RTUNICP)(uch & 0x1f) << 6);
1697	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1698	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1699	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1700	break;
1701	default: /* impossible, but GCC is bitching. */
1702	uc = RTUNICP_INVALID;
1703	break;
1704	}
1705	puch += cb;
1706	cch -= cb;
1707	}
1708	else
1709	{
1710	/* 6th bit is always set. */
1711	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1712	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1713	}
1714	*pCp = uc;
1715	ppsz = (const char )puch;
1716	(*pcch) = cch;
1717	return VINF_SUCCESS;
1718	}
1719	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1720
1721
1722	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1723	{
1724	unsigned char puch = (unsigned char )psz;
1725	if (uc < 0x80)
1726	*puch++ = (unsigned char )uc;
1727	else if (uc < 0x00000800)
1728	{
1729	*puch++ = 0xc0 \| (uc >> 6);
1730	*puch++ = 0x80 \| (uc & 0x3f);
1731	}
1732	else if (uc < 0x00010000)
1733	{
1734	/** @todo RT_USE_RTC_3629 */
1735	if ( uc < 0x0000d8000
1736	\|\| ( uc > 0x0000dfff
1737	&& uc < 0x0000fffe))
1738	{
1739	*puch++ = 0xe0 \| (uc >> 12);
1740	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1741	*puch++ = 0x80 \| (uc & 0x3f);
1742	}
1743	else
1744	{
1745	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1746	*puch++ = 0x7f;
1747	}
1748	}
1749	/** @todo RT_USE_RTC_3629 */
1750	else if (uc < 0x00200000)
1751	{
1752	*puch++ = 0xf0 \| (uc >> 18);
1753	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1754	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1755	*puch++ = 0x80 \| (uc & 0x3f);
1756	}
1757	else if (uc < 0x04000000)
1758	{
1759	*puch++ = 0xf8 \| (uc >> 24);
1760	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1761	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1762	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1763	*puch++ = 0x80 \| (uc & 0x3f);
1764	}
1765	else if (uc <= 0x7fffffff)
1766	{
1767	*puch++ = 0xfc \| (uc >> 30);
1768	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1769	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1770	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1771	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1772	*puch++ = 0x80 \| (uc & 0x3f);
1773	}
1774	else
1775	{
1776	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1777	*puch++ = 0x7f;
1778	}
1779
1780	return (char *)puch;
1781	}
1782	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1783
1784
1785	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1786	{
1787	if (pszStart < psz)
1788	{
1789	/* simple char? */
1790	const unsigned char puch = (const unsigned char )psz;
1791	unsigned uch = *--puch;
1792	if (!(uch & RT_BIT(7)))
1793	return (char *)puch;
1794	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1795
1796	/* two or more. */
1797	uint32_t uMask = 0xffffffc0;
1798	while ( (const unsigned char *)pszStart < puch
1799	&& !(uMask & 1))
1800	{
1801	uch = *--puch;
1802	if ((uch & 0xc0) != 0x80)
1803	{
1804	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1805	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1806	(char *)pszStart);
1807	return (char *)puch;
1808	}
1809	uMask >>= 1;
1810	}
1811	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1812	}
1813	return (char *)pszStart;
1814	}
1815	RT_EXPORT_SYMBOL(RTStrPrevCp);
1816

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 64797

Download in other formats: