utf-8.cpp@ 72778

Last change on this file since 72778 was 70156, checked in by vboxsync, 7 years ago
Runtime: fix Utf-8 for Linux R0.
Property svn:eol-style set to `native` Property svn:keywords set to `Id Revision`
File size: 70.7 KB

Line
1	/* $Id: utf-8.cpp 70156 2017-12-15 15:52:10Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2017 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*********************************************************************************************************************************
29	* Header Files *
30	*********************************************************************************************************************************/
31	#include <iprt/string.h>
32	#include "internal/iprt.h"
33
34	#include <iprt/uni.h>
35	#include <iprt/asm.h>
36	#include <iprt/alloc.h>
37	#include <iprt/assert.h>
38	#include <iprt/err.h>
39	#include "internal/string.h"
40
41
42
43	/**
44	* Get get length in code points of a UTF-8 encoded string.
45	* The string is validated while doing this.
46	*
47	* @returns IPRT status code.
48	* @param psz Pointer to the UTF-8 string.
49	* @param cch The max length of the string. (btw cch = cb)
50	* Use RTSTR_MAX if all of the string is to be examined.
51	* @param pcuc Where to store the length in unicode code points.
52	* @param pcchActual Where to store the actual size of the UTF-8 string
53	* on success (cch = cb again). Optional.
54	*/
55	DECLHIDDEN(int) rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
56	{
57	const unsigned char puch = (const unsigned char )psz;
58	size_t cCodePoints = 0;
59	while (cch > 0)
60	{
61	const unsigned char uch = *puch;
62	if (!uch)
63	break;
64	if (uch & RT_BIT(7))
65	{
66	/* figure sequence length and validate the first byte */
67	/** @todo RT_USE_RTC_3629 */
68	unsigned cb;
69	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
70	cb = 2;
71	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
72	cb = 3;
73	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
74	cb = 4;
75	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
76	cb = 5;
77	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
78	cb = 6;
79	else
80	{
81	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
82	return VERR_INVALID_UTF8_ENCODING;
83	}
84
85	/* check length */
86	if (cb > cch)
87	{
88	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
89	return VERR_INVALID_UTF8_ENCODING;
90	}
91
92	/* validate the rest */
93	switch (cb)
94	{
95	case 6:
96	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97	RT_FALL_THRU();
98	case 5:
99	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100	RT_FALL_THRU();
101	case 4:
102	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103	RT_FALL_THRU();
104	case 3:
105	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
106	RT_FALL_THRU();
107	case 2:
108	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
109	break;
110	}
111
112	/* validate the code point. */
113	RTUNICP uc;
114	switch (cb)
115	{
116	case 6:
117	uc = (puch[5] & 0x3f)
118	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
119	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
120	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
121	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
122	\| ((RTUNICP)(uch & 0x01) << 30);
123	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
124	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
125	break;
126	case 5:
127	uc = (puch[4] & 0x3f)
128	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
129	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
130	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
131	\| ((RTUNICP)(uch & 0x03) << 24);
132	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
133	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
134	break;
135	case 4:
136	uc = (puch[3] & 0x3f)
137	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
138	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
139	\| ((RTUNICP)(uch & 0x07) << 18);
140	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
141	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
142	break;
143	case 3:
144	uc = (puch[2] & 0x3f)
145	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
146	\| ((RTUNICP)(uch & 0x0f) << 12);
147	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
148	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
149	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
150	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
151	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
152	break;
153	case 2:
154	uc = (puch[1] & 0x3f)
155	\| ((RTUNICP)(uch & 0x1f) << 6);
156	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
157	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
158	break;
159	}
160
161	/* advance */
162	cch -= cb;
163	puch += cb;
164	}
165	else
166	{
167	/* one ASCII byte */
168	puch++;
169	cch--;
170	}
171	cCodePoints++;
172	}
173
174	/* done */
175	*pcuc = cCodePoints;
176	if (pcchActual)
177	pcchActual = puch - (unsigned char const )psz;
178	return VINF_SUCCESS;
179	}
180
181
182	/**
183	* Decodes and UTF-8 string into an array of unicode code point.
184	*
185	* Since we know the input is valid, we do not perform encoding or length checks.
186	*
187	* @returns iprt status code.
188	* @param psz The UTF-8 string to recode. This is a valid encoding.
189	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
190	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
191	* @param paCps Where to store the code points array.
192	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
193	*/
194	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
195	{
196	int rc = VINF_SUCCESS;
197	const unsigned char puch = (const unsigned char )psz;
198	PRTUNICP pCp = paCps;
199	while (cch > 0)
200	{
201	/* read the next char and check for terminator. */
202	const unsigned char uch = *puch;
203	if (uch)
204	{ /* we only break once, so consider this the likely branch. */ }
205	else
206	break;
207
208	/* check for output overflow */
209	if (RT_LIKELY(cCps >= 1))
210	{ /* likely */ }
211	else
212	{
213	rc = VERR_BUFFER_OVERFLOW;
214	break;
215	}
216	cCps--;
217
218	/* decode and recode the code point */
219	if (!(uch & RT_BIT(7)))
220	{
221	*pCp++ = uch;
222	puch++;
223	cch--;
224	}
225	#ifdef RT_STRICT
226	else if (!(uch & RT_BIT(6)))
227	AssertMsgFailed(("Internal error!\n"));
228	#endif
229	else if (!(uch & RT_BIT(5)))
230	{
231	*pCp++ = (puch[1] & 0x3f)
232	\| ((uint16_t)(uch & 0x1f) << 6);
233	puch += 2;
234	cch -= 2;
235	}
236	else if (!(uch & RT_BIT(4)))
237	{
238	*pCp++ = (puch[2] & 0x3f)
239	\| ((uint16_t)(puch[1] & 0x3f) << 6)
240	\| ((uint16_t)(uch & 0x0f) << 12);
241	puch += 3;
242	cch -= 3;
243	}
244	else if (!(uch & RT_BIT(3)))
245	{
246	*pCp++ = (puch[3] & 0x3f)
247	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
248	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
249	\| ((RTUNICP)(uch & 0x07) << 18);
250	puch += 4;
251	cch -= 4;
252	}
253	else if (!(uch & RT_BIT(2)))
254	{
255	*pCp++ = (puch[4] & 0x3f)
256	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
257	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
258	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
259	\| ((RTUNICP)(uch & 0x03) << 24);
260	puch += 5;
261	cch -= 6;
262	}
263	else
264	{
265	Assert(!(uch & RT_BIT(1)));
266	*pCp++ = (puch[5] & 0x3f)
267	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
268	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
269	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
270	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
271	\| ((RTUNICP)(uch & 0x01) << 30);
272	puch += 6;
273	cch -= 6;
274	}
275	}
276
277	/* done */
278	*pCp = 0;
279	return rc;
280	}
281
282
283	RTDECL(size_t) RTStrUniLen(const char *psz)
284	{
285	size_t cCodePoints;
286	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
287	return RT_SUCCESS(rc) ? cCodePoints : 0;
288	}
289	RT_EXPORT_SYMBOL(RTStrUniLen);
290
291
292	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
293	{
294	size_t cCodePoints;
295	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
296	if (pcCps)
297	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
298	return rc;
299	}
300	RT_EXPORT_SYMBOL(RTStrUniLenEx);
301
302
303	RTDECL(int) RTStrValidateEncoding(const char *psz)
304	{
305	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
306	}
307	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
308
309
310	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
311	{
312	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED \| RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
313	VERR_INVALID_PARAMETER);
314	AssertPtr(psz);
315
316	/*
317	* Use rtUtf8Length for the job.
318	*/
319	size_t cchActual;
320	size_t cCpsIgnored;
321	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
322	if (RT_SUCCESS(rc))
323	{
324	if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
325	{
326	if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
327	cchActual++;
328	if (cchActual == cch)
329	rc = VINF_SUCCESS;
330	else if (cchActual < cch)
331	rc = VERR_BUFFER_UNDERFLOW;
332	else
333	rc = VERR_BUFFER_OVERFLOW;
334	}
335	else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
336	&& cchActual >= cch)
337	rc = VERR_BUFFER_OVERFLOW;
338	}
339	return rc;
340	}
341	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
342
343
344	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
345	{
346	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
347	return RT_SUCCESS(rc);
348	}
349	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
350
351
352	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
353	{
354	size_t cErrors = 0;
355	for (;;)
356	{
357	RTUNICP Cp;
358	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
359	if (RT_SUCCESS(rc))
360	{
361	if (!Cp)
362	break;
363	}
364	else
365	{
366	psz[-1] = '?';
367	cErrors++;
368	}
369	}
370	return cErrors;
371	}
372	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
373
374
375	/**
376	* Helper for RTStrPurgeComplementSet.
377	*
378	* @returns true if @a Cp is valid, false if not.
379	* @param Cp The code point to validate.
380	* @param puszValidPairs Pair of valid code point sets.
381	* @param cValidPairs Number of pairs.
382	*/
383	DECLINLINE(bool) rtStrPurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
384	{
385	while (cValidPairs-- > 0)
386	{
387	if ( Cp >= puszValidPairs[0]
388	&& Cp <= puszValidPairs[1])
389	return true;
390	puszValidPairs += 2;
391	}
392	return false;
393	}
394
395
396	RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidPairs, char chReplacement)
397	{
398	AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
399
400	/*
401	* Calc valid pairs and check that we've got an even number.
402	*/
403	uint32_t cValidPairs = 0;
404	while (puszValidPairs[cValidPairs * 2])
405	{
406	AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
407	AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
408	("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
409	cValidPairs++;
410	}
411
412	/*
413	* Do the replacing.
414	*/
415	ssize_t cReplacements = 0;
416	for (;;)
417	{
418	char *pszCur = psz;
419	RTUNICP Cp;
420	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
421	if (RT_SUCCESS(rc))
422	{
423	if (Cp)
424	{
425	if (!rtStrPurgeIsInSet(Cp, puszValidPairs, cValidPairs))
426	{
427	for (; pszCur != psz; ++pszCur)
428	*pszCur = chReplacement;
429	++cReplacements;
430	}
431	}
432	else
433	break;
434	}
435	else
436	return -1;
437	}
438	return cReplacements;
439	}
440	RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
441
442
443	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
444	{
445	/*
446	* Validate input.
447	*/
448	Assert(VALID_PTR(pszString));
449	Assert(VALID_PTR(ppaCps));
450	*ppaCps = NULL;
451
452	/*
453	* Validate the UTF-8 input and count its code points.
454	*/
455	size_t cCps;
456	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
457	if (RT_SUCCESS(rc))
458	{
459	/*
460	* Allocate buffer.
461	*/
462	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
463	if (paCps)
464	{
465	/*
466	* Decode the string.
467	*/
468	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
469	if (RT_SUCCESS(rc))
470	{
471	*ppaCps = paCps;
472	return rc;
473	}
474	RTMemFree(paCps);
475	}
476	else
477	rc = VERR_NO_CODE_POINT_MEMORY;
478	}
479	return rc;
480	}
481	RT_EXPORT_SYMBOL(RTStrToUni);
482
483
484	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
485	{
486	/*
487	* Validate input.
488	*/
489	Assert(VALID_PTR(pszString));
490	Assert(VALID_PTR(ppaCps));
491	Assert(!pcCps \|\| VALID_PTR(pcCps));
492
493	/*
494	* Validate the UTF-8 input and count the code points.
495	*/
496	size_t cCpsResult;
497	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
498	if (RT_SUCCESS(rc))
499	{
500	if (pcCps)
501	*pcCps = cCpsResult;
502
503	/*
504	* Check buffer size / Allocate buffer.
505	*/
506	bool fShouldFree;
507	PRTUNICP paCpsResult;
508	if (cCps > 0 && *ppaCps)
509	{
510	fShouldFree = false;
511	if (cCps <= cCpsResult)
512	return VERR_BUFFER_OVERFLOW;
513	paCpsResult = *ppaCps;
514	}
515	else
516	{
517	*ppaCps = NULL;
518	fShouldFree = true;
519	cCps = RT_MAX(cCpsResult + 1, cCps);
520	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
521	}
522	if (paCpsResult)
523	{
524	/*
525	* Encode the UTF-16 string.
526	*/
527	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
528	if (RT_SUCCESS(rc))
529	{
530	*ppaCps = paCpsResult;
531	return rc;
532	}
533	if (fShouldFree)
534	RTMemFree(paCpsResult);
535	}
536	else
537	rc = VERR_NO_CODE_POINT_MEMORY;
538	}
539	return rc;
540	}
541	RT_EXPORT_SYMBOL(RTStrToUniEx);
542
543
544	/**
545	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
546	*
547	* @returns IPRT status code.
548	* @param psz Pointer to the UTF-8 string.
549	* @param cch The max length of the string. (btw cch = cb)
550	* @param pcwc Where to store the length of the UTF-16 string as a number
551	* of RTUTF16 characters.
552	* @sa rtUtf8CalcUtf16Length
553	*/
554	static int rtUtf8CalcUtf16LengthN(const char psz, size_t cch, size_t pcwc)
555	{
556	const unsigned char puch = (const unsigned char )psz;
557	size_t cwc = 0;
558	while (cch > 0)
559	{
560	const unsigned char uch = *puch;
561	if (!(uch & RT_BIT(7)))
562	{
563	/* one ASCII byte */
564	if (uch)
565	{
566	cwc++;
567	puch++;
568	cch--;
569	}
570	else
571	break;
572	}
573	else
574	{
575	/*
576	* Multibyte sequence is more complicated when we have length
577	* restrictions on the input.
578	*/
579	/* figure sequence length and validate the first byte */
580	unsigned cb;
581	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
582	cb = 2;
583	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
584	cb = 3;
585	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
586	cb = 4;
587	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
588	cb = 5;
589	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
590	cb = 6;
591	else
592	{
593	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
594	return VERR_INVALID_UTF8_ENCODING;
595	}
596
597	/* check length */
598	if (cb > cch)
599	{
600	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
601	return VERR_INVALID_UTF8_ENCODING;
602	}
603
604	/* validate the rest */
605	switch (cb)
606	{
607	case 6:
608	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
609	RT_FALL_THRU();
610	case 5:
611	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
612	RT_FALL_THRU();
613	case 4:
614	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
615	RT_FALL_THRU();
616	case 3:
617	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
618	RT_FALL_THRU();
619	case 2:
620	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
621	break;
622	}
623
624	/* validate the code point. */
625	RTUNICP uc;
626	switch (cb)
627	{
628	case 6:
629	uc = (puch[5] & 0x3f)
630	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
631	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
632	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
633	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
634	\| ((RTUNICP)(uch & 0x01) << 30);
635	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
636	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
637	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
638	return VERR_CANT_RECODE_AS_UTF16;
639	case 5:
640	uc = (puch[4] & 0x3f)
641	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
642	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
643	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
644	\| ((RTUNICP)(uch & 0x03) << 24);
645	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
646	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
647	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
648	return VERR_CANT_RECODE_AS_UTF16;
649	case 4:
650	uc = (puch[3] & 0x3f)
651	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
652	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
653	\| ((RTUNICP)(uch & 0x07) << 18);
654	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
655	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
656	RTStrAssertMsgReturn(uc <= 0x0010ffff,
657	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
658	cwc++;
659	break;
660	case 3:
661	uc = (puch[2] & 0x3f)
662	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
663	\| ((RTUNICP)(uch & 0x0f) << 12);
664	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
665	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
666	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
667	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
668	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
669	break;
670	case 2:
671	uc = (puch[1] & 0x3f)
672	\| ((RTUNICP)(uch & 0x1f) << 6);
673	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
674	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
675	break;
676	}
677
678	/* advance */
679	cch -= cb;
680	puch += cb;
681	cwc++;
682	}
683	}
684
685	/* done */
686	*pcwc = cwc;
687	return VINF_SUCCESS;
688	}
689
690
691	/**
692	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
693	*
694	* @returns IPRT status code.
695	* @param psz Pointer to the UTF-8 string.
696	* @param pcwc Where to store the length of the UTF-16 string as a number
697	* of RTUTF16 characters.
698	* @sa rtUtf8CalcUtf16LengthN
699	*/
700	static int rtUtf8CalcUtf16Length(const char psz, size_t pcwc)
701	{
702	const unsigned char puch = (const unsigned char )psz;
703	size_t cwc = 0;
704	for (;;)
705	{
706	const unsigned char uch = *puch;
707	if (!(uch & RT_BIT(7)))
708	{
709	/* one ASCII byte */
710	if (uch)
711	{
712	cwc++;
713	puch++;
714	}
715	else
716	break;
717	}
718	else
719	{
720	/*
721	* Figure sequence length, implicitly validate the first byte.
722	* Then validate the additional bytes.
723	* Finally validate the code point.
724	*/
725	unsigned cb;
726	RTUNICP uc;
727	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
728	{
729	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
730	uc = (puch[1] & 0x3f)
731	\| ((RTUNICP)(uch & 0x1f) << 6);
732	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
733	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
734	cb = 2;
735	}
736	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
737	{
738	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
739	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
740	uc = (puch[2] & 0x3f)
741	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
742	\| ((RTUNICP)(uch & 0x0f) << 12);
743	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
744	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
745	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
746	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
747	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
748	cb = 3;
749	}
750	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
751	{
752	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
753	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
754	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
755	uc = (puch[3] & 0x3f)
756	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
757	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
758	\| ((RTUNICP)(uch & 0x07) << 18);
759	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
760	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
761	RTStrAssertMsgReturn(uc <= 0x0010ffff,
762	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
763	cwc++;
764	cb = 4;
765	}
766	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
767	{
768	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
769	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
770	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
771	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
772	uc = (puch[4] & 0x3f)
773	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
774	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
775	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
776	\| ((RTUNICP)(uch & 0x03) << 24);
777	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
778	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
779	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
780	return VERR_CANT_RECODE_AS_UTF16;
781	//cb = 5;
782	}
783	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
784	{
785	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
786	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
787	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
788	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
789	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
790	uc = (puch[5] & 0x3f)
791	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
792	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
793	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
794	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
795	\| ((RTUNICP)(uch & 0x01) << 30);
796	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
797	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
798	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
799	return VERR_CANT_RECODE_AS_UTF16;
800	//cb = 6;
801	}
802	else
803	{
804	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
805	return VERR_INVALID_UTF8_ENCODING;
806	}
807
808	/* advance */
809	puch += cb;
810	cwc++;
811	}
812	}
813
814	/* done */
815	*pcwc = cwc;
816	return VINF_SUCCESS;
817	}
818
819
820
821	/**
822	* Recodes a valid UTF-8 string as UTF-16.
823	*
824	* Since we know the input is valid, we do not perform encoding or length checks.
825	*
826	* @returns iprt status code.
827	* @param psz The UTF-8 string to recode. This is a valid encoding.
828	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
829	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
830	* @param pwsz Where to store the UTF-16 string.
831	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
832	*
833	* @note rtUtf8RecodeAsUtf16Big is a duplicate with RT_H2BE_U16 applied.
834	*/
835	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
836	{
837	int rc = VINF_SUCCESS;
838	const unsigned char puch = (const unsigned char )psz;
839	PRTUTF16 pwc = pwsz;
840	while (cch > 0)
841	{
842	/* read the next char and check for terminator. */
843	const unsigned char uch = *puch;
844	if (uch)
845	{ /* we only break once, so consider this the likely branch. */ }
846	else
847	break;
848
849	/* check for output overflow */
850	if (RT_LIKELY(cwc >= 1))
851	{ /* likely */ }
852	else
853	{
854	rc = VERR_BUFFER_OVERFLOW;
855	break;
856	}
857	cwc--;
858
859	/* decode and recode the code point */
860	if (!(uch & RT_BIT(7)))
861	{
862	*pwc++ = uch;
863	puch++;
864	cch--;
865	}
866	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
867	{
868	uint16_t uc = (puch[1] & 0x3f)
869	\| ((uint16_t)(uch & 0x1f) << 6);
870	*pwc++ = uc;
871	puch += 2;
872	cch -= 2;
873	}
874	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
875	{
876	uint16_t uc = (puch[2] & 0x3f)
877	\| ((uint16_t)(puch[1] & 0x3f) << 6)
878	\| ((uint16_t)(uch & 0x0f) << 12);
879	*pwc++ = uc;
880	puch += 3;
881	cch -= 3;
882	}
883	else
884	{
885	/* generate surrogate pair */
886	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
887	RTUNICP uc = (puch[3] & 0x3f)
888	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
889	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
890	\| ((RTUNICP)(uch & 0x07) << 18);
891	if (RT_UNLIKELY(cwc < 1))
892	{
893	rc = VERR_BUFFER_OVERFLOW;
894	break;
895	}
896	cwc--;
897
898	uc -= 0x10000;
899	*pwc++ = 0xd800 \| (uc >> 10);
900	*pwc++ = 0xdc00 \| (uc & 0x3ff);
901	puch += 4;
902	cch -= 4;
903	}
904	}
905
906	/* done */
907	*pwc = '\0';
908	return rc;
909	}
910
911
912	/**
913	* Recodes a valid UTF-8 string as UTF-16BE.
914	*
915	* Since we know the input is valid, we do not perform encoding or length checks.
916	*
917	* @returns iprt status code.
918	* @param psz The UTF-8 string to recode. This is a valid encoding.
919	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
920	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
921	* @param pwsz Where to store the UTF-16BE string.
922	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
923	*
924	* @note This is a copy of rtUtf8RecodeAsUtf16 with RT_H2BE_U16 applied.
925	*/
926	static int rtUtf8RecodeAsUtf16Big(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
927	{
928	int rc = VINF_SUCCESS;
929	const unsigned char puch = (const unsigned char )psz;
930	PRTUTF16 pwc = pwsz;
931	while (cch > 0)
932	{
933	/* read the next char and check for terminator. */
934	const unsigned char uch = *puch;
935	if (uch)
936	{ /* we only break once, so consider this the likely branch. */ }
937	else
938	break;
939
940	/* check for output overflow */
941	if (RT_LIKELY(cwc >= 1))
942	{ /* likely */ }
943	else
944	{
945	rc = VERR_BUFFER_OVERFLOW;
946	break;
947	}
948	cwc--;
949
950	/* decode and recode the code point */
951	if (!(uch & RT_BIT(7)))
952	{
953	*pwc++ = RT_H2BE_U16((RTUTF16)uch);
954	puch++;
955	cch--;
956	}
957	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
958	{
959	uint16_t uc = (puch[1] & 0x3f)
960	\| ((uint16_t)(uch & 0x1f) << 6);
961	*pwc++ = RT_H2BE_U16(uc);
962	puch += 2;
963	cch -= 2;
964	}
965	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
966	{
967	uint16_t uc = (puch[2] & 0x3f)
968	\| ((uint16_t)(puch[1] & 0x3f) << 6)
969	\| ((uint16_t)(uch & 0x0f) << 12);
970	*pwc++ = RT_H2BE_U16(uc);
971	puch += 3;
972	cch -= 3;
973	}
974	else
975	{
976	/* generate surrogate pair */
977	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
978	RTUNICP uc = (puch[3] & 0x3f)
979	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
980	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
981	\| ((RTUNICP)(uch & 0x07) << 18);
982	if (RT_UNLIKELY(cwc < 1))
983	{
984	rc = VERR_BUFFER_OVERFLOW;
985	break;
986	}
987	cwc--;
988
989	uc -= 0x10000;
990	*pwc++ = RT_H2BE_U16(0xd800 \| (uc >> 10));
991	*pwc++ = RT_H2BE_U16(0xdc00 \| (uc & 0x3ff));
992	puch += 4;
993	cch -= 4;
994	}
995	}
996
997	/* done */
998	*pwc = '\0';
999	return rc;
1000	}
1001
1002
1003	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
1004	{
1005	/*
1006	* Validate input.
1007	*/
1008	Assert(VALID_PTR(ppwszString));
1009	Assert(VALID_PTR(pszString));
1010	*ppwszString = NULL;
1011
1012	/*
1013	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1014	*/
1015	size_t cwc;
1016	int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1017	if (RT_SUCCESS(rc))
1018	{
1019	/*
1020	* Allocate buffer.
1021	*/
1022	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1023	if (pwsz)
1024	{
1025	/*
1026	* Encode the UTF-16 string.
1027	*/
1028	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1029	if (RT_SUCCESS(rc))
1030	{
1031	*ppwszString = pwsz;
1032	return rc;
1033	}
1034	RTMemFree(pwsz);
1035	}
1036	else
1037	rc = VERR_NO_UTF16_MEMORY;
1038	}
1039	return rc;
1040	}
1041	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
1042
1043
1044	RTDECL(int) RTStrToUtf16BigTag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
1045	{
1046	/*
1047	* Validate input.
1048	*/
1049	Assert(VALID_PTR(ppwszString));
1050	Assert(VALID_PTR(pszString));
1051	*ppwszString = NULL;
1052
1053	/*
1054	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1055	*/
1056	size_t cwc;
1057	int rc = rtUtf8CalcUtf16Length(pszString, &cwc);
1058	if (RT_SUCCESS(rc))
1059	{
1060	/*
1061	* Allocate buffer.
1062	*/
1063	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1064	if (pwsz)
1065	{
1066	/*
1067	* Encode the UTF-16 string.
1068	*/
1069	rc = rtUtf8RecodeAsUtf16Big(pszString, RTSTR_MAX, pwsz, cwc);
1070	if (RT_SUCCESS(rc))
1071	{
1072	*ppwszString = pwsz;
1073	return rc;
1074	}
1075	RTMemFree(pwsz);
1076	}
1077	else
1078	rc = VERR_NO_UTF16_MEMORY;
1079	}
1080	return rc;
1081	}
1082	RT_EXPORT_SYMBOL(RTStrToUtf16BigTag);
1083
1084
1085	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
1086	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
1087	{
1088	/*
1089	* Validate input.
1090	*/
1091	Assert(VALID_PTR(pszString));
1092	Assert(VALID_PTR(ppwsz));
1093	Assert(!pcwc \|\| VALID_PTR(pcwc));
1094
1095	/*
1096	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1097	*/
1098	size_t cwcResult;
1099	int rc;
1100	if (cchString != RTSTR_MAX)
1101	rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1102	else
1103	rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1104	if (RT_SUCCESS(rc))
1105	{
1106	if (pcwc)
1107	*pcwc = cwcResult;
1108
1109	/*
1110	* Check buffer size / Allocate buffer.
1111	*/
1112	bool fShouldFree;
1113	PRTUTF16 pwszResult;
1114	if (cwc > 0 && *ppwsz)
1115	{
1116	fShouldFree = false;
1117	if (cwc <= cwcResult)
1118	return VERR_BUFFER_OVERFLOW;
1119	pwszResult = *ppwsz;
1120	}
1121	else
1122	{
1123	*ppwsz = NULL;
1124	fShouldFree = true;
1125	cwc = RT_MAX(cwcResult + 1, cwc);
1126	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1127	}
1128	if (pwszResult)
1129	{
1130	/*
1131	* Encode the UTF-16 string.
1132	*/
1133	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1134	if (RT_SUCCESS(rc))
1135	{
1136	*ppwsz = pwszResult;
1137	return rc;
1138	}
1139	if (fShouldFree)
1140	RTMemFree(pwszResult);
1141	}
1142	else
1143	rc = VERR_NO_UTF16_MEMORY;
1144	}
1145	return rc;
1146	}
1147	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
1148
1149
1150	RTDECL(int) RTStrToUtf16BigExTag(const char *pszString, size_t cchString,
1151	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
1152	{
1153	/*
1154	* Validate input.
1155	*/
1156	Assert(VALID_PTR(pszString));
1157	Assert(VALID_PTR(ppwsz));
1158	Assert(!pcwc \|\| VALID_PTR(pcwc));
1159
1160	/*
1161	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1162	*/
1163	size_t cwcResult;
1164	int rc;
1165	if (cchString != RTSTR_MAX)
1166	rc = rtUtf8CalcUtf16LengthN(pszString, cchString, &cwcResult);
1167	else
1168	rc = rtUtf8CalcUtf16Length(pszString, &cwcResult);
1169	if (RT_SUCCESS(rc))
1170	{
1171	if (pcwc)
1172	*pcwc = cwcResult;
1173
1174	/*
1175	* Check buffer size / Allocate buffer.
1176	*/
1177	bool fShouldFree;
1178	PRTUTF16 pwszResult;
1179	if (cwc > 0 && *ppwsz)
1180	{
1181	fShouldFree = false;
1182	if (cwc <= cwcResult)
1183	return VERR_BUFFER_OVERFLOW;
1184	pwszResult = *ppwsz;
1185	}
1186	else
1187	{
1188	*ppwsz = NULL;
1189	fShouldFree = true;
1190	cwc = RT_MAX(cwcResult + 1, cwc);
1191	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1192	}
1193	if (pwszResult)
1194	{
1195	/*
1196	* Encode the UTF-16BE string.
1197	*/
1198	rc = rtUtf8RecodeAsUtf16Big(pszString, cchString, pwszResult, cwc - 1);
1199	if (RT_SUCCESS(rc))
1200	{
1201	*ppwsz = pwszResult;
1202	return rc;
1203	}
1204	if (fShouldFree)
1205	RTMemFree(pwszResult);
1206	}
1207	else
1208	rc = VERR_NO_UTF16_MEMORY;
1209	}
1210	return rc;
1211	}
1212	RT_EXPORT_SYMBOL(RTStrToUtf16BigExTag);
1213
1214
1215	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
1216	{
1217	size_t cwc;
1218	int rc = rtUtf8CalcUtf16Length(psz, &cwc);
1219	return RT_SUCCESS(rc) ? cwc : 0;
1220	}
1221	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
1222
1223
1224	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
1225	{
1226	size_t cwc;
1227	int rc;
1228	if (cch != RTSTR_MAX)
1229	rc = rtUtf8CalcUtf16LengthN(psz, cch, &cwc);
1230	else
1231	rc = rtUtf8CalcUtf16Length(psz, &cwc);
1232	if (pcwc)
1233	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1234	return rc;
1235	}
1236	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
1237
1238
1239	/**
1240	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
1241	*
1242	* @returns iprt status code.
1243	* @param psz The Latin-1 string.
1244	* @param cchIn The max length of the Latin-1 string to consider.
1245	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
1246	*/
1247	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
1248	{
1249	size_t cch = 0;
1250	for (;;)
1251	{
1252	RTUNICP Cp;
1253	int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
1254	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1255	break;
1256	if (RT_FAILURE(rc))
1257	return rc;
1258	cch += RTStrCpSize(Cp); /* cannot fail */
1259	}
1260
1261	/* done */
1262	*pcch = cch;
1263	return VINF_SUCCESS;
1264	}
1265
1266
1267	/**
1268	* Recodes a Latin-1 string as UTF-8.
1269	*
1270	* @returns iprt status code.
1271	* @param pszIn The Latin-1 string.
1272	* @param cchIn The number of characters to process from psz. The recoding
1273	* will stop when cch or '\\0' is reached.
1274	* @param psz Where to store the UTF-8 string.
1275	* @param cch The size of the UTF-8 buffer, excluding the terminator.
1276	*/
1277	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch)
1278	{
1279	int rc;
1280	for (;;)
1281	{
1282	RTUNICP Cp;
1283	size_t cchCp;
1284	rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
1285	if (Cp == 0 \|\| RT_FAILURE(rc))
1286	break;
1287	cchCp = RTStrCpSize(Cp);
1288	if (RT_UNLIKELY(cch < cchCp))
1289	{
1290	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1291	rc = VERR_BUFFER_OVERFLOW;
1292	break;
1293	}
1294	cch -= cchCp;
1295	psz = RTStrPutCp(psz, Cp);
1296	}
1297
1298	/* done */
1299	if (rc == VERR_END_OF_STRING)
1300	rc = VINF_SUCCESS;
1301	*psz = '\0';
1302	return rc;
1303	}
1304
1305
1306
1307	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
1308	{
1309	/*
1310	* Validate input.
1311	*/
1312	Assert(VALID_PTR(ppszString));
1313	Assert(VALID_PTR(pszString));
1314	*ppszString = NULL;
1315
1316	/*
1317	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
1318	*/
1319	size_t cch;
1320	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
1321	if (RT_SUCCESS(rc))
1322	{
1323	/*
1324	* Allocate buffer and recode it.
1325	*/
1326	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
1327	if (pszResult)
1328	{
1329	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
1330	if (RT_SUCCESS(rc))
1331	{
1332	*ppszString = pszResult;
1333	return rc;
1334	}
1335
1336	RTMemFree(pszResult);
1337	}
1338	else
1339	rc = VERR_NO_STR_MEMORY;
1340	}
1341	return rc;
1342	}
1343	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
1344
1345
1346	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
1347	{
1348	/*
1349	* Validate input.
1350	*/
1351	Assert(VALID_PTR(pszString));
1352	Assert(VALID_PTR(ppsz));
1353	Assert(!pcch \|\| VALID_PTR(pcch));
1354
1355	/*
1356	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
1357	*/
1358	size_t cchResult;
1359	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
1360	if (RT_SUCCESS(rc))
1361	{
1362	if (pcch)
1363	*pcch = cchResult;
1364
1365	/*
1366	* Check buffer size / Allocate buffer and recode it.
1367	*/
1368	bool fShouldFree;
1369	char *pszResult;
1370	if (cch > 0 && *ppsz)
1371	{
1372	fShouldFree = false;
1373	if (RT_UNLIKELY(cch <= cchResult))
1374	return VERR_BUFFER_OVERFLOW;
1375	pszResult = *ppsz;
1376	}
1377	else
1378	{
1379	*ppsz = NULL;
1380	fShouldFree = true;
1381	cch = RT_MAX(cch, cchResult + 1);
1382	pszResult = (char *)RTStrAllocTag(cch, pszTag);
1383	}
1384	if (pszResult)
1385	{
1386	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
1387	if (RT_SUCCESS(rc))
1388	{
1389	*ppsz = pszResult;
1390	return rc;
1391	}
1392
1393	if (fShouldFree)
1394	RTStrFree(pszResult);
1395	}
1396	else
1397	rc = VERR_NO_STR_MEMORY;
1398	}
1399	return rc;
1400	}
1401	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
1402
1403
1404	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
1405	{
1406	size_t cch;
1407	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
1408	return RT_SUCCESS(rc) ? cch : 0;
1409	}
1410	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
1411
1412
1413	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
1414	{
1415	size_t cch;
1416	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1417	if (pcch)
1418	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1419	return rc;
1420	}
1421	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1422
1423
1424	/**
1425	* Calculates the Latin-1 length of a string, validating the encoding while
1426	* doing so.
1427	*
1428	* @returns IPRT status code.
1429	* @param psz Pointer to the UTF-8 string.
1430	* @param cchIn The max length of the string. (btw cch = cb)
1431	* Use RTSTR_MAX if all of the string is to be examined.
1432	* @param pcch Where to store the length of the Latin-1 string in bytes.
1433	*/
1434	static int rtUtf8CalcLatin1Length(const char psz, size_t cchIn, size_t pcch)
1435	{
1436	size_t cch = 0;
1437	for (;;)
1438	{
1439	RTUNICP Cp;
1440	size_t cchCp;
1441	int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1442	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1443	break;
1444	if (RT_FAILURE(rc))
1445	return rc;
1446	cchCp = RTLatin1CpSize(Cp);
1447	if (cchCp == 0)
1448	return VERR_NO_TRANSLATION;
1449	cch += cchCp;
1450	}
1451
1452	/* done */
1453	*pcch = cch;
1454	return VINF_SUCCESS;
1455	}
1456
1457
1458	/**
1459	* Recodes a valid UTF-8 string as Latin-1.
1460	*
1461	* Since we know the input is valid, we do not perform encoding or length checks.
1462	*
1463	* @returns iprt status code.
1464	* @param pszIn The UTF-8 string to recode. This is a valid encoding.
1465	* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1466	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1467	* @param psz Where to store the Latin-1 string.
1468	* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1469	*/
1470	static int rtUtf8RecodeAsLatin1(const char pszIn, size_t cchIn, char psz, size_t cch)
1471	{
1472	int rc;
1473	for (;;)
1474	{
1475	RTUNICP Cp;
1476	size_t cchCp;
1477	rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1478	if (Cp == 0 \|\| RT_FAILURE(rc))
1479	break;
1480	cchCp = RTLatin1CpSize(Cp);
1481	if (RT_UNLIKELY(cch < cchCp))
1482	{
1483	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1484	rc = VERR_BUFFER_OVERFLOW;
1485	break;
1486	}
1487	cch -= cchCp;
1488	psz = RTLatin1PutCp(psz, Cp);
1489	}
1490
1491	/* done */
1492	if (rc == VERR_END_OF_STRING)
1493	rc = VINF_SUCCESS;
1494	*psz = '\0';
1495	return rc;
1496	}
1497
1498
1499
1500	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1501	{
1502	/*
1503	* Validate input.
1504	*/
1505	Assert(VALID_PTR(ppszString));
1506	Assert(VALID_PTR(pszString));
1507	*ppszString = NULL;
1508
1509	/*
1510	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1511	*/
1512	size_t cch;
1513	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1514	if (RT_SUCCESS(rc))
1515	{
1516	/*
1517	* Allocate buffer.
1518	*/
1519	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1520	if (psz)
1521	{
1522	/*
1523	* Encode the UTF-16 string.
1524	*/
1525	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1526	if (RT_SUCCESS(rc))
1527	{
1528	*ppszString = psz;
1529	return rc;
1530	}
1531	RTMemFree(psz);
1532	}
1533	else
1534	rc = VERR_NO_STR_MEMORY;
1535	}
1536	return rc;
1537	}
1538	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1539
1540
1541	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1542	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1543	{
1544	/*
1545	* Validate input.
1546	*/
1547	Assert(VALID_PTR(pszString));
1548	Assert(VALID_PTR(ppsz));
1549	Assert(!pcch \|\| VALID_PTR(pcch));
1550
1551	/*
1552	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1553	*/
1554	size_t cchResult;
1555	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1556	if (RT_SUCCESS(rc))
1557	{
1558	if (pcch)
1559	*pcch = cchResult;
1560
1561	/*
1562	* Check buffer size / Allocate buffer.
1563	*/
1564	bool fShouldFree;
1565	char *pszResult;
1566	if (cch > 0 && *ppsz)
1567	{
1568	fShouldFree = false;
1569	if (cch <= cchResult)
1570	return VERR_BUFFER_OVERFLOW;
1571	pszResult = *ppsz;
1572	}
1573	else
1574	{
1575	*ppsz = NULL;
1576	fShouldFree = true;
1577	cch = RT_MAX(cchResult + 1, cch);
1578	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1579	}
1580	if (pszResult)
1581	{
1582	/*
1583	* Encode the Latin-1 string.
1584	*/
1585	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1586	if (RT_SUCCESS(rc))
1587	{
1588	*ppsz = pszResult;
1589	return rc;
1590	}
1591	if (fShouldFree)
1592	RTMemFree(pszResult);
1593	}
1594	else
1595	rc = VERR_NO_STR_MEMORY;
1596	}
1597	return rc;
1598	}
1599	RT_EXPORT_SYMBOL(RTStrToLatin1ExTag);
1600
1601
1602	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1603	{
1604	size_t cch;
1605	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1606	return RT_SUCCESS(rc) ? cch : 0;
1607	}
1608	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1609
1610
1611	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1612	{
1613	size_t cch;
1614	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1615	if (pcch)
1616	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1617	return rc;
1618	}
1619	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1620
1621
1622	/**
1623	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1624	* @returns rc
1625	* @param ppsz The pointer to the string position point.
1626	* @param pCp Where to store RTUNICP_INVALID.
1627	* @param rc The iprt error code.
1628	*/
1629	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1630	{
1631	/*
1632	* Try find a valid encoding.
1633	*/
1634	(ppsz)++; /* @todo code this! */
1635	*pCp = RTUNICP_INVALID;
1636	return rc;
1637	}
1638
1639
1640	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1641	{
1642	RTUNICP Cp;
1643	RTStrGetCpExInternal(&psz, &Cp);
1644	return Cp;
1645	}
1646	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1647
1648
1649	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1650	{
1651	const unsigned char puch = (const unsigned char )*ppsz;
1652	const unsigned char uch = *puch;
1653	RTUNICP uc;
1654
1655	/* ASCII ? */
1656	if (!(uch & RT_BIT(7)))
1657	{
1658	uc = uch;
1659	puch++;
1660	}
1661	else if (uch & RT_BIT(6))
1662	{
1663	/* figure the length and validate the first octet. */
1664	/** @todo RT_USE_RTC_3629 */
1665	unsigned cb;
1666	if (!(uch & RT_BIT(5)))
1667	cb = 2;
1668	else if (!(uch & RT_BIT(4)))
1669	cb = 3;
1670	else if (!(uch & RT_BIT(3)))
1671	cb = 4;
1672	else if (!(uch & RT_BIT(2)))
1673	cb = 5;
1674	else if (!(uch & RT_BIT(1)))
1675	cb = 6;
1676	else
1677	{
1678	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1679	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1680	}
1681
1682	/* validate the rest */
1683	switch (cb)
1684	{
1685	case 6:
1686	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1687	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1688	RT_FALL_THRU();
1689	case 5:
1690	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1691	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1692	RT_FALL_THRU();
1693	case 4:
1694	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1695	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1696	RT_FALL_THRU();
1697	case 3:
1698	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1699	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1700	RT_FALL_THRU();
1701	case 2:
1702	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1703	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1704	break;
1705	}
1706
1707	/* get and validate the code point. */
1708	switch (cb)
1709	{
1710	case 6:
1711	uc = (puch[5] & 0x3f)
1712	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1713	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1714	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1715	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1716	\| ((RTUNICP)(uch & 0x01) << 30);
1717	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1718	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1719	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1720	break;
1721	case 5:
1722	uc = (puch[4] & 0x3f)
1723	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1724	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1725	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1726	\| ((RTUNICP)(uch & 0x03) << 24);
1727	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1728	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1729	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1730	break;
1731	case 4:
1732	uc = (puch[3] & 0x3f)
1733	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1734	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1735	\| ((RTUNICP)(uch & 0x07) << 18);
1736	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1737	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1738	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1739	break;
1740	case 3:
1741	uc = (puch[2] & 0x3f)
1742	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1743	\| ((RTUNICP)(uch & 0x0f) << 12);
1744	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1745	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1746	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1747	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1748	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1749	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1750	break;
1751	case 2:
1752	uc = (puch[1] & 0x3f)
1753	\| ((RTUNICP)(uch & 0x1f) << 6);
1754	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1755	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1756	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1757	break;
1758	default: /* impossible, but GCC is bitching. */
1759	uc = RTUNICP_INVALID;
1760	break;
1761	}
1762	puch += cb;
1763	}
1764	else
1765	{
1766	/* 6th bit is always set. */
1767	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1768	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1769	}
1770	*pCp = uc;
1771	ppsz = (const char )puch;
1772	return VINF_SUCCESS;
1773	}
1774	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1775
1776
1777	/**
1778	* Handle invalid encodings passed to RTStrGetCpNEx().
1779	* @returns rc
1780	* @param ppsz The pointer to the string position point.
1781	* @param pcch Pointer to the string length.
1782	* @param pCp Where to store RTUNICP_INVALID.
1783	* @param rc The iprt error code.
1784	*/
1785	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1786	{
1787	/*
1788	* Try find a valid encoding.
1789	*/
1790	(ppsz)++; /* @todo code this! */
1791	(*pcch)--;
1792	*pCp = RTUNICP_INVALID;
1793	return rc;
1794	}
1795
1796
1797	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1798	{
1799	const unsigned char puch = (const unsigned char )*ppsz;
1800	const unsigned char uch = *puch;
1801	size_t cch = *pcch;
1802	RTUNICP uc;
1803
1804	if (cch == 0)
1805	{
1806	*pCp = RTUNICP_INVALID;
1807	return VERR_END_OF_STRING;
1808	}
1809
1810	/* ASCII ? */
1811	if (!(uch & RT_BIT(7)))
1812	{
1813	uc = uch;
1814	puch++;
1815	cch--;
1816	}
1817	else if (uch & RT_BIT(6))
1818	{
1819	/* figure the length and validate the first octet. */
1820	/** @todo RT_USE_RTC_3629 */
1821	unsigned cb;
1822	if (!(uch & RT_BIT(5)))
1823	cb = 2;
1824	else if (!(uch & RT_BIT(4)))
1825	cb = 3;
1826	else if (!(uch & RT_BIT(3)))
1827	cb = 4;
1828	else if (!(uch & RT_BIT(2)))
1829	cb = 5;
1830	else if (!(uch & RT_BIT(1)))
1831	cb = 6;
1832	else
1833	{
1834	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1835	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1836	}
1837
1838	if (cb > cch)
1839	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1840
1841	/* validate the rest */
1842	switch (cb)
1843	{
1844	case 6:
1845	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1846	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1847	RT_FALL_THRU();
1848	case 5:
1849	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1850	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1851	RT_FALL_THRU();
1852	case 4:
1853	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1854	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1855	RT_FALL_THRU();
1856	case 3:
1857	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1858	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1859	RT_FALL_THRU();
1860	case 2:
1861	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1862	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1863	break;
1864	}
1865
1866	/* get and validate the code point. */
1867	switch (cb)
1868	{
1869	case 6:
1870	uc = (puch[5] & 0x3f)
1871	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1872	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1873	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1874	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1875	\| ((RTUNICP)(uch & 0x01) << 30);
1876	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1877	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1878	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1879	break;
1880	case 5:
1881	uc = (puch[4] & 0x3f)
1882	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1883	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1884	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1885	\| ((RTUNICP)(uch & 0x03) << 24);
1886	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1887	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1888	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1889	break;
1890	case 4:
1891	uc = (puch[3] & 0x3f)
1892	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1893	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1894	\| ((RTUNICP)(uch & 0x07) << 18);
1895	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1896	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1897	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1898	break;
1899	case 3:
1900	uc = (puch[2] & 0x3f)
1901	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1902	\| ((RTUNICP)(uch & 0x0f) << 12);
1903	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1904	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1905	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1906	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1907	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1908	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1909	break;
1910	case 2:
1911	uc = (puch[1] & 0x3f)
1912	\| ((RTUNICP)(uch & 0x1f) << 6);
1913	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1914	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1915	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1916	break;
1917	default: /* impossible, but GCC is bitching. */
1918	uc = RTUNICP_INVALID;
1919	break;
1920	}
1921	puch += cb;
1922	cch -= cb;
1923	}
1924	else
1925	{
1926	/* 6th bit is always set. */
1927	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1928	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1929	}
1930	*pCp = uc;
1931	ppsz = (const char )puch;
1932	(*pcch) = cch;
1933	return VINF_SUCCESS;
1934	}
1935	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1936
1937
1938	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1939	{
1940	unsigned char puch = (unsigned char )psz;
1941	if (uc < 0x80)
1942	*puch++ = (unsigned char )uc;
1943	else if (uc < 0x00000800)
1944	{
1945	*puch++ = 0xc0 \| (uc >> 6);
1946	*puch++ = 0x80 \| (uc & 0x3f);
1947	}
1948	else if (uc < 0x00010000)
1949	{
1950	/** @todo RT_USE_RTC_3629 */
1951	if ( uc < 0x0000d8000
1952	\|\| ( uc > 0x0000dfff
1953	&& uc < 0x0000fffe))
1954	{
1955	*puch++ = 0xe0 \| (uc >> 12);
1956	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1957	*puch++ = 0x80 \| (uc & 0x3f);
1958	}
1959	else
1960	{
1961	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1962	*puch++ = 0x7f;
1963	}
1964	}
1965	/** @todo RT_USE_RTC_3629 */
1966	else if (uc < 0x00200000)
1967	{
1968	*puch++ = 0xf0 \| (uc >> 18);
1969	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1970	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1971	*puch++ = 0x80 \| (uc & 0x3f);
1972	}
1973	else if (uc < 0x04000000)
1974	{
1975	*puch++ = 0xf8 \| (uc >> 24);
1976	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1977	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1978	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1979	*puch++ = 0x80 \| (uc & 0x3f);
1980	}
1981	else if (uc <= 0x7fffffff)
1982	{
1983	*puch++ = 0xfc \| (uc >> 30);
1984	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1985	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1986	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1987	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1988	*puch++ = 0x80 \| (uc & 0x3f);
1989	}
1990	else
1991	{
1992	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1993	*puch++ = 0x7f;
1994	}
1995
1996	return (char *)puch;
1997	}
1998	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1999
2000
2001	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
2002	{
2003	if (pszStart < psz)
2004	{
2005	/* simple char? */
2006	const unsigned char puch = (const unsigned char )psz;
2007	unsigned uch = *--puch;
2008	if (!(uch & RT_BIT(7)))
2009	return (char *)puch;
2010	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
2011
2012	/* two or more. */
2013	uint32_t uMask = 0xffffffc0;
2014	while ( (const unsigned char *)pszStart < puch
2015	&& !(uMask & 1))
2016	{
2017	uch = *--puch;
2018	if ((uch & 0xc0) != 0x80)
2019	{
2020	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
2021	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
2022	(char *)pszStart);
2023	return (char *)puch;
2024	}
2025	uMask >>= 1;
2026	}
2027	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
2028	}
2029	return (char *)pszStart;
2030	}
2031	RT_EXPORT_SYMBOL(RTStrPrevCp);
2032

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 72778

Download in other formats: