utf16.h@ 95897

Last change on this file since 95897 was 93115, checked in by vboxsync, 3 years ago
scm --update-copyright-year
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 64.2 KB

Line
1	/** @file
2	* IPRT - String Manipulation, UTF-16 encoding.
3	*/
4
5	/*
6	* Copyright (C) 2006-2022 Oracle Corporation
7	*
8	* This file is part of VirtualBox Open Source Edition (OSE), as
9	* available from http://www.virtualbox.org. This file is free software;
10	* you can redistribute it and/or modify it under the terms of the GNU
11	* General Public License (GPL) as published by the Free Software
12	* Foundation, in version 2 as it comes in the "COPYING" file of the
13	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15	*
16	* The contents of this file may alternatively be used under the terms
17	* of the Common Development and Distribution License Version 1.0
18	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
19	* VirtualBox OSE distribution, in which case the provisions of the
20	* CDDL are applicable instead of those of the GPL.
21	*
22	* You may elect to license modified versions of this file under the
23	* terms and conditions of either the GPL or the CDDL or both.
24	*/
25
26	#ifndef IPRT_INCLUDED_utf16_h
27	#define IPRT_INCLUDED_utf16_h
28	#ifndef RT_WITHOUT_PRAGMA_ONCE
29	# pragma once
30	#endif
31
32	#include <iprt/string.h>
33
34	RT_C_DECLS_BEGIN
35
36
37	/** @defgroup rt_str_utf16 UTF-16 String Manipulation
38	* @ingroup grp_rt_str
39	* @{
40	*/
41
42	/**
43	* Allocates memory for UTF-16 string storage (default tag).
44	*
45	* You should normally not use this function, except if there is some very
46	* custom string handling you need doing that isn't covered by any of the other
47	* APIs.
48	*
49	* @returns Pointer to the allocated UTF-16 string. The first wide char is
50	* always set to the string terminator char, the contents of the
51	* remainder of the memory is undefined. The string must be freed by
52	* calling RTUtf16Free.
53	*
54	* NULL is returned if the allocation failed. Please translate this to
55	* VERR_NO_UTF16_MEMORY and not VERR_NO_MEMORY. Also consider
56	* RTUtf16AllocEx if an IPRT status code is required.
57	*
58	* @param cb How many bytes to allocate, will be rounded up
59	* to a multiple of two. If this is zero, we will
60	* allocate a terminator wide char anyway.
61	*/
62	#define RTUtf16Alloc(cb) RTUtf16AllocTag((cb), RTSTR_TAG)
63
64	/**
65	* Allocates memory for UTF-16 string storage (custom tag).
66	*
67	* You should normally not use this function, except if there is some very
68	* custom string handling you need doing that isn't covered by any of the other
69	* APIs.
70	*
71	* @returns Pointer to the allocated UTF-16 string. The first wide char is
72	* always set to the string terminator char, the contents of the
73	* remainder of the memory is undefined. The string must be freed by
74	* calling RTUtf16Free.
75	*
76	* NULL is returned if the allocation failed. Please translate this to
77	* VERR_NO_UTF16_MEMORY and not VERR_NO_MEMORY. Also consider
78	* RTUtf16AllocExTag if an IPRT status code is required.
79	*
80	* @param cb How many bytes to allocate, will be rounded up
81	* to a multiple of two. If this is zero, we will
82	* allocate a terminator wide char anyway.
83	* @param pszTag Allocation tag used for statistics and such.
84	*/
85	RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag);
86
87	/**
88	* Reallocates the specified UTF-16 string (default tag).
89	*
90	* You should normally not use this function, except if there is some very
91	* custom string handling you need doing that isn't covered by any of the other
92	* APIs.
93	*
94	* @returns VINF_SUCCESS.
95	* @retval VERR_NO_UTF16_MEMORY if we failed to reallocate the string, @a
96	* *ppwsz remains unchanged.
97	*
98	* @param ppwsz Pointer to the string variable containing the
99	* input and output string.
100	*
101	* When not freeing the string, the result will
102	* always have the last RTUTF16 set to the
103	* terminator character so that when used for
104	* string truncation the result will be a valid
105	* C-style string (your job to keep it a valid
106	* UTF-16 string).
107	*
108	* When the input string is NULL and we're supposed
109	* to reallocate, the returned string will also
110	* have the first RTUTF16 set to the terminator
111	* char so it will be a valid C-style string.
112	*
113	* @param cbNew When @a cbNew is zero, we'll behave like
114	* RTUtf16Free and @a *ppwsz will be set to NULL.
115	*
116	* When not zero, this will be rounded up to a
117	* multiple of two, and used as the new size of the
118	* memory backing the string, i.e. it includes the
119	* terminator (RTUTF16) char.
120	*/
121	#define RTUtf16Realloc(ppwsz, cbNew) RTUtf16ReallocTag((ppwsz), (cbNew), RTSTR_TAG)
122
123	/**
124	* Reallocates the specified UTF-16 string (custom tag).
125	*
126	* You should normally not use this function, except if there is some very
127	* custom string handling you need doing that isn't covered by any of the other
128	* APIs.
129	*
130	* @returns VINF_SUCCESS.
131	* @retval VERR_NO_UTF16_MEMORY if we failed to reallocate the string, @a
132	* *ppwsz remains unchanged.
133	*
134	* @param ppwsz Pointer to the string variable containing the
135	* input and output string.
136	*
137	* When not freeing the string, the result will
138	* always have the last RTUTF16 set to the
139	* terminator character so that when used for
140	* string truncation the result will be a valid
141	* C-style string (your job to keep it a valid
142	* UTF-16 string).
143	*
144	* When the input string is NULL and we're supposed
145	* to reallocate, the returned string will also
146	* have the first RTUTF16 set to the terminator
147	* char so it will be a valid C-style string.
148	*
149	* @param cbNew When @a cbNew is zero, we'll behave like
150	* RTUtf16Free and @a *ppwsz will be set to NULL.
151	*
152	* When not zero, this will be rounded up to a
153	* multiple of two, and used as the new size of the
154	* memory backing the string, i.e. it includes the
155	* terminator (RTUTF16) char.
156	* @param pszTag Allocation tag used for statistics and such.
157	*/
158	RTDECL(int) RTUtf16ReallocTag(PRTUTF16 *ppwsz, size_t cbNew, const char *pszTag);
159
160	/**
161	* Frees a UTF-16 string allocated by RTUtf16Alloc(), RTUtf16Dup(), RTUtf16DupEx(),
162	* RTStrToUtf16(), RTStrToUtf16Ex(), RTLatin1ToUtf16() or RTLatin1ToUtf16Ex().
163	*
164	* The function is declared void, so no status code is returned.
165	* @param pwszString The UTF-16 string to free. NULL is accepted.
166	*/
167	RTDECL(void) RTUtf16Free(PRTUTF16 pwszString);
168
169	/**
170	* Allocates a new copy of the specified UTF-16 string (default tag).
171	*
172	* @returns Pointer to the allocated string copy. Use RTUtf16Free() to free it.
173	* @returns NULL when out of memory.
174	* @param pwszString UTF-16 string to duplicate.
175	* @remark This function will not make any attempt to validate the encoding.
176	*/
177	#define RTUtf16Dup(pwszString) RTUtf16DupTag((pwszString), RTSTR_TAG)
178
179	/**
180	* Allocates a new copy of the specified UTF-16 string (custom tag).
181	*
182	* @returns Pointer to the allocated string copy. Use RTUtf16Free() to free it.
183	* @returns NULL when out of memory.
184	* @param pwszString UTF-16 string to duplicate.
185	* @param pszTag Allocation tag used for statistics and such.
186	* @remark This function will not make any attempt to validate the encoding.
187	*/
188	RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag);
189
190	/**
191	* Allocates a new copy of the specified UTF-16 string (default tag).
192	*
193	* @returns iprt status code.
194	* @param ppwszString Receives pointer of the allocated UTF-16 string.
195	* The returned pointer must be freed using RTUtf16Free().
196	* @param pwszString UTF-16 string to duplicate.
197	* @param cwcExtra Number of extra RTUTF16 items to allocate.
198	* @remark This function will not make any attempt to validate the encoding.
199	*/
200	#define RTUtf16DupEx(ppwszString, pwszString, cwcExtra) \
201	RTUtf16DupExTag((ppwszString), (pwszString), (cwcExtra), RTSTR_TAG)
202
203	/**
204	* Allocates a new copy of the specified UTF-16 string (custom tag).
205	*
206	* @returns iprt status code.
207	* @param ppwszString Receives pointer of the allocated UTF-16 string.
208	* The returned pointer must be freed using RTUtf16Free().
209	* @param pwszString UTF-16 string to duplicate.
210	* @param cwcExtra Number of extra RTUTF16 items to allocate.
211	* @param pszTag Allocation tag used for statistics and such.
212	* @remark This function will not make any attempt to validate the encoding.
213	*/
214	RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag);
215
216	/**
217	* Returns the length of a UTF-16 string in UTF-16 characters
218	* without trailing '\\0'.
219	*
220	* Surrogate pairs count as two UTF-16 characters here. Use RTUtf16CpCnt()
221	* to get the exact number of code points in the string.
222	*
223	* @returns The number of RTUTF16 items in the string.
224	* @param pwszString Pointer to the UTF-16 string.
225	* @remark This function will not make any attempt to validate the encoding.
226	*/
227	RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString);
228
229	/**
230	* Find the length of a zero-terminated UTF-16 string, given a max string length.
231	*
232	* @returns The string length or @a cwcMax. The returned length does not include
233	* the zero terminator if it was found.
234	*
235	* @param pwszString The string.
236	* @param cwcMax The max string length in RTUTF16s.
237	* @sa RTUtf16NLenEx, RTStrNLen.
238	*/
239	RTDECL(size_t) RTUtf16NLen(PCRTUTF16 pwszString, size_t cwcMax);
240
241	/**
242	* Find the length of a zero-terminated UTF-16 string, given
243	* a max string length.
244	*
245	* @returns IPRT status code.
246	* @retval VINF_SUCCESS if the string has a length less than cwcMax.
247	* @retval VERR_BUFFER_OVERFLOW if the end of the string wasn't found
248	* before cwcMax was reached.
249	*
250	* @param pwszString The string.
251	* @param cwcMax The max string length in RTUTF16s.
252	* @param pcwc Where to store the string length excluding the
253	* terminator. This is set to cwcMax if the terminator
254	* isn't found.
255	* @sa RTUtf16NLen, RTStrNLenEx.
256	*/
257	RTDECL(int) RTUtf16NLenEx(PCRTUTF16 pwszString, size_t cwcMax, size_t *pcwc);
258
259	/**
260	* Find the zero terminator in a string with a limited length.
261	*
262	* @returns Pointer to the zero terminator.
263	* @returns NULL if the zero terminator was not found.
264	*
265	* @param pwszString The string.
266	* @param cwcMax The max string length. RTSTR_MAX is fine.
267	*/
268	RTDECL(PCRTUTF16) RTUtf16End(PCRTUTF16 pwszString, size_t cwcMax);
269
270	/**
271	* Finds a given UTF-16 character in a UTF-16 string.
272	*
273	* @returns Pointer to the first occurrence of @a wc.
274	* @returns NULL if @a wc was not found.
275	*
276	* @param pwszString The string to search.
277	* @param wc The UTF-16 character to search for.
278	*/
279	RTDECL(PRTUTF16) RTUtf16Chr(PCRTUTF16 pwszString, RTUTF16 wc);
280
281	/**
282	* Strips blankspaces from both ends of the string.
283	*
284	* @returns Pointer to first non-blank char in the string.
285	* @param pwsz The string to strip.
286	*/
287	RTDECL(PRTUTF16) RTUtf16Strip(PRTUTF16 pwsz);
288
289	/**
290	* Strips blankspaces from the start of the string.
291	*
292	* @returns Pointer to first non-blank char in the string.
293	* @param pwsz The string to strip.
294	*/
295	RTDECL(PRTUTF16) RTUtf16StripL(PCRTUTF16 pwsz);
296
297	/**
298	* Strips blankspaces from the end of the string.
299	*
300	* @returns pwsz.
301	* @param pwsz The string to strip.
302	*/
303	RTDECL(PRTUTF16) RTUtf16StripR(PRTUTF16 pwsz);
304
305	/**
306	* String copy with overflow handling.
307	*
308	* @retval VINF_SUCCESS on success.
309	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
310	* buffer will contain as much of the string as it can hold, fully
311	* terminated.
312	*
313	* @param pwszDst The destination buffer.
314	* @param cwcDst The size of the destination buffer in RTUTF16s.
315	* @param pwszSrc The source string. NULL is not OK.
316	*/
317	RTDECL(int) RTUtf16Copy(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc);
318
319	/**
320	* String copy with overflow handling, ASCII source.
321	*
322	* @retval VINF_SUCCESS on success.
323	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
324	* buffer will contain as much of the string as it can hold, fully
325	* terminated.
326	*
327	* @param pwszDst The destination buffer.
328	* @param cwcDst The size of the destination buffer in RTUTF16s.
329	* @param pszSrc The source string, pure ASCII. NULL is not OK.
330	*/
331	RTDECL(int) RTUtf16CopyAscii(PRTUTF16 pwszDst, size_t cwcDst, const char *pszSrc);
332
333	/**
334	* String copy with overflow handling.
335	*
336	* @retval VINF_SUCCESS on success.
337	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
338	* buffer will contain as much of the string as it can hold, fully
339	* terminated.
340	*
341	* @param pwszDst The destination buffer.
342	* @param cwcDst The size of the destination buffer in RTUTF16s.
343	* @param pwszSrc The source string. NULL is not OK.
344	* @param cwcSrcMax The maximum number of chars (not code points) to
345	* copy from the source string, not counting the
346	* terminator as usual.
347	*/
348	RTDECL(int) RTUtf16CopyEx(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc, size_t cwcSrcMax);
349
350	/**
351	* String concatenation with overflow handling.
352	*
353	* @retval VINF_SUCCESS on success.
354	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
355	* buffer will contain as much of the string as it can hold, fully
356	* terminated.
357	*
358	* @param pwszDst The destination buffer.
359	* @param cwcDst The size of the destination buffer in RTUTF16s.
360	* @param pwszSrc The source string. NULL is not OK.
361	*/
362	RTDECL(int) RTUtf16Cat(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc);
363
364	/**
365	* String concatenation with overflow handling, ASCII source.
366	*
367	* @retval VINF_SUCCESS on success.
368	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
369	* buffer will contain as much of the string as it can hold, fully
370	* terminated.
371	*
372	* @param pwszDst The destination buffer.
373	* @param cwcDst The size of the destination buffer in RTUTF16s.
374	* @param pszSrc The source string, pure ASCII. NULL is not OK.
375	*/
376	RTDECL(int) RTUtf16CatAscii(PRTUTF16 pwszDst, size_t cwcDst, const char *pszSrc);
377
378	/**
379	* String concatenation with overflow handling.
380	*
381	* @retval VINF_SUCCESS on success.
382	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
383	* buffer will contain as much of the string as it can hold, fully
384	* terminated.
385	*
386	* @param pwszDst The destination buffer.
387	* @param cwcDst The size of the destination buffer in RTUTF16s.
388	* @param pwszSrc The source string. NULL is not OK.
389	* @param cwcSrcMax The maximum number of UTF-16 chars (not code
390	* points) to copy from the source string, not
391	* counting the terminator as usual.
392	*/
393	RTDECL(int) RTUtf16CatEx(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc, size_t cwcSrcMax);
394
395	/**
396	* Performs a case sensitive string compare between two UTF-16 strings.
397	*
398	* @returns < 0 if the first string less than the second string.
399	* @returns 0 if the first string identical to the second string.
400	* @returns > 0 if the first string greater than the second string.
401	* @param pwsz1 First UTF-16 string. Null is allowed.
402	* @param pwsz2 Second UTF-16 string. Null is allowed.
403	* @remark This function will not make any attempt to validate the encoding.
404	*/
405	RTDECL(int) RTUtf16Cmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
406
407	/**
408	* Performs a case sensitive string compare between an UTF-16 string and a pure
409	* ASCII string.
410	*
411	* @returns < 0 if the first string less than the second string.
412	* @returns 0 if the first string identical to the second string.
413	* @returns > 0 if the first string greater than the second string.
414	* @param pwsz1 First UTF-16 string. Null is allowed.
415	* @param psz2 Second string, pure ASCII. Null is allowed.
416	* @remark This function will not make any attempt to validate the encoding.
417	*/
418	RTDECL(int) RTUtf16CmpAscii(PCRTUTF16 pwsz1, const char *psz2);
419
420	/**
421	* Performs a case sensitive string compare between an UTF-16 string and a UTF-8
422	* string.
423	*
424	* @returns < 0 if the first string less than the second string.
425	* @returns 0 if the first string identical to the second string.
426	* @returns > 0 if the first string greater than the second string.
427	* @param pwsz1 First UTF-16 string. Null is allowed.
428	* @param psz2 Second string, UTF-8. Null is allowed.
429	* @remarks NULL and empty strings are treated equally.
430	*/
431	RTDECL(int) RTUtf16CmpUtf8(PCRTUTF16 pwsz1, const char *psz2);
432
433
434	/**
435	* Performs a case sensitive and length limited string compare between two UTF-16 strings.
436	*
437	* @returns < 0 if the first string less than the second string.
438	* @returns 0 if the first string identical to the second string.
439	* @returns > 0 if the first string greater than the second string.
440	* @param pwsz1 First UTF-16 string. Null is allowed.
441	* @param pwsz2 Second UTF-16 string. Null is allowed.
442	* @param cwcMax Maximum number of characters (RTUTF16) to compare.
443	* @remark This function will not make any attempt to validate the encoding.
444	*/
445	RTDECL(int) RTUtf16NCmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2, size_t cwcMax);
446
447	/**
448	* Performs a case sensitive and length limited string compare between an UTF-16
449	* string and a pure ASCII string.
450	*
451	* @returns < 0 if the first string less than the second string.
452	* @returns 0 if the first string identical to the second string.
453	* @returns > 0 if the first string greater than the second string.
454	* @param pwsz1 First UTF-16 string. Null is allowed.
455	* @param psz2 Second string, pure ASCII. Null is allowed.
456	* @param cwcMax Maximum number of characters (RTUTF16) to compare.
457	* @remark This function will not make any attempt to validate the encoding.
458	*/
459	RTDECL(int) RTUtf16NCmpAscii(PCRTUTF16 pwsz1, const char *psz2, size_t cwcMax);
460
461	/**
462	* Performs a case sensitive and length limited string compare between an UTF-16
463	* string and a UTF-8 string.
464	*
465	* @returns < 0 if the first string less than the second string.
466	* @returns 0 if the first string identical to the second string.
467	* @returns > 0 if the first string greater than the second string.
468	* @param pwsz1 First UTF-16 string. Null is allowed.
469	* @param psz2 Second string, UTF-8. Null is allowed.
470	* @param cwcMax1 Maximum number of UTF-16 characters (RTUTF16) from the
471	* first string to compare.
472	* @param cchMax2 Maximum number of UTF-8 characters (char) from the
473	* second string to compare.
474	* @remarks NULL and empty strings are treated equally.
475	*/
476	RTDECL(int) RTUtf16NCmpUtf8(PCRTUTF16 pwsz1, const char *psz2, size_t cwcMax1, size_t cchMax2);
477
478
479	/**
480	* Performs a case insensitive string compare between two UTF-16 strings.
481	*
482	* This is a simplified compare, as only the simplified lower/upper case folding
483	* specified by the unicode specs are used. It does not consider character pairs
484	* as they are used in some languages, just simple upper & lower case compares.
485	*
486	* @returns < 0 if the first string less than the second string.
487	* @returns 0 if the first string identical to the second string.
488	* @returns > 0 if the first string greater than the second string.
489	* @param pwsz1 First UTF-16 string. Null is allowed.
490	* @param pwsz2 Second UTF-16 string. Null is allowed.
491	*/
492	RTDECL(int) RTUtf16ICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
493
494	/**
495	* Performs a case insensitive string compare between two big endian UTF-16
496	* strings.
497	*
498	* This is a simplified compare, as only the simplified lower/upper case folding
499	* specified by the unicode specs are used. It does not consider character pairs
500	* as they are used in some languages, just simple upper & lower case compares.
501	*
502	* @returns < 0 if the first string less than the second string.
503	* @returns 0 if the first string identical to the second string.
504	* @returns > 0 if the first string greater than the second string.
505	* @param pwsz1 First big endian UTF-16 string. Null is allowed.
506	* @param pwsz2 Second big endian UTF-16 string. Null is allowed.
507	*/
508	RTDECL(int) RTUtf16BigICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
509
510	/**
511	* Performs a case insensitive string compare between an UTF-16 string and a
512	* UTF-8 string.
513	*
514	* @returns < 0 if the first string less than the second string.
515	* @returns 0 if the first string identical to the second string.
516	* @returns > 0 if the first string greater than the second string.
517	* @param pwsz1 First UTF-16 string. Null is allowed.
518	* @param psz2 Second string, UTF-8. Null is allowed.
519	* @remarks NULL and empty strings are treated equally.
520	*/
521	RTDECL(int) RTUtf16ICmpUtf8(PCRTUTF16 pwsz1, const char *psz2);
522
523	/**
524	* Performs a case insensitive string compare between an UTF-16 string and a
525	* pure ASCII string.
526	*
527	* Since this compare only cares about the first 128 codepoints in
528	* unicode, no tables are needed and there aren't any real complications.
529	*
530	* @returns < 0 if the first string less than the second string.
531	* @returns 0 if the first string identical to the second string.
532	* @returns > 0 if the first string greater than the second string.
533	* @param pwsz1 First UTF-16 string. Null is allowed.
534	* @param psz2 Second string, pure ASCII. Null is allowed.
535	*/
536	RTDECL(int) RTUtf16ICmpAscii(PCRTUTF16 pwsz1, const char *psz2);
537
538	/**
539	* Performs a case insensitive string compare between two UTF-16 strings
540	* using the current locale of the process (if applicable).
541	*
542	* This differs from RTUtf16ICmp() in that it will try, if a locale with the
543	* required data is available, to do a correct case-insensitive compare. It
544	* follows that it is more complex and thereby likely to be more expensive.
545	*
546	* @returns < 0 if the first string less than the second string.
547	* @returns 0 if the first string identical to the second string.
548	* @returns > 0 if the first string greater than the second string.
549	* @param pwsz1 First UTF-16 string. Null is allowed.
550	* @param pwsz2 Second UTF-16 string. Null is allowed.
551	*/
552	RTDECL(int) RTUtf16LocaleICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
553
554	/**
555	* Performs a case insensitive string compare between two UTF-16 strings,
556	* stopping after N characters.
557	*
558	* This is a simplified compare, as only the simplified lower/upper case folding
559	* specified by the unicode specs are used. It does not consider character pairs
560	* as they are used in some languages, just simple upper & lower case compares.
561	*
562	* @returns < 0 if the first string less than the second string.
563	* @returns 0 if the first string identical to the second string.
564	* @returns > 0 if the first string greater than the second string.
565	* @param pwsz1 First UTF-16 string. Null is allowed.
566	* @param pwsz2 Second UTF-16 string. Null is allowed.
567	* @param cwcMax Maximum number of characters to compare.
568	*/
569	RTDECL(int) RTUtf16NICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2, size_t cwcMax);
570
571	/**
572	* Performs a case insensitive string compare between two big endian UTF-16
573	* strings, stopping after N characters.
574	*
575	* This is a simplified compare, as only the simplified lower/upper case folding
576	* specified by the unicode specs are used. It does not consider character pairs
577	* as they are used in some languages, just simple upper & lower case compares.
578	*
579	* @returns < 0 if the first string less than the second string.
580	* @returns 0 if the first string identical to the second string.
581	* @returns > 0 if the first string greater than the second string.
582	* @param pwsz1 First big endian UTF-16 string. Null is allowed.
583	* @param pwsz2 Second big endian UTF-16 string. Null is allowed.
584	* @param cwcMax Maximum number of characters to compare.
585	*/
586	RTDECL(int) RTUtf16BigNICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2, size_t cwcMax);
587
588	/**
589	* Performs a case insensitive string compare between a UTF-16 string and a pure
590	* ASCII string, stopping after N characters.
591	*
592	* Since this compare only cares about the first 128 codepoints in
593	* unicode, no tables are needed and there aren't any real complications.
594	*
595	* @returns < 0 if the first string less than the second string.
596	* @returns 0 if the first string identical to the second string.
597	* @returns > 0 if the first string greater than the second string.
598	* @param pwsz1 The UTF-16 first string. Null is allowed.
599	* @param psz2 The pure ASCII second string. Null is allowed.
600	* @param cwcMax Maximum number of UTF-16 characters to compare.
601	*/
602	RTDECL(int) RTUtf16NICmpAscii(PCRTUTF16 pwsz1, const char *psz2, size_t cwcMax);
603
604
605	/**
606	* Folds a UTF-16 string to lowercase.
607	*
608	* This is a very simple folding; it uses the simple lowercase
609	* code point, it is not related to any locale just the most common
610	* lowercase codepoint setup by the unicode specs, and it will not
611	* create new surrogate pairs or remove existing ones.
612	*
613	* @returns Pointer to the passed in string.
614	* @param pwsz The string to fold.
615	*/
616	RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz);
617
618	/**
619	* Folds a UTF-16 string to uppercase.
620	*
621	* This is a very simple folding; it uses the simple uppercase
622	* code point, it is not related to any locale just the most common
623	* uppercase codepoint setup by the unicode specs, and it will not
624	* create new surrogate pairs or remove existing ones.
625	*
626	* @returns Pointer to the passed in string.
627	* @param pwsz The string to fold.
628	*/
629	RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz);
630
631	/**
632	* Validates the UTF-16 encoding of the string.
633	*
634	* @returns iprt status code.
635	* @param pwsz The string.
636	*/
637	RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz);
638
639	/**
640	* Validates the UTF-16 encoding of the string.
641	*
642	* @returns iprt status code.
643	* @param pwsz The string.
644	* @param cwc The max string length (/ size) in UTF-16 units. Use
645	* RTSTR_MAX to process the entire string.
646	* @param fFlags Combination of RTSTR_VALIDATE_ENCODING_XXX flags.
647	*/
648	RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags);
649
650	/**
651	* Checks if the UTF-16 encoding is valid.
652	*
653	* @returns true / false.
654	* @param pwsz The string.
655	*/
656	RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz);
657
658	/**
659	* Sanitise a (valid) UTF-16 string by replacing all characters outside a white
660	* list in-place by an ASCII replacement character.
661	*
662	* Surrogate pairs will be replaced by two chars.
663	*
664	* @returns The number of code points replaced. In the case of an incorrectly
665	* encoded string -1 will be returned, and the string is not completely
666	* processed. In the case of puszValidPairs having an odd number of
667	* code points, -1 will also be returned but without any modification to
668	* the string.
669	* @param pwsz The string to sanitise.
670	* @param puszValidPairs A zero-terminated array of pairs of Unicode points.
671	* Each pair is the start and end point of a range,
672	* and the union of these ranges forms the white list.
673	* @param chReplacement The ASCII replacement character.
674	* @sa RTStrPurgeComplementSet
675	*/
676	RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidPairs, char chReplacement);
677
678
679	/**
680	* Translate a UTF-16 string into a UTF-8 allocating the result buffer (default
681	* tag).
682	*
683	* @returns iprt status code.
684	* @param pwszString UTF-16 string to convert.
685	* @param ppszString Receives pointer of allocated UTF-8 string on
686	* success, and is always set to NULL on failure.
687	* The returned pointer must be freed using RTStrFree().
688	*/
689	#define RTUtf16ToUtf8(pwszString, ppszString) RTUtf16ToUtf8Tag((pwszString), (ppszString), RTSTR_TAG)
690
691	/**
692	* Translate a UTF-16 string into a UTF-8 allocating the result buffer (custom tag).
693	*
694	* @returns iprt status code.
695	* @param pwszString UTF-16 string to convert.
696	* @param ppszString Receives pointer of allocated UTF-8 string on
697	* success, and is always set to NULL on failure.
698	* The returned pointer must be freed using RTStrFree().
699	* @param pszTag Allocation tag used for statistics and such.
700	*/
701	RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
702
703	/**
704	* Translate a UTF-16BE string into a UTF-8 allocating the result buffer
705	* (default tag).
706	*
707	* This differs from RTUtf16ToUtf8 in that the input is always a
708	* big-endian string.
709	*
710	* @returns iprt status code.
711	* @param pwszString UTF-16BE string to convert.
712	* @param ppszString Receives pointer of allocated UTF-8 string on
713	* success, and is always set to NULL on failure.
714	* The returned pointer must be freed using RTStrFree().
715	*/
716	#define RTUtf16BigToUtf8(pwszString, ppszString) RTUtf16BigToUtf8Tag((pwszString), (ppszString), RTSTR_TAG)
717
718	/**
719	* Translate a UTF-16BE string into a UTF-8 allocating the result buffer (custom tag).
720	*
721	* This differs from RTUtf16ToUtf8Tag in that the input is always a
722	* big-endian string.
723	*
724	* @returns iprt status code.
725	* @param pwszString UTF-16BE string to convert.
726	* @param ppszString Receives pointer of allocated UTF-8 string on
727	* success, and is always set to NULL on failure.
728	* The returned pointer must be freed using RTStrFree().
729	* @param pszTag Allocation tag used for statistics and such.
730	*/
731	RTDECL(int) RTUtf16BigToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
732
733	/**
734	* Translate a UTF-16LE string into a UTF-8 allocating the result buffer
735	* (default tag).
736	*
737	* This differs from RTUtf16ToUtf8 in that the input is always a
738	* little-endian string.
739	*
740	* @returns iprt status code.
741	* @param pwszString UTF-16LE string to convert.
742	* @param ppszString Receives pointer of allocated UTF-8 string on
743	* success, and is always set to NULL on failure.
744	* The returned pointer must be freed using RTStrFree().
745	*/
746	#define RTUtf16LittleToUtf8(pwszString, ppszString) RTUtf16LittleToUtf8Tag((pwszString), (ppszString), RTSTR_TAG)
747
748	/**
749	* Translate a UTF-16LE string into a UTF-8 allocating the result buffer (custom tag).
750	*
751	* This differs from RTUtf16ToUtf8Tag in that the input is always a
752	* little-endian string.
753	*
754	* @returns iprt status code.
755	* @param pwszString UTF-16LE string to convert.
756	* @param ppszString Receives pointer of allocated UTF-8 string on
757	* success, and is always set to NULL on failure.
758	* The returned pointer must be freed using RTStrFree().
759	* @param pszTag Allocation tag used for statistics and such.
760	*/
761	RTDECL(int) RTUtf16LittleToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
762
763
764	/**
765	* Translates UTF-16 to UTF-8 using buffer provided by the caller or a fittingly
766	* sized buffer allocated by the function (default tag).
767	*
768	* @returns iprt status code.
769	* @param pwszString The UTF-16 string to convert.
770	* @param cwcString The number of RTUTF16 items to translate from pwszString.
771	* The translation will stop when reaching cwcString or the terminator ('\\0').
772	* Use RTSTR_MAX to translate the entire string.
773	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
774	* a buffer of the specified size, or pointer to a NULL pointer.
775	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
776	* will be allocated to hold the translated string.
777	* If a buffer was requested it must be freed using RTStrFree().
778	* @param cch The buffer size in chars (the type). This includes the terminator.
779	* @param pcch Where to store the length of the translated string,
780	* excluding the terminator. (Optional)
781	*
782	* This may be set under some error conditions,
783	* however, only for VERR_BUFFER_OVERFLOW and
784	* VERR_NO_STR_MEMORY will it contain a valid string
785	* length that can be used to resize the buffer.
786	*/
787	#define RTUtf16ToUtf8Ex(pwszString, cwcString, ppsz, cch, pcch) \
788	RTUtf16ToUtf8ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
789
790	/**
791	* Translates UTF-16 to UTF-8 using buffer provided by the caller or a fittingly
792	* sized buffer allocated by the function (custom tag).
793	*
794	* @returns iprt status code.
795	* @param pwszString The UTF-16 string to convert.
796	* @param cwcString The number of RTUTF16 items to translate from pwszString.
797	* The translation will stop when reaching cwcString or the terminator ('\\0').
798	* Use RTSTR_MAX to translate the entire string.
799	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
800	* a buffer of the specified size, or pointer to a NULL pointer.
801	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
802	* will be allocated to hold the translated string.
803	* If a buffer was requested it must be freed using RTStrFree().
804	* @param cch The buffer size in chars (the type). This includes the terminator.
805	* @param pcch Where to store the length of the translated string,
806	* excluding the terminator. (Optional)
807	*
808	* This may be set under some error conditions,
809	* however, only for VERR_BUFFER_OVERFLOW and
810	* VERR_NO_STR_MEMORY will it contain a valid string
811	* length that can be used to resize the buffer.
812	* @param pszTag Allocation tag used for statistics and such.
813	*/
814	RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char *ppsz, size_t cch, size_t pcch, const char *pszTag);
815
816	/**
817	* Translates UTF-16BE to UTF-8 using buffer provided by the caller or a
818	* fittingly sized buffer allocated by the function (default tag).
819	*
820	* This differs from RTUtf16ToUtf8Ex in that the input is always a
821	* big-endian string.
822	*
823	* @returns iprt status code.
824	* @param pwszString The UTF-16BE string to convert.
825	* @param cwcString The number of RTUTF16 items to translate from pwszString.
826	* The translation will stop when reaching cwcString or the terminator ('\\0').
827	* Use RTSTR_MAX to translate the entire string.
828	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
829	* a buffer of the specified size, or pointer to a NULL pointer.
830	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
831	* will be allocated to hold the translated string.
832	* If a buffer was requested it must be freed using RTStrFree().
833	* @param cch The buffer size in chars (the type). This includes the terminator.
834	* @param pcch Where to store the length of the translated string,
835	* excluding the terminator. (Optional)
836	*
837	* This may be set under some error conditions,
838	* however, only for VERR_BUFFER_OVERFLOW and
839	* VERR_NO_STR_MEMORY will it contain a valid string
840	* length that can be used to resize the buffer.
841	*/
842	#define RTUtf16BigToUtf8Ex(pwszString, cwcString, ppsz, cch, pcch) \
843	RTUtf16BigToUtf8ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
844
845	/**
846	* Translates UTF-16BE to UTF-8 using buffer provided by the caller or a
847	* fittingly sized buffer allocated by the function (custom tag).
848	*
849	* This differs from RTUtf16ToUtf8ExTag in that the input is always a
850	* big-endian string.
851	*
852	* @returns iprt status code.
853	* @param pwszString The UTF-16BE string to convert.
854	* @param cwcString The number of RTUTF16 items to translate from pwszString.
855	* The translation will stop when reaching cwcString or the terminator ('\\0').
856	* Use RTSTR_MAX to translate the entire string.
857	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
858	* a buffer of the specified size, or pointer to a NULL pointer.
859	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
860	* will be allocated to hold the translated string.
861	* If a buffer was requested it must be freed using RTStrFree().
862	* @param cch The buffer size in chars (the type). This includes the terminator.
863	* @param pcch Where to store the length of the translated string,
864	* excluding the terminator. (Optional)
865	*
866	* This may be set under some error conditions,
867	* however, only for VERR_BUFFER_OVERFLOW and
868	* VERR_NO_STR_MEMORY will it contain a valid string
869	* length that can be used to resize the buffer.
870	* @param pszTag Allocation tag used for statistics and such.
871	*/
872	RTDECL(int) RTUtf16BigToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char *ppsz, size_t cch, size_t pcch, const char *pszTag);
873
874	/**
875	* Translates UTF-16LE to UTF-8 using buffer provided by the caller or a
876	* fittingly sized buffer allocated by the function (default tag).
877	*
878	* This differs from RTUtf16ToUtf8Ex in that the input is always a
879	* little-endian string.
880	*
881	* @returns iprt status code.
882	* @param pwszString The UTF-16LE string to convert.
883	* @param cwcString The number of RTUTF16 items to translate from pwszString.
884	* The translation will stop when reaching cwcString or the terminator ('\\0').
885	* Use RTSTR_MAX to translate the entire string.
886	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
887	* a buffer of the specified size, or pointer to a NULL pointer.
888	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
889	* will be allocated to hold the translated string.
890	* If a buffer was requested it must be freed using RTStrFree().
891	* @param cch The buffer size in chars (the type). This includes the terminator.
892	* @param pcch Where to store the length of the translated string,
893	* excluding the terminator. (Optional)
894	*
895	* This may be set under some error conditions,
896	* however, only for VERR_BUFFER_OVERFLOW and
897	* VERR_NO_STR_MEMORY will it contain a valid string
898	* length that can be used to resize the buffer.
899	*/
900	#define RTUtf16LittleToUtf8Ex(pwszString, cwcString, ppsz, cch, pcch) \
901	RTUtf16LittleToUtf8ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
902
903	/**
904	* Translates UTF-16LE to UTF-8 using buffer provided by the caller or a
905	* fittingly sized buffer allocated by the function (custom tag).
906	*
907	* This differs from RTUtf16ToUtf8ExTag in that the input is always a
908	* little-endian string.
909	*
910	* @returns iprt status code.
911	* @param pwszString The UTF-16LE string to convert.
912	* @param cwcString The number of RTUTF16 items to translate from pwszString.
913	* The translation will stop when reaching cwcString or the terminator ('\\0').
914	* Use RTSTR_MAX to translate the entire string.
915	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
916	* a buffer of the specified size, or pointer to a NULL pointer.
917	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
918	* will be allocated to hold the translated string.
919	* If a buffer was requested it must be freed using RTStrFree().
920	* @param cch The buffer size in chars (the type). This includes the terminator.
921	* @param pcch Where to store the length of the translated string,
922	* excluding the terminator. (Optional)
923	*
924	* This may be set under some error conditions,
925	* however, only for VERR_BUFFER_OVERFLOW and
926	* VERR_NO_STR_MEMORY will it contain a valid string
927	* length that can be used to resize the buffer.
928	* @param pszTag Allocation tag used for statistics and such.
929	*/
930	RTDECL(int) RTUtf16LittleToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char *ppsz, size_t cch, size_t pcch,
931	const char *pszTag);
932
933	/**
934	* Calculates the length of the UTF-16 string in UTF-8 chars (bytes).
935	*
936	* This function will validate the string, and incorrectly encoded UTF-16
937	* strings will be rejected. The primary purpose of this function is to
938	* help allocate buffers for RTUtf16ToUtf8() of the correct size. For most
939	* other purposes RTUtf16ToUtf8Ex() should be used.
940	*
941	* @returns Number of char (bytes).
942	* @returns 0 if the string was incorrectly encoded.
943	* @param pwsz The UTF-16 string.
944	*/
945	RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz);
946
947	/**
948	* Calculates the length of the UTF-16BE string in UTF-8 chars (bytes).
949	*
950	* This function will validate the string, and incorrectly encoded UTF-16BE
951	* strings will be rejected. The primary purpose of this function is to
952	* help allocate buffers for RTUtf16BigToUtf8() of the correct size. For most
953	* other purposes RTUtf16BigToUtf8Ex() should be used.
954	*
955	* @returns Number of char (bytes).
956	* @returns 0 if the string was incorrectly encoded.
957	* @param pwsz The UTF-16BE string.
958	*/
959	RTDECL(size_t) RTUtf16BigCalcUtf8Len(PCRTUTF16 pwsz);
960
961	/**
962	* Calculates the length of the UTF-16LE string in UTF-8 chars (bytes).
963	*
964	* This function will validate the string, and incorrectly encoded UTF-16LE
965	* strings will be rejected. The primary purpose of this function is to
966	* help allocate buffers for RTUtf16LittleToUtf8() of the correct size. For
967	* most other purposes RTUtf16LittleToUtf8Ex() should be used.
968	*
969	* @returns Number of char (bytes).
970	* @returns 0 if the string was incorrectly encoded.
971	* @param pwsz The UTF-16LE string.
972	*/
973	RTDECL(size_t) RTUtf16LittleCalcUtf8Len(PCRTUTF16 pwsz);
974
975	/**
976	* Calculates the length of the UTF-16 string in UTF-8 chars (bytes).
977	*
978	* This function will validate the string, and incorrectly encoded UTF-16
979	* strings will be rejected.
980	*
981	* @returns iprt status code.
982	* @param pwsz The string.
983	* @param cwc The max string length. Use RTSTR_MAX to process the entire string.
984	* @param pcch Where to store the string length (in bytes). Optional.
985	* This is undefined on failure.
986	*/
987	RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
988
989	/**
990	* Calculates the length of the UTF-16BE string in UTF-8 chars (bytes).
991	*
992	* This function will validate the string, and incorrectly encoded UTF-16BE
993	* strings will be rejected.
994	*
995	* @returns iprt status code.
996	* @param pwsz The string.
997	* @param cwc The max string length. Use RTSTR_MAX to process the entire string.
998	* @param pcch Where to store the string length (in bytes). Optional.
999	* This is undefined on failure.
1000	*/
1001	RTDECL(int) RTUtf16BigCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1002
1003	/**
1004	* Calculates the length of the UTF-16LE string in UTF-8 chars (bytes).
1005	*
1006	* This function will validate the string, and incorrectly encoded UTF-16LE
1007	* strings will be rejected.
1008	*
1009	* @returns iprt status code.
1010	* @param pwsz The string.
1011	* @param cwc The max string length. Use RTSTR_MAX to process the entire string.
1012	* @param pcch Where to store the string length (in bytes). Optional.
1013	* This is undefined on failure.
1014	*/
1015	RTDECL(int) RTUtf16LittleCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1016
1017	/**
1018	* Translate a UTF-16 string into a Latin-1 (ISO-8859-1) allocating the result
1019	* buffer (default tag).
1020	*
1021	* @returns iprt status code.
1022	* @param pwszString UTF-16 string to convert.
1023	* @param ppszString Receives pointer of allocated Latin1 string on
1024	* success, and is always set to NULL on failure.
1025	* The returned pointer must be freed using RTStrFree().
1026	*/
1027	#define RTUtf16ToLatin1(pwszString, ppszString) RTUtf16ToLatin1Tag((pwszString), (ppszString), RTSTR_TAG)
1028
1029	/**
1030	* Translate a UTF-16 string into a Latin-1 (ISO-8859-1) allocating the result
1031	* buffer (custom tag).
1032	*
1033	* @returns iprt status code.
1034	* @param pwszString UTF-16 string to convert.
1035	* @param ppszString Receives pointer of allocated Latin1 string on
1036	* success, and is always set to NULL on failure.
1037	* The returned pointer must be freed using RTStrFree().
1038	* @param pszTag Allocation tag used for statistics and such.
1039	*/
1040	RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char *ppszString, const char pszTag);
1041
1042	/**
1043	* Translates UTF-16 to Latin-1 (ISO-8859-1) using buffer provided by the caller
1044	* or a fittingly sized buffer allocated by the function (default tag).
1045	*
1046	* @returns iprt status code.
1047	* @param pwszString The UTF-16 string to convert.
1048	* @param cwcString The number of RTUTF16 items to translate from
1049	* pwszString. The translation will stop when reaching
1050	* cwcString or the terminator ('\\0'). Use RTSTR_MAX
1051	* to translate the entire string.
1052	* @param ppsz Pointer to the pointer to the Latin-1 string. The
1053	* buffer can optionally be preallocated by the caller.
1054	*
1055	* If cch is zero, *ppsz is undefined.
1056	*
1057	* If cch is non-zero and *ppsz is not NULL, then this
1058	* will be used as the output buffer.
1059	* VERR_BUFFER_OVERFLOW will be returned if this is
1060	* insufficient.
1061	*
1062	* If cch is zero or *ppsz is NULL, then a buffer of
1063	* sufficient size is allocated. cch can be used to
1064	* specify a minimum size of this buffer. Use
1065	* RTStrFree() to free the result.
1066	*
1067	* @param cch The buffer size in chars (the type). This includes
1068	* the terminator.
1069	* @param pcch Where to store the length of the translated string,
1070	* excluding the terminator. (Optional)
1071	*
1072	* This may be set under some error conditions,
1073	* however, only for VERR_BUFFER_OVERFLOW and
1074	* VERR_NO_STR_MEMORY will it contain a valid string
1075	* length that can be used to resize the buffer.
1076	*/
1077	#define RTUtf16ToLatin1Ex(pwszString, cwcString, ppsz, cch, pcch) \
1078	RTUtf16ToLatin1ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
1079
1080	/**
1081	* Translates UTF-16 to Latin-1 (ISO-8859-1) using buffer provided by the caller
1082	* or a fittingly sized buffer allocated by the function (custom tag).
1083	*
1084	* @returns iprt status code.
1085	* @param pwszString The UTF-16 string to convert.
1086	* @param cwcString The number of RTUTF16 items to translate from
1087	* pwszString. The translation will stop when reaching
1088	* cwcString or the terminator ('\\0'). Use RTSTR_MAX
1089	* to translate the entire string.
1090	* @param ppsz Pointer to the pointer to the Latin-1 string. The
1091	* buffer can optionally be preallocated by the caller.
1092	*
1093	* If cch is zero, *ppsz is undefined.
1094	*
1095	* If cch is non-zero and *ppsz is not NULL, then this
1096	* will be used as the output buffer.
1097	* VERR_BUFFER_OVERFLOW will be returned if this is
1098	* insufficient.
1099	*
1100	* If cch is zero or *ppsz is NULL, then a buffer of
1101	* sufficient size is allocated. cch can be used to
1102	* specify a minimum size of this buffer. Use
1103	* RTStrFree() to free the result.
1104	*
1105	* @param cch The buffer size in chars (the type). This includes
1106	* the terminator.
1107	* @param pcch Where to store the length of the translated string,
1108	* excluding the terminator. (Optional)
1109	*
1110	* This may be set under some error conditions,
1111	* however, only for VERR_BUFFER_OVERFLOW and
1112	* VERR_NO_STR_MEMORY will it contain a valid string
1113	* length that can be used to resize the buffer.
1114	* @param pszTag Allocation tag used for statistics and such.
1115	*/
1116	RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char *ppsz, size_t cch, size_t pcch, const char *pszTag);
1117
1118	/**
1119	* Calculates the length of the UTF-16 string in Latin-1 (ISO-8859-1) chars.
1120	*
1121	* This function will validate the string, and incorrectly encoded UTF-16
1122	* strings will be rejected. The primary purpose of this function is to
1123	* help allocate buffers for RTUtf16ToLatin1() of the correct size. For most
1124	* other purposes RTUtf16ToLatin1Ex() should be used.
1125	*
1126	* @returns Number of char (bytes).
1127	* @returns 0 if the string was incorrectly encoded.
1128	* @param pwsz The UTF-16 string.
1129	*/
1130	RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz);
1131
1132	/**
1133	* Calculates the length of the UTF-16 string in Latin-1 (ISO-8859-1) chars.
1134	*
1135	* This function will validate the string, and incorrectly encoded UTF-16
1136	* strings will be rejected.
1137	*
1138	* @returns iprt status code.
1139	* @param pwsz The string.
1140	* @param cwc The max string length. Use RTSTR_MAX to process the
1141	* entire string.
1142	* @param pcch Where to store the string length (in bytes). Optional.
1143	* This is undefined on failure.
1144	*/
1145	RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1146
1147	/**
1148	* Get the unicode code point at the given string position.
1149	*
1150	* @returns unicode code point.
1151	* @returns RTUNICP_INVALID if the encoding is invalid.
1152	* @param pwsz The string.
1153	*
1154	* @remark This is an internal worker for RTUtf16GetCp().
1155	*/
1156	RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz);
1157
1158	/**
1159	* Get the unicode code point at the given string position.
1160	*
1161	* @returns iprt status code.
1162	* @param ppwsz Pointer to the string pointer. This will be updated to
1163	* point to the char following the current code point.
1164	* @param pCp Where to store the code point.
1165	* RTUNICP_INVALID is stored here on failure.
1166	*
1167	* @remark This is an internal worker for RTUtf16GetCpEx().
1168	*/
1169	RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp);
1170
1171	/**
1172	* Get the unicode code point at the given string position with length
1173	* restriction.
1174	*
1175	* @returns iprt status code.
1176	* @param ppwsz Pointer to the string pointer. This will be updated to
1177	* point to the char following the current code point.
1178	* @param pcwc Pointer to the max string length. This will be
1179	* decremented corresponding to the advancement of @a ppwsz.
1180	* @param pCp Where to store the code point.
1181	* RTUNICP_INVALID is stored here on failure.
1182	*
1183	* @remark This is an internal worker for RTUtf16GetCpNEx().
1184	*/
1185	RTDECL(int) RTUtf16GetCpNExInternal(PCRTUTF16 ppwsz, size_t pcwc, PRTUNICP pCp);
1186
1187	/**
1188	* Get the unicode code point at the given string position, big endian.
1189	*
1190	* @returns iprt status code.
1191	* @param ppwsz Pointer to the string pointer. This will be updated to
1192	* point to the char following the current code point.
1193	* @param pCp Where to store the code point.
1194	* RTUNICP_INVALID is stored here on failure.
1195	*
1196	* @remark This is an internal worker for RTUtf16BigGetCpEx().
1197	*/
1198	RTDECL(int) RTUtf16BigGetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp);
1199
1200	/**
1201	* Put the unicode code point at the given string position
1202	* and return the pointer to the char following it.
1203	*
1204	* This function will not consider anything at or following the
1205	* buffer area pointed to by pwsz. It is therefore not suitable for
1206	* inserting code points into a string, only appending/overwriting.
1207	*
1208	* @returns pointer to the char following the written code point.
1209	* @param pwsz The string.
1210	* @param CodePoint The code point to write.
1211	* This should not be RTUNICP_INVALID or any other
1212	* character out of the UTF-16 range.
1213	*
1214	* @remark This is an internal worker for RTUtf16PutCp().
1215	*/
1216	RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint);
1217
1218	/**
1219	* Get the unicode code point at the given string position.
1220	*
1221	* @returns unicode code point.
1222	* @returns RTUNICP_INVALID if the encoding is invalid.
1223	* @param pwsz The string.
1224	*
1225	* @remark We optimize this operation by using an inline function for
1226	* everything which isn't a surrogate pair or an endian indicator.
1227	*/
1228	DECLINLINE(RTUNICP) RTUtf16GetCp(PCRTUTF16 pwsz)
1229	{
1230	const RTUTF16 wc = *pwsz;
1231	if (wc < 0xd800 \|\| (wc > 0xdfff && wc < 0xfffe))
1232	return wc;
1233	return RTUtf16GetCpInternal(pwsz);
1234	}
1235
1236	/**
1237	* Get the unicode code point at the given string position.
1238	*
1239	* @returns iprt status code.
1240	* @param ppwsz Pointer to the string pointer. This will be updated to
1241	* point to the char following the current code point.
1242	* @param pCp Where to store the code point.
1243	* RTUNICP_INVALID is stored here on failure.
1244	*
1245	* @remark We optimize this operation by using an inline function for
1246	* everything which isn't a surrogate pair or and endian indicator.
1247	*/
1248	DECLINLINE(int) RTUtf16GetCpEx(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1249	{
1250	const RTUTF16 wc = **ppwsz;
1251	if (wc < 0xd800 \|\| (wc > 0xdfff && wc < 0xfffe))
1252	{
1253	(*ppwsz)++;
1254	*pCp = wc;
1255	return VINF_SUCCESS;
1256	}
1257	return RTUtf16GetCpExInternal(ppwsz, pCp);
1258	}
1259
1260	/**
1261	* Get the unicode code point at the given string position.
1262	*
1263	* @returns iprt status code.
1264	* @param ppwsz Pointer to the string pointer. This will be updated to
1265	* point to the char following the current code point.
1266	* @param pcwc Pointer to the max string length. This will be
1267	* decremented corrsponding to the advancement of @a ppwsz.
1268	* @param pCp Where to store the code point. RTUNICP_INVALID is stored
1269	* here on failure.
1270	*
1271	* @remark We optimize this operation by using an inline function for
1272	* everything which isn't a surrogate pair or and endian indicator.
1273	*/
1274	DECLINLINE(int) RTUtf16GetCpNEx(PCRTUTF16 ppwsz, size_t pcwc, PRTUNICP pCp)
1275	{
1276	const size_t cwc = *pcwc;
1277	if (cwc > 0)
1278	{
1279	const PCRTUTF16 pwsz = *ppwsz;
1280	const RTUTF16 wc = *pwsz;
1281	if (wc < 0xd800 \|\| (wc > 0xdfff && wc < 0xfffe))
1282	{
1283	*pCp = wc;
1284	*pcwc = cwc - 1;
1285	*ppwsz = pwsz + 1;
1286	return VINF_SUCCESS;
1287	}
1288	}
1289	return RTUtf16GetCpNExInternal(ppwsz, pcwc, pCp);
1290	}
1291
1292	/**
1293	* Get the unicode code point at the given string position, big endian version.
1294	*
1295	* @returns iprt status code.
1296	* @param ppwsz Pointer to the string pointer. This will be updated to
1297	* point to the char following the current code point.
1298	* @param pCp Where to store the code point.
1299	* RTUNICP_INVALID is stored here on failure.
1300	*
1301	* @remark We optimize this operation by using an inline function for
1302	* everything which isn't a surrogate pair or and endian indicator.
1303	*/
1304	DECLINLINE(int) RTUtf16BigGetCpEx(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1305	{
1306	#ifdef RT_BIG_ENDIAN
1307	return RTUtf16GetCpEx(ppwsz, pCp);
1308	#else
1309	# ifdef IPRT_INCLUDED_asm_h
1310	const RTUTF16 wc = RT_BE2H_U16(**ppwsz);
1311	if (wc < 0xd800 \|\| (wc > 0xdfff && wc < 0xfffe))
1312	{
1313	(*ppwsz)++;
1314	*pCp = wc;
1315	return VINF_SUCCESS;
1316	}
1317	# endif
1318	return RTUtf16BigGetCpExInternal(ppwsz, pCp);
1319	#endif
1320	}
1321
1322	/**
1323	* Put the unicode code point at the given string position
1324	* and return the pointer to the char following it.
1325	*
1326	* This function will not consider anything at or following the
1327	* buffer area pointed to by pwsz. It is therefore not suitable for
1328	* inserting code points into a string, only appending/overwriting.
1329	*
1330	* @returns pointer to the char following the written code point.
1331	* @param pwsz The string.
1332	* @param CodePoint The code point to write.
1333	* This should not be RTUNICP_INVALID or any other
1334	* character out of the UTF-16 range.
1335	*
1336	* @remark We optimize this operation by using an inline function for
1337	* everything which isn't a surrogate pair or and endian indicator.
1338	*/
1339	DECLINLINE(PRTUTF16) RTUtf16PutCp(PRTUTF16 pwsz, RTUNICP CodePoint)
1340	{
1341	if (CodePoint < 0xd800 \|\| (CodePoint > 0xd800 && CodePoint < 0xfffe))
1342	{
1343	*pwsz++ = (RTUTF16)CodePoint;
1344	return pwsz;
1345	}
1346	return RTUtf16PutCpInternal(pwsz, CodePoint);
1347	}
1348
1349	/**
1350	* Skips ahead, past the current code point.
1351	*
1352	* @returns Pointer to the char after the current code point.
1353	* @param pwsz Pointer to the current code point.
1354	* @remark This will not move the next valid code point, only past the current one.
1355	*/
1356	DECLINLINE(PRTUTF16) RTUtf16NextCp(PCRTUTF16 pwsz)
1357	{
1358	RTUNICP Cp;
1359	RTUtf16GetCpEx(&pwsz, &Cp);
1360	return (PRTUTF16)pwsz;
1361	}
1362
1363	/**
1364	* Skips backwards, to the previous code point.
1365	*
1366	* @returns Pointer to the previous code point.
1367	* @param pwszStart Pointer to the start of the string.
1368	* @param pwsz Pointer to the current code point.
1369	*/
1370	RTDECL(PRTUTF16) RTUtf16PrevCp(PCRTUTF16 pwszStart, PCRTUTF16 pwsz);
1371
1372
1373	/**
1374	* Checks if the UTF-16 char is the high surrogate char (i.e.
1375	* the 1st char in the pair).
1376	*
1377	* @returns true if it is.
1378	* @returns false if it isn't.
1379	* @param wc The character to investigate.
1380	*/
1381	DECLINLINE(bool) RTUtf16IsHighSurrogate(RTUTF16 wc)
1382	{
1383	return wc >= 0xd800 && wc <= 0xdbff;
1384	}
1385
1386	/**
1387	* Checks if the UTF-16 char is the low surrogate char (i.e.
1388	* the 2nd char in the pair).
1389	*
1390	* @returns true if it is.
1391	* @returns false if it isn't.
1392	* @param wc The character to investigate.
1393	*/
1394	DECLINLINE(bool) RTUtf16IsLowSurrogate(RTUTF16 wc)
1395	{
1396	return wc >= 0xdc00 && wc <= 0xdfff;
1397	}
1398
1399
1400	/**
1401	* Checks if the two UTF-16 chars form a valid surrogate pair.
1402	*
1403	* @returns true if they do.
1404	* @returns false if they doesn't.
1405	* @param wcHigh The high (1st) character.
1406	* @param wcLow The low (2nd) character.
1407	*/
1408	DECLINLINE(bool) RTUtf16IsSurrogatePair(RTUTF16 wcHigh, RTUTF16 wcLow)
1409	{
1410	return RTUtf16IsHighSurrogate(wcHigh)
1411	&& RTUtf16IsLowSurrogate(wcLow);
1412	}
1413
1414	/**
1415	* Formats a buffer stream as hex bytes.
1416	*
1417	* The default is no separating spaces or line breaks or anything.
1418	*
1419	* @returns IPRT status code.
1420	* @retval VERR_INVALID_POINTER if any of the pointers are wrong.
1421	* @retval VERR_BUFFER_OVERFLOW if the buffer is insufficient to hold the bytes.
1422	*
1423	* @param pwszBuf Output string buffer.
1424	* @param cwcBuf The size of the output buffer in RTUTF16 units.
1425	* @param pv Pointer to the bytes to stringify.
1426	* @param cb The number of bytes to stringify.
1427	* @param fFlags Combination of RTSTRPRINTHEXBYTES_F_XXX values.
1428	* @sa RTStrPrintHexBytes.
1429	*/
1430	RTDECL(int) RTUtf16PrintHexBytes(PRTUTF16 pwszBuf, size_t cwcBuf, void const *pv, size_t cb, uint32_t fFlags);
1431
1432	/**
1433	* String printf producing UTF-16 output.
1434	*
1435	* @returns On success, positive count of formatted RTUTF16 units excluding the
1436	* terminator. On buffer overflow, negative number giving the required
1437	* buffer size (including terminator) in RTUTF16 units.
1438	*
1439	* @param pwszBuffer Output buffer.
1440	* @param cwcBuffer Size of the output buffer in RTUTF16 units.
1441	* @param pszFormat Pointer to the format string, @see pg_rt_str_format.
1442	* @param args The format argument.
1443	*
1444	* @note This is similar to RTStrPrintf2V (not RTStrPrintfV)!
1445	*/
1446	RTDECL(ssize_t) RTUtf16PrintfV(PRTUTF16 pwszBuffer, size_t cwcBuffer, const char *pszFormat, va_list args) RT_IPRT_FORMAT_ATTR(3, 0);
1447
1448	/**
1449	* String printf producing UTF-16 output.
1450	*
1451	* @returns On success, positive count of formatted RTUTF16 units excluding the
1452	* terminator. On buffer overflow, negative number giving the required
1453	* buffer size (including terminator) in RTUTF16 units.
1454	*
1455	* @param pwszBuffer Output buffer.
1456	* @param cwcBuffer Size of the output buffer in RTUTF16 units.
1457	* @param pszFormat Pointer to the format string, @see pg_rt_str_format.
1458	* @param ... The format argument.
1459	*
1460	* @note This is similar to RTStrPrintf2 (not RTStrPrintf)!
1461	*/
1462	RTDECL(ssize_t) RTUtf16Printf(PRTUTF16 pwszBuffer, size_t cwcBuffer, const char *pszFormat, ...) RT_IPRT_FORMAT_ATTR(3, 4);
1463
1464	/**
1465	* String printf producing UTF-16 output with custom formatting.
1466	*
1467	* @returns On success, positive count of formatted RTUTF16 units excluding the
1468	* terminator. On buffer overflow, negative number giving the required
1469	* buffer size (including terminator) in RTUTF16 units.
1470	*
1471	* @param pfnFormat Pointer to handler function for the custom formats.
1472	* @param pvArg Argument to the pfnFormat function.
1473	* @param pwszBuffer Output buffer.
1474	* @param cwcBuffer Size of the output buffer in RTUTF16 units.
1475	* @param pszFormat Pointer to the format string, @see pg_rt_str_format.
1476	* @param args The format argument.
1477	*
1478	* @note This is similar to RTStrPrintf2ExV (not RTStrPrintfExV)!
1479	*/
1480	RTDECL(ssize_t) RTUtf16PrintfExV(PFNSTRFORMAT pfnFormat, void *pvArg, PRTUTF16 pwszBuffer, size_t cwcBuffer,
1481	const char *pszFormat, va_list args) RT_IPRT_FORMAT_ATTR(5, 0);
1482
1483	/**
1484	* String printf producing UTF-16 output with custom formatting.
1485	*
1486	* @returns On success, positive count of formatted RTUTF16 units excluding the
1487	* terminator. On buffer overflow, negative number giving the required
1488	* buffer size (including terminator) in RTUTF16 units.
1489	*
1490	* @param pfnFormat Pointer to handler function for the custom formats.
1491	* @param pvArg Argument to the pfnFormat function.
1492	* @param pwszBuffer Output buffer.
1493	* @param cwcBuffer Size of the output buffer in RTUTF16 units.
1494	* @param pszFormat Pointer to the format string, @see pg_rt_str_format.
1495	* @param ... The format argument.
1496	*
1497	* @note This is similar to RTStrPrintf2Ex (not RTStrPrintfEx)!
1498	*/
1499	RTDECL(ssize_t) RTUtf16PrintfEx(PFNSTRFORMAT pfnFormat, void *pvArg, PRTUTF16 pwszBuffer, size_t cwcBuffer,
1500	const char *pszFormat, ...) RT_IPRT_FORMAT_ATTR(5, 6);
1501
1502	/** @} */
1503	RT_C_DECLS_END
1504
1505	#endif /* !IPRT_INCLUDED_utf16_h */
1506

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/include/iprt/utf16.h@ 95897

Download in other formats: