VirtualBox

source: vbox/trunk/include/iprt/utf16.h@ 95897

Last change on this file since 95897 was 93115, checked in by vboxsync, 3 years ago

scm --update-copyright-year

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 64.2 KB
Line 
1/** @file
2 * IPRT - String Manipulation, UTF-16 encoding.
3 */
4
5/*
6 * Copyright (C) 2006-2022 Oracle Corporation
7 *
8 * This file is part of VirtualBox Open Source Edition (OSE), as
9 * available from http://www.virtualbox.org. This file is free software;
10 * you can redistribute it and/or modify it under the terms of the GNU
11 * General Public License (GPL) as published by the Free Software
12 * Foundation, in version 2 as it comes in the "COPYING" file of the
13 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15 *
16 * The contents of this file may alternatively be used under the terms
17 * of the Common Development and Distribution License Version 1.0
18 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
19 * VirtualBox OSE distribution, in which case the provisions of the
20 * CDDL are applicable instead of those of the GPL.
21 *
22 * You may elect to license modified versions of this file under the
23 * terms and conditions of either the GPL or the CDDL or both.
24 */
25
26#ifndef IPRT_INCLUDED_utf16_h
27#define IPRT_INCLUDED_utf16_h
28#ifndef RT_WITHOUT_PRAGMA_ONCE
29# pragma once
30#endif
31
32#include <iprt/string.h>
33
34RT_C_DECLS_BEGIN
35
36
37/** @defgroup rt_str_utf16 UTF-16 String Manipulation
38 * @ingroup grp_rt_str
39 * @{
40 */
41
42/**
43 * Allocates memory for UTF-16 string storage (default tag).
44 *
45 * You should normally not use this function, except if there is some very
46 * custom string handling you need doing that isn't covered by any of the other
47 * APIs.
48 *
49 * @returns Pointer to the allocated UTF-16 string. The first wide char is
50 * always set to the string terminator char, the contents of the
51 * remainder of the memory is undefined. The string must be freed by
52 * calling RTUtf16Free.
53 *
54 * NULL is returned if the allocation failed. Please translate this to
55 * VERR_NO_UTF16_MEMORY and not VERR_NO_MEMORY. Also consider
56 * RTUtf16AllocEx if an IPRT status code is required.
57 *
58 * @param cb How many bytes to allocate, will be rounded up
59 * to a multiple of two. If this is zero, we will
60 * allocate a terminator wide char anyway.
61 */
62#define RTUtf16Alloc(cb) RTUtf16AllocTag((cb), RTSTR_TAG)
63
64/**
65 * Allocates memory for UTF-16 string storage (custom tag).
66 *
67 * You should normally not use this function, except if there is some very
68 * custom string handling you need doing that isn't covered by any of the other
69 * APIs.
70 *
71 * @returns Pointer to the allocated UTF-16 string. The first wide char is
72 * always set to the string terminator char, the contents of the
73 * remainder of the memory is undefined. The string must be freed by
74 * calling RTUtf16Free.
75 *
76 * NULL is returned if the allocation failed. Please translate this to
77 * VERR_NO_UTF16_MEMORY and not VERR_NO_MEMORY. Also consider
78 * RTUtf16AllocExTag if an IPRT status code is required.
79 *
80 * @param cb How many bytes to allocate, will be rounded up
81 * to a multiple of two. If this is zero, we will
82 * allocate a terminator wide char anyway.
83 * @param pszTag Allocation tag used for statistics and such.
84 */
85RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag);
86
87/**
88 * Reallocates the specified UTF-16 string (default tag).
89 *
90 * You should normally not use this function, except if there is some very
91 * custom string handling you need doing that isn't covered by any of the other
92 * APIs.
93 *
94 * @returns VINF_SUCCESS.
95 * @retval VERR_NO_UTF16_MEMORY if we failed to reallocate the string, @a
96 * *ppwsz remains unchanged.
97 *
98 * @param ppwsz Pointer to the string variable containing the
99 * input and output string.
100 *
101 * When not freeing the string, the result will
102 * always have the last RTUTF16 set to the
103 * terminator character so that when used for
104 * string truncation the result will be a valid
105 * C-style string (your job to keep it a valid
106 * UTF-16 string).
107 *
108 * When the input string is NULL and we're supposed
109 * to reallocate, the returned string will also
110 * have the first RTUTF16 set to the terminator
111 * char so it will be a valid C-style string.
112 *
113 * @param cbNew When @a cbNew is zero, we'll behave like
114 * RTUtf16Free and @a *ppwsz will be set to NULL.
115 *
116 * When not zero, this will be rounded up to a
117 * multiple of two, and used as the new size of the
118 * memory backing the string, i.e. it includes the
119 * terminator (RTUTF16) char.
120 */
121#define RTUtf16Realloc(ppwsz, cbNew) RTUtf16ReallocTag((ppwsz), (cbNew), RTSTR_TAG)
122
123/**
124 * Reallocates the specified UTF-16 string (custom tag).
125 *
126 * You should normally not use this function, except if there is some very
127 * custom string handling you need doing that isn't covered by any of the other
128 * APIs.
129 *
130 * @returns VINF_SUCCESS.
131 * @retval VERR_NO_UTF16_MEMORY if we failed to reallocate the string, @a
132 * *ppwsz remains unchanged.
133 *
134 * @param ppwsz Pointer to the string variable containing the
135 * input and output string.
136 *
137 * When not freeing the string, the result will
138 * always have the last RTUTF16 set to the
139 * terminator character so that when used for
140 * string truncation the result will be a valid
141 * C-style string (your job to keep it a valid
142 * UTF-16 string).
143 *
144 * When the input string is NULL and we're supposed
145 * to reallocate, the returned string will also
146 * have the first RTUTF16 set to the terminator
147 * char so it will be a valid C-style string.
148 *
149 * @param cbNew When @a cbNew is zero, we'll behave like
150 * RTUtf16Free and @a *ppwsz will be set to NULL.
151 *
152 * When not zero, this will be rounded up to a
153 * multiple of two, and used as the new size of the
154 * memory backing the string, i.e. it includes the
155 * terminator (RTUTF16) char.
156 * @param pszTag Allocation tag used for statistics and such.
157 */
158RTDECL(int) RTUtf16ReallocTag(PRTUTF16 *ppwsz, size_t cbNew, const char *pszTag);
159
160/**
161 * Free a UTF-16 string allocated by RTStrToUtf16(), RTStrToUtf16Ex(),
162 * RTLatin1ToUtf16(), RTLatin1ToUtf16Ex(), RTUtf16Dup() or RTUtf16DupEx().
163 *
164 * This function does not return a value.
165 * @param pwszString The UTF-16 string to free. NULL is accepted.
166 */
167RTDECL(void) RTUtf16Free(PRTUTF16 pwszString);
168
169/**
170 * Allocates a new copy of the specified UTF-16 string (default tag).
171 *
172 * @returns Pointer to the allocated string copy. Use RTUtf16Free() to free it.
173 * @returns NULL when out of memory.
174 * @param pwszString UTF-16 string to duplicate.
175 * @remark This function will not make any attempt to validate the encoding.
176 */
177#define RTUtf16Dup(pwszString) RTUtf16DupTag((pwszString), RTSTR_TAG)
178
179/**
180 * Allocates a new copy of the specified UTF-16 string (custom tag).
181 *
182 * @returns Pointer to the allocated string copy. Use RTUtf16Free() to free it.
183 * @returns NULL when out of memory.
184 * @param pwszString UTF-16 string to duplicate.
185 * @param pszTag Allocation tag used for statistics and such.
186 * @remark This function will not make any attempt to validate the encoding.
187 */
188RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag);
189
190/**
191 * Allocates a new copy of the specified UTF-16 string (default tag).
192 *
193 * @returns iprt status code.
194 * @param ppwszString Receives pointer of the allocated UTF-16 string.
195 * The returned pointer must be freed using RTUtf16Free().
196 * @param pwszString UTF-16 string to duplicate.
197 * @param cwcExtra Number of extra RTUTF16 items to allocate.
198 * @remark This function will not make any attempt to validate the encoding.
199 */
200#define RTUtf16DupEx(ppwszString, pwszString, cwcExtra) \
201 RTUtf16DupExTag((ppwszString), (pwszString), (cwcExtra), RTSTR_TAG)
202
203/**
204 * Allocates a new copy of the specified UTF-16 string (custom tag).
205 *
206 * @returns iprt status code.
207 * @param ppwszString Receives pointer of the allocated UTF-16 string.
208 * The returned pointer must be freed using RTUtf16Free().
209 * @param pwszString UTF-16 string to duplicate.
210 * @param cwcExtra Number of extra RTUTF16 items to allocate.
211 * @param pszTag Allocation tag used for statistics and such.
212 * @remark This function will not make any attempt to validate the encoding.
213 */
214RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag);
215
216/**
217 * Returns the length of a UTF-16 string in UTF-16 characters
218 * without trailing '\\0'.
219 *
220 * Surrogate pairs counts as two UTF-16 characters here. Use RTUtf16CpCnt()
221 * to get the exact number of code points in the string.
222 *
223 * @returns The number of RTUTF16 items in the string.
224 * @param pwszString Pointer to the UTF-16 string.
225 * @remark This function will not make any attempt to validate the encoding.
226 */
227RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString);
228
229/**
230 * Find the length of a zero-terminated UTF-16 string, given a max string length.
231 *
232 * @returns The string length or cwcMax. The returned length does not include
233 * the zero terminator if it was found.
234 *
235 * @param pwszString The string.
236 * @param cwcMax The max string length in RTUTF16s.
237 * @sa RTUtf16NLenEx, RTStrNLen.
238 */
239RTDECL(size_t) RTUtf16NLen(PCRTUTF16 pwszString, size_t cwcMax);
240
241/**
242 * Find the length of a zero-terminated UTF-16 string, given
243 * a max string length.
244 *
245 * @returns IPRT status code.
246 * @retval VINF_SUCCESS if the string has a length less than cwcMax.
247 * @retval VERR_BUFFER_OVERFLOW if the end of the string wasn't found
248 * before cwcMax was reached.
249 *
250 * @param pwszString The string.
251 * @param cwcMax The max string length in RTUTF16s.
252 * @param pcwc Where to store the string length excluding the
253 * terminator. This is set to cwcMax if the terminator
254 * isn't found.
255 * @sa RTUtf16NLen, RTStrNLenEx.
256 */
257RTDECL(int) RTUtf16NLenEx(PCRTUTF16 pwszString, size_t cwcMax, size_t *pcwc);
258
259/**
260 * Find the zero terminator in a string with a limited length.
261 *
262 * @returns Pointer to the zero terminator.
263 * @returns NULL if the zero terminator was not found.
264 *
265 * @param pwszString The string.
266 * @param cwcMax The max string length. RTSTR_MAX is fine.
267 */
268RTDECL(PCRTUTF16) RTUtf16End(PCRTUTF16 pwszString, size_t cwcMax);
269
270/**
271 * Finds a given UTF-16 character in a UTF-16 string.
272 *
273 * @returns Pointer to the first occurrence of @a wc.
274 * @returns NULL if @a wc was not found.
275 *
276 * @param pwszString The string to search.
277 * @param wc The UTF-16 character to search for.
278 */
279RTDECL(PRTUTF16) RTUtf16Chr(PCRTUTF16 pwszString, RTUTF16 wc);
280
281/**
282 * Strips blankspaces from both ends of the string.
283 *
284 * @returns Pointer to first non-blank char in the string.
285 * @param pwsz The string to strip.
286 */
287RTDECL(PRTUTF16) RTUtf16Strip(PRTUTF16 pwsz);
288
289/**
290 * Strips blankspaces from the start of the string.
291 *
292 * @returns Pointer to first non-blank char in the string.
293 * @param pwsz The string to strip.
294 */
295RTDECL(PRTUTF16) RTUtf16StripL(PCRTUTF16 pwsz);
296
297/**
298 * Strips blankspaces from the end of the string.
299 *
300 * @returns pwsz.
301 * @param pwsz The string to strip.
302 */
303RTDECL(PRTUTF16) RTUtf16StripR(PRTUTF16 pwsz);
304
305/**
306 * String copy with overflow handling.
307 *
308 * @retval VINF_SUCCESS on success.
309 * @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
310 * buffer will contain as much of the string as it can hold, fully
311 * terminated.
312 *
313 * @param pwszDst The destination buffer.
314 * @param cwcDst The size of the destination buffer in RTUTF16s.
315 * @param pwszSrc The source string. NULL is not OK.
316 */
317RTDECL(int) RTUtf16Copy(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc);
318
319/**
320 * String copy with overflow handling, ASCII source.
321 *
322 * @retval VINF_SUCCESS on success.
323 * @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
324 * buffer will contain as much of the string as it can hold, fully
325 * terminated.
326 *
327 * @param pwszDst The destination buffer.
328 * @param cwcDst The size of the destination buffer in RTUTF16s.
329 * @param pszSrc The source string, pure ASCII. NULL is not OK.
330 */
331RTDECL(int) RTUtf16CopyAscii(PRTUTF16 pwszDst, size_t cwcDst, const char *pszSrc);
332
333/**
334 * String copy with overflow handling.
335 *
336 * @retval VINF_SUCCESS on success.
337 * @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
338 * buffer will contain as much of the string as it can hold, fully
339 * terminated.
340 *
341 * @param pwszDst The destination buffer.
342 * @param cwcDst The size of the destination buffer in RTUTF16s.
343 * @param pwszSrc The source string. NULL is not OK.
344 * @param cwcSrcMax The maximum number of chars (not code points) to
345 * copy from the source string, not counting the
346 * terminator as usual.
347 */
348RTDECL(int) RTUtf16CopyEx(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc, size_t cwcSrcMax);
349
350/**
351 * String concatenation with overflow handling.
352 *
353 * @retval VINF_SUCCESS on success.
354 * @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
355 * buffer will contain as much of the string as it can hold, fully
356 * terminated.
357 *
358 * @param pwszDst The destination buffer.
359 * @param cwcDst The size of the destination buffer in RTUTF16s.
360 * @param pwszSrc The source string. NULL is not OK.
361 */
362RTDECL(int) RTUtf16Cat(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc);
363
364/**
365 * String concatenation with overflow handling, ASCII source.
366 *
367 * @retval VINF_SUCCESS on success.
368 * @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
369 * buffer will contain as much of the string as it can hold, fully
370 * terminated.
371 *
372 * @param pwszDst The destination buffer.
373 * @param cwcDst The size of the destination buffer in RTUTF16s.
374 * @param pszSrc The source string, pure ASCII. NULL is not OK.
375 */
376RTDECL(int) RTUtf16CatAscii(PRTUTF16 pwszDst, size_t cwcDst, const char *pszSrc);
377
378/**
379 * String concatenation with overflow handling.
380 *
381 * @retval VINF_SUCCESS on success.
382 * @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
383 * buffer will contain as much of the string as it can hold, fully
384 * terminated.
385 *
386 * @param pwszDst The destination buffer.
387 * @param cwcDst The size of the destination buffer in RTUTF16s.
388 * @param pwszSrc The source string. NULL is not OK.
389 * @param cwcSrcMax The maximum number of UTF-16 chars (not code
390 * points) to copy from the source string, not
391 * counting the terminator as usual.
392 */
393RTDECL(int) RTUtf16CatEx(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc, size_t cwcSrcMax);
394
395/**
396 * Performs a case sensitive string compare between two UTF-16 strings.
397 *
398 * @returns < 0 if the first string less than the second string.
399 * @returns 0 if the first string identical to the second string.
400 * @returns > 0 if the first string greater than the second string.
401 * @param pwsz1 First UTF-16 string. Null is allowed.
402 * @param pwsz2 Second UTF-16 string. Null is allowed.
403 * @remark This function will not make any attempt to validate the encoding.
404 */
405RTDECL(int) RTUtf16Cmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
406
407/**
408 * Performs a case sensitive string compare between an UTF-16 string and a pure
409 * ASCII string.
410 *
411 * @returns < 0 if the first string less than the second string.
412 * @returns 0 if the first string identical to the second string.
413 * @returns > 0 if the first string greater than the second string.
414 * @param pwsz1 First UTF-16 string. Null is allowed.
415 * @param psz2 Second string, pure ASCII. Null is allowed.
416 * @remark This function will not make any attempt to validate the encoding.
417 */
418RTDECL(int) RTUtf16CmpAscii(PCRTUTF16 pwsz1, const char *psz2);
419
420/**
421 * Performs a case sensitive string compare between an UTF-16 string and a UTF-8
422 * string.
423 *
424 * @returns < 0 if the first string less than the second string.
425 * @returns 0 if the first string identical to the second string.
426 * @returns > 0 if the first string greater than the second string.
427 * @param pwsz1 First UTF-16 string. Null is allowed.
428 * @param psz2 Second string, UTF-8. Null is allowed.
429 * @remarks NULL and empty strings are treated equally.
430 */
431RTDECL(int) RTUtf16CmpUtf8(PCRTUTF16 pwsz1, const char *psz2);
432
433
434/**
435 * Performs a case sensitive and length limited string compare between two UTF-16 strings.
436 *
437 * @returns < 0 if the first string less than the second string.
438 * @returns 0 if the first string identical to the second string.
439 * @returns > 0 if the first string greater than the second string.
440 * @param pwsz1 First UTF-16 string. Null is allowed.
441 * @param pwsz2 Second UTF-16 string. Null is allowed.
442 * @param cwcMax Maximum number of characters (RTUTF16) to compare.
443 * @remark This function will not make any attempt to validate the encoding.
444 */
445RTDECL(int) RTUtf16NCmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2, size_t cwcMax);
446
447/**
448 * Performs a case sensitive and length limited string compare between an UTF-16
449 * string and a pure ASCII string.
450 *
451 * @returns < 0 if the first string less than the second string.
452 * @returns 0 if the first string identical to the second string.
453 * @returns > 0 if the first string greater than the second string.
454 * @param pwsz1 First UTF-16 string. Null is allowed.
455 * @param psz2 Second string, pure ASCII. Null is allowed.
456 * @param cwcMax Maximum number of characters (RTUTF16) to compare.
457 * @remark This function will not make any attempt to validate the encoding.
458 */
459RTDECL(int) RTUtf16NCmpAscii(PCRTUTF16 pwsz1, const char *psz2, size_t cwcMax);
460
461/**
462 * Performs a case sensitive and length limited string compare between an UTF-16
463 * string and a UTF-8 string.
464 *
465 * @returns < 0 if the first string less than the second string.
466 * @returns 0 if the first string identical to the second string.
467 * @returns > 0 if the first string greater than the second string.
468 * @param pwsz1 First UTF-16 string. Null is allowed.
469 * @param psz2 Second string, UTF-8. Null is allowed.
470 * @param cwcMax1 Maximum number of UTF-16 characters (RTUTF16) from the
471 * first string to compare.
472 * @param cchMax2 Maximum number of UTF-8 characters (char) from the
473 * second string to compare.
474 * @remarks NULL and empty strings are treated equally.
475 */
476RTDECL(int) RTUtf16NCmpUtf8(PCRTUTF16 pwsz1, const char *psz2, size_t cwcMax1, size_t cchMax2);
477
478
479/**
480 * Performs a case insensitive string compare between two UTF-16 strings.
481 *
482 * This is a simplified compare, as only the simplified lower/upper case folding
483 * specified by the unicode specs are used. It does not consider character pairs
484 * as they are used in some languages, just simple upper & lower case compares.
485 *
486 * @returns < 0 if the first string less than the second string.
487 * @returns 0 if the first string identical to the second string.
488 * @returns > 0 if the first string greater than the second string.
489 * @param pwsz1 First UTF-16 string. Null is allowed.
490 * @param pwsz2 Second UTF-16 string. Null is allowed.
491 */
492RTDECL(int) RTUtf16ICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
493
494/**
495 * Performs a case insensitive string compare between two big endian UTF-16
496 * strings.
497 *
498 * This is a simplified compare, as only the simplified lower/upper case folding
499 * specified by the unicode specs are used. It does not consider character pairs
500 * as they are used in some languages, just simple upper & lower case compares.
501 *
502 * @returns < 0 if the first string less than the second string.
503 * @returns 0 if the first string identical to the second string.
504 * @returns > 0 if the first string greater than the second string.
505 * @param pwsz1 First big endian UTF-16 string. Null is allowed.
506 * @param pwsz2 Second big endian UTF-16 string. Null is allowed.
507 */
508RTDECL(int) RTUtf16BigICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
509
510/**
511 * Performs a case insensitive string compare between an UTF-16 string and a
512 * UTF-8 string.
513 *
514 * @returns < 0 if the first string less than the second string.
515 * @returns 0 if the first string identical to the second string.
516 * @returns > 0 if the first string greater than the second string.
517 * @param pwsz1 First UTF-16 string. Null is allowed.
518 * @param psz2 Second string, UTF-8. Null is allowed.
519 * @remarks NULL and empty strings are treated equally.
520 */
521RTDECL(int) RTUtf16ICmpUtf8(PCRTUTF16 pwsz1, const char *psz2);
522
523/**
524 * Performs a case insensitive string compare between an UTF-16 string and a
525 * pure ASCII string.
526 *
527 * Since this compare only cares about the first 128 codepoints in
528 * unicode, no tables are needed and there aren't any real complications.
529 *
530 * @returns < 0 if the first string less than the second string.
531 * @returns 0 if the first string identical to the second string.
532 * @returns > 0 if the first string greater than the second string.
533 * @param pwsz1 First UTF-16 string. Null is allowed.
534 * @param psz2 Second string, pure ASCII. Null is allowed.
535 */
536RTDECL(int) RTUtf16ICmpAscii(PCRTUTF16 pwsz1, const char *psz2);
537
538/**
539 * Performs a case insensitive string compare between two UTF-16 strings
540 * using the current locale of the process (if applicable).
541 *
542 * This differs from RTUtf16ICmp() in that it will try, if a locale with the
543 * required data is available, to do a correct case-insensitive compare. It
544 * follows that it is more complex and thereby likely to be more expensive.
545 *
546 * @returns < 0 if the first string less than the second string.
547 * @returns 0 if the first string identical to the second string.
548 * @returns > 0 if the first string greater than the second string.
549 * @param pwsz1 First UTF-16 string. Null is allowed.
550 * @param pwsz2 Second UTF-16 string. Null is allowed.
551 */
552RTDECL(int) RTUtf16LocaleICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
553
554/**
555 * Performs a case insensitive string compare between two UTF-16 strings,
556 * stopping after N characters.
557 *
558 * This is a simplified compare, as only the simplified lower/upper case folding
559 * specified by the unicode specs are used. It does not consider character pairs
560 * as they are used in some languages, just simple upper & lower case compares.
561 *
562 * @returns < 0 if the first string less than the second string.
563 * @returns 0 if the first string identical to the second string.
564 * @returns > 0 if the first string greater than the second string.
565 * @param pwsz1 First UTF-16 string. Null is allowed.
566 * @param pwsz2 Second UTF-16 string. Null is allowed.
567 * @param cwcMax Maximum number of characters to compare.
568 */
569RTDECL(int) RTUtf16NICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2, size_t cwcMax);
570
571/**
572 * Performs a case insensitive string compare between two big endian UTF-16
573 * strings, stopping after N characters.
574 *
575 * This is a simplified compare, as only the simplified lower/upper case folding
576 * specified by the unicode specs are used. It does not consider character pairs
577 * as they are used in some languages, just simple upper & lower case compares.
578 *
579 * @returns < 0 if the first string less than the second string.
580 * @returns 0 if the first string identical to the second string.
581 * @returns > 0 if the first string greater than the second string.
582 * @param pwsz1 First big endian UTF-16 string. Null is allowed.
583 * @param pwsz2 Second big endian UTF-16 string. Null is allowed.
584 * @param cwcMax Maximum number of characters to compare.
585 */
586RTDECL(int) RTUtf16BigNICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2, size_t cwcMax);
587
588/**
589 * Performs a case insensitive string compare between a UTF-16 string and a pure
590 * ASCII string, stopping after N characters.
591 *
592 * Since this compare only cares about the first 128 codepoints in
593 * unicode, no tables are needed and there aren't any real complications.
594 *
595 * @returns < 0 if the first string less than the second string.
596 * @returns 0 if the first string identical to the second string.
597 * @returns > 0 if the first string greater than the second string.
598 * @param pwsz1 The UTF-16 first string. Null is allowed.
599 * @param psz2 The pure ASCII second string. Null is allowed.
600 * @param cwcMax Maximum number of UTF-16 characters to compare.
601 */
602RTDECL(int) RTUtf16NICmpAscii(PCRTUTF16 pwsz1, const char *psz2, size_t cwcMax);
603
604
605/**
606 * Folds a UTF-16 string to lowercase.
607 *
608 * This is a very simple folding; it uses the simple lowercase
609 * code point, it is not related to any locale just the most common
610 * lowercase codepoint setup by the unicode specs, and it will not
611 * create new surrogate pairs or remove existing ones.
612 *
613 * @returns Pointer to the passed in string.
614 * @param pwsz The string to fold.
615 */
616RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz);
617
618/**
619 * Folds a UTF-16 string to uppercase.
620 *
621 * This is a very simple folding; it uses the simple uppercase
622 * code point, it is not related to any locale just the most common
623 * uppercase codepoint setup by the unicode specs, and it will not
624 * create new surrogate pairs or remove existing ones.
625 *
626 * @returns Pointer to the passed in string.
627 * @param pwsz The string to fold.
628 */
629RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz);
630
631/**
632 * Validates the UTF-16 encoding of the string.
633 *
634 * @returns iprt status code.
635 * @param pwsz The string.
636 */
637RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz);
638
639/**
640 * Validates the UTF-16 encoding of the string.
641 *
642 * @returns iprt status code.
643 * @param pwsz The string.
644 * @param cwc The max string length (/ size) in UTF-16 units. Use
645 * RTSTR_MAX to process the entire string.
646 * @param fFlags Combination of RTSTR_VALIDATE_ENCODING_XXX flags.
647 */
648RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags);
649
650/**
651 * Checks if the UTF-16 encoding is valid.
652 *
653 * @returns true / false.
654 * @param pwsz The string.
655 */
656RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz);
657
658/**
659 * Sanitise a (valid) UTF-16 string by replacing all characters outside a white
660 * list in-place by an ASCII replacement character.
661 *
662 * Surrogate pairs will be replaced by two chars.
663 *
664 * @returns The number of code points replaced. In the case of an incorrectly
665 * encoded string -1 will be returned, and the string is not completely
666 * processed. In the case of puszValidPairs having an odd number of
667 * code points, -1 will also be returned, but without any modification to
668 * the string.
669 * @param pwsz The string to sanitise.
670 * @param puszValidPairs A zero-terminated array of pairs of Unicode points.
671 * Each pair is the start and end point of a range,
672 * and the union of these ranges forms the white list.
673 * @param chReplacement The ASCII replacement character.
674 * @sa RTStrPurgeComplementSet
675 */
676RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidPairs, char chReplacement);
677
678
679/**
680 * Translate a UTF-16 string into a UTF-8 allocating the result buffer (default
681 * tag).
682 *
683 * @returns iprt status code.
684 * @param pwszString UTF-16 string to convert.
685 * @param ppszString Receives pointer of allocated UTF-8 string on
686 * success, and is always set to NULL on failure.
687 * The returned pointer must be freed using RTStrFree().
688 */
689#define RTUtf16ToUtf8(pwszString, ppszString) RTUtf16ToUtf8Tag((pwszString), (ppszString), RTSTR_TAG)
690
691/**
692 * Translate a UTF-16 string into UTF-8, allocating the result buffer (custom tag).
693 *
694 * @returns iprt status code.
695 * @param pwszString UTF-16 string to convert.
696 * @param ppszString Receives pointer of allocated UTF-8 string on
697 * success, and is always set to NULL on failure.
698 * The returned pointer must be freed using RTStrFree().
699 * @param pszTag Allocation tag used for statistics and such.
700 */
701RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
702
703/**
704 * Translate a UTF-16BE string into a UTF-8 allocating the result buffer
705 * (default tag).
706 *
707 * This differs from RTUtf16ToUtf8 in that the input is always a
708 * big-endian string.
709 *
710 * @returns iprt status code.
711 * @param pwszString UTF-16BE string to convert.
712 * @param ppszString Receives pointer of allocated UTF-8 string on
713 * success, and is always set to NULL on failure.
714 * The returned pointer must be freed using RTStrFree().
715 */
716#define RTUtf16BigToUtf8(pwszString, ppszString) RTUtf16BigToUtf8Tag((pwszString), (ppszString), RTSTR_TAG)
717
718/**
719 * Translate a UTF-16BE string into UTF-8, allocating the result buffer (custom tag).
720 *
721 * This differs from RTUtf16ToUtf8Tag in that the input is always a
722 * big-endian string.
723 *
724 * @returns iprt status code.
725 * @param pwszString UTF-16BE string to convert.
726 * @param ppszString Receives pointer of allocated UTF-8 string on
727 * success, and is always set to NULL on failure.
728 * The returned pointer must be freed using RTStrFree().
729 * @param pszTag Allocation tag used for statistics and such.
730 */
731RTDECL(int) RTUtf16BigToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
732
733/**
734 * Translate a UTF-16LE string into a UTF-8 allocating the result buffer
735 * (default tag).
736 *
737 * This differs from RTUtf16ToUtf8 in that the input is always a
738 * little-endian string.
739 *
740 * @returns iprt status code.
741 * @param pwszString UTF-16LE string to convert.
742 * @param ppszString Receives pointer of allocated UTF-8 string on
743 * success, and is always set to NULL on failure.
744 * The returned pointer must be freed using RTStrFree().
745 */
746#define RTUtf16LittleToUtf8(pwszString, ppszString) RTUtf16LittleToUtf8Tag((pwszString), (ppszString), RTSTR_TAG)
747
748/**
749 * Translate a UTF-16LE string into UTF-8, allocating the result buffer (custom tag).
750 *
751 * This differs from RTUtf16ToUtf8Tag in that the input is always a
752 * little-endian string.
753 *
754 * @returns iprt status code.
755 * @param pwszString UTF-16LE string to convert.
756 * @param ppszString Receives pointer of allocated UTF-8 string on
757 * success, and is always set to NULL on failure.
758 * The returned pointer must be freed using RTStrFree().
759 * @param pszTag Allocation tag used for statistics and such.
760 */
761RTDECL(int) RTUtf16LittleToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
762
763
764/**
765 * Translates UTF-16 to UTF-8 using buffer provided by the caller or a fittingly
766 * sized buffer allocated by the function (default tag).
767 *
768 * @returns iprt status code.
769 * @param pwszString The UTF-16 string to convert.
770 * @param cwcString The number of RTUTF16 items to translate from pwszString.
771 * The translation will stop when reaching cwcString or the terminator ('\\0').
772 * Use RTSTR_MAX to translate the entire string.
773 * @param ppsz If cch is non-zero, this must either be pointing to a pointer to
774 * a buffer of the specified size, or pointer to a NULL pointer.
775 * If *ppsz is NULL or cch is zero a buffer of at least cch chars
776 * will be allocated to hold the translated string.
777 * If a buffer was requested it must be freed using RTStrFree().
778 * @param cch The buffer size in chars (the type). This includes the terminator.
779 * @param pcch Where to store the length of the translated string,
780 * excluding the terminator. (Optional)
781 *
782 * This may be set under some error conditions,
783 * however, only for VERR_BUFFER_OVERFLOW and
784 * VERR_NO_STR_MEMORY will it contain a valid string
785 * length that can be used to resize the buffer.
786 */
787#define RTUtf16ToUtf8Ex(pwszString, cwcString, ppsz, cch, pcch) \
788 RTUtf16ToUtf8ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
789
790/**
791 * Translates UTF-16 to UTF-8 using buffer provided by the caller or a fittingly
792 * sized buffer allocated by the function (custom tag).
793 *
794 * @returns iprt status code.
795 * @param pwszString The UTF-16 string to convert.
796 * @param cwcString The number of RTUTF16 items to translate from pwszString.
797 * The translation will stop when reaching cwcString or the terminator ('\\0').
798 * Use RTSTR_MAX to translate the entire string.
799 * @param ppsz If cch is non-zero, this must either be pointing to a pointer to
800 * a buffer of the specified size, or pointer to a NULL pointer.
801 * If *ppsz is NULL or cch is zero a buffer of at least cch chars
802 * will be allocated to hold the translated string.
803 * If a buffer was requested it must be freed using RTStrFree().
804 * @param cch The buffer size in chars (the type). This includes the terminator.
805 * @param pcch Where to store the length of the translated string,
806 * excluding the terminator. (Optional)
807 *
808 * This may be set under some error conditions,
809 * however, only for VERR_BUFFER_OVERFLOW and
810 * VERR_NO_STR_MEMORY will it contain a valid string
811 * length that can be used to resize the buffer.
812 * @param pszTag Allocation tag used for statistics and such.
813 */
814RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag);
815
816/**
817 * Translates UTF-16BE to UTF-8 using buffer provided by the caller or a
818 * fittingly sized buffer allocated by the function (default tag).
819 *
820 * This differs from RTUtf16ToUtf8Ex in that the input is always a
821 * big-endian string.
822 *
823 * @returns iprt status code.
824 * @param pwszString The UTF-16BE string to convert.
825 * @param cwcString The number of RTUTF16 items to translate from pwszString.
826 * The translation will stop when reaching cwcString or the terminator ('\\0').
827 * Use RTSTR_MAX to translate the entire string.
828 * @param ppsz If cch is non-zero, this must either be pointing to a pointer to
829 * a buffer of the specified size, or pointer to a NULL pointer.
830 * If *ppsz is NULL or cch is zero a buffer of at least cch chars
831 * will be allocated to hold the translated string.
832 * If a buffer was requested it must be freed using RTStrFree().
833 * @param cch The buffer size in chars (the type). This includes the terminator.
834 * @param pcch Where to store the length of the translated string,
835 * excluding the terminator. (Optional)
836 *
837 * This may be set under some error conditions,
838 * however, only for VERR_BUFFER_OVERFLOW and
839 * VERR_NO_STR_MEMORY will it contain a valid string
840 * length that can be used to resize the buffer.
841 */
842#define RTUtf16BigToUtf8Ex(pwszString, cwcString, ppsz, cch, pcch) \
843 RTUtf16BigToUtf8ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
844
845/**
846 * Translates UTF-16BE to UTF-8 using buffer provided by the caller or a
847 * fittingly sized buffer allocated by the function (custom tag).
848 *
849 * This differs from RTUtf16ToUtf8ExTag in that the input is always a
850 * big-endian string.
851 *
852 * @returns iprt status code.
853 * @param pwszString The UTF-16BE string to convert.
854 * @param cwcString The number of RTUTF16 items to translate from pwszString.
855 * The translation will stop when reaching cwcString or the terminator ('\\0').
856 * Use RTSTR_MAX to translate the entire string.
857 * @param ppsz If cch is non-zero, this must either be pointing to a pointer to
858 * a buffer of the specified size, or pointer to a NULL pointer.
859 * If *ppsz is NULL or cch is zero a buffer of at least cch chars
860 * will be allocated to hold the translated string.
861 * If a buffer was requested it must be freed using RTStrFree().
862 * @param cch The buffer size in chars (the type). This includes the terminator.
863 * @param pcch Where to store the length of the translated string,
864 * excluding the terminator. (Optional)
865 *
866 * This may be set under some error conditions,
867 * however, only for VERR_BUFFER_OVERFLOW and
868 * VERR_NO_STR_MEMORY will it contain a valid string
869 * length that can be used to resize the buffer.
870 * @param pszTag Allocation tag used for statistics and such.
871 */
872RTDECL(int) RTUtf16BigToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag);
873
874/**
875 * Translates UTF-16LE to UTF-8 using buffer provided by the caller or a
876 * fittingly sized buffer allocated by the function (default tag).
877 *
878 * This differs from RTUtf16ToUtf8Ex in that the input is always a
879 * little-endian string.
880 *
881 * @returns iprt status code.
882 * @param pwszString The UTF-16LE string to convert.
883 * @param cwcString The number of RTUTF16 items to translate from pwszString.
884 * The translation will stop when reaching cwcString or the terminator ('\\0').
885 * Use RTSTR_MAX to translate the entire string.
886 * @param ppsz If cch is non-zero, this must either be pointing to a pointer to
887 * a buffer of the specified size, or pointer to a NULL pointer.
888 * If *ppsz is NULL or cch is zero a buffer of at least cch chars
889 * will be allocated to hold the translated string.
890 * If a buffer was requested it must be freed using RTStrFree().
891 * @param cch The buffer size in chars (the type). This includes the terminator.
892 * @param pcch Where to store the length of the translated string,
893 * excluding the terminator. (Optional)
894 *
895 * This may be set under some error conditions,
896 * however, only for VERR_BUFFER_OVERFLOW and
897 * VERR_NO_STR_MEMORY will it contain a valid string
898 * length that can be used to resize the buffer.
899 */
900#define RTUtf16LittleToUtf8Ex(pwszString, cwcString, ppsz, cch, pcch) \
901 RTUtf16LittleToUtf8ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
902
903/**
904 * Translates UTF-16LE to UTF-8 using buffer provided by the caller or a
905 * fittingly sized buffer allocated by the function (custom tag).
906 *
907 * This differs from RTUtf16ToUtf8ExTag in that the input is always a
908 * little-endian string.
909 *
910 * @returns iprt status code.
911 * @param pwszString The UTF-16LE string to convert.
912 * @param cwcString The number of RTUTF16 items to translate from pwszString.
913 * The translation will stop when reaching cwcString or the terminator ('\\0').
914 * Use RTSTR_MAX to translate the entire string.
915 * @param ppsz If cch is non-zero, this must either be pointing to a pointer to
916 * a buffer of the specified size, or pointer to a NULL pointer.
917 * If *ppsz is NULL or cch is zero a buffer of at least cch chars
918 * will be allocated to hold the translated string.
919 * If a buffer was requested it must be freed using RTStrFree().
920 * @param cch The buffer size in chars (the type). This includes the terminator.
921 * @param pcch Where to store the length of the translated string,
922 * excluding the terminator. (Optional)
923 *
924 * This may be set under some error conditions,
925 * however, only for VERR_BUFFER_OVERFLOW and
926 * VERR_NO_STR_MEMORY will it contain a valid string
927 * length that can be used to resize the buffer.
928 * @param pszTag Allocation tag used for statistics and such.
929 */
930RTDECL(int) RTUtf16LittleToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch,
931 const char *pszTag);
932
933/**
934 * Calculates the length of the UTF-16 string in UTF-8 chars (bytes).
935 *
936 * This function will validate the string, and incorrectly encoded UTF-16
937 * strings will be rejected. The primary purpose of this function is to
938 * help allocate buffers for RTUtf16ToUtf8() of the correct size. For most
939 * other purposes RTUtf16ToUtf8Ex() should be used.
940 *
941 * @returns Number of char (bytes).
942 * @returns 0 if the string was incorrectly encoded.
943 * @param pwsz The UTF-16 string.
944 */
945RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz);
946
947/**
948 * Calculates the length of the UTF-16BE string in UTF-8 chars (bytes).
949 *
950 * This function will validate the string, and incorrectly encoded UTF-16BE
951 * strings will be rejected. The primary purpose of this function is to
952 * help allocate buffers for RTUtf16BigToUtf8() of the correct size. For most
953 * other purposes RTUtf16BigToUtf8Ex() should be used.
954 *
955 * @returns Number of char (bytes).
956 * @returns 0 if the string was incorrectly encoded.
957 * @param pwsz The UTF-16BE string.
958 */
959RTDECL(size_t) RTUtf16BigCalcUtf8Len(PCRTUTF16 pwsz);
960
961/**
962 * Calculates the length of the UTF-16LE string in UTF-8 chars (bytes).
963 *
964 * This function will validate the string, and incorrectly encoded UTF-16LE
965 * strings will be rejected. The primary purpose of this function is to
966 * help allocate buffers for RTUtf16LittleToUtf8() of the correct size. For
967 * most other purposes RTUtf16LittleToUtf8Ex() should be used.
968 *
969 * @returns Number of char (bytes).
970 * @returns 0 if the string was incorrectly encoded.
971 * @param pwsz The UTF-16LE string.
972 */
973RTDECL(size_t) RTUtf16LittleCalcUtf8Len(PCRTUTF16 pwsz);
974
975/**
976 * Calculates the length of the UTF-16 string in UTF-8 chars (bytes).
977 *
978 * This function will validate the string, and incorrectly encoded UTF-16
979 * strings will be rejected.
980 *
981 * @returns iprt status code.
982 * @param pwsz The string.
983 * @param cwc The max string length. Use RTSTR_MAX to process the entire string.
984 * @param pcch Where to store the string length (in bytes). Optional.
985 * This is undefined on failure.
986 */
987RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
988
989/**
990 * Calculates the length of the UTF-16BE string in UTF-8 chars (bytes).
991 *
992 * This function will validate the string, and incorrectly encoded UTF-16BE
993 * strings will be rejected.
994 *
995 * @returns iprt status code.
996 * @param pwsz The string.
997 * @param cwc The max string length. Use RTSTR_MAX to process the entire string.
998 * @param pcch Where to store the string length (in bytes). Optional.
999 * This is undefined on failure.
1000 */
1001RTDECL(int) RTUtf16BigCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1002
1003/**
1004 * Calculates the length of the UTF-16LE string in UTF-8 chars (bytes).
1005 *
1006 * This function will validate the string, and incorrectly encoded UTF-16LE
1007 * strings will be rejected.
1008 *
1009 * @returns iprt status code.
1010 * @param pwsz The string.
1011 * @param cwc The max string length. Use RTSTR_MAX to process the entire string.
1012 * @param pcch Where to store the string length (in bytes). Optional.
1013 * This is undefined on failure.
1014 */
1015RTDECL(int) RTUtf16LittleCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1016
1017/**
1018 * Translate a UTF-16 string into a Latin-1 (ISO-8859-1) allocating the result
1019 * buffer (default tag).
1020 *
1021 * @returns iprt status code.
1022 * @param pwszString UTF-16 string to convert.
1023 * @param ppszString Receives pointer of allocated Latin1 string on
1024 * success, and is always set to NULL on failure.
1025 * The returned pointer must be freed using RTStrFree().
1026 */
1027#define RTUtf16ToLatin1(pwszString, ppszString) RTUtf16ToLatin1Tag((pwszString), (ppszString), RTSTR_TAG)
1028
1029/**
1030 * Translate a UTF-16 string into a Latin-1 (ISO-8859-1) allocating the result
1031 * buffer (custom tag).
1032 *
1033 * @returns iprt status code.
1034 * @param pwszString UTF-16 string to convert.
1035 * @param ppszString Receives pointer of allocated Latin1 string on
1036 * success, and is always set to NULL on failure.
1037 * The returned pointer must be freed using RTStrFree().
1038 * @param pszTag Allocation tag used for statistics and such.
1039 */
1040RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
1041
1042/**
1043 * Translates UTF-16 to Latin-1 (ISO-8859-1) using buffer provided by the caller
1044 * or a fittingly sized buffer allocated by the function (default tag).
1045 *
1046 * @returns iprt status code.
1047 * @param pwszString The UTF-16 string to convert.
1048 * @param cwcString The number of RTUTF16 items to translate from
1049 * pwszString. The translation will stop when reaching
1050 * cwcString or the terminator ('\\0'). Use RTSTR_MAX
1051 * to translate the entire string.
1052 * @param ppsz Pointer to the pointer to the Latin-1 string. The
1053 * buffer can optionally be preallocated by the caller.
1054 *
1055 * If cch is zero, *ppsz is undefined.
1056 *
1057 * If cch is non-zero and *ppsz is not NULL, then this
1058 * will be used as the output buffer.
1059 * VERR_BUFFER_OVERFLOW will be returned if this is
1060 * insufficient.
1061 *
1062 * If cch is zero or *ppsz is NULL, then a buffer of
1063 * sufficient size is allocated. cch can be used to
1064 * specify a minimum size of this buffer. Use
1065 * RTStrFree() to free the result.
1066 *
1067 * @param cch The buffer size in chars (the type). This includes
1068 * the terminator.
1069 * @param pcch Where to store the length of the translated string,
1070 * excluding the terminator. (Optional)
1071 *
1072 * This may be set under some error conditions,
1073 * however, only for VERR_BUFFER_OVERFLOW and
1074 * VERR_NO_STR_MEMORY will it contain a valid string
1075 * length that can be used to resize the buffer.
1076 */
1077#define RTUtf16ToLatin1Ex(pwszString, cwcString, ppsz, cch, pcch) \
1078 RTUtf16ToLatin1ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
1079
1080/**
1081 * Translates UTF-16 to Latin-1 (ISO-8859-1) using buffer provided by the caller
1082 * or a fittingly sized buffer allocated by the function (custom tag).
1083 *
1084 * @returns iprt status code.
1085 * @param pwszString The UTF-16 string to convert.
1086 * @param cwcString The number of RTUTF16 items to translate from
1087 * pwszString. The translation will stop when reaching
1088 * cwcString or the terminator ('\\0'). Use RTSTR_MAX
1089 * to translate the entire string.
1090 * @param ppsz Pointer to the pointer to the Latin-1 string. The
1091 * buffer can optionally be preallocated by the caller.
1092 *
1093 * If cch is zero, *ppsz is undefined.
1094 *
1095 * If cch is non-zero and *ppsz is not NULL, then this
1096 * will be used as the output buffer.
1097 * VERR_BUFFER_OVERFLOW will be returned if this is
1098 * insufficient.
1099 *
1100 * If cch is zero or *ppsz is NULL, then a buffer of
1101 * sufficient size is allocated. cch can be used to
1102 * specify a minimum size of this buffer. Use
1103 * RTStrFree() to free the result.
1104 *
1105 * @param cch The buffer size in chars (the type). This includes
1106 * the terminator.
1107 * @param pcch Where to store the length of the translated string,
1108 * excluding the terminator. (Optional)
1109 *
1110 * This may be set under some error conditions,
1111 * however, only for VERR_BUFFER_OVERFLOW and
1112 * VERR_NO_STR_MEMORY will it contain a valid string
1113 * length that can be used to resize the buffer.
1114 * @param pszTag Allocation tag used for statistics and such.
1115 */
1116RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag);
1117
1118/**
1119 * Calculates the length of the UTF-16 string in Latin-1 (ISO-8859-1) chars.
1120 *
1121 * This function will validate the string, and incorrectly encoded UTF-16
1122 * strings will be rejected. The primary purpose of this function is to
1123 * help allocate buffers for RTUtf16ToLatin1() of the correct size. For most
1124 * other purposes RTUtf16ToLatin1Ex() should be used.
1125 *
1126 * @returns Number of char (bytes).
1127 * @returns 0 if the string was incorrectly encoded.
1128 * @param pwsz The UTF-16 string.
1129 */
1130RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz);
1131
1132/**
1133 * Calculates the length of the UTF-16 string in Latin-1 (ISO-8859-1) chars.
1134 *
1135 * This function will validate the string, and incorrectly encoded UTF-16
1136 * strings will be rejected.
1137 *
1138 * @returns iprt status code.
1139 * @param pwsz The string.
1140 * @param cwc The max string length. Use RTSTR_MAX to process the
1141 * entire string.
1142 * @param pcch Where to store the string length (in bytes). Optional.
1143 * This is undefined on failure.
1144 */
1145RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1146
1147/**
1148 * Get the unicode code point at the given string position.
1149 *
1150 * @returns unicode code point.
1151 * @returns RTUNICP_INVALID if the encoding is invalid.
1152 * @param pwsz The string.
1153 *
1154 * @remark This is an internal worker for RTUtf16GetCp().
1155 */
1156RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz);
1157
1158/**
1159 * Get the unicode code point at the given string position.
1160 *
1161 * @returns iprt status code.
1162 * @param ppwsz Pointer to the string pointer. This will be updated to
1163 * point to the char following the current code point.
1164 * @param pCp Where to store the code point.
1165 * RTUNICP_INVALID is stored here on failure.
1166 *
1167 * @remark This is an internal worker for RTUtf16GetCpEx().
1168 */
1169RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp);
1170
1171/**
1172 * Get the unicode code point at the given string position with length
1173 * restriction.
1174 *
1175 * @returns iprt status code.
1176 * @param ppwsz Pointer to the string pointer. This will be updated to
1177 * point to the char following the current code point.
1178 * @param pcwc Pointer to the max string length. This will be
1179 * decremented corresponding to the advancement of @a ppwsz.
1180 * @param pCp Where to store the code point.
1181 * RTUNICP_INVALID is stored here on failure.
1182 *
1183 * @remark This is an internal worker for RTUtf16GetCpNEx().
1184 */
1185RTDECL(int) RTUtf16GetCpNExInternal(PCRTUTF16 *ppwsz, size_t *pcwc, PRTUNICP pCp);
1186
1187/**
1188 * Get the unicode code point at the given string position, big endian.
1189 *
1190 * @returns iprt status code.
1191 * @param ppwsz Pointer to the string pointer. This will be updated to
1192 * point to the char following the current code point.
1193 * @param pCp Where to store the code point.
1194 * RTUNICP_INVALID is stored here on failure.
1195 *
1196 * @remark This is an internal worker for RTUtf16BigGetCpEx().
1197 */
1198RTDECL(int) RTUtf16BigGetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp);
1199
1200/**
1201 * Put the unicode code point at the given string position
1202 * and return the pointer to the char following it.
1203 *
1204 * This function will not consider anything at or following the
1205 * buffer area pointed to by pwsz. It is therefore not suitable for
1206 * inserting code points into a string, only appending/overwriting.
1207 *
1208 * @returns pointer to the char following the written code point.
1209 * @param pwsz The string.
1210 * @param CodePoint The code point to write.
1211 * This should not be RTUNICP_INVALID or any other
1212 * character out of the UTF-16 range.
1213 *
1214 * @remark This is an internal worker for RTUtf16PutCp().
1215 */
1216RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint);
1217
1218/**
1219 * Get the unicode code point at the given string position.
1220 *
1221 * @returns unicode code point.
1222 * @returns RTUNICP_INVALID if the encoding is invalid.
1223 * @param pwsz The string.
1224 *
1225 * @remark We optimize this operation by using an inline function for
1226 * everything which isn't a surrogate pair or an endian indicator.
1227 */
1228DECLINLINE(RTUNICP) RTUtf16GetCp(PCRTUTF16 pwsz)
1229{
1230 const RTUTF16 wc = *pwsz;
1231 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1232 return wc;
1233 return RTUtf16GetCpInternal(pwsz);
1234}
1235
1236/**
1237 * Get the unicode code point at the given string position.
1238 *
1239 * @returns iprt status code.
1240 * @param ppwsz Pointer to the string pointer. This will be updated to
1241 * point to the char following the current code point.
1242 * @param pCp Where to store the code point.
1243 * RTUNICP_INVALID is stored here on failure.
1244 *
1245 * @remark We optimize this operation by using an inline function for
1246 * everything which isn't a surrogate pair or and endian indicator.
1247 */
1248DECLINLINE(int) RTUtf16GetCpEx(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1249{
1250 const RTUTF16 wc = **ppwsz;
1251 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1252 {
1253 (*ppwsz)++;
1254 *pCp = wc;
1255 return VINF_SUCCESS;
1256 }
1257 return RTUtf16GetCpExInternal(ppwsz, pCp);
1258}
1259
1260/**
1261 * Get the unicode code point at the given string position.
1262 *
1263 * @returns iprt status code.
1264 * @param ppwsz Pointer to the string pointer. This will be updated to
1265 * point to the char following the current code point.
1266 * @param pcwc Pointer to the max string length. This will be
1267 * decremented corrsponding to the advancement of @a ppwsz.
1268 * @param pCp Where to store the code point. RTUNICP_INVALID is stored
1269 * here on failure.
1270 *
1271 * @remark We optimize this operation by using an inline function for
1272 * everything which isn't a surrogate pair or and endian indicator.
1273 */
1274DECLINLINE(int) RTUtf16GetCpNEx(PCRTUTF16 *ppwsz, size_t *pcwc, PRTUNICP pCp)
1275{
1276 const size_t cwc = *pcwc;
1277 if (cwc > 0)
1278 {
1279 const PCRTUTF16 pwsz = *ppwsz;
1280 const RTUTF16 wc = *pwsz;
1281 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1282 {
1283 *pCp = wc;
1284 *pcwc = cwc - 1;
1285 *ppwsz = pwsz + 1;
1286 return VINF_SUCCESS;
1287 }
1288 }
1289 return RTUtf16GetCpNExInternal(ppwsz, pcwc, pCp);
1290}
1291
1292/**
1293 * Get the unicode code point at the given string position, big endian version.
1294 *
1295 * @returns iprt status code.
1296 * @param ppwsz Pointer to the string pointer. This will be updated to
1297 * point to the char following the current code point.
1298 * @param pCp Where to store the code point.
1299 * RTUNICP_INVALID is stored here on failure.
1300 *
1301 * @remark We optimize this operation by using an inline function for
1302 * everything which isn't a surrogate pair or and endian indicator.
1303 */
1304DECLINLINE(int) RTUtf16BigGetCpEx(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1305{
1306#ifdef RT_BIG_ENDIAN
1307 return RTUtf16GetCpEx(ppwsz, pCp);
1308#else
1309# ifdef IPRT_INCLUDED_asm_h
1310 const RTUTF16 wc = RT_BE2H_U16(**ppwsz);
1311 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1312 {
1313 (*ppwsz)++;
1314 *pCp = wc;
1315 return VINF_SUCCESS;
1316 }
1317# endif
1318 return RTUtf16BigGetCpExInternal(ppwsz, pCp);
1319#endif
1320}
1321
1322/**
1323 * Put the unicode code point at the given string position
1324 * and return the pointer to the char following it.
1325 *
1326 * This function will not consider anything at or following the
1327 * buffer area pointed to by pwsz. It is therefore not suitable for
1328 * inserting code points into a string, only appending/overwriting.
1329 *
1330 * @returns pointer to the char following the written code point.
1331 * @param pwsz The string.
1332 * @param CodePoint The code point to write.
1333 * This should not be RTUNICP_INVALID or any other
1334 * character out of the UTF-16 range.
1335 *
1336 * @remark We optimize this operation by using an inline function for
1337 * everything which isn't a surrogate pair or and endian indicator.
1338 */
1339DECLINLINE(PRTUTF16) RTUtf16PutCp(PRTUTF16 pwsz, RTUNICP CodePoint)
1340{
1341 if (CodePoint < 0xd800 || (CodePoint > 0xd800 && CodePoint < 0xfffe))
1342 {
1343 *pwsz++ = (RTUTF16)CodePoint;
1344 return pwsz;
1345 }
1346 return RTUtf16PutCpInternal(pwsz, CodePoint);
1347}
1348
1349/**
1350 * Skips ahead, past the current code point.
1351 *
1352 * @returns Pointer to the char after the current code point.
1353 * @param pwsz Pointer to the current code point.
1354 * @remark This will not move the next valid code point, only past the current one.
1355 */
1356DECLINLINE(PRTUTF16) RTUtf16NextCp(PCRTUTF16 pwsz)
1357{
1358 RTUNICP Cp;
1359 RTUtf16GetCpEx(&pwsz, &Cp);
1360 return (PRTUTF16)pwsz;
1361}
1362
1363/**
1364 * Skips backwards, to the previous code point.
1365 *
1366 * @returns Pointer to the previous code point.
1367 * @param pwszStart Pointer to the start of the string.
1368 * @param pwsz Pointer to the current code point.
1369 */
1370RTDECL(PRTUTF16) RTUtf16PrevCp(PCRTUTF16 pwszStart, PCRTUTF16 pwsz);
1371
1372
1373/**
1374 * Checks if the UTF-16 char is the high surrogate char (i.e.
1375 * the 1st char in the pair).
1376 *
1377 * @returns true if it is.
1378 * @returns false if it isn't.
1379 * @param wc The character to investigate.
1380 */
1381DECLINLINE(bool) RTUtf16IsHighSurrogate(RTUTF16 wc)
1382{
1383 return wc >= 0xd800 && wc <= 0xdbff;
1384}
1385
1386/**
1387 * Checks if the UTF-16 char is the low surrogate char (i.e.
1388 * the 2nd char in the pair).
1389 *
1390 * @returns true if it is.
1391 * @returns false if it isn't.
1392 * @param wc The character to investigate.
1393 */
1394DECLINLINE(bool) RTUtf16IsLowSurrogate(RTUTF16 wc)
1395{
1396 return wc >= 0xdc00 && wc <= 0xdfff;
1397}
1398
1399
1400/**
1401 * Checks if the two UTF-16 chars form a valid surrogate pair.
1402 *
1403 * @returns true if they do.
1404 * @returns false if they doesn't.
1405 * @param wcHigh The high (1st) character.
1406 * @param wcLow The low (2nd) character.
1407 */
1408DECLINLINE(bool) RTUtf16IsSurrogatePair(RTUTF16 wcHigh, RTUTF16 wcLow)
1409{
1410 return RTUtf16IsHighSurrogate(wcHigh)
1411 && RTUtf16IsLowSurrogate(wcLow);
1412}
1413
1414/**
1415 * Formats a buffer stream as hex bytes.
1416 *
1417 * The default is no separating spaces or line breaks or anything.
1418 *
1419 * @returns IPRT status code.
1420 * @retval VERR_INVALID_POINTER if any of the pointers are wrong.
1421 * @retval VERR_BUFFER_OVERFLOW if the buffer is insufficient to hold the bytes.
1422 *
1423 * @param pwszBuf Output string buffer.
1424 * @param cwcBuf The size of the output buffer in RTUTF16 units.
1425 * @param pv Pointer to the bytes to stringify.
1426 * @param cb The number of bytes to stringify.
1427 * @param fFlags Combination of RTSTRPRINTHEXBYTES_F_XXX values.
1428 * @sa RTStrPrintHexBytes.
1429 */
1430RTDECL(int) RTUtf16PrintHexBytes(PRTUTF16 pwszBuf, size_t cwcBuf, void const *pv, size_t cb, uint32_t fFlags);
1431
1432/**
1433 * String printf producing UTF-16 output.
1434 *
1435 * @returns On success, positive count of formatted RTUTF16 units excluding the
1436 * terminator. On buffer overflow, negative number giving the required
1437 * buffer size (including terminator) in RTUTF16 units.
1438 *
1439 * @param pwszBuffer Output buffer.
1440 * @param cwcBuffer Size of the output buffer in RTUTF16 units.
1441 * @param pszFormat Pointer to the format string, @see pg_rt_str_format.
1442 * @param args The format argument.
1443 *
1444 * @note This is similar to RTStrPrintf2V (not RTStrPrintfV)!
1445 */
1446RTDECL(ssize_t) RTUtf16PrintfV(PRTUTF16 pwszBuffer, size_t cwcBuffer, const char *pszFormat, va_list args) RT_IPRT_FORMAT_ATTR(3, 0);
1447
1448/**
1449 * String printf producing UTF-16 output.
1450 *
1451 * @returns On success, positive count of formatted RTUTF16 units excluding the
1452 * terminator. On buffer overflow, negative number giving the required
1453 * buffer size (including terminator) in RTUTF16 units.
1454 *
1455 * @param pwszBuffer Output buffer.
1456 * @param cwcBuffer Size of the output buffer in RTUTF16 units.
1457 * @param pszFormat Pointer to the format string, @see pg_rt_str_format.
1458 * @param ... The format argument.
1459 *
1460 * @note This is similar to RTStrPrintf2 (not RTStrPrintf)!
1461 */
1462RTDECL(ssize_t) RTUtf16Printf(PRTUTF16 pwszBuffer, size_t cwcBuffer, const char *pszFormat, ...) RT_IPRT_FORMAT_ATTR(3, 4);
1463
1464/**
1465 * String printf producing UTF-16 output with custom formatting.
1466 *
1467 * @returns On success, positive count of formatted RTUTF16 units excluding the
1468 * terminator. On buffer overflow, negative number giving the required
1469 * buffer size (including terminator) in RTUTF16 units.
1470 *
1471 * @param pfnFormat Pointer to handler function for the custom formats.
1472 * @param pvArg Argument to the pfnFormat function.
1473 * @param pwszBuffer Output buffer.
1474 * @param cwcBuffer Size of the output buffer in RTUTF16 units.
1475 * @param pszFormat Pointer to the format string, @see pg_rt_str_format.
1476 * @param args The format argument.
1477 *
1478 * @note This is similar to RTStrPrintf2ExV (not RTStrPrintfExV)!
1479 */
1480RTDECL(ssize_t) RTUtf16PrintfExV(PFNSTRFORMAT pfnFormat, void *pvArg, PRTUTF16 pwszBuffer, size_t cwcBuffer,
1481 const char *pszFormat, va_list args) RT_IPRT_FORMAT_ATTR(5, 0);
1482
1483/**
1484 * String printf producing UTF-16 output with custom formatting.
1485 *
1486 * @returns On success, positive count of formatted RTUTF16 units excluding the
1487 * terminator. On buffer overflow, negative number giving the required
1488 * buffer size (including terminator) in RTUTF16 units.
1489 *
1490 * @param pfnFormat Pointer to handler function for the custom formats.
1491 * @param pvArg Argument to the pfnFormat function.
1492 * @param pwszBuffer Output buffer.
1493 * @param cwcBuffer Size of the output buffer in RTUTF16 units.
1494 * @param pszFormat Pointer to the format string, @see pg_rt_str_format.
1495 * @param ... The format argument.
1496 *
1497 * @note This is similar to RTStrPrintf2Ex (not RTStrPrintfEx)!
1498 */
1499RTDECL(ssize_t) RTUtf16PrintfEx(PFNSTRFORMAT pfnFormat, void *pvArg, PRTUTF16 pwszBuffer, size_t cwcBuffer,
1500 const char *pszFormat, ...) RT_IPRT_FORMAT_ATTR(5, 6);
1501
1502/** @} */
1503RT_C_DECLS_END
1504
1505#endif /* !IPRT_INCLUDED_utf16_h */
1506
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette