VirtualBox

source: vbox/trunk/include/iprt/latin1.h@ 58266

Last change on this file since 58266 was 57941, checked in by vboxsync, 9 years ago

iprt/string.h: split out the UTF-16 and Latin-1 parts.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 16.2 KB
Line 
1/** @file
2 * IPRT - String Manipulation, Latin-1 (ISO-8859-1) encoding.
3 */
4
5/*
6 * Copyright (C) 2006-2015 Oracle Corporation
7 *
8 * This file is part of VirtualBox Open Source Edition (OSE), as
9 * available from http://www.virtualbox.org. This file is free software;
10 * you can redistribute it and/or modify it under the terms of the GNU
11 * General Public License (GPL) as published by the Free Software
12 * Foundation, in version 2 as it comes in the "COPYING" file of the
13 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15 *
16 * The contents of this file may alternatively be used under the terms
17 * of the Common Development and Distribution License Version 1.0
18 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
19 * VirtualBox OSE distribution, in which case the provisions of the
20 * CDDL are applicable instead of those of the GPL.
21 *
22 * You may elect to license modified versions of this file under the
23 * terms and conditions of either the GPL or the CDDL or both.
24 */
25
26#ifndef ___iprt_latin1_h
27#define ___iprt_latin1_h
28
29#include <iprt/string.h>
30
31RT_C_DECLS_BEGIN
32
33
34/** @defgroup rt_str_latin1 Latin-1 (ISO-8859-1) String Manipulation
35 * @ingroup grp_rt_str
36 *
37 * Deals with Latin-1 encoded strings.
38 *
39 * @warning Make sure to name all variables dealing with Latin-1 strings
40 * such that there is no way to mistake them for normal UTF-8 strings.
41 * There may be severe security issues resulting from mistaking Latin-1
42 * for UTF-8!
43 *
44 * @{
45 */
46
47/**
48 * Get the unicode code point at the given string position.
49 *
50 * @returns unicode code point.
51 * @returns RTUNICP_INVALID if the encoding is invalid.
52 * @param pszLatin1 The Latin-1 string.
53 */
54DECLINLINE(RTUNICP) RTLatin1GetCp(const char *pszLatin1)
55{
56 return *(const unsigned char *)pszLatin1;
57}
58
59/**
60 * Get the unicode code point at the given string position.
61 *
62 * @returns iprt status code.
63 * @param ppszLatin1 Pointer to the string pointer. This will be updated to
64 * point to the char following the current code point. This
65 * is advanced one character forward on failure.
66 * @param pCp Where to store the code point. RTUNICP_INVALID is stored
67 * here on failure.
68 */
69DECLINLINE(int) RTLatin1GetCpEx(const char **ppszLatin1, PRTUNICP pCp)
70{
71 const unsigned char uch = **(const unsigned char **)ppszLatin1;
72 (*ppszLatin1)++;
73 *pCp = uch;
74 return VINF_SUCCESS;
75}
76
77/**
78 * Get the unicode code point at the given string position for a string of a
79 * given maximum length.
80 *
81 * @returns iprt status code.
82 * @retval VERR_END_OF_STRING if *pcch is 0. *pCp is set to RTUNICP_INVALID.
83 *
84 * @param ppszLatin1 Pointer to the string pointer. This will be updated to
85 * point to the char following the current code point.
86 * @param pcchLatin1 Pointer to the maximum string length. This will be
87 * decremented by the size of the code point found.
88 * @param pCp Where to store the code point.
89 * RTUNICP_INVALID is stored here on failure.
90 */
91DECLINLINE(int) RTLatin1GetCpNEx(const char **ppszLatin1, size_t *pcchLatin1, PRTUNICP pCp)
92{
93 if (RT_LIKELY(*pcchLatin1 != 0))
94 {
95 const unsigned char uch = **(const unsigned char **)ppszLatin1;
96 (*ppszLatin1)++;
97 (*pcchLatin1)--;
98 *pCp = uch;
99 return VINF_SUCCESS;
100 }
101 *pCp = RTUNICP_INVALID;
102 return VERR_END_OF_STRING;
103}
104
105/**
106 * Get the Latin-1 size in characters of a given Unicode code point.
107 *
108 * The code point is expected to be a valid Unicode one, but not necessarily in
109 * the range supported by Latin-1.
110 *
111 * @returns the size in characters, or zero if there is no Latin-1 encoding
112 */
113DECLINLINE(size_t) RTLatin1CpSize(RTUNICP CodePoint)
114{
115 if (CodePoint < 0x100)
116 return 1;
117 return 0;
118}
119
120/**
121 * Put the unicode code point at the given string position
122 * and return the pointer to the char following it.
123 *
124 * This function will not consider anything at or following the
125 * buffer area pointed to by psz. It is therefore not suitable for
126 * inserting code points into a string, only appending/overwriting.
127 *
128 * @returns pointer to the char following the written code point.
129 * @param pszLatin1 The string.
130 * @param CodePoint The code point to write.
131 * This should not be RTUNICP_INVALID or any other
132 * character out of the Latin-1 range.
133 */
134DECLINLINE(char *) RTLatin1PutCp(char *pszLatin1, RTUNICP CodePoint)
135{
136 AssertReturn(CodePoint < 0x100, NULL);
137 *pszLatin1++ = (unsigned char)CodePoint;
138 return pszLatin1;
139}
140
141/**
142 * Skips ahead, past the current code point.
143 *
144 * @returns Pointer to the char after the current code point.
145 * @param pszLatin1 Pointer to the current code point.
146 * @remark This will not move the next valid code point, only past the current one.
147 */
148DECLINLINE(char *) RTLatin1NextCp(const char *pszLatin1)
149{
150 pszLatin1++;
151 return (char *)pszLatin1;
152}
153
154/**
155 * Skips back to the previous code point.
156 *
157 * @returns Pointer to the char before the current code point.
158 * @returns pszLatin1Start on failure.
159 * @param pszLatin1Start Pointer to the start of the string.
160 * @param pszLatin1 Pointer to the current code point.
161 */
162DECLINLINE(char *) RTLatin1PrevCp(const char *pszLatin1Start, const char *pszLatin1)
163{
164 if ((uintptr_t)pszLatin1 > (uintptr_t)pszLatin1Start)
165 {
166 pszLatin1--;
167 return (char *)pszLatin1;
168 }
169 return (char *)pszLatin1Start;
170}
171
172/**
173 * Translates a Latin-1 string into UTF-8, allocating the result buffer (default
174 * tag, i.e. RTSTR_TAG is passed to RTLatin1ToUtf8Tag()).
175 *
176 * @returns iprt status code.
177 * @param pszLatin1 Latin-1 string to convert.
178 * @param ppszString Receives pointer to the allocated UTF-8 string on
179 * success, and is always set to NULL on failure.
180 * The returned pointer must be freed using RTStrFree().
181 */
182#define RTLatin1ToUtf8(pszLatin1, ppszString) RTLatin1ToUtf8Tag((pszLatin1), (ppszString), RTSTR_TAG)
183
184/**
185 * Translates a Latin-1 string into UTF-8, allocating the result buffer (custom tag).
186 *
187 * @returns iprt status code.
188 * @param pszLatin1 Latin-1 string to convert.
189 * @param ppszString Receives pointer to the allocated UTF-8 string on
190 * success, and is always set to NULL on failure.
191 * The returned pointer must be freed using RTStrFree().
192 * @param pszTag Allocation tag used for statistics and such.
193 */
194RTDECL(int) RTLatin1ToUtf8Tag(const char *pszLatin1, char **ppszString, const char *pszTag);
195
196/**
197 * Translates Latin-1 to UTF-8 using a buffer provided by the caller or a fittingly
198 * sized buffer allocated by the function (default tag, i.e. RTSTR_TAG).
199 *
200 * @returns iprt status code.
201 * @param pszLatin1 The Latin-1 string to convert.
202 * @param cchLatin1 The number of Latin-1 characters to translate from
203 * pszLatin1. The translation will stop when reaching
204 * cchLatin1 or the terminator ('\\0'). Use RTSTR_MAX
205 * to translate the entire string.
206 * @param ppsz If @a cch is non-zero, this must either be pointing
207 * to a pointer to a buffer of the specified size, or
208 * pointer to a NULL pointer. If *ppsz is NULL or
209 * @a cch is zero a buffer of at least @a cch chars
210 * will be allocated to hold the translated string. If
211 * a buffer was requested it must be freed using
212 * RTStrFree().
213 * @param cch The buffer size in chars (the type). This includes the terminator.
214 * @param pcch Where to store the length of the translated string,
215 * excluding the terminator. (Optional)
216 *
217 * This may be set under some error conditions,
218 * however, only for VERR_BUFFER_OVERFLOW and
219 * VERR_NO_STR_MEMORY will it contain a valid string
220 * length that can be used to resize the buffer.
221 */
222#define RTLatin1ToUtf8Ex(pszLatin1, cchLatin1, ppsz, cch, pcch) \
223 RTLatin1ToUtf8ExTag((pszLatin1), (cchLatin1), (ppsz), (cch), (pcch), RTSTR_TAG)
224
225/**
226 * Translates Latin-1 to UTF-8 using a buffer provided by the caller or a fittingly
227 * sized buffer allocated by the function (custom tag).
228 *
229 * @returns iprt status code.
230 * @param pszLatin1 The Latin-1 string to convert.
231 * @param cchLatin1 The number of Latin-1 characters to translate from
232 * pszLatin1. The translation will stop when
233 * reaching cchLatin1 or the terminator ('\\0'). Use
234 * RTSTR_MAX to translate the entire string.
235 * @param ppsz If cch is non-zero, this must either be pointing to
236 * a pointer to a buffer of the specified size, or
237 * pointer to a NULL pointer. If *ppsz is NULL or cch
238 * is zero a buffer of at least cch chars will be
239 * allocated to hold the translated string. If a
240 * buffer was requested it must be freed using
241 * RTStrFree().
242 * @param cch The buffer size in chars (the type). This includes
243 * the terminator.
244 * @param pcch Where to store the length of the translated string,
245 * excluding the terminator. (Optional)
246 *
247 * This may be set under some error conditions,
248 * however, only for VERR_BUFFER_OVERFLOW and
249 * VERR_NO_STR_MEMORY will it contain a valid string
250 * length that can be used to resize the buffer.
251 * @param pszTag Allocation tag used for statistics and such.
252 */
253RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszLatin1, size_t cchLatin1, char **ppsz, size_t cch, size_t *pcch,
254 const char *pszTag);
255
256/**
257 * Calculates the length of the Latin-1 string in UTF-8 chars (bytes).
258 *
259 * The primary purpose of this function is to help allocate buffers for
260 * RTLatin1ToUtf8() of the correct size. For most other purposes
261 * RTLatin1ToUtf8Ex() should be used.
262 *
263 * @returns Number of chars (bytes).
264 * @returns 0 if the string was incorrectly encoded (NOTE(review): confirm this can happen for Latin-1).
265 * @param pszLatin1 The Latin-1 string.
266 */
267RTDECL(size_t) RTLatin1CalcUtf8Len(const char *pszLatin1);
268
269/**
270 * Calculates the UTF-8 length in chars (bytes) of a length-limited Latin-1 string.
271 *
272 * @returns iprt status code.
273 * @param pszLatin1 The Latin-1 string.
274 * @param cchLatin1 The max string length. Use RTSTR_MAX to process the
275 * entire string.
276 * @param pcch Where to store the UTF-8 string length (in bytes). Optional.
277 * This is undefined on failure.
278 */
279RTDECL(int) RTLatin1CalcUtf8LenEx(const char *pszLatin1, size_t cchLatin1, size_t *pcch);
280
281/**
282 * Calculates the length of the Latin-1 (ISO-8859-1) string in RTUTF16 items.
283 *
284 * @returns Number of RTUTF16 items. Use RTLatin1CalcUtf16LenEx() for a length-limited string.
285 * @param pszLatin1 The Latin-1 string.
286 */
287RTDECL(size_t) RTLatin1CalcUtf16Len(const char *pszLatin1);
288
289/**
290 * Calculates the length of a length-limited Latin-1 (ISO-8859-1) string in RTUTF16 items.
291 *
292 * @returns iprt status code.
293 * @param pszLatin1 The Latin-1 string.
294 * @param cchLatin1 The max string length. Use RTSTR_MAX to process the
295 * entire string.
296 * @param pcwc Where to store the string length (in RTUTF16 items). Optional.
297 * This is undefined on failure.
298 */
299RTDECL(int) RTLatin1CalcUtf16LenEx(const char *pszLatin1, size_t cchLatin1, size_t *pcwc);
300
301/**
302 * Translates a Latin-1 (ISO-8859-1) string into UTF-16, allocating the result
303 * buffer (default tag, i.e. RTSTR_TAG is passed to RTLatin1ToUtf16Tag()).
304 *
305 * @returns iprt status code.
306 * @param pszLatin1 The Latin-1 string to convert.
307 * @param ppwszString Receives pointer to the allocated UTF-16 string. The
308 * returned string must be freed using RTUtf16Free().
309 */
310#define RTLatin1ToUtf16(pszLatin1, ppwszString) RTLatin1ToUtf16Tag((pszLatin1), (ppwszString), RTSTR_TAG)
311
312/**
313 * Translates a Latin-1 (ISO-8859-1) string into UTF-16, allocating the result
314 * buffer (custom tag).
315 *
316 * @returns iprt status code.
317 * @param pszLatin1 The Latin-1 string to convert.
318 * @param ppwszString Receives pointer to the allocated UTF-16 string. The
319 * returned string must be freed using RTUtf16Free().
320 * @param pszTag Allocation tag used for statistics and such.
321 */
322RTDECL(int) RTLatin1ToUtf16Tag(const char *pszLatin1, PRTUTF16 *ppwszString, const char *pszTag);
323
324/**
325 * Translates pszLatin1 from Latin-1 (ISO-8859-1) to UTF-16, allocating the
326 * result buffer if requested (default tag, i.e. RTSTR_TAG).
327 *
328 * @returns iprt status code.
329 * @param pszLatin1 The Latin-1 string to convert.
330 * @param cchLatin1 The maximum size in chars (the type) to convert. The
331 * conversion stops when it reaches cchLatin1 or the
332 * string terminator ('\\0'). Use RTSTR_MAX to
333 * translate the entire string.
334 * @param ppwsz If cwc is non-zero, this must either be pointing
335 * to a pointer to a buffer of the specified size, or
336 * pointer to a NULL pointer.
337 * If *ppwsz is NULL or cwc is zero a buffer of at
338 * least cwc items will be allocated to hold the
339 * translated string. If a buffer was requested it
340 * must be freed using RTUtf16Free().
341 * @param cwc The buffer size in RTUTF16s. This includes the
342 * terminator.
343 * @param pcwc Where to store the length of the translated string,
344 * excluding the terminator. (Optional)
345 *
346 * This may be set under some error conditions,
347 * however, only for VERR_BUFFER_OVERFLOW and
348 * VERR_NO_STR_MEMORY will it contain a valid string
349 * length that can be used to resize the buffer.
350 */
351#define RTLatin1ToUtf16Ex(pszLatin1, cchLatin1, ppwsz, cwc, pcwc) \
352 RTLatin1ToUtf16ExTag((pszLatin1), (cchLatin1), (ppwsz), (cwc), (pcwc), RTSTR_TAG)
353
354/**
355 * Translates pszLatin1 from Latin-1 (ISO-8859-1) to UTF-16, allocating the
356 * result buffer if requested (custom tag).
357 *
358 * @returns iprt status code.
359 * @param pszLatin1 The Latin-1 string to convert.
360 * @param cchLatin1 The maximum size in chars (the type) to convert. The
361 * conversion stops when it reaches cchLatin1 or the
362 * string terminator ('\\0'). Use RTSTR_MAX to
363 * translate the entire string.
364 * @param ppwsz If cwc is non-zero, this must either be pointing
365 * to a pointer to a buffer of the specified size, or
366 * pointer to a NULL pointer.
367 * If *ppwsz is NULL or cwc is zero a buffer of at
368 * least cwc items will be allocated to hold the
369 * translated string. If a buffer was requested it
370 * must be freed using RTUtf16Free().
371 * @param cwc The buffer size in RTUTF16s. This includes the
372 * terminator.
373 * @param pcwc Where to store the length of the translated string,
374 * excluding the terminator. (Optional)
375 *
376 * This may be set under some error conditions,
377 * however, only for VERR_BUFFER_OVERFLOW and
378 * VERR_NO_STR_MEMORY will it contain a valid string
379 * length that can be used to resize the buffer.
380 * @param pszTag Allocation tag used for statistics and such.
381 */
382RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszLatin1, size_t cchLatin1,
383 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag);
384
385/** @} */
386
387RT_C_DECLS_END
388
389/** @} */
390
391#endif
392
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette