VirtualBox

source: vbox/trunk/include/iprt/latin1.h@ 96407

Last change on this file since 96407 was 96407, checked in by vboxsync, 2 years ago

scm copyright and license note update

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 16.6 KB
Line 
1/** @file
2 * IPRT - String Manipulation, Latin-1 (ISO-8859-1) encoding.
3 */
4
5/*
6 * Copyright (C) 2006-2022 Oracle and/or its affiliates.
7 *
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
14 * License.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
23 *
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
29 *
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
32 *
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34 */
35
36#ifndef IPRT_INCLUDED_latin1_h
37#define IPRT_INCLUDED_latin1_h
38#ifndef RT_WITHOUT_PRAGMA_ONCE
39# pragma once
40#endif
41
42#include <iprt/assert.h>
43#include <iprt/errcore.h> /* VERR_END_OF_STRING */
44
45RT_C_DECLS_BEGIN
46
47
48/** @defgroup rt_str_latin1 Latin-1 (ISO-8859-1) String Manipulation
49 * @ingroup grp_rt_str
50 *
51 * Deals with Latin-1 encoded strings.
52 *
53 * @warning Make sure to name all variables dealing with Latin-1 strings
54 * such that there is no way to mistake them for normal UTF-8 strings.
55 * There may be severe security issues resulting from mistaking Latin-1
56 * for UTF-8!
57 *
58 * @{
59 */
60
61/**
62 * Get the unicode code point at the given string position.
63 *
64 * @returns unicode code point.
65 * @returns RTUNICP_INVALID if the encoding is invalid.
66 * @param pszLatin1 The Latin-1 string.
67 */
68DECLINLINE(RTUNICP) RTLatin1GetCp(const char *pszLatin1)
69{
70 return *(const unsigned char *)pszLatin1;
71}
72
73/**
74 * Get the unicode code point at the given string position.
75 *
76 * @returns iprt status code.
77 * @param ppszLatin1 Pointer to the string pointer. This will be updated to
78 * point to the char following the current code point. This
79 * is advanced one character forward on failure.
80 * @param pCp Where to store the code point. RTUNICP_INVALID is stored
81 * here on failure.
82 */
83DECLINLINE(int) RTLatin1GetCpEx(const char **ppszLatin1, PRTUNICP pCp)
84{
85 const unsigned char uch = **(const unsigned char **)ppszLatin1;
86 (*ppszLatin1)++;
87 *pCp = uch;
88 return VINF_SUCCESS;
89}
90
91/**
92 * Get the unicode code point at the given string position for a string of a
93 * given maximum length.
94 *
95 * @returns iprt status code.
96 * @retval VERR_END_OF_STRING if *pcch is 0. *pCp is set to RTUNICP_INVALID.
97 *
98 * @param ppszLatin1 Pointer to the string pointer. This will be updated to
99 * point to the char following the current code point.
100 * @param pcchLatin1 Pointer to the maximum string length. This will be
101 * decremented by the size of the code point found.
102 * @param pCp Where to store the code point.
103 * RTUNICP_INVALID is stored here on failure.
104 */
105DECLINLINE(int) RTLatin1GetCpNEx(const char **ppszLatin1, size_t *pcchLatin1, PRTUNICP pCp)
106{
107 if (RT_LIKELY(*pcchLatin1 != 0))
108 {
109 const unsigned char uch = **(const unsigned char **)ppszLatin1;
110 (*ppszLatin1)++;
111 (*pcchLatin1)--;
112 *pCp = uch;
113 return VINF_SUCCESS;
114 }
115 *pCp = RTUNICP_INVALID;
116 return VERR_END_OF_STRING;
117}
118
119/**
120 * Get the Latin-1 size in characters of a given Unicode code point.
121 *
122 * The code point is expected to be a valid Unicode one, but not necessarily in
123 * the range supported by Latin-1.
124 *
125 * @returns the size in characters, or zero if there is no Latin-1 encoding
126 */
127DECLINLINE(size_t) RTLatin1CpSize(RTUNICP CodePoint)
128{
129 if (CodePoint < 0x100)
130 return 1;
131 return 0;
132}
133
134/**
135 * Put the unicode code point at the given string position
136 * and return the pointer to the char following it.
137 *
138 * This function will not consider anything at or following the
139 * buffer area pointed to by psz. It is therefore not suitable for
140 * inserting code points into a string, only appending/overwriting.
141 *
142 * @returns pointer to the char following the written code point.
143 * @param pszLatin1 The string.
144 * @param CodePoint The code point to write.
145 * This should not be RTUNICP_INVALID or any other
146 * character out of the Latin-1 range.
147 */
148DECLINLINE(char *) RTLatin1PutCp(char *pszLatin1, RTUNICP CodePoint)
149{
150 AssertReturn(CodePoint < 0x100, NULL);
151 *pszLatin1++ = (unsigned char)CodePoint;
152 return pszLatin1;
153}
154
155/**
156 * Skips ahead, past the current code point.
157 *
158 * @returns Pointer to the char after the current code point.
159 * @param pszLatin1 Pointer to the current code point.
160 * @remark This will not move the next valid code point, only past the current one.
161 */
162DECLINLINE(char *) RTLatin1NextCp(const char *pszLatin1)
163{
164 pszLatin1++;
165 return (char *)pszLatin1;
166}
167
168/**
169 * Skips back to the previous code point.
170 *
171 * @returns Pointer to the char before the current code point.
172 * @returns pszLatin1Start on failure.
173 * @param pszLatin1Start Pointer to the start of the string.
174 * @param pszLatin1 Pointer to the current code point.
175 */
176DECLINLINE(char *) RTLatin1PrevCp(const char *pszLatin1Start, const char *pszLatin1)
177{
178 if ((uintptr_t)pszLatin1 > (uintptr_t)pszLatin1Start)
179 {
180 pszLatin1--;
181 return (char *)pszLatin1;
182 }
183 return (char *)pszLatin1Start;
184}
185
186/**
187 * Translate a Latin1 string into a UTF-8 allocating the result buffer (default
188 * tag).
189 *
190 * @returns iprt status code.
191 * @param pszLatin1 Latin1 string to convert.
192 * @param ppszString Receives pointer of allocated UTF-8 string on
193 * success, and is always set to NULL on failure.
194 * The returned pointer must be freed using RTStrFree().
195 */
196#define RTLatin1ToUtf8(pszLatin1, ppszString) RTLatin1ToUtf8Tag((pszLatin1), (ppszString), RTSTR_TAG)
197
198/**
199 * Translate a Latin-1 string into a UTF-8 allocating the result buffer.
200 *
201 * @returns iprt status code.
202 * @param pszLatin1 Latin-1 string to convert.
203 * @param ppszString Receives pointer of allocated UTF-8 string on
204 * success, and is always set to NULL on failure.
205 * The returned pointer must be freed using RTStrFree().
206 * @param pszTag Allocation tag used for statistics and such.
207 */
208RTDECL(int) RTLatin1ToUtf8Tag(const char *pszLatin1, char **ppszString, const char *pszTag);
209
210/**
211 * Translates Latin-1 to UTF-8 using buffer provided by the caller or a fittingly
212 * sized buffer allocated by the function (default tag).
213 *
214 * @returns iprt status code.
215 * @param pszLatin1 The Latin-1 string to convert.
216 * @param cchLatin1 The number of Latin-1 characters to translate from
217 * pszLatin1. The translation will stop when reaching
218 * cchLatin1 or the terminator ('\\0'). Use RTSTR_MAX
219 * to translate the entire string.
220 * @param ppsz If @a cch is non-zero, this must either be pointing
221 * to a pointer to a buffer of the specified size, or
222 * pointer to a NULL pointer. If *ppsz is NULL or
223 * @a cch is zero a buffer of at least @a cch chars
224 * will be allocated to hold the translated string. If
225 * a buffer was requested it must be freed using
226 * RTStrFree().
227 * @param cch The buffer size in chars (the type). This includes the terminator.
228 * @param pcch Where to store the length of the translated string,
229 * excluding the terminator. (Optional)
230 *
231 * This may be set under some error conditions,
232 * however, only for VERR_BUFFER_OVERFLOW and
233 * VERR_NO_STR_MEMORY will it contain a valid string
234 * length that can be used to resize the buffer.
235 */
236#define RTLatin1ToUtf8Ex(pszLatin1, cchLatin1, ppsz, cch, pcch) \
237 RTLatin1ToUtf8ExTag((pszLatin1), (cchLatin1), (ppsz), (cch), (pcch), RTSTR_TAG)
238
239/**
240 * Translates Latin1 to UTF-8 using buffer provided by the caller or a fittingly
241 * sized buffer allocated by the function (custom tag).
242 *
243 * @returns iprt status code.
244 * @param pszLatin1 The Latin1 string to convert.
245 * @param cchLatin1 The number of Latin1 characters to translate from
246 * pszLatin1. The translation will stop when
247 * reaching cchLatin1 or the terminator ('\\0'). Use
248 * RTSTR_MAX to translate the entire string.
249 * @param ppsz If cch is non-zero, this must either be pointing to
250 * a pointer to a buffer of the specified size, or
251 * pointer to a NULL pointer. If *ppsz is NULL or cch
252 * is zero a buffer of at least cch chars will be
253 * allocated to hold the translated string. If a
254 * buffer was requested it must be freed using
255 * RTStrFree().
256 * @param cch The buffer size in chars (the type). This includes
257 * the terminator.
258 * @param pcch Where to store the length of the translated string,
259 * excluding the terminator. (Optional)
260 *
261 * This may be set under some error conditions,
262 * however, only for VERR_BUFFER_OVERFLOW and
263 * VERR_NO_STR_MEMORY will it contain a valid string
264 * length that can be used to resize the buffer.
265 * @param pszTag Allocation tag used for statistics and such.
266 */
267RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszLatin1, size_t cchLatin1, char **ppsz, size_t cch, size_t *pcch,
268 const char *pszTag);
269
270/**
271 * Calculates the length of the Latin-1 string in UTF-8 chars (bytes).
272 *
273 * The primary purpose of this function is to help allocate buffers for
274 * RTLatin1ToUtf8() of the correct size. For most other purposes
275 * RTLatin1ToUtf8Ex() should be used.
276 *
277 * @returns Number of chars (bytes).
278 * @returns 0 if the string was incorrectly encoded.
279 * @param pszLatin1 The Latin-1 string.
280 */
281RTDECL(size_t) RTLatin1CalcUtf8Len(const char *pszLatin1);
282
283/**
284 * Calculates the length of the Latin-1 string in UTF-8 chars (bytes).
285 *
286 * @returns iprt status code.
287 * @param pszLatin1 The Latin-1 string.
288 * @param cchLatin1 The max string length. Use RTSTR_MAX to process the
289 * entire string.
290 * @param pcch Where to store the string length (in bytes). Optional.
291 * This is undefined on failure.
292 */
293RTDECL(int) RTLatin1CalcUtf8LenEx(const char *pszLatin1, size_t cchLatin1, size_t *pcch);
294
295/**
296 * Calculates the length of the Latin-1 (ISO-8859-1) string in RTUTF16 items.
297 *
298 * @returns Number of RTUTF16 items.
299 * @param pszLatin1 The Latin-1 string.
300 */
301RTDECL(size_t) RTLatin1CalcUtf16Len(const char *pszLatin1);
302
303/**
304 * Calculates the length of the Latin-1 (ISO-8859-1) string in RTUTF16 items.
305 *
306 * @returns iprt status code.
307 * @param pszLatin1 The Latin-1 string.
308 * @param cchLatin1 The max string length. Use RTSTR_MAX to process the
309 * entire string.
310 * @param pcwc Where to store the string length. Optional.
311 * This is undefined on failure.
312 */
313RTDECL(int) RTLatin1CalcUtf16LenEx(const char *pszLatin1, size_t cchLatin1, size_t *pcwc);
314
315/**
316 * Translate a Latin-1 (ISO-8859-1) string into a UTF-16 allocating the result
317 * buffer (default tag).
318 *
319 * @returns iprt status code.
320 * @param pszLatin1 The Latin-1 string to convert.
321 * @param ppwszString Receives pointer to the allocated UTF-16 string. The
322 * returned string must be freed using RTUtf16Free().
323 */
324#define RTLatin1ToUtf16(pszLatin1, ppwszString) RTLatin1ToUtf16Tag((pszLatin1), (ppwszString), RTSTR_TAG)
325
326/**
327 * Translate a Latin-1 (ISO-8859-1) string into a UTF-16 allocating the result
328 * buffer (custom tag).
329 *
330 * @returns iprt status code.
331 * @param pszLatin1 The Latin-1 string to convert.
332 * @param ppwszString Receives pointer to the allocated UTF-16 string. The
333 * returned string must be freed using RTUtf16Free().
334 * @param pszTag Allocation tag used for statistics and such.
335 */
336RTDECL(int) RTLatin1ToUtf16Tag(const char *pszLatin1, PRTUTF16 *ppwszString, const char *pszTag);
337
338/**
339 * Translates pszLatin1 from Latin-1 (ISO-8859-1) to UTF-16, allocating the
340 * result buffer if requested (default tag).
341 *
342 * @returns iprt status code.
343 * @param pszLatin1 The Latin-1 string to convert.
344 * @param cchLatin1 The maximum size in chars (the type) to convert. The
345 * conversion stops when it reaches cchLatin1 or the
346 * string terminator ('\\0'). Use RTSTR_MAX to
347 * translate the entire string.
348 * @param ppwsz If cwc is non-zero, this must either be pointing
349 * to pointer to a buffer of the specified size, or
350 * pointer to a NULL pointer.
351 * If *ppwsz is NULL or cwc is zero a buffer of at
352 * least cwc items will be allocated to hold the
353 * translated string. If a buffer was requested it
354 * must be freed using RTUtf16Free().
355 * @param cwc The buffer size in RTUTF16s. This includes the
356 * terminator.
357 * @param pcwc Where to store the length of the translated string,
358 * excluding the terminator. (Optional)
359 *
360 * This may be set under some error conditions,
361 * however, only for VERR_BUFFER_OVERFLOW and
362 * VERR_NO_STR_MEMORY will it contain a valid string
363 * length that can be used to resize the buffer.
364 */
365#define RTLatin1ToUtf16Ex(pszLatin1, cchLatin1, ppwsz, cwc, pcwc) \
366 RTLatin1ToUtf16ExTag((pszLatin1), (cchLatin1), (ppwsz), (cwc), (pcwc), RTSTR_TAG)
367
368/**
369 * Translates pszLatin1 from Latin-1 (ISO-8859-1) to UTF-16, allocating the
370 * result buffer if requested.
371 *
372 * @returns iprt status code.
373 * @param pszLatin1 The Latin-1 string to convert.
374 * @param cchLatin1 The maximum size in chars (the type) to convert. The
375 * conversion stops when it reaches cchLatin1 or the
376 * string terminator ('\\0'). Use RTSTR_MAX to
377 * translate the entire string.
378 * @param ppwsz If cwc is non-zero, this must either be pointing
379 * to pointer to a buffer of the specified size, or
380 * pointer to a NULL pointer.
381 * If *ppwsz is NULL or cwc is zero a buffer of at
382 * least cwc items will be allocated to hold the
383 * translated string. If a buffer was requested it
384 * must be freed using RTUtf16Free().
385 * @param cwc The buffer size in RTUTF16s. This includes the
386 * terminator.
387 * @param pcwc Where to store the length of the translated string,
388 * excluding the terminator. (Optional)
389 *
390 * This may be set under some error conditions,
391 * however, only for VERR_BUFFER_OVERFLOW and
392 * VERR_NO_STR_MEMORY will it contain a valid string
393 * length that can be used to resize the buffer.
394 * @param pszTag Allocation tag used for statistics and such.
395 */
396RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszLatin1, size_t cchLatin1,
397 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag);
398
399/** @} */
400
401RT_C_DECLS_END
402
403#endif /* !IPRT_INCLUDED_latin1_h */
404
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette