VirtualBox

source: vbox/trunk/include/iprt/uni.h@ 97966

Last change on this file since 97966 was 96407, checked in by vboxsync, 2 years ago

scm copyright and license note update

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1/** @file
2 * IPRT - Unicode Code Points.
3 */
4
5/*
6 * Copyright (C) 2006-2022 Oracle and/or its affiliates.
7 *
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
14 * License.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
23 *
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
29 *
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
32 *
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34 */
35
36#ifndef IPRT_INCLUDED_uni_h
37#define IPRT_INCLUDED_uni_h
38#ifndef RT_WITHOUT_PRAGMA_ONCE
39# pragma once
40#endif
41
42/** @defgroup grp_rt_uni RTUniCp - Unicode Code Points
43 * @ingroup grp_rt
44 * @{
45 */
46
47/** @def RTUNI_USE_WCTYPE
48 * Define RTUNI_USE_WCTYPE to not use the IPRT unicode data but the
49 * data which the C runtime library provides. */
50#ifdef DOXYGEN_RUNNING
51# define RTUNI_USE_WCTYPE
52#endif
53
54#include <iprt/types.h>
55#ifdef RTUNI_USE_WCTYPE
56# include <wctype.h>
57#endif
58
59RT_C_DECLS_BEGIN
60
61
62#ifndef RTUNI_USE_WCTYPE
63
64/**
65 * A unicode flags range: per code point flag bytes for [BeginCP, EndCP).
66 * @internal
67 */
68typedef struct RTUNIFLAGSRANGE
69{
70 /** The first code point of the range (inclusive). */
71 RTUNICP BeginCP;
72 /** The last + 1 code point of the range, i.e. the exclusive end. */
73 RTUNICP EndCP;
74 /** Pointer to the array of RTUNI_* flag bytes, indexed by code point minus BeginCP. */
75 const uint8_t *pafFlags;
76} RTUNIFLAGSRANGE;
77/** Pointer to a flags range.
78 * @internal */
79typedef RTUNIFLAGSRANGE *PRTUNIFLAGSRANGE;
80/** Pointer to a const flags range.
81 * @internal */
82typedef const RTUNIFLAGSRANGE *PCRTUNIFLAGSRANGE;
83
84/**
85 * A unicode case folded range: replacement code points for [BeginCP, EndCP).
86 * @internal
87 */
88typedef struct RTUNICASERANGE
89{
90 /** The first code point of the range (inclusive). */
91 RTUNICP BeginCP;
92 /** The last + 1 code point of the range, i.e. the exclusive end. */
93 RTUNICP EndCP;
94 /** Pointer to the array of case folded code points, indexed by code point minus BeginCP. */
95 PCRTUNICP paFoldedCPs;
96} RTUNICASERANGE;
97/** Pointer to a case folded range.
98 * @internal */
99typedef RTUNICASERANGE *PRTUNICASERANGE;
100/** Pointer to a const case folded range.
101 * @internal */
102typedef const RTUNICASERANGE *PCRTUNICASERANGE;
103
104/** @name Unicode Code Point Flags.
105 * Bit values found in the mask returned by rtUniCpFlags().
106 * @internal
107 * @{ */
107#define RTUNI_UPPER RT_BIT(0) /**< Upper case; tested by RTUniCpIsUpper(). */
108#define RTUNI_LOWER RT_BIT(1) /**< Lower case; tested by RTUniCpIsLower(). */
109#define RTUNI_ALPHA RT_BIT(2) /**< Alphabetic; tested by RTUniCpIsAlphabetic(). */
110#define RTUNI_XDIGIT RT_BIT(3) /**< Hexadecimal digit; tested by RTUniCpIsHexDigit(). */
111#define RTUNI_DDIGIT RT_BIT(4) /**< Decimal digit; tested by RTUniCpIsDecDigit(). */
112#define RTUNI_WSPACE RT_BIT(5) /**< White space; tested by RTUniCpIsSpace(). */
113/*#define RTUNI_BSPACE RT_BIT(6) - later */
114/** When set, the codepoint requires further checking wrt NFC and NFD
115 * normalization. I.e. set when either of QC_NFD and QC_NFC are not Y. */
116#define RTUNI_QC_NFX RT_BIT(7)
117/** @} */
118
119
120/**
121 * Array of flags ranges; rtUniCpFlags() walks it from the start until it reaches an entry whose EndCP is RTUNICP_MAX.
122 * @internal
123 */
124extern RTDATADECL(const RTUNIFLAGSRANGE) g_aRTUniFlagsRanges[];
125
126/**
127 * Gets the flags for a unicode code point.
128 *
129 * @returns The flag mask. (RTUNI_*)
130 * @param CodePoint The unicode code point.
131 * @internal
132 */
133DECLINLINE(RTUNICP) rtUniCpFlags(RTUNICP CodePoint)
134{
135 PCRTUNIFLAGSRANGE pCur = &g_aRTUniFlagsRanges[0];
136 do
137 {
138 if (pCur->EndCP > CodePoint)
139 {
140 if (pCur->BeginCP <= CodePoint)
141 return pCur->pafFlags[CodePoint - pCur->BeginCP];
142 break;
143 }
144 pCur++;
145 } while (pCur->EndCP != RTUNICP_MAX);
146 return 0;
147}
148
149
150/**
151 * Checks if a unicode code point is upper case.
152 *
153 * @returns true if it is.
154 * @returns false if it isn't.
155 * @param CodePoint The code point.
156 */
157DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
158{
159 return (rtUniCpFlags(CodePoint) & RTUNI_UPPER) != 0;
160}
161
162
163/**
164 * Checks if a unicode code point is lower case.
165 *
166 * @returns true if it is.
167 * @returns false if it isn't.
168 * @param CodePoint The code point.
169 */
170DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
171{
172 return (rtUniCpFlags(CodePoint) & RTUNI_LOWER) != 0;
173}
174
175
176/**
177 * Checks if a unicode code point is case foldable.
178 *
179 * @returns true if it is.
180 * @returns false if it isn't.
181 * @param CodePoint The code point.
182 */
183DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
184{
185 /* Right enough. */
186 return (rtUniCpFlags(CodePoint) & (RTUNI_LOWER | RTUNI_UPPER)) != 0;
187}
188
189
190/**
191 * Checks if a unicode code point is alphabetic.
192 *
193 * @returns true if it is.
194 * @returns false if it isn't.
195 * @param CodePoint The code point.
196 */
197DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
198{
199 return (rtUniCpFlags(CodePoint) & RTUNI_ALPHA) != 0;
200}
201
202
203/**
204 * Checks if a unicode code point is a decimal digit.
205 *
206 * @returns true if it is.
207 * @returns false if it isn't.
208 * @param CodePoint The code point.
209 */
210DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
211{
212 return (rtUniCpFlags(CodePoint) & RTUNI_DDIGIT) != 0;
213}
214
215
216/**
217 * Checks if a unicode code point is a hexadecimal digit.
218 *
219 * @returns true if it is.
220 * @returns false if it isn't.
221 * @param CodePoint The code point.
222 */
223DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
224{
225 return (rtUniCpFlags(CodePoint) & RTUNI_XDIGIT) != 0;
226}
227
228
229/**
230 * Checks if a unicode code point is white space.
231 *
232 * @returns true if it is.
233 * @returns false if it isn't.
234 * @param CodePoint The code point.
235 */
236DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
237{
238 return (rtUniCpFlags(CodePoint) & RTUNI_WSPACE) != 0;
239}
240
241
242
243/**
244 * Array of uppercase ranges; the table RTUniCpToUpper() passes to rtUniCpFold().
245 * @internal
246 */
247extern RTDATADECL(const RTUNICASERANGE) g_aRTUniUpperRanges[];
248
249/**
250 * Array of lowercase ranges; the table RTUniCpToLower() passes to rtUniCpFold().
251 * @internal
252 */
253extern RTDATADECL(const RTUNICASERANGE) g_aRTUniLowerRanges[];
254
255
256/**
257 * Folds a unicode code point using the specified range array.
258 *
259 * @returns FOlded code point.
260 * @param CodePoint The unicode code point to fold.
261 * @param pCur The case folding range to use.
262 */
263DECLINLINE(RTUNICP) rtUniCpFold(RTUNICP CodePoint, PCRTUNICASERANGE pCur)
264{
265 do
266 {
267 if (pCur->EndCP > CodePoint)
268 {
269 if (pCur->BeginCP <= CodePoint)
270 CodePoint = pCur->paFoldedCPs[CodePoint - pCur->BeginCP];
271 break;
272 }
273 pCur++;
274 } while (pCur->EndCP != RTUNICP_MAX);
275 return CodePoint;
276}
277
278
279/**
280 * Folds a unicode code point to upper case.
281 *
282 * @returns Folded code point.
283 * @param CodePoint The unicode code point to fold.
284 */
285DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
286{
287 return rtUniCpFold(CodePoint, &g_aRTUniUpperRanges[0]);
288}
289
290
291/**
292 * Folds a unicode code point to lower case.
293 *
294 * @returns Folded code point.
295 * @param CodePoint The unicode code point to fold.
296 */
297DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
298{
299 return rtUniCpFold(CodePoint, &g_aRTUniLowerRanges[0]);
300}
301
302
303#else /* RTUNI_USE_WCTYPE */
304
305
306/**
307 * Checks if a unicode code point is upper case.
308 *
309 * @returns true if it is.
310 * @returns false if it isn't.
311 * @param CodePoint The code point.
312 */
313DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
314{
315 return !!iswupper(CodePoint);
316}
317
318
319/**
320 * Checks if a unicode code point is lower case.
321 *
322 * @returns true if it is.
323 * @returns false if it isn't.
324 * @param CodePoint The code point.
325 */
326DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
327{
328 return !!iswlower(CodePoint);
329}
330
331
332/**
333 * Checks if a unicode code point is case foldable.
334 *
335 * @returns true if it is.
336 * @returns false if it isn't.
337 * @param CodePoint The code point.
338 */
339DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
340{
341 /* Right enough. */
342 return iswupper(CodePoint) || iswlower(CodePoint);
343}
344
345
346/**
347 * Checks if a unicode code point is alphabetic.
348 *
349 * @returns true if it is.
350 * @returns false if it isn't.
351 * @param CodePoint The code point.
352 */
353DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
354{
355 return !!iswalpha(CodePoint);
356}
357
358
359/**
360 * Checks if a unicode code point is a decimal digit.
361 *
362 * @returns true if it is.
363 * @returns false if it isn't.
364 * @param CodePoint The code point.
365 */
366DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
367{
368 return !!iswdigit(CodePoint);
369}
370
371
372/**
373 * Checks if a unicode code point is a hexadecimal digit.
374 *
375 * @returns true if it is.
376 * @returns false if it isn't.
377 * @param CodePoint The code point.
378 */
379DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
380{
381 return !!iswxdigit(CodePoint);
382}
383
384
385/**
386 * Checks if a unicode code point is white space.
387 *
388 * @returns true if it is.
389 * @returns false if it isn't.
390 * @param CodePoint The code point.
391 */
392DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
393{
394 return !!iswspace(CodePoint);
395}
396
397
398/**
399 * Folds a unicode code point to upper case.
400 *
401 * @returns Folded code point.
402 * @param CodePoint The unicode code point to fold.
403 */
404DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
405{
406 return towupper(CodePoint);
407}
408
409
410/**
411 * Folds a unicode code point to lower case.
412 *
413 * @returns Folded code point.
414 * @param CodePoint The unicode code point to fold.
415 */
416DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
417{
418 return towlower(CodePoint);
419}
420
421
422#endif /* RTUNI_USE_WCTYPE */
423
424
425/**
426 * Frees a unicode string (an array of RTUNICP code points).
427 *
428 * @param pusz The string to free. NOTE(review): the allocator this pairs with is not visible in this header - confirm against the implementation.
429 */
430RTDECL(void) RTUniFree(PRTUNICP pusz);
431
432
433/**
434 * Checks if a code point valid.
435 *
436 * Any code point (defined or not) within the 17 unicode planes (0 thru 16),
437 * except surrogates will be considered valid code points by this function.
438 *
439 * @returns true if in range, false if not.
440 * @param CodePoint The unicode code point to validate.
441 */
442DECLINLINE(bool) RTUniCpIsValid(RTUNICP CodePoint)
443{
444 return CodePoint <= 0x00d7ff
445 || ( CodePoint <= 0x10ffff
446 && CodePoint >= 0x00e000);
447}
448
449
450/**
451 * Checks if the given code point is in the BMP range.
452 *
453 * Surrogates are not considered in the BMP range by this function.
454 *
455 * @returns true if in BMP, false if not.
456 * @param CodePoint The unicode code point to consider.
457 */
458DECLINLINE(bool) RTUniCpIsBMP(RTUNICP CodePoint)
459{
460 return CodePoint <= 0xd7ff
461 || ( CodePoint <= 0xffff
462 && CodePoint >= 0xe000);
463}
464
465
466/**
467 * Folds a unicode code point to lower case.
468 *
469 * @returns Folded code point.
470 * @param CodePoint The unicode code point to fold.
471 */
472DECLINLINE(size_t) RTUniCpCalcUtf8Len(RTUNICP CodePoint)
473{
474 if (CodePoint < 0x80)
475 return 1;
476 return 2
477 + (CodePoint >= 0x00000800)
478 + (CodePoint >= 0x00010000)
479 + (CodePoint >= 0x00200000)
480 + (CodePoint >= 0x04000000)
481 + (CodePoint >= 0x80000000) /* illegal */;
482}
483
484
485
486RT_C_DECLS_END
487/** @} */
488
489
490#endif /* !IPRT_INCLUDED_uni_h */
491
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette