VirtualBox

source: vbox/trunk/include/iprt/uni.h@ 67989

Last change on this file since 67989 was 62473, checked in by vboxsync, 8 years ago

(C) 2016

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
Line 
1/** @file
2 * IPRT - Unicode Code Points.
3 */
4
5/*
6 * Copyright (C) 2006-2016 Oracle Corporation
7 *
8 * This file is part of VirtualBox Open Source Edition (OSE), as
9 * available from http://www.virtualbox.org. This file is free software;
10 * you can redistribute it and/or modify it under the terms of the GNU
11 * General Public License (GPL) as published by the Free Software
12 * Foundation, in version 2 as it comes in the "COPYING" file of the
13 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15 *
16 * The contents of this file may alternatively be used under the terms
17 * of the Common Development and Distribution License Version 1.0
18 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
19 * VirtualBox OSE distribution, in which case the provisions of the
20 * CDDL are applicable instead of those of the GPL.
21 *
22 * You may elect to license modified versions of this file under the
23 * terms and conditions of either the GPL or the CDDL or both.
24 */
25
26#ifndef ___iprt_uni_h
27#define ___iprt_uni_h
28
29/** @defgroup grp_rt_uni RTUniCp - Unicode Code Points
30 * @ingroup grp_rt
31 * @{
32 */
33
/** @def RTUNI_USE_WCTYPE
 * When defined, this header includes <wctype.h> and implements the code point
 * classification and case folding helpers with the C runtime library
 * functions instead of the IPRT unicode data tables.
 * It is defined here whenever DOXYGEN_RUNNING is defined. */
#if defined(DOXYGEN_RUNNING)
# define RTUNI_USE_WCTYPE
#endif
40
41#include <iprt/types.h>
42#ifdef RTUNI_USE_WCTYPE
43# include <wctype.h>
44#endif
45
46RT_C_DECLS_BEGIN
47
48
49#ifndef RTUNI_USE_WCTYPE
50
51/**
52 * A unicode flags range.
53 * @internal
54 */
55typedef struct RTUNIFLAGSRANGE
56{
57 /** The first code point of the range. */
58 RTUNICP BeginCP;
59 /** The last + 1 code point of the range. */
60 RTUNICP EndCP;
61 /** Pointer to the array of case folded code points. */
62 const uint8_t *pafFlags;
63} RTUNIFLAGSRANGE;
64/** Pointer to a flags range.
65 * @internal */
66typedef RTUNIFLAGSRANGE *PRTUNIFLAGSRANGE;
67/** Pointer to a const flags range.
68 * @internal */
69typedef const RTUNIFLAGSRANGE *PCRTUNIFLAGSRANGE;
70
71/**
72 * A unicode case folded range.
73 * @internal
74 */
75typedef struct RTUNICASERANGE
76{
77 /** The first code point of the range. */
78 RTUNICP BeginCP;
79 /** The last + 1 code point of the range. */
80 RTUNICP EndCP;
81 /** Pointer to the array of case folded code points. */
82 PCRTUNICP paFoldedCPs;
83} RTUNICASERANGE;
84/** Pointer to a case folded range.
85 * @internal */
86typedef RTUNICASERANGE *PRTUNICASERANGE;
87/** Pointer to a const case folded range.
88 * @internal */
89typedef const RTUNICASERANGE *PCRTUNICASERANGE;
90
/** @name Unicode Code Point Flags.
 * Bits found in the flag mask returned by rtUniCpFlags().
 * @internal
 * @{ */
/** Upper case; tested by RTUniCpIsUpper() and RTUniCpIsFoldable(). */
#define RTUNI_UPPER     RT_BIT(0)
/** Lower case; tested by RTUniCpIsLower() and RTUniCpIsFoldable(). */
#define RTUNI_LOWER     RT_BIT(1)
/** Alphabetic; tested by RTUniCpIsAlphabetic(). */
#define RTUNI_ALPHA     RT_BIT(2)
/** Hexadecimal digit; tested by RTUniCpIsHexDigit(). */
#define RTUNI_XDIGIT    RT_BIT(3)
/** Decimal digit; tested by RTUniCpIsDecDigit(). */
#define RTUNI_DDIGIT    RT_BIT(4)
/** White space; tested by RTUniCpIsSpace(). */
#define RTUNI_WSPACE    RT_BIT(5)
/* Bit 6 is set aside for a future RTUNI_BSPACE flag (not defined yet). */
/** When set, the codepoint requires further checking wrt NFC and NFD
 * normalization.  I.e. set when either of QC_NFD and QC_NFC are not Y. */
#define RTUNI_QC_NFX    RT_BIT(7)
/** @} */
105
106
107/**
108 * Array of flags ranges.
109 * @internal
110 */
111extern RTDATADECL(const RTUNIFLAGSRANGE) g_aRTUniFlagsRanges[];
112
113/**
114 * Gets the flags for a unicode code point.
115 *
116 * @returns The flag mask. (RTUNI_*)
117 * @param CodePoint The unicode code point.
118 * @internal
119 */
120DECLINLINE(RTUNICP) rtUniCpFlags(RTUNICP CodePoint)
121{
122 PCRTUNIFLAGSRANGE pCur = &g_aRTUniFlagsRanges[0];
123 do
124 {
125 if (pCur->EndCP > CodePoint)
126 {
127 if (pCur->BeginCP <= CodePoint)
128 return pCur->pafFlags[CodePoint - pCur->BeginCP];
129 break;
130 }
131 pCur++;
132 } while (pCur->EndCP != RTUNICP_MAX);
133 return 0;
134}
135
136
137/**
138 * Checks if a unicode code point is upper case.
139 *
140 * @returns true if it is.
141 * @returns false if it isn't.
142 * @param CodePoint The code point.
143 */
144DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
145{
146 return (rtUniCpFlags(CodePoint) & RTUNI_UPPER) != 0;
147}
148
149
150/**
151 * Checks if a unicode code point is lower case.
152 *
153 * @returns true if it is.
154 * @returns false if it isn't.
155 * @param CodePoint The code point.
156 */
157DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
158{
159 return (rtUniCpFlags(CodePoint) & RTUNI_LOWER) != 0;
160}
161
162
163/**
164 * Checks if a unicode code point is case foldable.
165 *
166 * @returns true if it is.
167 * @returns false if it isn't.
168 * @param CodePoint The code point.
169 */
170DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
171{
172 /* Right enough. */
173 return (rtUniCpFlags(CodePoint) & (RTUNI_LOWER | RTUNI_UPPER)) != 0;
174}
175
176
177/**
178 * Checks if a unicode code point is alphabetic.
179 *
180 * @returns true if it is.
181 * @returns false if it isn't.
182 * @param CodePoint The code point.
183 */
184DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
185{
186 return (rtUniCpFlags(CodePoint) & RTUNI_ALPHA) != 0;
187}
188
189
190/**
191 * Checks if a unicode code point is a decimal digit.
192 *
193 * @returns true if it is.
194 * @returns false if it isn't.
195 * @param CodePoint The code point.
196 */
197DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
198{
199 return (rtUniCpFlags(CodePoint) & RTUNI_DDIGIT) != 0;
200}
201
202
203/**
204 * Checks if a unicode code point is a hexadecimal digit.
205 *
206 * @returns true if it is.
207 * @returns false if it isn't.
208 * @param CodePoint The code point.
209 */
210DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
211{
212 return (rtUniCpFlags(CodePoint) & RTUNI_XDIGIT) != 0;
213}
214
215
216/**
217 * Checks if a unicode code point is white space.
218 *
219 * @returns true if it is.
220 * @returns false if it isn't.
221 * @param CodePoint The code point.
222 */
223DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
224{
225 return (rtUniCpFlags(CodePoint) & RTUNI_WSPACE) != 0;
226}
227
228
229
230/**
231 * Array of uppercase ranges.
232 * @internal
233 */
234extern RTDATADECL(const RTUNICASERANGE) g_aRTUniUpperRanges[];
235
236/**
237 * Array of lowercase ranges.
238 * @internal
239 */
240extern RTDATADECL(const RTUNICASERANGE) g_aRTUniLowerRanges[];
241
242
243/**
244 * Folds a unicode code point using the specified range array.
245 *
246 * @returns FOlded code point.
247 * @param CodePoint The unicode code point to fold.
248 * @param pCur The case folding range to use.
249 */
250DECLINLINE(RTUNICP) rtUniCpFold(RTUNICP CodePoint, PCRTUNICASERANGE pCur)
251{
252 do
253 {
254 if (pCur->EndCP > CodePoint)
255 {
256 if (pCur->BeginCP <= CodePoint)
257 CodePoint = pCur->paFoldedCPs[CodePoint - pCur->BeginCP];
258 break;
259 }
260 pCur++;
261 } while (pCur->EndCP != RTUNICP_MAX);
262 return CodePoint;
263}
264
265
266/**
267 * Folds a unicode code point to upper case.
268 *
269 * @returns Folded code point.
270 * @param CodePoint The unicode code point to fold.
271 */
272DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
273{
274 return rtUniCpFold(CodePoint, &g_aRTUniUpperRanges[0]);
275}
276
277
278/**
279 * Folds a unicode code point to lower case.
280 *
281 * @returns Folded code point.
282 * @param CodePoint The unicode code point to fold.
283 */
284DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
285{
286 return rtUniCpFold(CodePoint, &g_aRTUniLowerRanges[0]);
287}
288
289
290#else /* RTUNI_USE_WCTYPE */
291
292
293/**
294 * Checks if a unicode code point is upper case.
295 *
296 * @returns true if it is.
297 * @returns false if it isn't.
298 * @param CodePoint The code point.
299 */
300DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
301{
302 return !!iswupper(CodePoint);
303}
304
305
306/**
307 * Checks if a unicode code point is lower case.
308 *
309 * @returns true if it is.
310 * @returns false if it isn't.
311 * @param CodePoint The code point.
312 */
313DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
314{
315 return !!iswlower(CodePoint);
316}
317
318
319/**
320 * Checks if a unicode code point is case foldable.
321 *
322 * @returns true if it is.
323 * @returns false if it isn't.
324 * @param CodePoint The code point.
325 */
326DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
327{
328 /* Right enough. */
329 return iswupper(CodePoint) || iswlower(CodePoint);
330}
331
332
333/**
334 * Checks if a unicode code point is alphabetic.
335 *
336 * @returns true if it is.
337 * @returns false if it isn't.
338 * @param CodePoint The code point.
339 */
340DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
341{
342 return !!iswalpha(CodePoint);
343}
344
345
346/**
347 * Checks if a unicode code point is a decimal digit.
348 *
349 * @returns true if it is.
350 * @returns false if it isn't.
351 * @param CodePoint The code point.
352 */
353DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
354{
355 return !!iswdigit(CodePoint);
356}
357
358
359/**
360 * Checks if a unicode code point is a hexadecimal digit.
361 *
362 * @returns true if it is.
363 * @returns false if it isn't.
364 * @param CodePoint The code point.
365 */
366DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
367{
368 return !!iswxdigit(CodePoint);
369}
370
371
372/**
373 * Checks if a unicode code point is white space.
374 *
375 * @returns true if it is.
376 * @returns false if it isn't.
377 * @param CodePoint The code point.
378 */
379DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
380{
381 return !!iswspace(CodePoint);
382}
383
384
385/**
386 * Folds a unicode code point to upper case.
387 *
388 * @returns Folded code point.
389 * @param CodePoint The unicode code point to fold.
390 */
391DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
392{
393 return towupper(CodePoint);
394}
395
396
397/**
398 * Folds a unicode code point to lower case.
399 *
400 * @returns Folded code point.
401 * @param CodePoint The unicode code point to fold.
402 */
403DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
404{
405 return towlower(CodePoint);
406}
407
408
409#endif /* RTUNI_USE_WCTYPE */
410
411
412/**
413 * Frees a unicode string.
414 *
415 * @param pusz The string to free.
416 */
417RTDECL(void) RTUniFree(PRTUNICP pusz);
418
419
420/**
421 * Checks if a code point valid.
422 *
423 * Any code point (defined or not) within the 17 unicode planes (0 thru 16),
424 * except surrogates will be considered valid code points by this function.
425 *
426 * @returns true if in range, false if not.
427 * @param CodePoint The unicode code point to validate.
428 */
429DECLINLINE(bool) RTUniCpIsValid(RTUNICP CodePoint)
430{
431 return CodePoint <= 0x00d7ff
432 || ( CodePoint <= 0x10ffff
433 && CodePoint >= 0x00e000);
434}
435
436
437/**
438 * Checks if the given code point is in the BMP range.
439 *
440 * Surrogates are not considered in the BMP range by this function.
441 *
442 * @returns true if in BMP, false if not.
443 * @param CodePoint The unicode code point to consider.
444 */
445DECLINLINE(bool) RTUniCpIsBMP(RTUNICP CodePoint)
446{
447 return CodePoint <= 0xd7ff
448 || ( CodePoint <= 0xffff
449 && CodePoint >= 0xe000);
450}
451
452
453/**
454 * Folds a unicode code point to lower case.
455 *
456 * @returns Folded code point.
457 * @param CodePoint The unicode code point to fold.
458 */
459DECLINLINE(size_t) RTUniCpCalcUtf8Len(RTUNICP CodePoint)
460{
461 if (CodePoint < 0x80)
462 return 1;
463 return 2
464 + (CodePoint >= 0x00000800)
465 + (CodePoint >= 0x00010000)
466 + (CodePoint >= 0x00200000)
467 + (CodePoint >= 0x04000000)
468 + (CodePoint >= 0x80000000) /* illegal */;
469}
470
471
472
473RT_C_DECLS_END
474/** @} */
475
476
477#endif
478
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette