VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8-case.cpp@ 30080

Last change on this file since 30080 was 28903, checked in by vboxsync, 15 years ago

IPRT: iconv cache.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 9.4 KB
Line 
1/* $Id: utf-8-case.cpp 28903 2010-04-29 14:58:12Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Case Sensitivity and Folding.
4 */
5
6/*
7 * Copyright (C) 2006-2009 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Performs a case insensitive string compare between two UTF-8 strings.
44 *
45 * This is a simplified compare, as only the simplified lower/upper case folding
46 * specified by the unicode specs are used. It does not consider character pairs
47 * as they are used in some languages, just simple upper & lower case compares.
48 *
49 * The result is the difference between the mismatching codepoints after they
50 * both have been lower cased.
51 *
52 * If the string encoding is invalid the function will assert (strict builds)
53 * and use RTStrCmp for the remainder of the string.
54 *
55 * @returns < 0 if the first string less than the second string.
56 * @returns 0 if the first string identical to the second string.
57 * @returns > 0 if the first string greater than the second string.
58 * @param psz1 First UTF-8 string. Null is allowed.
59 * @param psz2 Second UTF-8 string. Null is allowed.
60 */
61RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
62{
63 if (psz1 == psz2)
64 return 0;
65 if (!psz1)
66 return -1;
67 if (!psz2)
68 return 1;
69
70 const char *pszStart1 = psz1;
71 for (;;)
72 {
73 /* Get the codepoints */
74 RTUNICP cp1;
75 int rc = RTStrGetCpEx(&psz1, &cp1);
76 if (RT_FAILURE(rc))
77 {
78 AssertRC(rc);
79 psz1--;
80 break;
81 }
82
83 RTUNICP cp2;
84 rc = RTStrGetCpEx(&psz2, &cp2);
85 if (RT_FAILURE(rc))
86 {
87 AssertRC(rc);
88 psz2--;
89 psz1 = RTStrPrevCp(pszStart1, psz1);
90 break;
91 }
92
93 /* compare */
94 int iDiff = cp1 - cp2;
95 if (iDiff)
96 {
97 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
98 if (iDiff)
99 {
100 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
101 if (iDiff)
102 return iDiff;
103 }
104 }
105
106 /* hit the terminator? */
107 if (!cp1)
108 return 0;
109 }
110
111 /* Hit some bad encoding, continue in case sensitive mode. */
112 return RTStrCmp(psz1, psz2);
113}
114RT_EXPORT_SYMBOL(RTStrICmp);
115
116
117/**
118 * Performs a case insensitive string compare between two UTF-8 strings, given a
119 * maximum string length.
120 *
121 * This is a simplified compare, as only the simplified lower/upper case folding
122 * specified by the unicode specs are used. It does not consider character pairs
123 * as they are used in some languages, just simple upper & lower case compares.
124 *
125 * The result is the difference between the mismatching codepoints after they
126 * both have been lower cased.
127 *
128 * If the string encoding is invalid the function will assert (strict builds)
129 * and use RTStrCmp for the remainder of the string.
130 *
131 * @returns < 0 if the first string less than the second string.
132 * @returns 0 if the first string identical to the second string.
133 * @returns > 0 if the first string greater than the second string.
134 * @param psz1 First UTF-8 string. Null is allowed.
135 * @param psz2 Second UTF-8 string. Null is allowed.
136 * @param cchMax Maximum string length
137 */
138RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
139{
140 if (cchMax == 0)
141 return 0;
142 if (psz1 == psz2)
143 return 0;
144 if (!psz1)
145 return -1;
146 if (!psz2)
147 return 1;
148
149 for (;;)
150 {
151 /* Get the codepoints */
152 RTUNICP cp1;
153 size_t cchMax2 = cchMax;
154 int rc = RTStrGetCpNEx(&psz1, &cchMax, &cp1);
155 if (RT_FAILURE(rc))
156 {
157 AssertRC(rc);
158 psz1--;
159 cchMax++;
160 break;
161 }
162
163 RTUNICP cp2;
164 rc = RTStrGetCpNEx(&psz2, &cchMax2, &cp2);
165 if (RT_FAILURE(rc))
166 {
167 AssertRC(rc);
168 psz2--;
169 psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
170 cchMax = cchMax2 + 1;
171 break;
172 }
173
174 /* compare */
175 int iDiff = cp1 - cp2;
176 if (iDiff)
177 {
178 iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
179 if (iDiff)
180 {
181 iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
182 if (iDiff)
183 return iDiff;
184 }
185 }
186
187 /* hit the terminator? */
188 if (!cp1 || cchMax == 0)
189 return 0;
190 }
191
192 /* Hit some bad encoding, continue in case insensitive mode. */
193 return RTStrNCmp(psz1, psz2, cchMax);
194}
195RT_EXPORT_SYMBOL(RTStrNICmp);
196
197
198RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
199{
200 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
201 if (!pszHaystack)
202 return NULL;
203 if (!pszNeedle)
204 return NULL;
205
206 /* The empty string matches everything. */
207 if (!*pszNeedle)
208 return (char *)pszHaystack;
209
210 /*
211 * The search strategy is to pick out the first char of the needle, fold it,
212 * and match it against the haystack code point by code point. When encountering
213 * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
214 */
215 const char * const pszNeedleStart = pszNeedle;
216 RTUNICP Cp0;
217 RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
218 size_t const cchNeedle = strlen(pszNeedle);
219 size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
220 RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
221 RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
222 if ( Cp0Lower == Cp0Upper
223 && Cp0Lower == Cp0)
224 {
225 /* Cp0 is not a case sensitive char. */
226 for (;;)
227 {
228 RTUNICP Cp;
229 RTStrGetCpEx(&pszHaystack, &Cp);
230 if (!Cp)
231 break;
232 if ( Cp == Cp0
233 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
234 return (char *)pszHaystack - cchNeedleCp0;
235 }
236 }
237 else if ( Cp0Lower == Cp0
238 || Cp0Upper != Cp0)
239 {
240 /* Cp0 is case sensitive */
241 for (;;)
242 {
243 RTUNICP Cp;
244 RTStrGetCpEx(&pszHaystack, &Cp);
245 if (!Cp)
246 break;
247 if ( ( Cp == Cp0Upper
248 || Cp == Cp0Lower)
249 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
250 return (char *)pszHaystack - cchNeedleCp0;
251 }
252 }
253 else
254 {
255 /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
256 for (;;)
257 {
258 RTUNICP Cp;
259 RTStrGetCpEx(&pszHaystack, &Cp);
260 if (!Cp)
261 break;
262 if ( ( Cp == Cp0
263 || Cp == Cp0Upper
264 || Cp == Cp0Lower)
265 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
266 return (char *)pszHaystack - cchNeedleCp0;
267 }
268 }
269
270
271 return NULL;
272}
273RT_EXPORT_SYMBOL(RTStrIStr);
274
275
276RTDECL(char *) RTStrToLower(char *psz)
277{
278 /*
279 * Loop the code points in the string, converting them one by one.
280 * ASSUMES that the code points for upper and lower case are encoded
281 * with the exact same length.
282 */
283 /** @todo Handled bad encodings correctly+quietly, remove assumption,
284 * optimize. */
285 char *pszCur = psz;
286 while (*pszCur)
287 {
288 RTUNICP cp = RTStrGetCp(pszCur);
289 cp = RTUniCpToLower(cp);
290 pszCur = RTStrPutCp(pszCur, cp);
291 }
292 return psz;
293}
294RT_EXPORT_SYMBOL(RTStrToLower);
295
296
297RTDECL(char *) RTStrToUpper(char *psz)
298{
299 /*
300 * Loop the code points in the string, converting them one by one.
301 * ASSUMES that the code points for upper and lower case are encoded
302 * with the exact same length.
303 */
304 /** @todo Handled bad encodings correctly+quietly, remove assumption,
305 * optimize. */
306 char *pszCur = psz;
307 while(*pszCur)
308 {
309 RTUNICP cp = RTStrGetCp(pszCur);
310 cp = RTUniCpToUpper(cp);
311 pszCur = RTStrPutCp(pszCur, cp);
312 }
313 return psz;
314}
315RT_EXPORT_SYMBOL(RTStrToUpper);
316
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette