VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8-case.cpp@ 33982

Last change on this file since 33982 was 33562, checked in by vboxsync, 14 years ago

RTStrToUpper,RTStrToLower: Fixed bad assumptions that lower and upper case chars are encoded with the same length (this is only true for the upper<->lower roundtrip). Also implemented the quiet handling of invalid coded sequences.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 9.9 KB
Line 
1/* $Id: utf-8-case.cpp 33562 2010-10-28 14:38:50Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Case Sensitivity and Folding.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Performs a case insensitive string compare between two UTF-8 strings.
44 *
45 * This is a simplified compare, as only the simplified lower/upper case folding
46 * specified by the unicode specs are used. It does not consider character pairs
47 * as they are used in some languages, just simple upper & lower case compares.
48 *
49 * The result is the difference between the mismatching codepoints after they
50 * both have been lower cased.
51 *
52 * If the string encoding is invalid the function will assert (strict builds)
53 * and use RTStrCmp for the remainder of the string.
54 *
55 * @returns < 0 if the first string less than the second string.
56 * @returns 0 if the first string identical to the second string.
57 * @returns > 0 if the first string greater than the second string.
58 * @param psz1 First UTF-8 string. Null is allowed.
59 * @param psz2 Second UTF-8 string. Null is allowed.
60 */
61RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
62{
63 if (psz1 == psz2)
64 return 0;
65 if (!psz1)
66 return -1;
67 if (!psz2)
68 return 1;
69
70 const char *pszStart1 = psz1;
71 for (;;)
72 {
73 /* Get the codepoints */
74 RTUNICP uc1;
75 int rc = RTStrGetCpEx(&psz1, &uc1);
76 if (RT_FAILURE(rc))
77 {
78 AssertRC(rc);
79 psz1--;
80 break;
81 }
82
83 RTUNICP uc2;
84 rc = RTStrGetCpEx(&psz2, &uc2);
85 if (RT_FAILURE(rc))
86 {
87 AssertRC(rc);
88 psz2--;
89 psz1 = RTStrPrevCp(pszStart1, psz1);
90 break;
91 }
92
93 /* compare */
94 int iDiff = uc1 - uc2;
95 if (iDiff)
96 {
97 iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
98 if (iDiff)
99 {
100 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
101 if (iDiff)
102 return iDiff;
103 }
104 }
105
106 /* hit the terminator? */
107 if (!uc1)
108 return 0;
109 }
110
111 /* Hit some bad encoding, continue in case sensitive mode. */
112 return RTStrCmp(psz1, psz2);
113}
114RT_EXPORT_SYMBOL(RTStrICmp);
115
116
117/**
118 * Performs a case insensitive string compare between two UTF-8 strings, given a
119 * maximum string length.
120 *
121 * This is a simplified compare, as only the simplified lower/upper case folding
122 * specified by the unicode specs are used. It does not consider character pairs
123 * as they are used in some languages, just simple upper & lower case compares.
124 *
125 * The result is the difference between the mismatching codepoints after they
126 * both have been lower cased.
127 *
128 * If the string encoding is invalid the function will assert (strict builds)
129 * and use RTStrCmp for the remainder of the string.
130 *
131 * @returns < 0 if the first string less than the second string.
132 * @returns 0 if the first string identical to the second string.
133 * @returns > 0 if the first string greater than the second string.
134 * @param psz1 First UTF-8 string. Null is allowed.
135 * @param psz2 Second UTF-8 string. Null is allowed.
136 * @param cchMax Maximum string length
137 */
138RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
139{
140 if (cchMax == 0)
141 return 0;
142 if (psz1 == psz2)
143 return 0;
144 if (!psz1)
145 return -1;
146 if (!psz2)
147 return 1;
148
149 for (;;)
150 {
151 /* Get the codepoints */
152 RTUNICP uc1;
153 size_t cchMax2 = cchMax;
154 int rc = RTStrGetCpNEx(&psz1, &cchMax, &uc1);
155 if (RT_FAILURE(rc))
156 {
157 AssertRC(rc);
158 psz1--;
159 cchMax++;
160 break;
161 }
162
163 RTUNICP uc2;
164 rc = RTStrGetCpNEx(&psz2, &cchMax2, &uc2);
165 if (RT_FAILURE(rc))
166 {
167 AssertRC(rc);
168 psz2--;
169 psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
170 cchMax = cchMax2 + 1;
171 break;
172 }
173
174 /* compare */
175 int iDiff = uc1 - uc2;
176 if (iDiff)
177 {
178 iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
179 if (iDiff)
180 {
181 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
182 if (iDiff)
183 return iDiff;
184 }
185 }
186
187 /* hit the terminator? */
188 if (!uc1 || cchMax == 0)
189 return 0;
190 }
191
192 /* Hit some bad encoding, continue in case insensitive mode. */
193 return RTStrNCmp(psz1, psz2, cchMax);
194}
195RT_EXPORT_SYMBOL(RTStrNICmp);
196
197
198RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
199{
200 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
201 if (!pszHaystack)
202 return NULL;
203 if (!pszNeedle)
204 return NULL;
205
206 /* The empty string matches everything. */
207 if (!*pszNeedle)
208 return (char *)pszHaystack;
209
210 /*
211 * The search strategy is to pick out the first char of the needle, fold it,
212 * and match it against the haystack code point by code point. When encountering
213 * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
214 */
215 const char * const pszNeedleStart = pszNeedle;
216 RTUNICP Cp0;
217 RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
218 size_t const cchNeedle = strlen(pszNeedle);
219 size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
220 RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
221 RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
222 if ( Cp0Lower == Cp0Upper
223 && Cp0Lower == Cp0)
224 {
225 /* Cp0 is not a case sensitive char. */
226 for (;;)
227 {
228 RTUNICP Cp;
229 RTStrGetCpEx(&pszHaystack, &Cp);
230 if (!Cp)
231 break;
232 if ( Cp == Cp0
233 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
234 return (char *)pszHaystack - cchNeedleCp0;
235 }
236 }
237 else if ( Cp0Lower == Cp0
238 || Cp0Upper != Cp0)
239 {
240 /* Cp0 is case sensitive */
241 for (;;)
242 {
243 RTUNICP Cp;
244 RTStrGetCpEx(&pszHaystack, &Cp);
245 if (!Cp)
246 break;
247 if ( ( Cp == Cp0Upper
248 || Cp == Cp0Lower)
249 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
250 return (char *)pszHaystack - cchNeedleCp0;
251 }
252 }
253 else
254 {
255 /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
256 for (;;)
257 {
258 RTUNICP Cp;
259 RTStrGetCpEx(&pszHaystack, &Cp);
260 if (!Cp)
261 break;
262 if ( ( Cp == Cp0
263 || Cp == Cp0Upper
264 || Cp == Cp0Lower)
265 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
266 return (char *)pszHaystack - cchNeedleCp0;
267 }
268 }
269
270
271 return NULL;
272}
273RT_EXPORT_SYMBOL(RTStrIStr);
274
275
276RTDECL(char *) RTStrToLower(char *psz)
277{
278 /*
279 * Loop the code points in the string, converting them one by one.
280 *
281 * ASSUMES that the folded code points have an encoding that is equal or
282 * shorter than the original (this is presently correct).
283 */
284 const char *pszSrc = psz;
285 char *pszDst = psz;
286 RTUNICP uc;
287 do
288 {
289 int rc = RTStrGetCpEx(&pszSrc, &uc);
290 if (RT_SUCCESS(rc))
291 {
292 uc = RTUniCpToLower(uc);
293 pszDst = RTStrPutCp(pszDst, uc);
294 }
295 else
296 {
297 /* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
298 AssertRC(rc);
299 *pszDst++ = pszSrc[-1];
300 }
301 Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
302 } while (uc != 0);
303
304 return psz;
305}
306RT_EXPORT_SYMBOL(RTStrToLower);
307
308
309RTDECL(char *) RTStrToUpper(char *psz)
310{
311 /*
312 * Loop the code points in the string, converting them one by one.
313 *
314 * ASSUMES that the folded code points have an encoding that is equal or
315 * shorter than the original (this is presently correct).
316 */
317 const char *pszSrc = psz;
318 char *pszDst = psz;
319 RTUNICP uc;
320 do
321 {
322 int rc = RTStrGetCpEx(&pszSrc, &uc);
323 if (RT_SUCCESS(rc))
324 {
325 uc = RTUniCpToUpper(uc);
326 pszDst = RTStrPutCp(pszDst, uc);
327 }
328 else
329 {
330 /* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
331 AssertRC(rc);
332 *pszDst++ = pszSrc[-1];
333 }
334 Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
335 } while (uc != 0);
336
337 return psz;
338}
339RT_EXPORT_SYMBOL(RTStrToUpper);
340
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette