utf-8-case.cpp@ 45221

Last change on this file since 45221 was 33562, checked in by vboxsync, 14 years ago
RTStrToUpper,RTStrToLower: Fixed bad assumptions that lower and upper case chars are encoded with the same length (this is only true for the upper<->lower roundtrip). Also implemented the quiet handling of invalid coded sequences.
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 9.9 KB

Line
1	/* $Id: utf-8-case.cpp 33562 2010-10-28 14:38:50Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Case Sensitivity and Folding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2010 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*******************************************************************************
29	* Header Files *
30	*******************************************************************************/
31	#include <iprt/string.h>
32	#include "internal/iprt.h"
33
34	#include <iprt/uni.h>
35	#include <iprt/alloc.h>
36	#include <iprt/assert.h>
37	#include <iprt/err.h>
38	#include "internal/string.h"
39
40
41
42	/**
43	* Performs a case insensitive string compare between two UTF-8 strings.
44	*
45	* This is a simplified compare, as only the simplified lower/upper case folding
46	* specified by the unicode specs are used. It does not consider character pairs
47	* as they are used in some languages, just simple upper & lower case compares.
48	*
49	* The result is the difference between the mismatching codepoints after they
50	* both have been lower cased.
51	*
52	* If the string encoding is invalid the function will assert (strict builds)
53	* and use RTStrCmp for the remainder of the string.
54	*
55	* @returns < 0 if the first string less than the second string.
56	* @returns 0 if the first string identical to the second string.
57	* @returns > 0 if the first string greater than the second string.
58	* @param psz1 First UTF-8 string. Null is allowed.
59	* @param psz2 Second UTF-8 string. Null is allowed.
60	*/
61	RTDECL(int) RTStrICmp(const char psz1, const char psz2)
62	{
63	if (psz1 == psz2)
64	return 0;
65	if (!psz1)
66	return -1;
67	if (!psz2)
68	return 1;
69
70	const char *pszStart1 = psz1;
71	for (;;)
72	{
73	/* Get the codepoints */
74	RTUNICP uc1;
75	int rc = RTStrGetCpEx(&psz1, &uc1);
76	if (RT_FAILURE(rc))
77	{
78	AssertRC(rc);
79	psz1--;
80	break;
81	}
82
83	RTUNICP uc2;
84	rc = RTStrGetCpEx(&psz2, &uc2);
85	if (RT_FAILURE(rc))
86	{
87	AssertRC(rc);
88	psz2--;
89	psz1 = RTStrPrevCp(pszStart1, psz1);
90	break;
91	}
92
93	/* compare */
94	int iDiff = uc1 - uc2;
95	if (iDiff)
96	{
97	iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
98	if (iDiff)
99	{
100	iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
101	if (iDiff)
102	return iDiff;
103	}
104	}
105
106	/* hit the terminator? */
107	if (!uc1)
108	return 0;
109	}
110
111	/* Hit some bad encoding, continue in case sensitive mode. */
112	return RTStrCmp(psz1, psz2);
113	}
114	RT_EXPORT_SYMBOL(RTStrICmp);
115
116
117	/**
118	* Performs a case insensitive string compare between two UTF-8 strings, given a
119	* maximum string length.
120	*
121	* This is a simplified compare, as only the simplified lower/upper case folding
122	* specified by the unicode specs are used. It does not consider character pairs
123	* as they are used in some languages, just simple upper & lower case compares.
124	*
125	* The result is the difference between the mismatching codepoints after they
126	* both have been lower cased.
127	*
128	* If the string encoding is invalid the function will assert (strict builds)
129	* and use RTStrCmp for the remainder of the string.
130	*
131	* @returns < 0 if the first string less than the second string.
132	* @returns 0 if the first string identical to the second string.
133	* @returns > 0 if the first string greater than the second string.
134	* @param psz1 First UTF-8 string. Null is allowed.
135	* @param psz2 Second UTF-8 string. Null is allowed.
136	* @param cchMax Maximum string length
137	*/
138	RTDECL(int) RTStrNICmp(const char psz1, const char psz2, size_t cchMax)
139	{
140	if (cchMax == 0)
141	return 0;
142	if (psz1 == psz2)
143	return 0;
144	if (!psz1)
145	return -1;
146	if (!psz2)
147	return 1;
148
149	for (;;)
150	{
151	/* Get the codepoints */
152	RTUNICP uc1;
153	size_t cchMax2 = cchMax;
154	int rc = RTStrGetCpNEx(&psz1, &cchMax, &uc1);
155	if (RT_FAILURE(rc))
156	{
157	AssertRC(rc);
158	psz1--;
159	cchMax++;
160	break;
161	}
162
163	RTUNICP uc2;
164	rc = RTStrGetCpNEx(&psz2, &cchMax2, &uc2);
165	if (RT_FAILURE(rc))
166	{
167	AssertRC(rc);
168	psz2--;
169	psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
170	cchMax = cchMax2 + 1;
171	break;
172	}
173
174	/* compare */
175	int iDiff = uc1 - uc2;
176	if (iDiff)
177	{
178	iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
179	if (iDiff)
180	{
181	iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
182	if (iDiff)
183	return iDiff;
184	}
185	}
186
187	/* hit the terminator? */
188	if (!uc1 \|\| cchMax == 0)
189	return 0;
190	}
191
192	/* Hit some bad encoding, continue in case insensitive mode. */
193	return RTStrNCmp(psz1, psz2, cchMax);
194	}
195	RT_EXPORT_SYMBOL(RTStrNICmp);
196
197
198	RTDECL(char ) RTStrIStr(const char pszHaystack, const char *pszNeedle)
199	{
200	/* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
201	if (!pszHaystack)
202	return NULL;
203	if (!pszNeedle)
204	return NULL;
205
206	/* The empty string matches everything. */
207	if (!*pszNeedle)
208	return (char *)pszHaystack;
209
210	/*
211	* The search strategy is to pick out the first char of the needle, fold it,
212	* and match it against the haystack code point by code point. When encountering
213	* a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
214	*/
215	const char * const pszNeedleStart = pszNeedle;
216	RTUNICP Cp0;
217	RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
218	size_t const cchNeedle = strlen(pszNeedle);
219	size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
220	RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
221	RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
222	if ( Cp0Lower == Cp0Upper
223	&& Cp0Lower == Cp0)
224	{
225	/* Cp0 is not a case sensitive char. */
226	for (;;)
227	{
228	RTUNICP Cp;
229	RTStrGetCpEx(&pszHaystack, &Cp);
230	if (!Cp)
231	break;
232	if ( Cp == Cp0
233	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
234	return (char *)pszHaystack - cchNeedleCp0;
235	}
236	}
237	else if ( Cp0Lower == Cp0
238	\|\| Cp0Upper != Cp0)
239	{
240	/* Cp0 is case sensitive */
241	for (;;)
242	{
243	RTUNICP Cp;
244	RTStrGetCpEx(&pszHaystack, &Cp);
245	if (!Cp)
246	break;
247	if ( ( Cp == Cp0Upper
248	\|\| Cp == Cp0Lower)
249	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
250	return (char *)pszHaystack - cchNeedleCp0;
251	}
252	}
253	else
254	{
255	/* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
256	for (;;)
257	{
258	RTUNICP Cp;
259	RTStrGetCpEx(&pszHaystack, &Cp);
260	if (!Cp)
261	break;
262	if ( ( Cp == Cp0
263	\|\| Cp == Cp0Upper
264	\|\| Cp == Cp0Lower)
265	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
266	return (char *)pszHaystack - cchNeedleCp0;
267	}
268	}
269
270
271	return NULL;
272	}
273	RT_EXPORT_SYMBOL(RTStrIStr);
274
275
276	RTDECL(char ) RTStrToLower(char psz)
277	{
278	/*
279	* Loop the code points in the string, converting them one by one.
280	*
281	* ASSUMES that the folded code points have an encoding that is equal or
282	* shorter than the original (this is presently correct).
283	*/
284	const char *pszSrc = psz;
285	char *pszDst = psz;
286	RTUNICP uc;
287	do
288	{
289	int rc = RTStrGetCpEx(&pszSrc, &uc);
290	if (RT_SUCCESS(rc))
291	{
292	uc = RTUniCpToLower(uc);
293	pszDst = RTStrPutCp(pszDst, uc);
294	}
295	else
296	{
297	/* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
298	AssertRC(rc);
299	*pszDst++ = pszSrc[-1];
300	}
301	Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
302	} while (uc != 0);
303
304	return psz;
305	}
306	RT_EXPORT_SYMBOL(RTStrToLower);
307
308
309	RTDECL(char ) RTStrToUpper(char psz)
310	{
311	/*
312	* Loop the code points in the string, converting them one by one.
313	*
314	* ASSUMES that the folded code points have an encoding that is equal or
315	* shorter than the original (this is presently correct).
316	*/
317	const char *pszSrc = psz;
318	char *pszDst = psz;
319	RTUNICP uc;
320	do
321	{
322	int rc = RTStrGetCpEx(&pszSrc, &uc);
323	if (RT_SUCCESS(rc))
324	{
325	uc = RTUniCpToUpper(uc);
326	pszDst = RTStrPutCp(pszDst, uc);
327	}
328	else
329	{
330	/* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
331	AssertRC(rc);
332	*pszDst++ = pszSrc[-1];
333	}
334	Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
335	} while (uc != 0);
336
337	return psz;
338	}
339	RT_EXPORT_SYMBOL(RTStrToUpper);
340

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8-case.cpp@ 45221

Download in other formats: