utf-8-case.cpp

Last change on this file was 106061, checked in by vboxsync, 3 months ago
Copyright year updates by scm.
Property svn:eol-style set to `native` Property svn:keywords set to `Id Revision`
File size: 9.0 KB

Line
1	/* $Id: utf-8-case.cpp 106061 2024-09-16 14:03:52Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Case Sensitivity and Folding, Part 1.
4	*/
5
6	/*
7	* Copyright (C) 2006-2024 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* The contents of this file may alternatively be used under the terms
26	* of the Common Development and Distribution License Version 1.0
27	* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28	* in the VirtualBox distribution, in which case the provisions of the
29	* CDDL are applicable instead of those of the GPL.
30	*
31	* You may elect to license modified versions of this file under the
32	* terms and conditions of either the GPL or the CDDL or both.
33	*
34	* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35	*/
36
37
38	/*********************************************************************************************************************************
39	* Header Files *
40	*********************************************************************************************************************************/
41	#include <iprt/string.h>
42	#include "internal/iprt.h"
43
44	#include <iprt/uni.h>
45	#include <iprt/alloc.h>
46	#include <iprt/assert.h>
47	#include <iprt/errcore.h>
48	#include "internal/string.h"
49
50
51
52	RTDECL(int) RTStrICmp(const char psz1, const char psz2)
53	{
54	if (psz1 == psz2)
55	return 0;
56	if (!psz1)
57	return -1;
58	if (!psz2)
59	return 1;
60
61	const char *pszStart1 = psz1;
62	for (;;)
63	{
64	/* Get the codepoints */
65	RTUNICP uc1;
66	int rc = RTStrGetCpEx(&psz1, &uc1);
67	if (RT_FAILURE(rc))
68	{
69	AssertRC(rc);
70	psz1--;
71	break;
72	}
73
74	RTUNICP uc2;
75	rc = RTStrGetCpEx(&psz2, &uc2);
76	if (RT_FAILURE(rc))
77	{
78	AssertRC(rc);
79	psz2--;
80	psz1 = RTStrPrevCp(pszStart1, psz1);
81	break;
82	}
83
84	/* compare */
85	int iDiff = uc1 - uc2;
86	if (iDiff)
87	{
88	iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
89	if (iDiff)
90	{
91	iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
92	if (iDiff)
93	return iDiff;
94	}
95	}
96
97	/* hit the terminator? */
98	if (!uc1)
99	return 0;
100	}
101
102	/* Hit some bad encoding, continue in case sensitive mode. */
103	return RTStrCmp(psz1, psz2);
104	}
105	RT_EXPORT_SYMBOL(RTStrICmp);
106
107
108	RTDECL(int) RTStrNICmp(const char psz1, const char psz2, size_t cchMax)
109	{
110	if (cchMax == 0)
111	return 0;
112	if (psz1 == psz2)
113	return 0;
114	if (!psz1)
115	return -1;
116	if (!psz2)
117	return 1;
118
119	for (;;)
120	{
121	/* Get the codepoints */
122	RTUNICP uc1;
123	size_t cchMax2 = cchMax;
124	int rc = RTStrGetCpNEx(&psz1, &cchMax, &uc1);
125	if (RT_FAILURE(rc))
126	{
127	AssertRC(rc);
128	psz1--;
129	cchMax++;
130	break;
131	}
132
133	RTUNICP uc2;
134	rc = RTStrGetCpNEx(&psz2, &cchMax2, &uc2);
135	if (RT_FAILURE(rc))
136	{
137	AssertRC(rc);
138	psz2--;
139	psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
140	cchMax = cchMax2 + 1;
141	break;
142	}
143
144	/* compare */
145	int iDiff = uc1 - uc2;
146	if (iDiff)
147	{
148	iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
149	if (iDiff)
150	{
151	iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
152	if (iDiff)
153	return iDiff;
154	}
155	}
156
157	/* hit the terminator? */
158	if (!uc1 \|\| cchMax == 0)
159	return 0;
160	}
161
162	/* Hit some bad encoding, continue in case insensitive mode. */
163	return RTStrNCmp(psz1, psz2, cchMax);
164	}
165	RT_EXPORT_SYMBOL(RTStrNICmp);
166
167
168	RTDECL(char ) RTStrIStr(const char pszHaystack, const char *pszNeedle)
169	{
170	/* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
171	if (!pszHaystack)
172	return NULL;
173	if (!pszNeedle)
174	return NULL;
175
176	/* The empty string matches everything. */
177	if (!*pszNeedle)
178	return (char *)pszHaystack;
179
180	/*
181	* The search strategy is to pick out the first char of the needle, fold it,
182	* and match it against the haystack code point by code point. When encountering
183	* a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
184	*/
185	const char * const pszNeedleStart = pszNeedle;
186	RTUNICP Cp0;
187	RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
188	size_t const cchNeedle = strlen(pszNeedle);
189	size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
190	RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
191	RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
192	if ( Cp0Lower == Cp0Upper
193	&& Cp0Lower == Cp0)
194	{
195	/* Cp0 is not a case sensitive char. */
196	for (;;)
197	{
198	RTUNICP Cp;
199	RTStrGetCpEx(&pszHaystack, &Cp);
200	if (!Cp)
201	break;
202	if ( Cp == Cp0
203	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
204	return (char *)pszHaystack - cchNeedleCp0;
205	}
206	}
207	else if ( Cp0Lower == Cp0
208	\|\| Cp0Upper != Cp0)
209	{
210	/* Cp0 is case sensitive */
211	for (;;)
212	{
213	RTUNICP Cp;
214	RTStrGetCpEx(&pszHaystack, &Cp);
215	if (!Cp)
216	break;
217	if ( ( Cp == Cp0Upper
218	\|\| Cp == Cp0Lower)
219	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
220	return (char *)pszHaystack - cchNeedleCp0;
221	}
222	}
223	else
224	{
225	/* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
226	for (;;)
227	{
228	RTUNICP Cp;
229	RTStrGetCpEx(&pszHaystack, &Cp);
230	if (!Cp)
231	break;
232	if ( ( Cp == Cp0
233	\|\| Cp == Cp0Upper
234	\|\| Cp == Cp0Lower)
235	&& !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
236	return (char *)pszHaystack - cchNeedleCp0;
237	}
238	}
239
240
241	return NULL;
242	}
243	RT_EXPORT_SYMBOL(RTStrIStr);
244
245
246	RTDECL(char ) RTStrToLower(char psz)
247	{
248	/*
249	* Loop the code points in the string, converting them one by one.
250	*
251	* ASSUMES that the folded code points have an encoding that is equal or
252	* shorter than the original (this is presently correct).
253	*/
254	const char *pszSrc = psz;
255	char *pszDst = psz;
256	RTUNICP uc;
257	do
258	{
259	int rc = RTStrGetCpEx(&pszSrc, &uc);
260	if (RT_SUCCESS(rc))
261	{
262	RTUNICP uc2 = RTUniCpToLower(uc);
263	if (RT_LIKELY( uc2 == uc
264	\|\| RTUniCpCalcUtf8Len(uc2) == RTUniCpCalcUtf8Len(uc)))
265	pszDst = RTStrPutCp(pszDst, uc2);
266	else
267	pszDst = RTStrPutCp(pszDst, uc);
268	}
269	else
270	{
271	/* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
272	AssertRC(rc);
273	*pszDst++ = pszSrc[-1];
274	}
275	Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
276	} while (uc != 0);
277
278	return psz;
279	}
280	RT_EXPORT_SYMBOL(RTStrToLower);
281
282
283	RTDECL(char ) RTStrToUpper(char psz)
284	{
285	/*
286	* Loop the code points in the string, converting them one by one.
287	*
288	* ASSUMES that the folded code points have an encoding that is equal or
289	* shorter than the original (this is presently correct).
290	*/
291	const char *pszSrc = psz;
292	char *pszDst = psz;
293	RTUNICP uc;
294	do
295	{
296	int rc = RTStrGetCpEx(&pszSrc, &uc);
297	if (RT_SUCCESS(rc))
298	{
299	RTUNICP uc2 = RTUniCpToUpper(uc);
300	if (RT_LIKELY( uc2 == uc
301	\|\| RTUniCpCalcUtf8Len(uc2) == RTUniCpCalcUtf8Len(uc)))
302	pszDst = RTStrPutCp(pszDst, uc2);
303	else
304	pszDst = RTStrPutCp(pszDst, uc);
305	}
306	else
307	{
308	/* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
309	AssertRC(rc);
310	*pszDst++ = pszSrc[-1];
311	}
312	Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
313	} while (uc != 0);
314
315	return psz;
316	}
317	RT_EXPORT_SYMBOL(RTStrToUpper);
318

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8-case.cpp

Download in other formats: