VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8-case.cpp

Last change on this file was 106061, checked in by vboxsync, 3 months ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 9.0 KB
Line 
1/* $Id: utf-8-case.cpp 106061 2024-09-16 14:03:52Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Case Sensitivity and Folding, Part 1.
4 */
5
6/*
7 * Copyright (C) 2006-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*********************************************************************************************************************************
39* Header Files *
40*********************************************************************************************************************************/
41#include <iprt/string.h>
42#include "internal/iprt.h"
43
44#include <iprt/uni.h>
45#include <iprt/alloc.h>
46#include <iprt/assert.h>
47#include <iprt/errcore.h>
48#include "internal/string.h"
49
50
51
52RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
53{
54 if (psz1 == psz2)
55 return 0;
56 if (!psz1)
57 return -1;
58 if (!psz2)
59 return 1;
60
61 const char *pszStart1 = psz1;
62 for (;;)
63 {
64 /* Get the codepoints */
65 RTUNICP uc1;
66 int rc = RTStrGetCpEx(&psz1, &uc1);
67 if (RT_FAILURE(rc))
68 {
69 AssertRC(rc);
70 psz1--;
71 break;
72 }
73
74 RTUNICP uc2;
75 rc = RTStrGetCpEx(&psz2, &uc2);
76 if (RT_FAILURE(rc))
77 {
78 AssertRC(rc);
79 psz2--;
80 psz1 = RTStrPrevCp(pszStart1, psz1);
81 break;
82 }
83
84 /* compare */
85 int iDiff = uc1 - uc2;
86 if (iDiff)
87 {
88 iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
89 if (iDiff)
90 {
91 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
92 if (iDiff)
93 return iDiff;
94 }
95 }
96
97 /* hit the terminator? */
98 if (!uc1)
99 return 0;
100 }
101
102 /* Hit some bad encoding, continue in case sensitive mode. */
103 return RTStrCmp(psz1, psz2);
104}
105RT_EXPORT_SYMBOL(RTStrICmp);
106
107
108RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
109{
110 if (cchMax == 0)
111 return 0;
112 if (psz1 == psz2)
113 return 0;
114 if (!psz1)
115 return -1;
116 if (!psz2)
117 return 1;
118
119 for (;;)
120 {
121 /* Get the codepoints */
122 RTUNICP uc1;
123 size_t cchMax2 = cchMax;
124 int rc = RTStrGetCpNEx(&psz1, &cchMax, &uc1);
125 if (RT_FAILURE(rc))
126 {
127 AssertRC(rc);
128 psz1--;
129 cchMax++;
130 break;
131 }
132
133 RTUNICP uc2;
134 rc = RTStrGetCpNEx(&psz2, &cchMax2, &uc2);
135 if (RT_FAILURE(rc))
136 {
137 AssertRC(rc);
138 psz2--;
139 psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
140 cchMax = cchMax2 + 1;
141 break;
142 }
143
144 /* compare */
145 int iDiff = uc1 - uc2;
146 if (iDiff)
147 {
148 iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
149 if (iDiff)
150 {
151 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
152 if (iDiff)
153 return iDiff;
154 }
155 }
156
157 /* hit the terminator? */
158 if (!uc1 || cchMax == 0)
159 return 0;
160 }
161
162 /* Hit some bad encoding, continue in case insensitive mode. */
163 return RTStrNCmp(psz1, psz2, cchMax);
164}
165RT_EXPORT_SYMBOL(RTStrNICmp);
166
167
168RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
169{
170 /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
171 if (!pszHaystack)
172 return NULL;
173 if (!pszNeedle)
174 return NULL;
175
176 /* The empty string matches everything. */
177 if (!*pszNeedle)
178 return (char *)pszHaystack;
179
180 /*
181 * The search strategy is to pick out the first char of the needle, fold it,
182 * and match it against the haystack code point by code point. When encountering
183 * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
184 */
185 const char * const pszNeedleStart = pszNeedle;
186 RTUNICP Cp0;
187 RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
188 size_t const cchNeedle = strlen(pszNeedle);
189 size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
190 RTUNICP const Cp0Lower = RTUniCpToLower(Cp0);
191 RTUNICP const Cp0Upper = RTUniCpToUpper(Cp0);
192 if ( Cp0Lower == Cp0Upper
193 && Cp0Lower == Cp0)
194 {
195 /* Cp0 is not a case sensitive char. */
196 for (;;)
197 {
198 RTUNICP Cp;
199 RTStrGetCpEx(&pszHaystack, &Cp);
200 if (!Cp)
201 break;
202 if ( Cp == Cp0
203 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
204 return (char *)pszHaystack - cchNeedleCp0;
205 }
206 }
207 else if ( Cp0Lower == Cp0
208 || Cp0Upper != Cp0)
209 {
210 /* Cp0 is case sensitive */
211 for (;;)
212 {
213 RTUNICP Cp;
214 RTStrGetCpEx(&pszHaystack, &Cp);
215 if (!Cp)
216 break;
217 if ( ( Cp == Cp0Upper
218 || Cp == Cp0Lower)
219 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
220 return (char *)pszHaystack - cchNeedleCp0;
221 }
222 }
223 else
224 {
225 /* Cp0 is case sensitive and folds to two difference chars. (paranoia) */
226 for (;;)
227 {
228 RTUNICP Cp;
229 RTStrGetCpEx(&pszHaystack, &Cp);
230 if (!Cp)
231 break;
232 if ( ( Cp == Cp0
233 || Cp == Cp0Upper
234 || Cp == Cp0Lower)
235 && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
236 return (char *)pszHaystack - cchNeedleCp0;
237 }
238 }
239
240
241 return NULL;
242}
243RT_EXPORT_SYMBOL(RTStrIStr);
244
245
246RTDECL(char *) RTStrToLower(char *psz)
247{
248 /*
249 * Loop the code points in the string, converting them one by one.
250 *
251 * ASSUMES that the folded code points have an encoding that is equal or
252 * shorter than the original (this is presently correct).
253 */
254 const char *pszSrc = psz;
255 char *pszDst = psz;
256 RTUNICP uc;
257 do
258 {
259 int rc = RTStrGetCpEx(&pszSrc, &uc);
260 if (RT_SUCCESS(rc))
261 {
262 RTUNICP uc2 = RTUniCpToLower(uc);
263 if (RT_LIKELY( uc2 == uc
264 || RTUniCpCalcUtf8Len(uc2) == RTUniCpCalcUtf8Len(uc)))
265 pszDst = RTStrPutCp(pszDst, uc2);
266 else
267 pszDst = RTStrPutCp(pszDst, uc);
268 }
269 else
270 {
271 /* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
272 AssertRC(rc);
273 *pszDst++ = pszSrc[-1];
274 }
275 Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
276 } while (uc != 0);
277
278 return psz;
279}
280RT_EXPORT_SYMBOL(RTStrToLower);
281
282
283RTDECL(char *) RTStrToUpper(char *psz)
284{
285 /*
286 * Loop the code points in the string, converting them one by one.
287 *
288 * ASSUMES that the folded code points have an encoding that is equal or
289 * shorter than the original (this is presently correct).
290 */
291 const char *pszSrc = psz;
292 char *pszDst = psz;
293 RTUNICP uc;
294 do
295 {
296 int rc = RTStrGetCpEx(&pszSrc, &uc);
297 if (RT_SUCCESS(rc))
298 {
299 RTUNICP uc2 = RTUniCpToUpper(uc);
300 if (RT_LIKELY( uc2 == uc
301 || RTUniCpCalcUtf8Len(uc2) == RTUniCpCalcUtf8Len(uc)))
302 pszDst = RTStrPutCp(pszDst, uc2);
303 else
304 pszDst = RTStrPutCp(pszDst, uc);
305 }
306 else
307 {
308 /* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
309 AssertRC(rc);
310 *pszDst++ = pszSrc[-1];
311 }
312 Assert((uintptr_t)pszDst <= (uintptr_t)pszSrc);
313 } while (uc != 0);
314
315 return psz;
316}
317RT_EXPORT_SYMBOL(RTStrToUpper);
318
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette