uri.cpp@ 107454

Last change on this file since 107454 was 107454, checked in by vboxsync, 5 weeks ago
Runtime/common/misc/uri.cpp: Don't call strlen() two times in the RT_MIN() expansion to save some time (maybe the compiler would optimize this but better not depend on it), bugref:3409
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 40.6 KB

Line
1	/* $Id: uri.cpp 107454 2025-01-07 10:16:43Z vboxsync $ */
2	/** @file
3	* IPRT - Uniform Resource Identifier handling.
4	*/
5
6	/*
7	* Copyright (C) 2011-2024 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* The contents of this file may alternatively be used under the terms
26	* of the Common Development and Distribution License Version 1.0
27	* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28	* in the VirtualBox distribution, in which case the provisions of the
29	* CDDL are applicable instead of those of the GPL.
30	*
31	* You may elect to license modified versions of this file under the
32	* terms and conditions of either the GPL or the CDDL or both.
33	*
34	* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35	*/
36
37
38	/*********************************************************************************************************************************
39	* Header Files *
40	*********************************************************************************************************************************/
41	#include <iprt/uri.h>
42
43	#include <iprt/assert.h>
44	#include <iprt/ctype.h>
45	#include <iprt/err.h>
46	#include <iprt/path.h>
47	#include <iprt/string.h>
48
49
50	/*********************************************************************************************************************************
51	* Defined Constants And Macros *
52	*********************************************************************************************************************************/
/** Internal magic value we use to check if a RTURIPARSED structure has made it through RTUriParse. */
#define RTURIPARSED_MAGIC   UINT32_C(0x439e0745)
55
56
/* General URI format:

    foo://example.com:8042/over/there?name=ferret#nose
    \_/   \______________/\_________/ \_________/ \__/
     |           |            |            |        |
  scheme     authority       path        query   fragment
     |   _____________________|__
    / \ /                        \
    urn:example:animal:ferret:nose
*/
67
68
/**
 * The following defines characters which have to be % escaped:
 *  control = 00-1F
 *  space   = ' '
 *  delims  = '<' , '>' , '#' , '%' , '"'
 *  unwise  = '{' , '}' , '|' , '\' , '^' , '[' , ']' , '`'
 *
 * Evaluates to non-zero if the character @a a must be escaped.
 *
 * @note The control/space range check converts the argument to unsigned char
 *       first.  This gives the same answer whether plain char is signed or
 *       unsigned (bytes >= 0x80 are never excluded) and avoids the
 *       "comparison is always true" warning a '>= 0' test produces on
 *       platforms where char is unsigned, so no per-platform variant is
 *       needed.
 * @note Intended for char arguments.  The argument is evaluated several
 *       times, so do not pass expressions with side effects.
 */
#define URI_EXCLUDED(a) \
    (   ((unsigned char)(a) <= 0x20) \
     || ((a) >= 0x5B && (a) <= 0x5E) \
     || ((a) >= 0x7B && (a) <= 0x7D) \
     || (a) == '<' || (a) == '>' || (a) == '#' \
     || (a) == '%' || (a) == '"' || (a) == '`' )
96
97	static char rtUriPercentEncodeN(const char pszString, size_t cchMax)
98	{
99	if (!pszString)
100	return NULL;
101
102	int rc = VINF_SUCCESS;
103
104	size_t const cchStr = strlen(pszString);
105	size_t cbLen = RT_MIN(cchStr, cchMax);
106	/* The new string can be max 3 times in size of the original string. */
107	char pszNew = RTStrAlloc(cbLen 3 + 1);
108	if (!pszNew)
109	return NULL;
110
111	char *pszRes = NULL;
112	size_t iIn = 0;
113	size_t iOut = 0;
114	while (iIn < cbLen)
115	{
116	if (URI_EXCLUDED(pszString[iIn]))
117	{
118	char szNum[3] = { 0, 0, 0 };
119	RTStrFormatU8(&szNum[0], 3, pszString[iIn++], 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
120	pszNew[iOut++] = '%';
121	pszNew[iOut++] = szNum[0];
122	pszNew[iOut++] = szNum[1];
123	}
124	else
125	pszNew[iOut++] = pszString[iIn++];
126	}
127	if (RT_SUCCESS(rc))
128	{
129	pszNew[iOut] = '\0';
130	if (iOut != iIn)
131	{
132	/* If the source and target strings have different size, recreate
133	* the target string with the correct size. */
134	pszRes = RTStrDupN(pszNew, iOut);
135	RTStrFree(pszNew);
136	}
137	else
138	pszRes = pszNew;
139	}
140	else
141	RTStrFree(pszNew);
142
143	return pszRes;
144	}
145
146
147	/**
148	* Calculates the encoded string length.
149	*
150	* @returns Number of chars (excluding the terminator).
151	* @param pszString The string to encode.
152	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
153	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
154	*/
155	static size_t rtUriCalcEncodedLength(const char *pszString, size_t cchMax, bool fEncodeDosSlash)
156	{
157	size_t cchEncoded = 0;
158	if (pszString)
159	{
160	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
161	while (cchSrcLeft-- > 0)
162	{
163	char const ch = *pszString++;
164	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
165	cchEncoded += 1;
166	else
167	cchEncoded += 3;
168	}
169	}
170	return cchEncoded;
171	}
172
173
174	/**
175	* Encodes an URI into a caller allocated buffer.
176	*
177	* @returns IPRT status code.
178	* @param pszString The string to encode.
179	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
180	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
181	* @param pszDst The destination buffer.
182	* @param cbDst The size of the destination buffer.
183	*/
184	static int rtUriEncodeIntoBuffer(const char pszString, size_t cchMax, bool fEncodeDosSlash, char pszDst, size_t cbDst)
185	{
186	AssertReturn(pszString, VERR_INVALID_POINTER);
187	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
188
189	/*
190	* We do buffer size checking up front and every time we encode a special
191	* character. That's faster than checking for each char.
192	*/
193	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
194	AssertMsgReturn(cbDst > cchSrcLeft, ("cbDst=%zu cchSrcLeft=%zu\n", cbDst, cchSrcLeft), VERR_BUFFER_OVERFLOW);
195	cbDst -= cchSrcLeft;
196
197	while (cchSrcLeft-- > 0)
198	{
199	char const ch = *pszString++;
200	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
201	*pszDst++ = ch;
202	else
203	{
204	AssertReturn(cbDst >= 3, VERR_BUFFER_OVERFLOW); /* 2 extra bytes + zero terminator. */
205	cbDst -= 2;
206
207	*pszDst++ = '%';
208	ssize_t cchTmp = RTStrFormatU8(pszDst, 3, (unsigned char)ch, 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
209	Assert(cchTmp == 2); NOREF(cchTmp);
210	pszDst += 2;
211	}
212	}
213
214	*pszDst = '\0';
215	return VINF_SUCCESS;
216	}
217
218
219	static char rtUriPercentDecodeN(const char pszString, size_t cchString)
220	{
221	AssertPtrReturn(pszString, NULL);
222	AssertReturn(memchr(pszString, '\0', cchString) == NULL, NULL);
223
224	/*
225	* The new string can only get smaller, so use the input length as a
226	* staring buffer size.
227	*/
228	char *pszDecoded = RTStrAlloc(cchString + 1);
229	if (pszDecoded)
230	{
231	/*
232	* Knowing that the pszString itself is valid UTF-8, we only have to
233	* validate the escape sequences.
234	*/
235	size_t cchLeft = cchString;
236	char const *pchSrc = pszString;
237	char *pchDst = pszDecoded;
238	while (cchLeft > 0)
239	{
240	const char pchPct = (const char )memchr(pchSrc, '%', cchLeft);
241	if (pchPct)
242	{
243	size_t cchBefore = pchPct - pchSrc;
244	if (cchBefore)
245	{
246	memcpy(pchDst, pchSrc, cchBefore);
247	pchDst += cchBefore;
248	pchSrc += cchBefore;
249	cchLeft -= cchBefore;
250	}
251
252	char chHigh, chLow;
253	if ( cchLeft >= 3
254	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
255	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
256	{
257	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
258	b <<= 4;
259	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
260	*pchDst++ = (char)b;
261	pchSrc += 3;
262	cchLeft -= 3;
263	}
264	else
265	{
266	AssertFailed();
267	pchDst++ = pchSrc++;
268	cchLeft--;
269	}
270	}
271	else
272	{
273	memcpy(pchDst, pchSrc, cchLeft);
274	pchDst += cchLeft;
275	pchSrc += cchLeft;
276	cchLeft = 0;
277	break;
278	}
279	}
280
281	*pchDst = '\0';
282
283	/*
284	* If we've got lof space room in the result string, reallocate it.
285	*/
286	size_t cchDecoded = pchDst - pszDecoded;
287	Assert(cchDecoded <= cchString);
288	if (cchString - cchDecoded > 64)
289	RTStrRealloc(&pszDecoded, cchDecoded + 1);
290	}
291	return pszDecoded;
292	}
293
294
295	/**
296	* Calculates the decoded string length.
297	*
298	* @returns Number of chars (excluding the terminator).
299	* @param pszString The string to decode.
300	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
301	*/
302	static size_t rtUriCalcDecodedLength(const char *pszString, size_t cchMax)
303	{
304	size_t cchDecoded;
305	if (pszString)
306	{
307	size_t cchSrcLeft = cchDecoded = RTStrNLen(pszString, cchMax);
308	while (cchSrcLeft-- > 0)
309	{
310	char const ch = *pszString++;
311	if (ch != '%')
312	{ /* typical */}
313	else if ( cchSrcLeft >= 2
314	&& RT_C_IS_XDIGIT(pszString[0])
315	&& RT_C_IS_XDIGIT(pszString[1]))
316	{
317	cchDecoded -= 2;
318	pszString += 2;
319	cchSrcLeft -= 2;
320	}
321	}
322	}
323	else
324	cchDecoded = 0;
325	return cchDecoded;
326	}
327
328
329	/**
330	* Decodes a string into a buffer.
331	*
332	* @returns IPRT status code.
333	* @param pchSrc The source string.
334	* @param cchSrc The max number of bytes to decode in the source string.
335	* @param pszDst The destination buffer.
336	* @param cbDst The size of the buffer (including terminator).
337	*/
338	static int rtUriDecodeIntoBuffer(const char pchSrc, size_t cchSrc, char pszDst, size_t cbDst)
339	{
340	AssertPtrReturn(pchSrc, VERR_INVALID_POINTER);
341	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
342
343	/*
344	* Knowing that the pszString itself is valid UTF-8, we only have to
345	* validate the escape sequences.
346	*/
347	cchSrc = RTStrNLen(pchSrc, cchSrc);
348	while (cchSrc > 0)
349	{
350	const char pchPct = (const char )memchr(pchSrc, '%', cchSrc);
351	if (pchPct)
352	{
353	size_t cchBefore = pchPct - pchSrc;
354	AssertReturn(cchBefore + 1 < cbDst, VERR_BUFFER_OVERFLOW);
355	if (cchBefore)
356	{
357	memcpy(pszDst, pchSrc, cchBefore);
358	pszDst += cchBefore;
359	cbDst -= cchBefore;
360	pchSrc += cchBefore;
361	cchSrc -= cchBefore;
362	}
363
364	char chHigh, chLow;
365	if ( cchSrc >= 3
366	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
367	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
368	{
369	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
370	b <<= 4;
371	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
372	*pszDst++ = (char)b;
373	pchSrc += 3;
374	cchSrc -= 3;
375	}
376	else
377	{
378	AssertFailed();
379	pszDst++ = pchSrc++;
380	cchSrc--;
381	}
382	cbDst -= 1;
383	}
384	else
385	{
386	AssertReturn(cchSrc < cbDst, VERR_BUFFER_OVERFLOW);
387	memcpy(pszDst, pchSrc, cchSrc);
388	pszDst += cchSrc;
389	cbDst -= cchSrc;
390	pchSrc += cchSrc;
391	cchSrc = 0;
392	break;
393	}
394	}
395
396	AssertReturn(cbDst > 0, VERR_BUFFER_OVERFLOW);
397	*pszDst = '\0';
398	return VINF_SUCCESS;
399	}
400
401
402
403	static int rtUriParse(const char *pszUri, PRTURIPARSED pParsed)
404	{
405	/*
406	* Validate the input and clear the output.
407	*/
408	AssertPtrReturn(pParsed, VERR_INVALID_POINTER);
409	RT_ZERO(*pParsed);
410	pParsed->uAuthorityPort = UINT32_MAX;
411
412	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
413
414	size_t const cchUri = strlen(pszUri);
415	if (RT_LIKELY(cchUri >= 3)) { /* likely */ }
416	else return cchUri ? VERR_URI_TOO_SHORT : VERR_URI_EMPTY;
417
418	/*
419	* Validating escaped text sequences is much simpler if we know that
420	* that the base URI string is valid. Also, we don't necessarily trust
421	* the developer calling us to remember to do this.
422	*/
423	int rc = RTStrValidateEncoding(pszUri);
424	AssertRCReturn(rc, rc);
425
426	/*
427	* RFC-3986, section 3.1:
428	* scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
429	*
430	* The scheme ends with a ':', which we also skip here.
431	*/
432	size_t off = 0;
433	char ch = pszUri[off++];
434	if (RT_LIKELY(RT_C_IS_ALPHA(ch))) { /* likely */ }
435	else return VERR_URI_INVALID_SCHEME;
436	for (;;)
437	{
438	ch = pszUri[off];
439	if (ch == ':')
440	break;
441	if (RT_LIKELY(RT_C_IS_ALNUM(ch) \|\| ch == '.' \|\| ch == '-' \|\| ch == '+')) { /* likely */ }
442	else return VERR_URI_INVALID_SCHEME;
443	off++;
444	}
445	pParsed->cchScheme = off;
446
447	/* Require the scheme length to be at least two chars so we won't confuse
448	it with a path starting with a DOS drive letter specification. */
449	if (RT_LIKELY(off >= 2)) { /* likely */ }
450	else return VERR_URI_INVALID_SCHEME;
451
452	off++; /* (skip colon) */
453
454	/*
455	* Find the end of the path, we'll need this several times.
456	* Also, while we're potentially scanning the whole thing, check for '%'.
457	*/
458	size_t const offHash = RTStrOffCharOrTerm(&pszUri[off], '#') + off;
459	size_t const offQuestionMark = RTStrOffCharOrTerm(&pszUri[off], '?') + off;
460
461	if (memchr(pszUri, '%', cchUri) != NULL)
462	pParsed->fFlags \|= RTURIPARSED_F_CONTAINS_ESCAPED_CHARS;
463
464	/*
465	* RFC-3986, section 3.2:
466	* The authority component is preceeded by a double slash ("//")...
467	*/
468	if ( pszUri[off] == '/'
469	&& pszUri[off + 1] == '/')
470	{
471	off += 2;
472	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
473	pParsed->fFlags \|= RTURIPARSED_F_HAS_AUTHORITY;
474
475	/*
476	* RFC-3986, section 3.2:
477	* ...and is terminated by the next slash ("/"), question mark ("?"),
478	* or number sign ("#") character, or by the end of the URI.
479	*/
480	const char *pszAuthority = &pszUri[off];
481	size_t cchAuthority = RTStrOffCharOrTerm(pszAuthority, '/');
482	cchAuthority = RT_MIN(cchAuthority, offHash - off);
483	cchAuthority = RT_MIN(cchAuthority, offQuestionMark - off);
484	pParsed->cchAuthority = cchAuthority;
485
486	/* The Authority can be empty, like for: file:///usr/bin/grep */
487	if (cchAuthority > 0)
488	{
489	pParsed->cchAuthorityHost = cchAuthority;
490
491	/*
492	* If there is a userinfo part, it is ended by a '@'.
493	*/
494	const char pszAt = (const char )memchr(pszAuthority, '@', cchAuthority);
495	if (pszAt)
496	{
497	size_t cchTmp = pszAt - pszAuthority;
498	pParsed->offAuthorityHost += cchTmp + 1;
499	pParsed->cchAuthorityHost -= cchTmp + 1;
500
501	/* If there is a password part, it's separated from the username with a colon. */
502	const char pszColon = (const char )memchr(pszAuthority, ':', cchTmp);
503	if (pszColon)
504	{
505	pParsed->cchAuthorityUsername = pszColon - pszAuthority;
506	pParsed->offAuthorityPassword = &pszColon[1] - pszUri;
507	pParsed->cchAuthorityPassword = pszAt - &pszColon[1];
508	}
509	else
510	{
511	pParsed->cchAuthorityUsername = cchTmp;
512	pParsed->offAuthorityPassword = off + cchTmp;
513	}
514	}
515
516	/*
517	* If there is a port part, its after the last colon in the host part.
518	*/
519	const char pszColon = (const char )memrchr(&pszUri[pParsed->offAuthorityHost], ':', pParsed->cchAuthorityHost);
520	if (pszColon)
521	{
522	size_t cchTmp = &pszUri[pParsed->offAuthorityHost + pParsed->cchAuthorityHost] - &pszColon[1];
523	pParsed->cchAuthorityHost -= cchTmp + 1;
524	pParsed->fFlags \|= RTURIPARSED_F_HAS_PORT;
525	if (cchTmp > 0)
526	{
527	pParsed->uAuthorityPort = 0;
528	while (cchTmp-- > 0)
529	{
530	ch = *++pszColon;
531	if ( RT_C_IS_DIGIT(ch)
532	&& pParsed->uAuthorityPort < UINT32_MAX / UINT32_C(10))
533	{
534	pParsed->uAuthorityPort *= 10;
535	pParsed->uAuthorityPort += ch - '0';
536	}
537	else
538	return VERR_URI_INVALID_PORT_NUMBER;
539	}
540	}
541	}
542	}
543
544	/* Skip past the authority. */
545	off += cchAuthority;
546	}
547	else
548	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
549
550	/*
551	* RFC-3986, section 3.3: Path
552	* The path is terminated by the first question mark ("?")
553	* or number sign ("#") character, or by the end of the URI.
554	*/
555	pParsed->offPath = off;
556	pParsed->cchPath = RT_MIN(offHash, offQuestionMark) - off;
557	off += pParsed->cchPath;
558
559	/*
560	* RFC-3986, section 3.4: Query
561	* The query component is indicated by the first question mark ("?")
562	* character and terminated by a number sign ("#") character or by the
563	* end of the URI.
564	*/
565	if ( off == offQuestionMark
566	&& off < cchUri)
567	{
568	Assert(pszUri[offQuestionMark] == '?');
569	pParsed->offQuery = ++off;
570	pParsed->cchQuery = offHash - off;
571	off = offHash;
572	}
573	else
574	{
575	Assert(!pszUri[offQuestionMark]);
576	pParsed->offQuery = off;
577	}
578
579	/*
580	* RFC-3986, section 3.5: Fragment
581	* A fragment identifier component is indicated by the presence of a
582	* number sign ("#") character and terminated by the end of the URI.
583	*/
584	if ( off == offHash
585	&& off < cchUri)
586	{
587	pParsed->offFragment = ++off;
588	pParsed->cchFragment = cchUri - off;
589	}
590	else
591	{
592	Assert(!pszUri[offHash]);
593	pParsed->offFragment = off;
594	}
595
596	/*
597	* If there are any escape sequences, validate them.
598	*
599	* This is reasonably simple as we already know that the string is valid UTF-8
600	* before they get decoded. Thus we only have to validate the escaped sequences.
601	*/
602	if (pParsed->fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
603	{
604	const char pchSrc = (const char )memchr(pszUri, '%', cchUri);
605	AssertReturn(pchSrc, VERR_INTERNAL_ERROR);
606	do
607	{
608	char szUtf8Seq[8];
609	unsigned cchUtf8Seq = 0;
610	unsigned cchNeeded = 0;
611	size_t cchLeft = &pszUri[cchUri] - pchSrc;
612	do
613	{
614	if (cchLeft >= 3)
615	{
616	char chHigh = pchSrc[1];
617	char chLow = pchSrc[2];
618	if ( RT_C_IS_XDIGIT(chHigh)
619	&& RT_C_IS_XDIGIT(chLow))
620	{
621	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
622	b <<= 4;
623	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
624
625	if (!(b & 0x80))
626	{
627	/* We don't want the string to be terminated prematurely. */
628	if (RT_LIKELY(b != 0)) { /* likely */ }
629	else return VERR_URI_ESCAPED_ZERO;
630
631	/* Check that we're not expecting more UTF-8 bytes. */
632	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
633	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
634	}
635	/* Are we waiting UTF-8 bytes? */
636	else if (cchNeeded > 0)
637	{
638	if (RT_LIKELY(!(b & 0x40))) { /* likely */ }
639	else return VERR_URI_INVALID_ESCAPED_UTF8_CONTINUATION_BYTE;
640
641	szUtf8Seq[cchUtf8Seq++] = (char)b;
642	if (--cchNeeded == 0)
643	{
644	szUtf8Seq[cchUtf8Seq] = '\0';
645	rc = RTStrValidateEncoding(szUtf8Seq);
646	if (RT_FAILURE(rc))
647	return VERR_URI_ESCAPED_CHARS_NOT_VALID_UTF8;
648	cchUtf8Seq = 0;
649	}
650	}
651	/* Start a new UTF-8 sequence. */
652	else
653	{
654	if ((b & 0xf8) == 0xf0)
655	cchNeeded = 3;
656	else if ((b & 0xf0) == 0xe0)
657	cchNeeded = 2;
658	else if ((b & 0xe0) == 0xc0)
659	cchNeeded = 1;
660	else
661	return VERR_URI_INVALID_ESCAPED_UTF8_LEAD_BYTE;
662	szUtf8Seq[0] = (char)b;
663	cchUtf8Seq = 1;
664	}
665	pchSrc += 3;
666	cchLeft -= 3;
667	}
668	else
669	return VERR_URI_INVALID_ESCAPE_SEQ;
670	}
671	else
672	return VERR_URI_INVALID_ESCAPE_SEQ;
673	} while (cchLeft > 0 && pchSrc[0] == '%');
674
675	/* Check that we're not expecting more UTF-8 bytes. */
676	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
677	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
678
679	/* next */
680	pchSrc = (const char *)memchr(pchSrc, '%', cchLeft);
681	} while (pchSrc);
682	}
683
684	pParsed->u32Magic = RTURIPARSED_MAGIC;
685	return VINF_SUCCESS;
686	}
687
688
689	RTDECL(int) RTUriParse(const char *pszUri, PRTURIPARSED pParsed)
690	{
691	return rtUriParse(pszUri, pParsed);
692	}
693
694
695	RTDECL(char ) RTUriParsedScheme(const char pszUri, PCRTURIPARSED pParsed)
696	{
697	AssertPtrReturn(pszUri, NULL);
698	AssertPtrReturn(pParsed, NULL);
699	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
700	return RTStrDupN(pszUri, pParsed->cchScheme);
701	}
702
703
704	RTDECL(char ) RTUriParsedAuthority(const char pszUri, PCRTURIPARSED pParsed)
705	{
706	AssertPtrReturn(pszUri, NULL);
707	AssertPtrReturn(pParsed, NULL);
708	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
709	if (pParsed->cchAuthority \|\| (pParsed->fFlags & RTURIPARSED_F_HAS_AUTHORITY))
710	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthority], pParsed->cchAuthority);
711	return NULL;
712	}
713
714
715	RTDECL(char ) RTUriParsedAuthorityUsername(const char pszUri, PCRTURIPARSED pParsed)
716	{
717	AssertPtrReturn(pszUri, NULL);
718	AssertPtrReturn(pParsed, NULL);
719	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
720	if (pParsed->cchAuthorityUsername)
721	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityUsername], pParsed->cchAuthorityUsername);
722	return NULL;
723	}
724
725
726	RTDECL(char ) RTUriParsedAuthorityPassword(const char pszUri, PCRTURIPARSED pParsed)
727	{
728	AssertPtrReturn(pszUri, NULL);
729	AssertPtrReturn(pParsed, NULL);
730	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
731	if (pParsed->cchAuthorityPassword)
732	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityPassword], pParsed->cchAuthorityPassword);
733	return NULL;
734	}
735
736
737	RTDECL(char ) RTUriParsedAuthorityHost(const char pszUri, PCRTURIPARSED pParsed)
738	{
739	AssertPtrReturn(pszUri, NULL);
740	AssertPtrReturn(pParsed, NULL);
741	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
742	if (pParsed->cchAuthorityHost)
743	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityHost], pParsed->cchAuthorityHost);
744	return NULL;
745	}
746
747
748	RTDECL(uint32_t) RTUriParsedAuthorityPort(const char *pszUri, PCRTURIPARSED pParsed)
749	{
750	AssertPtrReturn(pszUri, UINT32_MAX);
751	AssertPtrReturn(pParsed, UINT32_MAX);
752	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, UINT32_MAX);
753	return pParsed->uAuthorityPort;
754	}
755
756
757	RTDECL(char ) RTUriParsedPath(const char pszUri, PCRTURIPARSED pParsed)
758	{
759	AssertPtrReturn(pszUri, NULL);
760	AssertPtrReturn(pParsed, NULL);
761	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
762	if (pParsed->cchPath)
763	return rtUriPercentDecodeN(&pszUri[pParsed->offPath], pParsed->cchPath);
764	return NULL;
765	}
766
767
768	RTDECL(char ) RTUriParsedQuery(const char pszUri, PCRTURIPARSED pParsed)
769	{
770	AssertPtrReturn(pszUri, NULL);
771	AssertPtrReturn(pParsed, NULL);
772	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
773	if (pParsed->cchQuery)
774	return rtUriPercentDecodeN(&pszUri[pParsed->offQuery], pParsed->cchQuery);
775	return NULL;
776	}
777
778
779	RTDECL(char ) RTUriParsedFragment(const char pszUri, PCRTURIPARSED pParsed)
780	{
781	AssertPtrReturn(pszUri, NULL);
782	AssertPtrReturn(pParsed, NULL);
783	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
784	if (pParsed->cchFragment)
785	return rtUriPercentDecodeN(&pszUri[pParsed->offFragment], pParsed->cchFragment);
786	return NULL;
787	}
788
789
790	RTDECL(char ) RTUriCreate(const char pszScheme, const char pszAuthority, const char pszPath, const char *pszQuery,
791	const char *pszFragment)
792	{
793	if (!pszScheme) /* Scheme is minimum requirement */
794	return NULL;
795
796	char *pszResult = 0;
797	char *pszAuthority1 = 0;
798	char *pszPath1 = 0;
799	char *pszQuery1 = 0;
800	char *pszFragment1 = 0;
801
802	do
803	{
804	/* Create the percent encoded strings and calculate the necessary uri
805	* length. */
806	size_t cbSize = strlen(pszScheme) + 1 + 1; /* plus zero byte */
807	if (pszAuthority)
808	{
809	pszAuthority1 = rtUriPercentEncodeN(pszAuthority, RTSTR_MAX);
810	if (!pszAuthority1)
811	break;
812	cbSize += strlen(pszAuthority1) + 2;
813	}
814	if (pszPath)
815	{
816	pszPath1 = rtUriPercentEncodeN(pszPath, RTSTR_MAX);
817	if (!pszPath1)
818	break;
819	cbSize += strlen(pszPath1);
820	}
821	if (pszQuery)
822	{
823	pszQuery1 = rtUriPercentEncodeN(pszQuery, RTSTR_MAX);
824	if (!pszQuery1)
825	break;
826	cbSize += strlen(pszQuery1) + 1;
827	}
828	if (pszFragment)
829	{
830	pszFragment1 = rtUriPercentEncodeN(pszFragment, RTSTR_MAX);
831	if (!pszFragment1)
832	break;
833	cbSize += strlen(pszFragment1) + 1;
834	}
835
836	char pszTmp = pszResult = (char )RTStrAlloc(cbSize);
837	if (!pszResult)
838	break;
839	RT_BZERO(pszTmp, cbSize);
840
841	/* Compose the target uri string. */
842	RTStrCatP(&pszTmp, &cbSize, pszScheme);
843	RTStrCatP(&pszTmp, &cbSize, ":");
844	if (pszAuthority1)
845	{
846	RTStrCatP(&pszTmp, &cbSize, "//");
847	RTStrCatP(&pszTmp, &cbSize, pszAuthority1);
848	}
849	if (pszPath1)
850	{
851	RTStrCatP(&pszTmp, &cbSize, pszPath1);
852	}
853	if (pszQuery1)
854	{
855	RTStrCatP(&pszTmp, &cbSize, "?");
856	RTStrCatP(&pszTmp, &cbSize, pszQuery1);
857	}
858	if (pszFragment1)
859	{
860	RTStrCatP(&pszTmp, &cbSize, "#");
861	RTStrCatP(&pszTmp, &cbSize, pszFragment1);
862	}
863	} while (0);
864
865	/* Cleanup */
866	if (pszAuthority1)
867	RTStrFree(pszAuthority1);
868	if (pszPath1)
869	RTStrFree(pszPath1);
870	if (pszQuery1)
871	RTStrFree(pszQuery1);
872	if (pszFragment1)
873	RTStrFree(pszFragment1);
874
875	return pszResult;
876	}
877
878
879	RTDECL(bool) RTUriIsSchemeMatch(const char pszUri, const char pszScheme)
880	{
881	AssertPtrReturn(pszUri, false);
882	size_t const cchScheme = strlen(pszScheme);
883	return RTStrNICmp(pszUri, pszScheme, cchScheme) == 0
884	&& pszUri[cchScheme] == ':';
885	}
886
887
888	RTDECL(int) RTUriFileCreateEx(const char pszPath, uint32_t fPathStyle, char ppszUri, size_t cbUri, size_t pcchUri)
889	{
890	/*
891	* Validate and adjust input. (RTPathParse check pszPath out for us)
892	*/
893	if (pcchUri)
894	{
895	AssertPtrReturn(pcchUri, VERR_INVALID_POINTER);
896	*pcchUri = ~(size_t)0;
897	}
898	AssertPtrReturn(ppszUri, VERR_INVALID_POINTER);
899	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
900	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
901	fPathStyle = RTPATH_STYLE;
902
903	/*
904	* Let the RTPath code parse the stuff (no reason to duplicate path parsing
905	* and get it slightly wrong here).
906	*/
907	union
908	{
909	RTPATHPARSED ParsedPath;
910	uint8_t abPadding[sizeof(RTPATHPARSED)];
911	} u;
912	int rc = RTPathParse(pszPath, &u.ParsedPath, sizeof(u.ParsedPath), fPathStyle);
913	if (RT_SUCCESS(rc) \|\| rc == VERR_BUFFER_OVERFLOW)
914	{
915	/* Skip leading slashes. */
916	if (u.ParsedPath.fProps & RTPATH_PROP_ROOT_SLASH)
917	{
918	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
919	while (pszPath[0] == '/' \|\| pszPath[0] == '\\')
920	pszPath++;
921	else
922	while (pszPath[0] == '/')
923	pszPath++;
924	}
925	const size_t cchPath = strlen(pszPath);
926
927	/*
928	* Calculate the encoded length and figure destination buffering.
929	*/
930	static const char s_szPrefix[] = "file:///";
931	size_t const cchPrefix = sizeof(s_szPrefix) - (u.ParsedPath.fProps & RTPATH_PROP_UNC ? 2 : 1);
932	size_t cchEncoded = rtUriCalcEncodedLength(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS);
933
934	if (pcchUri)
935	*pcchUri = cchEncoded;
936
937	char *pszDst;
938	char *pszFreeMe = NULL;
939	if (!cbUri \|\| *ppszUri == NULL)
940	{
941	cbUri = RT_MAX(cbUri, cchPrefix + cchEncoded + 1);
942	*ppszUri = pszFreeMe = pszDst = RTStrAlloc(cbUri);
943	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
944	}
945	else if (cchEncoded < cbUri)
946	pszDst = *ppszUri;
947	else
948	return VERR_BUFFER_OVERFLOW;
949
950	/*
951	* Construct the URI.
952	*/
953	memcpy(pszDst, s_szPrefix, cchPrefix);
954	pszDst[cchPrefix] = '\0';
955	rc = rtUriEncodeIntoBuffer(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS, &pszDst[cchPrefix], cbUri - cchPrefix);
956	if (RT_SUCCESS(rc))
957	{
958	Assert(strlen(pszDst) == cbUri - 1);
959	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
960	RTPathChangeToUnixSlashes(pszDst, true /fForce/);
961	return VINF_SUCCESS;
962	}
963
964	AssertRC(rc); /* Impossible! rtUriCalcEncodedLength or something above is busted! */
965	if (pszFreeMe)
966	RTStrFree(pszFreeMe);
967	}
968	return rc;
969	}
970
971
972	RTDECL(char ) RTUriFileCreate(const char pszPath)
973	{
974	char *pszUri = NULL;
975	int rc = RTUriFileCreateEx(pszPath, RTPATH_STR_F_STYLE_HOST, &pszUri, 0 /cbUri/, NULL /pcchUri/);
976	if (RT_SUCCESS(rc))
977	return pszUri;
978	return NULL;
979	}
980
981
982	RTDECL(int) RTUriFilePathEx(const char pszUri, uint32_t fPathStyle, char ppszPath, size_t cbPath, size_t pcchPath)
983	{
984	/*
985	* Validate and adjust input.
986	*/
987	if (pcchPath)
988	{
989	AssertPtrReturn(pcchPath, VERR_INVALID_POINTER);
990	*pcchPath = ~(size_t)0;
991	}
992	AssertPtrReturn(ppszPath, VERR_INVALID_POINTER);
993	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
994	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
995	fPathStyle = RTPATH_STYLE;
996	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
997
998	/*
999	* Check that this is a file URI.
1000	*/
1001	if (RTStrNICmp(pszUri, RT_STR_TUPLE("file:")) == 0)
1002	{ /* likely */ }
1003	else
1004	return VERR_URI_NOT_FILE_SCHEME;
1005
1006	/*
1007	* We may have a number of variations here, mostly thanks to
1008	* various windows software. First the canonical variations:
1009	* - file:///C:/Windows/System32/kernel32.dll
1010	* - file:///C\|/Windows/System32/kernel32.dll
1011	* - file:///C:%5CWindows%5CSystem32%5Ckernel32.dll
1012	* - file://localhost/C:%5CWindows%5CSystem32%5Ckernel32.dll
1013	* - file://cifsserver.dev/systemshare%5CWindows%5CSystem32%5Ckernel32.dll
1014	* - file://cifsserver.dev:139/systemshare%5CWindows%5CSystem32%5Ckernel32.dll (not quite sure here, but whatever)
1015	*
1016	* Legacy variant without any slashes after the schema:
1017	* - file:C:/Windows/System32/kernel32.dll
1018	* - file:C\|/Windows/System32%5Ckernel32.dll
1019	* - file:~/.bashrc
1020	* \--path-/
1021	*
1022	* Legacy variant with exactly one slashes after the schema:
1023	* - file:/C:/Windows/System32%5Ckernel32.dll
1024	* - file:/C\|/Windows/System32/kernel32.dll
1025	* - file:/usr/bin/env
1026	* \---path---/
1027	*
1028	* Legacy variant with two slashes after the schema and an unescaped DOS path:
1029	* - file://C:/Windows/System32\kernel32.dll (**)
1030	* - file://C\|/Windows/System32\kernel32.dll
1031	* \---path---------------------/
1032	* -- authority, with ':' as non-working port separator
1033	*
1034	* Legacy variant with exactly four slashes after the schema and an unescaped DOS path.
1035	* - file:////C:/Windows\System32\user32.dll
1036	*
1037	* Legacy variant with four or more slashes after the schema and an unescaped UNC path:
1038	* - file:////cifsserver.dev/systemshare/System32%\kernel32.dll
1039	* - file://///cifsserver.dev/systemshare/System32\kernel32.dll
1040	* \---path--------------------------------------------/
1041	*
1042	* The two unescaped variants shouldn't be handed to rtUriParse, which
1043	* is good as we cannot actually handle the one marked by (**). So, handle
1044	* those two special when parsing.
1045	*/
1046	RTURIPARSED Parsed;
1047	int rc;
1048	size_t cSlashes = 0;
1049	while (pszUri[5 + cSlashes] == '/')
1050	cSlashes++;
1051	if ( (cSlashes == 2 \|\| cSlashes == 4)
1052	&& RT_C_IS_ALPHA(pszUri[5 + cSlashes])
1053	&& (pszUri[5 + cSlashes + 1] == ':' \|\| pszUri[5 + cSlashes + 1] == '\|'))
1054	{
1055	RT_ZERO(Parsed); /* RTURIPARSED_F_CONTAINS_ESCAPED_CHARS is now clear. */
1056	Parsed.offPath = 5 + cSlashes;
1057	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1058	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1059	}
1060	else if (cSlashes >= 4)
1061	{
1062	RT_ZERO(Parsed);
1063	Parsed.fFlags = cSlashes > 4 ? RTURIPARSED_F_CONTAINS_ESCAPED_CHARS : 0;
1064	Parsed.offPath = 5 + cSlashes - 2;
1065	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1066	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1067	}
1068	else
1069	rc = rtUriParse(pszUri, &Parsed);
1070	if (RT_SUCCESS(rc))
1071	{
1072	/*
1073	* Ignore localhost as hostname (it's implicit).
1074	*/
1075	static char const s_szLocalhost[] = "localhost";
1076	if ( Parsed.cchAuthorityHost == sizeof(s_szLocalhost) - 1U
1077	&& RTStrNICmp(&pszUri[Parsed.offAuthorityHost], RT_STR_TUPLE(s_szLocalhost)) == 0)
1078	{
1079	Parsed.cchAuthorityHost = 0;
1080	Parsed.cchAuthority = 0;
1081	}
1082
1083	/*
1084	* Ignore leading path slash/separator if we detect a DOS drive letter
1085	* and we don't have a host name.
1086	*/
1087	if ( Parsed.cchPath >= 3
1088	&& Parsed.cchAuthorityHost == 0
1089	&& pszUri[Parsed.offPath] == '/' /* Leading path slash/separator. */
1090	&& ( pszUri[Parsed.offPath + 2] == ':' /* Colon after drive letter. */
1091	\|\| pszUri[Parsed.offPath + 2] == '\|') /* Colon alternative. */
1092	&& RT_C_IS_ALPHA(pszUri[Parsed.offPath + 1]) ) /* Drive letter. */
1093	{
1094	Parsed.offPath++;
1095	Parsed.cchPath--;
1096	}
1097
1098	/*
1099	* Calculate the size of the encoded result.
1100	*
1101	* Since we're happily returning "C:/Windows/System32/kernel.dll"
1102	* style paths when the caller requested UNIX style paths, we will
1103	* return straight UNC paths too ("//cifsserver/share/dir/file").
1104	*/
1105	size_t cchDecodedHost = 0;
1106	size_t cbResult;
1107	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1108	{
1109	cchDecodedHost = rtUriCalcDecodedLength(&pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1110	cbResult = cchDecodedHost + rtUriCalcDecodedLength(&pszUri[Parsed.offPath], Parsed.cchPath) + 1;
1111	}
1112	else
1113	{
1114	cchDecodedHost = 0;
1115	cbResult = Parsed.cchAuthorityHost + Parsed.cchPath + 1;
1116	}
1117	if (pcchPath)
1118	*pcchPath = cbResult - 1;
1119	if (cbResult > 1)
1120	{
1121	/*
1122	* Prepare the necessary buffer space for the result.
1123	*/
1124	char *pszDst;
1125	char *pszFreeMe = NULL;
1126	if (!cbPath \|\| *ppszPath == NULL)
1127	{
1128	cbPath = RT_MAX(cbPath, cbResult);
1129	*ppszPath = pszFreeMe = pszDst = RTStrAlloc(cbPath);
1130	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
1131	}
1132	else if (cbResult <= cbPath)
1133	pszDst = *ppszPath;
1134	else
1135	return VERR_BUFFER_OVERFLOW;
1136
1137	/*
1138	* Compose the result.
1139	*/
1140	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1141	{
1142	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offAuthorityHost],Parsed.cchAuthorityHost,
1143	pszDst, cchDecodedHost + 1);
1144	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cchDecodedHost);
1145	if (RT_SUCCESS(rc))
1146	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offPath], Parsed.cchPath,
1147	&pszDst[cchDecodedHost], cbResult - cchDecodedHost);
1148	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cbResult - 1);
1149	}
1150	else
1151	{
1152	memcpy(pszDst, &pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1153	memcpy(&pszDst[Parsed.cchAuthorityHost], &pszUri[Parsed.offPath], Parsed.cchPath);
1154	pszDst[cbResult - 1] = '\0';
1155	}
1156	if (RT_SUCCESS(rc))
1157	{
1158	/*
1159	* Convert the DOS drive letter colon alternative ('|') to a real colon.
1160	* We do this regardless of the desired path style.
1161	*/
1162	if ( RT_C_IS_ALPHA(pszDst[0])
1163	&& pszDst[1] == '\|')
1164	pszDst[1] = ':';
1165
1166	/*
1167	* Fix slashes.
1168	*/
1169	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
1170	RTPathChangeToDosSlashes(pszDst, true);
1171	else if (fPathStyle == RTPATH_STR_F_STYLE_UNIX)
1172	RTPathChangeToUnixSlashes(pszDst, true); /** @todo not quite sure how this actually makes sense... */
1173	else
1174	AssertFailed();
1175	return rc;
1176	}
1177
1178	/* bail out */
1179	RTStrFree(pszFreeMe);
1180	}
1181	else
1182	rc = VERR_PATH_ZERO_LENGTH;
1183	}
1184	return rc;
1185	}
1186
1187
1188	RTDECL(char ) RTUriFilePath(const char pszUri)
1189	{
1190	char *pszPath = NULL;
1191	int rc = RTUriFilePathEx(pszUri, RTPATH_STR_F_STYLE_HOST, &pszPath, 0 /cbPath/, NULL /pcchPath/);
1192	if (RT_SUCCESS(rc))
1193	return pszPath;
1194	return NULL;
1195	}
1196

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/misc/uri.cpp@ 107454

Download in other formats: