uri.cpp@ 101657

Last change on this file since 101657 was 101657, checked in by vboxsync, 14 months ago
Runtime/uri.cpp: Make it build on linux.arm64 with -Werror, bugref:10541
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 40.6 KB

Line
1	/* $Id: uri.cpp 101657 2023-10-30 13:17:13Z vboxsync $ */
2	/** @file
3	* IPRT - Uniform Resource Identifier handling.
4	*/
5
6	/*
7	* Copyright (C) 2011-2023 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* The contents of this file may alternatively be used under the terms
26	* of the Common Development and Distribution License Version 1.0
27	* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28	* in the VirtualBox distribution, in which case the provisions of the
29	* CDDL are applicable instead of those of the GPL.
30	*
31	* You may elect to license modified versions of this file under the
32	* terms and conditions of either the GPL or the CDDL or both.
33	*
34	* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35	*/
36
37
38	/*********************************************************************************************************************************
39	* Header Files *
40	*********************************************************************************************************************************/
41	#include <iprt/uri.h>
42
43	#include <iprt/assert.h>
44	#include <iprt/ctype.h>
45	#include <iprt/err.h>
46	#include <iprt/path.h>
47	#include <iprt/string.h>
48
49
50	/*********************************************************************************************************************************
51	* Defined Constants And Macros *
52	*********************************************************************************************************************************/
53	/** Magic value rtUriParse() stores in RTURIPARSED::u32Magic on success; the RTUriParsed* accessors check it before trusting the structure. */
54	#define RTURIPARSED_MAGIC UINT32_C(0x439e0745)
55
56
/* General URI format:

    foo://example.com:8042/over/there?name=ferret#nose
    \_/   \______________/\_________/ \_________/ \__/
     |           |            |            |        |
  scheme     authority       path        query   fragment
     |   _____________________|__
    / \ /                        \
    urn:example:animal:ferret:nose
*/
67
68
/**
 * Tests whether a character has to be % escaped.
 *
 * The following characters are excluded (i.e. must be escaped):
 *      control = 00-1F
 *      space   = ' '
 *      delims  = '<' , '>' , '#' , '%' , '"'
 *      unwise  = '{' , '}' , '|' , '\' , '^' , '[' , ']' , '`'
 *
 * @note    Whether plain char is signed or unsigned depends on the platform
 *          (e.g. ARM defines it as unsigned in the AAPCS(64)).  The low range
 *          check is therefore done on the value converted to unsigned char,
 *          which gives the same answer either way and avoids an always-true
 *          ">= 0" comparison that -Werror builds reject.
 *
 * @param   a   The character to test.  Evaluated more than once, so it must
 *              not have side effects.
 */
#define URI_EXCLUDED(a) \
    (   (unsigned char)(a) <= 0x20 \
     || ((a) >= 0x5B && (a) <= 0x5E) \
     || ((a) >= 0x7B && (a) <= 0x7D) \
     || (a) == '<' || (a) == '>' || (a) == '#' \
     || (a) == '%' || (a) == '"' || (a) == '`' )
96
97	static char rtUriPercentEncodeN(const char pszString, size_t cchMax)
98	{
99	if (!pszString)
100	return NULL;
101
102	int rc = VINF_SUCCESS;
103
104	size_t cbLen = RT_MIN(strlen(pszString), cchMax);
105	/* The new string can be max 3 times in size of the original string. */
106	char pszNew = RTStrAlloc(cbLen 3 + 1);
107	if (!pszNew)
108	return NULL;
109
110	char *pszRes = NULL;
111	size_t iIn = 0;
112	size_t iOut = 0;
113	while (iIn < cbLen)
114	{
115	if (URI_EXCLUDED(pszString[iIn]))
116	{
117	char szNum[3] = { 0, 0, 0 };
118	RTStrFormatU8(&szNum[0], 3, pszString[iIn++], 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
119	pszNew[iOut++] = '%';
120	pszNew[iOut++] = szNum[0];
121	pszNew[iOut++] = szNum[1];
122	}
123	else
124	pszNew[iOut++] = pszString[iIn++];
125	}
126	if (RT_SUCCESS(rc))
127	{
128	pszNew[iOut] = '\0';
129	if (iOut != iIn)
130	{
131	/* If the source and target strings have different size, recreate
132	* the target string with the correct size. */
133	pszRes = RTStrDupN(pszNew, iOut);
134	RTStrFree(pszNew);
135	}
136	else
137	pszRes = pszNew;
138	}
139	else
140	RTStrFree(pszNew);
141
142	return pszRes;
143	}
144
145
146	/**
147	* Calculates the encoded string length.
148	*
149	* @returns Number of chars (excluding the terminator).
150	* @param pszString The string to encode.
151	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
152	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
153	*/
154	static size_t rtUriCalcEncodedLength(const char *pszString, size_t cchMax, bool fEncodeDosSlash)
155	{
156	size_t cchEncoded = 0;
157	if (pszString)
158	{
159	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
160	while (cchSrcLeft-- > 0)
161	{
162	char const ch = *pszString++;
163	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
164	cchEncoded += 1;
165	else
166	cchEncoded += 3;
167	}
168	}
169	return cchEncoded;
170	}
171
172
173	/**
174	* Encodes an URI into a caller allocated buffer.
175	*
176	* @returns IPRT status code.
177	* @param pszString The string to encode.
178	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
179	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
180	* @param pszDst The destination buffer.
181	* @param cbDst The size of the destination buffer.
182	*/
183	static int rtUriEncodeIntoBuffer(const char pszString, size_t cchMax, bool fEncodeDosSlash, char pszDst, size_t cbDst)
184	{
185	AssertReturn(pszString, VERR_INVALID_POINTER);
186	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
187
188	/*
189	* We do buffer size checking up front and every time we encode a special
190	* character. That's faster than checking for each char.
191	*/
192	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
193	AssertMsgReturn(cbDst > cchSrcLeft, ("cbDst=%zu cchSrcLeft=%zu\n", cbDst, cchSrcLeft), VERR_BUFFER_OVERFLOW);
194	cbDst -= cchSrcLeft;
195
196	while (cchSrcLeft-- > 0)
197	{
198	char const ch = *pszString++;
199	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
200	*pszDst++ = ch;
201	else
202	{
203	AssertReturn(cbDst >= 3, VERR_BUFFER_OVERFLOW); /* 2 extra bytes + zero terminator. */
204	cbDst -= 2;
205
206	*pszDst++ = '%';
207	ssize_t cchTmp = RTStrFormatU8(pszDst, 3, (unsigned char)ch, 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
208	Assert(cchTmp == 2); NOREF(cchTmp);
209	pszDst += 2;
210	}
211	}
212
213	*pszDst = '\0';
214	return VINF_SUCCESS;
215	}
216
217
218	static char rtUriPercentDecodeN(const char pszString, size_t cchString)
219	{
220	AssertPtrReturn(pszString, NULL);
221	AssertReturn(memchr(pszString, '\0', cchString) == NULL, NULL);
222
223	/*
224	* The new string can only get smaller, so use the input length as a
225	* staring buffer size.
226	*/
227	char *pszDecoded = RTStrAlloc(cchString + 1);
228	if (pszDecoded)
229	{
230	/*
231	* Knowing that the pszString itself is valid UTF-8, we only have to
232	* validate the escape sequences.
233	*/
234	size_t cchLeft = cchString;
235	char const *pchSrc = pszString;
236	char *pchDst = pszDecoded;
237	while (cchLeft > 0)
238	{
239	const char pchPct = (const char )memchr(pchSrc, '%', cchLeft);
240	if (pchPct)
241	{
242	size_t cchBefore = pchPct - pchSrc;
243	if (cchBefore)
244	{
245	memcpy(pchDst, pchSrc, cchBefore);
246	pchDst += cchBefore;
247	pchSrc += cchBefore;
248	cchLeft -= cchBefore;
249	}
250
251	char chHigh, chLow;
252	if ( cchLeft >= 3
253	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
254	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
255	{
256	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
257	b <<= 4;
258	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
259	*pchDst++ = (char)b;
260	pchSrc += 3;
261	cchLeft -= 3;
262	}
263	else
264	{
265	AssertFailed();
266	pchDst++ = pchSrc++;
267	cchLeft--;
268	}
269	}
270	else
271	{
272	memcpy(pchDst, pchSrc, cchLeft);
273	pchDst += cchLeft;
274	pchSrc += cchLeft;
275	cchLeft = 0;
276	break;
277	}
278	}
279
280	*pchDst = '\0';
281
282	/*
283	* If we've got lof space room in the result string, reallocate it.
284	*/
285	size_t cchDecoded = pchDst - pszDecoded;
286	Assert(cchDecoded <= cchString);
287	if (cchString - cchDecoded > 64)
288	RTStrRealloc(&pszDecoded, cchDecoded + 1);
289	}
290	return pszDecoded;
291	}
292
293
294	/**
295	* Calculates the decoded string length.
296	*
297	* @returns Number of chars (excluding the terminator).
298	* @param pszString The string to decode.
299	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
300	*/
301	static size_t rtUriCalcDecodedLength(const char *pszString, size_t cchMax)
302	{
303	size_t cchDecoded;
304	if (pszString)
305	{
306	size_t cchSrcLeft = cchDecoded = RTStrNLen(pszString, cchMax);
307	while (cchSrcLeft-- > 0)
308	{
309	char const ch = *pszString++;
310	if (ch != '%')
311	{ /* typical */}
312	else if ( cchSrcLeft >= 2
313	&& RT_C_IS_XDIGIT(pszString[0])
314	&& RT_C_IS_XDIGIT(pszString[1]))
315	{
316	cchDecoded -= 2;
317	pszString += 2;
318	cchSrcLeft -= 2;
319	}
320	}
321	}
322	else
323	cchDecoded = 0;
324	return cchDecoded;
325	}
326
327
328	/**
329	* Decodes a string into a buffer.
330	*
331	* @returns IPRT status code.
332	* @param pchSrc The source string.
333	* @param cchSrc The max number of bytes to decode in the source string.
334	* @param pszDst The destination buffer.
335	* @param cbDst The size of the buffer (including terminator).
336	*/
337	static int rtUriDecodeIntoBuffer(const char pchSrc, size_t cchSrc, char pszDst, size_t cbDst)
338	{
339	AssertPtrReturn(pchSrc, VERR_INVALID_POINTER);
340	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
341
342	/*
343	* Knowing that the pszString itself is valid UTF-8, we only have to
344	* validate the escape sequences.
345	*/
346	cchSrc = RTStrNLen(pchSrc, cchSrc);
347	while (cchSrc > 0)
348	{
349	const char pchPct = (const char )memchr(pchSrc, '%', cchSrc);
350	if (pchPct)
351	{
352	size_t cchBefore = pchPct - pchSrc;
353	AssertReturn(cchBefore + 1 < cbDst, VERR_BUFFER_OVERFLOW);
354	if (cchBefore)
355	{
356	memcpy(pszDst, pchSrc, cchBefore);
357	pszDst += cchBefore;
358	cbDst -= cchBefore;
359	pchSrc += cchBefore;
360	cchSrc -= cchBefore;
361	}
362
363	char chHigh, chLow;
364	if ( cchSrc >= 3
365	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
366	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
367	{
368	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
369	b <<= 4;
370	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
371	*pszDst++ = (char)b;
372	pchSrc += 3;
373	cchSrc -= 3;
374	}
375	else
376	{
377	AssertFailed();
378	pszDst++ = pchSrc++;
379	cchSrc--;
380	}
381	cbDst -= 1;
382	}
383	else
384	{
385	AssertReturn(cchSrc < cbDst, VERR_BUFFER_OVERFLOW);
386	memcpy(pszDst, pchSrc, cchSrc);
387	pszDst += cchSrc;
388	cbDst -= cchSrc;
389	pchSrc += cchSrc;
390	cchSrc = 0;
391	break;
392	}
393	}
394
395	AssertReturn(cbDst > 0, VERR_BUFFER_OVERFLOW);
396	*pszDst = '\0';
397	return VINF_SUCCESS;
398	}
399
400
401
402	static int rtUriParse(const char *pszUri, PRTURIPARSED pParsed)
403	{
404	/*
405	* Validate the input and clear the output.
406	*/
407	AssertPtrReturn(pParsed, VERR_INVALID_POINTER);
408	RT_ZERO(*pParsed);
409	pParsed->uAuthorityPort = UINT32_MAX;
410
411	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
412
413	size_t const cchUri = strlen(pszUri);
414	if (RT_LIKELY(cchUri >= 3)) { /* likely */ }
415	else return cchUri ? VERR_URI_TOO_SHORT : VERR_URI_EMPTY;
416
417	/*
418	* Validating escaped text sequences is much simpler if we know that
419	* that the base URI string is valid. Also, we don't necessarily trust
420	* the developer calling us to remember to do this.
421	*/
422	int rc = RTStrValidateEncoding(pszUri);
423	AssertRCReturn(rc, rc);
424
425	/*
426	* RFC-3986, section 3.1:
427	* scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
428	*
429	* The scheme ends with a ':', which we also skip here.
430	*/
431	size_t off = 0;
432	char ch = pszUri[off++];
433	if (RT_LIKELY(RT_C_IS_ALPHA(ch))) { /* likely */ }
434	else return VERR_URI_INVALID_SCHEME;
435	for (;;)
436	{
437	ch = pszUri[off];
438	if (ch == ':')
439	break;
440	if (RT_LIKELY(RT_C_IS_ALNUM(ch) \|\| ch == '.' \|\| ch == '-' \|\| ch == '+')) { /* likely */ }
441	else return VERR_URI_INVALID_SCHEME;
442	off++;
443	}
444	pParsed->cchScheme = off;
445
446	/* Require the scheme length to be at least two chars so we won't confuse
447	it with a path starting with a DOS drive letter specification. */
448	if (RT_LIKELY(off >= 2)) { /* likely */ }
449	else return VERR_URI_INVALID_SCHEME;
450
451	off++; /* (skip colon) */
452
453	/*
454	* Find the end of the path, we'll need this several times.
455	* Also, while we're potentially scanning the whole thing, check for '%'.
456	*/
457	size_t const offHash = RTStrOffCharOrTerm(&pszUri[off], '#') + off;
458	size_t const offQuestionMark = RTStrOffCharOrTerm(&pszUri[off], '?') + off;
459
460	if (memchr(pszUri, '%', cchUri) != NULL)
461	pParsed->fFlags \|= RTURIPARSED_F_CONTAINS_ESCAPED_CHARS;
462
463	/*
464	* RFC-3986, section 3.2:
465	* The authority component is preceeded by a double slash ("//")...
466	*/
467	if ( pszUri[off] == '/'
468	&& pszUri[off + 1] == '/')
469	{
470	off += 2;
471	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
472	pParsed->fFlags \|= RTURIPARSED_F_HAS_AUTHORITY;
473
474	/*
475	* RFC-3986, section 3.2:
476	* ...and is terminated by the next slash ("/"), question mark ("?"),
477	* or number sign ("#") character, or by the end of the URI.
478	*/
479	const char *pszAuthority = &pszUri[off];
480	size_t cchAuthority = RTStrOffCharOrTerm(pszAuthority, '/');
481	cchAuthority = RT_MIN(cchAuthority, offHash - off);
482	cchAuthority = RT_MIN(cchAuthority, offQuestionMark - off);
483	pParsed->cchAuthority = cchAuthority;
484
485	/* The Authority can be empty, like for: file:///usr/bin/grep */
486	if (cchAuthority > 0)
487	{
488	pParsed->cchAuthorityHost = cchAuthority;
489
490	/*
491	* If there is a userinfo part, it is ended by a '@'.
492	*/
493	const char pszAt = (const char )memchr(pszAuthority, '@', cchAuthority);
494	if (pszAt)
495	{
496	size_t cchTmp = pszAt - pszAuthority;
497	pParsed->offAuthorityHost += cchTmp + 1;
498	pParsed->cchAuthorityHost -= cchTmp + 1;
499
500	/* If there is a password part, it's separated from the username with a colon. */
501	const char pszColon = (const char )memchr(pszAuthority, ':', cchTmp);
502	if (pszColon)
503	{
504	pParsed->cchAuthorityUsername = pszColon - pszAuthority;
505	pParsed->offAuthorityPassword = &pszColon[1] - pszUri;
506	pParsed->cchAuthorityPassword = pszAt - &pszColon[1];
507	}
508	else
509	{
510	pParsed->cchAuthorityUsername = cchTmp;
511	pParsed->offAuthorityPassword = off + cchTmp;
512	}
513	}
514
515	/*
516	* If there is a port part, its after the last colon in the host part.
517	*/
518	const char pszColon = (const char )memrchr(&pszUri[pParsed->offAuthorityHost], ':', pParsed->cchAuthorityHost);
519	if (pszColon)
520	{
521	size_t cchTmp = &pszUri[pParsed->offAuthorityHost + pParsed->cchAuthorityHost] - &pszColon[1];
522	pParsed->cchAuthorityHost -= cchTmp + 1;
523	pParsed->fFlags \|= RTURIPARSED_F_HAS_PORT;
524	if (cchTmp > 0)
525	{
526	pParsed->uAuthorityPort = 0;
527	while (cchTmp-- > 0)
528	{
529	ch = *++pszColon;
530	if ( RT_C_IS_DIGIT(ch)
531	&& pParsed->uAuthorityPort < UINT32_MAX / UINT32_C(10))
532	{
533	pParsed->uAuthorityPort *= 10;
534	pParsed->uAuthorityPort += ch - '0';
535	}
536	else
537	return VERR_URI_INVALID_PORT_NUMBER;
538	}
539	}
540	}
541	}
542
543	/* Skip past the authority. */
544	off += cchAuthority;
545	}
546	else
547	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
548
549	/*
550	* RFC-3986, section 3.3: Path
551	* The path is terminated by the first question mark ("?")
552	* or number sign ("#") character, or by the end of the URI.
553	*/
554	pParsed->offPath = off;
555	pParsed->cchPath = RT_MIN(offHash, offQuestionMark) - off;
556	off += pParsed->cchPath;
557
558	/*
559	* RFC-3986, section 3.4: Query
560	* The query component is indicated by the first question mark ("?")
561	* character and terminated by a number sign ("#") character or by the
562	* end of the URI.
563	*/
564	if ( off == offQuestionMark
565	&& off < cchUri)
566	{
567	Assert(pszUri[offQuestionMark] == '?');
568	pParsed->offQuery = ++off;
569	pParsed->cchQuery = offHash - off;
570	off = offHash;
571	}
572	else
573	{
574	Assert(!pszUri[offQuestionMark]);
575	pParsed->offQuery = off;
576	}
577
578	/*
579	* RFC-3986, section 3.5: Fragment
580	* A fragment identifier component is indicated by the presence of a
581	* number sign ("#") character and terminated by the end of the URI.
582	*/
583	if ( off == offHash
584	&& off < cchUri)
585	{
586	pParsed->offFragment = ++off;
587	pParsed->cchFragment = cchUri - off;
588	}
589	else
590	{
591	Assert(!pszUri[offHash]);
592	pParsed->offFragment = off;
593	}
594
595	/*
596	* If there are any escape sequences, validate them.
597	*
598	* This is reasonably simple as we already know that the string is valid UTF-8
599	* before they get decoded. Thus we only have to validate the escaped sequences.
600	*/
601	if (pParsed->fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
602	{
603	const char pchSrc = (const char )memchr(pszUri, '%', cchUri);
604	AssertReturn(pchSrc, VERR_INTERNAL_ERROR);
605	do
606	{
607	char szUtf8Seq[8];
608	unsigned cchUtf8Seq = 0;
609	unsigned cchNeeded = 0;
610	size_t cchLeft = &pszUri[cchUri] - pchSrc;
611	do
612	{
613	if (cchLeft >= 3)
614	{
615	char chHigh = pchSrc[1];
616	char chLow = pchSrc[2];
617	if ( RT_C_IS_XDIGIT(chHigh)
618	&& RT_C_IS_XDIGIT(chLow))
619	{
620	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
621	b <<= 4;
622	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
623
624	if (!(b & 0x80))
625	{
626	/* We don't want the string to be terminated prematurely. */
627	if (RT_LIKELY(b != 0)) { /* likely */ }
628	else return VERR_URI_ESCAPED_ZERO;
629
630	/* Check that we're not expecting more UTF-8 bytes. */
631	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
632	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
633	}
634	/* Are we waiting UTF-8 bytes? */
635	else if (cchNeeded > 0)
636	{
637	if (RT_LIKELY(!(b & 0x40))) { /* likely */ }
638	else return VERR_URI_INVALID_ESCAPED_UTF8_CONTINUATION_BYTE;
639
640	szUtf8Seq[cchUtf8Seq++] = (char)b;
641	if (--cchNeeded == 0)
642	{
643	szUtf8Seq[cchUtf8Seq] = '\0';
644	rc = RTStrValidateEncoding(szUtf8Seq);
645	if (RT_FAILURE(rc))
646	return VERR_URI_ESCAPED_CHARS_NOT_VALID_UTF8;
647	cchUtf8Seq = 0;
648	}
649	}
650	/* Start a new UTF-8 sequence. */
651	else
652	{
653	if ((b & 0xf8) == 0xf0)
654	cchNeeded = 3;
655	else if ((b & 0xf0) == 0xe0)
656	cchNeeded = 2;
657	else if ((b & 0xe0) == 0xc0)
658	cchNeeded = 1;
659	else
660	return VERR_URI_INVALID_ESCAPED_UTF8_LEAD_BYTE;
661	szUtf8Seq[0] = (char)b;
662	cchUtf8Seq = 1;
663	}
664	pchSrc += 3;
665	cchLeft -= 3;
666	}
667	else
668	return VERR_URI_INVALID_ESCAPE_SEQ;
669	}
670	else
671	return VERR_URI_INVALID_ESCAPE_SEQ;
672	} while (cchLeft > 0 && pchSrc[0] == '%');
673
674	/* Check that we're not expecting more UTF-8 bytes. */
675	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
676	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
677
678	/* next */
679	pchSrc = (const char *)memchr(pchSrc, '%', cchLeft);
680	} while (pchSrc);
681	}
682
683	pParsed->u32Magic = RTURIPARSED_MAGIC;
684	return VINF_SUCCESS;
685	}
686
687
688	RTDECL(int) RTUriParse(const char *pszUri, PRTURIPARSED pParsed)
689	{
690	return rtUriParse(pszUri, pParsed);
691	}
692
693
694	RTDECL(char ) RTUriParsedScheme(const char pszUri, PCRTURIPARSED pParsed)
695	{
696	AssertPtrReturn(pszUri, NULL);
697	AssertPtrReturn(pParsed, NULL);
698	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
699	return RTStrDupN(pszUri, pParsed->cchScheme);
700	}
701
702
703	RTDECL(char ) RTUriParsedAuthority(const char pszUri, PCRTURIPARSED pParsed)
704	{
705	AssertPtrReturn(pszUri, NULL);
706	AssertPtrReturn(pParsed, NULL);
707	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
708	if (pParsed->cchAuthority \|\| (pParsed->fFlags & RTURIPARSED_F_HAS_AUTHORITY))
709	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthority], pParsed->cchAuthority);
710	return NULL;
711	}
712
713
714	RTDECL(char ) RTUriParsedAuthorityUsername(const char pszUri, PCRTURIPARSED pParsed)
715	{
716	AssertPtrReturn(pszUri, NULL);
717	AssertPtrReturn(pParsed, NULL);
718	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
719	if (pParsed->cchAuthorityUsername)
720	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityUsername], pParsed->cchAuthorityUsername);
721	return NULL;
722	}
723
724
725	RTDECL(char ) RTUriParsedAuthorityPassword(const char pszUri, PCRTURIPARSED pParsed)
726	{
727	AssertPtrReturn(pszUri, NULL);
728	AssertPtrReturn(pParsed, NULL);
729	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
730	if (pParsed->cchAuthorityPassword)
731	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityPassword], pParsed->cchAuthorityPassword);
732	return NULL;
733	}
734
735
736	RTDECL(char ) RTUriParsedAuthorityHost(const char pszUri, PCRTURIPARSED pParsed)
737	{
738	AssertPtrReturn(pszUri, NULL);
739	AssertPtrReturn(pParsed, NULL);
740	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
741	if (pParsed->cchAuthorityHost)
742	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityHost], pParsed->cchAuthorityHost);
743	return NULL;
744	}
745
746
747	RTDECL(uint32_t) RTUriParsedAuthorityPort(const char *pszUri, PCRTURIPARSED pParsed)
748	{
749	AssertPtrReturn(pszUri, UINT32_MAX);
750	AssertPtrReturn(pParsed, UINT32_MAX);
751	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, UINT32_MAX);
752	return pParsed->uAuthorityPort;
753	}
754
755
756	RTDECL(char ) RTUriParsedPath(const char pszUri, PCRTURIPARSED pParsed)
757	{
758	AssertPtrReturn(pszUri, NULL);
759	AssertPtrReturn(pParsed, NULL);
760	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
761	if (pParsed->cchPath)
762	return rtUriPercentDecodeN(&pszUri[pParsed->offPath], pParsed->cchPath);
763	return NULL;
764	}
765
766
767	RTDECL(char ) RTUriParsedQuery(const char pszUri, PCRTURIPARSED pParsed)
768	{
769	AssertPtrReturn(pszUri, NULL);
770	AssertPtrReturn(pParsed, NULL);
771	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
772	if (pParsed->cchQuery)
773	return rtUriPercentDecodeN(&pszUri[pParsed->offQuery], pParsed->cchQuery);
774	return NULL;
775	}
776
777
778	RTDECL(char ) RTUriParsedFragment(const char pszUri, PCRTURIPARSED pParsed)
779	{
780	AssertPtrReturn(pszUri, NULL);
781	AssertPtrReturn(pParsed, NULL);
782	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
783	if (pParsed->cchFragment)
784	return rtUriPercentDecodeN(&pszUri[pParsed->offFragment], pParsed->cchFragment);
785	return NULL;
786	}
787
788
789	RTDECL(char ) RTUriCreate(const char pszScheme, const char pszAuthority, const char pszPath, const char *pszQuery,
790	const char *pszFragment)
791	{
792	if (!pszScheme) /* Scheme is minimum requirement */
793	return NULL;
794
795	char *pszResult = 0;
796	char *pszAuthority1 = 0;
797	char *pszPath1 = 0;
798	char *pszQuery1 = 0;
799	char *pszFragment1 = 0;
800
801	do
802	{
803	/* Create the percent encoded strings and calculate the necessary uri
804	* length. */
805	size_t cbSize = strlen(pszScheme) + 1 + 1; /* plus zero byte */
806	if (pszAuthority)
807	{
808	pszAuthority1 = rtUriPercentEncodeN(pszAuthority, RTSTR_MAX);
809	if (!pszAuthority1)
810	break;
811	cbSize += strlen(pszAuthority1) + 2;
812	}
813	if (pszPath)
814	{
815	pszPath1 = rtUriPercentEncodeN(pszPath, RTSTR_MAX);
816	if (!pszPath1)
817	break;
818	cbSize += strlen(pszPath1);
819	}
820	if (pszQuery)
821	{
822	pszQuery1 = rtUriPercentEncodeN(pszQuery, RTSTR_MAX);
823	if (!pszQuery1)
824	break;
825	cbSize += strlen(pszQuery1) + 1;
826	}
827	if (pszFragment)
828	{
829	pszFragment1 = rtUriPercentEncodeN(pszFragment, RTSTR_MAX);
830	if (!pszFragment1)
831	break;
832	cbSize += strlen(pszFragment1) + 1;
833	}
834
835	char pszTmp = pszResult = (char )RTStrAlloc(cbSize);
836	if (!pszResult)
837	break;
838	RT_BZERO(pszTmp, cbSize);
839
840	/* Compose the target uri string. */
841	RTStrCatP(&pszTmp, &cbSize, pszScheme);
842	RTStrCatP(&pszTmp, &cbSize, ":");
843	if (pszAuthority1)
844	{
845	RTStrCatP(&pszTmp, &cbSize, "//");
846	RTStrCatP(&pszTmp, &cbSize, pszAuthority1);
847	}
848	if (pszPath1)
849	{
850	RTStrCatP(&pszTmp, &cbSize, pszPath1);
851	}
852	if (pszQuery1)
853	{
854	RTStrCatP(&pszTmp, &cbSize, "?");
855	RTStrCatP(&pszTmp, &cbSize, pszQuery1);
856	}
857	if (pszFragment1)
858	{
859	RTStrCatP(&pszTmp, &cbSize, "#");
860	RTStrCatP(&pszTmp, &cbSize, pszFragment1);
861	}
862	} while (0);
863
864	/* Cleanup */
865	if (pszAuthority1)
866	RTStrFree(pszAuthority1);
867	if (pszPath1)
868	RTStrFree(pszPath1);
869	if (pszQuery1)
870	RTStrFree(pszQuery1);
871	if (pszFragment1)
872	RTStrFree(pszFragment1);
873
874	return pszResult;
875	}
876
877
878	RTDECL(bool) RTUriIsSchemeMatch(const char pszUri, const char pszScheme)
879	{
880	AssertPtrReturn(pszUri, false);
881	size_t const cchScheme = strlen(pszScheme);
882	return RTStrNICmp(pszUri, pszScheme, cchScheme) == 0
883	&& pszUri[cchScheme] == ':';
884	}
885
886
887	RTDECL(int) RTUriFileCreateEx(const char pszPath, uint32_t fPathStyle, char ppszUri, size_t cbUri, size_t pcchUri)
888	{
889	/*
890	* Validate and adjust input. (RTPathParse check pszPath out for us)
891	*/
892	if (pcchUri)
893	{
894	AssertPtrReturn(pcchUri, VERR_INVALID_POINTER);
895	*pcchUri = ~(size_t)0;
896	}
897	AssertPtrReturn(ppszUri, VERR_INVALID_POINTER);
898	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
899	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
900	fPathStyle = RTPATH_STYLE;
901
902	/*
903	* Let the RTPath code parse the stuff (no reason to duplicate path parsing
904	* and get it slightly wrong here).
905	*/
906	union
907	{
908	RTPATHPARSED ParsedPath;
909	uint8_t abPadding[sizeof(RTPATHPARSED)];
910	} u;
911	int rc = RTPathParse(pszPath, &u.ParsedPath, sizeof(u.ParsedPath), fPathStyle);
912	if (RT_SUCCESS(rc) \|\| rc == VERR_BUFFER_OVERFLOW)
913	{
914	/* Skip leading slashes. */
915	if (u.ParsedPath.fProps & RTPATH_PROP_ROOT_SLASH)
916	{
917	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
918	while (pszPath[0] == '/' \|\| pszPath[0] == '\\')
919	pszPath++;
920	else
921	while (pszPath[0] == '/')
922	pszPath++;
923	}
924	const size_t cchPath = strlen(pszPath);
925
926	/*
927	* Calculate the encoded length and figure destination buffering.
928	*/
929	static const char s_szPrefix[] = "file:///";
930	size_t const cchPrefix = sizeof(s_szPrefix) - (u.ParsedPath.fProps & RTPATH_PROP_UNC ? 2 : 1);
931	size_t cchEncoded = rtUriCalcEncodedLength(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS);
932
933	if (pcchUri)
934	*pcchUri = cchEncoded;
935
936	char *pszDst;
937	char *pszFreeMe = NULL;
938	if (!cbUri \|\| *ppszUri == NULL)
939	{
940	cbUri = RT_MAX(cbUri, cchPrefix + cchEncoded + 1);
941	*ppszUri = pszFreeMe = pszDst = RTStrAlloc(cbUri);
942	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
943	}
944	else if (cchEncoded < cbUri)
945	pszDst = *ppszUri;
946	else
947	return VERR_BUFFER_OVERFLOW;
948
949	/*
950	* Construct the URI.
951	*/
952	memcpy(pszDst, s_szPrefix, cchPrefix);
953	pszDst[cchPrefix] = '\0';
954	rc = rtUriEncodeIntoBuffer(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS, &pszDst[cchPrefix], cbUri - cchPrefix);
955	if (RT_SUCCESS(rc))
956	{
957	Assert(strlen(pszDst) == cbUri - 1);
958	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
959	RTPathChangeToUnixSlashes(pszDst, true /fForce/);
960	return VINF_SUCCESS;
961	}
962
963	AssertRC(rc); /* Impossible! rtUriCalcEncodedLength or something above is busted! */
964	if (pszFreeMe)
965	RTStrFree(pszFreeMe);
966	}
967	return rc;
968	}
969
970
971	RTDECL(char ) RTUriFileCreate(const char pszPath)
972	{
973	char *pszUri = NULL;
974	int rc = RTUriFileCreateEx(pszPath, RTPATH_STR_F_STYLE_HOST, &pszUri, 0 /cbUri/, NULL /pcchUri/);
975	if (RT_SUCCESS(rc))
976	return pszUri;
977	return NULL;
978	}
979
980
981	RTDECL(int) RTUriFilePathEx(const char pszUri, uint32_t fPathStyle, char ppszPath, size_t cbPath, size_t pcchPath)
982	{
983	/*
984	* Validate and adjust input.
985	*/
986	if (pcchPath)
987	{
988	AssertPtrReturn(pcchPath, VERR_INVALID_POINTER);
989	*pcchPath = ~(size_t)0;
990	}
991	AssertPtrReturn(ppszPath, VERR_INVALID_POINTER);
992	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
993	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
994	fPathStyle = RTPATH_STYLE;
995	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
996
997	/*
998	* Check that this is a file URI.
999	*/
1000	if (RTStrNICmp(pszUri, RT_STR_TUPLE("file:")) == 0)
1001	{ /* likely */ }
1002	else
1003	return VERR_URI_NOT_FILE_SCHEME;
1004
1005	/*
1006	* We may have a number of variations here, mostly thanks to
1007	* various windows software. First the canonical variations:
1008	* - file:///C:/Windows/System32/kernel32.dll
1009	* - file:///C\|/Windows/System32/kernel32.dll
1010	* - file:///C:%5CWindows%5CSystem32%5Ckernel32.dll
1011	* - file://localhost/C:%5CWindows%5CSystem32%5Ckernel32.dll
1012	* - file://cifsserver.dev/systemshare%5CWindows%5CSystem32%5Ckernel32.dll
1013	* - file://cifsserver.dev:139/systemshare%5CWindows%5CSystem32%5Ckernel32.dll (not quite sure here, but whatever)
1014	*
1015	* Legacy variant without any slashes after the schema:
1016	* - file:C:/Windows/System32/kernel32.dll
1017	* - file:C\|/Windows/System32%5Ckernel32.dll
1018	* - file:~/.bashrc
1019	* \--path-/
1020	*
1021	* Legacy variant with exactly one slashes after the schema:
1022	* - file:/C:/Windows/System32%5Ckernel32.dll
1023	* - file:/C\|/Windows/System32/kernel32.dll
1024	* - file:/usr/bin/env
1025	* \---path---/
1026	*
1027	* Legacy variant with two slashes after the schema and an unescaped DOS path:
1028	* - file://C:/Windows/System32\kernel32.dll (**)
1029	* - file://C\|/Windows/System32\kernel32.dll
1030	* \---path---------------------/
1031	* -- authority, with ':' as non-working port separator
1032	*
1033	* Legacy variant with exactly four slashes after the schema and an unescaped DOS path.
1034	* - file:////C:/Windows\System32\user32.dll
1035	*
1036	* Legacy variant with four or more slashes after the schema and an unescaped UNC path:
1037	* - file:////cifsserver.dev/systemshare/System32%\kernel32.dll
1038	* - file://///cifsserver.dev/systemshare/System32\kernel32.dll
1039	* \---path--------------------------------------------/
1040	*
1041	* The two unescaped variants shouldn't be handed to rtUriParse, which
1042	* is good as we cannot actually handle the one marked by (**). So, handle
1043	* those two special when parsing.
1044	*/
1045	RTURIPARSED Parsed;
1046	int rc;
1047	size_t cSlashes = 0;
1048	while (pszUri[5 + cSlashes] == '/')
1049	cSlashes++;
1050	if ( (cSlashes == 2 \|\| cSlashes == 4)
1051	&& RT_C_IS_ALPHA(pszUri[5 + cSlashes])
1052	&& (pszUri[5 + cSlashes + 1] == ':' \|\| pszUri[5 + cSlashes + 1] == '\|'))
1053	{
1054	RT_ZERO(Parsed); /* RTURIPARSED_F_CONTAINS_ESCAPED_CHARS is now clear. */
1055	Parsed.offPath = 5 + cSlashes;
1056	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1057	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1058	}
1059	else if (cSlashes >= 4)
1060	{
1061	RT_ZERO(Parsed);
1062	Parsed.fFlags = cSlashes > 4 ? RTURIPARSED_F_CONTAINS_ESCAPED_CHARS : 0;
1063	Parsed.offPath = 5 + cSlashes - 2;
1064	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1065	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1066	}
1067	else
1068	rc = rtUriParse(pszUri, &Parsed);
1069	if (RT_SUCCESS(rc))
1070	{
1071	/*
1072	* Ignore localhost as hostname (it's implicit).
1073	*/
1074	static char const s_szLocalhost[] = "localhost";
1075	if ( Parsed.cchAuthorityHost == sizeof(s_szLocalhost) - 1U
1076	&& RTStrNICmp(&pszUri[Parsed.offAuthorityHost], RT_STR_TUPLE(s_szLocalhost)) == 0)
1077	{
1078	Parsed.cchAuthorityHost = 0;
1079	Parsed.cchAuthority = 0;
1080	}
1081
1082	/*
1083	* Ignore leading path slash/separator if we detect a DOS drive letter
1084	* and we don't have a host name.
1085	*/
1086	if ( Parsed.cchPath >= 3
1087	&& Parsed.cchAuthorityHost == 0
1088	&& pszUri[Parsed.offPath] == '/' /* Leading path slash/separator. */
1089	&& ( pszUri[Parsed.offPath + 2] == ':' /* Colon after drive letter. */
1090	\|\| pszUri[Parsed.offPath + 2] == '\|') /* Colon alternative. */
1091	&& RT_C_IS_ALPHA(pszUri[Parsed.offPath + 1]) ) /* Drive letter. */
1092	{
1093	Parsed.offPath++;
1094	Parsed.cchPath--;
1095	}
1096
1097	/*
1098	* Calculate the size of the encoded result.
1099	*
1100	* Since we're happily returning "C:/Windows/System32/kernel.dll"
1101	* style paths when the caller requested UNIX style paths, we will
1102	* return straight UNC paths too ("//cifsserver/share/dir/file").
1103	*/
1104	size_t cchDecodedHost = 0;
1105	size_t cbResult;
1106	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1107	{
1108	cchDecodedHost = rtUriCalcDecodedLength(&pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1109	cbResult = cchDecodedHost + rtUriCalcDecodedLength(&pszUri[Parsed.offPath], Parsed.cchPath) + 1;
1110	}
1111	else
1112	{
1113	cchDecodedHost = 0;
1114	cbResult = Parsed.cchAuthorityHost + Parsed.cchPath + 1;
1115	}
1116	if (pcchPath)
1117	*pcchPath = cbResult - 1;
1118	if (cbResult > 1)
1119	{
1120	/*
1121	* Prepare the necessary buffer space for the result.
1122	*/
1123	char *pszDst;
1124	char *pszFreeMe = NULL;
1125	if (!cbPath \|\| *ppszPath == NULL)
1126	{
1127	cbPath = RT_MAX(cbPath, cbResult);
1128	*ppszPath = pszFreeMe = pszDst = RTStrAlloc(cbPath);
1129	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
1130	}
1131	else if (cbResult <= cbPath)
1132	pszDst = *ppszPath;
1133	else
1134	return VERR_BUFFER_OVERFLOW;
1135
1136	/*
1137	* Compose the result.
1138	*/
1139	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1140	{
1141	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offAuthorityHost],Parsed.cchAuthorityHost,
1142	pszDst, cchDecodedHost + 1);
1143	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cchDecodedHost);
1144	if (RT_SUCCESS(rc))
1145	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offPath], Parsed.cchPath,
1146	&pszDst[cchDecodedHost], cbResult - cchDecodedHost);
1147	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cbResult - 1);
1148	}
1149	else
1150	{
1151	memcpy(pszDst, &pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1152	memcpy(&pszDst[Parsed.cchAuthorityHost], &pszUri[Parsed.offPath], Parsed.cchPath);
1153	pszDst[cbResult - 1] = '\0';
1154	}
1155	if (RT_SUCCESS(rc))
1156	{
1157	/*
1158	* Convert the DOS drive letter colon alternative ('|') into a real colon.
1159	* We do this regardless of the desired path style.
1160	*/
1161	if ( RT_C_IS_ALPHA(pszDst[0])
1162	&& pszDst[1] == '\|')
1163	pszDst[1] = ':';
1164
1165	/*
1166	* Fix slashes.
1167	*/
1168	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
1169	RTPathChangeToDosSlashes(pszDst, true);
1170	else if (fPathStyle == RTPATH_STR_F_STYLE_UNIX)
1171	RTPathChangeToUnixSlashes(pszDst, true); /** @todo not quite sure how this actually makes sense... */
1172	else
1173	AssertFailed();
1174	return rc;
1175	}
1176
1177	/* bail out */
1178	RTStrFree(pszFreeMe);
1179	}
1180	else
1181	rc = VERR_PATH_ZERO_LENGTH;
1182	}
1183	return rc;
1184	}
1185
1186
1187	RTDECL(char ) RTUriFilePath(const char pszUri)
1188	{
1189	char *pszPath = NULL;
1190	int rc = RTUriFilePathEx(pszUri, RTPATH_STR_F_STYLE_HOST, &pszPath, 0 /cbPath/, NULL /pcchPath/);
1191	if (RT_SUCCESS(rc))
1192	return pszPath;
1193	return NULL;
1194	}
1195

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/misc/uri.cpp@ 101657

Download in other formats: