uri.cpp@ 99248

Last change on this file since 99248 was 98103, checked in by vboxsync, 2 years ago
Copyright year updates by scm.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 40.0 KB

Line
1	/* $Id: uri.cpp 98103 2023-01-17 14:15:46Z vboxsync $ */
2	/** @file
3	* IPRT - Uniform Resource Identifier handling.
4	*/
5
6	/*
7	* Copyright (C) 2011-2023 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* The contents of this file may alternatively be used under the terms
26	* of the Common Development and Distribution License Version 1.0
27	* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28	* in the VirtualBox distribution, in which case the provisions of the
29	* CDDL are applicable instead of those of the GPL.
30	*
31	* You may elect to license modified versions of this file under the
32	* terms and conditions of either the GPL or the CDDL or both.
33	*
34	* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35	*/
36
37
38	/*********************************************************************************************************************************
39	* Header Files *
40	*********************************************************************************************************************************/
41	#include <iprt/uri.h>
42
43	#include <iprt/assert.h>
44	#include <iprt/ctype.h>
45	#include <iprt/err.h>
46	#include <iprt/path.h>
47	#include <iprt/string.h>
48
49
50	/*********************************************************************************************************************************
51	* Defined Constants And Macros *
52	*********************************************************************************************************************************/
53	/** Magic value stored in RTURIPARSED::u32Magic once rtUriParse has successfully filled in the structure; the RTUriParsed* accessors check it before trusting the offsets. */
54	#define RTURIPARSED_MAGIC UINT32_C(0x439e0745)
55
56
/* General URI format:

    foo://example.com:8042/over/there?name=ferret#nose
    \_/   \______________/\_________/ \_________/ \__/
     |           |            |            |        |
  scheme     authority       path        query   fragment
     |   _____________________|__
    / \ /                        \
    urn:example:animal:ferret:nose
*/
67
68
/**
 * Tests whether a character has to be percent-escaped when building a URI.
 *
 * The excluded set is:
 *   control = 00-1F
 *   space   = ' '
 *   delims  = '<' , '>' , '#' , '%' , '"'
 *   unwise  = '{' , '}' , '|' , '\' , '^' , '[' , ']' , '`'
 *
 * @note The argument is evaluated several times, so it must not have side
 *       effects.  Negative values (e.g. UTF-8 bytes held in a signed char)
 *       and values above 0x7D are never reported as excluded.
 */
#define URI_EXCLUDED(a) \
    (   ((a) >= 0x0  && (a) <= 0x20) \
     || ((a) >= 0x5B && (a) <= 0x5E) \
     || ((a) >= 0x7B && (a) <= 0x7D) \
     || (a) == '<' || (a) == '>' || (a) == '#' \
     || (a) == '%' || (a) == '"' || (a) == '`' )
82
83	static char rtUriPercentEncodeN(const char pszString, size_t cchMax)
84	{
85	if (!pszString)
86	return NULL;
87
88	int rc = VINF_SUCCESS;
89
90	size_t cbLen = RT_MIN(strlen(pszString), cchMax);
91	/* The new string can be max 3 times in size of the original string. */
92	char pszNew = RTStrAlloc(cbLen 3 + 1);
93	if (!pszNew)
94	return NULL;
95
96	char *pszRes = NULL;
97	size_t iIn = 0;
98	size_t iOut = 0;
99	while (iIn < cbLen)
100	{
101	if (URI_EXCLUDED(pszString[iIn]))
102	{
103	char szNum[3] = { 0, 0, 0 };
104	RTStrFormatU8(&szNum[0], 3, pszString[iIn++], 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
105	pszNew[iOut++] = '%';
106	pszNew[iOut++] = szNum[0];
107	pszNew[iOut++] = szNum[1];
108	}
109	else
110	pszNew[iOut++] = pszString[iIn++];
111	}
112	if (RT_SUCCESS(rc))
113	{
114	pszNew[iOut] = '\0';
115	if (iOut != iIn)
116	{
117	/* If the source and target strings have different size, recreate
118	* the target string with the correct size. */
119	pszRes = RTStrDupN(pszNew, iOut);
120	RTStrFree(pszNew);
121	}
122	else
123	pszRes = pszNew;
124	}
125	else
126	RTStrFree(pszNew);
127
128	return pszRes;
129	}
130
131
132	/**
133	* Calculates the encoded string length.
134	*
135	* @returns Number of chars (excluding the terminator).
136	* @param pszString The string to encode.
137	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
138	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
139	*/
140	static size_t rtUriCalcEncodedLength(const char *pszString, size_t cchMax, bool fEncodeDosSlash)
141	{
142	size_t cchEncoded = 0;
143	if (pszString)
144	{
145	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
146	while (cchSrcLeft-- > 0)
147	{
148	char const ch = *pszString++;
149	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
150	cchEncoded += 1;
151	else
152	cchEncoded += 3;
153	}
154	}
155	return cchEncoded;
156	}
157
158
159	/**
160	* Encodes an URI into a caller allocated buffer.
161	*
162	* @returns IPRT status code.
163	* @param pszString The string to encode.
164	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
165	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
166	* @param pszDst The destination buffer.
167	* @param cbDst The size of the destination buffer.
168	*/
169	static int rtUriEncodeIntoBuffer(const char pszString, size_t cchMax, bool fEncodeDosSlash, char pszDst, size_t cbDst)
170	{
171	AssertReturn(pszString, VERR_INVALID_POINTER);
172	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
173
174	/*
175	* We do buffer size checking up front and every time we encode a special
176	* character. That's faster than checking for each char.
177	*/
178	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
179	AssertMsgReturn(cbDst > cchSrcLeft, ("cbDst=%zu cchSrcLeft=%zu\n", cbDst, cchSrcLeft), VERR_BUFFER_OVERFLOW);
180	cbDst -= cchSrcLeft;
181
182	while (cchSrcLeft-- > 0)
183	{
184	char const ch = *pszString++;
185	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
186	*pszDst++ = ch;
187	else
188	{
189	AssertReturn(cbDst >= 3, VERR_BUFFER_OVERFLOW); /* 2 extra bytes + zero terminator. */
190	cbDst -= 2;
191
192	*pszDst++ = '%';
193	ssize_t cchTmp = RTStrFormatU8(pszDst, 3, (unsigned char)ch, 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
194	Assert(cchTmp == 2); NOREF(cchTmp);
195	pszDst += 2;
196	}
197	}
198
199	*pszDst = '\0';
200	return VINF_SUCCESS;
201	}
202
203
204	static char rtUriPercentDecodeN(const char pszString, size_t cchString)
205	{
206	AssertPtrReturn(pszString, NULL);
207	AssertReturn(memchr(pszString, '\0', cchString) == NULL, NULL);
208
209	/*
210	* The new string can only get smaller, so use the input length as a
211	* staring buffer size.
212	*/
213	char *pszDecoded = RTStrAlloc(cchString + 1);
214	if (pszDecoded)
215	{
216	/*
217	* Knowing that the pszString itself is valid UTF-8, we only have to
218	* validate the escape sequences.
219	*/
220	size_t cchLeft = cchString;
221	char const *pchSrc = pszString;
222	char *pchDst = pszDecoded;
223	while (cchLeft > 0)
224	{
225	const char pchPct = (const char )memchr(pchSrc, '%', cchLeft);
226	if (pchPct)
227	{
228	size_t cchBefore = pchPct - pchSrc;
229	if (cchBefore)
230	{
231	memcpy(pchDst, pchSrc, cchBefore);
232	pchDst += cchBefore;
233	pchSrc += cchBefore;
234	cchLeft -= cchBefore;
235	}
236
237	char chHigh, chLow;
238	if ( cchLeft >= 3
239	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
240	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
241	{
242	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
243	b <<= 4;
244	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
245	*pchDst++ = (char)b;
246	pchSrc += 3;
247	cchLeft -= 3;
248	}
249	else
250	{
251	AssertFailed();
252	pchDst++ = pchSrc++;
253	cchLeft--;
254	}
255	}
256	else
257	{
258	memcpy(pchDst, pchSrc, cchLeft);
259	pchDst += cchLeft;
260	pchSrc += cchLeft;
261	cchLeft = 0;
262	break;
263	}
264	}
265
266	*pchDst = '\0';
267
268	/*
269	* If we've got lof space room in the result string, reallocate it.
270	*/
271	size_t cchDecoded = pchDst - pszDecoded;
272	Assert(cchDecoded <= cchString);
273	if (cchString - cchDecoded > 64)
274	RTStrRealloc(&pszDecoded, cchDecoded + 1);
275	}
276	return pszDecoded;
277	}
278
279
280	/**
281	* Calculates the decoded string length.
282	*
283	* @returns Number of chars (excluding the terminator).
284	* @param pszString The string to decode.
285	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
286	*/
287	static size_t rtUriCalcDecodedLength(const char *pszString, size_t cchMax)
288	{
289	size_t cchDecoded;
290	if (pszString)
291	{
292	size_t cchSrcLeft = cchDecoded = RTStrNLen(pszString, cchMax);
293	while (cchSrcLeft-- > 0)
294	{
295	char const ch = *pszString++;
296	if (ch != '%')
297	{ /* typical */}
298	else if ( cchSrcLeft >= 2
299	&& RT_C_IS_XDIGIT(pszString[0])
300	&& RT_C_IS_XDIGIT(pszString[1]))
301	{
302	cchDecoded -= 2;
303	pszString += 2;
304	cchSrcLeft -= 2;
305	}
306	}
307	}
308	else
309	cchDecoded = 0;
310	return cchDecoded;
311	}
312
313
314	/**
315	* Decodes a string into a buffer.
316	*
317	* @returns IPRT status code.
318	* @param pchSrc The source string.
319	* @param cchSrc The max number of bytes to decode in the source string.
320	* @param pszDst The destination buffer.
321	* @param cbDst The size of the buffer (including terminator).
322	*/
323	static int rtUriDecodeIntoBuffer(const char pchSrc, size_t cchSrc, char pszDst, size_t cbDst)
324	{
325	AssertPtrReturn(pchSrc, VERR_INVALID_POINTER);
326	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
327
328	/*
329	* Knowing that the pszString itself is valid UTF-8, we only have to
330	* validate the escape sequences.
331	*/
332	cchSrc = RTStrNLen(pchSrc, cchSrc);
333	while (cchSrc > 0)
334	{
335	const char pchPct = (const char )memchr(pchSrc, '%', cchSrc);
336	if (pchPct)
337	{
338	size_t cchBefore = pchPct - pchSrc;
339	AssertReturn(cchBefore + 1 < cbDst, VERR_BUFFER_OVERFLOW);
340	if (cchBefore)
341	{
342	memcpy(pszDst, pchSrc, cchBefore);
343	pszDst += cchBefore;
344	cbDst -= cchBefore;
345	pchSrc += cchBefore;
346	cchSrc -= cchBefore;
347	}
348
349	char chHigh, chLow;
350	if ( cchSrc >= 3
351	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
352	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
353	{
354	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
355	b <<= 4;
356	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
357	*pszDst++ = (char)b;
358	pchSrc += 3;
359	cchSrc -= 3;
360	}
361	else
362	{
363	AssertFailed();
364	pszDst++ = pchSrc++;
365	cchSrc--;
366	}
367	cbDst -= 1;
368	}
369	else
370	{
371	AssertReturn(cchSrc < cbDst, VERR_BUFFER_OVERFLOW);
372	memcpy(pszDst, pchSrc, cchSrc);
373	pszDst += cchSrc;
374	cbDst -= cchSrc;
375	pchSrc += cchSrc;
376	cchSrc = 0;
377	break;
378	}
379	}
380
381	AssertReturn(cbDst > 0, VERR_BUFFER_OVERFLOW);
382	*pszDst = '\0';
383	return VINF_SUCCESS;
384	}
385
386
387
388	static int rtUriParse(const char *pszUri, PRTURIPARSED pParsed)
389	{
390	/*
391	* Validate the input and clear the output.
392	*/
393	AssertPtrReturn(pParsed, VERR_INVALID_POINTER);
394	RT_ZERO(*pParsed);
395	pParsed->uAuthorityPort = UINT32_MAX;
396
397	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
398
399	size_t const cchUri = strlen(pszUri);
400	if (RT_LIKELY(cchUri >= 3)) { /* likely */ }
401	else return cchUri ? VERR_URI_TOO_SHORT : VERR_URI_EMPTY;
402
403	/*
404	* Validating escaped text sequences is much simpler if we know that
405	* that the base URI string is valid. Also, we don't necessarily trust
406	* the developer calling us to remember to do this.
407	*/
408	int rc = RTStrValidateEncoding(pszUri);
409	AssertRCReturn(rc, rc);
410
411	/*
412	* RFC-3986, section 3.1:
413	* scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
414	*
415	* The scheme ends with a ':', which we also skip here.
416	*/
417	size_t off = 0;
418	char ch = pszUri[off++];
419	if (RT_LIKELY(RT_C_IS_ALPHA(ch))) { /* likely */ }
420	else return VERR_URI_INVALID_SCHEME;
421	for (;;)
422	{
423	ch = pszUri[off];
424	if (ch == ':')
425	break;
426	if (RT_LIKELY(RT_C_IS_ALNUM(ch) \|\| ch == '.' \|\| ch == '-' \|\| ch == '+')) { /* likely */ }
427	else return VERR_URI_INVALID_SCHEME;
428	off++;
429	}
430	pParsed->cchScheme = off;
431
432	/* Require the scheme length to be at least two chars so we won't confuse
433	it with a path starting with a DOS drive letter specification. */
434	if (RT_LIKELY(off >= 2)) { /* likely */ }
435	else return VERR_URI_INVALID_SCHEME;
436
437	off++; /* (skip colon) */
438
439	/*
440	* Find the end of the path, we'll need this several times.
441	* Also, while we're potentially scanning the whole thing, check for '%'.
442	*/
443	size_t const offHash = RTStrOffCharOrTerm(&pszUri[off], '#') + off;
444	size_t const offQuestionMark = RTStrOffCharOrTerm(&pszUri[off], '?') + off;
445
446	if (memchr(pszUri, '%', cchUri) != NULL)
447	pParsed->fFlags \|= RTURIPARSED_F_CONTAINS_ESCAPED_CHARS;
448
449	/*
450	* RFC-3986, section 3.2:
451	* The authority component is preceeded by a double slash ("//")...
452	*/
453	if ( pszUri[off] == '/'
454	&& pszUri[off + 1] == '/')
455	{
456	off += 2;
457	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
458	pParsed->fFlags \|= RTURIPARSED_F_HAS_AUTHORITY;
459
460	/*
461	* RFC-3986, section 3.2:
462	* ...and is terminated by the next slash ("/"), question mark ("?"),
463	* or number sign ("#") character, or by the end of the URI.
464	*/
465	const char *pszAuthority = &pszUri[off];
466	size_t cchAuthority = RTStrOffCharOrTerm(pszAuthority, '/');
467	cchAuthority = RT_MIN(cchAuthority, offHash - off);
468	cchAuthority = RT_MIN(cchAuthority, offQuestionMark - off);
469	pParsed->cchAuthority = cchAuthority;
470
471	/* The Authority can be empty, like for: file:///usr/bin/grep */
472	if (cchAuthority > 0)
473	{
474	pParsed->cchAuthorityHost = cchAuthority;
475
476	/*
477	* If there is a userinfo part, it is ended by a '@'.
478	*/
479	const char pszAt = (const char )memchr(pszAuthority, '@', cchAuthority);
480	if (pszAt)
481	{
482	size_t cchTmp = pszAt - pszAuthority;
483	pParsed->offAuthorityHost += cchTmp + 1;
484	pParsed->cchAuthorityHost -= cchTmp + 1;
485
486	/* If there is a password part, it's separated from the username with a colon. */
487	const char pszColon = (const char )memchr(pszAuthority, ':', cchTmp);
488	if (pszColon)
489	{
490	pParsed->cchAuthorityUsername = pszColon - pszAuthority;
491	pParsed->offAuthorityPassword = &pszColon[1] - pszUri;
492	pParsed->cchAuthorityPassword = pszAt - &pszColon[1];
493	}
494	else
495	{
496	pParsed->cchAuthorityUsername = cchTmp;
497	pParsed->offAuthorityPassword = off + cchTmp;
498	}
499	}
500
501	/*
502	* If there is a port part, its after the last colon in the host part.
503	*/
504	const char pszColon = (const char )memrchr(&pszUri[pParsed->offAuthorityHost], ':', pParsed->cchAuthorityHost);
505	if (pszColon)
506	{
507	size_t cchTmp = &pszUri[pParsed->offAuthorityHost + pParsed->cchAuthorityHost] - &pszColon[1];
508	pParsed->cchAuthorityHost -= cchTmp + 1;
509	pParsed->fFlags \|= RTURIPARSED_F_HAS_PORT;
510	if (cchTmp > 0)
511	{
512	pParsed->uAuthorityPort = 0;
513	while (cchTmp-- > 0)
514	{
515	ch = *++pszColon;
516	if ( RT_C_IS_DIGIT(ch)
517	&& pParsed->uAuthorityPort < UINT32_MAX / UINT32_C(10))
518	{
519	pParsed->uAuthorityPort *= 10;
520	pParsed->uAuthorityPort += ch - '0';
521	}
522	else
523	return VERR_URI_INVALID_PORT_NUMBER;
524	}
525	}
526	}
527	}
528
529	/* Skip past the authority. */
530	off += cchAuthority;
531	}
532	else
533	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
534
535	/*
536	* RFC-3986, section 3.3: Path
537	* The path is terminated by the first question mark ("?")
538	* or number sign ("#") character, or by the end of the URI.
539	*/
540	pParsed->offPath = off;
541	pParsed->cchPath = RT_MIN(offHash, offQuestionMark) - off;
542	off += pParsed->cchPath;
543
544	/*
545	* RFC-3986, section 3.4: Query
546	* The query component is indicated by the first question mark ("?")
547	* character and terminated by a number sign ("#") character or by the
548	* end of the URI.
549	*/
550	if ( off == offQuestionMark
551	&& off < cchUri)
552	{
553	Assert(pszUri[offQuestionMark] == '?');
554	pParsed->offQuery = ++off;
555	pParsed->cchQuery = offHash - off;
556	off = offHash;
557	}
558	else
559	{
560	Assert(!pszUri[offQuestionMark]);
561	pParsed->offQuery = off;
562	}
563
564	/*
565	* RFC-3986, section 3.5: Fragment
566	* A fragment identifier component is indicated by the presence of a
567	* number sign ("#") character and terminated by the end of the URI.
568	*/
569	if ( off == offHash
570	&& off < cchUri)
571	{
572	pParsed->offFragment = ++off;
573	pParsed->cchFragment = cchUri - off;
574	}
575	else
576	{
577	Assert(!pszUri[offHash]);
578	pParsed->offFragment = off;
579	}
580
581	/*
582	* If there are any escape sequences, validate them.
583	*
584	* This is reasonably simple as we already know that the string is valid UTF-8
585	* before they get decoded. Thus we only have to validate the escaped sequences.
586	*/
587	if (pParsed->fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
588	{
589	const char pchSrc = (const char )memchr(pszUri, '%', cchUri);
590	AssertReturn(pchSrc, VERR_INTERNAL_ERROR);
591	do
592	{
593	char szUtf8Seq[8];
594	unsigned cchUtf8Seq = 0;
595	unsigned cchNeeded = 0;
596	size_t cchLeft = &pszUri[cchUri] - pchSrc;
597	do
598	{
599	if (cchLeft >= 3)
600	{
601	char chHigh = pchSrc[1];
602	char chLow = pchSrc[2];
603	if ( RT_C_IS_XDIGIT(chHigh)
604	&& RT_C_IS_XDIGIT(chLow))
605	{
606	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
607	b <<= 4;
608	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
609
610	if (!(b & 0x80))
611	{
612	/* We don't want the string to be terminated prematurely. */
613	if (RT_LIKELY(b != 0)) { /* likely */ }
614	else return VERR_URI_ESCAPED_ZERO;
615
616	/* Check that we're not expecting more UTF-8 bytes. */
617	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
618	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
619	}
620	/* Are we waiting UTF-8 bytes? */
621	else if (cchNeeded > 0)
622	{
623	if (RT_LIKELY(!(b & 0x40))) { /* likely */ }
624	else return VERR_URI_INVALID_ESCAPED_UTF8_CONTINUATION_BYTE;
625
626	szUtf8Seq[cchUtf8Seq++] = (char)b;
627	if (--cchNeeded == 0)
628	{
629	szUtf8Seq[cchUtf8Seq] = '\0';
630	rc = RTStrValidateEncoding(szUtf8Seq);
631	if (RT_FAILURE(rc))
632	return VERR_URI_ESCAPED_CHARS_NOT_VALID_UTF8;
633	cchUtf8Seq = 0;
634	}
635	}
636	/* Start a new UTF-8 sequence. */
637	else
638	{
639	if ((b & 0xf8) == 0xf0)
640	cchNeeded = 3;
641	else if ((b & 0xf0) == 0xe0)
642	cchNeeded = 2;
643	else if ((b & 0xe0) == 0xc0)
644	cchNeeded = 1;
645	else
646	return VERR_URI_INVALID_ESCAPED_UTF8_LEAD_BYTE;
647	szUtf8Seq[0] = (char)b;
648	cchUtf8Seq = 1;
649	}
650	pchSrc += 3;
651	cchLeft -= 3;
652	}
653	else
654	return VERR_URI_INVALID_ESCAPE_SEQ;
655	}
656	else
657	return VERR_URI_INVALID_ESCAPE_SEQ;
658	} while (cchLeft > 0 && pchSrc[0] == '%');
659
660	/* Check that we're not expecting more UTF-8 bytes. */
661	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
662	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
663
664	/* next */
665	pchSrc = (const char *)memchr(pchSrc, '%', cchLeft);
666	} while (pchSrc);
667	}
668
669	pParsed->u32Magic = RTURIPARSED_MAGIC;
670	return VINF_SUCCESS;
671	}
672
673
674	RTDECL(int) RTUriParse(const char *pszUri, PRTURIPARSED pParsed)
675	{
676	return rtUriParse(pszUri, pParsed);
677	}
678
679
680	RTDECL(char ) RTUriParsedScheme(const char pszUri, PCRTURIPARSED pParsed)
681	{
682	AssertPtrReturn(pszUri, NULL);
683	AssertPtrReturn(pParsed, NULL);
684	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
685	return RTStrDupN(pszUri, pParsed->cchScheme);
686	}
687
688
689	RTDECL(char ) RTUriParsedAuthority(const char pszUri, PCRTURIPARSED pParsed)
690	{
691	AssertPtrReturn(pszUri, NULL);
692	AssertPtrReturn(pParsed, NULL);
693	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
694	if (pParsed->cchAuthority \|\| (pParsed->fFlags & RTURIPARSED_F_HAS_AUTHORITY))
695	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthority], pParsed->cchAuthority);
696	return NULL;
697	}
698
699
700	RTDECL(char ) RTUriParsedAuthorityUsername(const char pszUri, PCRTURIPARSED pParsed)
701	{
702	AssertPtrReturn(pszUri, NULL);
703	AssertPtrReturn(pParsed, NULL);
704	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
705	if (pParsed->cchAuthorityUsername)
706	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityUsername], pParsed->cchAuthorityUsername);
707	return NULL;
708	}
709
710
711	RTDECL(char ) RTUriParsedAuthorityPassword(const char pszUri, PCRTURIPARSED pParsed)
712	{
713	AssertPtrReturn(pszUri, NULL);
714	AssertPtrReturn(pParsed, NULL);
715	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
716	if (pParsed->cchAuthorityPassword)
717	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityPassword], pParsed->cchAuthorityPassword);
718	return NULL;
719	}
720
721
722	RTDECL(char ) RTUriParsedAuthorityHost(const char pszUri, PCRTURIPARSED pParsed)
723	{
724	AssertPtrReturn(pszUri, NULL);
725	AssertPtrReturn(pParsed, NULL);
726	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
727	if (pParsed->cchAuthorityHost)
728	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityHost], pParsed->cchAuthorityHost);
729	return NULL;
730	}
731
732
733	RTDECL(uint32_t) RTUriParsedAuthorityPort(const char *pszUri, PCRTURIPARSED pParsed)
734	{
735	AssertPtrReturn(pszUri, UINT32_MAX);
736	AssertPtrReturn(pParsed, UINT32_MAX);
737	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, UINT32_MAX);
738	return pParsed->uAuthorityPort;
739	}
740
741
742	RTDECL(char ) RTUriParsedPath(const char pszUri, PCRTURIPARSED pParsed)
743	{
744	AssertPtrReturn(pszUri, NULL);
745	AssertPtrReturn(pParsed, NULL);
746	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
747	if (pParsed->cchPath)
748	return rtUriPercentDecodeN(&pszUri[pParsed->offPath], pParsed->cchPath);
749	return NULL;
750	}
751
752
753	RTDECL(char ) RTUriParsedQuery(const char pszUri, PCRTURIPARSED pParsed)
754	{
755	AssertPtrReturn(pszUri, NULL);
756	AssertPtrReturn(pParsed, NULL);
757	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
758	if (pParsed->cchQuery)
759	return rtUriPercentDecodeN(&pszUri[pParsed->offQuery], pParsed->cchQuery);
760	return NULL;
761	}
762
763
764	RTDECL(char ) RTUriParsedFragment(const char pszUri, PCRTURIPARSED pParsed)
765	{
766	AssertPtrReturn(pszUri, NULL);
767	AssertPtrReturn(pParsed, NULL);
768	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
769	if (pParsed->cchFragment)
770	return rtUriPercentDecodeN(&pszUri[pParsed->offFragment], pParsed->cchFragment);
771	return NULL;
772	}
773
774
775	RTDECL(char ) RTUriCreate(const char pszScheme, const char pszAuthority, const char pszPath, const char *pszQuery,
776	const char *pszFragment)
777	{
778	if (!pszScheme) /* Scheme is minimum requirement */
779	return NULL;
780
781	char *pszResult = 0;
782	char *pszAuthority1 = 0;
783	char *pszPath1 = 0;
784	char *pszQuery1 = 0;
785	char *pszFragment1 = 0;
786
787	do
788	{
789	/* Create the percent encoded strings and calculate the necessary uri
790	* length. */
791	size_t cbSize = strlen(pszScheme) + 1 + 1; /* plus zero byte */
792	if (pszAuthority)
793	{
794	pszAuthority1 = rtUriPercentEncodeN(pszAuthority, RTSTR_MAX);
795	if (!pszAuthority1)
796	break;
797	cbSize += strlen(pszAuthority1) + 2;
798	}
799	if (pszPath)
800	{
801	pszPath1 = rtUriPercentEncodeN(pszPath, RTSTR_MAX);
802	if (!pszPath1)
803	break;
804	cbSize += strlen(pszPath1);
805	}
806	if (pszQuery)
807	{
808	pszQuery1 = rtUriPercentEncodeN(pszQuery, RTSTR_MAX);
809	if (!pszQuery1)
810	break;
811	cbSize += strlen(pszQuery1) + 1;
812	}
813	if (pszFragment)
814	{
815	pszFragment1 = rtUriPercentEncodeN(pszFragment, RTSTR_MAX);
816	if (!pszFragment1)
817	break;
818	cbSize += strlen(pszFragment1) + 1;
819	}
820
821	char pszTmp = pszResult = (char )RTStrAlloc(cbSize);
822	if (!pszResult)
823	break;
824	RT_BZERO(pszTmp, cbSize);
825
826	/* Compose the target uri string. */
827	RTStrCatP(&pszTmp, &cbSize, pszScheme);
828	RTStrCatP(&pszTmp, &cbSize, ":");
829	if (pszAuthority1)
830	{
831	RTStrCatP(&pszTmp, &cbSize, "//");
832	RTStrCatP(&pszTmp, &cbSize, pszAuthority1);
833	}
834	if (pszPath1)
835	{
836	RTStrCatP(&pszTmp, &cbSize, pszPath1);
837	}
838	if (pszQuery1)
839	{
840	RTStrCatP(&pszTmp, &cbSize, "?");
841	RTStrCatP(&pszTmp, &cbSize, pszQuery1);
842	}
843	if (pszFragment1)
844	{
845	RTStrCatP(&pszTmp, &cbSize, "#");
846	RTStrCatP(&pszTmp, &cbSize, pszFragment1);
847	}
848	} while (0);
849
850	/* Cleanup */
851	if (pszAuthority1)
852	RTStrFree(pszAuthority1);
853	if (pszPath1)
854	RTStrFree(pszPath1);
855	if (pszQuery1)
856	RTStrFree(pszQuery1);
857	if (pszFragment1)
858	RTStrFree(pszFragment1);
859
860	return pszResult;
861	}
862
863
864	RTDECL(bool) RTUriIsSchemeMatch(const char pszUri, const char pszScheme)
865	{
866	AssertPtrReturn(pszUri, false);
867	size_t const cchScheme = strlen(pszScheme);
868	return RTStrNICmp(pszUri, pszScheme, cchScheme) == 0
869	&& pszUri[cchScheme] == ':';
870	}
871
872
873	RTDECL(int) RTUriFileCreateEx(const char pszPath, uint32_t fPathStyle, char ppszUri, size_t cbUri, size_t pcchUri)
874	{
875	/*
876	* Validate and adjust input. (RTPathParse check pszPath out for us)
877	*/
878	if (pcchUri)
879	{
880	AssertPtrReturn(pcchUri, VERR_INVALID_POINTER);
881	*pcchUri = ~(size_t)0;
882	}
883	AssertPtrReturn(ppszUri, VERR_INVALID_POINTER);
884	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
885	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
886	fPathStyle = RTPATH_STYLE;
887
888	/*
889	* Let the RTPath code parse the stuff (no reason to duplicate path parsing
890	* and get it slightly wrong here).
891	*/
892	union
893	{
894	RTPATHPARSED ParsedPath;
895	uint8_t abPadding[sizeof(RTPATHPARSED)];
896	} u;
897	int rc = RTPathParse(pszPath, &u.ParsedPath, sizeof(u.ParsedPath), fPathStyle);
898	if (RT_SUCCESS(rc) \|\| rc == VERR_BUFFER_OVERFLOW)
899	{
900	/* Skip leading slashes. */
901	if (u.ParsedPath.fProps & RTPATH_PROP_ROOT_SLASH)
902	{
903	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
904	while (pszPath[0] == '/' \|\| pszPath[0] == '\\')
905	pszPath++;
906	else
907	while (pszPath[0] == '/')
908	pszPath++;
909	}
910	const size_t cchPath = strlen(pszPath);
911
912	/*
913	* Calculate the encoded length and figure destination buffering.
914	*/
915	static const char s_szPrefix[] = "file:///";
916	size_t const cchPrefix = sizeof(s_szPrefix) - (u.ParsedPath.fProps & RTPATH_PROP_UNC ? 2 : 1);
917	size_t cchEncoded = rtUriCalcEncodedLength(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS);
918
919	if (pcchUri)
920	*pcchUri = cchEncoded;
921
922	char *pszDst;
923	char *pszFreeMe = NULL;
924	if (!cbUri \|\| *ppszUri == NULL)
925	{
926	cbUri = RT_MAX(cbUri, cchPrefix + cchEncoded + 1);
927	*ppszUri = pszFreeMe = pszDst = RTStrAlloc(cbUri);
928	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
929	}
930	else if (cchEncoded < cbUri)
931	pszDst = *ppszUri;
932	else
933	return VERR_BUFFER_OVERFLOW;
934
935	/*
936	* Construct the URI.
937	*/
938	memcpy(pszDst, s_szPrefix, cchPrefix);
939	pszDst[cchPrefix] = '\0';
940	rc = rtUriEncodeIntoBuffer(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS, &pszDst[cchPrefix], cbUri - cchPrefix);
941	if (RT_SUCCESS(rc))
942	{
943	Assert(strlen(pszDst) == cbUri - 1);
944	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
945	RTPathChangeToUnixSlashes(pszDst, true /fForce/);
946	return VINF_SUCCESS;
947	}
948
949	AssertRC(rc); /* Impossible! rtUriCalcEncodedLength or something above is busted! */
950	if (pszFreeMe)
951	RTStrFree(pszFreeMe);
952	}
953	return rc;
954	}
955
956
957	RTDECL(char ) RTUriFileCreate(const char pszPath)
958	{
959	char *pszUri = NULL;
960	int rc = RTUriFileCreateEx(pszPath, RTPATH_STR_F_STYLE_HOST, &pszUri, 0 /cbUri/, NULL /pcchUri/);
961	if (RT_SUCCESS(rc))
962	return pszUri;
963	return NULL;
964	}
965
966
967	RTDECL(int) RTUriFilePathEx(const char pszUri, uint32_t fPathStyle, char ppszPath, size_t cbPath, size_t pcchPath)
968	{
969	/*
970	* Validate and adjust input.
971	*/
972	if (pcchPath)
973	{
974	AssertPtrReturn(pcchPath, VERR_INVALID_POINTER);
975	*pcchPath = ~(size_t)0;
976	}
977	AssertPtrReturn(ppszPath, VERR_INVALID_POINTER);
978	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
979	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
980	fPathStyle = RTPATH_STYLE;
981	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
982
983	/*
984	* Check that this is a file URI.
985	*/
986	if (RTStrNICmp(pszUri, RT_STR_TUPLE("file:")) == 0)
987	{ /* likely */ }
988	else
989	return VERR_URI_NOT_FILE_SCHEME;
990
991	/*
992	* We may have a number of variations here, mostly thanks to
993	* various windows software. First the canonical variations:
994	* - file:///C:/Windows/System32/kernel32.dll
995	* - file:///C\|/Windows/System32/kernel32.dll
996	* - file:///C:%5CWindows%5CSystem32%5Ckernel32.dll
997	* - file://localhost/C:%5CWindows%5CSystem32%5Ckernel32.dll
998	* - file://cifsserver.dev/systemshare%5CWindows%5CSystem32%5Ckernel32.dll
999	* - file://cifsserver.dev:139/systemshare%5CWindows%5CSystem32%5Ckernel32.dll (not quite sure here, but whatever)
1000	*
1001	* Legacy variant without any slashes after the schema:
1002	* - file:C:/Windows/System32/kernel32.dll
1003	* - file:C\|/Windows/System32%5Ckernel32.dll
1004	* - file:~/.bashrc
1005	* \--path-/
1006	*
1007	* Legacy variant with exactly one slashes after the schema:
1008	* - file:/C:/Windows/System32%5Ckernel32.dll
1009	* - file:/C\|/Windows/System32/kernel32.dll
1010	* - file:/usr/bin/env
1011	* \---path---/
1012	*
1013	* Legacy variant with two slashes after the schema and an unescaped DOS path:
1014	* - file://C:/Windows/System32\kernel32.dll (**)
1015	* - file://C\|/Windows/System32\kernel32.dll
1016	* \---path---------------------/
1017	* -- authority, with ':' as non-working port separator
1018	*
1019	* Legacy variant with exactly four slashes after the schema and an unescaped DOS path.
1020	* - file:////C:/Windows\System32\user32.dll
1021	*
1022	* Legacy variant with four or more slashes after the schema and an unescaped UNC path:
1023	* - file:////cifsserver.dev/systemshare/System32%\kernel32.dll
1024	* - file://///cifsserver.dev/systemshare/System32\kernel32.dll
1025	* \---path--------------------------------------------/
1026	*
1027	* The two unescaped variants shouldn't be handed to rtUriParse, which
1028	* is good as we cannot actually handle the one marked by (**). So, handle
1029	* those two special when parsing.
1030	*/
1031	RTURIPARSED Parsed;
1032	int rc;
1033	size_t cSlashes = 0;
1034	while (pszUri[5 + cSlashes] == '/')
1035	cSlashes++;
1036	if ( (cSlashes == 2 \|\| cSlashes == 4)
1037	&& RT_C_IS_ALPHA(pszUri[5 + cSlashes])
1038	&& (pszUri[5 + cSlashes + 1] == ':' \|\| pszUri[5 + cSlashes + 1] == '\|'))
1039	{
1040	RT_ZERO(Parsed); /* RTURIPARSED_F_CONTAINS_ESCAPED_CHARS is now clear. */
1041	Parsed.offPath = 5 + cSlashes;
1042	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1043	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1044	}
1045	else if (cSlashes >= 4)
1046	{
1047	RT_ZERO(Parsed);
1048	Parsed.fFlags = cSlashes > 4 ? RTURIPARSED_F_CONTAINS_ESCAPED_CHARS : 0;
1049	Parsed.offPath = 5 + cSlashes - 2;
1050	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1051	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1052	}
1053	else
1054	rc = rtUriParse(pszUri, &Parsed);
1055	if (RT_SUCCESS(rc))
1056	{
1057	/*
1058	* Ignore localhost as hostname (it's implicit).
1059	*/
1060	static char const s_szLocalhost[] = "localhost";
1061	if ( Parsed.cchAuthorityHost == sizeof(s_szLocalhost) - 1U
1062	&& RTStrNICmp(&pszUri[Parsed.offAuthorityHost], RT_STR_TUPLE(s_szLocalhost)) == 0)
1063	{
1064	Parsed.cchAuthorityHost = 0;
1065	Parsed.cchAuthority = 0;
1066	}
1067
1068	/*
1069	* Ignore leading path slash/separator if we detect a DOS drive letter
1070	* and we don't have a host name.
1071	*/
1072	if ( Parsed.cchPath >= 3
1073	&& Parsed.cchAuthorityHost == 0
1074	&& pszUri[Parsed.offPath] == '/' /* Leading path slash/separator. */
1075	&& ( pszUri[Parsed.offPath + 2] == ':' /* Colon after drive letter. */
1076	\|\| pszUri[Parsed.offPath + 2] == '\|') /* Colon alternative. */
1077	&& RT_C_IS_ALPHA(pszUri[Parsed.offPath + 1]) ) /* Drive letter. */
1078	{
1079	Parsed.offPath++;
1080	Parsed.cchPath--;
1081	}
1082
1083	/*
1084	* Calculate the size of the encoded result.
1085	*
1086	* Since we're happily returning "C:/Windows/System32/kernel.dll"
1087	* style paths when the caller requested UNIX style paths, we will
1088	* return straight UNC paths too ("//cifsserver/share/dir/file").
1089	*/
1090	size_t cchDecodedHost = 0;
1091	size_t cbResult;
1092	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1093	{
1094	cchDecodedHost = rtUriCalcDecodedLength(&pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1095	cbResult = cchDecodedHost + rtUriCalcDecodedLength(&pszUri[Parsed.offPath], Parsed.cchPath) + 1;
1096	}
1097	else
1098	{
1099	cchDecodedHost = 0;
1100	cbResult = Parsed.cchAuthorityHost + Parsed.cchPath + 1;
1101	}
1102	if (pcchPath)
1103	*pcchPath = cbResult - 1;
1104	if (cbResult > 1)
1105	{
1106	/*
1107	* Prepare the necessary buffer space for the result.
1108	*/
1109	char *pszDst;
1110	char *pszFreeMe = NULL;
1111	if (!cbPath \|\| *ppszPath == NULL)
1112	{
1113	cbPath = RT_MAX(cbPath, cbResult);
1114	*ppszPath = pszFreeMe = pszDst = RTStrAlloc(cbPath);
1115	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
1116	}
1117	else if (cbResult <= cbPath)
1118	pszDst = *ppszPath;
1119	else
1120	return VERR_BUFFER_OVERFLOW;
1121
1122	/*
1123	* Compose the result.
1124	*/
1125	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1126	{
1127	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offAuthorityHost],Parsed.cchAuthorityHost,
1128	pszDst, cchDecodedHost + 1);
1129	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cchDecodedHost);
1130	if (RT_SUCCESS(rc))
1131	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offPath], Parsed.cchPath,
1132	&pszDst[cchDecodedHost], cbResult - cchDecodedHost);
1133	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cbResult - 1);
1134	}
1135	else
1136	{
1137	memcpy(pszDst, &pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1138	memcpy(&pszDst[Parsed.cchAuthorityHost], &pszUri[Parsed.offPath], Parsed.cchPath);
1139	pszDst[cbResult - 1] = '\0';
1140	}
1141	if (RT_SUCCESS(rc))
1142	{
1143	/*
1144	* Convert the DOS drive letter colon alternative (vertical bar) into a real colon.
1145	* We do this regardless of the desired path style.
1146	*/
1147	if ( RT_C_IS_ALPHA(pszDst[0])
1148	&& pszDst[1] == '\|')
1149	pszDst[1] = ':';
1150
1151	/*
1152	* Fix slashes.
1153	*/
1154	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
1155	RTPathChangeToDosSlashes(pszDst, true);
1156	else if (fPathStyle == RTPATH_STR_F_STYLE_UNIX)
1157	RTPathChangeToUnixSlashes(pszDst, true); /** @todo not quite sure how this actually makes sense... */
1158	else
1159	AssertFailed();
1160	return rc;
1161	}
1162
1163	/* bail out */
1164	RTStrFree(pszFreeMe);
1165	}
1166	else
1167	rc = VERR_PATH_ZERO_LENGTH;
1168	}
1169	return rc;
1170	}
1171
1172
1173	RTDECL(char ) RTUriFilePath(const char pszUri)
1174	{
1175	char *pszPath = NULL;
1176	int rc = RTUriFilePathEx(pszUri, RTPATH_STR_F_STYLE_HOST, &pszPath, 0 /cbPath/, NULL /pcchPath/);
1177	if (RT_SUCCESS(rc))
1178	return pszPath;
1179	return NULL;
1180	}
1181

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/misc/uri.cpp@ 99248

Download in other formats: