uri.cpp@ 76346

Last change on this file since 76346 was 76346, checked in by vboxsync, 6 years ago
*: Preparing for iprt/string.h, iprt/json.h and iprt/serialport.h no longer including iprt/err.h and string.h no longer including latin1.h (it needs err.h). bugref:9344
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 39.6 KB

Line
1	/* $Id: uri.cpp 76346 2018-12-22 00:51:28Z vboxsync $ */
2	/** @file
3	* IPRT - Uniform Resource Identifier handling.
4	*/
5
6	/*
7	* Copyright (C) 2011-2017 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*********************************************************************************************************************************
29	* Header Files *
30	*********************************************************************************************************************************/
31	#include <iprt/uri.h>
32
33	#include <iprt/assert.h>
34	#include <iprt/ctype.h>
35	#include <iprt/err.h>
36	#include <iprt/path.h>
37	#include <iprt/string.h>
38
39
40	/*********************************************************************************************************************************
41	* Defined Constants And Macros *
42	*********************************************************************************************************************************/
43	/** Internal magic value we use to check if a RTURIPARSED structure has made it thru RTUriParse. */
44	#define RTURIPARSED_MAGIC UINT32_C(0x439e0745)
45
46
/* General URI format (RFC-3986, section 3):

     foo://example.com:8042/over/there?name=ferret#nose
     \_/   \______________/\_________/ \_________/ \__/
      |           |            |            |        |
   scheme     authority       path        query   fragment
      |   _____________________|__
     / \ /                        \
     urn:example:animal:ferret:nose
*/
57
58
/**
 * Checks whether a character has to be % escaped.
 *
 * The following characters are matched:
 *      control = 00-1F
 *      space   = ' '
 *      delims  = '<' , '>' , '#' , '%' , '"'
 *      unwise  = '{' , '}' , '|' , '\' , '^' , '[' , ']' , '`'
 *
 * @note    The argument is evaluated several times, so it must not have any
 *          side effects.
 */
#define URI_EXCLUDED(a) \
    (   ((a) >= 0x0  && (a) <= 0x20) \
     || ((a) >= 0x5B && (a) <= 0x5E) \
     || ((a) >= 0x7B && (a) <= 0x7D) \
     || (a) == '<' || (a) == '>' || (a) == '#' \
     || (a) == '%' || (a) == '"' || (a) == '`' )
72
73	static char rtUriPercentEncodeN(const char pszString, size_t cchMax)
74	{
75	if (!pszString)
76	return NULL;
77
78	int rc = VINF_SUCCESS;
79
80	size_t cbLen = RT_MIN(strlen(pszString), cchMax);
81	/* The new string can be max 3 times in size of the original string. */
82	char pszNew = RTStrAlloc(cbLen 3 + 1);
83	if (!pszNew)
84	return NULL;
85
86	char *pszRes = NULL;
87	size_t iIn = 0;
88	size_t iOut = 0;
89	while (iIn < cbLen)
90	{
91	if (URI_EXCLUDED(pszString[iIn]))
92	{
93	char szNum[3] = { 0, 0, 0 };
94	RTStrFormatU8(&szNum[0], 3, pszString[iIn++], 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
95	pszNew[iOut++] = '%';
96	pszNew[iOut++] = szNum[0];
97	pszNew[iOut++] = szNum[1];
98	}
99	else
100	pszNew[iOut++] = pszString[iIn++];
101	}
102	if (RT_SUCCESS(rc))
103	{
104	pszNew[iOut] = '\0';
105	if (iOut != iIn)
106	{
107	/* If the source and target strings have different size, recreate
108	* the target string with the correct size. */
109	pszRes = RTStrDupN(pszNew, iOut);
110	RTStrFree(pszNew);
111	}
112	else
113	pszRes = pszNew;
114	}
115	else
116	RTStrFree(pszNew);
117
118	return pszRes;
119	}
120
121
122	/**
123	* Calculates the encoded string length.
124	*
125	* @returns Number of chars (excluding the terminator).
126	* @param pszString The string to encode.
127	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
128	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
129	*/
130	static size_t rtUriCalcEncodedLength(const char *pszString, size_t cchMax, bool fEncodeDosSlash)
131	{
132	size_t cchEncoded = 0;
133	if (pszString)
134	{
135	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
136	while (cchSrcLeft-- > 0)
137	{
138	char const ch = *pszString++;
139	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
140	cchEncoded += 1;
141	else
142	cchEncoded += 3;
143	}
144	}
145	return cchEncoded;
146	}
147
148
149	/**
150	* Encodes an URI into a caller allocated buffer.
151	*
152	* @returns IPRT status code.
153	* @param pszString The string to encode.
154	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
155	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
156	* @param pszDst The destination buffer.
157	* @param cbDst The size of the destination buffer.
158	*/
159	static int rtUriEncodeIntoBuffer(const char pszString, size_t cchMax, bool fEncodeDosSlash, char pszDst, size_t cbDst)
160	{
161	AssertReturn(pszString, VERR_INVALID_POINTER);
162	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
163
164	/*
165	* We do buffer size checking up front and every time we encode a special
166	* character. That's faster than checking for each char.
167	*/
168	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
169	AssertMsgReturn(cbDst > cchSrcLeft, ("cbDst=%zu cchSrcLeft=%zu\n", cbDst, cchSrcLeft), VERR_BUFFER_OVERFLOW);
170	cbDst -= cchSrcLeft;
171
172	while (cchSrcLeft-- > 0)
173	{
174	char const ch = *pszString++;
175	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
176	*pszDst++ = ch;
177	else
178	{
179	AssertReturn(cbDst >= 3, VERR_BUFFER_OVERFLOW); /* 2 extra bytes + zero terminator. */
180	cbDst -= 2;
181
182	*pszDst++ = '%';
183	ssize_t cchTmp = RTStrFormatU8(pszDst, 3, (unsigned char)ch, 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
184	Assert(cchTmp == 2); NOREF(cchTmp);
185	pszDst += 2;
186	}
187	}
188
189	*pszDst = '\0';
190	return VINF_SUCCESS;
191	}
192
193
194	static char rtUriPercentDecodeN(const char pszString, size_t cchString)
195	{
196	AssertPtrReturn(pszString, NULL);
197	AssertReturn(memchr(pszString, '\0', cchString) == NULL, NULL);
198
199	/*
200	* The new string can only get smaller, so use the input length as a
201	* staring buffer size.
202	*/
203	char *pszDecoded = RTStrAlloc(cchString + 1);
204	if (pszDecoded)
205	{
206	/*
207	* Knowing that the pszString itself is valid UTF-8, we only have to
208	* validate the escape sequences.
209	*/
210	size_t cchLeft = cchString;
211	char const *pchSrc = pszString;
212	char *pchDst = pszDecoded;
213	while (cchLeft > 0)
214	{
215	const char pchPct = (const char )memchr(pchSrc, '%', cchLeft);
216	if (pchPct)
217	{
218	size_t cchBefore = pchPct - pchSrc;
219	if (cchBefore)
220	{
221	memcpy(pchDst, pchSrc, cchBefore);
222	pchDst += cchBefore;
223	pchSrc += cchBefore;
224	cchLeft -= cchBefore;
225	}
226
227	char chHigh, chLow;
228	if ( cchLeft >= 3
229	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
230	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
231	{
232	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
233	b <<= 4;
234	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
235	*pchDst++ = (char)b;
236	pchSrc += 3;
237	cchLeft -= 3;
238	}
239	else
240	{
241	AssertFailed();
242	pchDst++ = pchSrc++;
243	cchLeft--;
244	}
245	}
246	else
247	{
248	memcpy(pchDst, pchSrc, cchLeft);
249	pchDst += cchLeft;
250	pchSrc += cchLeft;
251	cchLeft = 0;
252	break;
253	}
254	}
255
256	*pchDst = '\0';
257
258	/*
259	* If we've got lof space room in the result string, reallocate it.
260	*/
261	size_t cchDecoded = pchDst - pszDecoded;
262	Assert(cchDecoded <= cchString);
263	if (cchString - cchDecoded > 64)
264	RTStrRealloc(&pszDecoded, cchDecoded + 1);
265	}
266	return pszDecoded;
267	}
268
269
270	/**
271	* Calculates the decoded string length.
272	*
273	* @returns Number of chars (excluding the terminator).
274	* @param pszString The string to decode.
275	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
276	*/
277	static size_t rtUriCalcDecodedLength(const char *pszString, size_t cchMax)
278	{
279	size_t cchDecoded;
280	if (pszString)
281	{
282	size_t cchSrcLeft = cchDecoded = RTStrNLen(pszString, cchMax);
283	while (cchSrcLeft-- > 0)
284	{
285	char const ch = *pszString++;
286	if (ch != '%')
287	{ /* typical */}
288	else if ( cchSrcLeft >= 2
289	&& RT_C_IS_XDIGIT(pszString[0])
290	&& RT_C_IS_XDIGIT(pszString[1]))
291	{
292	cchDecoded -= 2;
293	pszString += 2;
294	cchSrcLeft -= 2;
295	}
296	}
297	}
298	else
299	cchDecoded = 0;
300	return cchDecoded;
301	}
302
303
304	/**
305	* Decodes a string into a buffer.
306	*
307	* @returns IPRT status code.
308	* @param pchSrc The source string.
309	* @param cchSrc The max number of bytes to decode in the source string.
310	* @param pszDst The destination buffer.
311	* @param cbDst The size of the buffer (including terminator).
312	*/
313	static int rtUriDecodeIntoBuffer(const char pchSrc, size_t cchSrc, char pszDst, size_t cbDst)
314	{
315	AssertPtrReturn(pchSrc, VERR_INVALID_POINTER);
316	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
317
318	/*
319	* Knowing that the pszString itself is valid UTF-8, we only have to
320	* validate the escape sequences.
321	*/
322	cchSrc = RTStrNLen(pchSrc, cchSrc);
323	while (cchSrc > 0)
324	{
325	const char pchPct = (const char )memchr(pchSrc, '%', cchSrc);
326	if (pchPct)
327	{
328	size_t cchBefore = pchPct - pchSrc;
329	AssertReturn(cchBefore + 1 < cbDst, VERR_BUFFER_OVERFLOW);
330	if (cchBefore)
331	{
332	memcpy(pszDst, pchSrc, cchBefore);
333	pszDst += cchBefore;
334	cbDst -= cchBefore;
335	pchSrc += cchBefore;
336	cchSrc -= cchBefore;
337	}
338
339	char chHigh, chLow;
340	if ( cchSrc >= 3
341	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
342	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
343	{
344	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
345	b <<= 4;
346	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
347	*pszDst++ = (char)b;
348	pchSrc += 3;
349	cchSrc -= 3;
350	}
351	else
352	{
353	AssertFailed();
354	pszDst++ = pchSrc++;
355	cchSrc--;
356	}
357	cbDst -= 1;
358	}
359	else
360	{
361	AssertReturn(cchSrc < cbDst, VERR_BUFFER_OVERFLOW);
362	memcpy(pszDst, pchSrc, cchSrc);
363	pszDst += cchSrc;
364	cbDst -= cchSrc;
365	pchSrc += cchSrc;
366	cchSrc = 0;
367	break;
368	}
369	}
370
371	AssertReturn(cbDst > 0, VERR_BUFFER_OVERFLOW);
372	*pszDst = '\0';
373	return VINF_SUCCESS;
374	}
375
376
377
378	static int rtUriParse(const char *pszUri, PRTURIPARSED pParsed)
379	{
380	/*
381	* Validate the input and clear the output.
382	*/
383	AssertPtrReturn(pParsed, VERR_INVALID_POINTER);
384	RT_ZERO(*pParsed);
385	pParsed->uAuthorityPort = UINT32_MAX;
386
387	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
388
389	size_t const cchUri = strlen(pszUri);
390	if (RT_LIKELY(cchUri >= 3)) { /* likely */ }
391	else return cchUri ? VERR_URI_TOO_SHORT : VERR_URI_EMPTY;
392
393	/*
394	* Validating escaped text sequences is much simpler if we know that
395	* that the base URI string is valid. Also, we don't necessarily trust
396	* the developer calling us to remember to do this.
397	*/
398	int rc = RTStrValidateEncoding(pszUri);
399	AssertRCReturn(rc, rc);
400
401	/*
402	* RFC-3986, section 3.1:
403	* scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
404	*
405	* The scheme ends with a ':', which we also skip here.
406	*/
407	size_t off = 0;
408	char ch = pszUri[off++];
409	if (RT_LIKELY(RT_C_IS_ALPHA(ch))) { /* likely */ }
410	else return VERR_URI_INVALID_SCHEME;
411	for (;;)
412	{
413	ch = pszUri[off];
414	if (ch == ':')
415	break;
416	if (RT_LIKELY(RT_C_IS_ALNUM(ch) \|\| ch == '.' \|\| ch == '-' \|\| ch == '+')) { /* likely */ }
417	else return VERR_URI_INVALID_SCHEME;
418	off++;
419	}
420	pParsed->cchScheme = off;
421
422	/* Require the scheme length to be at least two chars so we won't confuse
423	it with a path starting with a DOS drive letter specification. */
424	if (RT_LIKELY(off >= 2)) { /* likely */ }
425	else return VERR_URI_INVALID_SCHEME;
426
427	off++; /* (skip colon) */
428
429	/*
430	* Find the end of the path, we'll need this several times.
431	* Also, while we're potentially scanning the whole thing, check for '%'.
432	*/
433	size_t const offHash = RTStrOffCharOrTerm(&pszUri[off], '#') + off;
434	size_t const offQuestionMark = RTStrOffCharOrTerm(&pszUri[off], '?') + off;
435
436	if (memchr(pszUri, '%', cchUri) != NULL)
437	pParsed->fFlags \|= RTURIPARSED_F_CONTAINS_ESCAPED_CHARS;
438
439	/*
440	* RFC-3986, section 3.2:
441	* The authority component is preceeded by a double slash ("//")...
442	*/
443	if ( pszUri[off] == '/'
444	&& pszUri[off + 1] == '/')
445	{
446	off += 2;
447	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
448	pParsed->fFlags \|= RTURIPARSED_F_HAS_AUTHORITY;
449
450	/*
451	* RFC-3986, section 3.2:
452	* ...and is terminated by the next slash ("/"), question mark ("?"),
453	* or number sign ("#") character, or by the end of the URI.
454	*/
455	const char *pszAuthority = &pszUri[off];
456	size_t cchAuthority = RTStrOffCharOrTerm(pszAuthority, '/');
457	cchAuthority = RT_MIN(cchAuthority, offHash - off);
458	cchAuthority = RT_MIN(cchAuthority, offQuestionMark - off);
459	pParsed->cchAuthority = cchAuthority;
460
461	/* The Authority can be empty, like for: file:///usr/bin/grep */
462	if (cchAuthority > 0)
463	{
464	pParsed->cchAuthorityHost = cchAuthority;
465
466	/*
467	* If there is a userinfo part, it is ended by a '@'.
468	*/
469	const char pszAt = (const char )memchr(pszAuthority, '@', cchAuthority);
470	if (pszAt)
471	{
472	size_t cchTmp = pszAt - pszAuthority;
473	pParsed->offAuthorityHost += cchTmp + 1;
474	pParsed->cchAuthorityHost -= cchTmp + 1;
475
476	/* If there is a password part, it's separated from the username with a colon. */
477	const char pszColon = (const char )memchr(pszAuthority, ':', cchTmp);
478	if (pszColon)
479	{
480	pParsed->cchAuthorityUsername = pszColon - pszAuthority;
481	pParsed->offAuthorityPassword = &pszColon[1] - pszUri;
482	pParsed->cchAuthorityPassword = pszAt - &pszColon[1];
483	}
484	else
485	{
486	pParsed->cchAuthorityUsername = cchTmp;
487	pParsed->offAuthorityPassword = off + cchTmp;
488	}
489	}
490
491	/*
492	* If there is a port part, its after the last colon in the host part.
493	*/
494	const char pszColon = (const char )memrchr(&pszUri[pParsed->offAuthorityHost], ':', pParsed->cchAuthorityHost);
495	if (pszColon)
496	{
497	size_t cchTmp = &pszUri[pParsed->offAuthorityHost + pParsed->cchAuthorityHost] - &pszColon[1];
498	pParsed->cchAuthorityHost -= cchTmp + 1;
499	pParsed->fFlags \|= RTURIPARSED_F_HAS_PORT;
500	if (cchTmp > 0)
501	{
502	pParsed->uAuthorityPort = 0;
503	while (cchTmp-- > 0)
504	{
505	ch = *++pszColon;
506	if ( RT_C_IS_DIGIT(ch)
507	&& pParsed->uAuthorityPort < UINT32_MAX / UINT32_C(10))
508	{
509	pParsed->uAuthorityPort *= 10;
510	pParsed->uAuthorityPort += ch - '0';
511	}
512	else
513	return VERR_URI_INVALID_PORT_NUMBER;
514	}
515	}
516	}
517	}
518
519	/* Skip past the authority. */
520	off += cchAuthority;
521	}
522	else
523	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
524
525	/*
526	* RFC-3986, section 3.3: Path
527	* The path is terminated by the first question mark ("?")
528	* or number sign ("#") character, or by the end of the URI.
529	*/
530	pParsed->offPath = off;
531	pParsed->cchPath = RT_MIN(offHash, offQuestionMark) - off;
532	off += pParsed->cchPath;
533
534	/*
535	* RFC-3986, section 3.4: Query
536	* The query component is indicated by the first question mark ("?")
537	* character and terminated by a number sign ("#") character or by the
538	* end of the URI.
539	*/
540	if ( off == offQuestionMark
541	&& off < cchUri)
542	{
543	Assert(pszUri[offQuestionMark] == '?');
544	pParsed->offQuery = ++off;
545	pParsed->cchQuery = offHash - off;
546	off = offHash;
547	}
548	else
549	{
550	Assert(!pszUri[offQuestionMark]);
551	pParsed->offQuery = off;
552	}
553
554	/*
555	* RFC-3986, section 3.5: Fragment
556	* A fragment identifier component is indicated by the presence of a
557	* number sign ("#") character and terminated by the end of the URI.
558	*/
559	if ( off == offHash
560	&& off < cchUri)
561	{
562	pParsed->offFragment = ++off;
563	pParsed->cchFragment = cchUri - off;
564	}
565	else
566	{
567	Assert(!pszUri[offHash]);
568	pParsed->offFragment = off;
569	}
570
571	/*
572	* If there are any escape sequences, validate them.
573	*
574	* This is reasonably simple as we already know that the string is valid UTF-8
575	* before they get decoded. Thus we only have to validate the escaped sequences.
576	*/
577	if (pParsed->fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
578	{
579	const char pchSrc = (const char )memchr(pszUri, '%', cchUri);
580	AssertReturn(pchSrc, VERR_INTERNAL_ERROR);
581	do
582	{
583	char szUtf8Seq[8];
584	unsigned cchUtf8Seq = 0;
585	unsigned cchNeeded = 0;
586	size_t cchLeft = &pszUri[cchUri] - pchSrc;
587	do
588	{
589	if (cchLeft >= 3)
590	{
591	char chHigh = pchSrc[1];
592	char chLow = pchSrc[2];
593	if ( RT_C_IS_XDIGIT(chHigh)
594	&& RT_C_IS_XDIGIT(chLow))
595	{
596	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
597	b <<= 4;
598	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
599
600	if (!(b & 0x80))
601	{
602	/* We don't want the string to be terminated prematurely. */
603	if (RT_LIKELY(b != 0)) { /* likely */ }
604	else return VERR_URI_ESCAPED_ZERO;
605
606	/* Check that we're not expecting more UTF-8 bytes. */
607	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
608	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
609	}
610	/* Are we waiting UTF-8 bytes? */
611	else if (cchNeeded > 0)
612	{
613	if (RT_LIKELY(!(b & 0x40))) { /* likely */ }
614	else return VERR_URI_INVALID_ESCAPED_UTF8_CONTINUATION_BYTE;
615
616	szUtf8Seq[cchUtf8Seq++] = (char)b;
617	if (--cchNeeded == 0)
618	{
619	szUtf8Seq[cchUtf8Seq] = '\0';
620	rc = RTStrValidateEncoding(szUtf8Seq);
621	if (RT_FAILURE(rc))
622	return VERR_URI_ESCAPED_CHARS_NOT_VALID_UTF8;
623	cchUtf8Seq = 0;
624	}
625	}
626	/* Start a new UTF-8 sequence. */
627	else
628	{
629	if ((b & 0xf8) == 0xf0)
630	cchNeeded = 3;
631	else if ((b & 0xf0) == 0xe0)
632	cchNeeded = 2;
633	else if ((b & 0xe0) == 0xc0)
634	cchNeeded = 1;
635	else
636	return VERR_URI_INVALID_ESCAPED_UTF8_LEAD_BYTE;
637	szUtf8Seq[0] = (char)b;
638	cchUtf8Seq = 1;
639	}
640	pchSrc += 3;
641	cchLeft -= 3;
642	}
643	else
644	return VERR_URI_INVALID_ESCAPE_SEQ;
645	}
646	else
647	return VERR_URI_INVALID_ESCAPE_SEQ;
648	} while (cchLeft > 0 && pchSrc[0] == '%');
649
650	/* Check that we're not expecting more UTF-8 bytes. */
651	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
652	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
653
654	/* next */
655	pchSrc = (const char *)memchr(pchSrc, '%', cchLeft);
656	} while (pchSrc);
657	}
658
659	pParsed->u32Magic = RTURIPARSED_MAGIC;
660	return VINF_SUCCESS;
661	}
662
663
664	RTDECL(int) RTUriParse(const char *pszUri, PRTURIPARSED pParsed)
665	{
666	return rtUriParse(pszUri, pParsed);
667	}
668
669
670	RTDECL(char ) RTUriParsedScheme(const char pszUri, PCRTURIPARSED pParsed)
671	{
672	AssertPtrReturn(pszUri, NULL);
673	AssertPtrReturn(pParsed, NULL);
674	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
675	return RTStrDupN(pszUri, pParsed->cchScheme);
676	}
677
678
679	RTDECL(char ) RTUriParsedAuthority(const char pszUri, PCRTURIPARSED pParsed)
680	{
681	AssertPtrReturn(pszUri, NULL);
682	AssertPtrReturn(pParsed, NULL);
683	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
684	if (pParsed->cchAuthority \|\| (pParsed->fFlags & RTURIPARSED_F_HAS_AUTHORITY))
685	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthority], pParsed->cchAuthority);
686	return NULL;
687	}
688
689
690	RTDECL(char ) RTUriParsedAuthorityUsername(const char pszUri, PCRTURIPARSED pParsed)
691	{
692	AssertPtrReturn(pszUri, NULL);
693	AssertPtrReturn(pParsed, NULL);
694	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
695	if (pParsed->cchAuthorityUsername)
696	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityUsername], pParsed->cchAuthorityUsername);
697	return NULL;
698	}
699
700
701	RTDECL(char ) RTUriParsedAuthorityPassword(const char pszUri, PCRTURIPARSED pParsed)
702	{
703	AssertPtrReturn(pszUri, NULL);
704	AssertPtrReturn(pParsed, NULL);
705	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
706	if (pParsed->cchAuthorityPassword)
707	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityPassword], pParsed->cchAuthorityPassword);
708	return NULL;
709	}
710
711
712	RTDECL(char ) RTUriParsedAuthorityHost(const char pszUri, PCRTURIPARSED pParsed)
713	{
714	AssertPtrReturn(pszUri, NULL);
715	AssertPtrReturn(pParsed, NULL);
716	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
717	if (pParsed->cchAuthorityHost)
718	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityHost], pParsed->cchAuthorityHost);
719	return NULL;
720	}
721
722
723	RTDECL(uint32_t) RTUriParsedAuthorityPort(const char *pszUri, PCRTURIPARSED pParsed)
724	{
725	AssertPtrReturn(pszUri, UINT32_MAX);
726	AssertPtrReturn(pParsed, UINT32_MAX);
727	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, UINT32_MAX);
728	return pParsed->uAuthorityPort;
729	}
730
731
732	RTDECL(char ) RTUriParsedPath(const char pszUri, PCRTURIPARSED pParsed)
733	{
734	AssertPtrReturn(pszUri, NULL);
735	AssertPtrReturn(pParsed, NULL);
736	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
737	if (pParsed->cchPath)
738	return rtUriPercentDecodeN(&pszUri[pParsed->offPath], pParsed->cchPath);
739	return NULL;
740	}
741
742
743	RTDECL(char ) RTUriParsedQuery(const char pszUri, PCRTURIPARSED pParsed)
744	{
745	AssertPtrReturn(pszUri, NULL);
746	AssertPtrReturn(pParsed, NULL);
747	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
748	if (pParsed->cchQuery)
749	return rtUriPercentDecodeN(&pszUri[pParsed->offQuery], pParsed->cchQuery);
750	return NULL;
751	}
752
753
754	RTDECL(char ) RTUriParsedFragment(const char pszUri, PCRTURIPARSED pParsed)
755	{
756	AssertPtrReturn(pszUri, NULL);
757	AssertPtrReturn(pParsed, NULL);
758	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
759	if (pParsed->cchFragment)
760	return rtUriPercentDecodeN(&pszUri[pParsed->offFragment], pParsed->cchFragment);
761	return NULL;
762	}
763
764
765	RTDECL(char ) RTUriCreate(const char pszScheme, const char pszAuthority, const char pszPath, const char *pszQuery,
766	const char *pszFragment)
767	{
768	if (!pszScheme) /* Scheme is minimum requirement */
769	return NULL;
770
771	char *pszResult = 0;
772	char *pszAuthority1 = 0;
773	char *pszPath1 = 0;
774	char *pszQuery1 = 0;
775	char *pszFragment1 = 0;
776
777	do
778	{
779	/* Create the percent encoded strings and calculate the necessary uri
780	* length. */
781	size_t cbSize = strlen(pszScheme) + 1 + 1; /* plus zero byte */
782	if (pszAuthority)
783	{
784	pszAuthority1 = rtUriPercentEncodeN(pszAuthority, RTSTR_MAX);
785	if (!pszAuthority1)
786	break;
787	cbSize += strlen(pszAuthority1) + 2;
788	}
789	if (pszPath)
790	{
791	pszPath1 = rtUriPercentEncodeN(pszPath, RTSTR_MAX);
792	if (!pszPath1)
793	break;
794	cbSize += strlen(pszPath1);
795	}
796	if (pszQuery)
797	{
798	pszQuery1 = rtUriPercentEncodeN(pszQuery, RTSTR_MAX);
799	if (!pszQuery1)
800	break;
801	cbSize += strlen(pszQuery1) + 1;
802	}
803	if (pszFragment)
804	{
805	pszFragment1 = rtUriPercentEncodeN(pszFragment, RTSTR_MAX);
806	if (!pszFragment1)
807	break;
808	cbSize += strlen(pszFragment1) + 1;
809	}
810
811	char pszTmp = pszResult = (char )RTStrAlloc(cbSize);
812	if (!pszResult)
813	break;
814	RT_BZERO(pszTmp, cbSize);
815
816	/* Compose the target uri string. */
817	RTStrCatP(&pszTmp, &cbSize, pszScheme);
818	RTStrCatP(&pszTmp, &cbSize, ":");
819	if (pszAuthority1)
820	{
821	RTStrCatP(&pszTmp, &cbSize, "//");
822	RTStrCatP(&pszTmp, &cbSize, pszAuthority1);
823	}
824	if (pszPath1)
825	{
826	RTStrCatP(&pszTmp, &cbSize, pszPath1);
827	}
828	if (pszQuery1)
829	{
830	RTStrCatP(&pszTmp, &cbSize, "?");
831	RTStrCatP(&pszTmp, &cbSize, pszQuery1);
832	}
833	if (pszFragment1)
834	{
835	RTStrCatP(&pszTmp, &cbSize, "#");
836	RTStrCatP(&pszTmp, &cbSize, pszFragment1);
837	}
838	} while (0);
839
840	/* Cleanup */
841	if (pszAuthority1)
842	RTStrFree(pszAuthority1);
843	if (pszPath1)
844	RTStrFree(pszPath1);
845	if (pszQuery1)
846	RTStrFree(pszQuery1);
847	if (pszFragment1)
848	RTStrFree(pszFragment1);
849
850	return pszResult;
851	}
852
853
854	RTDECL(bool) RTUriIsSchemeMatch(const char pszUri, const char pszScheme)
855	{
856	AssertPtrReturn(pszUri, false);
857	size_t const cchScheme = strlen(pszScheme);
858	return RTStrNICmp(pszUri, pszScheme, cchScheme) == 0
859	&& pszUri[cchScheme] == ':';
860	}
861
862
863	RTDECL(int) RTUriFileCreateEx(const char pszPath, uint32_t fPathStyle, char ppszUri, size_t cbUri, size_t pcchUri)
864	{
865	/*
866	* Validate and adjust input. (RTPathParse check pszPath out for us)
867	*/
868	if (pcchUri)
869	{
870	AssertPtrReturn(pcchUri, VERR_INVALID_POINTER);
871	*pcchUri = ~(size_t)0;
872	}
873	AssertPtrReturn(ppszUri, VERR_INVALID_POINTER);
874	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
875	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
876	fPathStyle = RTPATH_STYLE;
877
878	/*
879	* Let the RTPath code parse the stuff (no reason to duplicate path parsing
880	* and get it slightly wrong here).
881	*/
882	RTPATHPARSED ParsedPath;
883	int rc = RTPathParse(pszPath, &ParsedPath, sizeof(ParsedPath), fPathStyle);
884	if (RT_SUCCESS(rc) \|\| rc == VERR_BUFFER_OVERFLOW)
885	{
886	/* Skip leading slashes. */
887	if (ParsedPath.fProps & RTPATH_PROP_ROOT_SLASH)
888	{
889	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
890	while (pszPath[0] == '/' \|\| pszPath[0] == '\\')
891	pszPath++;
892	else
893	while (pszPath[0] == '/')
894	pszPath++;
895	}
896	const size_t cchPath = strlen(pszPath);
897
898	/*
899	* Calculate the encoded length and figure destination buffering.
900	*/
901	static const char s_szPrefix[] = "file:///";
902	size_t const cchPrefix = sizeof(s_szPrefix) - (ParsedPath.fProps & RTPATH_PROP_UNC ? 2 : 1);
903	size_t cchEncoded = rtUriCalcEncodedLength(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS);
904
905	if (pcchUri)
906	*pcchUri = cchEncoded;
907
908	char *pszDst;
909	char *pszFreeMe = NULL;
910	if (!cbUri \|\| *ppszUri == NULL)
911	{
912	cbUri = RT_MAX(cbUri, cchPrefix + cchEncoded + 1);
913	*ppszUri = pszFreeMe = pszDst = RTStrAlloc(cbUri);
914	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
915	}
916	else if (cchEncoded < cbUri)
917	pszDst = *ppszUri;
918	else
919	return VERR_BUFFER_OVERFLOW;
920
921	/*
922	* Construct the URI.
923	*/
924	memcpy(pszDst, s_szPrefix, cchPrefix);
925	pszDst[cchPrefix] = '\0';
926	rc = rtUriEncodeIntoBuffer(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS, &pszDst[cchPrefix], cbUri - cchPrefix);
927	if (RT_SUCCESS(rc))
928	{
929	Assert(strlen(pszDst) == cbUri - 1);
930	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
931	RTPathChangeToUnixSlashes(pszDst, true /fForce/);
932	return VINF_SUCCESS;
933	}
934
935	AssertRC(rc); /* Impossible! rtUriCalcEncodedLength or something above is busted! */
936	if (pszFreeMe)
937	RTStrFree(pszFreeMe);
938	}
939	return rc;
940	}
941
942
943	RTDECL(char ) RTUriFileCreate(const char pszPath)
944	{
945	char *pszUri = NULL;
946	int rc = RTUriFileCreateEx(pszPath, RTPATH_STR_F_STYLE_HOST, &pszUri, 0 /cbUri/, NULL /pcchUri/);
947	if (RT_SUCCESS(rc))
948	return pszUri;
949	return NULL;
950	}
951
952
953	RTDECL(int) RTUriFilePathEx(const char pszUri, uint32_t fPathStyle, char ppszPath, size_t cbPath, size_t pcchPath)
954	{
955	/*
956	* Validate and adjust input.
957	*/
958	if (pcchPath)
959	{
960	AssertPtrReturn(pcchPath, VERR_INVALID_POINTER);
961	*pcchPath = ~(size_t)0;
962	}
963	AssertPtrReturn(ppszPath, VERR_INVALID_POINTER);
964	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
965	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
966	fPathStyle = RTPATH_STYLE;
967	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
968
969	/*
970	* Check that this is a file URI.
971	*/
972	if (RTStrNICmp(pszUri, RT_STR_TUPLE("file:")) == 0)
973	{ /* likely */ }
974	else
975	return VERR_URI_NOT_FILE_SCHEME;
976
977	/*
978	* We may have a number of variations here, mostly thanks to
979	* various windows software. First the canonical variations:
980	* - file:///C:/Windows/System32/kernel32.dll
981	* - file:///C\|/Windows/System32/kernel32.dll
982	* - file:///C:%5CWindows%5CSystem32%5Ckernel32.dll
983	* - file://localhost/C:%5CWindows%5CSystem32%5Ckernel32.dll
984	* - file://cifsserver.dev/systemshare%5CWindows%5CSystem32%5Ckernel32.dll
985	* - file://cifsserver.dev:139/systemshare%5CWindows%5CSystem32%5Ckernel32.dll (not quite sure here, but whatever)
986	*
987	* Legacy variant without any slashes after the schema:
988	* - file:C:/Windows/System32/kernel32.dll
989	* - file:C\|/Windows/System32%5Ckernel32.dll
990	* - file:~/.bashrc
991	* \--path-/
992	*
993	* Legacy variant with exactly one slashes after the schema:
994	* - file:/C:/Windows/System32%5Ckernel32.dll
995	* - file:/C\|/Windows/System32/kernel32.dll
996	* - file:/usr/bin/env
997	* \---path---/
998	*
999	* Legacy variant with two slashes after the schema and an unescaped DOS path:
1000	* - file://C:/Windows/System32\kernel32.dll (**)
1001	* - file://C\|/Windows/System32\kernel32.dll
1002	* \---path---------------------/
1003	* -- authority, with ':' as non-working port separator
1004	*
1005	* Legacy variant with exactly four slashes after the schema and an unescaped DOS path.
1006	* - file:////C:/Windows\System32\user32.dll
1007	*
1008	* Legacy variant with four or more slashes after the schema and an unescaped UNC path:
1009	* - file:////cifsserver.dev/systemshare/System32%\kernel32.dll
1010	* - file://///cifsserver.dev/systemshare/System32\kernel32.dll
1011	* \---path--------------------------------------------/
1012	*
1013	* The two unescaped variants shouldn't be handed to rtUriParse, which
1014	* is good as we cannot actually handle the one marked by (**). So, handle
1015	* those two special when parsing.
1016	*/
1017	RTURIPARSED Parsed;
1018	int rc;
1019	size_t cSlashes = 0;
1020	while (pszUri[5 + cSlashes] == '/')
1021	cSlashes++;
1022	if ( (cSlashes == 2 \|\| cSlashes == 4)
1023	&& RT_C_IS_ALPHA(pszUri[5 + cSlashes])
1024	&& (pszUri[5 + cSlashes + 1] == ':' \|\| pszUri[5 + cSlashes + 1] == '\|'))
1025	{
1026	RT_ZERO(Parsed); /* RTURIPARSED_F_CONTAINS_ESCAPED_CHARS is now clear. */
1027	Parsed.offPath = 5 + cSlashes;
1028	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1029	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1030	}
1031	else if (cSlashes >= 4)
1032	{
1033	RT_ZERO(Parsed);
1034	Parsed.fFlags = cSlashes > 4 ? RTURIPARSED_F_CONTAINS_ESCAPED_CHARS : 0;
1035	Parsed.offPath = 5 + cSlashes - 2;
1036	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1037	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1038	}
1039	else
1040	rc = rtUriParse(pszUri, &Parsed);
1041	if (RT_SUCCESS(rc))
1042	{
1043	/*
1044	* Ignore localhost as hostname (it's implicit).
1045	*/
1046	static char const s_szLocalhost[] = "localhost";
1047	if ( Parsed.cchAuthorityHost == sizeof(s_szLocalhost) - 1U
1048	&& RTStrNICmp(&pszUri[Parsed.offAuthorityHost], RT_STR_TUPLE(s_szLocalhost)) == 0)
1049	{
1050	Parsed.cchAuthorityHost = 0;
1051	Parsed.cchAuthority = 0;
1052	}
1053
1054	/*
1055	* Ignore leading path slash/separator if we detect a DOS drive letter
1056	* and we don't have a host name.
1057	*/
1058	if ( Parsed.cchPath >= 3
1059	&& Parsed.cchAuthorityHost == 0
1060	&& pszUri[Parsed.offPath] == '/' /* Leading path slash/separator. */
1061	&& ( pszUri[Parsed.offPath + 2] == ':' /* Colon after drive letter. */
1062	\|\| pszUri[Parsed.offPath + 2] == '\|') /* Colon alternative. */
1063	&& RT_C_IS_ALPHA(pszUri[Parsed.offPath + 1]) ) /* Drive letter. */
1064	{
1065	Parsed.offPath++;
1066	Parsed.cchPath--;
1067	}
1068
1069	/*
1070	* Calculate the size of the encoded result.
1071	*
1072	* Since we're happily returning "C:/Windows/System32/kernel.dll"
1073	* style paths when the caller requested UNIX style paths, we will
1074	* return straight UNC paths too ("//cifsserver/share/dir/file").
1075	*/
1076	size_t cchDecodedHost = 0;
1077	size_t cbResult;
1078	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1079	{
1080	cchDecodedHost = rtUriCalcDecodedLength(&pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1081	cbResult = cchDecodedHost + rtUriCalcDecodedLength(&pszUri[Parsed.offPath], Parsed.cchPath) + 1;
1082	}
1083	else
1084	{
1085	cchDecodedHost = 0;
1086	cbResult = Parsed.cchAuthorityHost + Parsed.cchPath + 1;
1087	}
1088	if (pcchPath)
1089	*pcchPath = cbResult - 1;
1090	if (cbResult > 1)
1091	{
1092	/*
1093	* Prepare the necessary buffer space for the result.
1094	*/
1095	char *pszDst;
1096	char *pszFreeMe = NULL;
1097	if (!cbPath \|\| *ppszPath == NULL)
1098	{
1099	cbPath = RT_MAX(cbPath, cbResult);
1100	*ppszPath = pszFreeMe = pszDst = RTStrAlloc(cbPath);
1101	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
1102	}
1103	else if (cbResult <= cbPath)
1104	pszDst = *ppszPath;
1105	else
1106	return VERR_BUFFER_OVERFLOW;
1107
1108	/*
1109	* Compose the result.
1110	*/
1111	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1112	{
1113	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offAuthorityHost],Parsed.cchAuthorityHost,
1114	pszDst, cchDecodedHost + 1);
1115	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cchDecodedHost);
1116	if (RT_SUCCESS(rc))
1117	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offPath], Parsed.cchPath,
1118	&pszDst[cchDecodedHost], cbResult - cchDecodedHost);
1119	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cbResult - 1);
1120	}
1121	else
1122	{
1123	memcpy(pszDst, &pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1124	memcpy(&pszDst[Parsed.cchAuthorityHost], &pszUri[Parsed.offPath], Parsed.cchPath);
1125	pszDst[cbResult - 1] = '\0';
1126	}
1127	if (RT_SUCCESS(rc))
1128	{
1129	/*
1130	* Convert colon DOS driver letter colon alternative.
1131	* We do this regardless of the desired path style.
1132	*/
1133	if ( RT_C_IS_ALPHA(pszDst[0])
1134	&& pszDst[1] == '\|')
1135	pszDst[1] = ':';
1136
1137	/*
1138	* Fix slashes.
1139	*/
1140	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
1141	RTPathChangeToDosSlashes(pszDst, true);
1142	else if (fPathStyle == RTPATH_STR_F_STYLE_UNIX)
1143	RTPathChangeToUnixSlashes(pszDst, true); /** @todo not quite sure how this actually makes sense... */
1144	else
1145	AssertFailed();
1146	return rc;
1147	}
1148
1149	/* bail out */
1150	RTStrFree(pszFreeMe);
1151	}
1152	else
1153	rc = VERR_PATH_ZERO_LENGTH;
1154	}
1155	return rc;
1156	}
1157
1158
1159	RTDECL(char ) RTUriFilePath(const char pszUri)
1160	{
1161	char *pszPath = NULL;
1162	int rc = RTUriFilePathEx(pszUri, RTPATH_STR_F_STYLE_HOST, &pszPath, 0 /cbPath/, NULL /pcchPath/);
1163	if (RT_SUCCESS(rc))
1164	return pszPath;
1165	return NULL;
1166	}
1167

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/misc/uri.cpp@ 76346

Download in other formats: