xmlstring.c@ 107351

Last change on this file since 107351 was 105420, checked in by vboxsync, 6 months ago
libxml2-2.12.6: Applied and adjusted our libxml2 changes to 2.12.6. bugref:10730
Property svn:eol-style set to `native`
File size: 29.9 KB

Line
1	/*
2	* string.c : an XML string utilities module
3	*
4	* This module provides various utility functions for manipulating
5	* the xmlChar* type. All functions named xmlStr* have been moved here
6	* from the parser.c file (their original home).
7	*
8	* See Copyright for the status of this software.
9	*
10	* UTF8 string routines from:
11	* William Brack <[email protected]>
12	*
13	* [email protected]
14	*/
15
16	#define IN_LIBXML
17	#include "libxml.h"
18
19	#include <stdlib.h>
20	#include <string.h>
21	#include <limits.h>
22	#include <libxml/xmlmemory.h>
23	#include <libxml/parserInternals.h>
24	#include <libxml/xmlstring.h>
25
26	#include "private/parser.h"
27	#include "private/string.h"
28
/*
 * Fallback for toolchains whose <stdarg.h> does not provide va_copy:
 * prefer the compiler's __va_copy when it exists, otherwise copy the
 * va_list object bytewise.
 */
#ifndef va_copy
  #ifdef __va_copy
    #define va_copy(dest, src) __va_copy(dest, src)
  #else
    #define va_copy(dest, src) memcpy(dest, src, sizeof(va_list))
  #endif
#endif
36
37	/************************************************************************
38	* *
39	* Commodity functions to handle xmlChars *
40	* *
41	************************************************************************/
42
43	/**
44	* xmlStrndup:
45	* @cur: the input xmlChar *
46	* @len: the len of @cur
47	*
48	* a strndup for array of xmlChar's
49	*
50	* Returns a new xmlChar * or NULL
51	*/
52	xmlChar *
53	xmlStrndup(const xmlChar *cur, int len) {
54	xmlChar *ret;
55
56	if ((cur == NULL) \|\| (len < 0)) return(NULL);
57	ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
58	if (ret == NULL) {
59	return(NULL);
60	}
61	memcpy(ret, cur, len);
62	ret[len] = 0;
63	return(ret);
64	}
65
66	/**
67	* xmlStrdup:
68	* @cur: the input xmlChar *
69	*
70	* a strdup for array of xmlChar's. Since they are supposed to be
71	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
72	* a termination mark of '0'.
73	*
74	* Returns a new xmlChar * or NULL
75	*/
76	xmlChar *
77	xmlStrdup(const xmlChar *cur) {
78	const xmlChar *p = cur;
79
80	if (cur == NULL) return(NULL);
81	while (p != 0) p++; / non input consuming */
82	return(xmlStrndup(cur, p - cur));
83	}
84
85	/**
86	* xmlCharStrndup:
87	* @cur: the input char *
88	* @len: the len of @cur
89	*
90	* a strndup for char's to xmlChar's
91	*
92	* Returns a new xmlChar * or NULL
93	*/
94
95	xmlChar *
96	xmlCharStrndup(const char *cur, int len) {
97	int i;
98	xmlChar *ret;
99
100	if ((cur == NULL) \|\| (len < 0)) return(NULL);
101	ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
102	if (ret == NULL) {
103	return(NULL);
104	}
105	for (i = 0;i < len;i++) {
106	/* Explicit sign change */
107	ret[i] = (xmlChar) cur[i];
108	if (ret[i] == 0) return(ret);
109	}
110	ret[len] = 0;
111	return(ret);
112	}
113
114	/**
115	* xmlCharStrdup:
116	* @cur: the input char *
117	*
118	* a strdup for char's to xmlChar's
119	*
120	* Returns a new xmlChar * or NULL
121	*/
122
123	xmlChar *
124	xmlCharStrdup(const char *cur) {
125	const char *p = cur;
126
127	if (cur == NULL) return(NULL);
128	while (p != '\0') p++; / non input consuming */
129	return(xmlCharStrndup(cur, p - cur));
130	}
131
132	/**
133	* xmlStrcmp:
134	* @str1: the first xmlChar *
135	* @str2: the second xmlChar *
136	*
137	* a strcmp for xmlChar's
138	*
139	* Returns the integer result of the comparison
140	*/
141
142	int
143	xmlStrcmp(const xmlChar str1, const xmlChar str2) {
144	if (str1 == str2) return(0);
145	if (str1 == NULL) return(-1);
146	if (str2 == NULL) return(1);
147	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
148	return(strcmp((const char )str1, (const char )str2));
149	#else
150	do {
151	int tmp = str1++ - str2;
152	if (tmp != 0) return(tmp);
153	} while (*str2++ != 0);
154	return 0;
155	#endif
156	}
157
158	/**
159	* xmlStrEqual:
160	* @str1: the first xmlChar *
161	* @str2: the second xmlChar *
162	*
163	* Check if both strings are equal of have same content.
164	* Should be a bit more readable and faster than xmlStrcmp()
165	*
166	* Returns 1 if they are equal, 0 if they are different
167	*/
168
169	int
170	xmlStrEqual(const xmlChar str1, const xmlChar str2) {
171	if (str1 == str2) return(1);
172	if (str1 == NULL) return(0);
173	if (str2 == NULL) return(0);
174	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
175	return(strcmp((const char )str1, (const char )str2) == 0);
176	#else
177	do {
178	if (str1++ != str2) return(0);
179	} while (*str2++);
180	return(1);
181	#endif
182	}
183
184	/**
185	* xmlStrQEqual:
186	* @pref: the prefix of the QName
187	* @name: the localname of the QName
188	* @str: the second xmlChar *
189	*
190	* Check if a QName is Equal to a given string
191	*
192	* Returns 1 if they are equal, 0 if they are different
193	*/
194
195	int
196	xmlStrQEqual(const xmlChar pref, const xmlChar name, const xmlChar *str) {
197	if (pref == NULL) return(xmlStrEqual(name, str));
198	if (name == NULL) return(0);
199	if (str == NULL) return(0);
200
201	do {
202	if (pref++ != str) return(0);
203	} while ((str++) && (pref));
204	if (*str++ != ':') return(0);
205	do {
206	if (name++ != str) return(0);
207	} while (*str++);
208	return(1);
209	}
210
211	/**
212	* xmlStrncmp:
213	* @str1: the first xmlChar *
214	* @str2: the second xmlChar *
215	* @len: the max comparison length
216	*
217	* a strncmp for xmlChar's
218	*
219	* Returns the integer result of the comparison
220	*/
221
222	int
223	xmlStrncmp(const xmlChar str1, const xmlChar str2, int len) {
224	if (len <= 0) return(0);
225	if (str1 == str2) return(0);
226	if (str1 == NULL) return(-1);
227	if (str2 == NULL) return(1);
228	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
229	return(strncmp((const char )str1, (const char )str2, len));
230	#else
231	do {
232	int tmp = str1++ - str2;
233	if (tmp != 0 \|\| --len == 0) return(tmp);
234	} while (*str2++ != 0);
235	return 0;
236	#endif
237	}
238
239	static const xmlChar casemap[256] = {
240	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
241	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
242	0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
243	0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
244	0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
245	0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
246	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
247	0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
248	0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
249	0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
250	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
251	0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
252	0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
253	0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
254	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
255	0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
256	0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
257	0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
258	0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
259	0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
260	0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
261	0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
262	0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
263	0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
264	0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
265	0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
266	0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
267	0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
268	0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
269	0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
270	0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
271	0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
272	};
273
274	/**
275	* xmlStrcasecmp:
276	* @str1: the first xmlChar *
277	* @str2: the second xmlChar *
278	*
279	* a strcasecmp for xmlChar's
280	*
281	* Returns the integer result of the comparison
282	*/
283
284	int
285	xmlStrcasecmp(const xmlChar str1, const xmlChar str2) {
286	register int tmp;
287
288	if (str1 == str2) return(0);
289	if (str1 == NULL) return(-1);
290	if (str2 == NULL) return(1);
291	do {
292	tmp = casemap[str1++] - casemap[str2];
293	if (tmp != 0) return(tmp);
294	} while (*str2++ != 0);
295	return 0;
296	}
297
298	/**
299	* xmlStrncasecmp:
300	* @str1: the first xmlChar *
301	* @str2: the second xmlChar *
302	* @len: the max comparison length
303	*
304	* a strncasecmp for xmlChar's
305	*
306	* Returns the integer result of the comparison
307	*/
308
309	int
310	xmlStrncasecmp(const xmlChar str1, const xmlChar str2, int len) {
311	register int tmp;
312
313	if (len <= 0) return(0);
314	if (str1 == str2) return(0);
315	if (str1 == NULL) return(-1);
316	if (str2 == NULL) return(1);
317	do {
318	tmp = casemap[str1++] - casemap[str2];
319	if (tmp != 0 \|\| --len == 0) return(tmp);
320	} while (*str2++ != 0);
321	return 0;
322	}
323
324	/**
325	* xmlStrchr:
326	* @str: the xmlChar * array
327	* @val: the xmlChar to search
328	*
329	* a strchr for xmlChar's
330	*
331	* Returns the xmlChar * for the first occurrence or NULL.
332	*/
333
334	const xmlChar *
335	xmlStrchr(const xmlChar *str, xmlChar val) {
336	if (str == NULL) return(NULL);
337	while (str != 0) { / non input consuming */
338	if (str == val) return((xmlChar ) str);
339	str++;
340	}
341	return(NULL);
342	}
343
344	/**
345	* xmlStrstr:
346	* @str: the xmlChar * array (haystack)
347	* @val: the xmlChar to search (needle)
348	*
349	* a strstr for xmlChar's
350	*
351	* Returns the xmlChar * for the first occurrence or NULL.
352	*/
353
354	const xmlChar *
355	xmlStrstr(const xmlChar str, const xmlChar val) {
356	int n;
357
358	if (str == NULL) return(NULL);
359	if (val == NULL) return(NULL);
360	n = xmlStrlen(val);
361
362	if (n == 0) return(str);
363	while (str != 0) { / non input consuming */
364	if (str == val) {
365	if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
366	}
367	str++;
368	}
369	return(NULL);
370	}
371
372	/**
373	* xmlStrcasestr:
374	* @str: the xmlChar * array (haystack)
375	* @val: the xmlChar to search (needle)
376	*
377	* a case-ignoring strstr for xmlChar's
378	*
379	* Returns the xmlChar * for the first occurrence or NULL.
380	*/
381
382	const xmlChar *
383	xmlStrcasestr(const xmlChar str, const xmlChar val) {
384	int n;
385
386	if (str == NULL) return(NULL);
387	if (val == NULL) return(NULL);
388	n = xmlStrlen(val);
389
390	if (n == 0) return(str);
391	while (str != 0) { / non input consuming */
392	if (casemap[str] == casemap[val])
393	if (!xmlStrncasecmp(str, val, n)) return(str);
394	str++;
395	}
396	return(NULL);
397	}
398
399	/**
400	* xmlStrsub:
401	* @str: the xmlChar * array (haystack)
402	* @start: the index of the first char (zero based)
403	* @len: the length of the substring
404	*
405	* Extract a substring of a given string
406	*
407	* Returns the xmlChar * for the first occurrence or NULL.
408	*/
409
410	xmlChar *
411	xmlStrsub(const xmlChar *str, int start, int len) {
412	int i;
413
414	if (str == NULL) return(NULL);
415	if (start < 0) return(NULL);
416	if (len < 0) return(NULL);
417
418	for (i = 0;i < start;i++) {
419	if (*str == 0) return(NULL);
420	str++;
421	}
422	if (*str == 0) return(NULL);
423	return(xmlStrndup(str, len));
424	}
425
426	/**
427	* xmlStrlen:
428	* @str: the xmlChar * array
429	*
430	* length of a xmlChar's string
431	*
432	* Returns the number of xmlChar contained in the ARRAY.
433	*/
434
435	int
436	xmlStrlen(const xmlChar *str) {
437	size_t len = str ? strlen((const char *)str) : 0;
438	return(len > INT_MAX ? 0 : len);
439	}
440
441	/**
442	* xmlStrncat:
443	* @cur: the original xmlChar * array
444	* @add: the xmlChar * array added
445	* @len: the length of @add
446	*
447	* a strncat for array of xmlChar's, it will extend @cur with the len
448	* first bytes of @add. Note that if @len < 0 then this is an API error
449	* and NULL will be returned.
450	*
451	* Returns a new xmlChar *, the original @cur is reallocated and should
452	* not be freed.
453	*/
454
455	xmlChar *
456	xmlStrncat(xmlChar cur, const xmlChar add, int len) {
457	int size;
458	xmlChar *ret;
459
460	if ((add == NULL) \|\| (len == 0))
461	return(cur);
462	if (len < 0)
463	return(NULL);
464	if (cur == NULL)
465	return(xmlStrndup(add, len));
466
467	size = xmlStrlen(cur);
468	if ((size < 0) \|\| (size > INT_MAX - len))
469	return(NULL);
470	ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
471	if (ret == NULL) {
472	xmlFree(cur);
473	return(NULL);
474	}
475	memcpy(&ret[size], add, len);
476	ret[size + len] = 0;
477	return(ret);
478	}
479
480	/**
481	* xmlStrncatNew:
482	* @str1: first xmlChar string
483	* @str2: second xmlChar string
484	* @len: the len of @str2 or < 0
485	*
486	* same as xmlStrncat, but creates a new string. The original
487	* two strings are not freed. If @len is < 0 then the length
488	* will be calculated automatically.
489	*
490	* Returns a new xmlChar * or NULL
491	*/
492	xmlChar *
493	xmlStrncatNew(const xmlChar str1, const xmlChar str2, int len) {
494	int size;
495	xmlChar *ret;
496
497	if (len < 0) {
498	len = xmlStrlen(str2);
499	if (len < 0)
500	return(NULL);
501	}
502	if (str1 == NULL)
503	return(xmlStrndup(str2, len));
504	if ((str2 == NULL) \|\| (len == 0))
505	return(xmlStrdup(str1));
506
507	size = xmlStrlen(str1);
508	if ((size < 0) \|\| (size > INT_MAX - len))
509	return(NULL);
510	ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
511	if (ret == NULL)
512	return(NULL);
513	memcpy(ret, str1, size);
514	memcpy(&ret[size], str2, len);
515	ret[size + len] = 0;
516	return(ret);
517	}
518
519	/**
520	* xmlStrcat:
521	* @cur: the original xmlChar * array
522	* @add: the xmlChar * array added
523	*
524	* a strcat for array of xmlChar's. Since they are supposed to be
525	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
526	* a termination mark of '0'.
527	*
528	* Returns a new xmlChar * containing the concatenated string. The original
529	* @cur is reallocated and should not be freed.
530	*/
531	xmlChar *
532	xmlStrcat(xmlChar cur, const xmlChar add) {
533	const xmlChar *p = add;
534
535	if (add == NULL) return(cur);
536	if (cur == NULL)
537	return(xmlStrdup(add));
538
539	while (p != 0) p++; / non input consuming */
540	return(xmlStrncat(cur, add, p - add));
541	}
542
543	/**
544	* xmlStrPrintf:
545	* @buf: the result buffer.
546	* @len: the result buffer length.
547	* @msg: the message with printf formatting.
548	* @...: extra parameters for the message.
549	*
550	* Formats @msg and places result into @buf.
551	*
552	* Returns the number of characters written to @buf or -1 if an error occurs.
553	*/
554	int
555	xmlStrPrintf(xmlChar buf, int len, const char msg, ...) {
556	va_list args;
557	int ret;
558
559	if((buf == NULL) \|\| (msg == NULL)) {
560	return(-1);
561	}
562
563	va_start(args, msg);
564	ret = vsnprintf((char ) buf, len, (const char ) msg, args);
565	va_end(args);
566	buf[len - 1] = 0; /* be safe ! */
567
568	return(ret);
569	}
570
571	/**
572	* xmlStrVPrintf:
573	* @buf: the result buffer.
574	* @len: the result buffer length.
575	* @msg: the message with printf formatting.
576	* @ap: extra parameters for the message.
577	*
578	* Formats @msg and places result into @buf.
579	*
580	* Returns the number of characters written to @buf or -1 if an error occurs.
581	*/
582	int
583	xmlStrVPrintf(xmlChar buf, int len, const char msg, va_list ap) {
584	int ret;
585
586	if((buf == NULL) \|\| (msg == NULL)) {
587	return(-1);
588	}
589
590	ret = vsnprintf((char ) buf, len, (const char ) msg, ap);
591	buf[len - 1] = 0; /* be safe ! */
592
593	return(ret);
594	}
595
596	/**
597	* xmlStrVASPrintf:
598	* @out: pointer to the resulting string
599	* @maxSize: maximum size of the output buffer
600	* @msg: printf format string
601	* @ap: arguments for format string
602	*
603	* Creates a newly allocated string according to format.
604	*
605	* Returns 0 on success, 1 if the result was truncated or on other
606	* errors, -1 if a memory allocation failed.
607	*/
608	int
609	xmlStrVASPrintf(xmlChar *out, int maxSize, const char msg, va_list ap) {
610	char empty[1];
611	va_list copy;
612	xmlChar *buf;
613	int res, size;
614	int truncated = 0;
615
616	if (out == NULL)
617	return(1);
618	*out = NULL;
619	if (msg == NULL)
620	return(1);
621	if (maxSize < 32)
622	maxSize = 32;
623
624	va_copy(copy, ap);
625	res = vsnprintf(empty, 1, msg, copy);
626	va_end(copy);
627
628	if (res > 0) {
629	/* snprintf seems to work according to C99. */
630
631	if (res < maxSize) {
632	size = res + 1;
633	} else {
634	size = maxSize;
635	truncated = 1;
636	}
637	buf = xmlMalloc(size);
638	if (buf == NULL)
639	return(-1);
640	if (vsnprintf((char *) buf, size, msg, ap) < 0) {
641	xmlFree(buf);
642	return(1);
643	}
644	} else {
645	/*
646	* Unfortunately, older snprintf implementations don't follow the
647	* C99 spec. If the output exceeds the size of the buffer, they can
648	* return -1, 0 or the number of characters written instead of the
649	* needed size. Older MSCVRT also won't write a terminating null
650	* byte if the buffer is too small.
651	*
652	* If the value returned is non-negative and strictly less than
653	* the buffer size (without terminating null), the result should
654	* have been written completely, so we double the buffer size
655	* until this condition is true. This assumes that snprintf will
656	* eventually return a non-negative value. Otherwise, we will
657	* allocate more and more memory until we run out.
658	*
659	* Note that this code path is also executed on conforming
660	* platforms if the output is the empty string.
661	*/
662
663	buf = NULL;
664	size = 32;
665	while (1) {
666	buf = xmlMalloc(size);
667	if (buf == NULL)
668	return(-1);
669
670	va_copy(copy, ap);
671	res = vsnprintf((char *) buf, size, msg, copy);
672	va_end(copy);
673	if ((res >= 0) && (res < size - 1))
674	break;
675
676	if (size >= maxSize) {
677	truncated = 1;
678	break;
679	}
680
681	xmlFree(buf);
682
683	if (size > maxSize / 2)
684	size = maxSize;
685	else
686	size *= 2;
687	}
688	}
689
690	/*
691	* If the output was truncated, make sure that the buffer doesn't
692	* end with a truncated UTF-8 sequence.
693	*/
694	if (truncated != 0) {
695	int i = size - 1;
696
697	while (i > 0) {
698	/* Break after ASCII */
699	if (buf[i-1] < 0x80)
700	break;
701	i -= 1;
702	/* Break before non-ASCII */
703	if (buf[i] >= 0xc0)
704	break;
705	}
706
707	buf[i] = 0;
708	}
709
710	out = (xmlChar ) buf;
711	return(truncated);
712	}
713
714	/**
715	* xmlStrASPrintf:
716	* @out: pointer to the resulting string
717	* @maxSize: maximum size of the output buffer
718	* @msg: printf format string
719	* @...: arguments for format string
720	*
721	* See xmlStrVASPrintf.
722	*
723	* Returns 0 on success, 1 if the result was truncated or on other
724	* errors, -1 if a memory allocation failed.
725	*/
726	int
727	xmlStrASPrintf(xmlChar *out, int maxSize, const char msg, ...) {
728	va_list ap;
729	int ret;
730
731	va_start(ap, msg);
732	ret = xmlStrVASPrintf(out, maxSize, msg, ap);
733	va_end(ap);
734
735	return(ret);
736	}
737
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 *                                                                      *
 * The original note here hoped values > 0xFFFF would not be needed     *
 * anytime soon; the routines below nevertheless handle them with the   *
 * 4-byte form:                                                         *
 * 0001 0000-0010 FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 ************************************************************************/
752
753
754	/**
755	* xmlUTF8Size:
756	* @utf: pointer to the UTF8 character
757	*
758	* calculates the internal size of a UTF8 character
759	*
760	* returns the numbers of bytes in the character, -1 on format error
761	*/
762	int
763	xmlUTF8Size(const xmlChar *utf) {
764	xmlChar mask;
765	int len;
766
767	if (utf == NULL)
768	return -1;
769	if (*utf < 0x80)
770	return 1;
771	/* check valid UTF8 character */
772	if (!(*utf & 0x40))
773	return -1;
774	/* determine number of bytes in char */
775	len = 2;
776	for (mask=0x20; mask != 0; mask>>=1) {
777	if (!(*utf & mask))
778	return len;
779	len++;
780	}
781	return -1;
782	}
783
784	/**
785	* xmlUTF8Charcmp:
786	* @utf1: pointer to first UTF8 char
787	* @utf2: pointer to second UTF8 char
788	*
789	* compares the two UCS4 values
790	*
791	* returns result of the compare as with xmlStrncmp
792	*/
793	int
794	xmlUTF8Charcmp(const xmlChar utf1, const xmlChar utf2) {
795
796	if (utf1 == NULL ) {
797	if (utf2 == NULL)
798	return 0;
799	return -1;
800	}
801	return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
802	}
803
804	/**
805	* xmlUTF8Strlen:
806	* @utf: a sequence of UTF-8 encoded bytes
807	*
808	* compute the length of an UTF8 string, it doesn't do a full UTF8
809	* checking of the content of the string.
810	*
811	* Returns the number of characters in the string or -1 in case of error
812	*/
813	int
814	xmlUTF8Strlen(const xmlChar *utf) {
815	size_t ret = 0;
816
817	if (utf == NULL)
818	return(-1);
819
820	while (*utf != 0) {
821	if (utf[0] & 0x80) {
822	if ((utf[1] & 0xc0) != 0x80)
823	return(-1);
824	if ((utf[0] & 0xe0) == 0xe0) {
825	if ((utf[2] & 0xc0) != 0x80)
826	return(-1);
827	if ((utf[0] & 0xf0) == 0xf0) {
828	if ((utf[0] & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
829	return(-1);
830	utf += 4;
831	} else {
832	utf += 3;
833	}
834	} else {
835	utf += 2;
836	}
837	} else {
838	utf++;
839	}
840	ret++;
841	}
842	return(ret > INT_MAX ? 0 : ret);
843	}
844
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Overlong 2- and 3-byte
 * forms, surrogates, values above 0x10FFFF and invalid lead bytes are
 * rejected.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;

    c = utf[0];
    if (c < 0x80) {
        if (*len < 1)
            goto error;
        /* 1-byte code */
        *len = 1;
    } else {
        if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
            goto error;
        if (c < 0xe0) {
            /* 0x80..0xBF are continuation bytes, 0xC0/0xC1 are overlong */
            if (c < 0xc2)
                goto error;
            /* 2-byte code */
            *len = 2;
            c = (c & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        } else {
            if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
                goto error;
            if (c < 0xf0) {
                /* 3-byte code */
                *len = 3;
                c = (c & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
                /* reject overlong forms and UTF-16 surrogates */
                if ((c < 0x800) || ((c >= 0xd800) && (c < 0xe000)))
                    goto error;
            } else {
                if ((*len < 4) || ((utf[3] & 0xc0) != 0x80))
                    goto error;
                /*
                 * 0xF8..0xFF never start a UTF-8 sequence. Without this
                 * check the "& 0x7" mask below would silently alias
                 * them onto the lead bytes 0xF0..0xF7.
                 */
                if (c >= 0xf8)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (c & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
                /* reject overlong forms and values beyond Unicode */
                if ((c < 0x10000) || (c >= 0x110000))
                    goto error;
            }
        }
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
914
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true (1) if @utf is valid, 0 if it is invalid or NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);
    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    while ((c = utf[0])) {      /* string is 0-terminated */
        ix = 0;
        if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
            ix = 1;
        } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
            if ((utf[1] & 0xc0) != 0x80)
                return 0;
            ix = 2;
        } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
            /* || short-circuits, so nothing past a terminator is read */
            if (((utf[1] & 0xc0) != 0x80) ||
                ((utf[2] & 0xc0) != 0x80))
                return 0;
            ix = 3;
        } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
            if (((utf[1] & 0xc0) != 0x80) ||
                ((utf[2] & 0xc0) != 0x80) ||
                ((utf[3] & 0xc0) != 0x80))
                return 0;
            ix = 4;
        } else                          /* unknown encoding */
            return 0;
        utf += ix;
    }
    return(1);
}
969
970	/**
971	* xmlUTF8Strsize:
972	* @utf: a sequence of UTF-8 encoded bytes
973	* @len: the number of characters in the array
974	*
975	* storage size of an UTF8 string
976	* the behaviour is not guaranteed if the input string is not UTF-8
977	*
978	* Returns the storage size of
979	* the first 'len' characters of ARRAY
980	*/
981
982	int
983	xmlUTF8Strsize(const xmlChar *utf, int len) {
984	const xmlChar *ptr=utf;
985	int ch;
986	size_t ret;
987
988	if (utf == NULL)
989	return(0);
990
991	if (len <= 0)
992	return(0);
993
994	while ( len-- > 0) {
995	if ( !*ptr )
996	break;
997	if ( (ch = *ptr++) & 0x80)
998	while ((ch<<=1) & 0x80 ) {
999	if (*ptr == 0) break;
1000	ptr++;
1001	}
1002	}
1003	ret = ptr - utf;
1004	return (ret > INT_MAX ? 0 : ret);
1005	}
1006
1007
1008	/**
1009	* xmlUTF8Strndup:
1010	* @utf: the input UTF8 *
1011	* @len: the len of @utf (in chars)
1012	*
1013	* a strndup for array of UTF8's
1014	*
1015	* Returns a new UTF8 * or NULL
1016	*/
1017	xmlChar *
1018	xmlUTF8Strndup(const xmlChar *utf, int len) {
1019	xmlChar *ret;
1020	int i;
1021
1022	if ((utf == NULL) \|\| (len < 0)) return(NULL);
1023	i = xmlUTF8Strsize(utf, len);
1024	ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
1025	if (ret == NULL) {
1026	return(NULL);
1027	}
1028	memcpy(ret, utf, i);
1029	ret[i] = 0;
1030	return(ret);
1031	}
1032
1033	/**
1034	* xmlUTF8Strpos:
1035	* @utf: the input UTF8 *
1036	* @pos: the position of the desired UTF8 char (in chars)
1037	*
1038	* a function to provide the equivalent of fetching a
1039	* character from a string array
1040	*
1041	* Returns a pointer to the UTF8 character or NULL
1042	*/
1043	const xmlChar *
1044	xmlUTF8Strpos(const xmlChar *utf, int pos) {
1045	int ch;
1046
1047	if (utf == NULL) return(NULL);
1048	if (pos < 0)
1049	return(NULL);
1050	while (pos--) {
1051	if ((ch=*utf++) == 0) return(NULL);
1052	if ( ch & 0x80 ) {
1053	/* if not simple ascii, verify proper format */
1054	if ( (ch & 0xc0) != 0xc0 )
1055	return(NULL);
1056	/* then skip over remaining bytes for this char */
1057	while ( (ch <<= 1) & 0x80 )
1058	if ( (*utf++ & 0xc0) != 0x80 )
1059	return(NULL);
1060	}
1061	}
1062	return((xmlChar *)utf);
1063	}
1064
1065	/**
1066	* xmlUTF8Strloc:
1067	* @utf: the input UTF8 *
1068	* @utfchar: the UTF8 character to be found
1069	*
1070	* a function to provide the relative location of a UTF8 char
1071	*
1072	* Returns the relative character position of the desired char
1073	* or -1 if not found
1074	*/
1075	int
1076	xmlUTF8Strloc(const xmlChar utf, const xmlChar utfchar) {
1077	size_t i;
1078	int size;
1079	int ch;
1080
1081	if (utf==NULL \|\| utfchar==NULL) return -1;
1082	size = xmlUTF8Strsize(utfchar, 1);
1083	for(i=0; (ch=*utf) != 0; i++) {
1084	if (xmlStrncmp(utf, utfchar, size)==0)
1085	return(i > INT_MAX ? 0 : i);
1086	utf++;
1087	if ( ch & 0x80 ) {
1088	/* if not simple ascii, verify proper format */
1089	if ( (ch & 0xc0) != 0xc0 )
1090	return(-1);
1091	/* then skip over remaining bytes for this char */
1092	while ( (ch <<= 1) & 0x80 )
1093	if ( (*utf++ & 0xc0) != 0x80 )
1094	return(-1);
1095	}
1096	}
1097
1098	return(-1);
1099	}
1100	/**
1101	* xmlUTF8Strsub:
1102	* @utf: a sequence of UTF-8 encoded bytes
1103	* @start: relative pos of first char
1104	* @len: total number to copy
1105	*
1106	* Create a substring from a given UTF-8 string
1107	* Note: positions are given in units of UTF-8 chars
1108	*
1109	* Returns a pointer to a newly created string or NULL if the
1110	* start index is out of bounds or a memory allocation failed.
1111	* If len is too large, the result is truncated.
1112	*/
1113
1114	xmlChar *
1115	xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
1116	int i;
1117	int ch;
1118
1119	if (utf == NULL) return(NULL);
1120	if (start < 0) return(NULL);
1121	if (len < 0) return(NULL);
1122
1123	/*
1124	* Skip over any leading chars
1125	*/
1126	for (i = 0; i < start; i++) {
1127	ch = *utf++;
1128	if (ch == 0)
1129	return(NULL);
1130	/* skip over remaining bytes for this char */
1131	if (ch & 0x80) {
1132	ch <<= 1;
1133	while (ch & 0x80) {
1134	if (*utf++ == 0)
1135	return(NULL);
1136	ch <<= 1;
1137	}
1138	}
1139	}
1140
1141	return(xmlUTF8Strndup(utf, len));
1142	}
1143
1144	/**
1145	* xmlEscapeFormatString:
1146	* @msg: a pointer to the string in which to escape '%' characters.
1147	* Must be a heap-allocated buffer created by libxml2 that may be
1148	* returned, or that may be freed and replaced.
1149	*
1150	* Replaces the string pointed to by 'msg' with an escaped string.
1151	* Returns the same string with all '%' characters escaped.
1152	*/
1153	xmlChar *
1154	xmlEscapeFormatString(xmlChar **msg)
1155	{
1156	xmlChar *msgPtr = NULL;
1157	xmlChar *result = NULL;
1158	xmlChar *resultPtr = NULL;
1159	size_t count = 0;
1160	size_t msgLen = 0;
1161	size_t resultLen = 0;
1162
1163	if (!msg \|\| !*msg)
1164	return(NULL);
1165
1166	for (msgPtr = msg; msgPtr != '\0'; ++msgPtr) {
1167	++msgLen;
1168	if (*msgPtr == '%')
1169	++count;
1170	}
1171
1172	if (count == 0)
1173	return(*msg);
1174
1175	if ((count > INT_MAX) \|\| (msgLen > INT_MAX - count))
1176	return(NULL);
1177	resultLen = msgLen + count + 1;
1178	result = (xmlChar *) xmlMallocAtomic(resultLen);
1179	if (result == NULL) {
1180	/* Clear *msg to prevent format string vulnerabilities in
1181	out-of-memory situations. */
1182	xmlFree(*msg);
1183	*msg = NULL;
1184	return(NULL);
1185	}
1186
1187	for (msgPtr = msg, resultPtr = result; msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1188	resultPtr = msgPtr;
1189	if (*msgPtr == '%')
1190	*(++resultPtr) = '%';
1191	}
1192	result[resultLen - 1] = '\0';
1193
1194	xmlFree(*msg);
1195	*msg = result;
1196
1197	return *msg;
1198	}
1199

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/libxml2-2.13.2/xmlstring.c@ 107351

Download in other formats: