HTMLtree.c@ 24345

Last change on this file since 24345 was 6076, checked in by vboxsync, 17 years ago
Merged dmik/s2 branch (r25959:26751) to the trunk.
Property svn:eol-style set to `native` Property svn:keywords set to `Date Revision Author Id`
File size: 31.1 KB

Line
1	/*
2	* HTMLtree.c : implementation of access function for an HTML tree.
3	*
4	* See Copyright for the status of this software.
5	*
6	* daniel@veillard.com
7	*/
8
9
10	#define IN_LIBXML
11	#include "libxml.h"
12	#ifdef LIBXML_HTML_ENABLED
13
14	#include <string.h> /* for memset() only ! */
15
16	#ifdef HAVE_CTYPE_H
17	#include <ctype.h>
18	#endif
19	#ifdef HAVE_STDLIB_H
20	#include <stdlib.h>
21	#endif
22
23	#include <libxml/xmlmemory.h>
24	#include <libxml/HTMLparser.h>
25	#include <libxml/HTMLtree.h>
26	#include <libxml/entities.h>
27	#include <libxml/valid.h>
28	#include <libxml/xmlerror.h>
29	#include <libxml/parserInternals.h>
30	#include <libxml/globals.h>
31	#include <libxml/uri.h>
32
33	/************************************************************************
34	* *
35	* Getting/Setting encoding meta tags *
36	* *
37	************************************************************************/
38
39	/**
40	* htmlGetMetaEncoding:
41	* @doc: the document
42	*
43	* Encoding definition lookup in the Meta tags
44	*
45	* Returns the current encoding as flagged in the HTML source
46	*/
47	const xmlChar *
48	htmlGetMetaEncoding(htmlDocPtr doc) {
49	htmlNodePtr cur;
50	const xmlChar *content;
51	const xmlChar *encoding;
52
53	if (doc == NULL)
54	return(NULL);
55	cur = doc->children;
56
57	/*
58	* Search the html
59	*/
60	while (cur != NULL) {
61	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62	if (xmlStrEqual(cur->name, BAD_CAST"html"))
63	break;
64	if (xmlStrEqual(cur->name, BAD_CAST"head"))
65	goto found_head;
66	if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67	goto found_meta;
68	}
69	cur = cur->next;
70	}
71	if (cur == NULL)
72	return(NULL);
73	cur = cur->children;
74
75	/*
76	* Search the head
77	*/
78	while (cur != NULL) {
79	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80	if (xmlStrEqual(cur->name, BAD_CAST"head"))
81	break;
82	if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83	goto found_meta;
84	}
85	cur = cur->next;
86	}
87	if (cur == NULL)
88	return(NULL);
89	found_head:
90	cur = cur->children;
91
92	/*
93	* Search the meta elements
94	*/
95	found_meta:
96	while (cur != NULL) {
97	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98	if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99	xmlAttrPtr attr = cur->properties;
100	int http;
101	const xmlChar *value;
102
103	content = NULL;
104	http = 0;
105	while (attr != NULL) {
106	if ((attr->children != NULL) &&
107	(attr->children->type == XML_TEXT_NODE) &&
108	(attr->children->next == NULL)) {
109	value = attr->children->content;
110	if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112	http = 1;
113	else if ((value != NULL)
114	&& (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115	content = value;
116	if ((http != 0) && (content != NULL))
117	goto found_content;
118	}
119	attr = attr->next;
120	}
121	}
122	}
123	cur = cur->next;
124	}
125	return(NULL);
126
127	found_content:
128	encoding = xmlStrstr(content, BAD_CAST"charset=");
129	if (encoding == NULL)
130	encoding = xmlStrstr(content, BAD_CAST"Charset=");
131	if (encoding == NULL)
132	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133	if (encoding != NULL) {
134	encoding += 8;
135	} else {
136	encoding = xmlStrstr(content, BAD_CAST"charset =");
137	if (encoding == NULL)
138	encoding = xmlStrstr(content, BAD_CAST"Charset =");
139	if (encoding == NULL)
140	encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141	if (encoding != NULL)
142	encoding += 9;
143	}
144	if (encoding != NULL) {
145	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
146	}
147	return(encoding);
148	}
149
150	/**
151	* htmlSetMetaEncoding:
152	* @doc: the document
153	* @encoding: the encoding string
154	*
155	* Sets the current encoding in the Meta tags
156	* NOTE: this will not change the document content encoding, just
157	* the META flag associated.
158	*
159	* Returns 0 in case of success and -1 in case of error
160	*/
161	int
162	htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163	htmlNodePtr cur, meta;
164	const xmlChar *content;
165	char newcontent[100];
166
167
168	if (doc == NULL)
169	return(-1);
170
171	if (encoding != NULL) {
172	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
173	(char *)encoding);
174	newcontent[sizeof(newcontent) - 1] = 0;
175	}
176
177	cur = doc->children;
178
179	/*
180	* Search the html
181	*/
182	while (cur != NULL) {
183	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
184	if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
185	break;
186	if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
187	goto found_head;
188	if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
189	goto found_meta;
190	}
191	cur = cur->next;
192	}
193	if (cur == NULL)
194	return(-1);
195	cur = cur->children;
196
197	/*
198	* Search the head
199	*/
200	while (cur != NULL) {
201	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
202	if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
203	break;
204	if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
205	goto found_meta;
206	}
207	cur = cur->next;
208	}
209	if (cur == NULL)
210	return(-1);
211	found_head:
212	if (cur->children == NULL) {
213	if (encoding == NULL)
214	return(0);
215	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
216	xmlAddChild(cur, meta);
217	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
218	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
219	return(0);
220	}
221	cur = cur->children;
222
223	found_meta:
224	if (encoding != NULL) {
225	/*
226	* Create a new Meta element with the right attributes
227	*/
228
229	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
230	xmlAddPrevSibling(cur, meta);
231	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
232	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
233	}
234
235	/*
236	* Search and destroy all the remaining the meta elements carrying
237	* encoding informations
238	*/
239	while (cur != NULL) {
240	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
241	if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
242	xmlAttrPtr attr = cur->properties;
243	int http;
244	const xmlChar *value;
245
246	content = NULL;
247	http = 0;
248	while (attr != NULL) {
249	if ((attr->children != NULL) &&
250	(attr->children->type == XML_TEXT_NODE) &&
251	(attr->children->next == NULL)) {
252	value = attr->children->content;
253	if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
254	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
255	http = 1;
256	else
257	{
258	if ((value != NULL) &&
259	(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
260	content = value;
261	}
262	if ((http != 0) && (content != NULL))
263	break;
264	}
265	attr = attr->next;
266	}
267	if ((http != 0) && (content != NULL)) {
268	meta = cur;
269	cur = cur->next;
270	xmlUnlinkNode(meta);
271	xmlFreeNode(meta);
272	continue;
273	}
274
275	}
276	}
277	cur = cur->next;
278	}
279	return(0);
280	}
281
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
295
296
297	/**
298	* htmlIsBooleanAttr:
299	* @name: the name of the attribute to check
300	*
301	* Determine if a given attribute is a boolean attribute.
302	*
303	* returns: false if the attribute is not boolean, true otherwise.
304	*/
305	int
306	htmlIsBooleanAttr(const xmlChar *name)
307	{
308	int i = 0;
309
310	while (htmlBooleanAttrs[i] != NULL) {
311	if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
312	return 1;
313	i++;
314	}
315	return 0;
316	}
317
318	#ifdef LIBXML_OUTPUT_ENABLED
319	/************************************************************************
320	* *
321	* Output error handlers *
322	* *
323	************************************************************************/
324	/**
325	* htmlSaveErrMemory:
326	* @extra: extra informations
327	*
328	* Handle an out of memory condition
329	*/
330	static void
331	htmlSaveErrMemory(const char *extra)
332	{
333	__xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
334	}
335
336	/**
337	* htmlSaveErr:
338	* @code: the error number
339	* @node: the location of the error.
340	* @extra: extra informations
341	*
342	* Handle an out of memory condition
343	*/
344	static void
345	htmlSaveErr(int code, xmlNodePtr node, const char *extra)
346	{
347	const char *msg = NULL;
348
349	switch(code) {
350	case XML_SAVE_NOT_UTF8:
351	msg = "string is not in UTF-8\n";
352	break;
353	case XML_SAVE_CHAR_INVALID:
354	msg = "invalid character value\n";
355	break;
356	case XML_SAVE_UNKNOWN_ENCODING:
357	msg = "unknown encoding %s\n";
358	break;
359	case XML_SAVE_NO_DOCTYPE:
360	msg = "HTML has no DOCTYPE\n";
361	break;
362	default:
363	msg = "unexpected error number\n";
364	}
365	__xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
366	}
367
368	/************************************************************************
369	* *
370	* Dumping HTML tree content to a simple buffer *
371	* *
372	************************************************************************/
373
374	static int
375	htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
376	int format);
377
378	/**
379	* htmlNodeDumpFormat:
380	* @buf: the HTML buffer output
381	* @doc: the document
382	* @cur: the current node
383	* @format: should formatting spaces been added
384	*
385	* Dump an HTML node, recursive behaviour,children are printed too.
386	*
387	* Returns the number of byte written or -1 in case of error
388	*/
389	static int
390	htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
391	int format) {
392	unsigned int use;
393	int ret;
394	xmlOutputBufferPtr outbuf;
395
396	if (cur == NULL) {
397	return (-1);
398	}
399	if (buf == NULL) {
400	return (-1);
401	}
402	outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
403	if (outbuf == NULL) {
404	htmlSaveErrMemory("allocating HTML output buffer");
405	return (-1);
406	}
407	memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
408	outbuf->buffer = buf;
409	outbuf->encoder = NULL;
410	outbuf->writecallback = NULL;
411	outbuf->closecallback = NULL;
412	outbuf->context = NULL;
413	outbuf->written = 0;
414
415	use = buf->use;
416	htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
417	xmlFree(outbuf);
418	ret = buf->use - use;
419	return (ret);
420	}
421
422	/**
423	* htmlNodeDump:
424	* @buf: the HTML buffer output
425	* @doc: the document
426	* @cur: the current node
427	*
428	* Dump an HTML node, recursive behaviour,children are printed too,
429	* and formatting returns are added.
430	*
431	* Returns the number of byte written or -1 in case of error
432	*/
433	int
434	htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
435	xmlInitParser();
436
437	return(htmlNodeDumpFormat(buf, doc, cur, 1));
438	}
439
440	/**
441	* htmlNodeDumpFileFormat:
442	* @out: the FILE pointer
443	* @doc: the document
444	* @cur: the current node
445	* @encoding: the document encoding
446	* @format: should formatting spaces been added
447	*
448	* Dump an HTML node, recursive behaviour,children are printed too.
449	*
450	* TODO: if encoding == NULL try to save in the doc encoding
451	*
452	* returns: the number of byte written or -1 in case of failure.
453	*/
454	int
455	htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
456	xmlNodePtr cur, const char *encoding, int format) {
457	xmlOutputBufferPtr buf;
458	xmlCharEncodingHandlerPtr handler = NULL;
459	int ret;
460
461	xmlInitParser();
462
463	if (encoding != NULL) {
464	xmlCharEncoding enc;
465
466	enc = xmlParseCharEncoding(encoding);
467	if (enc != XML_CHAR_ENCODING_UTF8) {
468	handler = xmlFindCharEncodingHandler(encoding);
469	if (handler == NULL)
470	return(-1);
471	}
472	}
473
474	/*
475	* Fallback to HTML or ASCII when the encoding is unspecified
476	*/
477	if (handler == NULL)
478	handler = xmlFindCharEncodingHandler("HTML");
479	if (handler == NULL)
480	handler = xmlFindCharEncodingHandler("ascii");
481
482	/*
483	* save the content to a temp buffer.
484	*/
485	buf = xmlOutputBufferCreateFile(out, handler);
486	if (buf == NULL) return(0);
487
488	htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
489
490	ret = xmlOutputBufferClose(buf);
491	return(ret);
492	}
493
494	/**
495	* htmlNodeDumpFile:
496	* @out: the FILE pointer
497	* @doc: the document
498	* @cur: the current node
499	*
500	* Dump an HTML node, recursive behaviour,children are printed too,
501	* and formatting returns are added.
502	*/
503	void
504	htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
505	htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
506	}
507
508	/**
509	* htmlDocDumpMemoryFormat:
510	* @cur: the document
511	* @mem: OUT: the memory pointer
512	* @size: OUT: the memory length
513	* @format: should formatting spaces been added
514	*
515	* Dump an HTML document in memory and return the xmlChar * and it's size.
516	* It's up to the caller to free the memory.
517	*/
518	void
519	htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar*mem, int size, int format) {
520	xmlOutputBufferPtr buf;
521	xmlCharEncodingHandlerPtr handler = NULL;
522	const char *encoding;
523
524	xmlInitParser();
525
526	if ((mem == NULL) \|\| (size == NULL))
527	return;
528	if (cur == NULL) {
529	*mem = NULL;
530	*size = 0;
531	return;
532	}
533
534	encoding = (const char *) htmlGetMetaEncoding(cur);
535
536	if (encoding != NULL) {
537	xmlCharEncoding enc;
538
539	enc = xmlParseCharEncoding(encoding);
540	if (enc != cur->charset) {
541	if (cur->charset != XML_CHAR_ENCODING_UTF8) {
542	/*
543	* Not supported yet
544	*/
545	*mem = NULL;
546	*size = 0;
547	return;
548	}
549
550	handler = xmlFindCharEncodingHandler(encoding);
551	if (handler == NULL) {
552	*mem = NULL;
553	*size = 0;
554	return;
555	}
556	} else {
557	handler = xmlFindCharEncodingHandler(encoding);
558	}
559	}
560
561	/*
562	* Fallback to HTML or ASCII when the encoding is unspecified
563	*/
564	if (handler == NULL)
565	handler = xmlFindCharEncodingHandler("HTML");
566	if (handler == NULL)
567	handler = xmlFindCharEncodingHandler("ascii");
568
569	buf = xmlAllocOutputBuffer(handler);
570	if (buf == NULL) {
571	*mem = NULL;
572	*size = 0;
573	return;
574	}
575
576	htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
577
578	xmlOutputBufferFlush(buf);
579	if (buf->conv != NULL) {
580	*size = buf->conv->use;
581	mem = xmlStrndup(buf->conv->content, size);
582	} else {
583	*size = buf->buffer->use;
584	mem = xmlStrndup(buf->buffer->content, size);
585	}
586	(void)xmlOutputBufferClose(buf);
587	}
588
589	/**
590	* htmlDocDumpMemory:
591	* @cur: the document
592	* @mem: OUT: the memory pointer
593	* @size: OUT: the memory length
594	*
595	* Dump an HTML document in memory and return the xmlChar * and it's size.
596	* It's up to the caller to free the memory.
597	*/
598	void
599	htmlDocDumpMemory(xmlDocPtr cur, xmlChar*mem, int size) {
600	htmlDocDumpMemoryFormat(cur, mem, size, 1);
601	}
602
603
604	/************************************************************************
605	* *
606	* Dumping HTML tree content to an I/O output buffer *
607	* *
608	************************************************************************/
609
610	void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
611
612	/**
613	* htmlDtdDumpOutput:
614	* @buf: the HTML buffer output
615	* @doc: the document
616	* @encoding: the encoding string
617	*
618	* TODO: check whether encoding is needed
619	*
620	* Dump the HTML document DTD, if any.
621	*/
622	static void
623	htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
624	const char *encoding ATTRIBUTE_UNUSED) {
625	xmlDtdPtr cur = doc->intSubset;
626
627	if (cur == NULL) {
628	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
629	return;
630	}
631	xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
632	xmlOutputBufferWriteString(buf, (const char *)cur->name);
633	if (cur->ExternalID != NULL) {
634	xmlOutputBufferWriteString(buf, " PUBLIC ");
635	xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
636	if (cur->SystemID != NULL) {
637	xmlOutputBufferWriteString(buf, " ");
638	xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
639	}
640	} else if (cur->SystemID != NULL) {
641	xmlOutputBufferWriteString(buf, " SYSTEM ");
642	xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
643	}
644	xmlOutputBufferWriteString(buf, ">\n");
645	}
646
647	/**
648	* htmlAttrDumpOutput:
649	* @buf: the HTML buffer output
650	* @doc: the document
651	* @cur: the attribute pointer
652	* @encoding: the encoding string
653	*
654	* Dump an HTML attribute
655	*/
656	static void
657	htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
658	const char *encoding ATTRIBUTE_UNUSED) {
659	xmlChar *value;
660
661	/*
662	* TODO: The html output method should not escape a & character
663	* occurring in an attribute value immediately followed by
664	* a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
665	*/
666
667	if (cur == NULL) {
668	return;
669	}
670	xmlOutputBufferWriteString(buf, " ");
671	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
672	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
673	xmlOutputBufferWriteString(buf, ":");
674	}
675	xmlOutputBufferWriteString(buf, (const char *)cur->name);
676	if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
677	value = xmlNodeListGetString(doc, cur->children, 0);
678	if (value) {
679	xmlOutputBufferWriteString(buf, "=");
680	if ((cur->ns == NULL) && (cur->parent != NULL) &&
681	(cur->parent->ns == NULL) &&
682	((!xmlStrcasecmp(cur->name, BAD_CAST "href")) \|\|
683	(!xmlStrcasecmp(cur->name, BAD_CAST "action")) \|\|
684	(!xmlStrcasecmp(cur->name, BAD_CAST "src")) \|\|
685	((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
686	(!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
687	xmlChar *escaped;
688	xmlChar *tmp = value;
689
690	while (IS_BLANK_CH(*tmp)) tmp++;
691
692	escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
693	if (escaped != NULL) {
694	xmlBufferWriteQuotedString(buf->buffer, escaped);
695	xmlFree(escaped);
696	} else {
697	xmlBufferWriteQuotedString(buf->buffer, value);
698	}
699	} else {
700	xmlBufferWriteQuotedString(buf->buffer, value);
701	}
702	xmlFree(value);
703	} else {
704	xmlOutputBufferWriteString(buf, "=\"\"");
705	}
706	}
707	}
708
709	/**
710	* htmlAttrListDumpOutput:
711	* @buf: the HTML buffer output
712	* @doc: the document
713	* @cur: the first attribute pointer
714	* @encoding: the encoding string
715	*
716	* Dump a list of HTML attributes
717	*/
718	static void
719	htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
720	if (cur == NULL) {
721	return;
722	}
723	while (cur != NULL) {
724	htmlAttrDumpOutput(buf, doc, cur, encoding);
725	cur = cur->next;
726	}
727	}
728
729
730
731	/**
732	* htmlNodeListDumpOutput:
733	* @buf: the HTML buffer output
734	* @doc: the document
735	* @cur: the first node
736	* @encoding: the encoding string
737	* @format: should formatting spaces been added
738	*
739	* Dump an HTML node list, recursive behaviour,children are printed too.
740	*/
741	static void
742	htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
743	xmlNodePtr cur, const char *encoding, int format) {
744	if (cur == NULL) {
745	return;
746	}
747	while (cur != NULL) {
748	htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
749	cur = cur->next;
750	}
751	}
752
753	/**
754	* htmlNodeDumpFormatOutput:
755	* @buf: the HTML buffer output
756	* @doc: the document
757	* @cur: the current node
758	* @encoding: the encoding string
759	* @format: should formatting spaces been added
760	*
761	* Dump an HTML node, recursive behaviour,children are printed too.
762	*/
763	void
764	htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
765	xmlNodePtr cur, const char *encoding, int format) {
766	const htmlElemDesc * info;
767
768	xmlInitParser();
769
770	if ((cur == NULL) \|\| (buf == NULL)) {
771	return;
772	}
773	/*
774	* Special cases.
775	*/
776	if (cur->type == XML_DTD_NODE)
777	return;
778	if ((cur->type == XML_HTML_DOCUMENT_NODE) \|\|
779	(cur->type == XML_DOCUMENT_NODE)){
780	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
781	return;
782	}
783	if (cur->type == XML_ATTRIBUTE_NODE) {
784	htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
785	return;
786	}
787	if (cur->type == HTML_TEXT_NODE) {
788	if (cur->content != NULL) {
789	if (((cur->name == (const xmlChar *)xmlStringText) \|\|
790	(cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
791	((cur->parent == NULL) \|\|
792	((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
793	(xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
794	xmlChar *buffer;
795
796	buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
797	if (buffer != NULL) {
798	xmlOutputBufferWriteString(buf, (const char *)buffer);
799	xmlFree(buffer);
800	}
801	} else {
802	xmlOutputBufferWriteString(buf, (const char *)cur->content);
803	}
804	}
805	return;
806	}
807	if (cur->type == HTML_COMMENT_NODE) {
808	if (cur->content != NULL) {
809	xmlOutputBufferWriteString(buf, "<!--");
810	xmlOutputBufferWriteString(buf, (const char *)cur->content);
811	xmlOutputBufferWriteString(buf, "-->");
812	}
813	return;
814	}
815	if (cur->type == HTML_PI_NODE) {
816	if (cur->name == NULL)
817	return;
818	xmlOutputBufferWriteString(buf, "<?");
819	xmlOutputBufferWriteString(buf, (const char *)cur->name);
820	if (cur->content != NULL) {
821	xmlOutputBufferWriteString(buf, " ");
822	xmlOutputBufferWriteString(buf, (const char *)cur->content);
823	}
824	xmlOutputBufferWriteString(buf, ">");
825	return;
826	}
827	if (cur->type == HTML_ENTITY_REF_NODE) {
828	xmlOutputBufferWriteString(buf, "&");
829	xmlOutputBufferWriteString(buf, (const char *)cur->name);
830	xmlOutputBufferWriteString(buf, ";");
831	return;
832	}
833	if (cur->type == HTML_PRESERVE_NODE) {
834	if (cur->content != NULL) {
835	xmlOutputBufferWriteString(buf, (const char *)cur->content);
836	}
837	return;
838	}
839
840	/*
841	* Get specific HTML info for that node.
842	*/
843	if (cur->ns == NULL)
844	info = htmlTagLookup(cur->name);
845	else
846	info = NULL;
847
848	xmlOutputBufferWriteString(buf, "<");
849	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
850	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
851	xmlOutputBufferWriteString(buf, ":");
852	}
853	xmlOutputBufferWriteString(buf, (const char *)cur->name);
854	if (cur->nsDef)
855	xmlNsListDumpOutput(buf, cur->nsDef);
856	if (cur->properties != NULL)
857	htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
858
859	if ((info != NULL) && (info->empty)) {
860	xmlOutputBufferWriteString(buf, ">");
861	if ((format) && (!info->isinline) && (cur->next != NULL)) {
862	if ((cur->next->type != HTML_TEXT_NODE) &&
863	(cur->next->type != HTML_ENTITY_REF_NODE) &&
864	(cur->parent != NULL) &&
865	(cur->parent->name != NULL) &&
866	(cur->parent->name[0] != 'p')) /* p, pre, param */
867	xmlOutputBufferWriteString(buf, "\n");
868	}
869	return;
870	}
871	if (((cur->type == XML_ELEMENT_NODE) \|\| (cur->content == NULL)) &&
872	(cur->children == NULL)) {
873	if ((info != NULL) && (info->saveEndTag != 0) &&
874	(xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
875	(xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
876	xmlOutputBufferWriteString(buf, ">");
877	} else {
878	xmlOutputBufferWriteString(buf, "></");
879	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
880	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
881	xmlOutputBufferWriteString(buf, ":");
882	}
883	xmlOutputBufferWriteString(buf, (const char *)cur->name);
884	xmlOutputBufferWriteString(buf, ">");
885	}
886	if ((format) && (cur->next != NULL) &&
887	(info != NULL) && (!info->isinline)) {
888	if ((cur->next->type != HTML_TEXT_NODE) &&
889	(cur->next->type != HTML_ENTITY_REF_NODE) &&
890	(cur->parent != NULL) &&
891	(cur->parent->name != NULL) &&
892	(cur->parent->name[0] != 'p')) /* p, pre, param */
893	xmlOutputBufferWriteString(buf, "\n");
894	}
895	return;
896	}
897	xmlOutputBufferWriteString(buf, ">");
898	if ((cur->type != XML_ELEMENT_NODE) &&
899	(cur->content != NULL)) {
900	/*
901	* Uses the OutputBuffer property to automatically convert
902	* invalids to charrefs
903	*/
904
905	xmlOutputBufferWriteString(buf, (const char *) cur->content);
906	}
907	if (cur->children != NULL) {
908	if ((format) && (info != NULL) && (!info->isinline) &&
909	(cur->children->type != HTML_TEXT_NODE) &&
910	(cur->children->type != HTML_ENTITY_REF_NODE) &&
911	(cur->children != cur->last) &&
912	(cur->name != NULL) &&
913	(cur->name[0] != 'p')) /* p, pre, param */
914	xmlOutputBufferWriteString(buf, "\n");
915	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
916	if ((format) && (info != NULL) && (!info->isinline) &&
917	(cur->last->type != HTML_TEXT_NODE) &&
918	(cur->last->type != HTML_ENTITY_REF_NODE) &&
919	(cur->children != cur->last) &&
920	(cur->name != NULL) &&
921	(cur->name[0] != 'p')) /* p, pre, param */
922	xmlOutputBufferWriteString(buf, "\n");
923	}
924	xmlOutputBufferWriteString(buf, "</");
925	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
926	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
927	xmlOutputBufferWriteString(buf, ":");
928	}
929	xmlOutputBufferWriteString(buf, (const char *)cur->name);
930	xmlOutputBufferWriteString(buf, ">");
931	if ((format) && (info != NULL) && (!info->isinline) &&
932	(cur->next != NULL)) {
933	if ((cur->next->type != HTML_TEXT_NODE) &&
934	(cur->next->type != HTML_ENTITY_REF_NODE) &&
935	(cur->parent != NULL) &&
936	(cur->parent->name != NULL) &&
937	(cur->parent->name[0] != 'p')) /* p, pre, param */
938	xmlOutputBufferWriteString(buf, "\n");
939	}
940	}
941
942	/**
943	* htmlNodeDumpOutput:
944	* @buf: the HTML buffer output
945	* @doc: the document
946	* @cur: the current node
947	* @encoding: the encoding string
948	*
949	* Dump an HTML node, recursive behaviour,children are printed too,
950	* and formatting returns/spaces are added.
951	*/
952	void
953	htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
954	xmlNodePtr cur, const char *encoding) {
955	htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
956	}
957
958	/**
959	* htmlDocContentDumpFormatOutput:
960	* @buf: the HTML buffer output
961	* @cur: the document
962	* @encoding: the encoding string
963	* @format: should formatting spaces been added
964	*
965	* Dump an HTML document.
966	*/
967	void
968	htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
969	const char *encoding, int format) {
970	int type;
971
972	xmlInitParser();
973
974	if ((buf == NULL) \|\| (cur == NULL))
975	return;
976
977	/*
978	* force to output the stuff as HTML, especially for entities
979	*/
980	type = cur->type;
981	cur->type = XML_HTML_DOCUMENT_NODE;
982	if (cur->intSubset != NULL) {
983	htmlDtdDumpOutput(buf, cur, NULL);
984	}
985	if (cur->children != NULL) {
986	htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
987	}
988	xmlOutputBufferWriteString(buf, "\n");
989	cur->type = (xmlElementType) type;
990	}
991
992	/**
993	* htmlDocContentDumpOutput:
994	* @buf: the HTML buffer output
995	* @cur: the document
996	* @encoding: the encoding string
997	*
998	* Dump an HTML document. Formating return/spaces are added.
999	*/
1000	void
1001	htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1002	const char *encoding) {
1003	htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1004	}
1005
1006	/************************************************************************
1007	* *
1008	* Saving functions front-ends *
1009	* *
1010	************************************************************************/
1011
1012	/**
1013	* htmlDocDump:
1014	* @f: the FILE*
1015	* @cur: the document
1016	*
1017	* Dump an HTML document to an open FILE.
1018	*
1019	* returns: the number of byte written or -1 in case of failure.
1020	*/
1021	int
1022	htmlDocDump(FILE *f, xmlDocPtr cur) {
1023	xmlOutputBufferPtr buf;
1024	xmlCharEncodingHandlerPtr handler = NULL;
1025	const char *encoding;
1026	int ret;
1027
1028	xmlInitParser();
1029
1030	if ((cur == NULL) \|\| (f == NULL)) {
1031	return(-1);
1032	}
1033
1034	encoding = (const char *) htmlGetMetaEncoding(cur);
1035
1036	if (encoding != NULL) {
1037	xmlCharEncoding enc;
1038
1039	enc = xmlParseCharEncoding(encoding);
1040	if (enc != cur->charset) {
1041	if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1042	/*
1043	* Not supported yet
1044	*/
1045	return(-1);
1046	}
1047
1048	handler = xmlFindCharEncodingHandler(encoding);
1049	if (handler == NULL)
1050	return(-1);
1051	} else {
1052	handler = xmlFindCharEncodingHandler(encoding);
1053	}
1054	}
1055
1056	/*
1057	* Fallback to HTML or ASCII when the encoding is unspecified
1058	*/
1059	if (handler == NULL)
1060	handler = xmlFindCharEncodingHandler("HTML");
1061	if (handler == NULL)
1062	handler = xmlFindCharEncodingHandler("ascii");
1063
1064	buf = xmlOutputBufferCreateFile(f, handler);
1065	if (buf == NULL) return(-1);
1066	htmlDocContentDumpOutput(buf, cur, NULL);
1067
1068	ret = xmlOutputBufferClose(buf);
1069	return(ret);
1070	}
1071
1072	/**
1073	* htmlSaveFile:
1074	* @filename: the filename (or URL)
1075	* @cur: the document
1076	*
1077	* Dump an HTML document to a file. If @filename is "-" the stdout file is
1078	* used.
1079	* returns: the number of byte written or -1 in case of failure.
1080	*/
1081	int
1082	htmlSaveFile(const char *filename, xmlDocPtr cur) {
1083	xmlOutputBufferPtr buf;
1084	xmlCharEncodingHandlerPtr handler = NULL;
1085	const char *encoding;
1086	int ret;
1087
1088	if ((cur == NULL) \|\| (filename == NULL))
1089	return(-1);
1090
1091	xmlInitParser();
1092
1093	encoding = (const char *) htmlGetMetaEncoding(cur);
1094
1095	if (encoding != NULL) {
1096	xmlCharEncoding enc;
1097
1098	enc = xmlParseCharEncoding(encoding);
1099	if (enc != cur->charset) {
1100	if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1101	/*
1102	* Not supported yet
1103	*/
1104	return(-1);
1105	}
1106
1107	handler = xmlFindCharEncodingHandler(encoding);
1108	if (handler == NULL)
1109	return(-1);
1110	}
1111	}
1112
1113	/*
1114	* Fallback to HTML or ASCII when the encoding is unspecified
1115	*/
1116	if (handler == NULL)
1117	handler = xmlFindCharEncodingHandler("HTML");
1118	if (handler == NULL)
1119	handler = xmlFindCharEncodingHandler("ascii");
1120
1121	/*
1122	* save the content to a temp buffer.
1123	*/
1124	buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1125	if (buf == NULL) return(0);
1126
1127	htmlDocContentDumpOutput(buf, cur, NULL);
1128
1129	ret = xmlOutputBufferClose(buf);
1130	return(ret);
1131	}
1132
1133	/**
1134	* htmlSaveFileFormat:
1135	* @filename: the filename
1136	* @cur: the document
1137	* @format: should formatting spaces been added
1138	* @encoding: the document encoding
1139	*
1140	* Dump an HTML document to a file using a given encoding.
1141	*
1142	* returns: the number of byte written or -1 in case of failure.
1143	*/
1144	int
1145	htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1146	const char *encoding, int format) {
1147	xmlOutputBufferPtr buf;
1148	xmlCharEncodingHandlerPtr handler = NULL;
1149	int ret;
1150
1151	if ((cur == NULL) \|\| (filename == NULL))
1152	return(-1);
1153
1154	xmlInitParser();
1155
1156	if (encoding != NULL) {
1157	xmlCharEncoding enc;
1158
1159	enc = xmlParseCharEncoding(encoding);
1160	if (enc != cur->charset) {
1161	if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1162	/*
1163	* Not supported yet
1164	*/
1165	return(-1);
1166	}
1167
1168	handler = xmlFindCharEncodingHandler(encoding);
1169	if (handler == NULL)
1170	return(-1);
1171	htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1172	}
1173	} else {
1174	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1175	}
1176
1177	/*
1178	* Fallback to HTML or ASCII when the encoding is unspecified
1179	*/
1180	if (handler == NULL)
1181	handler = xmlFindCharEncodingHandler("HTML");
1182	if (handler == NULL)
1183	handler = xmlFindCharEncodingHandler("ascii");
1184
1185	/*
1186	* save the content to a temp buffer.
1187	*/
1188	buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1189	if (buf == NULL) return(0);
1190
1191	htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1192
1193	ret = xmlOutputBufferClose(buf);
1194	return(ret);
1195	}
1196
1197	/**
1198	* htmlSaveFileEnc:
1199	* @filename: the filename
1200	* @cur: the document
1201	* @encoding: the document encoding
1202	*
1203	* Dump an HTML document to a file using a given encoding
1204	* and formatting returns/spaces are added.
1205	*
1206	* returns: the number of byte written or -1 in case of failure.
1207	*/
1208	int
1209	htmlSaveFileEnc(const char filename, xmlDocPtr cur, const char encoding) {
1210	return(htmlSaveFileFormat(filename, cur, encoding, 1));
1211	}
1212
1213	#endif /* LIBXML_OUTPUT_ENABLED */
1214
1215	#define bottom_HTMLtree
1216	#include "elfgcchack.h"
1217	#endif /* LIBXML_HTML_ENABLED */

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/libxml2-2.6.30/HTMLtree.c@ 24345

Download in other formats: