VirtualBox

source: vbox/trunk/src/libs/libxml2-2.13.2/HTMLtree.c@ 107351

Last change on this file since 107351 was 105420, checked in by vboxsync, 6 months ago

libxml2-2.12.6: Applied and adjusted our libxml2 changes to 2.12.6. bugref:10730

  • Property svn:eol-style set to native
File size: 31.9 KB
Line 
1/*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * [email protected]
7 */
8
9
10#define IN_LIBXML
11#include "libxml.h"
12#ifdef LIBXML_HTML_ENABLED
13
14#include <string.h> /* for memset() only ! */
15#include <ctype.h>
16#include <stdlib.h>
17
18#include <libxml/xmlmemory.h>
19#include <libxml/HTMLparser.h>
20#include <libxml/HTMLtree.h>
21#include <libxml/entities.h>
22#include <libxml/xmlerror.h>
23#include <libxml/parserInternals.h>
24#include <libxml/uri.h>
25
26#include "private/buf.h"
27#include "private/error.h"
28#include "private/io.h"
29#include "private/save.h"
30
31/************************************************************************
32 * *
33 * Getting/Setting encoding meta tags *
34 * *
35 ************************************************************************/
36
37/**
38 * htmlGetMetaEncoding:
39 * @doc: the document
40 *
41 * Encoding definition lookup in the Meta tags
42 *
43 * Returns the current encoding as flagged in the HTML source
44 */
45const xmlChar *
46htmlGetMetaEncoding(htmlDocPtr doc) {
47 htmlNodePtr cur;
48 const xmlChar *content;
49 const xmlChar *encoding;
50
51 if (doc == NULL)
52 return(NULL);
53 cur = doc->children;
54
55 /*
56 * Search the html
57 */
58 while (cur != NULL) {
59 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60 if (xmlStrEqual(cur->name, BAD_CAST"html"))
61 break;
62 if (xmlStrEqual(cur->name, BAD_CAST"head"))
63 goto found_head;
64 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65 goto found_meta;
66 }
67 cur = cur->next;
68 }
69 if (cur == NULL)
70 return(NULL);
71 cur = cur->children;
72
73 /*
74 * Search the head
75 */
76 while (cur != NULL) {
77 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78 if (xmlStrEqual(cur->name, BAD_CAST"head"))
79 break;
80 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81 goto found_meta;
82 }
83 cur = cur->next;
84 }
85 if (cur == NULL)
86 return(NULL);
87found_head:
88 cur = cur->children;
89
90 /*
91 * Search the meta elements
92 */
93found_meta:
94 while (cur != NULL) {
95 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97 xmlAttrPtr attr = cur->properties;
98 int http;
99 const xmlChar *value;
100
101 content = NULL;
102 http = 0;
103 while (attr != NULL) {
104 if ((attr->children != NULL) &&
105 (attr->children->type == XML_TEXT_NODE) &&
106 (attr->children->next == NULL)) {
107 value = attr->children->content;
108 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110 http = 1;
111 else if ((value != NULL)
112 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113 content = value;
114 if ((http != 0) && (content != NULL))
115 goto found_content;
116 }
117 attr = attr->next;
118 }
119 }
120 }
121 cur = cur->next;
122 }
123 return(NULL);
124
125found_content:
126 encoding = xmlStrstr(content, BAD_CAST"charset=");
127 if (encoding == NULL)
128 encoding = xmlStrstr(content, BAD_CAST"Charset=");
129 if (encoding == NULL)
130 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131 if (encoding != NULL) {
132 encoding += 8;
133 } else {
134 encoding = xmlStrstr(content, BAD_CAST"charset =");
135 if (encoding == NULL)
136 encoding = xmlStrstr(content, BAD_CAST"Charset =");
137 if (encoding == NULL)
138 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139 if (encoding != NULL)
140 encoding += 9;
141 }
142 if (encoding != NULL) {
143 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144 }
145 return(encoding);
146}
147
148/**
149 * htmlSetMetaEncoding:
150 * @doc: the document
151 * @encoding: the encoding string
152 *
153 * Sets the current encoding in the Meta tags
154 * NOTE: this will not change the document content encoding, just
155 * the META flag associated.
156 *
157 * Returns 0 in case of success and -1 in case of error
158 */
159int
160htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161 htmlNodePtr cur, meta = NULL, head = NULL;
162 const xmlChar *content = NULL;
163 char newcontent[100];
164
165 newcontent[0] = 0;
166
167 if (doc == NULL)
168 return(-1);
169
170 /* html isn't a real encoding it's just libxml2 way to get entities */
171 if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172 return(-1);
173
174 if (encoding != NULL) {
175 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176 (char *)encoding);
177 newcontent[sizeof(newcontent) - 1] = 0;
178 }
179
180 cur = doc->children;
181
182 /*
183 * Search the html
184 */
185 while (cur != NULL) {
186 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188 break;
189 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190 goto found_head;
191 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192 goto found_meta;
193 }
194 cur = cur->next;
195 }
196 if (cur == NULL)
197 return(-1);
198 cur = cur->children;
199
200 /*
201 * Search the head
202 */
203 while (cur != NULL) {
204 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206 break;
207 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208 head = cur->parent;
209 goto found_meta;
210 }
211 }
212 cur = cur->next;
213 }
214 if (cur == NULL)
215 return(-1);
216found_head:
217 head = cur;
218 if (cur->children == NULL)
219 goto create;
220 cur = cur->children;
221
222found_meta:
223 /*
224 * Search and update all the remaining the meta elements carrying
225 * encoding information
226 */
227 while (cur != NULL) {
228 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230 xmlAttrPtr attr = cur->properties;
231 int http;
232 const xmlChar *value;
233
234 content = NULL;
235 http = 0;
236 while (attr != NULL) {
237 if ((attr->children != NULL) &&
238 (attr->children->type == XML_TEXT_NODE) &&
239 (attr->children->next == NULL)) {
240 value = attr->children->content;
241 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243 http = 1;
244 else
245 {
246 if ((value != NULL) &&
247 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248 content = value;
249 }
250 if ((http != 0) && (content != NULL))
251 break;
252 }
253 attr = attr->next;
254 }
255 if ((http != 0) && (content != NULL)) {
256 meta = cur;
257 break;
258 }
259
260 }
261 }
262 cur = cur->next;
263 }
264create:
265 if (meta == NULL) {
266 if ((encoding != NULL) && (head != NULL)) {
267 /*
268 * Create a new Meta element with the right attributes
269 */
270
271 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272 if (head->children == NULL)
273 xmlAddChild(head, meta);
274 else
275 xmlAddPrevSibling(head->children, meta);
276 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278 }
279 } else {
280 /* remove the meta tag if NULL is passed */
281 if (encoding == NULL) {
282 xmlUnlinkNode(meta);
283 xmlFreeNode(meta);
284 }
285 /* change the document only if there is a real encoding change */
286 else if (xmlStrcasestr(content, encoding) == NULL) {
287 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288 }
289 }
290
291
292 return(0);
293}
294
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated table of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL    /* end-of-table marker */
};
308
309
310/**
311 * htmlIsBooleanAttr:
312 * @name: the name of the attribute to check
313 *
314 * Determine if a given attribute is a boolean attribute.
315 *
316 * returns: false if the attribute is not boolean, true otherwise.
317 */
318int
319htmlIsBooleanAttr(const xmlChar *name)
320{
321 int i = 0;
322
323 while (htmlBooleanAttrs[i] != NULL) {
324 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325 return 1;
326 i++;
327 }
328 return 0;
329}
330
331#ifdef LIBXML_OUTPUT_ENABLED
332/************************************************************************
333 * *
334 * Output error handlers *
335 * *
336 ************************************************************************/
337
338/**
339 * htmlSaveErr:
340 * @code: the error number
341 * @node: the location of the error.
342 * @extra: extra information
343 *
344 * Handle an out of memory condition
345 */
346static void
347htmlSaveErr(int code, xmlNodePtr node, const char *extra)
348{
349 const char *msg = NULL;
350 int res;
351
352 switch(code) {
353 case XML_SAVE_NOT_UTF8:
354 msg = "string is not in UTF-8\n";
355 break;
356 case XML_SAVE_CHAR_INVALID:
357 msg = "invalid character value\n";
358 break;
359 case XML_SAVE_UNKNOWN_ENCODING:
360 msg = "unknown encoding %s\n";
361 break;
362 case XML_SAVE_NO_DOCTYPE:
363 msg = "HTML has no DOCTYPE\n";
364 break;
365 default:
366 msg = "unexpected error number\n";
367 }
368
369 res = __xmlRaiseError(NULL, NULL, NULL, NULL, node,
370 XML_FROM_OUTPUT, code, XML_ERR_ERROR, NULL, 0,
371 extra, NULL, NULL, 0, 0,
372 msg, extra);
373 if (res < 0)
374 xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_OUTPUT, NULL);
375}
376
377/************************************************************************
378 * *
379 * Dumping HTML tree content to a simple buffer *
380 * *
381 ************************************************************************/
382
383static xmlCharEncodingHandler *
384htmlFindOutputEncoder(const char *encoding) {
385 xmlCharEncodingHandler *handler = NULL;
386
387 if (encoding != NULL) {
388 int res;
389
390 res = xmlOpenCharEncodingHandler(encoding, /* output */ 1,
391 &handler);
392 if (res != XML_ERR_OK)
393 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
394 } else {
395 /*
396 * Fallback to HTML when the encoding is unspecified
397 */
398 xmlOpenCharEncodingHandler("HTML", /* output */ 1, &handler);
399 }
400
401 return(handler);
402}
403
404/**
405 * htmlBufNodeDumpFormat:
406 * @buf: the xmlBufPtr output
407 * @doc: the document
408 * @cur: the current node
409 * @format: should formatting spaces been added
410 *
411 * Dump an HTML node, recursive behaviour,children are printed too.
412 *
413 * Returns the number of byte written or -1 in case of error
414 */
415static size_t
416htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
417 int format) {
418 size_t use;
419 size_t ret;
420 xmlOutputBufferPtr outbuf;
421
422 if (cur == NULL) {
423 return ((size_t) -1);
424 }
425 if (buf == NULL) {
426 return ((size_t) -1);
427 }
428 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
429 if (outbuf == NULL)
430 return ((size_t) -1);
431 memset(outbuf, 0, sizeof(xmlOutputBuffer));
432 outbuf->buffer = buf;
433 outbuf->encoder = NULL;
434 outbuf->writecallback = NULL;
435 outbuf->closecallback = NULL;
436 outbuf->context = NULL;
437 outbuf->written = 0;
438
439 use = xmlBufUse(buf);
440 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
441 if (outbuf->error)
442 ret = (size_t) -1;
443 else
444 ret = xmlBufUse(buf) - use;
445 xmlFree(outbuf);
446 return (ret);
447}
448
449/**
450 * htmlNodeDump:
451 * @buf: the HTML buffer output
452 * @doc: the document
453 * @cur: the current node
454 *
455 * Dump an HTML node, recursive behaviour,children are printed too,
456 * and formatting returns are added.
457 *
458 * Returns the number of byte written or -1 in case of error
459 */
460int
461htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
462 xmlBufPtr buffer;
463 size_t ret;
464
465 if ((buf == NULL) || (cur == NULL))
466 return(-1);
467
468 xmlInitParser();
469 buffer = xmlBufFromBuffer(buf);
470 if (buffer == NULL)
471 return(-1);
472
473 xmlBufSetAllocationScheme(buffer, XML_BUFFER_ALLOC_DOUBLEIT);
474 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
475
476 xmlBufBackToBuffer(buffer);
477
478 if (ret > INT_MAX)
479 return(-1);
480 return((int) ret);
481}
482
483/**
484 * htmlNodeDumpFileFormat:
485 * @out: the FILE pointer
486 * @doc: the document
487 * @cur: the current node
488 * @encoding: the document encoding
489 * @format: should formatting spaces been added
490 *
491 * Dump an HTML node, recursive behaviour,children are printed too.
492 *
493 * TODO: if encoding == NULL try to save in the doc encoding
494 *
495 * returns: the number of byte written or -1 in case of failure.
496 */
497int
498htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
499 xmlNodePtr cur, const char *encoding, int format) {
500 xmlOutputBufferPtr buf;
501 xmlCharEncodingHandlerPtr handler;
502 int ret;
503
504 xmlInitParser();
505
506 /*
507 * save the content to a temp buffer.
508 */
509 handler = htmlFindOutputEncoder(encoding);
510 buf = xmlOutputBufferCreateFile(out, handler);
511 if (buf == NULL) {
512 xmlCharEncCloseFunc(handler);
513 return(0);
514 }
515
516 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
517
518 ret = xmlOutputBufferClose(buf);
519 return(ret);
520}
521
522/**
523 * htmlNodeDumpFile:
524 * @out: the FILE pointer
525 * @doc: the document
526 * @cur: the current node
527 *
528 * Dump an HTML node, recursive behaviour,children are printed too,
529 * and formatting returns are added.
530 */
531void
532htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
533 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
534}
535
536/**
537 * htmlDocDumpMemoryFormat:
538 * @cur: the document
539 * @mem: OUT: the memory pointer
540 * @size: OUT: the memory length
541 * @format: should formatting spaces been added
542 *
543 * Dump an HTML document in memory and return the xmlChar * and it's size.
544 * It's up to the caller to free the memory.
545 */
546void
547htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
548 xmlOutputBufferPtr buf;
549 xmlCharEncodingHandlerPtr handler = NULL;
550 const char *encoding;
551
552 xmlInitParser();
553
554 if ((mem == NULL) || (size == NULL))
555 return;
556 *mem = NULL;
557 *size = 0;
558 if (cur == NULL)
559 return;
560
561 encoding = (const char *) htmlGetMetaEncoding(cur);
562 handler = htmlFindOutputEncoder(encoding);
563 buf = xmlAllocOutputBufferInternal(handler);
564 if (buf == NULL) {
565 xmlCharEncCloseFunc(handler);
566 return;
567 }
568
569 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
570
571 xmlOutputBufferFlush(buf);
572
573 if (!buf->error) {
574 if (buf->conv != NULL) {
575 *size = xmlBufUse(buf->conv);
576 *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
577 } else {
578 *size = xmlBufUse(buf->buffer);
579 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
580 }
581 }
582
583 xmlOutputBufferClose(buf);
584}
585
586/**
587 * htmlDocDumpMemory:
588 * @cur: the document
589 * @mem: OUT: the memory pointer
590 * @size: OUT: the memory length
591 *
592 * Dump an HTML document in memory and return the xmlChar * and it's size.
593 * It's up to the caller to free the memory.
594 */
595void
596htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
597 htmlDocDumpMemoryFormat(cur, mem, size, 1);
598}
599
600
601/************************************************************************
602 * *
603 * Dumping HTML tree content to an I/O output buffer *
604 * *
605 ************************************************************************/
606
607/**
608 * htmlDtdDumpOutput:
609 * @buf: the HTML buffer output
610 * @doc: the document
611 * @encoding: the encoding string
612 *
613 * TODO: check whether encoding is needed
614 *
615 * Dump the HTML document DTD, if any.
616 */
617static void
618htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
619 const char *encoding ATTRIBUTE_UNUSED) {
620 xmlDtdPtr cur = doc->intSubset;
621
622 if (cur == NULL) {
623 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
624 return;
625 }
626 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
627 xmlOutputBufferWriteString(buf, (const char *)cur->name);
628 if (cur->ExternalID != NULL) {
629 xmlOutputBufferWriteString(buf, " PUBLIC ");
630 xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
631 if (cur->SystemID != NULL) {
632 xmlOutputBufferWriteString(buf, " ");
633 xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
634 }
635 } else if (cur->SystemID != NULL &&
636 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
637 xmlOutputBufferWriteString(buf, " SYSTEM ");
638 xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
639 }
640 xmlOutputBufferWriteString(buf, ">\n");
641}
642
643/**
644 * htmlAttrDumpOutput:
645 * @buf: the HTML buffer output
646 * @doc: the document
647 * @cur: the attribute pointer
648 *
649 * Dump an HTML attribute
650 */
651static void
652htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
653 xmlChar *value;
654
655 /*
656 * The html output method should not escape a & character
657 * occurring in an attribute value immediately followed by
658 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
659 * This is implemented in xmlEncodeEntitiesReentrant
660 */
661
662 if (cur == NULL) {
663 return;
664 }
665 xmlOutputBufferWriteString(buf, " ");
666 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
667 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
668 xmlOutputBufferWriteString(buf, ":");
669 }
670 xmlOutputBufferWriteString(buf, (const char *)cur->name);
671 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
672 value = xmlNodeListGetString(doc, cur->children, 0);
673 if (value) {
674 xmlOutputBufferWriteString(buf, "=");
675 if ((cur->ns == NULL) && (cur->parent != NULL) &&
676 (cur->parent->ns == NULL) &&
677 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
678 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
679 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
680 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
681 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
682 xmlChar *escaped;
683 xmlChar *tmp = value;
684
685 while (IS_BLANK_CH(*tmp)) tmp++;
686
687 /*
688 * Angle brackets are technically illegal in URIs, but they're
689 * used in server side includes, for example. Curly brackets
690 * are illegal as well and often used in templates.
691 * Don't escape non-whitespace, printable ASCII chars for
692 * improved interoperability. Only escape space, control
693 * and non-ASCII chars.
694 */
695 escaped = xmlURIEscapeStr(tmp,
696 BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
697 if (escaped != NULL) {
698 xmlOutputBufferWriteQuotedString(buf, escaped);
699 xmlFree(escaped);
700 } else {
701 buf->error = XML_ERR_NO_MEMORY;
702 }
703 } else {
704 xmlOutputBufferWriteQuotedString(buf, value);
705 }
706 xmlFree(value);
707 } else {
708 buf->error = XML_ERR_NO_MEMORY;
709 }
710 }
711}
712
713/**
714 * htmlNodeDumpFormatOutput:
715 * @buf: the HTML buffer output
716 * @doc: the document
717 * @cur: the current node
718 * @encoding: the encoding string (unused)
719 * @format: should formatting spaces been added
720 *
721 * Dump an HTML node, recursive behaviour,children are printed too.
722 */
723void
724htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
725 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
726 int format) {
727 xmlNodePtr root, parent;
728 xmlAttrPtr attr;
729 const htmlElemDesc * info;
730
731 xmlInitParser();
732
733 if ((cur == NULL) || (buf == NULL)) {
734 return;
735 }
736
737 root = cur;
738 parent = cur->parent;
739 while (1) {
740 switch (cur->type) {
741 case XML_HTML_DOCUMENT_NODE:
742 case XML_DOCUMENT_NODE:
743 if (((xmlDocPtr) cur)->intSubset != NULL) {
744 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
745 }
746 if (cur->children != NULL) {
747 /* Always validate cur->parent when descending. */
748 if (cur->parent == parent) {
749 parent = cur;
750 cur = cur->children;
751 continue;
752 }
753 } else {
754 xmlOutputBufferWriteString(buf, "\n");
755 }
756 break;
757
758 case XML_ELEMENT_NODE:
759 /*
760 * Some users like lxml are known to pass nodes with a corrupted
761 * tree structure. Fall back to a recursive call to handle this
762 * case.
763 */
764 if ((cur->parent != parent) && (cur->children != NULL)) {
765 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
766 break;
767 }
768
769 /*
770 * Get specific HTML info for that node.
771 */
772 if (cur->ns == NULL)
773 info = htmlTagLookup(cur->name);
774 else
775 info = NULL;
776
777 xmlOutputBufferWriteString(buf, "<");
778 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
779 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
780 xmlOutputBufferWriteString(buf, ":");
781 }
782 xmlOutputBufferWriteString(buf, (const char *)cur->name);
783 if (cur->nsDef)
784 xmlNsListDumpOutput(buf, cur->nsDef);
785 attr = cur->properties;
786 while (attr != NULL) {
787 htmlAttrDumpOutput(buf, doc, attr);
788 attr = attr->next;
789 }
790
791 if ((info != NULL) && (info->empty)) {
792 xmlOutputBufferWriteString(buf, ">");
793 } else if (cur->children == NULL) {
794 if ((info != NULL) && (info->saveEndTag != 0) &&
795 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
796 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
797 xmlOutputBufferWriteString(buf, ">");
798 } else {
799 xmlOutputBufferWriteString(buf, "></");
800 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
801 xmlOutputBufferWriteString(buf,
802 (const char *)cur->ns->prefix);
803 xmlOutputBufferWriteString(buf, ":");
804 }
805 xmlOutputBufferWriteString(buf, (const char *)cur->name);
806 xmlOutputBufferWriteString(buf, ">");
807 }
808 } else {
809 xmlOutputBufferWriteString(buf, ">");
810 if ((format) && (info != NULL) && (!info->isinline) &&
811 (cur->children->type != HTML_TEXT_NODE) &&
812 (cur->children->type != HTML_ENTITY_REF_NODE) &&
813 (cur->children != cur->last) &&
814 (cur->name != NULL) &&
815 (cur->name[0] != 'p')) /* p, pre, param */
816 xmlOutputBufferWriteString(buf, "\n");
817 parent = cur;
818 cur = cur->children;
819 continue;
820 }
821
822 if ((format) && (cur->next != NULL) &&
823 (info != NULL) && (!info->isinline)) {
824 if ((cur->next->type != HTML_TEXT_NODE) &&
825 (cur->next->type != HTML_ENTITY_REF_NODE) &&
826 (parent != NULL) &&
827 (parent->name != NULL) &&
828 (parent->name[0] != 'p')) /* p, pre, param */
829 xmlOutputBufferWriteString(buf, "\n");
830 }
831
832 break;
833
834 case XML_ATTRIBUTE_NODE:
835 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
836 break;
837
838 case HTML_TEXT_NODE:
839 if (cur->content == NULL)
840 break;
841 if (((cur->name == (const xmlChar *)xmlStringText) ||
842 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
843 ((parent == NULL) ||
844 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
845 (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
846 xmlChar *buffer;
847
848 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
849 if (buffer == NULL) {
850 buf->error = XML_ERR_NO_MEMORY;
851 return;
852 }
853 xmlOutputBufferWriteString(buf, (const char *)buffer);
854 xmlFree(buffer);
855 } else {
856 xmlOutputBufferWriteString(buf, (const char *)cur->content);
857 }
858 break;
859
860 case HTML_COMMENT_NODE:
861 if (cur->content != NULL) {
862 xmlOutputBufferWriteString(buf, "<!--");
863 xmlOutputBufferWriteString(buf, (const char *)cur->content);
864 xmlOutputBufferWriteString(buf, "-->");
865 }
866 break;
867
868 case HTML_PI_NODE:
869 if (cur->name != NULL) {
870 xmlOutputBufferWriteString(buf, "<?");
871 xmlOutputBufferWriteString(buf, (const char *)cur->name);
872 if (cur->content != NULL) {
873 xmlOutputBufferWriteString(buf, " ");
874 xmlOutputBufferWriteString(buf,
875 (const char *)cur->content);
876 }
877 xmlOutputBufferWriteString(buf, ">");
878 }
879 break;
880
881 case HTML_ENTITY_REF_NODE:
882 xmlOutputBufferWriteString(buf, "&");
883 xmlOutputBufferWriteString(buf, (const char *)cur->name);
884 xmlOutputBufferWriteString(buf, ";");
885 break;
886
887 case HTML_PRESERVE_NODE:
888 if (cur->content != NULL) {
889 xmlOutputBufferWriteString(buf, (const char *)cur->content);
890 }
891 break;
892
893 default:
894 break;
895 }
896
897 while (1) {
898 if (cur == root)
899 return;
900 if (cur->next != NULL) {
901 cur = cur->next;
902 break;
903 }
904
905 cur = parent;
906 /* cur->parent was validated when descending. */
907 parent = cur->parent;
908
909 if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
910 (cur->type == XML_DOCUMENT_NODE)) {
911 xmlOutputBufferWriteString(buf, "\n");
912 } else {
913 if ((format) && (cur->ns == NULL))
914 info = htmlTagLookup(cur->name);
915 else
916 info = NULL;
917
918 if ((format) && (info != NULL) && (!info->isinline) &&
919 (cur->last->type != HTML_TEXT_NODE) &&
920 (cur->last->type != HTML_ENTITY_REF_NODE) &&
921 (cur->children != cur->last) &&
922 (cur->name != NULL) &&
923 (cur->name[0] != 'p')) /* p, pre, param */
924 xmlOutputBufferWriteString(buf, "\n");
925
926 xmlOutputBufferWriteString(buf, "</");
927 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
928 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
929 xmlOutputBufferWriteString(buf, ":");
930 }
931 xmlOutputBufferWriteString(buf, (const char *)cur->name);
932 xmlOutputBufferWriteString(buf, ">");
933
934 if ((format) && (info != NULL) && (!info->isinline) &&
935 (cur->next != NULL)) {
936 if ((cur->next->type != HTML_TEXT_NODE) &&
937 (cur->next->type != HTML_ENTITY_REF_NODE) &&
938 (parent != NULL) &&
939 (parent->name != NULL) &&
940 (parent->name[0] != 'p')) /* p, pre, param */
941 xmlOutputBufferWriteString(buf, "\n");
942 }
943 }
944 }
945 }
946}
947
948/**
949 * htmlNodeDumpOutput:
950 * @buf: the HTML buffer output
951 * @doc: the document
952 * @cur: the current node
953 * @encoding: the encoding string (unused)
954 *
955 * Dump an HTML node, recursive behaviour,children are printed too,
956 * and formatting returns/spaces are added.
957 */
958void
959htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
960 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
961 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
962}
963
964/**
965 * htmlDocContentDumpFormatOutput:
966 * @buf: the HTML buffer output
967 * @cur: the document
968 * @encoding: the encoding string (unused)
969 * @format: should formatting spaces been added
970 *
971 * Dump an HTML document.
972 */
973void
974htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
975 const char *encoding ATTRIBUTE_UNUSED,
976 int format) {
977 int type = 0;
978 if (cur) {
979 type = cur->type;
980 cur->type = XML_HTML_DOCUMENT_NODE;
981 }
982 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
983 if (cur)
984 cur->type = (xmlElementType) type;
985}
986
987/**
988 * htmlDocContentDumpOutput:
989 * @buf: the HTML buffer output
990 * @cur: the document
991 * @encoding: the encoding string (unused)
992 *
993 * Dump an HTML document. Formatting return/spaces are added.
994 */
995void
996htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
997 const char *encoding ATTRIBUTE_UNUSED) {
998 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
999}
1000
1001/************************************************************************
1002 * *
1003 * Saving functions front-ends *
1004 * *
1005 ************************************************************************/
1006
1007/**
1008 * htmlDocDump:
1009 * @f: the FILE*
1010 * @cur: the document
1011 *
1012 * Dump an HTML document to an open FILE.
1013 *
1014 * returns: the number of byte written or -1 in case of failure.
1015 */
1016int
1017htmlDocDump(FILE *f, xmlDocPtr cur) {
1018 xmlOutputBufferPtr buf;
1019 xmlCharEncodingHandlerPtr handler = NULL;
1020 const char *encoding;
1021 int ret;
1022
1023 xmlInitParser();
1024
1025 if ((cur == NULL) || (f == NULL)) {
1026 return(-1);
1027 }
1028
1029 encoding = (const char *) htmlGetMetaEncoding(cur);
1030 handler = htmlFindOutputEncoder(encoding);
1031 buf = xmlOutputBufferCreateFile(f, handler);
1032 if (buf == NULL) {
1033 xmlCharEncCloseFunc(handler);
1034 return(-1);
1035 }
1036 htmlDocContentDumpOutput(buf, cur, NULL);
1037
1038 ret = xmlOutputBufferClose(buf);
1039 return(ret);
1040}
1041
1042/**
1043 * htmlSaveFile:
1044 * @filename: the filename (or URL)
1045 * @cur: the document
1046 *
1047 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1048 * used.
1049 * returns: the number of byte written or -1 in case of failure.
1050 */
1051int
1052htmlSaveFile(const char *filename, xmlDocPtr cur) {
1053 xmlOutputBufferPtr buf;
1054 xmlCharEncodingHandlerPtr handler = NULL;
1055 const char *encoding;
1056 int ret;
1057
1058 if ((cur == NULL) || (filename == NULL))
1059 return(-1);
1060
1061 xmlInitParser();
1062
1063 encoding = (const char *) htmlGetMetaEncoding(cur);
1064 handler = htmlFindOutputEncoder(encoding);
1065 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1066 if (buf == NULL) {
1067 xmlCharEncCloseFunc(handler);
1068 return(0);
1069 }
1070
1071 htmlDocContentDumpOutput(buf, cur, NULL);
1072
1073 ret = xmlOutputBufferClose(buf);
1074 return(ret);
1075}
1076
1077/**
1078 * htmlSaveFileFormat:
1079 * @filename: the filename
1080 * @cur: the document
1081 * @format: should formatting spaces been added
1082 * @encoding: the document encoding
1083 *
1084 * Dump an HTML document to a file using a given encoding.
1085 *
1086 * returns: the number of byte written or -1 in case of failure.
1087 */
1088int
1089htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1090 const char *encoding, int format) {
1091 xmlOutputBufferPtr buf;
1092 xmlCharEncodingHandlerPtr handler = NULL;
1093 int ret;
1094
1095 if ((cur == NULL) || (filename == NULL))
1096 return(-1);
1097
1098 xmlInitParser();
1099
1100 handler = htmlFindOutputEncoder(encoding);
1101 if (handler != NULL)
1102 htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1103 else
1104 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1105
1106 /*
1107 * save the content to a temp buffer.
1108 */
1109 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1110 if (buf == NULL) {
1111 xmlCharEncCloseFunc(handler);
1112 return(0);
1113 }
1114
1115 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1116
1117 ret = xmlOutputBufferClose(buf);
1118 return(ret);
1119}
1120
1121/**
1122 * htmlSaveFileEnc:
1123 * @filename: the filename
1124 * @cur: the document
1125 * @encoding: the document encoding
1126 *
1127 * Dump an HTML document to a file using a given encoding
1128 * and formatting returns/spaces are added.
1129 *
1130 * returns: the number of byte written or -1 in case of failure.
1131 */
1132int
1133htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1134 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1135}
1136
1137#endif /* LIBXML_OUTPUT_ENABLED */
1138
1139#endif /* LIBXML_HTML_ENABLED */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette