VirtualBox

source: vbox/trunk/src/libs/libxml2-2.6.31/HTMLparser.c@ 41462

Last change on this file since 41462 was 39915, checked in by vboxsync, 13 years ago

libxml-2.6.31 unmodified

  • Property svn:eol-style set to native
File size: 183.4 KB
Line 
1/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * [email protected]
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
/* Size limits for parser names and buffers; the code using them is outside this excerpt. */
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
/* NOTE(review): file-local flag initialised to 1; its readers are not visible here - confirm its meaning before changing it. */
54static int htmlOmittedDefaultValue = 1;
55
/* Forward declarations; htmlDecodeEntities has external linkage, htmlParseComment is file-local. */
56xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
58static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60/************************************************************************
61 * *
62 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
67 * htmlErrMemory:
68 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
79 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
112 if (ctxt != NULL)
113 ctxt->errNo = error;
114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
139 if (ctxt != NULL)
140 ctxt->errNo = error;
141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
146}
147
148/************************************************************************
149 * *
150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
162 */
163static int
164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
168 ctxt->nameTab = (const xmlChar * *)
169 xmlRealloc((xmlChar * *)ctxt->nameTab,
170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
173 htmlErrMemory(ctxt, NULL);
174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
189static const xmlChar *
190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
192 const xmlChar *ret;
193
194 if (ctxt->nameNr <= 0)
195 return (NULL);
196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
198 return (NULL);
199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
204 ctxt->nameTab[ctxt->nameNr] = NULL;
205 return (ret);
206}
207
208/*
209 * Macros for accessing the content. Those should be used only by the parser,
210 * and not exported.
211 *
212 * Dirty macros, i.e. one need to make assumption on the context to use them
213 *
214 * CUR_PTR return the current pointer to the xmlChar to be parsed.
215 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
216 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217 * in UNICODE mode. This should be used internally by the parser
218 * only to compare to ASCII values otherwise it would break when
219 * running with UTF-8 encoding.
220 * NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
221 * to compare on ASCII based substring.
222 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
223 * it should be used only to compare on ASCII based substring.
224 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
225 * strings without newlines within the parser.
226 *
227 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
228 *
229 * CURRENT Returns the current char value, with the full decoding of
230 * UTF-8 if we are using this mode. It returns an int.
231 * NEXT Skip to the next character, this does the proper decoding
232 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
233 * NEXTL(l) Skip the current unicode character of l xmlChars long.
234 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
235 */
236
/*
 * All of the macros below expect a variable named `ctxt` (the parser
 * context) to be in scope at the point of use.
 */

/* Current byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance by val bytes, updating the char counter and the column. */
#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

/* The val'th byte after the current position. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT() but converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself (usable as an lvalue). */
#define CUR_PTR (ctxt->input->cur)

/*
 * SHRINK and GROW are wrapped in do { } while (0) so that they behave as
 * a single statement, e.g. in an unbraced if/else.
 */
#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a pending token is set in the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/* Step over one character that is l bytes long, tracking line/column. */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/* Append character v (l bytes long when encoded) to buffer b at index i. */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
289
290/**
291 * htmlCurrentChar:
292 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read
294 *
295 * The current char value, if using UTF-8 this may actually span multiple
296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
301 * Returns the current char value and its length
302 */
303
304static int
305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF)
307 return(0);
308
309 if (ctxt->token != 0) {
310 *len = 0;
311 return(ctxt->token);
312 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /*
315 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8:
317 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 *
323 * Check for the 0x110000 limit too
324 */
325 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c;
327 unsigned int val;
328
329 c = *cur;
330 if (c & 0x80) {
331 if (cur[1] == 0)
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) {
336
337 if (cur[2] == 0)
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) {
342 if (cur[3] == 0)
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error;
347 /* 4-byte code */
348 *len = 4;
349 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f;
353 } else {
354 /* 3-byte code */
355 *len = 3;
356 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f;
359 }
360 } else {
361 /* 2-byte code */
362 *len = 2;
363 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f;
365 }
366 if (!IS_CHAR(val)) {
367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val);
369 }
370 return(val);
371 } else {
372 /* 1-byte code */
373 *len = 1;
374 return((int) *ctxt->input->cur);
375 }
376 }
377 /*
378 * Assume it's a fixed length encoding (1) with
379 * a compatible encoding for the ASCII set, since
380 * XML constructs only use < 128 chars
381 */
382 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur);
385
386 /*
387 * Humm this is bad, do an automatic flow conversion
388 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390 ctxt->charset = XML_CHAR_ENCODING_UTF8;
391 return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394 /*
395 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !)
400 */
401 {
402 char buffer[150];
403
404 if (ctxt->input->end - ctxt->input->cur >= 4) {
405 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
406 ctxt->input->cur[0], ctxt->input->cur[1],
407 ctxt->input->cur[2], ctxt->input->cur[3]);
408 } else {
409 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
410 }
411 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
412 "Input is not proper UTF-8, indicate encoding !\n",
413 BAD_CAST buffer, NULL);
414 }
415
416 ctxt->charset = XML_CHAR_ENCODING_8859_1;
417 *len = 1;
418 return((int) *ctxt->input->cur);
419}
420
421/**
422 * htmlSkipBlankChars:
423 * @ctxt: the HTML parser context
424 *
425 * skip all blanks character found at that point in the input streams.
426 *
427 * Returns the number of space chars skipped
428 */
429
430static int
431htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
432 int res = 0;
433
434 while (IS_BLANK_CH(*(ctxt->input->cur))) {
435 if ((*ctxt->input->cur == 0) &&
436 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
437 xmlPopInput(ctxt);
438 } else {
439 if (*(ctxt->input->cur) == '\n') {
440 ctxt->input->line++; ctxt->input->col = 1;
441 } else ctxt->input->col++;
442 ctxt->input->cur++;
443 ctxt->nbChars++;
444 if (*ctxt->input->cur == 0)
445 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
446 }
447 res++;
448 }
449 return(res);
450}
451
452
453
454/************************************************************************
455 * *
456 * The list of HTML elements and their properties *
457 * *
458 ************************************************************************/
459
460/*
461 * Start Tag: 1 means the start tag can be omitted
462 * End Tag: 1 means the end tag can be omitted
463 * 2 means it's forbidden (empty elements)
464 * 3 means the tag is stylistic and should be closed easily
465 * Depr: this element is deprecated
466 * DTD: 1 means that this element is valid only in the Loose DTD
467 * 2 means that this element is valid only in the Frameset DTD
468 *
469 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
470 , subElements , impliedsubelt , Attributes, userdata
471 */
472
473/* Definitions and a couple of vars for HTML Elements */
474
/*
 * Groups of element names, each paired with an NB_* element count.
 *
 * Every group expands to a comma separated list of string literals, so a
 * comma is required between two groups (and between a group and a
 * literal): adjacent string literals would otherwise be concatenated by
 * the compiler into a single bogus name. PCDATA and MODIFIER expand to
 * nothing and therefore must not be followed by a comma of their own.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow (block + inline) and inline elements */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
506
507
508/* ... and for HTML Attributes */
509
/*
 * Groups of attribute names, each paired with an NB_* count. As with
 * the element groups, each macro expands to a comma separated list of
 * string literals.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated attribute lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
527
528
529/* Other declarations that should go inline ... */
530static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
531 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
532 "tabindex", "onfocus", "onblur", NULL } ;
533static const char* const target_attr[] = { "target", NULL } ;
534static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
535static const char* const alt_attr[] = { "alt", NULL } ;
536static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
537static const char* const href_attrs[] = { "href", NULL } ;
538static const char* const clear_attrs[] = { "clear", NULL } ;
539static const char* const inline_p[] = { INLINE, "p", NULL } ;
540
541static const char* const flow_param[] = { FLOW, "param", NULL } ;
542static const char* const applet_attrs[] = { COREATTRS , "codebase",
543 "archive", "alt", "name", "height", "width", "align",
544 "hspace", "vspace", NULL } ;
545static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
546 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
547static const char* const basefont_attrs[] =
548 { "id", "size", "color", "face", NULL } ;
549static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
550static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
551static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
552static const char* const body_depr[] = { "background", "bgcolor", "text",
553 "link", "vlink", "alink", NULL } ;
554static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
555 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
556
557
558static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
559static const char* const col_elt[] = { "col", NULL } ;
560static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
561static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
562static const char* const dl_contents[] = { "dt", "dd", NULL } ;
563static const char* const compact_attr[] = { "compact", NULL } ;
564static const char* const label_attr[] = { "label", NULL } ;
565static const char* const fieldset_contents[] = { FLOW, "legend" } ;
566static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
567static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
568static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
569static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
570static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
571static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
572static const char* const head_attrs[] = { I18N, "profile", NULL } ;
573static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
574static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
575static const char* const version_attr[] = { "version", NULL } ;
576static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
577static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
578static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
579static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
580static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
581static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
582static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
583static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
584static const char* const align_attr[] = { "align", NULL } ;
585static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
586static const char* const map_contents[] = { BLOCK, "area", NULL } ;
587static const char* const name_attr[] = { "name", NULL } ;
588static const char* const action_attr[] = { "action", NULL } ;
589static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
590static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
591static const char* const content_attr[] = { "content", NULL } ;
592static const char* const type_attr[] = { "type", NULL } ;
593static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
594static const char* const object_contents[] = { FLOW, "param", NULL } ;
595static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
596static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
597static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
598static const char* const option_elt[] = { "option", NULL } ;
599static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
600static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
601static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
602static const char* const width_attr[] = { "width", NULL } ;
603static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
604static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
605static const char* const language_attr[] = { "language", NULL } ;
606static const char* const select_content[] = { "optgroup", "option", NULL } ;
607static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
608static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
609static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
610static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
611static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
612static const char* const tr_elt[] = { "tr", NULL } ;
613static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
614static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
615static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
616static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
617static const char* const tr_contents[] = { "th", "td", NULL } ;
618static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
619static const char* const li_elt[] = { "li", NULL } ;
620static const char* const ul_depr[] = { "type", "compact", NULL} ;
621static const char* const dir_attr[] = { "dir", NULL} ;
622
623#define DECL (const char**)
624
625static const htmlElemDesc
626html40ElementTable[] = {
627{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
628 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
629},
630{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
631 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
632},
633{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
634 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
635},
636{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
637 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
638},
639{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
640 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
641},
642{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
643 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
644},
645{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
646 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
647},
648{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
649 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
650},
651{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
652 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
653},
654{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
655 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
656},
657{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
658 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
659},
660{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
661 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
662},
663{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
664 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
665},
666{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
667 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
668},
669{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
670 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
671},
672{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
673 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674},
675{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
676 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
677},
678{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
679 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
680},
681{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
682 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
683},
684{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
685 EMPTY , NULL , DECL col_attrs , NULL, NULL
686},
687{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
688 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
689},
690{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
691 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
692},
693{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
694 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
695},
696{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
697 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
698},
699{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
700 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
701},
702{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
703 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
704},
705{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
706 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
707},
708{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
709 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
710},
711{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
712 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
713},
714{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
715 EMPTY, NULL, DECL embed_attrs, NULL, NULL
716},
717{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
718 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
719},
720{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
721 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
722},
723{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
724 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
725},
726{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
727 EMPTY, NULL, NULL, DECL frame_attrs, NULL
728},
729{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
730 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
731},
732{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
733 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734},
735{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
736 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737},
738{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
739 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740},
741{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
742 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
743},
744{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
745 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
746},
747{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
748 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
749},
750{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
751 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
752},
753{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
754 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
755},
756{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
757 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
758},
759{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
760 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
761},
762{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
763 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
764},
765{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
766 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
767},
768{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
769 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
770},
771{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
772 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
773},
774{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
775 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
776},
777{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
778 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
779},
780{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
781 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
782},
783{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
784 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
785},
786{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
787 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
788},
789{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
790 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
791},
792{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
793 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
794},
795{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
796 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
797},
798{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
799 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
800},
801{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
802 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
803},
804{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
805 DECL html_flow, "div", DECL html_attrs, NULL, NULL
806},
807{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
808 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
809},
810{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
811 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
812},
813{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
814 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
815},
816{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
817 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
818},
819{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
820 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821},
822{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
823 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
824},
825{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
826 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
827},
828{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
829 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
830},
831{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
832 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
833},
834{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
835 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836},
837{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
838 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
839},
840{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
841 DECL select_content, NULL, DECL select_attrs, NULL, NULL
842},
843{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
844 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845},
846{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
847 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848},
849{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
850 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
851},
852{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
856 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
857},
858{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
859 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
860},
861{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
862 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
863},
864{ "table", 0, 0, 0, 0, 0, 0, 0, "",
865 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
866},
867{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
868 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869},
870{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
871 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872},
873{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
874 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
875},
876{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
877 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
878},
879{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
880 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
881},
882{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
883 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
884},
885{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
886 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
887},
888{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
889 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
890},
891{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
892 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893},
894{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
895 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
896},
897{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
898 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
899},
900{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
901 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
902}
903};
904
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups. The first string
 * of a group is a start tag; the strings that follow it, up to the NULL,
 * are the open elements which that start tag implicitly closes.
 * One extra NULL marks the end of the whole table.
 */
static const char * const htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "frameset",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    NULL    /* end of table */
};
965
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content, and inside which a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
978
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The list is not NULL-terminated: users iterate over it with sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",    /* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
1004
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages.
 *
 * Each element is given a priority; when an unexpected end tag shows up,
 * it is only allowed to close open elements whose priority is lower than
 * or equal to its own. Elements which are not listed get the priority
 * stored in the terminating { NULL, ... } entry.
 */

typedef struct {
    const char *name;       /* element name, NULL for the final entry */
    int         priority;   /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }        /* Default priority */
};
1032
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group, unused slots are NULL. Filled by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlStartCloseIndex has been filled. */
static int htmlStartCloseIndexinitialized = 0;
1035
1036/************************************************************************
1037 * *
1038 * functions to handle HTML specific data *
1039 * *
1040 ************************************************************************/
1041
1042/**
1043 * htmlInitAutoClose:
1044 *
1045 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1046 * This is not reentrant. Call xmlInitParser() once before processing in
1047 * case of use in multithreaded programs.
1048 */
1049void
1050htmlInitAutoClose(void) {
1051 int indx, i = 0;
1052
1053 if (htmlStartCloseIndexinitialized) return;
1054
1055 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1056 indx = 0;
1057 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1058 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1059 while (htmlStartClose[i] != NULL) i++;
1060 i++;
1061 }
1062 htmlStartCloseIndexinitialized = 1;
1063}
1064
1065/**
1066 * htmlTagLookup:
1067 * @tag: The tag name in lowercase
1068 *
1069 * Lookup the HTML tag in the ElementTable
1070 *
1071 * Returns the related htmlElemDescPtr or NULL if not found.
1072 */
1073const htmlElemDesc *
1074htmlTagLookup(const xmlChar *tag) {
1075 unsigned int i;
1076
1077 for (i = 0; i < (sizeof(html40ElementTable) /
1078 sizeof(html40ElementTable[0]));i++) {
1079 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1080 return((htmlElemDescPtr) &html40ElementTable[i]);
1081 }
1082 return(NULL);
1083}
1084
1085/**
1086 * htmlGetEndPriority:
1087 * @name: The name of the element to look up the priority for.
1088 *
1089 * Return value: The "endtag" priority.
1090 **/
1091static int
1092htmlGetEndPriority (const xmlChar *name) {
1093 int i = 0;
1094
1095 while ((htmlEndPriority[i].name != NULL) &&
1096 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1097 i++;
1098
1099 return(htmlEndPriority[i].priority);
1100}
1101
1102
1103/**
1104 * htmlCheckAutoClose:
1105 * @newtag: The new tag name
1106 * @oldtag: The old tag name
1107 *
1108 * Checks whether the new tag is one of the registered valid tags for
1109 * closing old.
1110 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1111 *
1112 * Returns 0 if no, 1 if yes.
1113 */
1114static int
1115htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1116{
1117 int i, indx;
1118 const char **closed = NULL;
1119
1120 if (htmlStartCloseIndexinitialized == 0)
1121 htmlInitAutoClose();
1122
1123 /* inefficient, but not a big deal */
1124 for (indx = 0; indx < 100; indx++) {
1125 closed = htmlStartCloseIndex[indx];
1126 if (closed == NULL)
1127 return (0);
1128 if (xmlStrEqual(BAD_CAST * closed, newtag))
1129 break;
1130 }
1131
1132 i = closed - htmlStartClose;
1133 i++;
1134 while (htmlStartClose[i] != NULL) {
1135 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1136 return (1);
1137 }
1138 i++;
1139 }
1140 return (0);
1141}
1142
1143/**
1144 * htmlAutoCloseOnClose:
1145 * @ctxt: an HTML parser context
1146 * @newtag: The new tag name
1147 * @force: force the tag closure
1148 *
1149 * The HTML DTD allows an ending tag to implicitly close other tags.
1150 */
1151static void
1152htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1153{
1154 const htmlElemDesc *info;
1155 int i, priority;
1156
1157 priority = htmlGetEndPriority(newtag);
1158
1159 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1160
1161 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1162 break;
1163 /*
1164 * A missplaced endtag can only close elements with lower
1165 * or equal priority, so if we find an element with higher
1166 * priority before we find an element with
1167 * matching name, we just ignore this endtag
1168 */
1169 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1170 return;
1171 }
1172 if (i < 0)
1173 return;
1174
1175 while (!xmlStrEqual(newtag, ctxt->name)) {
1176 info = htmlTagLookup(ctxt->name);
1177 if ((info != NULL) && (info->endTag == 3)) {
1178 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1179 "Opening and ending tag mismatch: %s and %s\n",
1180 newtag, ctxt->name);
1181 }
1182 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1183 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1184 htmlnamePop(ctxt);
1185 }
1186}
1187
1188/**
1189 * htmlAutoCloseOnEnd:
1190 * @ctxt: an HTML parser context
1191 *
1192 * Close all remaining tags at the end of the stream
1193 */
1194static void
1195htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1196{
1197 int i;
1198
1199 if (ctxt->nameNr == 0)
1200 return;
1201 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1202 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1203 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1204 htmlnamePop(ctxt);
1205 }
1206}
1207
1208/**
1209 * htmlAutoClose:
1210 * @ctxt: an HTML parser context
1211 * @newtag: The new tag name or NULL
1212 *
1213 * The HTML DTD allows a tag to implicitly close other tags.
1214 * The list is kept in htmlStartClose array. This function is
1215 * called when a new tag has been detected and generates the
1216 * appropriates closes if possible/needed.
1217 * If newtag is NULL this mean we are at the end of the resource
1218 * and we should check
1219 */
1220static void
1221htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1222{
1223 while ((newtag != NULL) && (ctxt->name != NULL) &&
1224 (htmlCheckAutoClose(newtag, ctxt->name))) {
1225 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1226 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1227 htmlnamePop(ctxt);
1228 }
1229 if (newtag == NULL) {
1230 htmlAutoCloseOnEnd(ctxt);
1231 return;
1232 }
1233 while ((newtag == NULL) && (ctxt->name != NULL) &&
1234 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1235 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1236 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1237 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1238 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1239 htmlnamePop(ctxt);
1240 }
1241}
1242
1243/**
1244 * htmlAutoCloseTag:
1245 * @doc: the HTML document
1246 * @name: The tag name
1247 * @elem: the HTML element
1248 *
1249 * The HTML DTD allows a tag to implicitly close other tags.
1250 * The list is kept in htmlStartClose array. This function checks
1251 * if the element or one of it's children would autoclose the
1252 * given tag.
1253 *
1254 * Returns 1 if autoclose, 0 otherwise
1255 */
1256int
1257htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1258 htmlNodePtr child;
1259
1260 if (elem == NULL) return(1);
1261 if (xmlStrEqual(name, elem->name)) return(0);
1262 if (htmlCheckAutoClose(elem->name, name)) return(1);
1263 child = elem->children;
1264 while (child != NULL) {
1265 if (htmlAutoCloseTag(doc, name, child)) return(1);
1266 child = child->next;
1267 }
1268 return(0);
1269}
1270
1271/**
1272 * htmlIsAutoClosed:
1273 * @doc: the HTML document
1274 * @elem: the HTML element
1275 *
1276 * The HTML DTD allows a tag to implicitly close other tags.
1277 * The list is kept in htmlStartClose array. This function checks
1278 * if a tag is autoclosed by one of it's child
1279 *
1280 * Returns 1 if autoclosed, 0 otherwise
1281 */
1282int
1283htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1284 htmlNodePtr child;
1285
1286 if (elem == NULL) return(1);
1287 child = elem->children;
1288 while (child != NULL) {
1289 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1290 child = child->next;
1291 }
1292 return(0);
1293}
1294
1295/**
1296 * htmlCheckImplied:
1297 * @ctxt: an HTML parser context
1298 * @newtag: The new tag name
1299 *
1300 * The HTML DTD allows a tag to exists only implicitly
1301 * called when a new tag has been detected and generates the
1302 * appropriates implicit tags if missing
1303 */
1304static void
1305htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1306 if (!htmlOmittedDefaultValue)
1307 return;
1308 if (xmlStrEqual(newtag, BAD_CAST"html"))
1309 return;
1310 if (ctxt->nameNr <= 0) {
1311 htmlnamePush(ctxt, BAD_CAST"html");
1312 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1313 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1314 }
1315 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1316 return;
1317 if ((ctxt->nameNr <= 1) &&
1318 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1319 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1320 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1321 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1322 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1323 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1324 /*
1325 * dropped OBJECT ... i you put it first BODY will be
1326 * assumed !
1327 */
1328 htmlnamePush(ctxt, BAD_CAST"head");
1329 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1330 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1331 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1332 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1333 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1334 int i;
1335 for (i = 0;i < ctxt->nameNr;i++) {
1336 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1337 return;
1338 }
1339 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1340 return;
1341 }
1342 }
1343
1344 htmlnamePush(ctxt, BAD_CAST"body");
1345 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1346 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1347 }
1348}
1349
1350/**
1351 * htmlCheckParagraph
1352 * @ctxt: an HTML parser context
1353 *
1354 * Check whether a p element need to be implied before inserting
1355 * characters in the current element.
1356 *
1357 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1358 * in case of error.
1359 */
1360
1361static int
1362htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1363 const xmlChar *tag;
1364 int i;
1365
1366 if (ctxt == NULL)
1367 return(-1);
1368 tag = ctxt->name;
1369 if (tag == NULL) {
1370 htmlAutoClose(ctxt, BAD_CAST"p");
1371 htmlCheckImplied(ctxt, BAD_CAST"p");
1372 htmlnamePush(ctxt, BAD_CAST"p");
1373 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1374 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1375 return(1);
1376 }
1377 if (!htmlOmittedDefaultValue)
1378 return(0);
1379 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1380 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1381 htmlAutoClose(ctxt, BAD_CAST"p");
1382 htmlCheckImplied(ctxt, BAD_CAST"p");
1383 htmlnamePush(ctxt, BAD_CAST"p");
1384 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1386 return(1);
1387 }
1388 }
1389 return(0);
1390}
1391
1392/**
1393 * htmlIsScriptAttribute:
1394 * @name: an attribute name
1395 *
1396 * Check if an attribute is of content type Script
1397 *
1398 * Returns 1 is the attribute is a script 0 otherwise
1399 */
1400int
1401htmlIsScriptAttribute(const xmlChar *name) {
1402 unsigned int i;
1403
1404 if (name == NULL)
1405 return(0);
1406 /*
1407 * all script attributes start with 'on'
1408 */
1409 if ((name[0] != 'o') || (name[1] != 'n'))
1410 return(0);
1411 for (i = 0;
1412 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1413 i++) {
1414 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1415 return(1);
1416 }
1417 return(0);
1418}
1419
1420/************************************************************************
1421 * *
1422 * The list of HTML predefined entities *
1423 * *
1424 ************************************************************************/
1425
1426
1427static const htmlEntityDesc html40EntitiesTable[] = {
1428/*
1429 * the 4 absolute ones, plus apostrophe.
1430 */
1431{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1432{ 38, "amp", "ampersand, U+0026 ISOnum" },
1433{ 39, "apos", "single quote" },
1434{ 60, "lt", "less-than sign, U+003C ISOnum" },
1435{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1436
1437/*
1438 * A bunch still in the 128-255 range
1439 * Replacing them depend really on the charset used.
1440 */
1441{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1442{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1443{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1444{ 163, "pound","pound sign, U+00A3 ISOnum" },
1445{ 164, "curren","currency sign, U+00A4 ISOnum" },
1446{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1447{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1448{ 167, "sect", "section sign, U+00A7 ISOnum" },
1449{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1450{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1451{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1452{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1453{ 172, "not", "not sign, U+00AC ISOnum" },
1454{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1455{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1456{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1457{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1458{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1459{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1460{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1461{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1462{ 181, "micro","micro sign, U+00B5 ISOnum" },
1463{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1464{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1465{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1466{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1467{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1468{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1469{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1470{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1471{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1472{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1473{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1474{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1475{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1476{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1477{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1478{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1479{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1480{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1481{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1482{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1483{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1484{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1485{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1486{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1487{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1488{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1489{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1490{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1491{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1492{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1493{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1494{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1495{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1496{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1497{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1498{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1499{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1500{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1501{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1502{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1503{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1504{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1505{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1506{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1507{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1508{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1509{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1510{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1511{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1512{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1513{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1514{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1515{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1516{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1517{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1518{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1519{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1520{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1521{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1522{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1523{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1524{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1525{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1526{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1527{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1528{ 247, "divide","division sign, U+00F7 ISOnum" },
1529{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1530{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1531{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1532{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1533{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1534{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1535{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1536{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1537
1538{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1539{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1540{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1541{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1542{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1543
1544/*
1545 * Anything below should really be kept as entities references
1546 */
1547{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1548
1549{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1550{ 732, "tilde","small tilde, U+02DC ISOdia" },
1551
1552{ 913, "Alpha","greek capital letter alpha, U+0391" },
1553{ 914, "Beta", "greek capital letter beta, U+0392" },
1554{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1555{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1556{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1557{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1558{ 919, "Eta", "greek capital letter eta, U+0397" },
1559{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1560{ 921, "Iota", "greek capital letter iota, U+0399" },
1561{ 922, "Kappa","greek capital letter kappa, U+039A" },
1562{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1563{ 924, "Mu", "greek capital letter mu, U+039C" },
1564{ 925, "Nu", "greek capital letter nu, U+039D" },
1565{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1566{ 927, "Omicron","greek capital letter omicron, U+039F" },
1567{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1568{ 929, "Rho", "greek capital letter rho, U+03A1" },
1569{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1570{ 932, "Tau", "greek capital letter tau, U+03A4" },
1571{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1572{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1573{ 935, "Chi", "greek capital letter chi, U+03A7" },
1574{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1575{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1576
1577{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1578{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1579{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1580{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1581{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1582{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1583{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1584{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1585{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1586{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1587{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1588{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1589{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1590{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1591{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1592{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1593{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1594{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1595{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1596{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1597{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1598{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1599{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1600{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1601{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1602{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1603{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1604{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1605
1606{ 8194, "ensp", "en space, U+2002 ISOpub" },
1607{ 8195, "emsp", "em space, U+2003 ISOpub" },
1608{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1609{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1610{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1611{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1612{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1613{ 8211, "ndash","en dash, U+2013 ISOpub" },
1614{ 8212, "mdash","em dash, U+2014 ISOpub" },
1615{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1616{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1617{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1618{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1619{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1620{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1621{ 8224, "dagger","dagger, U+2020 ISOpub" },
1622{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1623
1624{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1625{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1626
1627{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1628
1629{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1630{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1631
1632{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1633{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1634
1635{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1636{ 8260, "frasl","fraction slash, U+2044 NEW" },
1637
1638{ 8364, "euro", "euro sign, U+20AC NEW" },
1639
1640{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1641{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1642{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1643{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1644{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1645{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1646{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1647{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1648{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1649{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1650{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1651{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1652{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1653{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1654{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1655{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1656
1657{ 8704, "forall","for all, U+2200 ISOtech" },
1658{ 8706, "part", "partial differential, U+2202 ISOtech" },
1659{ 8707, "exist","there exists, U+2203 ISOtech" },
1660{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1661{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1662{ 8712, "isin", "element of, U+2208 ISOtech" },
1663{ 8713, "notin","not an element of, U+2209 ISOtech" },
1664{ 8715, "ni", "contains as member, U+220B ISOtech" },
1665{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1666{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1667{ 8722, "minus","minus sign, U+2212 ISOtech" },
1668{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1669{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1670{ 8733, "prop", "proportional to, U+221D ISOtech" },
1671{ 8734, "infin","infinity, U+221E ISOtech" },
1672{ 8736, "ang", "angle, U+2220 ISOamso" },
1673{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1674{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1675{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1676{ 8746, "cup", "union = cup, U+222A ISOtech" },
1677{ 8747, "int", "integral, U+222B ISOtech" },
1678{ 8756, "there4","therefore, U+2234 ISOtech" },
1679{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1680{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1681{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1682{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1683{ 8801, "equiv","identical to, U+2261 ISOtech" },
1684{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1685{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1686{ 8834, "sub", "subset of, U+2282 ISOtech" },
1687{ 8835, "sup", "superset of, U+2283 ISOtech" },
1688{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1689{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1690{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1691{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1692{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1693{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1694{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1695{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1696{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1697{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1698{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1699{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1700{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1701{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1702
1703{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1704{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1705{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1706{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1707
1708};
1709
1710/************************************************************************
1711 * *
1712 * Commodity functions to handle entities *
1713 * *
1714 ************************************************************************/
1715
/*
 * Double the capacity of a heap-allocated xmlChar buffer.
 *
 * For an argument spelled "buffer" the macro also uses a variable named
 * buffer_size (updated in place) and a parser context named ctxt, both of
 * which must be in scope at the point of use.  When the reallocation
 * fails the error is reported through htmlErrMemory(), the old buffer is
 * released and the *enclosing function* returns NULL.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *grown;                                                     \
    buffer##_size *= 2;                                                 \
    grown = (xmlChar *) xmlRealloc(buffer,                              \
                                   buffer##_size * sizeof(xmlChar));    \
    if (grown != NULL) {                                                \
        buffer = grown;                                                 \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1730
1731/**
1732 * htmlEntityLookup:
1733 * @name: the entity name
1734 *
1735 * Lookup the given entity in EntitiesTable
1736 *
1737 * TODO: the linear scan is really ugly, an hash table is really needed.
1738 *
1739 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1740 */
1741const htmlEntityDesc *
1742htmlEntityLookup(const xmlChar *name) {
1743 unsigned int i;
1744
1745 for (i = 0;i < (sizeof(html40EntitiesTable)/
1746 sizeof(html40EntitiesTable[0]));i++) {
1747 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1748 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1749 }
1750 }
1751 return(NULL);
1752}
1753
1754/**
1755 * htmlEntityValueLookup:
1756 * @value: the entity's unicode value
1757 *
1758 * Lookup the given entity in EntitiesTable
1759 *
1760 * TODO: the linear scan is really ugly, an hash table is really needed.
1761 *
1762 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1763 */
1764const htmlEntityDesc *
1765htmlEntityValueLookup(unsigned int value) {
1766 unsigned int i;
1767
1768 for (i = 0;i < (sizeof(html40EntitiesTable)/
1769 sizeof(html40EntitiesTable[0]));i++) {
1770 if (html40EntitiesTable[i].value >= value) {
1771 if (html40EntitiesTable[i].value > value)
1772 break;
1773 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1774 }
1775 }
1776 return(NULL);
1777}
1778
1779/**
1780 * UTF8ToHtml:
1781 * @out: a pointer to an array of bytes to store the result
1782 * @outlen: the length of @out
1783 * @in: a pointer to an array of UTF-8 chars
1784 * @inlen: the length of @in
1785 *
1786 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1787 * plus HTML entities block of chars out.
1788 *
1789 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1790 * The value of @inlen after return is the number of octets consumed
1791 * as the return value is positive, else unpredictable.
1792 * The value of @outlen after return is the number of octets consumed.
1793 */
1794int
1795UTF8ToHtml(unsigned char* out, int *outlen,
1796 const unsigned char* in, int *inlen) {
1797 const unsigned char* processed = in;
1798 const unsigned char* outend;
1799 const unsigned char* outstart = out;
1800 const unsigned char* instart = in;
1801 const unsigned char* inend;
1802 unsigned int c, d;
1803 int trailing;
1804
1805 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1806 if (in == NULL) {
1807 /*
1808 * initialization nothing to do
1809 */
1810 *outlen = 0;
1811 *inlen = 0;
1812 return(0);
1813 }
1814 inend = in + (*inlen);
1815 outend = out + (*outlen);
1816 while (in < inend) {
1817 d = *in++;
1818 if (d < 0x80) { c= d; trailing= 0; }
1819 else if (d < 0xC0) {
1820 /* trailing byte in leading position */
1821 *outlen = out - outstart;
1822 *inlen = processed - instart;
1823 return(-2);
1824 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1825 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1826 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1827 else {
1828 /* no chance for this in Ascii */
1829 *outlen = out - outstart;
1830 *inlen = processed - instart;
1831 return(-2);
1832 }
1833
1834 if (inend - in < trailing) {
1835 break;
1836 }
1837
1838 for ( ; trailing; trailing--) {
1839 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1840 break;
1841 c <<= 6;
1842 c |= d & 0x3F;
1843 }
1844
1845 /* assertion: c is a single UTF-4 value */
1846 if (c < 0x80) {
1847 if (out + 1 >= outend)
1848 break;
1849 *out++ = c;
1850 } else {
1851 int len;
1852 const htmlEntityDesc * ent;
1853 const char *cp;
1854 char nbuf[16];
1855
1856 /*
1857 * Try to lookup a predefined HTML entity for it
1858 */
1859
1860 ent = htmlEntityValueLookup(c);
1861 if (ent == NULL) {
1862 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1863 cp = nbuf;
1864 }
1865 else
1866 cp = ent->name;
1867 len = strlen(cp);
1868 if (out + 2 + len >= outend)
1869 break;
1870 *out++ = '&';
1871 memcpy(out, cp, len);
1872 out += len;
1873 *out++ = ';';
1874 }
1875 processed = in;
1876 }
1877 *outlen = out - outstart;
1878 *inlen = processed - instart;
1879 return(0);
1880}
1881
1882/**
1883 * htmlEncodeEntities:
1884 * @out: a pointer to an array of bytes to store the result
1885 * @outlen: the length of @out
1886 * @in: a pointer to an array of UTF-8 chars
1887 * @inlen: the length of @in
1888 * @quoteChar: the quote character to escape (' or ") or zero.
1889 *
1890 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1891 * plus HTML entities block of chars out.
1892 *
1893 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1894 * The value of @inlen after return is the number of octets consumed
1895 * as the return value is positive, else unpredictable.
1896 * The value of @outlen after return is the number of octets consumed.
1897 */
1898int
1899htmlEncodeEntities(unsigned char* out, int *outlen,
1900 const unsigned char* in, int *inlen, int quoteChar) {
1901 const unsigned char* processed = in;
1902 const unsigned char* outend;
1903 const unsigned char* outstart = out;
1904 const unsigned char* instart = in;
1905 const unsigned char* inend;
1906 unsigned int c, d;
1907 int trailing;
1908
1909 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1910 return(-1);
1911 outend = out + (*outlen);
1912 inend = in + (*inlen);
1913 while (in < inend) {
1914 d = *in++;
1915 if (d < 0x80) { c= d; trailing= 0; }
1916 else if (d < 0xC0) {
1917 /* trailing byte in leading position */
1918 *outlen = out - outstart;
1919 *inlen = processed - instart;
1920 return(-2);
1921 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1922 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1923 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1924 else {
1925 /* no chance for this in Ascii */
1926 *outlen = out - outstart;
1927 *inlen = processed - instart;
1928 return(-2);
1929 }
1930
1931 if (inend - in < trailing)
1932 break;
1933
1934 while (trailing--) {
1935 if (((d= *in++) & 0xC0) != 0x80) {
1936 *outlen = out - outstart;
1937 *inlen = processed - instart;
1938 return(-2);
1939 }
1940 c <<= 6;
1941 c |= d & 0x3F;
1942 }
1943
1944 /* assertion: c is a single UTF-4 value */
1945 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1946 (c != '&') && (c != '<') && (c != '>')) {
1947 if (out >= outend)
1948 break;
1949 *out++ = c;
1950 } else {
1951 const htmlEntityDesc * ent;
1952 const char *cp;
1953 char nbuf[16];
1954 int len;
1955
1956 /*
1957 * Try to lookup a predefined HTML entity for it
1958 */
1959 ent = htmlEntityValueLookup(c);
1960 if (ent == NULL) {
1961 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1962 cp = nbuf;
1963 }
1964 else
1965 cp = ent->name;
1966 len = strlen(cp);
1967 if (out + 2 + len > outend)
1968 break;
1969 *out++ = '&';
1970 memcpy(out, cp, len);
1971 out += len;
1972 *out++ = ';';
1973 }
1974 processed = in;
1975 }
1976 *outlen = out - outstart;
1977 *inlen = processed - instart;
1978 return(0);
1979}
1980
1981/************************************************************************
1982 * *
1983 * Commodity functions to handle streams *
1984 * *
1985 ************************************************************************/
1986
1987/**
1988 * htmlNewInputStream:
1989 * @ctxt: an HTML parser context
1990 *
1991 * Create a new input stream structure
1992 * Returns the new input stream or NULL
1993 */
1994static htmlParserInputPtr
1995htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1996 htmlParserInputPtr input;
1997
1998 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1999 if (input == NULL) {
2000 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2001 return(NULL);
2002 }
2003 memset(input, 0, sizeof(htmlParserInput));
2004 input->filename = NULL;
2005 input->directory = NULL;
2006 input->base = NULL;
2007 input->cur = NULL;
2008 input->buf = NULL;
2009 input->line = 1;
2010 input->col = 1;
2011 input->buf = NULL;
2012 input->free = NULL;
2013 input->version = NULL;
2014 input->consumed = 0;
2015 input->length = 0;
2016 return(input);
2017}
2018
2019
2020/************************************************************************
2021 * *
2022 * Commodity functions, cleanup needed ? *
2023 * *
2024 ************************************************************************/
/*
 * Names of all the tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2039
2040/**
2041 * areBlanks:
2042 * @ctxt: an HTML parser context
2043 * @str: a xmlChar *
2044 * @len: the size of @str
2045 *
2046 * Is this a sequence of blank chars that one can ignore ?
2047 *
2048 * Returns 1 if ignorable 0 otherwise.
2049 */
2050
2051static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2052 unsigned int i;
2053 int j;
2054 xmlNodePtr lastChild;
2055 xmlDtdPtr dtd;
2056
2057 for (j = 0;j < len;j++)
2058 if (!(IS_BLANK_CH(str[j]))) return(0);
2059
2060 if (CUR == 0) return(1);
2061 if (CUR != '<') return(0);
2062 if (ctxt->name == NULL)
2063 return(1);
2064 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2065 return(1);
2066 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2067 return(1);
2068
2069 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2070 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2071 dtd = xmlGetIntSubset(ctxt->myDoc);
2072 if (dtd != NULL && dtd->ExternalID != NULL) {
2073 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2074 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2075 return(1);
2076 }
2077 }
2078
2079 if (ctxt->node == NULL) return(0);
2080 lastChild = xmlGetLastChild(ctxt->node);
2081 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2082 lastChild = lastChild->prev;
2083 if (lastChild == NULL) {
2084 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2085 (ctxt->node->content != NULL)) return(0);
2086 /* keep ws in constructs like ...<b> </b>...
2087 for all tags "b" allowing PCDATA */
2088 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2090 return(0);
2091 }
2092 }
2093 } else if (xmlNodeIsText(lastChild)) {
2094 return(0);
2095 } else {
2096 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2097 for all tags "p" allowing PCDATA */
2098 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2099 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2100 return(0);
2101 }
2102 }
2103 }
2104 return(1);
2105}
2106
2107/**
2108 * htmlNewDocNoDtD:
2109 * @URI: URI for the dtd, or NULL
2110 * @ExternalID: the external ID of the DTD, or NULL
2111 *
2112 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2113 * are NULL
2114 *
2115 * Returns a new document, do not initialize the DTD if not provided
2116 */
2117htmlDocPtr
2118htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2119 xmlDocPtr cur;
2120
2121 /*
2122 * Allocate a new document and fill the fields.
2123 */
2124 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2125 if (cur == NULL) {
2126 htmlErrMemory(NULL, "HTML document creation failed\n");
2127 return(NULL);
2128 }
2129 memset(cur, 0, sizeof(xmlDoc));
2130
2131 cur->type = XML_HTML_DOCUMENT_NODE;
2132 cur->version = NULL;
2133 cur->intSubset = NULL;
2134 cur->doc = cur;
2135 cur->name = NULL;
2136 cur->children = NULL;
2137 cur->extSubset = NULL;
2138 cur->oldNs = NULL;
2139 cur->encoding = NULL;
2140 cur->standalone = 1;
2141 cur->compression = 0;
2142 cur->ids = NULL;
2143 cur->refs = NULL;
2144 cur->_private = NULL;
2145 cur->charset = XML_CHAR_ENCODING_UTF8;
2146 if ((ExternalID != NULL) ||
2147 (URI != NULL))
2148 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2149 return(cur);
2150}
2151
2152/**
2153 * htmlNewDoc:
2154 * @URI: URI for the dtd, or NULL
2155 * @ExternalID: the external ID of the DTD, or NULL
2156 *
2157 * Creates a new HTML document
2158 *
2159 * Returns a new document
2160 */
2161htmlDocPtr
2162htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2163 if ((URI == NULL) && (ExternalID == NULL))
2164 return(htmlNewDocNoDtD(
2165 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2166 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2167
2168 return(htmlNewDocNoDtD(URI, ExternalID));
2169}
2170
2171
2172/************************************************************************
2173 * *
2174 * The parser itself *
2175 * Relates to http://www.w3.org/TR/html40 *
2176 * *
2177 ************************************************************************/
2178
2179/************************************************************************
2180 * *
2181 * The parser itself *
2182 * *
2183 ************************************************************************/
2184
2185static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2186
2187/**
2188 * htmlParseHTMLName:
2189 * @ctxt: an HTML parser context
2190 *
2191 * parse an HTML tag or attribute name, note that we convert it to lowercase
2192 * since HTML names are not case-sensitive.
2193 *
2194 * Returns the Tag Name parsed or NULL
2195 */
2196
2197static const xmlChar *
2198htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2199 int i = 0;
2200 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2201
2202 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2203 (CUR != ':')) return(NULL);
2204
2205 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2206 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2207 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2208 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2209 else loc[i] = CUR;
2210 i++;
2211
2212 NEXT;
2213 }
2214
2215 return(xmlDictLookup(ctxt->dict, loc, i));
2216}
2217
2218
2219/**
2220 * htmlParseHTMLName_nonInvasive:
2221 * @ctxt: an HTML parser context
2222 *
2223 * parse an HTML tag or attribute name, note that we convert it to lowercase
2224 * since HTML names are not case-sensitive, this doesn't consume the data
2225 * from the stream, it's a look-ahead
2226 *
2227 * Returns the Tag Name parsed or NULL
2228 */
2229
2230static const xmlChar *
2231htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2232 int i = 0;
2233 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2234
2235 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2236 (NXT(1) != ':')) return(NULL);
2237
2238 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2239 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2240 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2241 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2242 else loc[i] = NXT(1+i);
2243 i++;
2244 }
2245
2246 return(xmlDictLookup(ctxt->dict, loc, i));
2247}
2248
2249
2250/**
2251 * htmlParseName:
2252 * @ctxt: an HTML parser context
2253 *
2254 * parse an HTML name, this routine is case sensitive.
2255 *
2256 * Returns the Name parsed or NULL
2257 */
2258
2259static const xmlChar *
2260htmlParseName(htmlParserCtxtPtr ctxt) {
2261 const xmlChar *in;
2262 const xmlChar *ret;
2263 int count = 0;
2264
2265 GROW;
2266
2267 /*
2268 * Accelerator for simple ASCII names
2269 */
2270 in = ctxt->input->cur;
2271 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2272 ((*in >= 0x41) && (*in <= 0x5A)) ||
2273 (*in == '_') || (*in == ':')) {
2274 in++;
2275 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2276 ((*in >= 0x41) && (*in <= 0x5A)) ||
2277 ((*in >= 0x30) && (*in <= 0x39)) ||
2278 (*in == '_') || (*in == '-') ||
2279 (*in == ':') || (*in == '.'))
2280 in++;
2281 if ((*in > 0) && (*in < 0x80)) {
2282 count = in - ctxt->input->cur;
2283 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2284 ctxt->input->cur = in;
2285 ctxt->nbChars += count;
2286 ctxt->input->col += count;
2287 return(ret);
2288 }
2289 }
2290 return(htmlParseNameComplex(ctxt));
2291}
2292
2293static const xmlChar *
2294htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2295 int len = 0, l;
2296 int c;
2297 int count = 0;
2298
2299 /*
2300 * Handler for more complex cases
2301 */
2302 GROW;
2303 c = CUR_CHAR(l);
2304 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2305 (!IS_LETTER(c) && (c != '_') &&
2306 (c != ':'))) {
2307 return(NULL);
2308 }
2309
2310 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2311 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2312 (c == '.') || (c == '-') ||
2313 (c == '_') || (c == ':') ||
2314 (IS_COMBINING(c)) ||
2315 (IS_EXTENDER(c)))) {
2316 if (count++ > 100) {
2317 count = 0;
2318 GROW;
2319 }
2320 len += l;
2321 NEXTL(l);
2322 c = CUR_CHAR(l);
2323 }
2324 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2325}
2326
2327
2328/**
2329 * htmlParseHTMLAttribute:
2330 * @ctxt: an HTML parser context
2331 * @stop: a char stop value
2332 *
2333 * parse an HTML attribute value till the stop (quote), if
2334 * stop is 0 then it stops at the first space
2335 *
2336 * Returns the attribute parsed or NULL
2337 */
2338
2339static xmlChar *
2340htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2341 xmlChar *buffer = NULL;
2342 int buffer_size = 0;
2343 xmlChar *out = NULL;
2344 const xmlChar *name = NULL;
2345 const xmlChar *cur = NULL;
2346 const htmlEntityDesc * ent;
2347
2348 /*
2349 * allocate a translation buffer.
2350 */
2351 buffer_size = HTML_PARSER_BUFFER_SIZE;
2352 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2353 if (buffer == NULL) {
2354 htmlErrMemory(ctxt, "buffer allocation failed\n");
2355 return(NULL);
2356 }
2357 out = buffer;
2358
2359 /*
2360 * Ok loop until we reach one of the ending chars
2361 */
2362 while ((CUR != 0) && (CUR != stop)) {
2363 if ((stop == 0) && (CUR == '>')) break;
2364 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2365 if (CUR == '&') {
2366 if (NXT(1) == '#') {
2367 unsigned int c;
2368 int bits;
2369
2370 c = htmlParseCharRef(ctxt);
2371 if (c < 0x80)
2372 { *out++ = c; bits= -6; }
2373 else if (c < 0x800)
2374 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2375 else if (c < 0x10000)
2376 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2377 else
2378 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2379
2380 for ( ; bits >= 0; bits-= 6) {
2381 *out++ = ((c >> bits) & 0x3F) | 0x80;
2382 }
2383
2384 if (out - buffer > buffer_size - 100) {
2385 int indx = out - buffer;
2386
2387 growBuffer(buffer);
2388 out = &buffer[indx];
2389 }
2390 } else {
2391 ent = htmlParseEntityRef(ctxt, &name);
2392 if (name == NULL) {
2393 *out++ = '&';
2394 if (out - buffer > buffer_size - 100) {
2395 int indx = out - buffer;
2396
2397 growBuffer(buffer);
2398 out = &buffer[indx];
2399 }
2400 } else if (ent == NULL) {
2401 *out++ = '&';
2402 cur = name;
2403 while (*cur != 0) {
2404 if (out - buffer > buffer_size - 100) {
2405 int indx = out - buffer;
2406
2407 growBuffer(buffer);
2408 out = &buffer[indx];
2409 }
2410 *out++ = *cur++;
2411 }
2412 } else {
2413 unsigned int c;
2414 int bits;
2415
2416 if (out - buffer > buffer_size - 100) {
2417 int indx = out - buffer;
2418
2419 growBuffer(buffer);
2420 out = &buffer[indx];
2421 }
2422 c = ent->value;
2423 if (c < 0x80)
2424 { *out++ = c; bits= -6; }
2425 else if (c < 0x800)
2426 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2427 else if (c < 0x10000)
2428 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2429 else
2430 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2431
2432 for ( ; bits >= 0; bits-= 6) {
2433 *out++ = ((c >> bits) & 0x3F) | 0x80;
2434 }
2435 }
2436 }
2437 } else {
2438 unsigned int c;
2439 int bits, l;
2440
2441 if (out - buffer > buffer_size - 100) {
2442 int indx = out - buffer;
2443
2444 growBuffer(buffer);
2445 out = &buffer[indx];
2446 }
2447 c = CUR_CHAR(l);
2448 if (c < 0x80)
2449 { *out++ = c; bits= -6; }
2450 else if (c < 0x800)
2451 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2452 else if (c < 0x10000)
2453 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2454 else
2455 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2456
2457 for ( ; bits >= 0; bits-= 6) {
2458 *out++ = ((c >> bits) & 0x3F) | 0x80;
2459 }
2460 NEXT;
2461 }
2462 }
2463 *out++ = 0;
2464 return(buffer);
2465}
2466
2467/**
2468 * htmlParseEntityRef:
2469 * @ctxt: an HTML parser context
2470 * @str: location to store the entity name
2471 *
2472 * parse an HTML ENTITY references
2473 *
2474 * [68] EntityRef ::= '&' Name ';'
2475 *
2476 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2477 * if non-NULL *str will have to be freed by the caller.
2478 */
2479const htmlEntityDesc *
2480htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2481 const xmlChar *name;
2482 const htmlEntityDesc * ent = NULL;
2483
2484 if (str != NULL) *str = NULL;
2485 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2486
2487 if (CUR == '&') {
2488 NEXT;
2489 name = htmlParseName(ctxt);
2490 if (name == NULL) {
2491 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2492 "htmlParseEntityRef: no name\n", NULL, NULL);
2493 } else {
2494 GROW;
2495 if (CUR == ';') {
2496 if (str != NULL)
2497 *str = name;
2498
2499 /*
2500 * Lookup the entity in the table.
2501 */
2502 ent = htmlEntityLookup(name);
2503 if (ent != NULL) /* OK that's ugly !!! */
2504 NEXT;
2505 } else {
2506 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2507 "htmlParseEntityRef: expecting ';'\n",
2508 NULL, NULL);
2509 if (str != NULL)
2510 *str = name;
2511 }
2512 }
2513 }
2514 return(ent);
2515}
2516
2517/**
2518 * htmlParseAttValue:
2519 * @ctxt: an HTML parser context
2520 *
2521 * parse a value for an attribute
2522 * Note: the parser won't do substitution of entities here, this
2523 * will be handled later in xmlStringGetNodeList, unless it was
2524 * asked for ctxt->replaceEntities != 0
2525 *
2526 * Returns the AttValue parsed or NULL.
2527 */
2528
2529static xmlChar *
2530htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2531 xmlChar *ret = NULL;
2532
2533 if (CUR == '"') {
2534 NEXT;
2535 ret = htmlParseHTMLAttribute(ctxt, '"');
2536 if (CUR != '"') {
2537 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2538 "AttValue: \" expected\n", NULL, NULL);
2539 } else
2540 NEXT;
2541 } else if (CUR == '\'') {
2542 NEXT;
2543 ret = htmlParseHTMLAttribute(ctxt, '\'');
2544 if (CUR != '\'') {
2545 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2546 "AttValue: ' expected\n", NULL, NULL);
2547 } else
2548 NEXT;
2549 } else {
2550 /*
2551 * That's an HTMLism, the attribute value may not be quoted
2552 */
2553 ret = htmlParseHTMLAttribute(ctxt, 0);
2554 if (ret == NULL) {
2555 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2556 "AttValue: no value found\n", NULL, NULL);
2557 }
2558 }
2559 return(ret);
2560}
2561
2562/**
2563 * htmlParseSystemLiteral:
2564 * @ctxt: an HTML parser context
2565 *
2566 * parse an HTML Literal
2567 *
2568 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2569 *
2570 * Returns the SystemLiteral parsed or NULL
2571 */
2572
2573static xmlChar *
2574htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2575 const xmlChar *q;
2576 xmlChar *ret = NULL;
2577
2578 if (CUR == '"') {
2579 NEXT;
2580 q = CUR_PTR;
2581 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2582 NEXT;
2583 if (!IS_CHAR_CH(CUR)) {
2584 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2585 "Unfinished SystemLiteral\n", NULL, NULL);
2586 } else {
2587 ret = xmlStrndup(q, CUR_PTR - q);
2588 NEXT;
2589 }
2590 } else if (CUR == '\'') {
2591 NEXT;
2592 q = CUR_PTR;
2593 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2594 NEXT;
2595 if (!IS_CHAR_CH(CUR)) {
2596 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2597 "Unfinished SystemLiteral\n", NULL, NULL);
2598 } else {
2599 ret = xmlStrndup(q, CUR_PTR - q);
2600 NEXT;
2601 }
2602 } else {
2603 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2604 " or ' expected\n", NULL, NULL);
2605 }
2606
2607 return(ret);
2608}
2609
2610/**
2611 * htmlParsePubidLiteral:
2612 * @ctxt: an HTML parser context
2613 *
2614 * parse an HTML public literal
2615 *
2616 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2617 *
2618 * Returns the PubidLiteral parsed or NULL.
2619 */
2620
2621static xmlChar *
2622htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2623 const xmlChar *q;
2624 xmlChar *ret = NULL;
2625 /*
2626 * Name ::= (Letter | '_') (NameChar)*
2627 */
2628 if (CUR == '"') {
2629 NEXT;
2630 q = CUR_PTR;
2631 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2632 if (CUR != '"') {
2633 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2634 "Unfinished PubidLiteral\n", NULL, NULL);
2635 } else {
2636 ret = xmlStrndup(q, CUR_PTR - q);
2637 NEXT;
2638 }
2639 } else if (CUR == '\'') {
2640 NEXT;
2641 q = CUR_PTR;
2642 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2643 NEXT;
2644 if (CUR != '\'') {
2645 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2646 "Unfinished PubidLiteral\n", NULL, NULL);
2647 } else {
2648 ret = xmlStrndup(q, CUR_PTR - q);
2649 NEXT;
2650 }
2651 } else {
2652 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2653 "PubidLiteral \" or ' expected\n", NULL, NULL);
2654 }
2655
2656 return(ret);
2657}
2658
2659/**
2660 * htmlParseScript:
2661 * @ctxt: an HTML parser context
2662 *
2663 * parse the content of an HTML SCRIPT or STYLE element
2664 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2665 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2666 * http://www.w3.org/TR/html4/types.html#type-script
2667 * http://www.w3.org/TR/html4/types.html#h-6.15
2668 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2669 *
2670 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2671 * element and the value of intrinsic event attributes. User agents must
2672 * not evaluate script data as HTML markup but instead must pass it on as
2673 * data to a script engine.
2674 * NOTES:
2675 * - The content is passed like CDATA
2676 * - the attributes for style and scripting "onXXX" are also described
2677 * as CDATA but SGML allows entities references in attributes so their
2678 * processing is identical as other attributes
2679 */
2680static void
2681htmlParseScript(htmlParserCtxtPtr ctxt) {
2682 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2683 int nbchar = 0;
2684 int cur,l;
2685
2686 SHRINK;
2687 cur = CUR_CHAR(l);
2688 while (IS_CHAR_CH(cur)) {
2689 if ((cur == '<') && (NXT(1) == '/')) {
2690 /*
2691 * One should break here, the specification is clear:
2692 * Authors should therefore escape "</" within the content.
2693 * Escape mechanisms are specific to each scripting or
2694 * style sheet language.
2695 *
2696 * In recovery mode, only break if end tag match the
2697 * current tag, effectively ignoring all tags inside the
2698 * script/style block and treating the entire block as
2699 * CDATA.
2700 */
2701 if (ctxt->recovery) {
2702 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2703 xmlStrlen(ctxt->name)) == 0)
2704 {
2705 break; /* while */
2706 } else {
2707 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2708 "Element %s embeds close tag\n",
2709 ctxt->name, NULL);
2710 }
2711 } else {
2712 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2713 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2714 {
2715 break; /* while */
2716 }
2717 }
2718 }
2719 COPY_BUF(l,buf,nbchar,cur);
2720 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2721 if (ctxt->sax->cdataBlock!= NULL) {
2722 /*
2723 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2724 */
2725 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2726 } else if (ctxt->sax->characters != NULL) {
2727 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2728 }
2729 nbchar = 0;
2730 }
2731 GROW;
2732 NEXTL(l);
2733 cur = CUR_CHAR(l);
2734 }
2735
2736 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2737 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2738 "Invalid char in CDATA 0x%X\n", cur);
2739 NEXT;
2740 }
2741
2742 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2743 if (ctxt->sax->cdataBlock!= NULL) {
2744 /*
2745 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2746 */
2747 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2748 } else if (ctxt->sax->characters != NULL) {
2749 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2750 }
2751 }
2752}
2753
2754
2755/**
2756 * htmlParseCharData:
2757 * @ctxt: an HTML parser context
2758 *
2759 * parse a CharData section.
2760 * if we are within a CDATA section ']]>' marks an end of section.
2761 *
2762 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2763 */
2764
2765static void
2766htmlParseCharData(htmlParserCtxtPtr ctxt) {
2767 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2768 int nbchar = 0;
2769 int cur, l;
2770
2771 SHRINK;
2772 cur = CUR_CHAR(l);
2773 while (((cur != '<') || (ctxt->token == '<')) &&
2774 ((cur != '&') || (ctxt->token == '&')) &&
2775 (cur != 0)) {
2776 if (!(IS_CHAR(cur))) {
2777 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2778 "Invalid char in CDATA 0x%X\n", cur);
2779 } else {
2780 COPY_BUF(l,buf,nbchar,cur);
2781 }
2782 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2783 /*
2784 * Ok the segment is to be consumed as chars.
2785 */
2786 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2787 if (areBlanks(ctxt, buf, nbchar)) {
2788 if (ctxt->sax->ignorableWhitespace != NULL)
2789 ctxt->sax->ignorableWhitespace(ctxt->userData,
2790 buf, nbchar);
2791 } else {
2792 htmlCheckParagraph(ctxt);
2793 if (ctxt->sax->characters != NULL)
2794 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2795 }
2796 }
2797 nbchar = 0;
2798 }
2799 NEXTL(l);
2800 cur = CUR_CHAR(l);
2801 if (cur == 0) {
2802 SHRINK;
2803 GROW;
2804 cur = CUR_CHAR(l);
2805 }
2806 }
2807 if (nbchar != 0) {
2808 buf[nbchar] = 0;
2809
2810 /*
2811 * Ok the segment is to be consumed as chars.
2812 */
2813 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2814 if (areBlanks(ctxt, buf, nbchar)) {
2815 if (ctxt->sax->ignorableWhitespace != NULL)
2816 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2817 } else {
2818 htmlCheckParagraph(ctxt);
2819 if (ctxt->sax->characters != NULL)
2820 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2821 }
2822 }
2823 } else {
2824 /*
2825 * Loop detection
2826 */
2827 if (cur == 0)
2828 ctxt->instate = XML_PARSER_EOF;
2829 }
2830}
2831
2832/**
2833 * htmlParseExternalID:
2834 * @ctxt: an HTML parser context
2835 * @publicID: a xmlChar** receiving PubidLiteral
2836 *
2837 * Parse an External ID or a Public ID
2838 *
2839 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2840 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2841 *
2842 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2843 *
2844 * Returns the function returns SystemLiteral and in the second
2845 * case publicID receives PubidLiteral, is strict is off
2846 * it is possible to return NULL and have publicID set.
2847 */
2848
2849static xmlChar *
2850htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2851 xmlChar *URI = NULL;
2852
2853 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2854 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2855 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2856 SKIP(6);
2857 if (!IS_BLANK_CH(CUR)) {
2858 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2859 "Space required after 'SYSTEM'\n", NULL, NULL);
2860 }
2861 SKIP_BLANKS;
2862 URI = htmlParseSystemLiteral(ctxt);
2863 if (URI == NULL) {
2864 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2865 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2866 }
2867 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2868 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2869 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2870 SKIP(6);
2871 if (!IS_BLANK_CH(CUR)) {
2872 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2873 "Space required after 'PUBLIC'\n", NULL, NULL);
2874 }
2875 SKIP_BLANKS;
2876 *publicID = htmlParsePubidLiteral(ctxt);
2877 if (*publicID == NULL) {
2878 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2879 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2880 NULL, NULL);
2881 }
2882 SKIP_BLANKS;
2883 if ((CUR == '"') || (CUR == '\'')) {
2884 URI = htmlParseSystemLiteral(ctxt);
2885 }
2886 }
2887 return(URI);
2888}
2889
2890/**
2891 * xmlParsePI:
2892 * @ctxt: an XML parser context
2893 *
2894 * parse an XML Processing Instruction.
2895 *
2896 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2897 */
2898static void
2899htmlParsePI(htmlParserCtxtPtr ctxt) {
2900 xmlChar *buf = NULL;
2901 int len = 0;
2902 int size = HTML_PARSER_BUFFER_SIZE;
2903 int cur, l;
2904 const xmlChar *target;
2905 xmlParserInputState state;
2906 int count = 0;
2907
2908 if ((RAW == '<') && (NXT(1) == '?')) {
2909 state = ctxt->instate;
2910 ctxt->instate = XML_PARSER_PI;
2911 /*
2912 * this is a Processing Instruction.
2913 */
2914 SKIP(2);
2915 SHRINK;
2916
2917 /*
2918 * Parse the target name and check for special support like
2919 * namespace.
2920 */
2921 target = htmlParseName(ctxt);
2922 if (target != NULL) {
2923 if (RAW == '>') {
2924 SKIP(1);
2925
2926 /*
2927 * SAX: PI detected.
2928 */
2929 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2930 (ctxt->sax->processingInstruction != NULL))
2931 ctxt->sax->processingInstruction(ctxt->userData,
2932 target, NULL);
2933 ctxt->instate = state;
2934 return;
2935 }
2936 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2937 if (buf == NULL) {
2938 htmlErrMemory(ctxt, NULL);
2939 ctxt->instate = state;
2940 return;
2941 }
2942 cur = CUR;
2943 if (!IS_BLANK(cur)) {
2944 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2945 "ParsePI: PI %s space expected\n", target, NULL);
2946 }
2947 SKIP_BLANKS;
2948 cur = CUR_CHAR(l);
2949 while (IS_CHAR(cur) && (cur != '>')) {
2950 if (len + 5 >= size) {
2951 xmlChar *tmp;
2952
2953 size *= 2;
2954 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2955 if (tmp == NULL) {
2956 htmlErrMemory(ctxt, NULL);
2957 xmlFree(buf);
2958 ctxt->instate = state;
2959 return;
2960 }
2961 buf = tmp;
2962 }
2963 count++;
2964 if (count > 50) {
2965 GROW;
2966 count = 0;
2967 }
2968 COPY_BUF(l,buf,len,cur);
2969 NEXTL(l);
2970 cur = CUR_CHAR(l);
2971 if (cur == 0) {
2972 SHRINK;
2973 GROW;
2974 cur = CUR_CHAR(l);
2975 }
2976 }
2977 buf[len] = 0;
2978 if (cur != '>') {
2979 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2980 "ParsePI: PI %s never end ...\n", target, NULL);
2981 } else {
2982 SKIP(1);
2983
2984 /*
2985 * SAX: PI detected.
2986 */
2987 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2988 (ctxt->sax->processingInstruction != NULL))
2989 ctxt->sax->processingInstruction(ctxt->userData,
2990 target, buf);
2991 }
2992 xmlFree(buf);
2993 } else {
2994 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2995 "PI is not started correctly", NULL, NULL);
2996 }
2997 ctxt->instate = state;
2998 }
2999}
3000
3001/**
3002 * htmlParseComment:
3003 * @ctxt: an HTML parser context
3004 *
3005 * Parse an XML (SGML) comment <!-- .... -->
3006 *
3007 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3008 */
3009static void
3010htmlParseComment(htmlParserCtxtPtr ctxt) {
3011 xmlChar *buf = NULL;
3012 int len;
3013 int size = HTML_PARSER_BUFFER_SIZE;
3014 int q, ql;
3015 int r, rl;
3016 int cur, l;
3017 xmlParserInputState state;
3018
3019 /*
3020 * Check that there is a comment right here.
3021 */
3022 if ((RAW != '<') || (NXT(1) != '!') ||
3023 (NXT(2) != '-') || (NXT(3) != '-')) return;
3024
3025 state = ctxt->instate;
3026 ctxt->instate = XML_PARSER_COMMENT;
3027 SHRINK;
3028 SKIP(4);
3029 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3030 if (buf == NULL) {
3031 htmlErrMemory(ctxt, "buffer allocation failed\n");
3032 ctxt->instate = state;
3033 return;
3034 }
3035 q = CUR_CHAR(ql);
3036 NEXTL(ql);
3037 r = CUR_CHAR(rl);
3038 NEXTL(rl);
3039 cur = CUR_CHAR(l);
3040 len = 0;
3041 while (IS_CHAR(cur) &&
3042 ((cur != '>') ||
3043 (r != '-') || (q != '-'))) {
3044 if (len + 5 >= size) {
3045 xmlChar *tmp;
3046
3047 size *= 2;
3048 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3049 if (tmp == NULL) {
3050 xmlFree(buf);
3051 htmlErrMemory(ctxt, "growing buffer failed\n");
3052 ctxt->instate = state;
3053 return;
3054 }
3055 buf = tmp;
3056 }
3057 COPY_BUF(ql,buf,len,q);
3058 q = r;
3059 ql = rl;
3060 r = cur;
3061 rl = l;
3062 NEXTL(l);
3063 cur = CUR_CHAR(l);
3064 if (cur == 0) {
3065 SHRINK;
3066 GROW;
3067 cur = CUR_CHAR(l);
3068 }
3069 }
3070 buf[len] = 0;
3071 if (!IS_CHAR(cur)) {
3072 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3073 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3074 xmlFree(buf);
3075 } else {
3076 NEXT;
3077 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3078 (!ctxt->disableSAX))
3079 ctxt->sax->comment(ctxt->userData, buf);
3080 xmlFree(buf);
3081 }
3082 ctxt->instate = state;
3083}
3084
3085/**
3086 * htmlParseCharRef:
3087 * @ctxt: an HTML parser context
3088 *
3089 * parse Reference declarations
3090 *
3091 * [66] CharRef ::= '&#' [0-9]+ ';' |
3092 * '&#x' [0-9a-fA-F]+ ';'
3093 *
3094 * Returns the value parsed (as an int)
3095 */
3096int
3097htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3098 int val = 0;
3099
3100 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3101 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3102 "htmlParseCharRef: context error\n",
3103 NULL, NULL);
3104 return(0);
3105 }
3106 if ((CUR == '&') && (NXT(1) == '#') &&
3107 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3108 SKIP(3);
3109 while (CUR != ';') {
3110 if ((CUR >= '0') && (CUR <= '9'))
3111 val = val * 16 + (CUR - '0');
3112 else if ((CUR >= 'a') && (CUR <= 'f'))
3113 val = val * 16 + (CUR - 'a') + 10;
3114 else if ((CUR >= 'A') && (CUR <= 'F'))
3115 val = val * 16 + (CUR - 'A') + 10;
3116 else {
3117 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3118 "htmlParseCharRef: invalid hexadecimal value\n",
3119 NULL, NULL);
3120 return(0);
3121 }
3122 NEXT;
3123 }
3124 if (CUR == ';')
3125 NEXT;
3126 } else if ((CUR == '&') && (NXT(1) == '#')) {
3127 SKIP(2);
3128 while (CUR != ';') {
3129 if ((CUR >= '0') && (CUR <= '9'))
3130 val = val * 10 + (CUR - '0');
3131 else {
3132 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3133 "htmlParseCharRef: invalid decimal value\n",
3134 NULL, NULL);
3135 return(0);
3136 }
3137 NEXT;
3138 }
3139 if (CUR == ';')
3140 NEXT;
3141 } else {
3142 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3143 "htmlParseCharRef: invalid value\n", NULL, NULL);
3144 }
3145 /*
3146 * Check the value IS_CHAR ...
3147 */
3148 if (IS_CHAR(val)) {
3149 return(val);
3150 } else {
3151 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3152 "htmlParseCharRef: invalid xmlChar value %d\n",
3153 val);
3154 }
3155 return(0);
3156}
3157
3158
3159/**
3160 * htmlParseDocTypeDecl:
3161 * @ctxt: an HTML parser context
3162 *
3163 * parse a DOCTYPE declaration
3164 *
3165 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3166 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3167 */
3168
3169static void
3170htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3171 const xmlChar *name;
3172 xmlChar *ExternalID = NULL;
3173 xmlChar *URI = NULL;
3174
3175 /*
3176 * We know that '<!DOCTYPE' has been detected.
3177 */
3178 SKIP(9);
3179
3180 SKIP_BLANKS;
3181
3182 /*
3183 * Parse the DOCTYPE name.
3184 */
3185 name = htmlParseName(ctxt);
3186 if (name == NULL) {
3187 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3188 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3189 NULL, NULL);
3190 }
3191 /*
3192 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3193 */
3194
3195 SKIP_BLANKS;
3196
3197 /*
3198 * Check for SystemID and ExternalID
3199 */
3200 URI = htmlParseExternalID(ctxt, &ExternalID);
3201 SKIP_BLANKS;
3202
3203 /*
3204 * We should be at the end of the DOCTYPE declaration.
3205 */
3206 if (CUR != '>') {
3207 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3208 "DOCTYPE improperly terminated\n", NULL, NULL);
3209 /* We shouldn't try to resynchronize ... */
3210 }
3211 NEXT;
3212
3213 /*
3214 * Create or update the document accordingly to the DOCTYPE
3215 */
3216 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3217 (!ctxt->disableSAX))
3218 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3219
3220 /*
3221 * Cleanup, since we don't use all those identifiers
3222 */
3223 if (URI != NULL) xmlFree(URI);
3224 if (ExternalID != NULL) xmlFree(ExternalID);
3225}
3226
3227/**
3228 * htmlParseAttribute:
3229 * @ctxt: an HTML parser context
3230 * @value: a xmlChar ** used to store the value of the attribute
3231 *
3232 * parse an attribute
3233 *
3234 * [41] Attribute ::= Name Eq AttValue
3235 *
3236 * [25] Eq ::= S? '=' S?
3237 *
3238 * With namespace:
3239 *
3240 * [NS 11] Attribute ::= QName Eq AttValue
3241 *
3242 * Also the case QName == xmlns:??? is handled independently as a namespace
3243 * definition.
3244 *
3245 * Returns the attribute name, and the value in *value.
3246 */
3247
3248static const xmlChar *
3249htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3250 const xmlChar *name;
3251 xmlChar *val = NULL;
3252
3253 *value = NULL;
3254 name = htmlParseHTMLName(ctxt);
3255 if (name == NULL) {
3256 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3257 "error parsing attribute name\n", NULL, NULL);
3258 return(NULL);
3259 }
3260
3261 /*
3262 * read the value
3263 */
3264 SKIP_BLANKS;
3265 if (CUR == '=') {
3266 NEXT;
3267 SKIP_BLANKS;
3268 val = htmlParseAttValue(ctxt);
3269 } else if (htmlIsBooleanAttr(name)) {
3270 /*
3271 * assume a minimized attribute
3272 */
3273 val = xmlStrdup(name);
3274 }
3275
3276 *value = val;
3277 return(name);
3278}
3279
3280/**
3281 * htmlCheckEncoding:
3282 * @ctxt: an HTML parser context
3283 * @attvalue: the attribute value
3284 *
3285 * Checks an http-equiv attribute from a Meta tag to detect
3286 * the encoding
3287 * If a new encoding is detected the parser is switched to decode
3288 * it and pass UTF8
3289 */
3290static void
3291htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3292 const xmlChar *encoding;
3293
3294 if ((ctxt == NULL) || (attvalue == NULL))
3295 return;
3296
3297 /* do not change encoding */
3298 if (ctxt->input->encoding != NULL)
3299 return;
3300
3301 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3302 if (encoding != NULL) {
3303 encoding += 8;
3304 } else {
3305 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3306 if (encoding != NULL)
3307 encoding += 9;
3308 }
3309 if (encoding != NULL) {
3310 xmlCharEncoding enc;
3311 xmlCharEncodingHandlerPtr handler;
3312
3313 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3314
3315 if (ctxt->input->encoding != NULL)
3316 xmlFree((xmlChar *) ctxt->input->encoding);
3317 ctxt->input->encoding = xmlStrdup(encoding);
3318
3319 enc = xmlParseCharEncoding((const char *) encoding);
3320 /*
3321 * registered set of known encodings
3322 */
3323 if (enc != XML_CHAR_ENCODING_ERROR) {
3324 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3325 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3326 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3327 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3328 (ctxt->input->buf != NULL) &&
3329 (ctxt->input->buf->encoder == NULL)) {
3330 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3331 "htmlCheckEncoding: wrong encoding meta\n",
3332 NULL, NULL);
3333 } else {
3334 xmlSwitchEncoding(ctxt, enc);
3335 }
3336 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3337 } else {
3338 /*
3339 * fallback for unknown encodings
3340 */
3341 handler = xmlFindCharEncodingHandler((const char *) encoding);
3342 if (handler != NULL) {
3343 xmlSwitchToEncoding(ctxt, handler);
3344 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3345 } else {
3346 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3347 }
3348 }
3349
3350 if ((ctxt->input->buf != NULL) &&
3351 (ctxt->input->buf->encoder != NULL) &&
3352 (ctxt->input->buf->raw != NULL) &&
3353 (ctxt->input->buf->buffer != NULL)) {
3354 int nbchars;
3355 int processed;
3356
3357 /*
3358 * convert as much as possible to the parser reading buffer.
3359 */
3360 processed = ctxt->input->cur - ctxt->input->base;
3361 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3362 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3363 ctxt->input->buf->buffer,
3364 ctxt->input->buf->raw);
3365 if (nbchars < 0) {
3366 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3367 "htmlCheckEncoding: encoder error\n",
3368 NULL, NULL);
3369 }
3370 ctxt->input->base =
3371 ctxt->input->cur = ctxt->input->buf->buffer->content;
3372 }
3373 }
3374}
3375
3376/**
3377 * htmlCheckMeta:
3378 * @ctxt: an HTML parser context
3379 * @atts: the attributes values
3380 *
3381 * Checks an attributes from a Meta tag
3382 */
3383static void
3384htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3385 int i;
3386 const xmlChar *att, *value;
3387 int http = 0;
3388 const xmlChar *content = NULL;
3389
3390 if ((ctxt == NULL) || (atts == NULL))
3391 return;
3392
3393 i = 0;
3394 att = atts[i++];
3395 while (att != NULL) {
3396 value = atts[i++];
3397 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3398 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3399 http = 1;
3400 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3401 content = value;
3402 att = atts[i++];
3403 }
3404 if ((http) && (content != NULL))
3405 htmlCheckEncoding(ctxt, content);
3406
3407}
3408
3409/**
3410 * htmlParseStartTag:
3411 * @ctxt: an HTML parser context
3412 *
3413 * parse a start of tag either for rule element or
3414 * EmptyElement. In both case we don't parse the tag closing chars.
3415 *
3416 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3417 *
3418 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3419 *
3420 * With namespace:
3421 *
3422 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3423 *
3424 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3425 *
3426 * Returns 0 in case of success and -1 in case of error.
3427 */
3428
3429static int
3430htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3431 const xmlChar *name;
3432 const xmlChar *attname;
3433 xmlChar *attvalue;
3434 const xmlChar **atts;
3435 int nbatts = 0;
3436 int maxatts;
3437 int meta = 0;
3438 int i;
3439
3440 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3441 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3442 "htmlParseStartTag: context error\n", NULL, NULL);
3443 return -1;
3444 }
3445 if (CUR != '<') return -1;
3446 NEXT;
3447
3448 atts = ctxt->atts;
3449 maxatts = ctxt->maxatts;
3450
3451 GROW;
3452 name = htmlParseHTMLName(ctxt);
3453 if (name == NULL) {
3454 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3455 "htmlParseStartTag: invalid element name\n",
3456 NULL, NULL);
3457 /* Dump the bogus tag like browsers do */
3458 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3459 NEXT;
3460 return -1;
3461 }
3462 if (xmlStrEqual(name, BAD_CAST"meta"))
3463 meta = 1;
3464
3465 /*
3466 * Check for auto-closure of HTML elements.
3467 */
3468 htmlAutoClose(ctxt, name);
3469
3470 /*
3471 * Check for implied HTML elements.
3472 */
3473 htmlCheckImplied(ctxt, name);
3474
3475 /*
3476 * Avoid html at any level > 0, head at any level != 1
3477 * or any attempt to recurse body
3478 */
3479 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3480 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3481 "htmlParseStartTag: misplaced <html> tag\n",
3482 name, NULL);
3483 return 0;
3484 }
3485 if ((ctxt->nameNr != 1) &&
3486 (xmlStrEqual(name, BAD_CAST"head"))) {
3487 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3488 "htmlParseStartTag: misplaced <head> tag\n",
3489 name, NULL);
3490 return 0;
3491 }
3492 if (xmlStrEqual(name, BAD_CAST"body")) {
3493 int indx;
3494 for (indx = 0;indx < ctxt->nameNr;indx++) {
3495 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3496 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3497 "htmlParseStartTag: misplaced <body> tag\n",
3498 name, NULL);
3499 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3500 NEXT;
3501 return 0;
3502 }
3503 }
3504 }
3505
3506 /*
3507 * Now parse the attributes, it ends up with the ending
3508 *
3509 * (S Attribute)* S?
3510 */
3511 SKIP_BLANKS;
3512 while ((IS_CHAR_CH(CUR)) &&
3513 (CUR != '>') &&
3514 ((CUR != '/') || (NXT(1) != '>'))) {
3515 long cons = ctxt->nbChars;
3516
3517 GROW;
3518 attname = htmlParseAttribute(ctxt, &attvalue);
3519 if (attname != NULL) {
3520
3521 /*
3522 * Well formedness requires at most one declaration of an attribute
3523 */
3524 for (i = 0; i < nbatts;i += 2) {
3525 if (xmlStrEqual(atts[i], attname)) {
3526 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3527 "Attribute %s redefined\n", attname, NULL);
3528 if (attvalue != NULL)
3529 xmlFree(attvalue);
3530 goto failed;
3531 }
3532 }
3533
3534 /*
3535 * Add the pair to atts
3536 */
3537 if (atts == NULL) {
3538 maxatts = 22; /* allow for 10 attrs by default */
3539 atts = (const xmlChar **)
3540 xmlMalloc(maxatts * sizeof(xmlChar *));
3541 if (atts == NULL) {
3542 htmlErrMemory(ctxt, NULL);
3543 if (attvalue != NULL)
3544 xmlFree(attvalue);
3545 goto failed;
3546 }
3547 ctxt->atts = atts;
3548 ctxt->maxatts = maxatts;
3549 } else if (nbatts + 4 > maxatts) {
3550 const xmlChar **n;
3551
3552 maxatts *= 2;
3553 n = (const xmlChar **) xmlRealloc((void *) atts,
3554 maxatts * sizeof(const xmlChar *));
3555 if (n == NULL) {
3556 htmlErrMemory(ctxt, NULL);
3557 if (attvalue != NULL)
3558 xmlFree(attvalue);
3559 goto failed;
3560 }
3561 atts = n;
3562 ctxt->atts = atts;
3563 ctxt->maxatts = maxatts;
3564 }
3565 atts[nbatts++] = attname;
3566 atts[nbatts++] = attvalue;
3567 atts[nbatts] = NULL;
3568 atts[nbatts + 1] = NULL;
3569 }
3570 else {
3571 if (attvalue != NULL)
3572 xmlFree(attvalue);
3573 /* Dump the bogus attribute string up to the next blank or
3574 * the end of the tag. */
3575 while ((IS_CHAR_CH(CUR)) &&
3576 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3577 ((CUR != '/') || (NXT(1) != '>')))
3578 NEXT;
3579 }
3580
3581failed:
3582 SKIP_BLANKS;
3583 if (cons == ctxt->nbChars) {
3584 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3585 "htmlParseStartTag: problem parsing attributes\n",
3586 NULL, NULL);
3587 break;
3588 }
3589 }
3590
3591 /*
3592 * Handle specific association to the META tag
3593 */
3594 if (meta && (nbatts != 0))
3595 htmlCheckMeta(ctxt, atts);
3596
3597 /*
3598 * SAX: Start of Element !
3599 */
3600 htmlnamePush(ctxt, name);
3601 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3602 if (nbatts != 0)
3603 ctxt->sax->startElement(ctxt->userData, name, atts);
3604 else
3605 ctxt->sax->startElement(ctxt->userData, name, NULL);
3606 }
3607
3608 if (atts != NULL) {
3609 for (i = 1;i < nbatts;i += 2) {
3610 if (atts[i] != NULL)
3611 xmlFree((xmlChar *) atts[i]);
3612 }
3613 }
3614
3615 return 0;
3616}
3617
3618/**
3619 * htmlParseEndTag:
3620 * @ctxt: an HTML parser context
3621 *
3622 * parse an end of tag
3623 *
3624 * [42] ETag ::= '</' Name S? '>'
3625 *
3626 * With namespace
3627 *
3628 * [NS 9] ETag ::= '</' QName S? '>'
3629 *
3630 * Returns 1 if the current level should be closed.
3631 */
3632
3633static int
3634htmlParseEndTag(htmlParserCtxtPtr ctxt)
3635{
3636 const xmlChar *name;
3637 const xmlChar *oldname;
3638 int i, ret;
3639
3640 if ((CUR != '<') || (NXT(1) != '/')) {
3641 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3642 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3643 return (0);
3644 }
3645 SKIP(2);
3646
3647 name = htmlParseHTMLName(ctxt);
3648 if (name == NULL)
3649 return (0);
3650
3651 /*
3652 * We should definitely be at the ending "S? '>'" part
3653 */
3654 SKIP_BLANKS;
3655 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3656 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3657 "End tag : expected '>'\n", NULL, NULL);
3658 if (ctxt->recovery) {
3659 /*
3660 * We're not at the ending > !!
3661 * Error, unless in recover mode where we search forwards
3662 * until we find a >
3663 */
3664 while (CUR != '\0' && CUR != '>') NEXT;
3665 NEXT;
3666 }
3667 } else
3668 NEXT;
3669
3670 /*
3671 * If the name read is not one of the element in the parsing stack
3672 * then return, it's just an error.
3673 */
3674 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3675 if (xmlStrEqual(name, ctxt->nameTab[i]))
3676 break;
3677 }
3678 if (i < 0) {
3679 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3680 "Unexpected end tag : %s\n", name, NULL);
3681 return (0);
3682 }
3683
3684
3685 /*
3686 * Check for auto-closure of HTML elements.
3687 */
3688
3689 htmlAutoCloseOnClose(ctxt, name);
3690
3691 /*
3692 * Well formedness constraints, opening and closing must match.
3693 * With the exception that the autoclose may have popped stuff out
3694 * of the stack.
3695 */
3696 if (!xmlStrEqual(name, ctxt->name)) {
3697 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3698 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3699 "Opening and ending tag mismatch: %s and %s\n",
3700 name, ctxt->name);
3701 }
3702 }
3703
3704 /*
3705 * SAX: End of Tag
3706 */
3707 oldname = ctxt->name;
3708 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3709 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3710 ctxt->sax->endElement(ctxt->userData, name);
3711 htmlnamePop(ctxt);
3712 ret = 1;
3713 } else {
3714 ret = 0;
3715 }
3716
3717 return (ret);
3718}
3719
3720
3721/**
3722 * htmlParseReference:
3723 * @ctxt: an HTML parser context
3724 *
3725 * parse and handle entity references in content,
3726 * this will end-up in a call to character() since this is either a
3727 * CharRef, or a predefined entity.
3728 */
3729static void
3730htmlParseReference(htmlParserCtxtPtr ctxt) {
3731 const htmlEntityDesc * ent;
3732 xmlChar out[6];
3733 const xmlChar *name;
3734 if (CUR != '&') return;
3735
3736 if (NXT(1) == '#') {
3737 unsigned int c;
3738 int bits, i = 0;
3739
3740 c = htmlParseCharRef(ctxt);
3741 if (c == 0)
3742 return;
3743
3744 if (c < 0x80) { out[i++]= c; bits= -6; }
3745 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3746 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3747 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3748
3749 for ( ; bits >= 0; bits-= 6) {
3750 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3751 }
3752 out[i] = 0;
3753
3754 htmlCheckParagraph(ctxt);
3755 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3756 ctxt->sax->characters(ctxt->userData, out, i);
3757 } else {
3758 ent = htmlParseEntityRef(ctxt, &name);
3759 if (name == NULL) {
3760 htmlCheckParagraph(ctxt);
3761 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3762 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3763 return;
3764 }
3765 if ((ent == NULL) || !(ent->value > 0)) {
3766 htmlCheckParagraph(ctxt);
3767 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3768 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3769 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3770 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3771 }
3772 } else {
3773 unsigned int c;
3774 int bits, i = 0;
3775
3776 c = ent->value;
3777 if (c < 0x80)
3778 { out[i++]= c; bits= -6; }
3779 else if (c < 0x800)
3780 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3781 else if (c < 0x10000)
3782 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3783 else
3784 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3785
3786 for ( ; bits >= 0; bits-= 6) {
3787 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3788 }
3789 out[i] = 0;
3790
3791 htmlCheckParagraph(ctxt);
3792 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3793 ctxt->sax->characters(ctxt->userData, out, i);
3794 }
3795 }
3796}
3797
3798/**
3799 * htmlParseContent:
3800 * @ctxt: an HTML parser context
3801 *
3802 * Parse a content: comment, sub-element, reference or text.
3803 */
3804
3805static void
3806htmlParseContent(htmlParserCtxtPtr ctxt) {
3807 xmlChar *currentNode;
3808 int depth;
3809 const xmlChar *name;
3810
3811 currentNode = xmlStrdup(ctxt->name);
3812 depth = ctxt->nameNr;
3813 while (1) {
3814 long cons = ctxt->nbChars;
3815
3816 GROW;
3817 /*
3818 * Our tag or one of it's parent or children is ending.
3819 */
3820 if ((CUR == '<') && (NXT(1) == '/')) {
3821 if (htmlParseEndTag(ctxt) &&
3822 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3823 if (currentNode != NULL)
3824 xmlFree(currentNode);
3825 return;
3826 }
3827 continue; /* while */
3828 }
3829
3830 else if ((CUR == '<') &&
3831 ((IS_ASCII_LETTER(NXT(1))) ||
3832 (NXT(1) == '_') || (NXT(1) == ':'))) {
3833 name = htmlParseHTMLName_nonInvasive(ctxt);
3834 if (name == NULL) {
3835 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3836 "htmlParseStartTag: invalid element name\n",
3837 NULL, NULL);
3838 /* Dump the bogus tag like browsers do */
3839 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3840 NEXT;
3841
3842 if (currentNode != NULL)
3843 xmlFree(currentNode);
3844 return;
3845 }
3846
3847 if (ctxt->name != NULL) {
3848 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3849 htmlAutoClose(ctxt, name);
3850 continue;
3851 }
3852 }
3853 }
3854
3855 /*
3856 * Has this node been popped out during parsing of
3857 * the next element
3858 */
3859 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3860 (!xmlStrEqual(currentNode, ctxt->name)))
3861 {
3862 if (currentNode != NULL) xmlFree(currentNode);
3863 return;
3864 }
3865
3866 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3867 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3868 /*
3869 * Handle SCRIPT/STYLE separately
3870 */
3871 htmlParseScript(ctxt);
3872 } else {
3873 /*
3874 * Sometimes DOCTYPE arrives in the middle of the document
3875 */
3876 if ((CUR == '<') && (NXT(1) == '!') &&
3877 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3878 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3879 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3880 (UPP(8) == 'E')) {
3881 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3882 "Misplaced DOCTYPE declaration\n",
3883 BAD_CAST "DOCTYPE" , NULL);
3884 htmlParseDocTypeDecl(ctxt);
3885 }
3886
3887 /*
3888 * First case : a comment
3889 */
3890 if ((CUR == '<') && (NXT(1) == '!') &&
3891 (NXT(2) == '-') && (NXT(3) == '-')) {
3892 htmlParseComment(ctxt);
3893 }
3894
3895 /*
3896 * Second case : a Processing Instruction.
3897 */
3898 else if ((CUR == '<') && (NXT(1) == '?')) {
3899 htmlParsePI(ctxt);
3900 }
3901
3902 /*
3903 * Third case : a sub-element.
3904 */
3905 else if (CUR == '<') {
3906 htmlParseElement(ctxt);
3907 }
3908
3909 /*
3910 * Fourth case : a reference. If if has not been resolved,
3911 * parsing returns it's Name, create the node
3912 */
3913 else if (CUR == '&') {
3914 htmlParseReference(ctxt);
3915 }
3916
3917 /*
3918 * Fifth case : end of the resource
3919 */
3920 else if (CUR == 0) {
3921 htmlAutoCloseOnEnd(ctxt);
3922 break;
3923 }
3924
3925 /*
3926 * Last case, text. Note that References are handled directly.
3927 */
3928 else {
3929 htmlParseCharData(ctxt);
3930 }
3931
3932 if (cons == ctxt->nbChars) {
3933 if (ctxt->node != NULL) {
3934 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3935 "detected an error in element content\n",
3936 NULL, NULL);
3937 }
3938 break;
3939 }
3940 }
3941 GROW;
3942 }
3943 if (currentNode != NULL) xmlFree(currentNode);
3944}
3945
3946/**
3947 * htmlParseContent:
3948 * @ctxt: an HTML parser context
3949 *
3950 * Parse a content: comment, sub-element, reference or text.
3951 */
3952
3953void
3954__htmlParseContent(void *ctxt) {
3955 if (ctxt != NULL)
3956 htmlParseContent((htmlParserCtxtPtr) ctxt);
3957}
3958
3959/**
3960 * htmlParseElement:
3961 * @ctxt: an HTML parser context
3962 *
3963 * parse an HTML element, this is highly recursive
3964 *
3965 * [39] element ::= EmptyElemTag | STag content ETag
3966 *
3967 * [41] Attribute ::= Name Eq AttValue
3968 */
3969
3970void
3971htmlParseElement(htmlParserCtxtPtr ctxt) {
3972 const xmlChar *name;
3973 xmlChar *currentNode = NULL;
3974 const htmlElemDesc * info;
3975 htmlParserNodeInfo node_info;
3976 int failed;
3977 int depth;
3978 const xmlChar *oldptr;
3979
3980 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3981 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3982 "htmlParseElement: context error\n", NULL, NULL);
3983 return;
3984 }
3985 /* Capture start position */
3986 if (ctxt->record_info) {
3987 node_info.begin_pos = ctxt->input->consumed +
3988 (CUR_PTR - ctxt->input->base);
3989 node_info.begin_line = ctxt->input->line;
3990 }
3991
3992 failed = htmlParseStartTag(ctxt);
3993 name = ctxt->name;
3994 if (failed || (name == NULL)) {
3995 if (CUR == '>')
3996 NEXT;
3997 return;
3998 }
3999
4000 /*
4001 * Lookup the info for that element.
4002 */
4003 info = htmlTagLookup(name);
4004 if (info == NULL) {
4005 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4006 "Tag %s invalid\n", name, NULL);
4007 }
4008
4009 /*
4010 * Check for an Empty Element labeled the XML/SGML way
4011 */
4012 if ((CUR == '/') && (NXT(1) == '>')) {
4013 SKIP(2);
4014 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4015 ctxt->sax->endElement(ctxt->userData, name);
4016 htmlnamePop(ctxt);
4017 return;
4018 }
4019
4020 if (CUR == '>') {
4021 NEXT;
4022 } else {
4023 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4024 "Couldn't find end of Start Tag %s\n", name, NULL);
4025
4026 /*
4027 * end of parsing of this node.
4028 */
4029 if (xmlStrEqual(name, ctxt->name)) {
4030 nodePop(ctxt);
4031 htmlnamePop(ctxt);
4032 }
4033
4034 /*
4035 * Capture end position and add node
4036 */
4037 if (ctxt->record_info) {
4038 node_info.end_pos = ctxt->input->consumed +
4039 (CUR_PTR - ctxt->input->base);
4040 node_info.end_line = ctxt->input->line;
4041 node_info.node = ctxt->node;
4042 xmlParserAddNodeInfo(ctxt, &node_info);
4043 }
4044 return;
4045 }
4046
4047 /*
4048 * Check for an Empty Element from DTD definition
4049 */
4050 if ((info != NULL) && (info->empty)) {
4051 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4052 ctxt->sax->endElement(ctxt->userData, name);
4053 htmlnamePop(ctxt);
4054 return;
4055 }
4056
4057 /*
4058 * Parse the content of the element:
4059 */
4060 currentNode = xmlStrdup(ctxt->name);
4061 depth = ctxt->nameNr;
4062 while (IS_CHAR_CH(CUR)) {
4063 oldptr = ctxt->input->cur;
4064 htmlParseContent(ctxt);
4065 if (oldptr==ctxt->input->cur) break;
4066 if (ctxt->nameNr < depth) break;
4067 }
4068
4069 /*
4070 * Capture end position and add node
4071 */
4072 if ( currentNode != NULL && ctxt->record_info ) {
4073 node_info.end_pos = ctxt->input->consumed +
4074 (CUR_PTR - ctxt->input->base);
4075 node_info.end_line = ctxt->input->line;
4076 node_info.node = ctxt->node;
4077 xmlParserAddNodeInfo(ctxt, &node_info);
4078 }
4079 if (!IS_CHAR_CH(CUR)) {
4080 htmlAutoCloseOnEnd(ctxt);
4081 }
4082
4083 if (currentNode != NULL)
4084 xmlFree(currentNode);
4085}
4086
4087/**
4088 * htmlParseDocument:
4089 * @ctxt: an HTML parser context
4090 *
4091 * parse an HTML document (and build a tree if using the standard SAX
4092 * interface).
4093 *
4094 * Returns 0, -1 in case of error. the parser context is augmented
4095 * as a result of the parsing.
4096 */
4097
4098int
4099htmlParseDocument(htmlParserCtxtPtr ctxt) {
4100 xmlDtdPtr dtd;
4101
4102 xmlInitParser();
4103
4104 htmlDefaultSAXHandlerInit();
4105
4106 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4107 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4108 "htmlParseDocument: context error\n", NULL, NULL);
4109 return(XML_ERR_INTERNAL_ERROR);
4110 }
4111 ctxt->html = 1;
4112 GROW;
4113 /*
4114 * SAX: beginning of the document processing.
4115 */
4116 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4117 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4118
4119 /*
4120 * Wipe out everything which is before the first '<'
4121 */
4122 SKIP_BLANKS;
4123 if (CUR == 0) {
4124 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4125 "Document is empty\n", NULL, NULL);
4126 }
4127
4128 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4129 ctxt->sax->startDocument(ctxt->userData);
4130
4131
4132 /*
4133 * Parse possible comments and PIs before any content
4134 */
4135 while (((CUR == '<') && (NXT(1) == '!') &&
4136 (NXT(2) == '-') && (NXT(3) == '-')) ||
4137 ((CUR == '<') && (NXT(1) == '?'))) {
4138 htmlParseComment(ctxt);
4139 htmlParsePI(ctxt);
4140 SKIP_BLANKS;
4141 }
4142
4143
4144 /*
4145 * Then possibly doc type declaration(s) and more Misc
4146 * (doctypedecl Misc*)?
4147 */
4148 if ((CUR == '<') && (NXT(1) == '!') &&
4149 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4150 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4151 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4152 (UPP(8) == 'E')) {
4153 htmlParseDocTypeDecl(ctxt);
4154 }
4155 SKIP_BLANKS;
4156
4157 /*
4158 * Parse possible comments and PIs before any content
4159 */
4160 while (((CUR == '<') && (NXT(1) == '!') &&
4161 (NXT(2) == '-') && (NXT(3) == '-')) ||
4162 ((CUR == '<') && (NXT(1) == '?'))) {
4163 htmlParseComment(ctxt);
4164 htmlParsePI(ctxt);
4165 SKIP_BLANKS;
4166 }
4167
4168 /*
4169 * Time to start parsing the tree itself
4170 */
4171 htmlParseContent(ctxt);
4172
4173 /*
4174 * autoclose
4175 */
4176 if (CUR == 0)
4177 htmlAutoCloseOnEnd(ctxt);
4178
4179
4180 /*
4181 * SAX: end of the document processing.
4182 */
4183 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4184 ctxt->sax->endDocument(ctxt->userData);
4185
4186 if (ctxt->myDoc != NULL) {
4187 dtd = xmlGetIntSubset(ctxt->myDoc);
4188 if (dtd == NULL)
4189 ctxt->myDoc->intSubset =
4190 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4191 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4192 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4193 }
4194 if (! ctxt->wellFormed) return(-1);
4195 return(0);
4196}
4197
4198
4199/************************************************************************
4200 * *
4201 * Parser contexts handling *
4202 * *
4203 ************************************************************************/
4204
4205/**
4206 * htmlInitParserCtxt:
4207 * @ctxt: an HTML parser context
4208 *
4209 * Initialize a parser context
4210 *
4211 * Returns 0 in case of success and -1 in case of error
4212 */
4213
4214static int
4215htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4216{
4217 htmlSAXHandler *sax;
4218
4219 if (ctxt == NULL) return(-1);
4220 memset(ctxt, 0, sizeof(htmlParserCtxt));
4221
4222 ctxt->dict = xmlDictCreate();
4223 if (ctxt->dict == NULL) {
4224 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4225 return(-1);
4226 }
4227 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4228 if (sax == NULL) {
4229 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4230 return(-1);
4231 }
4232 else
4233 memset(sax, 0, sizeof(htmlSAXHandler));
4234
4235 /* Allocate the Input stack */
4236 ctxt->inputTab = (htmlParserInputPtr *)
4237 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4238 if (ctxt->inputTab == NULL) {
4239 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4240 ctxt->inputNr = 0;
4241 ctxt->inputMax = 0;
4242 ctxt->input = NULL;
4243 return(-1);
4244 }
4245 ctxt->inputNr = 0;
4246 ctxt->inputMax = 5;
4247 ctxt->input = NULL;
4248 ctxt->version = NULL;
4249 ctxt->encoding = NULL;
4250 ctxt->standalone = -1;
4251 ctxt->instate = XML_PARSER_START;
4252
4253 /* Allocate the Node stack */
4254 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4255 if (ctxt->nodeTab == NULL) {
4256 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4257 ctxt->nodeNr = 0;
4258 ctxt->nodeMax = 0;
4259 ctxt->node = NULL;
4260 ctxt->inputNr = 0;
4261 ctxt->inputMax = 0;
4262 ctxt->input = NULL;
4263 return(-1);
4264 }
4265 ctxt->nodeNr = 0;
4266 ctxt->nodeMax = 10;
4267 ctxt->node = NULL;
4268
4269 /* Allocate the Name stack */
4270 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4271 if (ctxt->nameTab == NULL) {
4272 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4273 ctxt->nameNr = 0;
4274 ctxt->nameMax = 10;
4275 ctxt->name = NULL;
4276 ctxt->nodeNr = 0;
4277 ctxt->nodeMax = 0;
4278 ctxt->node = NULL;
4279 ctxt->inputNr = 0;
4280 ctxt->inputMax = 0;
4281 ctxt->input = NULL;
4282 return(-1);
4283 }
4284 ctxt->nameNr = 0;
4285 ctxt->nameMax = 10;
4286 ctxt->name = NULL;
4287
4288 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4289 else {
4290 ctxt->sax = sax;
4291 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4292 }
4293 ctxt->userData = ctxt;
4294 ctxt->myDoc = NULL;
4295 ctxt->wellFormed = 1;
4296 ctxt->replaceEntities = 0;
4297 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4298 ctxt->html = 1;
4299 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4300 ctxt->vctxt.userData = ctxt;
4301 ctxt->vctxt.error = xmlParserValidityError;
4302 ctxt->vctxt.warning = xmlParserValidityWarning;
4303 ctxt->record_info = 0;
4304 ctxt->validate = 0;
4305 ctxt->nbChars = 0;
4306 ctxt->checkIndex = 0;
4307 ctxt->catalogs = NULL;
4308 xmlInitNodeInfoSeq(&ctxt->node_seq);
4309 return(0);
4310}
4311
4312/**
4313 * htmlFreeParserCtxt:
4314 * @ctxt: an HTML parser context
4315 *
4316 * Free all the memory used by a parser context. However the parsed
4317 * document in ctxt->myDoc is not freed.
4318 */
4319
4320void
4321htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4322{
4323 xmlFreeParserCtxt(ctxt);
4324}
4325
4326/**
4327 * htmlNewParserCtxt:
4328 *
4329 * Allocate and initialize a new parser context.
4330 *
4331 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4332 */
4333
4334htmlParserCtxtPtr
4335htmlNewParserCtxt(void)
4336{
4337 xmlParserCtxtPtr ctxt;
4338
4339 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4340 if (ctxt == NULL) {
4341 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4342 return(NULL);
4343 }
4344 memset(ctxt, 0, sizeof(xmlParserCtxt));
4345 if (htmlInitParserCtxt(ctxt) < 0) {
4346 htmlFreeParserCtxt(ctxt);
4347 return(NULL);
4348 }
4349 return(ctxt);
4350}
4351
4352/**
4353 * htmlCreateMemoryParserCtxt:
4354 * @buffer: a pointer to a char array
4355 * @size: the size of the array
4356 *
4357 * Create a parser context for an HTML in-memory document.
4358 *
4359 * Returns the new parser context or NULL
4360 */
4361htmlParserCtxtPtr
4362htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4363 xmlParserCtxtPtr ctxt;
4364 xmlParserInputPtr input;
4365 xmlParserInputBufferPtr buf;
4366
4367 if (buffer == NULL)
4368 return(NULL);
4369 if (size <= 0)
4370 return(NULL);
4371
4372 ctxt = htmlNewParserCtxt();
4373 if (ctxt == NULL)
4374 return(NULL);
4375
4376 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4377 if (buf == NULL) return(NULL);
4378
4379 input = xmlNewInputStream(ctxt);
4380 if (input == NULL) {
4381 xmlFreeParserCtxt(ctxt);
4382 return(NULL);
4383 }
4384
4385 input->filename = NULL;
4386 input->buf = buf;
4387 input->base = input->buf->buffer->content;
4388 input->cur = input->buf->buffer->content;
4389 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4390
4391 inputPush(ctxt, input);
4392 return(ctxt);
4393}
4394
4395/**
4396 * htmlCreateDocParserCtxt:
4397 * @cur: a pointer to an array of xmlChar
4398 * @encoding: a free form C string describing the HTML document encoding, or NULL
4399 *
4400 * Create a parser context for an HTML document.
4401 *
4402 * TODO: check the need to add encoding handling there
4403 *
4404 * Returns the new parser context or NULL
4405 */
4406static htmlParserCtxtPtr
4407htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
4408 int len;
4409 htmlParserCtxtPtr ctxt;
4410
4411 if (cur == NULL)
4412 return(NULL);
4413 len = xmlStrlen(cur);
4414 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4415 if (ctxt == NULL)
4416 return(NULL);
4417
4418 if (encoding != NULL) {
4419 xmlCharEncoding enc;
4420 xmlCharEncodingHandlerPtr handler;
4421
4422 if (ctxt->input->encoding != NULL)
4423 xmlFree((xmlChar *) ctxt->input->encoding);
4424 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4425
4426 enc = xmlParseCharEncoding(encoding);
4427 /*
4428 * registered set of known encodings
4429 */
4430 if (enc != XML_CHAR_ENCODING_ERROR) {
4431 xmlSwitchEncoding(ctxt, enc);
4432 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4433 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4434 "Unsupported encoding %s\n",
4435 (const xmlChar *) encoding, NULL);
4436 }
4437 } else {
4438 /*
4439 * fallback for unknown encodings
4440 */
4441 handler = xmlFindCharEncodingHandler((const char *) encoding);
4442 if (handler != NULL) {
4443 xmlSwitchToEncoding(ctxt, handler);
4444 } else {
4445 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4446 "Unsupported encoding %s\n",
4447 (const xmlChar *) encoding, NULL);
4448 }
4449 }
4450 }
4451 return(ctxt);
4452}
4453
4454#ifdef LIBXML_PUSH_ENABLED
4455/************************************************************************
4456 * *
4457 * Progressive parsing interfaces *
4458 * *
4459 ************************************************************************/
4460
4461/**
4462 * htmlParseLookupSequence:
4463 * @ctxt: an HTML parser context
4464 * @first: the first char to lookup
4465 * @next: the next char to lookup or zero
4466 * @third: the next char to lookup or zero
4467 * @comment: flag to force checking inside comments
4468 *
4469 * Try to find if a sequence (first, next, third) or just (first next) or
4470 * (first) is available in the input stream.
4471 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4472 * to avoid rescanning sequences of bytes, it DOES change the state of the
4473 * parser, do not use liberally.
4474 * This is basically similar to xmlParseLookupSequence()
4475 *
4476 * Returns the index to the current parsing point if the full sequence
4477 * is available, -1 otherwise.
4478 */
4479static int
4480htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4481 xmlChar next, xmlChar third, int iscomment) {
4482 int base, len;
4483 htmlParserInputPtr in;
4484 const xmlChar *buf;
4485 int incomment = 0;
4486
4487 in = ctxt->input;
4488 if (in == NULL) return(-1);
4489 base = in->cur - in->base;
4490 if (base < 0) return(-1);
4491 if (ctxt->checkIndex > base)
4492 base = ctxt->checkIndex;
4493 if (in->buf == NULL) {
4494 buf = in->base;
4495 len = in->length;
4496 } else {
4497 buf = in->buf->buffer->content;
4498 len = in->buf->buffer->use;
4499 }
4500 /* take into account the sequence length */
4501 if (third) len -= 2;
4502 else if (next) len --;
4503 for (;base < len;base++) {
4504 if (!incomment && (base + 4 < len) && !iscomment) {
4505 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4506 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4507 incomment = 1;
4508 /* do not increment past <! - some people use <!--> */
4509 base += 2;
4510 }
4511 }
4512 if (incomment) {
4513 if (base + 3 > len)
4514 return(-1);
4515 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4516 (buf[base + 2] == '>')) {
4517 incomment = 0;
4518 base += 2;
4519 }
4520 continue;
4521 }
4522 if (buf[base] == first) {
4523 if (third != 0) {
4524 if ((buf[base + 1] != next) ||
4525 (buf[base + 2] != third)) continue;
4526 } else if (next != 0) {
4527 if (buf[base + 1] != next) continue;
4528 }
4529 ctxt->checkIndex = 0;
4530#ifdef DEBUG_PUSH
4531 if (next == 0)
4532 xmlGenericError(xmlGenericErrorContext,
4533 "HPP: lookup '%c' found at %d\n",
4534 first, base);
4535 else if (third == 0)
4536 xmlGenericError(xmlGenericErrorContext,
4537 "HPP: lookup '%c%c' found at %d\n",
4538 first, next, base);
4539 else
4540 xmlGenericError(xmlGenericErrorContext,
4541 "HPP: lookup '%c%c%c' found at %d\n",
4542 first, next, third, base);
4543#endif
4544 return(base - (in->cur - in->base));
4545 }
4546 }
4547 ctxt->checkIndex = base;
4548#ifdef DEBUG_PUSH
4549 if (next == 0)
4550 xmlGenericError(xmlGenericErrorContext,
4551 "HPP: lookup '%c' failed\n", first);
4552 else if (third == 0)
4553 xmlGenericError(xmlGenericErrorContext,
4554 "HPP: lookup '%c%c' failed\n", first, next);
4555 else
4556 xmlGenericError(xmlGenericErrorContext,
4557 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4558#endif
4559 return(-1);
4560}
4561
4562/**
4563 * htmlParseTryOrFinish:
4564 * @ctxt: an HTML parser context
4565 * @terminate: last chunk indicator
4566 *
4567 * Try to progress on parsing
4568 *
4569 * Returns zero if no parsing was possible
4570 */
4571static int
4572htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4573 int ret = 0;
4574 htmlParserInputPtr in;
4575 int avail = 0;
4576 xmlChar cur, next;
4577
4578#ifdef DEBUG_PUSH
4579 switch (ctxt->instate) {
4580 case XML_PARSER_EOF:
4581 xmlGenericError(xmlGenericErrorContext,
4582 "HPP: try EOF\n"); break;
4583 case XML_PARSER_START:
4584 xmlGenericError(xmlGenericErrorContext,
4585 "HPP: try START\n"); break;
4586 case XML_PARSER_MISC:
4587 xmlGenericError(xmlGenericErrorContext,
4588 "HPP: try MISC\n");break;
4589 case XML_PARSER_COMMENT:
4590 xmlGenericError(xmlGenericErrorContext,
4591 "HPP: try COMMENT\n");break;
4592 case XML_PARSER_PROLOG:
4593 xmlGenericError(xmlGenericErrorContext,
4594 "HPP: try PROLOG\n");break;
4595 case XML_PARSER_START_TAG:
4596 xmlGenericError(xmlGenericErrorContext,
4597 "HPP: try START_TAG\n");break;
4598 case XML_PARSER_CONTENT:
4599 xmlGenericError(xmlGenericErrorContext,
4600 "HPP: try CONTENT\n");break;
4601 case XML_PARSER_CDATA_SECTION:
4602 xmlGenericError(xmlGenericErrorContext,
4603 "HPP: try CDATA_SECTION\n");break;
4604 case XML_PARSER_END_TAG:
4605 xmlGenericError(xmlGenericErrorContext,
4606 "HPP: try END_TAG\n");break;
4607 case XML_PARSER_ENTITY_DECL:
4608 xmlGenericError(xmlGenericErrorContext,
4609 "HPP: try ENTITY_DECL\n");break;
4610 case XML_PARSER_ENTITY_VALUE:
4611 xmlGenericError(xmlGenericErrorContext,
4612 "HPP: try ENTITY_VALUE\n");break;
4613 case XML_PARSER_ATTRIBUTE_VALUE:
4614 xmlGenericError(xmlGenericErrorContext,
4615 "HPP: try ATTRIBUTE_VALUE\n");break;
4616 case XML_PARSER_DTD:
4617 xmlGenericError(xmlGenericErrorContext,
4618 "HPP: try DTD\n");break;
4619 case XML_PARSER_EPILOG:
4620 xmlGenericError(xmlGenericErrorContext,
4621 "HPP: try EPILOG\n");break;
4622 case XML_PARSER_PI:
4623 xmlGenericError(xmlGenericErrorContext,
4624 "HPP: try PI\n");break;
4625 case XML_PARSER_SYSTEM_LITERAL:
4626 xmlGenericError(xmlGenericErrorContext,
4627 "HPP: try SYSTEM_LITERAL\n");break;
4628 }
4629#endif
4630
4631 while (1) {
4632
4633 in = ctxt->input;
4634 if (in == NULL) break;
4635 if (in->buf == NULL)
4636 avail = in->length - (in->cur - in->base);
4637 else
4638 avail = in->buf->buffer->use - (in->cur - in->base);
4639 if ((avail == 0) && (terminate)) {
4640 htmlAutoCloseOnEnd(ctxt);
4641 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4642 /*
4643 * SAX: end of the document processing.
4644 */
4645 ctxt->instate = XML_PARSER_EOF;
4646 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4647 ctxt->sax->endDocument(ctxt->userData);
4648 }
4649 }
4650 if (avail < 1)
4651 goto done;
4652 cur = in->cur[0];
4653 if (cur == 0) {
4654 SKIP(1);
4655 continue;
4656 }
4657
4658 switch (ctxt->instate) {
4659 case XML_PARSER_EOF:
4660 /*
4661 * Document parsing is done !
4662 */
4663 goto done;
4664 case XML_PARSER_START:
4665 /*
4666 * Very first chars read from the document flow.
4667 */
4668 cur = in->cur[0];
4669 if (IS_BLANK_CH(cur)) {
4670 SKIP_BLANKS;
4671 if (in->buf == NULL)
4672 avail = in->length - (in->cur - in->base);
4673 else
4674 avail = in->buf->buffer->use - (in->cur - in->base);
4675 }
4676 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4677 ctxt->sax->setDocumentLocator(ctxt->userData,
4678 &xmlDefaultSAXLocator);
4679 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4680 (!ctxt->disableSAX))
4681 ctxt->sax->startDocument(ctxt->userData);
4682
4683 cur = in->cur[0];
4684 next = in->cur[1];
4685 if ((cur == '<') && (next == '!') &&
4686 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4687 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4688 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4689 (UPP(8) == 'E')) {
4690 if ((!terminate) &&
4691 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4692 goto done;
4693#ifdef DEBUG_PUSH
4694 xmlGenericError(xmlGenericErrorContext,
4695 "HPP: Parsing internal subset\n");
4696#endif
4697 htmlParseDocTypeDecl(ctxt);
4698 ctxt->instate = XML_PARSER_PROLOG;
4699#ifdef DEBUG_PUSH
4700 xmlGenericError(xmlGenericErrorContext,
4701 "HPP: entering PROLOG\n");
4702#endif
4703 } else {
4704 ctxt->instate = XML_PARSER_MISC;
4705#ifdef DEBUG_PUSH
4706 xmlGenericError(xmlGenericErrorContext,
4707 "HPP: entering MISC\n");
4708#endif
4709 }
4710 break;
4711 case XML_PARSER_MISC:
4712 SKIP_BLANKS;
4713 if (in->buf == NULL)
4714 avail = in->length - (in->cur - in->base);
4715 else
4716 avail = in->buf->buffer->use - (in->cur - in->base);
4717 if (avail < 2)
4718 goto done;
4719 cur = in->cur[0];
4720 next = in->cur[1];
4721 if ((cur == '<') && (next == '!') &&
4722 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4723 if ((!terminate) &&
4724 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4725 goto done;
4726#ifdef DEBUG_PUSH
4727 xmlGenericError(xmlGenericErrorContext,
4728 "HPP: Parsing Comment\n");
4729#endif
4730 htmlParseComment(ctxt);
4731 ctxt->instate = XML_PARSER_MISC;
4732 } else if ((cur == '<') && (next == '?')) {
4733 if ((!terminate) &&
4734 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4735 goto done;
4736#ifdef DEBUG_PUSH
4737 xmlGenericError(xmlGenericErrorContext,
4738 "HPP: Parsing PI\n");
4739#endif
4740 htmlParsePI(ctxt);
4741 ctxt->instate = XML_PARSER_MISC;
4742 } else if ((cur == '<') && (next == '!') &&
4743 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4744 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4745 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4746 (UPP(8) == 'E')) {
4747 if ((!terminate) &&
4748 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4749 goto done;
4750#ifdef DEBUG_PUSH
4751 xmlGenericError(xmlGenericErrorContext,
4752 "HPP: Parsing internal subset\n");
4753#endif
4754 htmlParseDocTypeDecl(ctxt);
4755 ctxt->instate = XML_PARSER_PROLOG;
4756#ifdef DEBUG_PUSH
4757 xmlGenericError(xmlGenericErrorContext,
4758 "HPP: entering PROLOG\n");
4759#endif
4760 } else if ((cur == '<') && (next == '!') &&
4761 (avail < 9)) {
4762 goto done;
4763 } else {
4764 ctxt->instate = XML_PARSER_START_TAG;
4765#ifdef DEBUG_PUSH
4766 xmlGenericError(xmlGenericErrorContext,
4767 "HPP: entering START_TAG\n");
4768#endif
4769 }
4770 break;
4771 case XML_PARSER_PROLOG:
4772 SKIP_BLANKS;
4773 if (in->buf == NULL)
4774 avail = in->length - (in->cur - in->base);
4775 else
4776 avail = in->buf->buffer->use - (in->cur - in->base);
4777 if (avail < 2)
4778 goto done;
4779 cur = in->cur[0];
4780 next = in->cur[1];
4781 if ((cur == '<') && (next == '!') &&
4782 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4783 if ((!terminate) &&
4784 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4785 goto done;
4786#ifdef DEBUG_PUSH
4787 xmlGenericError(xmlGenericErrorContext,
4788 "HPP: Parsing Comment\n");
4789#endif
4790 htmlParseComment(ctxt);
4791 ctxt->instate = XML_PARSER_PROLOG;
4792 } else if ((cur == '<') && (next == '?')) {
4793 if ((!terminate) &&
4794 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4795 goto done;
4796#ifdef DEBUG_PUSH
4797 xmlGenericError(xmlGenericErrorContext,
4798 "HPP: Parsing PI\n");
4799#endif
4800 htmlParsePI(ctxt);
4801 ctxt->instate = XML_PARSER_PROLOG;
4802 } else if ((cur == '<') && (next == '!') &&
4803 (avail < 4)) {
4804 goto done;
4805 } else {
4806 ctxt->instate = XML_PARSER_START_TAG;
4807#ifdef DEBUG_PUSH
4808 xmlGenericError(xmlGenericErrorContext,
4809 "HPP: entering START_TAG\n");
4810#endif
4811 }
4812 break;
4813 case XML_PARSER_EPILOG:
4814 if (in->buf == NULL)
4815 avail = in->length - (in->cur - in->base);
4816 else
4817 avail = in->buf->buffer->use - (in->cur - in->base);
4818 if (avail < 1)
4819 goto done;
4820 cur = in->cur[0];
4821 if (IS_BLANK_CH(cur)) {
4822 htmlParseCharData(ctxt);
4823 goto done;
4824 }
4825 if (avail < 2)
4826 goto done;
4827 next = in->cur[1];
4828 if ((cur == '<') && (next == '!') &&
4829 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4830 if ((!terminate) &&
4831 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4832 goto done;
4833#ifdef DEBUG_PUSH
4834 xmlGenericError(xmlGenericErrorContext,
4835 "HPP: Parsing Comment\n");
4836#endif
4837 htmlParseComment(ctxt);
4838 ctxt->instate = XML_PARSER_EPILOG;
4839 } else if ((cur == '<') && (next == '?')) {
4840 if ((!terminate) &&
4841 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4842 goto done;
4843#ifdef DEBUG_PUSH
4844 xmlGenericError(xmlGenericErrorContext,
4845 "HPP: Parsing PI\n");
4846#endif
4847 htmlParsePI(ctxt);
4848 ctxt->instate = XML_PARSER_EPILOG;
4849 } else if ((cur == '<') && (next == '!') &&
4850 (avail < 4)) {
4851 goto done;
4852 } else {
4853 ctxt->errNo = XML_ERR_DOCUMENT_END;
4854 ctxt->wellFormed = 0;
4855 ctxt->instate = XML_PARSER_EOF;
4856#ifdef DEBUG_PUSH
4857 xmlGenericError(xmlGenericErrorContext,
4858 "HPP: entering EOF\n");
4859#endif
4860 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4861 ctxt->sax->endDocument(ctxt->userData);
4862 goto done;
4863 }
4864 break;
4865 case XML_PARSER_START_TAG: {
4866 const xmlChar *name;
4867 int failed;
4868 const htmlElemDesc * info;
4869
4870 if (avail < 2)
4871 goto done;
4872 cur = in->cur[0];
4873 if (cur != '<') {
4874 ctxt->instate = XML_PARSER_CONTENT;
4875#ifdef DEBUG_PUSH
4876 xmlGenericError(xmlGenericErrorContext,
4877 "HPP: entering CONTENT\n");
4878#endif
4879 break;
4880 }
4881 if (in->cur[1] == '/') {
4882 ctxt->instate = XML_PARSER_END_TAG;
4883 ctxt->checkIndex = 0;
4884#ifdef DEBUG_PUSH
4885 xmlGenericError(xmlGenericErrorContext,
4886 "HPP: entering END_TAG\n");
4887#endif
4888 break;
4889 }
4890 if ((!terminate) &&
4891 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4892 goto done;
4893
4894 failed = htmlParseStartTag(ctxt);
4895 name = ctxt->name;
4896 if (failed ||
4897 (name == NULL)) {
4898 if (CUR == '>')
4899 NEXT;
4900 break;
4901 }
4902
4903 /*
4904 * Lookup the info for that element.
4905 */
4906 info = htmlTagLookup(name);
4907 if (info == NULL) {
4908 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4909 "Tag %s invalid\n", name, NULL);
4910 }
4911
4912 /*
4913 * Check for an Empty Element labeled the XML/SGML way
4914 */
4915 if ((CUR == '/') && (NXT(1) == '>')) {
4916 SKIP(2);
4917 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4918 ctxt->sax->endElement(ctxt->userData, name);
4919 htmlnamePop(ctxt);
4920 ctxt->instate = XML_PARSER_CONTENT;
4921#ifdef DEBUG_PUSH
4922 xmlGenericError(xmlGenericErrorContext,
4923 "HPP: entering CONTENT\n");
4924#endif
4925 break;
4926 }
4927
4928 if (CUR == '>') {
4929 NEXT;
4930 } else {
4931 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4932 "Couldn't find end of Start Tag %s\n",
4933 name, NULL);
4934
4935 /*
4936 * end of parsing of this node.
4937 */
4938 if (xmlStrEqual(name, ctxt->name)) {
4939 nodePop(ctxt);
4940 htmlnamePop(ctxt);
4941 }
4942
4943 ctxt->instate = XML_PARSER_CONTENT;
4944#ifdef DEBUG_PUSH
4945 xmlGenericError(xmlGenericErrorContext,
4946 "HPP: entering CONTENT\n");
4947#endif
4948 break;
4949 }
4950
4951 /*
4952 * Check for an Empty Element from DTD definition
4953 */
4954 if ((info != NULL) && (info->empty)) {
4955 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4956 ctxt->sax->endElement(ctxt->userData, name);
4957 htmlnamePop(ctxt);
4958 }
4959 ctxt->instate = XML_PARSER_CONTENT;
4960#ifdef DEBUG_PUSH
4961 xmlGenericError(xmlGenericErrorContext,
4962 "HPP: entering CONTENT\n");
4963#endif
4964 break;
4965 }
4966 case XML_PARSER_CONTENT: {
4967 long cons;
4968 /*
4969 * Handle preparsed entities and charRef
4970 */
4971 if (ctxt->token != 0) {
4972 xmlChar chr[2] = { 0 , 0 } ;
4973
4974 chr[0] = (xmlChar) ctxt->token;
4975 htmlCheckParagraph(ctxt);
4976 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4977 ctxt->sax->characters(ctxt->userData, chr, 1);
4978 ctxt->token = 0;
4979 ctxt->checkIndex = 0;
4980 }
4981 if ((avail == 1) && (terminate)) {
4982 cur = in->cur[0];
4983 if ((cur != '<') && (cur != '&')) {
4984 if (ctxt->sax != NULL) {
4985 if (IS_BLANK_CH(cur)) {
4986 if (ctxt->sax->ignorableWhitespace != NULL)
4987 ctxt->sax->ignorableWhitespace(
4988 ctxt->userData, &cur, 1);
4989 } else {
4990 htmlCheckParagraph(ctxt);
4991 if (ctxt->sax->characters != NULL)
4992 ctxt->sax->characters(
4993 ctxt->userData, &cur, 1);
4994 }
4995 }
4996 ctxt->token = 0;
4997 ctxt->checkIndex = 0;
4998 in->cur++;
4999 break;
5000 }
5001 }
5002 if (avail < 2)
5003 goto done;
5004 cur = in->cur[0];
5005 next = in->cur[1];
5006 cons = ctxt->nbChars;
5007 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5008 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5009 /*
5010 * Handle SCRIPT/STYLE separately
5011 */
5012 if (!terminate) {
5013 int idx;
5014 xmlChar val;
5015
5016 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5017 if (idx < 0)
5018 goto done;
5019 val = in->cur[idx + 2];
5020 if (val == 0) /* bad cut of input */
5021 goto done;
5022 }
5023 htmlParseScript(ctxt);
5024 if ((cur == '<') && (next == '/')) {
5025 ctxt->instate = XML_PARSER_END_TAG;
5026 ctxt->checkIndex = 0;
5027#ifdef DEBUG_PUSH
5028 xmlGenericError(xmlGenericErrorContext,
5029 "HPP: entering END_TAG\n");
5030#endif
5031 break;
5032 }
5033 } else {
5034 /*
5035 * Sometimes DOCTYPE arrives in the middle of the document
5036 */
5037 if ((cur == '<') && (next == '!') &&
5038 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5039 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5040 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5041 (UPP(8) == 'E')) {
5042 if ((!terminate) &&
5043 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5044 goto done;
5045 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5046 "Misplaced DOCTYPE declaration\n",
5047 BAD_CAST "DOCTYPE" , NULL);
5048 htmlParseDocTypeDecl(ctxt);
5049 } else if ((cur == '<') && (next == '!') &&
5050 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5051 if ((!terminate) &&
5052 (htmlParseLookupSequence(
5053 ctxt, '-', '-', '>', 1) < 0))
5054 goto done;
5055#ifdef DEBUG_PUSH
5056 xmlGenericError(xmlGenericErrorContext,
5057 "HPP: Parsing Comment\n");
5058#endif
5059 htmlParseComment(ctxt);
5060 ctxt->instate = XML_PARSER_CONTENT;
5061 } else if ((cur == '<') && (next == '?')) {
5062 if ((!terminate) &&
5063 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5064 goto done;
5065#ifdef DEBUG_PUSH
5066 xmlGenericError(xmlGenericErrorContext,
5067 "HPP: Parsing PI\n");
5068#endif
5069 htmlParsePI(ctxt);
5070 ctxt->instate = XML_PARSER_CONTENT;
5071 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5072 goto done;
5073 } else if ((cur == '<') && (next == '/')) {
5074 ctxt->instate = XML_PARSER_END_TAG;
5075 ctxt->checkIndex = 0;
5076#ifdef DEBUG_PUSH
5077 xmlGenericError(xmlGenericErrorContext,
5078 "HPP: entering END_TAG\n");
5079#endif
5080 break;
5081 } else if (cur == '<') {
5082 ctxt->instate = XML_PARSER_START_TAG;
5083 ctxt->checkIndex = 0;
5084#ifdef DEBUG_PUSH
5085 xmlGenericError(xmlGenericErrorContext,
5086 "HPP: entering START_TAG\n");
5087#endif
5088 break;
5089 } else if (cur == '&') {
5090 if ((!terminate) &&
5091 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5092 goto done;
5093#ifdef DEBUG_PUSH
5094 xmlGenericError(xmlGenericErrorContext,
5095 "HPP: Parsing Reference\n");
5096#endif
5097 /* TODO: check generation of subtrees if noent !!! */
5098 htmlParseReference(ctxt);
5099 } else {
5100 /*
5101 * check that the text sequence is complete
5102 * before handing out the data to the parser
5103 * to avoid problems with erroneous end of
5104 * data detection.
5105 */
5106 if ((!terminate) &&
5107 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5108 goto done;
5109 ctxt->checkIndex = 0;
5110#ifdef DEBUG_PUSH
5111 xmlGenericError(xmlGenericErrorContext,
5112 "HPP: Parsing char data\n");
5113#endif
5114 htmlParseCharData(ctxt);
5115 }
5116 }
5117 if (cons == ctxt->nbChars) {
5118 if (ctxt->node != NULL) {
5119 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5120 "detected an error in element content\n",
5121 NULL, NULL);
5122 }
5123 NEXT;
5124 break;
5125 }
5126
5127 break;
5128 }
5129 case XML_PARSER_END_TAG:
5130 if (avail < 2)
5131 goto done;
5132 if ((!terminate) &&
5133 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5134 goto done;
5135 htmlParseEndTag(ctxt);
5136 if (ctxt->nameNr == 0) {
5137 ctxt->instate = XML_PARSER_EPILOG;
5138 } else {
5139 ctxt->instate = XML_PARSER_CONTENT;
5140 }
5141 ctxt->checkIndex = 0;
5142#ifdef DEBUG_PUSH
5143 xmlGenericError(xmlGenericErrorContext,
5144 "HPP: entering CONTENT\n");
5145#endif
5146 break;
5147 case XML_PARSER_CDATA_SECTION:
5148 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5149 "HPP: internal error, state == CDATA\n",
5150 NULL, NULL);
5151 ctxt->instate = XML_PARSER_CONTENT;
5152 ctxt->checkIndex = 0;
5153#ifdef DEBUG_PUSH
5154 xmlGenericError(xmlGenericErrorContext,
5155 "HPP: entering CONTENT\n");
5156#endif
5157 break;
5158 case XML_PARSER_DTD:
5159 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5160 "HPP: internal error, state == DTD\n",
5161 NULL, NULL);
5162 ctxt->instate = XML_PARSER_CONTENT;
5163 ctxt->checkIndex = 0;
5164#ifdef DEBUG_PUSH
5165 xmlGenericError(xmlGenericErrorContext,
5166 "HPP: entering CONTENT\n");
5167#endif
5168 break;
5169 case XML_PARSER_COMMENT:
5170 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5171 "HPP: internal error, state == COMMENT\n",
5172 NULL, NULL);
5173 ctxt->instate = XML_PARSER_CONTENT;
5174 ctxt->checkIndex = 0;
5175#ifdef DEBUG_PUSH
5176 xmlGenericError(xmlGenericErrorContext,
5177 "HPP: entering CONTENT\n");
5178#endif
5179 break;
5180 case XML_PARSER_PI:
5181 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5182 "HPP: internal error, state == PI\n",
5183 NULL, NULL);
5184 ctxt->instate = XML_PARSER_CONTENT;
5185 ctxt->checkIndex = 0;
5186#ifdef DEBUG_PUSH
5187 xmlGenericError(xmlGenericErrorContext,
5188 "HPP: entering CONTENT\n");
5189#endif
5190 break;
5191 case XML_PARSER_ENTITY_DECL:
5192 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5193 "HPP: internal error, state == ENTITY_DECL\n",
5194 NULL, NULL);
5195 ctxt->instate = XML_PARSER_CONTENT;
5196 ctxt->checkIndex = 0;
5197#ifdef DEBUG_PUSH
5198 xmlGenericError(xmlGenericErrorContext,
5199 "HPP: entering CONTENT\n");
5200#endif
5201 break;
5202 case XML_PARSER_ENTITY_VALUE:
5203 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5204 "HPP: internal error, state == ENTITY_VALUE\n",
5205 NULL, NULL);
5206 ctxt->instate = XML_PARSER_CONTENT;
5207 ctxt->checkIndex = 0;
5208#ifdef DEBUG_PUSH
5209 xmlGenericError(xmlGenericErrorContext,
5210 "HPP: entering DTD\n");
5211#endif
5212 break;
5213 case XML_PARSER_ATTRIBUTE_VALUE:
5214 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5215 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5216 NULL, NULL);
5217 ctxt->instate = XML_PARSER_START_TAG;
5218 ctxt->checkIndex = 0;
5219#ifdef DEBUG_PUSH
5220 xmlGenericError(xmlGenericErrorContext,
5221 "HPP: entering START_TAG\n");
5222#endif
5223 break;
5224 case XML_PARSER_SYSTEM_LITERAL:
5225 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5226 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5227 NULL, NULL);
5228 ctxt->instate = XML_PARSER_CONTENT;
5229 ctxt->checkIndex = 0;
5230#ifdef DEBUG_PUSH
5231 xmlGenericError(xmlGenericErrorContext,
5232 "HPP: entering CONTENT\n");
5233#endif
5234 break;
5235 case XML_PARSER_IGNORE:
5236 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5237 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5238 NULL, NULL);
5239 ctxt->instate = XML_PARSER_CONTENT;
5240 ctxt->checkIndex = 0;
5241#ifdef DEBUG_PUSH
5242 xmlGenericError(xmlGenericErrorContext,
5243 "HPP: entering CONTENT\n");
5244#endif
5245 break;
5246 case XML_PARSER_PUBLIC_LITERAL:
5247 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5248 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5249 NULL, NULL);
5250 ctxt->instate = XML_PARSER_CONTENT;
5251 ctxt->checkIndex = 0;
5252#ifdef DEBUG_PUSH
5253 xmlGenericError(xmlGenericErrorContext,
5254 "HPP: entering CONTENT\n");
5255#endif
5256 break;
5257
5258 }
5259 }
5260done:
5261 if ((avail == 0) && (terminate)) {
5262 htmlAutoCloseOnEnd(ctxt);
5263 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5264 /*
5265 * SAX: end of the document processing.
5266 */
5267 ctxt->instate = XML_PARSER_EOF;
5268 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5269 ctxt->sax->endDocument(ctxt->userData);
5270 }
5271 }
5272 if ((ctxt->myDoc != NULL) &&
5273 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5274 (ctxt->instate == XML_PARSER_EPILOG))) {
5275 xmlDtdPtr dtd;
5276 dtd = xmlGetIntSubset(ctxt->myDoc);
5277 if (dtd == NULL)
5278 ctxt->myDoc->intSubset =
5279 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5280 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5281 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5282 }
5283#ifdef DEBUG_PUSH
5284 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5285#endif
5286 return(ret);
5287}
5288
5289/**
5290 * htmlParseChunk:
5291 * @ctxt: an HTML parser context
5292 * @chunk: an char array
5293 * @size: the size in byte of the chunk
5294 * @terminate: last chunk indicator
5295 *
5296 * Parse a Chunk of memory
5297 *
5298 * Returns zero if no error, the xmlParserErrors otherwise.
5299 */
5300int
5301htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5302 int terminate) {
5303 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5304 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5305 "htmlParseChunk: context error\n", NULL, NULL);
5306 return(XML_ERR_INTERNAL_ERROR);
5307 }
5308 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5309 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5310 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5311 int cur = ctxt->input->cur - ctxt->input->base;
5312 int res;
5313
5314 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5315 if (res < 0) {
5316 ctxt->errNo = XML_PARSER_EOF;
5317 ctxt->disableSAX = 1;
5318 return (XML_PARSER_EOF);
5319 }
5320 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5321 ctxt->input->cur = ctxt->input->base + cur;
5322 ctxt->input->end =
5323 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5324#ifdef DEBUG_PUSH
5325 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5326#endif
5327
5328#if 0
5329 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5330 htmlParseTryOrFinish(ctxt, terminate);
5331#endif
5332 } else if (ctxt->instate != XML_PARSER_EOF) {
5333 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5334 xmlParserInputBufferPtr in = ctxt->input->buf;
5335 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5336 (in->raw != NULL)) {
5337 int nbchars;
5338
5339 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5340 if (nbchars < 0) {
5341 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5342 "encoder error\n", NULL, NULL);
5343 return(XML_ERR_INVALID_ENCODING);
5344 }
5345 }
5346 }
5347 }
5348 htmlParseTryOrFinish(ctxt, terminate);
5349 if (terminate) {
5350 if ((ctxt->instate != XML_PARSER_EOF) &&
5351 (ctxt->instate != XML_PARSER_EPILOG) &&
5352 (ctxt->instate != XML_PARSER_MISC)) {
5353 ctxt->errNo = XML_ERR_DOCUMENT_END;
5354 ctxt->wellFormed = 0;
5355 }
5356 if (ctxt->instate != XML_PARSER_EOF) {
5357 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5358 ctxt->sax->endDocument(ctxt->userData);
5359 }
5360 ctxt->instate = XML_PARSER_EOF;
5361 }
5362 return((xmlParserErrors) ctxt->errNo);
5363}
5364
5365/************************************************************************
5366 * *
5367 * User entry points *
5368 * *
5369 ************************************************************************/
5370
5371/**
5372 * htmlCreatePushParserCtxt:
5373 * @sax: a SAX handler
5374 * @user_data: The user data returned on SAX callbacks
5375 * @chunk: a pointer to an array of chars
5376 * @size: number of chars in the array
5377 * @filename: an optional file name or URI
5378 * @enc: an optional encoding
5379 *
5380 * Create a parser context for using the HTML parser in push mode
5381 * The value of @filename is used for fetching external entities
5382 * and error/warning reports.
5383 *
5384 * Returns the new parser context or NULL
5385 */
5386htmlParserCtxtPtr
5387htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5388 const char *chunk, int size, const char *filename,
5389 xmlCharEncoding enc) {
5390 htmlParserCtxtPtr ctxt;
5391 htmlParserInputPtr inputStream;
5392 xmlParserInputBufferPtr buf;
5393
5394 xmlInitParser();
5395
5396 buf = xmlAllocParserInputBuffer(enc);
5397 if (buf == NULL) return(NULL);
5398
5399 ctxt = htmlNewParserCtxt();
5400 if (ctxt == NULL) {
5401 xmlFreeParserInputBuffer(buf);
5402 return(NULL);
5403 }
5404 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5405 ctxt->charset=XML_CHAR_ENCODING_UTF8;
5406 if (sax != NULL) {
5407 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5408 xmlFree(ctxt->sax);
5409 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5410 if (ctxt->sax == NULL) {
5411 xmlFree(buf);
5412 xmlFree(ctxt);
5413 return(NULL);
5414 }
5415 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5416 if (user_data != NULL)
5417 ctxt->userData = user_data;
5418 }
5419 if (filename == NULL) {
5420 ctxt->directory = NULL;
5421 } else {
5422 ctxt->directory = xmlParserGetDirectory(filename);
5423 }
5424
5425 inputStream = htmlNewInputStream(ctxt);
5426 if (inputStream == NULL) {
5427 xmlFreeParserCtxt(ctxt);
5428 xmlFree(buf);
5429 return(NULL);
5430 }
5431
5432 if (filename == NULL)
5433 inputStream->filename = NULL;
5434 else
5435 inputStream->filename = (char *)
5436 xmlCanonicPath((const xmlChar *) filename);
5437 inputStream->buf = buf;
5438 inputStream->base = inputStream->buf->buffer->content;
5439 inputStream->cur = inputStream->buf->buffer->content;
5440 inputStream->end =
5441 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5442
5443 inputPush(ctxt, inputStream);
5444
5445 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5446 (ctxt->input->buf != NULL)) {
5447 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5448 int cur = ctxt->input->cur - ctxt->input->base;
5449
5450 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5451
5452 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5453 ctxt->input->cur = ctxt->input->base + cur;
5454 ctxt->input->end =
5455 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5456#ifdef DEBUG_PUSH
5457 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5458#endif
5459 }
5460 ctxt->progressive = 1;
5461
5462 return(ctxt);
5463}
5464#endif /* LIBXML_PUSH_ENABLED */
5465
5466/**
5467 * htmlSAXParseDoc:
5468 * @cur: a pointer to an array of xmlChar
5469 * @encoding: a free form C string describing the HTML document encoding, or NULL
5470 * @sax: the SAX handler block
5471 * @userData: if using SAX, this pointer will be provided on callbacks.
5472 *
5473 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5474 * to handle parse events. If sax is NULL, fallback to the default DOM
5475 * behavior and return a tree.
5476 *
5477 * Returns the resulting document tree unless SAX is NULL or the document is
5478 * not well formed.
5479 */
5480
5481htmlDocPtr
5482htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5483 htmlDocPtr ret;
5484 htmlParserCtxtPtr ctxt;
5485
5486 xmlInitParser();
5487
5488 if (cur == NULL) return(NULL);
5489
5490
5491 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5492 if (ctxt == NULL) return(NULL);
5493 if (sax != NULL) {
5494 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5495 ctxt->sax = sax;
5496 ctxt->userData = userData;
5497 }
5498
5499 htmlParseDocument(ctxt);
5500 ret = ctxt->myDoc;
5501 if (sax != NULL) {
5502 ctxt->sax = NULL;
5503 ctxt->userData = NULL;
5504 }
5505 htmlFreeParserCtxt(ctxt);
5506
5507 return(ret);
5508}
5509
5510/**
5511 * htmlParseDoc:
5512 * @cur: a pointer to an array of xmlChar
5513 * @encoding: a free form C string describing the HTML document encoding, or NULL
5514 *
5515 * parse an HTML in-memory document and build a tree.
5516 *
5517 * Returns the resulting document tree
5518 */
5519
5520htmlDocPtr
5521htmlParseDoc(xmlChar *cur, const char *encoding) {
5522 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5523}
5524
5525
5526/**
5527 * htmlCreateFileParserCtxt:
5528 * @filename: the filename
5529 * @encoding: a free form C string describing the HTML document encoding, or NULL
5530 *
5531 * Create a parser context for a file content.
5532 * Automatic support for ZLIB/Compress compressed document is provided
5533 * by default if found at compile-time.
5534 *
5535 * Returns the new parser context or NULL
5536 */
5537htmlParserCtxtPtr
5538htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5539{
5540 htmlParserCtxtPtr ctxt;
5541 htmlParserInputPtr inputStream;
5542 char *canonicFilename;
5543 /* htmlCharEncoding enc; */
5544 xmlChar *content, *content_line = (xmlChar *) "charset=";
5545
5546 if (filename == NULL)
5547 return(NULL);
5548
5549 ctxt = htmlNewParserCtxt();
5550 if (ctxt == NULL) {
5551 return(NULL);
5552 }
5553 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5554 if (canonicFilename == NULL) {
5555#ifdef LIBXML_SAX1_ENABLED
5556 if (xmlDefaultSAXHandler.error != NULL) {
5557 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5558 }
5559#endif
5560 xmlFreeParserCtxt(ctxt);
5561 return(NULL);
5562 }
5563
5564 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5565 xmlFree(canonicFilename);
5566 if (inputStream == NULL) {
5567 xmlFreeParserCtxt(ctxt);
5568 return(NULL);
5569 }
5570
5571 inputPush(ctxt, inputStream);
5572
5573 /* set encoding */
5574 if (encoding) {
5575 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5576 if (content) {
5577 strcpy ((char *)content, (char *)content_line);
5578 strcat ((char *)content, (char *)encoding);
5579 htmlCheckEncoding (ctxt, content);
5580 xmlFree (content);
5581 }
5582 }
5583
5584 return(ctxt);
5585}
5586
5587/**
5588 * htmlSAXParseFile:
5589 * @filename: the filename
5590 * @encoding: a free form C string describing the HTML document encoding, or NULL
5591 * @sax: the SAX handler block
5592 * @userData: if using SAX, this pointer will be provided on callbacks.
5593 *
5594 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5595 * compressed document is provided by default if found at compile-time.
5596 * It use the given SAX function block to handle the parsing callback.
5597 * If sax is NULL, fallback to the default DOM tree building routines.
5598 *
5599 * Returns the resulting document tree unless SAX is NULL or the document is
5600 * not well formed.
5601 */
5602
5603htmlDocPtr
5604htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5605 void *userData) {
5606 htmlDocPtr ret;
5607 htmlParserCtxtPtr ctxt;
5608 htmlSAXHandlerPtr oldsax = NULL;
5609
5610 xmlInitParser();
5611
5612 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5613 if (ctxt == NULL) return(NULL);
5614 if (sax != NULL) {
5615 oldsax = ctxt->sax;
5616 ctxt->sax = sax;
5617 ctxt->userData = userData;
5618 }
5619
5620 htmlParseDocument(ctxt);
5621
5622 ret = ctxt->myDoc;
5623 if (sax != NULL) {
5624 ctxt->sax = oldsax;
5625 ctxt->userData = NULL;
5626 }
5627 htmlFreeParserCtxt(ctxt);
5628
5629 return(ret);
5630}
5631
5632/**
5633 * htmlParseFile:
5634 * @filename: the filename
5635 * @encoding: a free form C string describing the HTML document encoding, or NULL
5636 *
5637 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5638 * compressed document is provided by default if found at compile-time.
5639 *
5640 * Returns the resulting document tree
5641 */
5642
5643htmlDocPtr
5644htmlParseFile(const char *filename, const char *encoding) {
5645 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5646}
5647
5648/**
5649 * htmlHandleOmittedElem:
5650 * @val: int 0 or 1
5651 *
5652 * Set and return the previous value for handling HTML omitted tags.
5653 *
5654 * Returns the last value for 0 for no handling, 1 for auto insertion.
5655 */
5656
5657int
5658htmlHandleOmittedElem(int val) {
5659 int old = htmlOmittedDefaultValue;
5660
5661 htmlOmittedDefaultValue = val;
5662 return(old);
5663}
5664
5665/**
5666 * htmlElementAllowedHere:
5667 * @parent: HTML parent element
5668 * @elt: HTML element
5669 *
5670 * Checks whether an HTML element may be a direct child of a parent element.
5671 * Note - doesn't check for deprecated elements
5672 *
5673 * Returns 1 if allowed; 0 otherwise.
5674 */
5675int
5676htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5677 const char** p ;
5678
5679 if ( ! elt || ! parent || ! parent->subelts )
5680 return 0 ;
5681
5682 for ( p = parent->subelts; *p; ++p )
5683 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5684 return 1 ;
5685
5686 return 0 ;
5687}
5688/**
5689 * htmlElementStatusHere:
5690 * @parent: HTML parent element
5691 * @elt: HTML element
5692 *
5693 * Checks whether an HTML element may be a direct child of a parent element.
5694 * and if so whether it is valid or deprecated.
5695 *
5696 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5697 */
5698htmlStatus
5699htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5700 if ( ! parent || ! elt )
5701 return HTML_INVALID ;
5702 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5703 return HTML_INVALID ;
5704
5705 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5706}
5707/**
5708 * htmlAttrAllowed:
5709 * @elt: HTML element
5710 * @attr: HTML attribute
5711 * @legacy: whether to allow deprecated attributes
5712 *
5713 * Checks whether an attribute is valid for an element
5714 * Has full knowledge of Required and Deprecated attributes
5715 *
5716 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5717 */
5718htmlStatus
5719htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5720 const char** p ;
5721
5722 if ( !elt || ! attr )
5723 return HTML_INVALID ;
5724
5725 if ( elt->attrs_req )
5726 for ( p = elt->attrs_req; *p; ++p)
5727 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5728 return HTML_REQUIRED ;
5729
5730 if ( elt->attrs_opt )
5731 for ( p = elt->attrs_opt; *p; ++p)
5732 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5733 return HTML_VALID ;
5734
5735 if ( legacy && elt->attrs_depr )
5736 for ( p = elt->attrs_depr; *p; ++p)
5737 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5738 return HTML_DEPRECATED ;
5739
5740 return HTML_INVALID ;
5741}
5742/**
5743 * htmlNodeStatus:
5744 * @node: an htmlNodePtr in a tree
5745 * @legacy: whether to allow deprecated elements (YES is faster here
5746 * for Element nodes)
5747 *
5748 * Checks whether the tree node is valid. Experimental (the author
5749 * only uses the HTML enhancements in a SAX parser)
5750 *
5751 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5752 * legacy allowed) or htmlElementStatusHere (otherwise).
5753 * for Attribute nodes, a return from htmlAttrAllowed
5754 * for other nodes, HTML_NA (no checks performed)
5755 */
5756htmlStatus
5757htmlNodeStatus(const htmlNodePtr node, int legacy) {
5758 if ( ! node )
5759 return HTML_INVALID ;
5760
5761 switch ( node->type ) {
5762 case XML_ELEMENT_NODE:
5763 return legacy
5764 ? ( htmlElementAllowedHere (
5765 htmlTagLookup(node->parent->name) , node->name
5766 ) ? HTML_VALID : HTML_INVALID )
5767 : htmlElementStatusHere(
5768 htmlTagLookup(node->parent->name) ,
5769 htmlTagLookup(node->name) )
5770 ;
5771 case XML_ATTRIBUTE_NODE:
5772 return htmlAttrAllowed(
5773 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5774 default: return HTML_NA ;
5775 }
5776}
5777/************************************************************************
5778 * *
5779 * New set (2.6.0) of simpler and more flexible APIs *
5780 * *
5781 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionnary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement: "DICT_FREE(x);" stays correct as the branch of an
 * if/else. Callers must supply the terminating semicolon.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
	        (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	        xmlFree((char *)(str));				\
	} while (0)
5793
5794/**
5795 * htmlCtxtReset:
5796 * @ctxt: an HTML parser context
5797 *
5798 * Reset a parser context
5799 */
5800void
5801htmlCtxtReset(htmlParserCtxtPtr ctxt)
5802{
5803 xmlParserInputPtr input;
5804 xmlDictPtr dict;
5805
5806 if (ctxt == NULL)
5807 return;
5808
5809 xmlInitParser();
5810 dict = ctxt->dict;
5811
5812 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5813 xmlFreeInputStream(input);
5814 }
5815 ctxt->inputNr = 0;
5816 ctxt->input = NULL;
5817
5818 ctxt->spaceNr = 0;
5819 if (ctxt->spaceTab != NULL) {
5820 ctxt->spaceTab[0] = -1;
5821 ctxt->space = &ctxt->spaceTab[0];
5822 } else {
5823 ctxt->space = NULL;
5824 }
5825
5826
5827 ctxt->nodeNr = 0;
5828 ctxt->node = NULL;
5829
5830 ctxt->nameNr = 0;
5831 ctxt->name = NULL;
5832
5833 DICT_FREE(ctxt->version);
5834 ctxt->version = NULL;
5835 DICT_FREE(ctxt->encoding);
5836 ctxt->encoding = NULL;
5837 DICT_FREE(ctxt->directory);
5838 ctxt->directory = NULL;
5839 DICT_FREE(ctxt->extSubURI);
5840 ctxt->extSubURI = NULL;
5841 DICT_FREE(ctxt->extSubSystem);
5842 ctxt->extSubSystem = NULL;
5843 if (ctxt->myDoc != NULL)
5844 xmlFreeDoc(ctxt->myDoc);
5845 ctxt->myDoc = NULL;
5846
5847 ctxt->standalone = -1;
5848 ctxt->hasExternalSubset = 0;
5849 ctxt->hasPErefs = 0;
5850 ctxt->html = 1;
5851 ctxt->external = 0;
5852 ctxt->instate = XML_PARSER_START;
5853 ctxt->token = 0;
5854
5855 ctxt->wellFormed = 1;
5856 ctxt->nsWellFormed = 1;
5857 ctxt->valid = 1;
5858 ctxt->vctxt.userData = ctxt;
5859 ctxt->vctxt.error = xmlParserValidityError;
5860 ctxt->vctxt.warning = xmlParserValidityWarning;
5861 ctxt->record_info = 0;
5862 ctxt->nbChars = 0;
5863 ctxt->checkIndex = 0;
5864 ctxt->inSubset = 0;
5865 ctxt->errNo = XML_ERR_OK;
5866 ctxt->depth = 0;
5867 ctxt->charset = XML_CHAR_ENCODING_NONE;
5868 ctxt->catalogs = NULL;
5869 xmlInitNodeInfoSeq(&ctxt->node_seq);
5870
5871 if (ctxt->attsDefault != NULL) {
5872 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5873 ctxt->attsDefault = NULL;
5874 }
5875 if (ctxt->attsSpecial != NULL) {
5876 xmlHashFree(ctxt->attsSpecial, NULL);
5877 ctxt->attsSpecial = NULL;
5878 }
5879}
5880
5881/**
5882 * htmlCtxtUseOptions:
5883 * @ctxt: an HTML parser context
5884 * @options: a combination of htmlParserOption(s)
5885 *
5886 * Applies the options to the parser context
5887 *
5888 * Returns 0 in case of success, the set of unknown or unimplemented options
5889 * in case of error.
5890 */
5891int
5892htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5893{
5894 if (ctxt == NULL)
5895 return(-1);
5896
5897 if (options & HTML_PARSE_NOWARNING) {
5898 ctxt->sax->warning = NULL;
5899 ctxt->vctxt.warning = NULL;
5900 options -= XML_PARSE_NOWARNING;
5901 ctxt->options |= XML_PARSE_NOWARNING;
5902 }
5903 if (options & HTML_PARSE_NOERROR) {
5904 ctxt->sax->error = NULL;
5905 ctxt->vctxt.error = NULL;
5906 ctxt->sax->fatalError = NULL;
5907 options -= XML_PARSE_NOERROR;
5908 ctxt->options |= XML_PARSE_NOERROR;
5909 }
5910 if (options & HTML_PARSE_PEDANTIC) {
5911 ctxt->pedantic = 1;
5912 options -= XML_PARSE_PEDANTIC;
5913 ctxt->options |= XML_PARSE_PEDANTIC;
5914 } else
5915 ctxt->pedantic = 0;
5916 if (options & XML_PARSE_NOBLANKS) {
5917 ctxt->keepBlanks = 0;
5918 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5919 options -= XML_PARSE_NOBLANKS;
5920 ctxt->options |= XML_PARSE_NOBLANKS;
5921 } else
5922 ctxt->keepBlanks = 1;
5923 if (options & HTML_PARSE_RECOVER) {
5924 ctxt->recovery = 1;
5925 options -= HTML_PARSE_RECOVER;
5926 } else
5927 ctxt->recovery = 0;
5928 if (options & HTML_PARSE_COMPACT) {
5929 ctxt->options |= HTML_PARSE_COMPACT;
5930 options -= HTML_PARSE_COMPACT;
5931 }
5932 ctxt->dictNames = 0;
5933 return (options);
5934}
5935
5936/**
5937 * htmlDoRead:
5938 * @ctxt: an HTML parser context
5939 * @URL: the base URL to use for the document
5940 * @encoding: the document encoding, or NULL
5941 * @options: a combination of htmlParserOption(s)
5942 * @reuse: keep the context for reuse
5943 *
5944 * Common front-end for the htmlRead functions
5945 *
5946 * Returns the resulting document tree or NULL
5947 */
5948static htmlDocPtr
5949htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5950 int options, int reuse)
5951{
5952 htmlDocPtr ret;
5953
5954 htmlCtxtUseOptions(ctxt, options);
5955 ctxt->html = 1;
5956 if (encoding != NULL) {
5957 xmlCharEncodingHandlerPtr hdlr;
5958
5959 hdlr = xmlFindCharEncodingHandler(encoding);
5960 if (hdlr != NULL)
5961 xmlSwitchToEncoding(ctxt, hdlr);
5962 }
5963 if ((URL != NULL) && (ctxt->input != NULL) &&
5964 (ctxt->input->filename == NULL))
5965 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5966 htmlParseDocument(ctxt);
5967 ret = ctxt->myDoc;
5968 ctxt->myDoc = NULL;
5969 if (!reuse) {
5970 if ((ctxt->dictNames) &&
5971 (ret != NULL) &&
5972 (ret->dict == ctxt->dict))
5973 ctxt->dict = NULL;
5974 xmlFreeParserCtxt(ctxt);
5975 }
5976 return (ret);
5977}
5978
5979/**
5980 * htmlReadDoc:
5981 * @cur: a pointer to a zero terminated string
5982 * @URL: the base URL to use for the document
5983 * @encoding: the document encoding, or NULL
5984 * @options: a combination of htmlParserOption(s)
5985 *
5986 * parse an XML in-memory document and build a tree.
5987 *
5988 * Returns the resulting document tree
5989 */
5990htmlDocPtr
5991htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5992{
5993 htmlParserCtxtPtr ctxt;
5994
5995 if (cur == NULL)
5996 return (NULL);
5997
5998 xmlInitParser();
5999 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6000 if (ctxt == NULL)
6001 return (NULL);
6002 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6003}
6004
6005/**
6006 * htmlReadFile:
6007 * @filename: a file or URL
6008 * @encoding: the document encoding, or NULL
6009 * @options: a combination of htmlParserOption(s)
6010 *
6011 * parse an XML file from the filesystem or the network.
6012 *
6013 * Returns the resulting document tree
6014 */
6015htmlDocPtr
6016htmlReadFile(const char *filename, const char *encoding, int options)
6017{
6018 htmlParserCtxtPtr ctxt;
6019
6020 xmlInitParser();
6021 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6022 if (ctxt == NULL)
6023 return (NULL);
6024 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6025}
6026
6027/**
6028 * htmlReadMemory:
6029 * @buffer: a pointer to a char array
6030 * @size: the size of the array
6031 * @URL: the base URL to use for the document
6032 * @encoding: the document encoding, or NULL
6033 * @options: a combination of htmlParserOption(s)
6034 *
6035 * parse an XML in-memory document and build a tree.
6036 *
6037 * Returns the resulting document tree
6038 */
6039htmlDocPtr
6040htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6041{
6042 htmlParserCtxtPtr ctxt;
6043
6044 xmlInitParser();
6045 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6046 if (ctxt == NULL)
6047 return (NULL);
6048 htmlDefaultSAXHandlerInit();
6049 if (ctxt->sax != NULL)
6050 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6051 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6052}
6053
6054/**
6055 * htmlReadFd:
6056 * @fd: an open file descriptor
6057 * @URL: the base URL to use for the document
6058 * @encoding: the document encoding, or NULL
6059 * @options: a combination of htmlParserOption(s)
6060 *
6061 * parse an XML from a file descriptor and build a tree.
6062 *
6063 * Returns the resulting document tree
6064 */
6065htmlDocPtr
6066htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6067{
6068 htmlParserCtxtPtr ctxt;
6069 xmlParserInputBufferPtr input;
6070 xmlParserInputPtr stream;
6071
6072 if (fd < 0)
6073 return (NULL);
6074
6075 xmlInitParser();
6076 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6077 if (input == NULL)
6078 return (NULL);
6079 ctxt = xmlNewParserCtxt();
6080 if (ctxt == NULL) {
6081 xmlFreeParserInputBuffer(input);
6082 return (NULL);
6083 }
6084 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6085 if (stream == NULL) {
6086 xmlFreeParserInputBuffer(input);
6087 xmlFreeParserCtxt(ctxt);
6088 return (NULL);
6089 }
6090 inputPush(ctxt, stream);
6091 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6092}
6093
6094/**
6095 * htmlReadIO:
6096 * @ioread: an I/O read function
6097 * @ioclose: an I/O close function
6098 * @ioctx: an I/O handler
6099 * @URL: the base URL to use for the document
6100 * @encoding: the document encoding, or NULL
6101 * @options: a combination of htmlParserOption(s)
6102 *
6103 * parse an HTML document from I/O functions and source and build a tree.
6104 *
6105 * Returns the resulting document tree
6106 */
6107htmlDocPtr
6108htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6109 void *ioctx, const char *URL, const char *encoding, int options)
6110{
6111 htmlParserCtxtPtr ctxt;
6112 xmlParserInputBufferPtr input;
6113 xmlParserInputPtr stream;
6114
6115 if (ioread == NULL)
6116 return (NULL);
6117 xmlInitParser();
6118
6119 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6120 XML_CHAR_ENCODING_NONE);
6121 if (input == NULL)
6122 return (NULL);
6123 ctxt = htmlNewParserCtxt();
6124 if (ctxt == NULL) {
6125 xmlFreeParserInputBuffer(input);
6126 return (NULL);
6127 }
6128 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6129 if (stream == NULL) {
6130 xmlFreeParserInputBuffer(input);
6131 xmlFreeParserCtxt(ctxt);
6132 return (NULL);
6133 }
6134 inputPush(ctxt, stream);
6135 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6136}
6137
6138/**
6139 * htmlCtxtReadDoc:
6140 * @ctxt: an HTML parser context
6141 * @cur: a pointer to a zero terminated string
6142 * @URL: the base URL to use for the document
6143 * @encoding: the document encoding, or NULL
6144 * @options: a combination of htmlParserOption(s)
6145 *
6146 * parse an XML in-memory document and build a tree.
6147 * This reuses the existing @ctxt parser context
6148 *
6149 * Returns the resulting document tree
6150 */
6151htmlDocPtr
6152htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6153 const char *URL, const char *encoding, int options)
6154{
6155 xmlParserInputPtr stream;
6156
6157 if (cur == NULL)
6158 return (NULL);
6159 if (ctxt == NULL)
6160 return (NULL);
6161
6162 htmlCtxtReset(ctxt);
6163
6164 stream = xmlNewStringInputStream(ctxt, cur);
6165 if (stream == NULL) {
6166 return (NULL);
6167 }
6168 inputPush(ctxt, stream);
6169 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6170}
6171
6172/**
6173 * htmlCtxtReadFile:
6174 * @ctxt: an HTML parser context
6175 * @filename: a file or URL
6176 * @encoding: the document encoding, or NULL
6177 * @options: a combination of htmlParserOption(s)
6178 *
6179 * parse an XML file from the filesystem or the network.
6180 * This reuses the existing @ctxt parser context
6181 *
6182 * Returns the resulting document tree
6183 */
6184htmlDocPtr
6185htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6186 const char *encoding, int options)
6187{
6188 xmlParserInputPtr stream;
6189
6190 if (filename == NULL)
6191 return (NULL);
6192 if (ctxt == NULL)
6193 return (NULL);
6194
6195 htmlCtxtReset(ctxt);
6196
6197 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6198 if (stream == NULL) {
6199 return (NULL);
6200 }
6201 inputPush(ctxt, stream);
6202 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6203}
6204
6205/**
6206 * htmlCtxtReadMemory:
6207 * @ctxt: an HTML parser context
6208 * @buffer: a pointer to a char array
6209 * @size: the size of the array
6210 * @URL: the base URL to use for the document
6211 * @encoding: the document encoding, or NULL
6212 * @options: a combination of htmlParserOption(s)
6213 *
6214 * parse an XML in-memory document and build a tree.
6215 * This reuses the existing @ctxt parser context
6216 *
6217 * Returns the resulting document tree
6218 */
6219htmlDocPtr
6220htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6221 const char *URL, const char *encoding, int options)
6222{
6223 xmlParserInputBufferPtr input;
6224 xmlParserInputPtr stream;
6225
6226 if (ctxt == NULL)
6227 return (NULL);
6228 if (buffer == NULL)
6229 return (NULL);
6230
6231 htmlCtxtReset(ctxt);
6232
6233 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6234 if (input == NULL) {
6235 return(NULL);
6236 }
6237
6238 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6239 if (stream == NULL) {
6240 xmlFreeParserInputBuffer(input);
6241 return(NULL);
6242 }
6243
6244 inputPush(ctxt, stream);
6245 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6246}
6247
6248/**
6249 * htmlCtxtReadFd:
6250 * @ctxt: an HTML parser context
6251 * @fd: an open file descriptor
6252 * @URL: the base URL to use for the document
6253 * @encoding: the document encoding, or NULL
6254 * @options: a combination of htmlParserOption(s)
6255 *
6256 * parse an XML from a file descriptor and build a tree.
6257 * This reuses the existing @ctxt parser context
6258 *
6259 * Returns the resulting document tree
6260 */
6261htmlDocPtr
6262htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6263 const char *URL, const char *encoding, int options)
6264{
6265 xmlParserInputBufferPtr input;
6266 xmlParserInputPtr stream;
6267
6268 if (fd < 0)
6269 return (NULL);
6270 if (ctxt == NULL)
6271 return (NULL);
6272
6273 htmlCtxtReset(ctxt);
6274
6275
6276 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6277 if (input == NULL)
6278 return (NULL);
6279 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6280 if (stream == NULL) {
6281 xmlFreeParserInputBuffer(input);
6282 return (NULL);
6283 }
6284 inputPush(ctxt, stream);
6285 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6286}
6287
6288/**
6289 * htmlCtxtReadIO:
6290 * @ctxt: an HTML parser context
6291 * @ioread: an I/O read function
6292 * @ioclose: an I/O close function
6293 * @ioctx: an I/O handler
6294 * @URL: the base URL to use for the document
6295 * @encoding: the document encoding, or NULL
6296 * @options: a combination of htmlParserOption(s)
6297 *
6298 * parse an HTML document from I/O functions and source and build a tree.
6299 * This reuses the existing @ctxt parser context
6300 *
6301 * Returns the resulting document tree
6302 */
6303htmlDocPtr
6304htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6305 xmlInputCloseCallback ioclose, void *ioctx,
6306 const char *URL,
6307 const char *encoding, int options)
6308{
6309 xmlParserInputBufferPtr input;
6310 xmlParserInputPtr stream;
6311
6312 if (ioread == NULL)
6313 return (NULL);
6314 if (ctxt == NULL)
6315 return (NULL);
6316
6317 htmlCtxtReset(ctxt);
6318
6319 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6320 XML_CHAR_ENCODING_NONE);
6321 if (input == NULL)
6322 return (NULL);
6323 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6324 if (stream == NULL) {
6325 xmlFreeParserInputBuffer(input);
6326 return (NULL);
6327 }
6328 inputPush(ctxt, stream);
6329 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6330}
6331
6332#define bottom_HTMLparser
6333#include "elfgcchack.h"
6334#endif /* LIBXML_HTML_ENABLED */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette